Skip to content
3 changes: 3 additions & 0 deletions client/app.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@ ACTIVE_TASK::ACTIVE_TASK() {
fraction_done_elapsed_time = 0;
first_fraction_done = 0;
first_fraction_done_elapsed_time = 0;
stuck_check_fraction_done = 0;
stuck_check_elapsed_time = 0;
stuck_check_cpu_time = 0;
scheduler_state = CPU_SCHED_UNINITIALIZED;
next_scheduler_state = CPU_SCHED_UNINITIALIZED;
signal = 0;
Expand Down
6 changes: 6 additions & 0 deletions client/app.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ struct ACTIVE_TASK {
// first frac done reported during this run of task
double first_fraction_done_elapsed_time;
// elapsed time when the above was reported
double stuck_check_fraction_done;
// fraction done as recorded at the last stuck check
double stuck_check_elapsed_time;
// elapsed time at last stuck check
double stuck_check_cpu_time;
// CPU time as recorded at the last stuck check
SCHEDULER_STATE scheduler_state;
SCHEDULER_STATE next_scheduler_state; // temp
int signal;
Expand Down
31 changes: 31 additions & 0 deletions client/app_control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,37 @@ bool ACTIVE_TASK_SET::poll() {
}
}
}

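// Check whether a job is "stuck": over at least one check period of
// elapsed time (an hour, per the alert text) its fraction done has not
// changed and it has used less than 10 units of CPU time (presumably seconds).
// If so, notify the user about the issue.
// NOTE: the code below only alerts the user; aborting the job after
// some time is not implemented here.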
static double last_stuck_check_time = 0;
if (gstate.now - last_stuck_check_time > STUCK_CHECK_POLL_PERIOD) {
last_stuck_check_time = gstate.now;
for (i=0; i<active_tasks.size(); i++){
ACTIVE_TASK* atp = active_tasks[i];
if (atp->non_cpu_intensive()) continue;
if (atp->sporadic()) continue;
if (atp->stuck_check_elapsed_time == 0) {
// first pass
atp->stuck_check_elapsed_time = atp->elapsed_time;
atp->stuck_check_fraction_done = atp->fraction_done;
atp->stuck_check_cpu_time = atp->current_cpu_time;
continue;
}
if (atp->elapsed_time < atp->stuck_check_elapsed_time + STUCK_CHECK_POLL_PERIOD) continue;
if (atp->stuck_check_fraction_done == atp->fraction_done &&
(atp->current_cpu_time - atp->stuck_check_cpu_time) < 10) {
// fraction done is unchanged and less than 10 (presumably seconds) of CPU time
// was used since the last stuck check: alert the user
msg_printf(atp->result->project, MSG_USER_ALERT,
Comment thread
AenBleidd marked this conversation as resolved.
"Task has not made progress in last hour, consider aborting");
}
atp->stuck_check_elapsed_time = atp->elapsed_time;
atp->stuck_check_fraction_done = atp->fraction_done;
atp->stuck_check_cpu_time = atp->current_cpu_time;
}
}

if (action) {
gstate.set_client_state_dirty("ACTIVE_TASK_SET::poll");
}
Expand Down
3 changes: 3 additions & 0 deletions client/client_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,9 @@ extern THREAD throttle_thread;
#define MEMORY_USAGE_PERIOD 10
// computer memory usage and check for exclusive apps this often

#define STUCK_CHECK_POLL_PERIOD 3600
// check for stuck jobs this often (presumably seconds, i.e. once an hour)

//////// WORK FETCH

#define WORK_FETCH_PERIOD 60
Expand Down