[torqueusers] cput statistic not correct for some jobs
David Singleton
David.Singleton at anu.edu.au
Tue Jan 30 22:12:01 MST 2007
Martin Schafföner wrote:
> On Monday 29 January 2007 17:08, David Golden wrote:
>
> @ David Golden: Sorry, my mail program screwed up some things...
>
>> Probably not myself, but what version of torque are you running?
>
> 2.1.6
>
>> There _was_ a cput stats collection problem found in ~ 2.1.0p0-2.1.3
>> last year, though:
>> http://www.supercluster.org/pipermail/torqueusers/2006-October/004522.htm
>
> Back then, no cput stats were working for me, so I was happy when I saw some
> figure appear. Now I only noticed that jobs with many TM-spawned tasks
> don't count correctly.
>
I think this is because PBS doesn't carry around the cpu usage of
exited tasks. A job is only ever credited with the cputime of its
live processes plus their accrued child cputime - PBS just keeps
the max of all these "snapshots". Once a task exits, its cpu
usage is no longer available from /proc.
We are carrying information around in the fields of the ti_hold
element of the task struct. We should have just added our own
task struct elements but instead we just defined these indices
in job.h:
/*
** Indices into the "reserved space" ti_hold component of the task struct.
** Data is placed in these slots so that it survives task save/restore.
** These slots should really be replaced by dedicated task struct members.
*/
#define TASKCPUTIME 0 // high-water mark of the task's cpu time; still readable after the task exits
#define TASKCPUTIMESCRATCH 1 // per-call accumulator for the task's cpu time in cput_sum()
#define TASKPROCESSCOUNT 2 // number of processes found for the task during cput_sum()
#define TASKPROPERTYFLAGS 3 // task property flags (the flag values are not shown in this excerpt)
#define TASKSYSTIME 4 // high-water mark of the task's sys time; still readable after the task exits
#define TASKSYSTIMESCRATCH 5 // per-call accumulator for the task's sys time in cput_sum()
This is the gist of the code in mom_mach that finds the job
cpu usage:
/*
* Internal session cpu time decoding routine.
*
* Accepts a job pointer. Returns the sum of all cpu time
* consumed for all tasks executed by the job on this node,
* in seconds. The "real" time is adjusted by "cputfactor".
*/
/* <DJH 14 Nov 2001>.
 * Accessor for the per-task 'miscellaneous info' array (ti_hold).
 * Hands back the address of slot `index` so the caller can read or
 * update it in place.  No bounds checking is done on `index`.
 */
static int *getTaskMiscInfo(task *ptask, int index)
{
    return &ptask->ti_qs.ti_u.ti_hold[index];
}
// DJH 3 May 2002. Pair of totals so that cpu time and system time can be
// returned together.
struct cpuAndSysTime {
    unsigned long cput; // cpu time total
    unsigned long syst; // system time total
};
/*
 * cput_sum - total the cpu time and system time of a job on this node.
 *
 * pjob: job whose tasks are to be summed.
 *
 * Scans the sampled process table (proc_info[0 .. nproc-1]) and adds
 * CPUTIME()/SYSTIME() of every process that injob() maps to a task of
 * pjob in state TI_STATE_RUNNING.  For tasks that have exited, the
 * high-water marks previously stored in ti_hold[TASKCPUTIME] and
 * ti_hold[TASKSYSTIME] are added instead, so their usage is not lost.
 *
 * Side effects:
 *   - rewrites the scratch/count slots of ti_hold for every task;
 *   - updates the per-task high-water marks and calls task_save();
 *   - a running task seen with no processes on two consecutive calls
 *     is moved to TI_STATE_EXITED and exiting_tasks is set;
 *   - sets MOM_NO_PROC in pjob->ji_flags if no process was found.
 *
 * Returns both totals multiplied by cputfactor.
 */
static struct cpuAndSysTime cput_sum(job *pjob)
{
    struct cpuAndSysTime ret = {0, 0};
    char *id = "cput_sum";
    ulong cputime = 0, addtime;
    ulong systime = 0, addsystime = 0;
    int i;
    int local_job_nprocs = 0;
    psinfo_t *pi;
    task *ptask;

    // Initialise our scratch space for accumulating cpu time across all
    // processes in each task in this job on this node.  Note that
    // ti_hold[TASKCPUTIMESCRATCH] is really just used as a local
    // variable.  The alternative would be to invent a local hashtable
    // keyed by task, but there's space in ti_hold so we may as well use
    // it.
    for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
         ptask;
         ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {
        *getTaskMiscInfo(ptask, TASKCPUTIMESCRATCH) = 0;
        *getTaskMiscInfo(ptask, TASKPROCESSCOUNT) = 0;
        *getTaskMiscInfo(ptask, TASKSYSTIMESCRATCH) = 0;
    }

    // Loop over all processes.  Ignore those that PBS doesn't know about.
    for (i = 0; i < nproc; i++) {
        pi = &proc_info[i];
        ptask = injob(pjob, pi->pr_sid);
        if (ptask == NULL)
            continue;

        if (ptask->ti_qs.ti_status != TI_STATE_RUNNING) {
            // Situation where task is supposed to be gone but process is
            // still about.  Probably process is hung in U state or similar
            // and can't be killed.  Don't want to add in its resource usage
            // because that should have been accounted for in task
            // termination.  Would also like to keep job alive until
            // process disappears - how?
            DBPRT(("%s: warning found pid %d in task %d which is in state %d!\n",
                   id, pi->pr_pid, ptask->ti_qs.ti_task, ptask->ti_qs.ti_status));
            continue;
        }

        // The process is counted even if its usage is skipped below, so
        // that the task is not mistaken for one without processes.
        (*getTaskMiscInfo(ptask, TASKPROCESSCOUNT))++;
        local_job_nprocs++;
        DBPRT(("%s: ses %d pid %d ", id, pi->pr_sid, pi->pr_pid));

        // A feeble attempt to ignore the memory/cpu use of recently forked pids
        if (time_now < (time_t) ISECS(pi->pr_start) + 2) {
            DBPRT((" baby process\n"));
            continue;
        }

        addtime = CPUTIME(pi);
        cputime += addtime;
        addsystime = SYSTIME(pi);
        systime += addsystime;

        // Accumulate the per-task cpu time too... we will need it below
        // to keep a cpu-time high water mark for each task.
        *getTaskMiscInfo(ptask, TASKCPUTIMESCRATCH) += addtime;
        *getTaskMiscInfo(ptask, TASKSYSTIMESCRATCH) += addsystime;
        DBPRT((" cputime %lu\n", cputime));
    }

    // Got all the cpu times now for all processes that PBS knows about
    // on this node.  Grand total is in cputime and per-task totals are
    // in ti_hold[TASKCPUTIMESCRATCH].
    for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
         ptask;
         ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {
        DBPRT(("%s: task %d state %d ",
               id, ptask->ti_qs.ti_task, ptask->ti_qs.ti_status));

        // If the task state is 'running' but there are no processes for
        // it, mark the state as now 'exited'.  This is how all adopted
        // tasks become 'exited', and might also be the way that child-
        // process tasks become 'exited' if we get to them before
        // scan_for_terminated() does.
        //
        // Beware of not getting all pids from /proc: find no processes
        // twice before setting exiting.  Appears the only way to come thru
        // here twice without doing mom_get_sample() between is if cputime
        // job limits are set so mom_over_limits calls here.
        if (ptask->ti_qs.ti_status == TI_STATE_RUNNING) {
            if (*getTaskMiscInfo(ptask, TASKPROCESSCOUNT) == 0) {
                if (ptask->ti_flags & TI_FLAGS_NO_PROCS) {
                    extern int exiting_tasks;
                    // This provokes finish_loop() to call scan_for_exiting()
                    exiting_tasks = 1;
                    ptask->ti_qs.ti_status = TI_STATE_EXITED;
                    (void)task_save(ptask);
                    DBPRT((" (set exiting) "));
                } else {
                    // First pass with no processes: only remember it.
                    ptask->ti_flags |= TI_FLAGS_NO_PROCS;
                }
            } else {
                // Processes were seen again: forget any earlier miss.
                ptask->ti_flags &= ~TI_FLAGS_NO_PROCS;
            }
        }

        if (ptask->ti_qs.ti_status >= TI_STATE_EXITED) {
            // Task has exited or is dead.  Add on the cpu time that we
            // accumulated for it in times gone past (see 'else'
            // branch).  Note that tasks that have just become 'exited'
            // in the test above are OK - since there were no processes, the
            // only cpu time we have is what was accumulated in the past.
            DBPRT(("cputime %lu \n", (ulong)*getTaskMiscInfo(ptask, TASKCPUTIME)));
            cputime += (ulong) *getTaskMiscInfo(ptask, TASKCPUTIME);
            systime += (ulong) *getTaskMiscInfo(ptask, TASKSYSTIME);
        } else {
            // Got a running task.  Update its cpu usage in TASKCPUTIME so that
            // we still have a record of it after it dies.
            int *taskCpuTime = getTaskMiscInfo(ptask, TASKCPUTIME);
            int *taskCpuTimeScratch = getTaskMiscInfo(ptask, TASKCPUTIMESCRATCH);
            int *taskSysTime = getTaskMiscInfo(ptask, TASKSYSTIME);
            int *taskSysTimeScratch = getTaskMiscInfo(ptask, TASKSYSTIMESCRATCH);

            *taskSysTime = MAX( *taskSysTime, *taskSysTimeScratch );
            *taskCpuTime = MAX( *taskCpuTime, *taskCpuTimeScratch );
            DBPRT(("\n"));
            (void)task_save(ptask);
            // </DJH 14 Nov 2001>
        }
    }

    if (local_job_nprocs == 0) {
        DBPRT(("%s: no processes found\n", id));
        pjob->ji_flags |= MOM_NO_PROC;
        // job state not changed here.  Rechecked in mom main loop.
    }

    // Convert to the width of the destination fields.  Casting to plain
    // 'unsigned' would narrow the value to the (usually smaller) range of
    // unsigned int before it is stored in an unsigned long, and the
    // floating-point to integer conversion is undefined when the value
    // does not fit in the target type.
    ret.cput = (unsigned long)((double)cputime * cputfactor);
    ret.syst = (unsigned long)((double)systime * cputfactor);
    return ret;
}
David
More information about the torqueusers
mailing list