[torqueusers] cput statistic not correct for some jobs

David Singleton David.Singleton at anu.edu.au
Tue Jan 30 22:12:01 MST 2007


Martin Schafföner wrote:
> On Monday 29 January 2007 17:08, David Golden wrote:
> 
> @ David Golden: Sorry, my mail program screwed up some things...
> 
>> Probably not myself, but what version of torque are you running?
> 
> 2.1.6
> 
>> There _was_ a cput stats collection  problem found in ~ 2.1.0p0-2.1.3
>> last year, though:
>> http://www.supercluster.org/pipermail/torqueusers/2006-October/004522.htm
> 
> Back then, no cput stats were working for me, so I was happy when I saw some 
> figure appear. Now I only noticed that jobs with many TM-spawned tasks 
> don't count correctly.
> 

I think this is because PBS doesn't carry around the cpu usage of
exited tasks.  A job is only ever attributed with the cputime of
live processes or their accrued child cputime usage - PBS just keeps
the max of all these "snapshots".  Once a task exits all its cpu
usage is no longer available from /proc.

We are carrying information around in the fields of the ti_hold
element of the task struct.  We should have just added our own
task struct elements but instead we just defined these indices
in job.h:

/*
** Indices into the "reserved space" ti_hold array of the task struct.
** Data placed in these slots survives task save/restore.
** Should really be replaced by dedicated task struct members instead of
** borrowing ti_hold.
**
** The *SCRATCH and TASKPROCESSCOUNT slots are zeroed at the start of every
** cput_sum() call and only hold values for the current scan; TASKCPUTIME
** and TASKSYSTIME hold the per-task high-water marks that are kept after
** the task's processes have gone.
*/
#define TASKCPUTIME         0   // per-task cpu time high-water mark
#define TASKCPUTIMESCRATCH  1   // cpu time accumulated for the task during one cput_sum() scan
#define TASKPROCESSCOUNT    2   // number of processes found for the task during one cput_sum() scan
#define TASKPROPERTYFLAGS   3   // task property flags (flag values are defined elsewhere in job.h)
#define TASKSYSTIME         4   // per-task sys time high-water mark
#define TASKSYSTIMESCRATCH  5   // sys time accumulated for the task during one cput_sum() scan



This is the gist of the code in mom_mach that finds the job
cpu usage:


/*
  * Internal session cpu time decoding routine.
  *
  * Accepts a job pointer.  Returns the sum of all cpu time
  * consumed for all tasks executed by the job on this node,
  * in seconds. The "real" time is adjusted by "cputfactor".
  */


/* <DJH 14 Nov 2001>.
  * Accessor for the per-task 'miscellaneous info' slots stored in ti_hold.
  *
  * ptask - task whose ti_hold array is addressed
  * index - slot number (one of the TASK* indices)
  *
  * Returns a pointer to the requested slot so callers can read or update
  * it in place.  No bounds check is performed on index.
  */
static int *getTaskMiscInfo(task *ptask, int index) {
	return &ptask->ti_qs.ti_u.ti_hold[index];
}

// DJH 3 May 2002. Pair of totals returned by cput_sum() so that the
// cpu time and the system time can be reported together.
struct cpuAndSysTime {
	unsigned long cput;	// cpu time total
	unsigned long syst;	// system time total
};


/*
 * Sum the cpu and system time consumed by all tasks of a job on this node.
 *
 * pjob - job whose task list (ji_tasks) is scanned against proc_info[].
 *
 * Returns a struct cpuAndSysTime holding the job totals, each multiplied
 * by "cputfactor".  Time of exited tasks is taken from the high-water
 * marks kept in ti_hold, so it is not lost when their processes vanish.
 *
 * Side effects:
 *   - resets and refills the scratch/process-count slots of every task;
 *   - updates TASKCPUTIME/TASKSYSTIME high-water marks and calls
 *     task_save() for every task that is not exited;
 *   - a running task found without processes on two consecutive calls is
 *     moved to TI_STATE_EXITED, saved, and exiting_tasks is set;
 *   - sets MOM_NO_PROC in pjob->ji_flags when no process was counted.
 */
static struct cpuAndSysTime cput_sum(job *pjob)
{
	struct cpuAndSysTime ret = {0, 0};
	const char *id = "cput_sum";	// points at a literal, so const
	ulong    cputime = 0, addtime;
	ulong    systime=0, addsystime=0;

	int      i;
	int      local_job_nprocs = 0;
	psinfo_t *pi;
	task    *ptask;

	// Initialise our scratch space for accumulating cpu time across all
	// processes in each task in this job on this node. Note that
	// ti_hold[TASKCPUTIMESCRATCH] is really just used as a local
	// variable.  The alternative would be to invent a local hashtable
	// keyed by task, but there's space in ti_hold so we may as well use
	// it.
	for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
	     ptask;
	     ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {
		*getTaskMiscInfo(ptask,TASKCPUTIMESCRATCH)=0;
		*getTaskMiscInfo(ptask,TASKPROCESSCOUNT)=0;
		*getTaskMiscInfo(ptask,TASKSYSTIMESCRATCH)=0;
	}

	// Loop over all processes. Ignore those that PBS doesn't know about
	for (i=0; i<nproc; i++) {
		pi = &proc_info[i];

		ptask=injob(pjob, pi->pr_sid);

		if (ptask==NULL)  continue;

		if (ptask->ti_qs.ti_status!=TI_STATE_RUNNING) {
			// Situation where task is supposed to be gone but process is
			// still about.  Probably process is hung in U state or similar
			// and can't be killed.  Don't want to add in its resource usage
			// because that should have been accounted for in task
			// termination.  Would also like to keep job alive until
			// process disappears - how?
			DBPRT(("%s: warning found pid %d in task %d which is in state %d!\n",
			       id, pi->pr_pid, ptask->ti_qs.ti_task,ptask->ti_qs.ti_status));
			continue;
		}

		// Count the process before the "baby" test below so that a task
		// with only freshly forked processes is not mistaken for empty.
		(*getTaskMiscInfo(ptask,TASKPROCESSCOUNT))++;
		local_job_nprocs++;

		DBPRT(("%s: ses %d pid %d ", id, pi->pr_sid, pi->pr_pid));

		// A feeble attempt to ignore the memory/cpu use of recently forked pids
		if (time_now < (time_t) ISECS(pi->pr_start) + 2 ) {
			DBPRT((" baby process\n"));
			continue;
		}

		addtime = CPUTIME(pi);
		cputime += addtime;

		addsystime = SYSTIME(pi);
		systime += addsystime;

		// Accumulate the per-task cpu time too... we will need it below
		// to keep a cpu-time high water mark for each task.
		*getTaskMiscInfo(ptask,TASKCPUTIMESCRATCH) +=addtime;
		*getTaskMiscInfo(ptask,TASKSYSTIMESCRATCH) +=addsystime;

		DBPRT((" cputime %lu\n", cputime));
	}


	// Got all the cpu times now for all processes that PBS knows about
	// on this node. Grand total is in cputime and per-task totals are
	// in ti_hold[TASKCPUTIMESCRATCH]

	for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
	     ptask;
	     ptask = (task *)GET_NEXT(ptask->ti_jobtask)) {

		DBPRT(("%s: task %d state %d ",
		       id, ptask->ti_qs.ti_task,ptask->ti_qs.ti_status));

		// If the task state is 'running' but there are no processes for
		// it, mark the state as now 'exited'. This is how all adopted
		// tasks become 'exited', and might also be the way that child-
		// process tasks become 'exited' if we get to them before
		// scan_for_terminated() does.
		//
		// Beware of not getting all pids from /proc, find no processes
		// twice before setting exiting. Appears the only way to come thru
		// here twice without doing mom_get_sample() between is if cputime
		// job limits are set so mom_over_limits calls here.
		if (ptask->ti_qs.ti_status==TI_STATE_RUNNING) {

			if ( *getTaskMiscInfo(ptask,TASKPROCESSCOUNT)==0) {

				if ( ptask->ti_flags & TI_FLAGS_NO_PROCS) {
					extern int       exiting_tasks;
					// This provokes finish_loop() to call scan_for_exiting()
					exiting_tasks=1;

					ptask->ti_qs.ti_status = TI_STATE_EXITED;
					(void)task_save(ptask);
					DBPRT((" (set exiting) "));
				}
				else {
					// First scan with no processes: just remember it.
					ptask->ti_flags |= TI_FLAGS_NO_PROCS;
				}
			}
			else {
				// Processes were seen, so forget any earlier empty scan.
				ptask->ti_flags &= ~TI_FLAGS_NO_PROCS;
			}
		}

		if (ptask->ti_qs.ti_status >= TI_STATE_EXITED) {
			// Task has exited or is dead. Add on the cpu time that we
			// accumulated for it in times gone past (see 'else'
			// branch). Note that processes that have just become 'exited'
			// in the test above are OK - since there were no processes, the
			// only cpu time we have is what was accumulated in the past.
			DBPRT(("cputime %lu \n",(ulong)*getTaskMiscInfo(ptask,TASKCPUTIME)));
			cputime += (ulong) *getTaskMiscInfo(ptask,TASKCPUTIME);
			systime += (ulong) *getTaskMiscInfo(ptask,TASKSYSTIME);

		} else {
			// Got a running task. Update its cpu usage in TASKCPUTIME so that
			// we still have a record of it after it dies.
			int *taskCpuTime=getTaskMiscInfo(ptask,TASKCPUTIME);
			int *taskCpuTimeScratch= getTaskMiscInfo(ptask,TASKCPUTIMESCRATCH);

			int *taskSysTime = getTaskMiscInfo(ptask,TASKSYSTIME);
			int *taskSysTimeScratch = getTaskMiscInfo(ptask,TASKSYSTIMESCRATCH);

			// Keep the maximum ever seen: a later scan may see fewer
			// processes and therefore a smaller sum.
			*taskSysTime = MAX( *taskSysTime, *taskSysTimeScratch );
			*taskCpuTime = MAX( *taskCpuTime, *taskCpuTimeScratch );

			DBPRT(("\n"));
			(void)task_save(ptask);

			//</DJH 14 Nov 2001>
		}
	}

	if (local_job_nprocs==0) {
		DBPRT(("%s: no processes found\n", id));
		pjob->ji_flags |= MOM_NO_PROC;
		// job state not changed here. Rechecked in mom main loop.
	}

	// Convert to the width of the destination fields.  The previous cast
	// to plain 'unsigned' could truncate (and is undefined for values
	// above UINT_MAX) even though cput/syst are unsigned long.
	ret.cput = (unsigned long)((double)cputime * cputfactor);
	ret.syst = (unsigned long)((double)systime * cputfactor);

	return ret;
}


David


More information about the torqueusers mailing list