[torqueusers] Is resources_used.mem reliable? + make -lpvmem working to set virtual memory limit in ulimit

Ake Ake.Sandgren at hpc2n.umu.se
Tue Apr 12 10:51:25 MDT 2005


On Tue, Apr 12, 2005 at 06:41:02PM +0200, etienne gondet wrote:
> 
>    Dear torque folks
> 
>    I would also prefer to use -lpvmem to limit a job's maximum memory usage,
> including data and stack.
> I noticed that limiting virtual memory with ulimit works: with
> ulimit -Sv 1000000
> an a.out asking for more than 1 GB of stack+size+heap is killed.
> 
>    But with torque 1.2.0.p1, when you specify -lpvmem=1024mb it limits stack and
> data to 1 GB each,
> so a Fortran code may use 1 GB of local or automatic variables + 1 GB
> of BSS+data+heap,
> which is not what we hoped for.
> 
>    I looked at the Red Hat and Linux FAQs, and they advise replacing
> RLIMIT_DATA with RLIMIT_AS
> to limit virtual memory:
> 
>        So I modified src/resmom/linux/mom_mach.c as follows, starting at line
> 1214:
> if (set_mode == SET_LIMIT_SET)
>    {
>    /* if either of vmem or pvmem was given, set sys limit to lesser */
> 
>    if (mem_limit != 0)
>      {
>      reslim.rlim_cur = reslim.rlim_max = mem_limit;
> 
> /* Replace _DATA by _AS to modify virtual memory in ulimit -Sa/Ha
>      if (setrlimit(RLIMIT_DATA,&reslim) < 0)
> ETG 11/04/2005 */
>      if (setrlimit(RLIMIT_AS,&reslim) < 0)
>        {
>        return(error("RLIMIT_AS",PBSE_SYSTEM));
>        }
> 
> /* To avoid at qsub messages : -bash: ulimit: stack size: cannot modify 
> limit: Invalid argument
>      if (setrlimit(RLIMIT_STACK,&reslim) < 0)
>        {
>        return(error("RLIMIT_STACK",PBSE_SYSTEM));
>        }
> */
>      }

This is just part of what needs to be done.

Since I'm currently buried in other work, I'm attaching the two patches I
was talking about in my last mail.  If anyone feels up to it, please
read them carefully and test them.

The resource limiting patch makes sure that the vmem limit is never smaller
than the mem limit, but it uses (like before) the smaller of pvmem/vmem.
I sent an earlier (and broken) version of this patch to the list some
time ago and I think Greg tested it. Please test this one instead, since
as far as I can see it works correctly.

The vmem reporting patch needs to be applied to ALL mom nodes in a
cluster at the same time, since it affects communication between the sister
nodes and Mother Superior; it doesn't affect the server.

-- 
Ake Sandgren, HPC2N, Umea University, S-90187 Umea, Sweden
Internet: ake at hpc2n.umu.se	Phone: +46 90 7866134 Fax: +46 90 7866126
Mobile: +46 70 7716134 WWW: http://www.hpc2n.umu.se
-------------- next part --------------
diff -rbBwup site/src/resmom/linux/mom_mach.c p1/src/resmom/linux/mom_mach.c
--- site/src/resmom/linux/mom_mach.c	Tue Mar  8 23:58:49 2005
+++ p1/src/resmom/linux/mom_mach.c	Wed Mar  9 11:16:29 2005
@@ -360,6 +360,7 @@ proc_mem_t *get_proc_mem()
   static proc_mem_t  mm;
   FILE              *fp;
   char               str[32];
+  unsigned long bfsz, casz;
 
   if ((fp = fopen("/proc/meminfo","r")) == NULL) 
     {
@@ -377,10 +378,13 @@ proc_mem_t *get_proc_mem()
 
     fscanf(fp,"%*[^\n]%*c");      /* remove text header */;
 
-    fscanf(fp,"%*s %lu %lu %lu %*[^\n]%*c",
+    fscanf(fp,"%*s %lu %lu %lu %*lu %lu %lu",
       &mm.mem_total,
       &mm.mem_used,
-      &mm.mem_free);
+      &mm.mem_free,
+      &bfsz,
+      &casz);
+    mm.mem_free += casz + bfsz;
 
     fscanf(fp,"%*s %lu %lu %lu %*[^\n]%*c",
       &mm.swap_total,
@@ -405,6 +409,20 @@ proc_mem_t *get_proc_mem()
 
       mm.mem_free *= 1024;
       } 
+    else if (!strncmp(str,"Buffers:",sizeof(str))) 
+      {
+      fscanf(fp,"%lu",
+        &bfsz);
+
+      mm.mem_free += bfsz * 1024;
+      } 
+    else if (!strncmp(str,"Cached:",sizeof(str))) 
+      {
+      fscanf(fp,"%lu",
+        &casz);
+
+      mm.mem_free += casz * 1024;
+      } 
     else if (!strncmp(str,"SwapTotal:",sizeof(str))) 
       {
       fscanf(fp,"%lu",
@@ -1008,6 +1026,7 @@ int mom_set_limits(
   unsigned long	value;	/* place in which to build resource value */
   resource	*pres;
   struct rlimit	reslim;
+  unsigned long	vmem_limit = 0;
   unsigned long	mem_limit = 0;
 
   if (LOGLEVEL >= 2)
@@ -1118,8 +1137,8 @@ int mom_set_limits(
         return(error(pname,retval));
         }
 
-      if ((mem_limit == 0) || (value < mem_limit))
-        mem_limit = value;
+      if ((vmem_limit == 0) || (value < vmem_limit))
+        vmem_limit = value;
       } 
     else if (strcmp(pname,"pvmem") == 0) 
       {	
@@ -1139,8 +1158,8 @@ int mom_set_limits(
           return(error(pname,PBSE_BADATVAL));
           }
 
-        if ((mem_limit == 0) || (value < mem_limit))
-          mem_limit = value;
+        if ((vmem_limit == 0) || (value < vmem_limit))
+          vmem_limit = value;
         }
       } 
     else if (strcmp(pname,"mem") == 0) 
@@ -1162,10 +1181,19 @@ int mom_set_limits(
 
         reslim.rlim_cur = reslim.rlim_max = value;
 
+        if (setrlimit(RLIMIT_DATA,&reslim) < 0)
+          {
+          return(error("RLIMIT_DATA",PBSE_SYSTEM));
+          }
         if (setrlimit(RLIMIT_RSS,&reslim) < 0)
           {
           return(error("RLIMIT_RSS",PBSE_SYSTEM));
           }
+	mem_limit = value;
+	if (getrlimit(RLIMIT_STACK,&reslim) >= 0)
+	  {
+	    mem_limit = value + reslim.rlim_cur;
+	  }
         }
       } 
     else if (strcmp(pname,"walltime") == 0) 
@@ -1207,19 +1235,20 @@ int mom_set_limits(
     {
     /* if either of vmem or pvmem was given, set sys limit to lesser */
 
-    if (mem_limit != 0) 
+    if (vmem_limit != 0) 
       {
-      reslim.rlim_cur = reslim.rlim_max = mem_limit;
-
-      if (setrlimit(RLIMIT_DATA,&reslim) < 0)
-        {
-        return(error("RLIMIT_DATA",PBSE_SYSTEM));
+	/* Don't make (p)vmem < pmem */
+	if (mem_limit > vmem_limit) {
+	    vmem_limit = mem_limit;
         }
 
-      if (setrlimit(RLIMIT_STACK,&reslim) < 0)
+      reslim.rlim_cur = reslim.rlim_max = vmem_limit;
+
+      if (setrlimit(RLIMIT_AS,&reslim) < 0)
         {
-        return(error("RLIMIT_STACK",PBSE_SYSTEM));
+        return(error("RLIMIT_AS",PBSE_SYSTEM));
         }
+
       }
     }
 
-------------- next part --------------
diff -bBwrpu site/src/include/job.h vmem-fix/src/include/job.h
--- site/src/include/job.h	Wed Mar  9 11:19:26 2005
+++ vmem-fix/src/include/job.h	Thu Mar 24 12:38:15 2005
@@ -287,12 +287,13 @@ typedef struct vnodent {
 
 
 /*
-**	Mothere Superior gets to hold an array of information from each
+**	Mother Superior gets to hold an array of information from each
 **	of the other nodes for resource usage.
 */
 typedef struct	noderes {
 	long		nr_cput;	/* cpu time */
 	long		nr_mem;		/* memory */
+	long		nr_vmem;	/* virtual memory */
 } noderes;
 
 
diff -bBwrpu site/src/resmom/mom_comm.c vmem-fix/src/resmom/mom_comm.c
--- site/src/resmom/mom_comm.c	Mon Feb 28 18:17:55 2005
+++ vmem-fix/src/resmom/mom_comm.c	Thu Mar 24 13:06:34 2005
@@ -2870,6 +2870,11 @@ void im_request(
 
       ret = diswul(stream, resc_used(pjob, "mem", getsize));
 
+      if (ret != DIS_SUCCESS)
+        break;
+
+      ret = diswul(stream, resc_used(pjob, "vmem", getsize));
+
       break;
 
     case IM_ABORT_JOB:
@@ -3107,8 +3112,9 @@ void im_request(
         ** I'm mother superior.
         **
         ** auxiliary info (
-        **	cput	int;
-        **	mem	int;
+        **	cput	u_long;
+        **	mem	u_long;
+        **	vmem	u_long;
         ** )
         */
 
@@ -3142,12 +3148,18 @@ void im_request(
           if (ret != DIS_SUCCESS)
             goto err;
       
-          DBPRT(("%s: %s FINAL from %d cpu %lu sec mem %lu kb\n",
+          pjob->ji_resources[nodeidx - 1].nr_vmem = disrul(stream,&ret);
+
+          if (ret != DIS_SUCCESS)
+            goto err;
+      
+          DBPRT(("%s: %s FINAL from %d cpu %lu sec mem %lu kb vmem %lu kb\n",
             id,
             jobid, 
             nodeidx,
             pjob->ji_resources[nodeidx-1].nr_cput,
-            pjob->ji_resources[nodeidx-1].nr_mem))
+            pjob->ji_resources[nodeidx-1].nr_mem,
+            pjob->ji_resources[nodeidx-1].nr_vmem))
           }  /* END if (pjob_ji_resources != NULL) */
       
         /* don't close stream in case other jobs use it */
@@ -3340,6 +3352,7 @@ void im_request(
           **	recommendation	int;
           **	cput		u_long;
           **	mem		u_long;
+          **	vmem		u_long;
           ** )
           */
 
@@ -3365,12 +3378,18 @@ void im_request(
           if (ret != DIS_SUCCESS)
             goto err;
   
-          DBPRT(("%s: POLL_JOB %s OKAY kill %d cpu %lu mem %lu\n",
+          pjob->ji_resources[nodeidx-1].nr_vmem = disrul(stream,&ret);
+  
+          if (ret != DIS_SUCCESS)
+            goto err;
+  
+          DBPRT(("%s: POLL_JOB %s OKAY kill %d cpu %lu mem %lu vmem %lu\n",
             id, 
             jobid, 
             exitval,
             pjob->ji_resources[nodeidx-1].nr_cput,
-            pjob->ji_resources[nodeidx-1].nr_mem))
+            pjob->ji_resources[nodeidx-1].nr_mem,
+            pjob->ji_resources[nodeidx-1].nr_vmem))
   
           if (exitval != 0)
             {
diff -bBwrpu site/src/resmom/requests.c vmem-fix/src/resmom/requests.c
--- site/src/resmom/requests.c	Tue Feb 22 21:59:14 2005
+++ vmem-fix/src/resmom/requests.c	Thu Mar 24 13:00:33 2005
@@ -1456,6 +1456,12 @@ void encode_used(
 					lnum += nr->nr_mem;
 				}
 			}
+			else if (strcmp(rd->rs_name, "vmem") == 0) {
+				for (i=0; i<pjob->ji_numnodes-1; i++) {
+					noderes	*nr = &pjob->ji_resources[i];
+					lnum += nr->nr_vmem;
+				}
+			}
 		}
 		val.at_val.at_long += lnum;
 


More information about the torqueusers mailing list