From cc9075de03a216c438353bd05f6e563a34faccf5 Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Sat, 29 Nov 2025 10:11:45 -0700 Subject: [PATCH 01/11] adding gpu metrics detection logic to seff --- bin/seff | 131 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 52 deletions(-) diff --git a/bin/seff b/bin/seff index 511176f..ccf085e 100644 --- a/bin/seff +++ b/bin/seff @@ -11,12 +11,8 @@ use Sys::Hostname; use lib qw(/usr/lib64/perl5); use Slurmdb ':all'; use Slurm ':all'; -#use Data::Dumper; -my $VERSION = "2.1"; - -# This script is roughtly equivalent to: -# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j +my $VERSION = "2.2"; my %opts; getopts('hvdf:',\%opts); @@ -42,7 +38,6 @@ my $jobid_arg = $ARGV[0]; my $db_conn = Slurmdb::connection_get(); my $slurm = Slurm::new(); -# Get cluster name from SLurm config file. my $conf = $slurm->load_ctl_conf(); my $clustername = $conf->{'cluster_name'}; @@ -53,14 +48,12 @@ $job_cond{step_list} = $jobid_arg; $job_cond{usage_start} = 0; $job_cond{usage_end} = 0; -# Get and test for a single job. my $jobs = Slurmdb::jobs_get($db_conn, \%job_cond); if (scalar @$jobs < 1) { print STDERR "Job not found.\n"; exit 2; } my $job = @$jobs[0]; -#print Dumper($job); my $jobid = $job->{'jobid'}; my $user = $job->{'user'}; @@ -76,9 +69,8 @@ if (exists $job->{'alloc_cpus'}) { $ncpus = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_CPU); } } -# Check for missing number of cpus if (($ncpus == INFINITE64) || ($ncpus == 0)) { - $ncpus = 1; + $ncpus = 1; } my $nnodes = 1; @@ -89,26 +81,23 @@ if (exists $job->{'alloc_nodes'}) { $nnodes = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_NODE); } } -# Check for missing number of nodes. 
if (($nnodes == INFINITE64) || ($nnodes == 0)) { - $nnodes = 1; + $nnodes = 1; } my $pernode = 1; if ($job->{'req_mem'} & MEM_PER_CPU) { - $pernode = 0; - } + $pernode = 0; +} my $reqmem = Slurmdb::find_tres_count_in_string($job->{'tres_req_str'}, TRES_MEM); $reqmem = $reqmem * 1024; my $walltime = $job->{'elapsed'}; -# Only use hi-order byte for error code. my $exit_status = $job->{'exitcode'} >> 8; my $array_job_id = $job->{'array_job_id'}; my $array_jobid = ""; if ($array_job_id != 0) { - # Convert array_task_id to a signed long integer. my $array_task_id = unpack('l', pack('l', $job->{'array_task_id'})); if ($array_task_id == -2) { print STDERR "Badly formatted array jobid $array_job_id with task_id = -2\n"; @@ -116,40 +105,37 @@ if ($array_job_id != 0) { } $array_jobid = "${array_job_id}_${array_task_id}"; } + my $tot_cpu_sec = 0; my $tot_cpu_usec = 0; my $mem = 0; my $maxmem = 0; -my $avemem = 0; + for my $step (@{$job->{'steps'}}) { $tot_cpu_sec += $step->{'tot_cpu_sec'}; $tot_cpu_usec += $step->{'tot_cpu_usec'}; - # TRES_MEM in tres_usage_in_tot is the sum of memory peaks of all tasks. if (exists $step->{'stats'} && exists $step->{'stats'}{'tres_usage_in_tot'}) { my $lmem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_tot'}, TRES_MEM); if ($lmem == INFINITE64) { - $lmem = 0; + $lmem = 0; } else { - $lmem = $lmem / 1024; + $lmem = $lmem / 1024; } if ($mem < $lmem) { $mem = $lmem; - # Get the task MaxRSS seen in this step and the AveRSS. 
$maxmem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_max'}, TRES_MEM); - $avemem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_ave'}, TRES_MEM); - if ($maxmem == INFINITE64) { - $maxmem = 0; - } } } } + my $cput = $tot_cpu_sec + int(($tot_cpu_usec / 1000000) + 0.5); if ($mydebug) { print "Slurm data: JobID ArrayJobID User Group State Clustername Ncpus Nnodes Ntasks Reqmem PerNode Cput Walltime Mem ExitStatus\n"; print "Slurm data: $jobid $array_jobid $user $group $state $clustername $ncpus $nnodes $reqmem $pernode $cput $walltime $mem $exit_status\n\n"; } + print "Job ID: $jobid\n"; if (length $array_jobid) { print "Array Job ID: $array_jobid\n"; @@ -167,43 +153,78 @@ if ($ncpus == 1) { print "Nodes: $nnodes\n"; printf "Cores per node: %d\n", $ncpus/$nnodes; } + if ($state ne "PENDING") { my $corewalltime = $walltime * $ncpus; - my $cpu_eff; - if ($corewalltime != 0) { - $cpu_eff = $cput / $corewalltime * 100; - } else { - $cpu_eff = 0.0; - } + my $cpu_eff = ($corewalltime != 0) ? $cput / $corewalltime * 100 : 0.0; + printf("CPU Utilized: %s\n", time2str($cput)); printf("CPU Efficiency: %.2f%% of %s core-walltime\n", $cpu_eff, time2str($corewalltime)); printf("Job Wall-clock time: %s\n", time2str($walltime)); printf("Memory Utilized: %s\n", kbytes2str($mem)); - my $mem_eff; - my $allocmem = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_MEM); - $allocmem = $allocmem * 1024; - if ($allocmem != 0) { - $mem_eff = $mem / $allocmem * 100; - } else { - $mem_eff = 0.0; - } + + my $allocmem = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_MEM) * 1024; + my $mem_eff = ($allocmem != 0) ? 
$mem / $allocmem * 100 : 0.0; + if ($pernode) { printf("Memory Efficiency: %.2f%% of %s (%s\/node)\n", $mem_eff, kbytes2str($allocmem), kbytes2str($allocmem / $nnodes)); } else { printf("Memory Efficiency: %.2f%% of %s (%s\/core)\n", $mem_eff, kbytes2str($allocmem), kbytes2str($allocmem / $ncpus)); } - if (($maxmem != 0) && ($maxmem > $avemem)) { - printf("The task which had the largest memory consumption differs by %.02f%% from the average task max memory consumption\n", ($maxmem / $avemem) * 100); - } if ($state eq "RUNNING") { print "WARNING: Efficiency statistics can only be obtained after the job has ended as seff tool is based on the accounting database data.\n"; } + + # --- GPU Detection and Stats --- + my $gpu_alloc_str = `sacct -n -p -o AllocTres -X -j $jobid_arg 2>/dev/null`; + chomp($gpu_alloc_str); + my ($gpu_type, $gpu_count) = ("", 0); + + if ($gpu_alloc_str =~ /gres\/gpu:([a-zA-Z0-9._-]+)=([0-9]+)/) { + $gpu_type = $1; + $gpu_count = $2; + } + + if ($gpu_count > 0) { + print "\n──────── GPU Metrics ────────\n"; + print "Number of GPUs: \U${gpu_type} " . "x${gpu_count}\n"; + + print "NOTE: GPU metric availability and accuracy may vary by GPU type and system configuration.\n"; + print " Please refer to our documentation for details.\n"; + + my $gpu_usage_str = `sacct -n -p -o TresUsageInMax --noconvert -j $jobid_arg 2>/dev/null`; + my ($max_gpu_util, $max_gpu_mem) = (undef, undef); + + for my $line (split /\n/, $gpu_usage_str) { + my $gpu_util = ($line =~ /gres\/gpuutil=([0-9]+)/) ? $1 : undef; + my $gpu_mem = ($line =~ /gres\/gpumem=([0-9]+)/) ? $1 : undef; + + if (defined $gpu_util) { + $max_gpu_util = (!defined($max_gpu_util) or $gpu_util > $max_gpu_util) + ? $gpu_util : $max_gpu_util; + } + if (defined $max_gpu_mem && defined $gpu_mem) { + $max_gpu_mem = ($gpu_mem > $max_gpu_mem) ? $gpu_mem : $max_gpu_mem; + } elsif (defined $gpu_mem) { + $max_gpu_mem = $gpu_mem; + } + } + print "Max GPU Utilization: " . + (defined($max_gpu_util) ? 
"$max_gpu_util%\n" : "N/A\n"); + + if (defined $max_gpu_mem) { + my $max_gpu_mem_gb = sprintf("%.2f", $max_gpu_mem / 1e9); + print "Max GPU Memory Utilized: ${max_gpu_mem_gb} GB\n"; + } else { + print "Max GPU Memory Utilized: N/A\n"; + } + } + } else { print "Efficiency not available for jobs in the PENDING state.\n"; } -# Convert elapsed time to string. sub time2str { my $time = shift; my $days = int($time / 86400); @@ -212,23 +233,29 @@ sub time2str { $time -= ($hours * 3600); my $minutes = int($time / 60); my $seconds = $time % 60; - $days = $days < 1 ? '' : "$days-"; - $time = $days . sprintf("%02s:%02s:%02s", $hours, $minutes, $seconds); - return $time; + return $days . sprintf("%02s:%02s:%02s", $hours, $minutes, $seconds); } -# Convert memory to human-readable string. sub kbytes2str { my $kbytes = shift; - if ($kbytes == 0) { - return sprintf("%.2f %sB", 0.0, 'M'); - } + if ($kbytes == 0) { return sprintf("%.2f %sB", 0.0, 'M'); } my $mul = 1024; - my $exp = int(log($kbytes) / log($mul)); my @pre = qw/ K M G T P E /; my $pre = $pre[$exp]; - return sprintf("%.2f %sB", ($kbytes / pow($mul, $exp)), $pre ? 
$pre : ""); } + +sub convert_to_seconds { + my $timestamp = shift; + my ($days, $hours, $minutes, $seconds) = (0,0,0,0); + if ($timestamp =~ /(\d+)-(\d+):(\d+):(\d+)/) { + ($days, $hours, $minutes, $seconds) = ($1,$2,$3,$4); + } elsif ($timestamp =~ /(\d+):(\d+):(\d+)/) { + ($hours,$minutes,$seconds) = ($1,$2,$3); + } elsif ($timestamp =~ /(\d+):(\d+)/) { + ($minutes,$seconds) = ($1,$2); + } + return $days*86400 + $hours*3600 + $minutes*60 + $seconds; +} From 28473b58195e10d23898189c4f0ae0c734d9367c Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Sat, 29 Nov 2025 10:27:55 -0700 Subject: [PATCH 02/11] adding gpu metrics detection logic to seff --- bin/seff | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/bin/seff b/bin/seff index ccf085e..caf8d05 100644 --- a/bin/seff +++ b/bin/seff @@ -11,9 +11,13 @@ use Sys::Hostname; use lib qw(/usr/lib64/perl5); use Slurmdb ':all'; use Slurm ':all'; +#use Data::Dumper; my $VERSION = "2.2"; +# This script is roughtly equivalent to: +# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j + my %opts; getopts('hvdf:',\%opts); @@ -38,6 +42,7 @@ my $jobid_arg = $ARGV[0]; my $db_conn = Slurmdb::connection_get(); my $slurm = Slurm::new(); +# Get cluster name from SLurm config file. my $conf = $slurm->load_ctl_conf(); my $clustername = $conf->{'cluster_name'}; @@ -48,12 +53,14 @@ $job_cond{step_list} = $jobid_arg; $job_cond{usage_start} = 0; $job_cond{usage_end} = 0; +# Get and test for a single job. 
my $jobs = Slurmdb::jobs_get($db_conn, \%job_cond); if (scalar @$jobs < 1) { print STDERR "Job not found.\n"; exit 2; } my $job = @$jobs[0]; +#print Dumper($job); my $jobid = $job->{'jobid'}; my $user = $job->{'user'}; @@ -69,6 +76,7 @@ if (exists $job->{'alloc_cpus'}) { $ncpus = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_CPU); } } +# Check for missing number of cpus if (($ncpus == INFINITE64) || ($ncpus == 0)) { $ncpus = 1; } @@ -81,6 +89,7 @@ if (exists $job->{'alloc_nodes'}) { $nnodes = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_NODE); } } +# Check for missing number of nodes. if (($nnodes == INFINITE64) || ($nnodes == 0)) { $nnodes = 1; } @@ -93,11 +102,13 @@ my $reqmem = Slurmdb::find_tres_count_in_string($job->{'tres_req_str'}, TRES_MEM $reqmem = $reqmem * 1024; my $walltime = $job->{'elapsed'}; +# Only use hi-order byte for error code. my $exit_status = $job->{'exitcode'} >> 8; my $array_job_id = $job->{'array_job_id'}; my $array_jobid = ""; if ($array_job_id != 0) { + # Convert array_task_id to a signed long integer. my $array_task_id = unpack('l', pack('l', $job->{'array_task_id'})); if ($array_task_id == -2) { print STDERR "Badly formatted array jobid $array_job_id with task_id = -2\n"; @@ -105,7 +116,6 @@ if ($array_job_id != 0) { } $array_jobid = "${array_job_id}_${array_task_id}"; } - my $tot_cpu_sec = 0; my $tot_cpu_usec = 0; my $mem = 0; @@ -114,7 +124,8 @@ my $maxmem = 0; for my $step (@{$job->{'steps'}}) { $tot_cpu_sec += $step->{'tot_cpu_sec'}; $tot_cpu_usec += $step->{'tot_cpu_usec'}; - + + # TRES_MEM in tres_usage_in_tot is the sum of memory peaks of all tasks. 
if (exists $step->{'stats'} && exists $step->{'stats'}{'tres_usage_in_tot'}) { my $lmem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_tot'}, TRES_MEM); if ($lmem == INFINITE64) { @@ -124,18 +135,21 @@ for my $step (@{$job->{'steps'}}) { } if ($mem < $lmem) { $mem = $lmem; + # Get the task MaxRSS seen in this step and the AveRSS. $maxmem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_max'}, TRES_MEM); + $avemem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_ave'}, TRES_MEM); + if ($maxmem == INFINITE64) { + $maxmem = 0; + } } } } - my $cput = $tot_cpu_sec + int(($tot_cpu_usec / 1000000) + 0.5); if ($mydebug) { print "Slurm data: JobID ArrayJobID User Group State Clustername Ncpus Nnodes Ntasks Reqmem PerNode Cput Walltime Mem ExitStatus\n"; print "Slurm data: $jobid $array_jobid $user $group $state $clustername $ncpus $nnodes $reqmem $pernode $cput $walltime $mem $exit_status\n\n"; } - print "Job ID: $jobid\n"; if (length $array_jobid) { print "Array Job ID: $array_jobid\n"; @@ -153,19 +167,26 @@ if ($ncpus == 1) { print "Nodes: $nnodes\n"; printf "Cores per node: %d\n", $ncpus/$nnodes; } - if ($state ne "PENDING") { my $corewalltime = $walltime * $ncpus; - my $cpu_eff = ($corewalltime != 0) ? $cput / $corewalltime * 100 : 0.0; - + my $cpu_eff; + if ($corewalltime != 0) { + $cpu_eff = $cput / $corewalltime * 100; + } else { + $cpu_eff = 0.0; + } printf("CPU Utilized: %s\n", time2str($cput)); printf("CPU Efficiency: %.2f%% of %s core-walltime\n", $cpu_eff, time2str($corewalltime)); printf("Job Wall-clock time: %s\n", time2str($walltime)); printf("Memory Utilized: %s\n", kbytes2str($mem)); - - my $allocmem = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_MEM) * 1024; - my $mem_eff = ($allocmem != 0) ? 
$mem / $allocmem * 100 : 0.0; - + my $mem_eff; + my $allocmem = Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, TRES_MEM); + $allocmem = $allocmem * 1024; + if ($allocmem != 0) { + $mem_eff = $mem / $allocmem * 100; + } else { + $mem_eff = 0.0; + } if ($pernode) { printf("Memory Efficiency: %.2f%% of %s (%s\/node)\n", $mem_eff, kbytes2str($allocmem), kbytes2str($allocmem / $nnodes)); } else { @@ -224,7 +245,7 @@ if ($state ne "PENDING") { } else { print "Efficiency not available for jobs in the PENDING state.\n"; } - +# Convert elapsed time to string. sub time2str { my $time = shift; my $days = int($time / 86400); @@ -236,7 +257,7 @@ sub time2str { $days = $days < 1 ? '' : "$days-"; return $days . sprintf("%02s:%02s:%02s", $hours, $minutes, $seconds); } - +# Convert memory to human-readable string. sub kbytes2str { my $kbytes = shift; if ($kbytes == 0) { return sprintf("%.2f %sB", 0.0, 'M'); } From 5231f172fccd415430aee3ac55a842e705191ca7 Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Sun, 30 Nov 2025 20:55:34 -0700 Subject: [PATCH 03/11] declaring missing variable and removing uppercase --- bin/seff | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/seff b/bin/seff index caf8d05..e7761b0 100644 --- a/bin/seff +++ b/bin/seff @@ -120,7 +120,7 @@ my $tot_cpu_sec = 0; my $tot_cpu_usec = 0; my $mem = 0; my $maxmem = 0; - +my $avemem = 0; for my $step (@{$job->{'steps'}}) { $tot_cpu_sec += $step->{'tot_cpu_sec'}; $tot_cpu_usec += $step->{'tot_cpu_usec'}; @@ -209,7 +209,7 @@ if ($state ne "PENDING") { if ($gpu_count > 0) { print "\n──────── GPU Metrics ────────\n"; - print "Number of GPUs: \U${gpu_type} " . 
"x${gpu_count}\n"; + print "Number of GPUs: ${gpu_type} x${gpu_count}\n"; print "NOTE: GPU metric availability and accuracy may vary by GPU type and system configuration.\n"; print " Please refer to our documentation for details.\n"; From edf0ce74f226fb4803998a1441dbcd71b9b85b89 Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Tue, 2 Dec 2025 08:43:37 -0700 Subject: [PATCH 04/11] update the regex for unknown gpus This is for GH200s on Alpine and a40's on Blanca --- bin/seff | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/seff b/bin/seff index e7761b0..3ed378d 100644 --- a/bin/seff +++ b/bin/seff @@ -202,8 +202,8 @@ if ($state ne "PENDING") { chomp($gpu_alloc_str); my ($gpu_type, $gpu_count) = ("", 0); - if ($gpu_alloc_str =~ /gres\/gpu:([a-zA-Z0-9._-]+)=([0-9]+)/) { - $gpu_type = $1; + if ($gpu_alloc_str =~ /gres\/gpu(?::([^=]+))?=([0-9]+)/) { + $gpu_type = $1 // "unknown gpu"; $gpu_count = $2; } From bb4dbc3d33b0bab560a5b8977d2354d10972a4ce Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Tue, 2 Dec 2025 11:12:43 -0700 Subject: [PATCH 05/11] change executable permissions nfor bin directory --- bin/jobstats | 0 bin/levelfs | 0 bin/seff | 0 bin/seff-array | 0 bin/seff-array.py | 0 bin/suacct | 0 bin/suacct_raw | 0 bin/suuser | 0 8 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 bin/jobstats mode change 100644 => 100755 bin/levelfs mode change 100644 => 100755 bin/seff mode change 100644 => 100755 bin/seff-array mode change 100644 => 100755 bin/seff-array.py mode change 100644 => 100755 bin/suacct mode change 100644 => 100755 bin/suacct_raw mode change 100644 => 100755 bin/suuser diff --git a/bin/jobstats b/bin/jobstats old mode 100644 new mode 100755 diff --git a/bin/levelfs b/bin/levelfs old mode 100644 new mode 100755 diff --git a/bin/seff b/bin/seff old mode 100644 new mode 100755 diff --git a/bin/seff-array b/bin/seff-array old mode 100644 new mode 100755 diff --git a/bin/seff-array.py b/bin/seff-array.py old mode 
100644 new mode 100755 diff --git a/bin/suacct b/bin/suacct old mode 100644 new mode 100755 diff --git a/bin/suacct_raw b/bin/suacct_raw old mode 100644 new mode 100755 diff --git a/bin/suuser b/bin/suuser old mode 100644 new mode 100755 From 5afd0eba05f167f0ba1d72f63a6c669269f9fe9f Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Mon, 15 Dec 2025 05:45:37 -0700 Subject: [PATCH 06/11] updating seff-array to source the uv env --- bin/seff-array | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/seff-array b/bin/seff-array index c29fa63..957f8ba 100755 --- a/bin/seff-array +++ b/bin/seff-array @@ -1,6 +1,7 @@ #!/bin/bash # Gathering efficiency statistics for array jobs # -#last modified 2023-12-01 (Hall) +#last modified 2025-12-15 +source /curc/sw/uv_env/seff-array-env/bin/activate python /curc/sw/slurmtools/0.0.1/bin/seff-array.py $@ From 17b87f8cea417c094bbbe103ad9e033dcdeecadc Mon Sep 17 00:00:00 2001 From: Brandon <53541061+b-reyes@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:00:51 -0700 Subject: [PATCH 07/11] Use the Slurm Perl API (#1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * instead of calling sacct use the Slurm perl API to get GPU metrics * add a space above ──────── CPU Metrics ──────── * modify the units displayed in kbytes2str so they reflect the true units e.g. 
iB * Call function to get the GPU type Co-authored-by: mohalkh5 * apply @mohalkh5's suggestion for a function that gets the GPU type Co-authored-by: mohalkh5 --------- Co-authored-by: mohalkh5 --- bin/seff | 146 +++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 104 insertions(+), 42 deletions(-) diff --git a/bin/seff b/bin/seff index 3ed378d..81eccbb 100755 --- a/bin/seff +++ b/bin/seff @@ -11,12 +11,13 @@ use Sys::Hostname; use lib qw(/usr/lib64/perl5); use Slurmdb ':all'; use Slurm ':all'; +use List::Util qw(max); #use Data::Dumper; my $VERSION = "2.2"; -# This script is roughtly equivalent to: -# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j +# The script is roughly equivalent to: +# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TresUsageInMax,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j my %opts; getopts('hvdf:',\%opts); @@ -116,15 +117,29 @@ if ($array_job_id != 0) { } $array_jobid = "${array_job_id}_${array_task_id}"; } + +my %gres_map = get_gres_map($db_conn, $clustername); + my $tot_cpu_sec = 0; my $tot_cpu_usec = 0; my $mem = 0; my $maxmem = 0; my $avemem = 0; +my @gpumem_values; +my @gpuutil_values; for my $step (@{$job->{'steps'}}) { $tot_cpu_sec += $step->{'tot_cpu_sec'}; $tot_cpu_usec += $step->{'tot_cpu_usec'}; + # Grab gpumem and gputil for the the job step and store their values + if (exists $step->{'stats'} && exists $step->{'stats'}{'tres_usage_in_max'} && + exists $gres_map{'gres/gpumem'} && exists $gres_map{'gres/gpuutil'}) { + my $gpumem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_max'}, $gres_map{'gres/gpumem'}); + my $gpuutil = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_max'}, $gres_map{'gres/gpuutil'}); + push @gpumem_values, $gpumem; + push @gpuutil_values, $gpuutil; + } + # TRES_MEM in tres_usage_in_tot is the sum of memory peaks of all tasks. 
if (exists $step->{'stats'} && exists $step->{'stats'}{'tres_usage_in_tot'}) { my $lmem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_tot'}, TRES_MEM); @@ -175,6 +190,8 @@ if ($state ne "PENDING") { } else { $cpu_eff = 0.0; } + + print "\n──────── CPU Metrics ────────\n"; printf("CPU Utilized: %s\n", time2str($cput)); printf("CPU Efficiency: %.2f%% of %s core-walltime\n", $cpu_eff, time2str($corewalltime)); printf("Job Wall-clock time: %s\n", time2str($walltime)); @@ -198,50 +215,37 @@ if ($state ne "PENDING") { } # --- GPU Detection and Stats --- - my $gpu_alloc_str = `sacct -n -p -o AllocTres -X -j $jobid_arg 2>/dev/null`; - chomp($gpu_alloc_str); - my ($gpu_type, $gpu_count) = ("", 0); - - if ($gpu_alloc_str =~ /gres\/gpu(?::([^=]+))?=([0-9]+)/) { - $gpu_type = $1 // "unknown gpu"; - $gpu_count = $2; - } - - if ($gpu_count > 0) { - print "\n──────── GPU Metrics ────────\n"; - print "Number of GPUs: ${gpu_type} x${gpu_count}\n"; - - print "NOTE: GPU metric availability and accuracy may vary by GPU type and system configuration.\n"; - print " Please refer to our documentation for details.\n"; + my $gpucount = exists $gres_map{'gres/gpu'} ? Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, $gres_map{'gres/gpu'}) : 0; + if ($gpucount > 0 && $gpucount != INFINITE64) { - my $gpu_usage_str = `sacct -n -p -o TresUsageInMax --noconvert -j $jobid_arg 2>/dev/null`; - my ($max_gpu_util, $max_gpu_mem) = (undef, undef); + # Get maximum values over the job steps + my $max_gpumem = @gpumem_values ? max(@gpumem_values) : 'N/A'; + my $max_gpuutil = @gpuutil_values ? max(@gpuutil_values) : 'N/A'; - for my $line (split /\n/, $gpu_usage_str) { - my $gpu_util = ($line =~ /gres\/gpuutil=([0-9]+)/) ? $1 : undef; - my $gpu_mem = ($line =~ /gres\/gpumem=([0-9]+)/) ? $1 : undef; - - if (defined $gpu_util) { - $max_gpu_util = (!defined($max_gpu_util) or $gpu_util > $max_gpu_util) - ? 
$gpu_util : $max_gpu_util; - } - if (defined $max_gpu_mem && defined $gpu_mem) { - $max_gpu_mem = ($gpu_mem > $max_gpu_mem) ? $gpu_mem : $max_gpu_mem; - } elsif (defined $gpu_mem) { - $max_gpu_mem = $gpu_mem; - } + # Account for cases where this is an incompatible GPU type e.g. one of the values is equal to 4294967295 + if (($max_gpumem ne 'N/A' && $max_gpumem == INFINITE64) || + ($max_gpuutil ne 'N/A' && $max_gpuutil == INFINITE64)) { + $max_gpumem = 'N/A'; + $max_gpuutil = 'N/A'; } - print "Max GPU Utilization: " . - (defined($max_gpu_util) ? "$max_gpu_util%\n" : "N/A\n"); - if (defined $max_gpu_mem) { - my $max_gpu_mem_gb = sprintf("%.2f", $max_gpu_mem / 1e9); - print "Max GPU Memory Utilized: ${max_gpu_mem_gb} GB\n"; - } else { - print "Max GPU Memory Utilized: N/A\n"; + # convert to kibibytes so it can be used in kbytes2str + if ($max_gpumem ne 'N/A'){ + $max_gpumem = $max_gpumem / 1024; + $max_gpumem = kbytes2str($max_gpumem) } - } + print "\n──────── GPU Metrics ────────\n"; + print "Number of GPUs: ${gpucount}\n"; + # TODO: If we want, we could get the GPU type using one of the other gres/gpu options that are present in the code + # e.g. print $job->{'tres_alloc_str'} . "\n"; +my $gpu_type = get_gpu_type($job->{'tres_alloc_str'}, %gres_map); +print "GPU Type: $gpu_type\n"; + print "NOTE: GPU metric availability may vary by GPU type.\n"; + print " Please refer to our documentation for details.\n"; + print "Max GPU Utilization: " . ($max_gpuutil ne 'N/A' ? "${max_gpuutil}%" : $max_gpuutil) . "\n"; + print "Max GPU Memory Utilized: ${max_gpumem}\n"; + } } else { print "Efficiency not available for jobs in the PENDING state.\n"; } @@ -260,12 +264,12 @@ sub time2str { # Convert memory to human-readable string. 
sub kbytes2str { my $kbytes = shift; - if ($kbytes == 0) { return sprintf("%.2f %sB", 0.0, 'M'); } + if ($kbytes == 0) { return sprintf("%.2f %siB", 0.0, 'M'); } my $mul = 1024; my $exp = int(log($kbytes) / log($mul)); my @pre = qw/ K M G T P E /; my $pre = $pre[$exp]; - return sprintf("%.2f %sB", ($kbytes / pow($mul, $exp)), $pre ? $pre : ""); + return sprintf("%.2f %siB", ($kbytes / pow($mul, $exp)), $pre ? $pre : ""); } sub convert_to_seconds { @@ -280,3 +284,61 @@ sub convert_to_seconds { } return $days*86400 + $hours*3600 + $minutes*60 + $seconds; } + +sub get_gres_map { + my ($db_conn, $clustername) = @_; + + my %hv = (); + my $clusters = Slurmdb::clusters_get($db_conn, \%hv); + my ($target_cluster) = grep { $_->{'name'} eq $clustername } @$clusters; + + return () unless $target_cluster; # Return empty hash if cluster not found + + my %tres_map; + for my $tres_usage (@{$target_cluster->{'accounting_list'}}) { + if (exists $tres_usage->{'tres_rec'}) { + my $tres = $tres_usage->{'tres_rec'}; + + next unless exists $tres->{'name'}; + my $id = $tres->{'id'}; + + next unless exists $tres->{'name'}; + my $name = $tres->{'type'} . '/' . 
$tres->{'name'}; + + if ($name =~ /^gres\//) { + $tres_map{$name} = $id; + } + } + } + + return %tres_map; +} + +sub get_gpu_type { + my ($tres_alloc_str, %gres_map) = @_; + + return "Unknown" unless $tres_alloc_str; + + # Reverse mapping: id -> name + my %id_to_name = reverse %gres_map; + + for my $entry (split ',', $tres_alloc_str) { + + my ($key, $count) = split '=', $entry, 2; + next unless defined $count && $count > 0; + + if (exists $id_to_name{$key}) { + my $name = $id_to_name{$key}; + + my ($type, $gpu_name) = split ':', $name, 2; + return $gpu_name if defined $gpu_name; + } + + my ($gres, $gpu_name) = split ':', $key, 2; + if (defined $gpu_name && $gres eq 'gres/gpu') { + return $gpu_name; + } + } + + return "Unknown"; +} \ No newline at end of file From 8cbccdcb394b7b7290428790650856a5d04e48bb Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Mon, 5 Jan 2026 13:46:44 -0700 Subject: [PATCH 08/11] Incorporated review feedback: - Added link to GPU documentation - Restored missing max memory comparison print line - Updated get_gres_map: renamed tres_map -> gres_map and added explanatory comment - Updated get_gpu_type: * renamed variables for clarity (tres_id, tres_value, gres_id_to_name) * added comment explaining GPU type detection - Removed unused convert_to_seconds function --- bin/seff | 53 +++++++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/bin/seff b/bin/seff index 81eccbb..efcbedb 100755 --- a/bin/seff +++ b/bin/seff @@ -209,6 +209,9 @@ if ($state ne "PENDING") { } else { printf("Memory Efficiency: %.2f%% of %s (%s\/core)\n", $mem_eff, kbytes2str($allocmem), kbytes2str($allocmem / $ncpus)); } + if (($maxmem != 0) && ($maxmem > $avemem)) { + printf("The task which had the largest memory consumption differs by %.02f%% from the average task max memory consumption\n", ($maxmem / $avemem) * 100); + } if ($state eq "RUNNING") { print "WARNING: Efficiency statistics can only be obtained after the 
job has ended as seff tool is based on the accounting database data.\n"; @@ -237,12 +240,11 @@ if ($state ne "PENDING") { print "\n──────── GPU Metrics ────────\n"; print "Number of GPUs: ${gpucount}\n"; - # TODO: If we want, we could get the GPU type using one of the other gres/gpu options that are present in the code - # e.g. print $job->{'tres_alloc_str'} . "\n"; -my $gpu_type = get_gpu_type($job->{'tres_alloc_str'}, %gres_map); -print "GPU Type: $gpu_type\n"; + + my $gpu_type = get_gpu_type($job->{'tres_alloc_str'}, %gres_map); + print "GPU Type: $gpu_type\n"; print "NOTE: GPU metric availability may vary by GPU type.\n"; - print " Please refer to our documentation for details.\n"; + print " Please refer to our documentation for details: https://curc.readthedocs.io/en/latest/compute/monitoring-resources.html#how-can-i-check-memory-and-gpu-utilization-for-my-jobs\n"; print "Max GPU Utilization: " . ($max_gpuutil ne 'N/A' ? "${max_gpuutil}%" : $max_gpuutil) . "\n"; print "Max GPU Memory Utilized: ${max_gpumem}\n"; } @@ -272,19 +274,7 @@ sub kbytes2str { return sprintf("%.2f %siB", ($kbytes / pow($mul, $exp)), $pre ? $pre : ""); } -sub convert_to_seconds { - my $timestamp = shift; - my ($days, $hours, $minutes, $seconds) = (0,0,0,0); - if ($timestamp =~ /(\d+)-(\d+):(\d+):(\d+)/) { - ($days, $hours, $minutes, $seconds) = ($1,$2,$3,$4); - } elsif ($timestamp =~ /(\d+):(\d+):(\d+)/) { - ($hours,$minutes,$seconds) = ($1,$2,$3); - } elsif ($timestamp =~ /(\d+):(\d+)/) { - ($minutes,$seconds) = ($1,$2); - } - return $days*86400 + $hours*3600 + $minutes*60 + $seconds; -} - +# Creates a map of resources for a cluster by going through its accounting list and picking out the resources that are of type "gres/". 
sub get_gres_map { my ($db_conn, $clustername) = @_; @@ -294,7 +284,7 @@ sub get_gres_map { return () unless $target_cluster; # Return empty hash if cluster not found - my %tres_map; + my %gres_map; for my $tres_usage (@{$target_cluster->{'accounting_list'}}) { if (exists $tres_usage->{'tres_rec'}) { my $tres = $tres_usage->{'tres_rec'}; @@ -306,39 +296,34 @@ sub get_gres_map { my $name = $tres->{'type'} . '/' . $tres->{'name'}; if ($name =~ /^gres\//) { - $tres_map{$name} = $id; + $gres_map{$name} = $id; } } } - return %tres_map; + return %gres_map; } +# Returns the GPU type allocated, or "Unknown" if it can’t be determined. sub get_gpu_type { my ($tres_alloc_str, %gres_map) = @_; return "Unknown" unless $tres_alloc_str; + return "Unknown" unless %gres_map; # Reverse mapping: id -> name - my %id_to_name = reverse %gres_map; + my %gres_id_to_name = reverse %gres_map; for my $entry (split ',', $tres_alloc_str) { - my ($key, $count) = split '=', $entry, 2; - next unless defined $count && $count > 0; - - if (exists $id_to_name{$key}) { - my $name = $id_to_name{$key}; + my ($tres_id, $tres_value) = split '=', $entry, 2; + next unless defined $tres_value && $tres_value > 0; + if (exists $gres_id_to_name{$tres_id}) { + my $name = $gres_id_to_name{$tres_id}; my ($type, $gpu_name) = split ':', $name, 2; return $gpu_name if defined $gpu_name; } - - my ($gres, $gpu_name) = split ':', $key, 2; - if (defined $gpu_name && $gres eq 'gres/gpu') { - return $gpu_name; - } } - return "Unknown"; -} \ No newline at end of file +} From c21545d0fd894cdd8fabf054ccfea20bdb81bdca Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Mon, 5 Jan 2026 14:27:27 -0700 Subject: [PATCH 09/11] update seff-array to use env Python, add environment readme, remove .out file --- bin/seff-array | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/seff-array b/bin/seff-array index 957f8ba..2229167 100755 --- a/bin/seff-array +++ b/bin/seff-array @@ -2,6 +2,4 @@ # Gathering efficiency 
statistics for array jobs # #last modified 2025-12-15 -source /curc/sw/uv_env/seff-array-env/bin/activate - -python /curc/sw/slurmtools/0.0.1/bin/seff-array.py $@ +/curc/sw/uv_env/seff-array-env/bin/python /curc/sw/slurmtools/0.0.1/bin/seff-array.py "$@" From dc636fd42f839305338479f14ef4cb59420e1ece Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Mon, 5 Jan 2026 15:07:03 -0700 Subject: [PATCH 10/11] remove explicit interpreter line --- bin/seff-array.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/seff-array.py b/bin/seff-array.py index 220c2c4..508e728 100755 --- a/bin/seff-array.py +++ b/bin/seff-array.py @@ -1,5 +1,3 @@ -#!/gpfs/gibbs/pi/support/software/utilities/bin/python - import argparse import subprocess import sys From 8558b3c8c7c3e84fd6b2f577039a7ca5b0ad2e72 Mon Sep 17 00:00:00 2001 From: mohalkh5 Date: Wed, 7 Jan 2026 11:27:49 -0700 Subject: [PATCH 11/11] updating the documentation url --- bin/seff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/seff b/bin/seff index efcbedb..110d298 100755 --- a/bin/seff +++ b/bin/seff @@ -244,7 +244,7 @@ if ($state ne "PENDING") { my $gpu_type = get_gpu_type($job->{'tres_alloc_str'}, %gres_map); print "GPU Type: $gpu_type\n"; print "NOTE: GPU metric availability may vary by GPU type.\n"; - print " Please refer to our documentation for details: https://curc.readthedocs.io/en/latest/compute/monitoring-resources.html#how-can-i-check-memory-and-gpu-utilization-for-my-jobs\n"; + print " Please refer to our documentation for details: https://curc.readthedocs.io/en/latest/getting_started/faq.html#why-am-i-getting-unexpected-results-for-my-gpu-memory-or-utilization-metrics\n"; print "Max GPU Utilization: " . ($max_gpuutil ne 'N/A' ? "${max_gpuutil}%" : $max_gpuutil) . "\n"; print "Max GPU Memory Utilized: ${max_gpumem}\n"; }