Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified bin/jobstats
100644 → 100755
Empty file.
Empty file modified bin/levelfs
100644 → 100755
Empty file.
154 changes: 132 additions & 22 deletions bin/seff
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ use Sys::Hostname;
use lib qw(/usr/lib64/perl5);
use Slurmdb ':all';
use Slurm ':all';
use List::Util qw(max);
#use Data::Dumper;

my $VERSION = "2.1";
my $VERSION = "2.2";

# This script is roughtly equivalent to:
# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j <job_id>
# The script is roughly equivalent to:
# sacct -P -n -a --format JobID,User,Group,State,Cluster,AllocCPUS,REQMEM,TresUsageInMax,TotalCPU,Elapsed,MaxRSS,ExitCode,NNodes,NTasks -j <job_id>

my %opts;
getopts('hvdf:',\%opts);
Expand Down Expand Up @@ -78,7 +79,7 @@ if (exists $job->{'alloc_cpus'}) {
}
# Check for missing number of cpus
if (($ncpus == INFINITE64) || ($ncpus == 0)) {
$ncpus = 1;
$ncpus = 1;
}

my $nnodes = 1;
Expand All @@ -91,13 +92,13 @@ if (exists $job->{'alloc_nodes'}) {
}
# Check for missing number of nodes.
if (($nnodes == INFINITE64) || ($nnodes == 0)) {
$nnodes = 1;
$nnodes = 1;
}

my $pernode = 1;
if ($job->{'req_mem'} & MEM_PER_CPU) {
$pernode = 0;
}
$pernode = 0;
}
my $reqmem = Slurmdb::find_tres_count_in_string($job->{'tres_req_str'}, TRES_MEM);
$reqmem = $reqmem * 1024;

Expand All @@ -116,22 +117,36 @@ if ($array_job_id != 0) {
}
$array_jobid = "${array_job_id}_${array_task_id}";
}

my %gres_map = get_gres_map($db_conn, $clustername);

my $tot_cpu_sec = 0;
my $tot_cpu_usec = 0;
my $mem = 0;
my $maxmem = 0;
my $avemem = 0;
my @gpumem_values;
my @gpuutil_values;
for my $step (@{$job->{'steps'}}) {
$tot_cpu_sec += $step->{'tot_cpu_sec'};
$tot_cpu_usec += $step->{'tot_cpu_usec'};

# Grab gpumem and gpuutil for the job step and store their values
if (exists $step->{'stats'} && exists $step->{'stats'}{'tres_usage_in_max'} &&
exists $gres_map{'gres/gpumem'} && exists $gres_map{'gres/gpuutil'}) {
my $gpumem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_max'}, $gres_map{'gres/gpumem'});
my $gpuutil = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_max'}, $gres_map{'gres/gpuutil'});
push @gpumem_values, $gpumem;
push @gpuutil_values, $gpuutil;
}

# TRES_MEM in tres_usage_in_tot is the sum of memory peaks of all tasks.
if (exists $step->{'stats'} && exists $step->{'stats'}{'tres_usage_in_tot'}) {
my $lmem = Slurmdb::find_tres_count_in_string($step->{'stats'}{'tres_usage_in_tot'}, TRES_MEM);
if ($lmem == INFINITE64) {
$lmem = 0;
$lmem = 0;
} else {
$lmem = $lmem / 1024;
$lmem = $lmem / 1024;
}
if ($mem < $lmem) {
$mem = $lmem;
Expand Down Expand Up @@ -175,6 +190,8 @@ if ($state ne "PENDING") {
} else {
$cpu_eff = 0.0;
}

print "\n──────── CPU Metrics ────────\n";
printf("CPU Utilized: %s\n", time2str($cput));
printf("CPU Efficiency: %.2f%% of %s core-walltime\n", $cpu_eff, time2str($corewalltime));
printf("Job Wall-clock time: %s\n", time2str($walltime));
Expand All @@ -192,17 +209,46 @@ if ($state ne "PENDING") {
} else {
printf("Memory Efficiency: %.2f%% of %s (%s\/core)\n", $mem_eff, kbytes2str($allocmem), kbytes2str($allocmem / $ncpus));
}
if (($maxmem != 0) && ($maxmem > $avemem)) {
printf("The task which had the largest memory consumption differs by %.02f%% from the average task max memory consumption\n", ($maxmem / $avemem) * 100);
}

if ($state eq "RUNNING") {
print "WARNING: Efficiency statistics can only be obtained after the job has ended as seff tool is based on the accounting database data.\n";
}

# --- GPU Detection and Stats ---
my $gpucount = exists $gres_map{'gres/gpu'} ? Slurmdb::find_tres_count_in_string($job->{'tres_alloc_str'}, $gres_map{'gres/gpu'}) : 0;
if ($gpucount > 0 && $gpucount != INFINITE64) {

# Get maximum values over the job steps
my $max_gpumem = @gpumem_values ? max(@gpumem_values) : 'N/A';
my $max_gpuutil = @gpuutil_values ? max(@gpuutil_values) : 'N/A';

# Account for cases where this is an incompatible GPU type e.g. one of the values is equal to 4294967295
if (($max_gpumem ne 'N/A' && $max_gpumem == INFINITE64) ||
($max_gpuutil ne 'N/A' && $max_gpuutil == INFINITE64)) {
$max_gpumem = 'N/A';
$max_gpuutil = 'N/A';
}

# convert to kibibytes so it can be used in kbytes2str
if ($max_gpumem ne 'N/A'){
$max_gpumem = $max_gpumem / 1024;
$max_gpumem = kbytes2str($max_gpumem)
}

print "\n──────── GPU Metrics ────────\n";
print "Number of GPUs: ${gpucount}\n";
# TODO: If we want, we could get the GPU type using one of the other gres/gpu options that are present in the code
# e.g. print $job->{'tres_alloc_str'} . "\n";
my $gpu_type = get_gpu_type($job->{'tres_alloc_str'}, %gres_map);
print "GPU Type: $gpu_type\n";
print "NOTE: GPU metric availability may vary by GPU type.\n";
print " Please refer to our documentation for details.\n";
print "Max GPU Utilization: " . ($max_gpuutil ne 'N/A' ? "${max_gpuutil}%" : $max_gpuutil) . "\n";
print "Max GPU Memory Utilized: ${max_gpumem}\n";
}
} else {
print "Efficiency not available for jobs in the PENDING state.\n";
}

# Convert elapsed time to string.
sub time2str {
my $time = shift;
Expand All @@ -212,23 +258,87 @@ sub time2str {
$time -= ($hours * 3600);
my $minutes = int($time / 60);
my $seconds = $time % 60;

$days = $days < 1 ? '' : "$days-";
$time = $days . sprintf("%02s:%02s:%02s", $hours, $minutes, $seconds);
return $time;
return $days . sprintf("%02s:%02s:%02s", $hours, $minutes, $seconds);
}

# Convert a memory amount given in kibibytes to a human-readable string.
#
# Parameters:
#   $kbytes - amount of memory in KiB (may be fractional)
#
# Returns a string such as "1.50 MiB". Binary (1024-based) prefixes are used
# on every path, including the zero case, so the unit label is consistent.
sub kbytes2str {
    my $kbytes = shift;

    # Zero is reported in MiB, matching the unit most non-trivial jobs show.
    if ($kbytes == 0) {
        return sprintf("%.2f %siB", 0.0, 'M');
    }

    my $mul = 1024;
    my @pre = qw/ K M G T P E /;

    # Scale by repeated division rather than log(): this avoids rounding
    # errors at exact powers of 1024, never yields a negative index for
    # values below 1 KiB, and cannot die on a negative input.
    my $value = $kbytes;
    my $exp   = 0;
    while (abs($value) >= $mul && $exp < $#pre) {
        $value /= $mul;
        $exp++;
    }

    return sprintf("%.2f %siB", $value, $pre[$exp]);
}

# Convert a duration string to a number of seconds.
#
# The string is tried against three shapes in order: "D-HH:MM:SS",
# "HH:MM:SS", then "MM:SS". The first shape found anywhere in the string
# wins; if none is found the result is 0.
sub convert_to_seconds {
    my ($stamp) = @_;

    # Seconds per field, aligned to the right: days, hours, minutes, seconds.
    my @unit_seconds = (86400, 3600, 60, 1);

    my @shapes = (
        qr/(\d+)-(\d+):(\d+):(\d+)/,
        qr/(\d+):(\d+):(\d+)/,
        qr/(\d+):(\d+)/,
    );

    for my $shape (@shapes) {
        my @fields = $stamp =~ $shape;
        next unless @fields;

        # Shorter shapes omit the leading (largest) units.
        my $skip  = @unit_seconds - @fields;
        my $total = 0;
        for my $i (0 .. $#fields) {
            $total += $fields[$i] * $unit_seconds[$i + $skip];
        }
        return $total;
    }

    return 0;
}

# Build a map from GRES TRES names ("gres/<name>") to their TRES ids for one
# cluster.
#
# Parameters:
#   $db_conn     - connection handle passed through to Slurmdb::clusters_get
#   $clustername - name of the cluster whose accounting_list is scanned
#
# Returns a hash (as a flat list) of "<type>/<name>" => id, restricted to
# entries whose combined name starts with "gres/". Returns an empty list when
# the cluster query yields nothing or the named cluster is not in the result.
sub get_gres_map {
    my ($db_conn, $clustername) = @_;

    my %hv = ();
    my $clusters = Slurmdb::clusters_get($db_conn, \%hv);

    # Do not dereference a failed/empty query result.
    return () unless $clusters;

    my ($target_cluster) =
        grep { defined $_->{'name'} && $_->{'name'} eq $clustername } @$clusters;

    return () unless $target_cluster;    # Cluster not found

    my $records = $target_cluster->{'accounting_list'} || [];

    my %tres_map;
    for my $tres_usage (@$records) {
        next unless exists $tres_usage->{'tres_rec'};
        my $tres = $tres_usage->{'tres_rec'};

        # Every field used below must be present; a record without an id
        # would otherwise be stored with an undefined value.
        next unless defined $tres->{'id'};
        next unless defined $tres->{'type'} && defined $tres->{'name'};

        my $name = $tres->{'type'} . '/' . $tres->{'name'};
        $tres_map{$name} = $tres->{'id'} if $name =~ m{^gres/};
    }

    return %tres_map;
}

# Determine the GPU model name from a job's allocated-TRES string.
#
# Parameters:
#   $tres_alloc_str - comma-separated "key=count" pairs; a key may be a TRES
#                     id found in %gres_map or a literal "gres/gpu:<model>"
#   %gres_map       - "gres/<name>" => TRES id, as built by get_gres_map
#
# Returns the "<model>" part of the first allocated (count > 0) entry that
# resolves to "gres/gpu:<model>", or "Unknown" when there is none.
sub get_gpu_type {
    my ($tres_alloc_str, %gres_map) = @_;

    return "Unknown" unless $tres_alloc_str;

    # Reverse mapping: id -> name. Built by hand so entries with an
    # undefined id are skipped instead of triggering warnings.
    my %id_to_name;
    for my $gres_name (keys %gres_map) {
        my $id = $gres_map{$gres_name};
        $id_to_name{$id} = $gres_name if defined $id;
    }

    for my $entry (split ',', $tres_alloc_str) {

        my ($key, $count) = split '=', $entry, 2;
        next unless defined $count && $count > 0;

        # Try the name the id maps to first, then the raw key itself.
        my @candidates = exists $id_to_name{$key} ? ($id_to_name{$key}, $key) : ($key);

        for my $candidate (@candidates) {
            my ($gres, $gpu_name) = split ':', $candidate, 2;

            # Only a typed GPU entry names a GPU model; other typed GRES
            # (e.g. "gres/nic:mlx5") must not be reported as the GPU type.
            if (defined $gpu_name && $gres eq 'gres/gpu') {
                return $gpu_name;
            }
        }
    }

    return "Unknown";
}
3 changes: 2 additions & 1 deletion bin/seff-array
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/bash
# Gathering efficiency statistics for array jobs
#
#last modified 2023-12-01 (Hall)
#last modified 2025-12-15
# Activate the seff-array virtual environment before invoking the tool.
source /curc/sw/uv_env/seff-array-env/bin/activate

# Forward every command-line argument unchanged. "$@" must be quoted so that
# arguments containing whitespace or glob characters are not re-split or
# expanded by the shell.
python /curc/sw/slurmtools/0.0.1/bin/seff-array.py "$@"
Empty file modified bin/seff-array.py
100644 → 100755
Empty file.
Empty file modified bin/suacct
100644 → 100755
Empty file.
Empty file modified bin/suacct_raw
100644 → 100755
Empty file.
Empty file modified bin/suuser
100644 → 100755
Empty file.