Skip to content

Commit 2ec2345

Browse files
committed
New dcgm-exporter update with visualizers
We provide a new `dcgm-exporter` which is Python3 based, along with updated visualizers and a container example for DCGM.
1 parent df76133 commit 2ec2345

File tree

14 files changed

+443
-104
lines changed

14 files changed

+443
-104
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
<?xml version="1.0" encoding="utf-8"?>
2+
<service>
3+
<short>pbench-dcgm-exporter</short>
4+
<description>Pbench Agent Prometheus dcgm-exporter</description>
5+
<port protocol="tcp" port="9400"/>
6+
</service>
7+
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# NOTE: Must be run with --privileged
2+
# RECOMMENDED: Use with the fedora image variants for direct compatibility
3+
FROM pbench-agent-tool-meister-{{ distro }}:{{ tag }}
4+
5+
RUN {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y 'dnf-command(config-manager)' && \
6+
{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/{{ distro.split("-")|join("") }}/x86_64/cuda-{{ distro.split("-")|join("") }}.repo && \
7+
{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} clean expire-cache && \
8+
{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y nvidia-driver-cuda nvidia-modprobe datacenter-gpu-manager-2.1.4 golang && \
9+
git clone https://github.com/NVIDIA/gpu-monitoring-tools.git && \
10+
# Chain with && so a failed cd or tag checkout aborts the build instead of compiling an unpinned tree.
(cd gpu-monitoring-tools && git checkout tags/2.1.2 -b build && make binary install) && \
11+
{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} -y clean all && \
12+
rm -rf /var/cache/{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %}
13+
14+
ENV NVIDIA_DISABLE_REQUIRE="true" \
15+
NVIDIA_VISIBLE_DEVICES=all
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# {{ distro }} pbench-agent-tool-meister image
22
FROM pbench-agent-tools-{{ distro }}:{{ tag }}
33

4-
# Port 8000 should be the optional dcgm tool, 9100 the optional node_exporter
4+
# Port 9400 is for the optional dcgm tool, 9100 for the optional node_exporter
55
# tool, and 55677 the pcp (pmcd) tool.
6-
EXPOSE 8000 9100 55677
6+
EXPOSE 9100 9400 55677
77
ENTRYPOINT [ "/opt/pbench-agent/util-scripts/tool-meister/tool-meister-ep" ]

agent/containers/images/visualizers/combo.json

Lines changed: 190 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@
123123
"steppedLine": false,
124124
"targets": [
125125
{
126-
"expr": "dcgm_gpu_temp",
126+
"expr": "DCGM_FI_DEV_GPU_TEMP",
127127
"format": "time_series",
128128
"instant": false,
129129
"interval": "",
@@ -227,7 +227,7 @@
227227
"pluginVersion": "7.1.2",
228228
"targets": [
229229
{
230-
"expr": "avg(dcgm_gpu_temp)",
230+
"expr": "avg(DCGM_FI_DEV_GPU_TEMP)",
231231
"interval": "",
232232
"legendFormat": "",
233233
"refId": "A"
@@ -286,7 +286,7 @@
286286
"steppedLine": false,
287287
"targets": [
288288
{
289-
"expr": "dcgm_power_usage",
289+
"expr": "DCGM_FI_DEV_POWER_USAGE",
290290
"interval": "",
291291
"legendFormat": "GPU {{gpu}}",
292292
"refId": "A"
@@ -408,7 +408,7 @@
408408
"pluginVersion": "7.1.2",
409409
"targets": [
410410
{
411-
"expr": "sum(dcgm_power_usage)",
411+
"expr": "sum(DCGM_FI_DEV_POWER_USAGE)",
412412
"instant": true,
413413
"interval": "",
414414
"legendFormat": "",
@@ -471,7 +471,7 @@
471471
"steppedLine": false,
472472
"targets": [
473473
{
474-
"expr": "dcgm_sm_clock",
474+
"expr": "DCGM_FI_DEV_SM_CLOCK",
475475
"format": "time_series",
476476
"instant": false,
477477
"interval": "",
@@ -523,6 +523,97 @@
523523
"alignLevel": null
524524
}
525525
},
526+
{
527+
"aliasColors": {},
528+
"bars": false,
529+
"dashLength": 10,
530+
"dashes": false,
531+
"datasource": "${DS_PROMETHEUS}",
532+
"fill": 1,
533+
"fillGradient": 0,
534+
"gridPos": {
535+
"h": 8,
536+
"w": 12,
537+
"x": 12,
538+
"y": 16
539+
},
540+
"hiddenSeries": false,
541+
"id": 4,
542+
"legend": {
543+
"alignAsTable": true,
544+
"avg": true,
545+
"current": true,
546+
"max": true,
547+
"min": false,
548+
"rightSide": true,
549+
"show": true,
550+
"total": false,
551+
"values": true
552+
},
553+
"lines": true,
554+
"linewidth": 2,
555+
"nullPointMode": "null",
556+
"options": {
557+
"dataLinks": []
558+
},
559+
"percentage": false,
560+
"pointradius": 2,
561+
"points": false,
562+
"renderer": "flot",
563+
"seriesOverrides": [],
564+
"spaceLength": 10,
565+
"stack": false,
566+
"steppedLine": false,
567+
"targets": [
568+
{
569+
"expr": "DCGM_FI_DEV_MEM_CLOCK",
570+
"interval": "",
571+
"legendFormat": "GPU {{gpu}}",
572+
"refId": "A"
573+
}
574+
],
575+
"thresholds": [],
576+
"timeFrom": null,
577+
"timeRegions": [],
578+
"timeShift": null,
579+
"interval": "3",
580+
"title": "GPU Memory Clocks",
581+
"tooltip": {
582+
"shared": true,
583+
"sort": 0,
584+
"value_type": "individual"
585+
},
586+
"type": "graph",
587+
"xaxis": {
588+
"buckets": null,
589+
"mode": "time",
590+
"name": null,
591+
"show": true,
592+
"values": []
593+
},
594+
"yaxes": [
595+
{
596+
"format": "hertz",
597+
"label": null,
598+
"logBase": 1,
599+
"max": null,
600+
"min": "0",
601+
"show": true
602+
},
603+
{
604+
"format": "short",
605+
"label": null,
606+
"logBase": 1,
607+
"max": null,
608+
"min": null,
609+
"show": true
610+
}
611+
],
612+
"yaxis": {
613+
"align": false,
614+
"alignLevel": null
615+
}
616+
},
526617
{
527618
"aliasColors": {},
528619
"bars": false,
@@ -570,7 +661,7 @@
570661
"steppedLine": false,
571662
"targets": [
572663
{
573-
"expr": "dcgm_gpu_utilization",
664+
"expr": "DCGM_FI_DEV_GPU_UTIL",
574665
"interval": "",
575666
"legendFormat": "GPU {{gpu}}",
576667
"refId": "A"
@@ -618,6 +709,97 @@
618709
"alignLevel": null
619710
}
620711
},
712+
{
713+
"aliasColors": {},
714+
"bars": false,
715+
"dashLength": 10,
716+
"dashes": false,
717+
"datasource": "${DS_PROMETHEUS}",
718+
"fill": 1,
719+
"fillGradient": 0,
720+
"gridPos": {
721+
"h": 8,
722+
"w": 12,
723+
"x": 12,
724+
"y": 24
725+
},
726+
"hiddenSeries": false,
727+
"id": 8,
728+
"legend": {
729+
"alignAsTable": true,
730+
"avg": true,
731+
"current": true,
732+
"max": true,
733+
"min": false,
734+
"rightSide": true,
735+
"show": true,
736+
"total": false,
737+
"values": true
738+
},
739+
"lines": true,
740+
"linewidth": 2,
741+
"nullPointMode": "null",
742+
"options": {
743+
"dataLinks": []
744+
},
745+
"percentage": false,
746+
"pointradius": 2,
747+
"points": false,
748+
"renderer": "flot",
749+
"seriesOverrides": [],
750+
"spaceLength": 10,
751+
"stack": false,
752+
"steppedLine": false,
753+
"targets": [
754+
{
755+
"expr": "DCGM_FI_DEV_MEM_COPY_UTIL",
756+
"interval": "",
757+
"legendFormat": "GPU {{gpu}}",
758+
"refId": "A"
759+
}
760+
],
761+
"thresholds": [],
762+
"timeFrom": null,
763+
"timeRegions": [],
764+
"timeShift": null,
765+
"interval": 3,
766+
"title": "GPU Mem Cpy Utilization",
767+
"tooltip": {
768+
"shared": true,
769+
"sort": 0,
770+
"value_type": "cumulative"
771+
},
772+
"type": "graph",
773+
"xaxis": {
774+
"buckets": null,
775+
"mode": "time",
776+
"name": null,
777+
"show": true,
778+
"values": []
779+
},
780+
"yaxes": [
781+
{
782+
"format": "percent",
783+
"label": null,
784+
"logBase": 1,
785+
"max": "100",
786+
"min": "0",
787+
"show": true
788+
},
789+
{
790+
"format": "short",
791+
"label": null,
792+
"logBase": 1,
793+
"max": null,
794+
"min": null,
795+
"show": true
796+
}
797+
],
798+
"yaxis": {
799+
"align": false,
800+
"alignLevel": null
801+
}
802+
},
621803
{
622804
"aliasColors": {},
623805
"bars": false,
@@ -664,7 +846,7 @@
664846
"steppedLine": false,
665847
"targets": [
666848
{
667-
"expr": "dcgm_fb_used",
849+
"expr": "DCGM_FI_DEV_FB_USED",
668850
"interval": "",
669851
"legendFormat": "GPU {{gpu}}",
670852
"refId": "A"
@@ -759,7 +941,7 @@
759941
"steppedLine": false,
760942
"targets": [
761943
{
762-
"expr": "dcgm_fb_free",
944+
"expr": "DCGM_FI_DEV_FB_FREE",
763945
"interval": "",
764946
"legendFormat": "GPU {{gpu}}",
765947
"refId": "A"

0 commit comments

Comments
 (0)