diff --git a/agent/config/firewalld/pbench-dcgm-exporter.xml b/agent/config/firewalld/pbench-dcgm-exporter.xml new file mode 100644 index 0000000000..5ae4e1727b --- /dev/null +++ b/agent/config/firewalld/pbench-dcgm-exporter.xml @@ -0,0 +1,7 @@ +<?xml version="1.0" encoding="utf-8"?> +<service> +  <short>pbench-dcgm-exporter</short> +  <description>Pbench Agent Prometheus dcgm-exporter</description> +  <port protocol="tcp" port="9400"/> +</service>
diff --git a/agent/config/firewalld/pbench-redis.xml b/agent/config/firewalld/pbench-redis.xml index 7b3f97e0b6..22f6f0d169 100644 --- a/agent/config/firewalld/pbench-redis.xml +++ b/agent/config/firewalld/pbench-redis.xml @@ -1,6 +1,6 @@ <?xml version="1.0" encoding="utf-8"?> <service> -  <short>pbench-tool-data-sink</short> -  <description>Pbench Agent Tool Data Sink</description> -  <port protocol="tcp" port="8080"/> +  <short>pbench-redis</short> +  <description>Pbench Agent Redis Server</description> +  <port protocol="tcp" port="17001"/> </service>
diff --git a/agent/config/firewalld/pbench-tool-data-sink.xml b/agent/config/firewalld/pbench-tool-data-sink.xml index 22f6f0d169..7b3f97e0b6 100644 --- a/agent/config/firewalld/pbench-tool-data-sink.xml +++ b/agent/config/firewalld/pbench-tool-data-sink.xml @@ -1,6 +1,6 @@ <?xml version="1.0" encoding="utf-8"?> <service> -  <short>pbench-redis</short> -  <description>Pbench Agent Redis Server</description> -  <port protocol="tcp" port="17001"/> +  <short>pbench-tool-data-sink</short> +  <description>Pbench Agent Tool Data Sink</description> +  <port protocol="tcp" port="8080"/> </service>
diff --git a/agent/containers/images/Dockerfile.base.j2 b/agent/containers/images/Dockerfile.base.j2 index 436f0bc40a..79c73cca6f 100644 --- a/agent/containers/images/Dockerfile.base.j2 +++ b/agent/containers/images/Dockerfile.base.j2 @@ -13,9 +13,9 @@ RUN \ {{ pkgmgr }} module -y disable python38 && \ {% endif %} {% if distro_image.startswith('centos') %} - {{ pkgmgr }} install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ distro_image.split(':', 1)[1] }}.noarch.rpm && \ + {{ pkgmgr }} install -y --setopt=tsflags=nodocs https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ distro_image.split(':', 1)[1] }}.noarch.rpm && \ {% endif %} - {{ pkgmgr }} install -y {% if distro_image == 'centos:8' %}--enablerepo powertools glibc-locale-source {% endif %} pbench-agent && \ + {{ pkgmgr }} install -y --setopt=tsflags=nodocs {% if distro_image == 'centos:8' %}--enablerepo powertools glibc-locale-source {% endif %} pbench-agent && \ {% if distro_image == 'centos:8' %} localedef -i en_US -f UTF-8 en_US.UTF-8 && \ {% endif %}
diff --git a/agent/containers/images/Dockerfile.dcgmEX.j2 b/agent/containers/images/Dockerfile.dcgmEX.j2 new file mode 100644 index 0000000000..f307d45d3b --- /dev/null +++ b/agent/containers/images/Dockerfile.dcgmEX.j2 @@ -0,0 +1,15 @@ +# NOTE: Must be run with --privileged +# RECOMMENDED: Use with the fedora image variants for direct compatibility +FROM pbench-agent-tool-meister-{{ distro }}:{{ tag }} + +RUN {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y 'dnf-command(config-manager)' && \ + {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/{{ distro.split("-")|join("") }}/x86_64/cuda-{{ distro.split("-")|join("") }}.repo && \ + {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} clean expire-cache && \ + {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y nvidia-driver-cuda nvidia-modprobe datacenter-gpu-manager-2.1.4 golang && \ + git clone https://github.com/NVIDIA/gpu-monitoring-tools.git && \ + (cd gpu-monitoring-tools; git checkout tags/2.1.2 -b build; make binary install) && \ + {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} -y clean all && \ + rm -rf /var/cache/{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} + +ENV NVIDIA_DISABLE_REQUIRE="true" \ + NVIDIA_VISIBLE_DEVICES=all
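The new firewalld service and the dcgmEX image are meant to be used together, roughly along these lines; the podman invocation, image name, and tag below are illustrative, not part of this change:

    # Open the firewalld service defined above so Prometheus can scrape
    # the exporter.
    sudo firewall-cmd --permanent --add-service=pbench-dcgm-exporter
    sudo firewall-cmd --reload

    # Run a rendered dcgmEX image; per the NOTE above it must be privileged
    # so DCGM can reach the GPUs (image name and tag are hypothetical).
    sudo podman run --privileged -d -p 9400:9400 \
        pbench-agent-dcgm-exporter-fedora-33:latest

    # dcgm-exporter publishes its DCGM_FI_DEV_* metrics on port 9400.
    curl -s http://localhost:9400/metrics | grep DCGM_FI_DEV_GPU_TEMP

diff --git a/agent/containers/images/Dockerfile.layered.j2 b/agent/containers/images/Dockerfile.layered.j2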
index b7b4685548..a9d91ff4c5 100644 --- a/agent/containers/images/Dockerfile.layered.j2 +++ b/agent/containers/images/Dockerfile.layered.j2 @@ -1,10 +1,14 @@ # {{ distro }} pbench-agent {{ kind }} image FROM pbench-agent-base-{{ distro }}:{{ tag }} +{% if kind in ('tools', 'all') %} +COPY ./{{ distro }}-pcp.repo /etc/yum.repos.d/pcp.repo +{% endif %} + # Install all the RPMs required for this image. # # FIXME: this is not exhaustive, it does not include RPMs to support # Kubernetes or RHV environments. -RUN {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y {% if distro == 'centos-8' %}--enablerepo powertools {% endif %}{{ rpms }} && \ +RUN {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y --setopt=tsflags=nodocs {% if distro == 'centos-8' %}--enablerepo powertools {% endif %}{% if kind in ('tools', 'all') %}--enablerepo pcp {% endif %}{{ rpms }} && \ {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} -y clean all && \ rm -rf /var/cache/{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %}
diff --git a/agent/containers/images/Dockerfile.tds.j2 b/agent/containers/images/Dockerfile.tds.j2 new file mode 100644 index 0000000000..f59e620117 --- /dev/null +++ b/agent/containers/images/Dockerfile.tds.j2 @@ -0,0 +1,9 @@ +# {{ distro }} pbench-agent-tool-data-sink image +FROM pbench-agent-tools-{{ distro }}:{{ tag }} + +VOLUME /var/lib/pbench-agent + +# Port 8080 should be the Bottle server, 9090 the optional Prometheus server, +# and 44566 the optional pmproxy server. +EXPOSE 8080 9090 44566 +ENTRYPOINT [ "/opt/pbench-agent/util-scripts/tool-meister/tool-data-sink-ep" ]
diff --git a/agent/containers/images/Dockerfile.tm.j2 b/agent/containers/images/Dockerfile.tm.j2 new file mode 100644 index 0000000000..95f9fde6f4 --- /dev/null +++ b/agent/containers/images/Dockerfile.tm.j2 @@ -0,0 +1,7 @@ +# {{ distro }} pbench-agent-tool-meister image +FROM pbench-agent-tools-{{ distro }}:{{ tag }} + +# Port 9400 should be the optional dcgm tool, 9100 the optional node_exporter +# tool, and 55677 the pcp (pmcd) tool. +EXPOSE 9100 9400 55677 +ENTRYPOINT [ "/opt/pbench-agent/util-scripts/tool-meister/tool-meister-ep" ]
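A running pair of these images can be spot-checked by probing the ports called out above; the localhost addresses, and the assumption that the optional collectors were registered, are illustrative:

    # Tool Data Sink (Dockerfile.tds.j2): Bottle server on 8080, optional
    # Prometheus server on 9090.
    curl -s -o /dev/null -w '%{http_code}\n' http://localhost:8080/
    curl -s http://localhost:9090/-/ready

    # Tool Meister (Dockerfile.tm.j2): optional node_exporter on 9100 and
    # dcgm-exporter on 9400; 55677 carries pmcd's own protocol, not HTTP.
    curl -s http://localhost:9100/metrics | head -n 5
    curl -s http://localhost:9400/metrics | head -n 5

diff --git a/agent/containers/images/Makefile b/agent/containers/images/Makefile index 0d9e79abdf..c1d053fe62 100644 --- a/agent/containers/images/Makefile +++ b/agent/containers/images/Makefile @@ -27,9 +27,43 @@ IMAGE_REPO = docker://quay.io/pbench # Not intended to be overridden with an environment variable. _REPO_TEMPLATE = ../../ansible/pbench/agent/roles/pbench_repo_install/templates/etc/yum.repos.d/pbench.repo.j2 +# NOTE: Currently we require v5.2.2 of the PCP RPMs because v5.2.3 +# prevents us from integrating with Grafana; see PCP Issue #1183, +# https://github.com/performancecopilot/pcp/issues/1183. +# NOTE: We also have to enumerate so many RPMs because the CentOS 7 RPM +# dependency resolver does not properly resolve them all to the same version. +# Once we no longer have to use v5.2.2, we can just list three RPMs: +# pcp-zeroconf, pcp-system-tools, and pcp-gui. +_PCP_RPMS = \ + pcp-doc-5.2.2 \ + pcp-gui-5.2.2 \ + pcp-pmda-dm-5.2.2 \ + pcp-pmda-nfsclient-5.2.2 \ + pcp-pmda-openmetrics-5.2.2 \ + pcp-system-tools-5.2.2 \ + pcp-zeroconf-5.2.2 \ + python3-pcp-5.2.2 # The list of RPMs which provide the various tools we offer. # Not intended to be overridden with an environment variable.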
-_TOOL_RPMS = prometheus2 node_exporter blktrace bpftrace cpupowerutils golang kernel-tools libvirt-client nmap-ncat numactl pbench-sysstat pcp-system-tools perf procps-ng strace tcpdump trace-cmd +# Please keep the lists sorted. +_TOOL_RPMS = \ + blktrace \ + bpftrace \ + cpupowerutils \ + golang \ + kernel-tools \ + libvirt-client \ + nmap-ncat \ + node_exporter \ + numactl \ + pbench-sysstat \ + ${_PCP_RPMS} \ + perf \ + procps-ng \ + prometheus2 \ + strace \ + tcpdump \ + trace-cmd # The list of RPMs for the default workloads we offer. # Not intended to be overridden with an environment variable. @@ -41,8 +75,38 @@ _ALL_RPMS = ${_TOOL_RPMS} ${_WORKLOAD_RPMS} # By default we only build images for the following distributions: _DISTROS = centos-8 centos-7 fedora-33 fedora-32 +# By default we now also build the Tool Data Sink and Tool Meister images all: all-tags $(foreach distro, ${_DISTROS}, ${distro}-all-tagged) +tds: all-tags $(foreach distro, ${_DISTROS}, ${distro}-tool-data-sink-tagged) + +tm: all-tags $(foreach distro, ${_DISTROS}, ${distro}-tool-meister-tagged) + +# We also offer per-distribution targets +centos-8: all-tags centos-8-all-tagged + +centos-7: all-tags centos-7-all-tagged + +fedora-33: all-tags fedora-33-all-tagged + +fedora-32: all-tags fedora-32-all-tagged + +centos-8-tds: all-tags centos-8-tool-data-sink-tagged + +centos-7-tds: all-tags centos-7-tool-data-sink-tagged + +fedora-33-tds: all-tags fedora-33-tool-data-sink-tagged + +fedora-32-tds: all-tags fedora-32-tool-data-sink-tagged + +centos-8-tm: all-tags centos-8-tool-meister-tagged + +centos-7-tm: all-tags centos-7-tool-meister-tagged + +fedora-33-tm: all-tags fedora-33-tool-meister-tagged + +fedora-32-tm: all-tags fedora-32-tool-meister-tagged +
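With these targets, a build can be scoped to one image flavor or one distribution; for example, run from agent/containers/images:

    make                 # every image flavor for every distribution
    make tm              # only the Tool Meister images, all distributions
    make centos-8-tm     # only the CentOS 8 Tool Meister image
    make fedora-33-tds   # only the Fedora 33 Tool Data Sink image

#+ # Tagging targets #- @@ -97,16 +161,34 @@ push-major-minor: $(foreach distro, ${_DISTROS}, ${distro}-push-major-minor) %-all-tagged: %-all %-tags.lis ./apply-tags pbench-agent-all-$* $*-tags.lis -%-all: %-tools-tagged %-workloads-tagged %-all.Dockerfile +%-all: %-workloads-tagged %-tool-data-sink-tagged %-tool-meister-tagged %-all.Dockerfile ./build-image all $* $*-tags.lis %-all.Dockerfile: Dockerfile.layered.j2 %-tags.lis jinja2 Dockerfile.layered.j2 -D distro=$* -D tag="$$(grep -v -E '^v' $*-tags.lis)" -D kind="all" -D rpms="${_ALL_RPMS}" > ./$@ +%-tool-data-sink-tagged: %-tool-data-sink %-tags.lis + ./apply-tags pbench-agent-tool-data-sink-$* $*-tags.lis + +%-tool-data-sink: %-tools-tagged %-tool-data-sink.Dockerfile + ./build-image tool-data-sink $* $*-tags.lis + +%-tool-data-sink.Dockerfile: Dockerfile.tds.j2 %-tags.lis + jinja2 Dockerfile.tds.j2 -D distro=$* -D tag="$$(grep -v -E '^v' $*-tags.lis)" > ./$@ + +%-tool-meister-tagged: %-tool-meister %-tags.lis + ./apply-tags pbench-agent-tool-meister-$* $*-tags.lis + +%-tool-meister: %-tools-tagged %-tool-meister.Dockerfile + ./build-image tool-meister $* $*-tags.lis + +%-tool-meister.Dockerfile: Dockerfile.tm.j2 %-tags.lis + jinja2 Dockerfile.tm.j2 -D distro=$* -D tag="$$(grep -v -E '^v' $*-tags.lis)" > ./$@ + %-tools-tagged: %-tools %-tags.lis ./apply-tags pbench-agent-tools-$* $*-tags.lis -%-tools: %-base-tagged %-tools.Dockerfile +%-tools: %-base-tagged %-tools.Dockerfile %-pcp.repo ./build-image tools $* $*-tags.lis %-tools.Dockerfile: Dockerfile.layered.j2 %-tags.lis @@ -204,15 +286,23 @@ fedora-32-base.Dockerfile: Dockerfile.base.j2 fedora-32-pbench.repo # Helper target to build each distro's ".repo" and ".Dockerfile" all-dockerfiles: $(foreach distro, ${_DISTROS}, ${distro}-base.Dockerfile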
${distro}-tools.Dockerfile ${distro}-workloads.Dockerfile ${distro}-all.Dockerfile) -# Rule pattern dependencies on non-patterned targets have to be set up -# separately for some reason. -%.repo: ${_REPO_TEMPLATE} +%-pbench.repo: %-pbench.yml ${_REPO_TEMPLATE} + jinja2 ${_REPO_TEMPLATE} $*-pbench.yml -o $@ + +%-pbench.yml: repo.yml.j2 + jinja2 repo.yml.j2 -D distro=$* -D url_prefix=${URL_PREFIX} -D test_suffix=${_TEST_SUFFIX} -D user=${USER} -o $@ + +fedora-33-pcp.repo: pcp.repo.j2 + jinja2 pcp.repo.j2 -D target=f33 -o $@ + +fedora-32-pcp.repo: pcp.repo.j2 + jinja2 pcp.repo.j2 -D target=f32 -o $@ -%.repo: %.yml - jinja2 ${_REPO_TEMPLATE} $*.yml -o $@ +centos-8-pcp.repo: pcp.repo.j2 + jinja2 pcp.repo.j2 -D target=el8 -o $@ -%.yml: repo.yml.j2 - jinja2 repo.yml.j2 -D distro=${@:-pbench.yml=} -D url_prefix=${URL_PREFIX} -D test_suffix=${_TEST_SUFFIX} -D user=${USER} -o $@ +centos-7-pcp.repo: pcp.repo.j2 + jinja2 pcp.repo.j2 -D target=el7 -o $@ clean: rm -f *.Dockerfile *.repo *.yml *-tags.lis
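The replacement pattern rules render in two steps: first a per-distro answers file from repo.yml.j2, then the .repo file from the shared template. A sketch of the chain, with the remaining -D values coming from the environment as before:

    make centos-8-pbench.repo
    # which runs, in order:
    #   jinja2 repo.yml.j2 -D distro=centos-8 ... -o centos-8-pbench.yml
    #   jinja2 ${_REPO_TEMPLATE} centos-8-pbench.yml -o centos-8-pbench.repo

diff --git a/agent/containers/images/pcp-pmcd/Dockerfile b/agent/containers/images/pcp-pmcd/Dockerfile deleted file mode 100644 index 78c3b177f0..0000000000 --- a/agent/containers/images/pcp-pmcd/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -FROM fedora:33 - -ENV SUMMARY="Performance Co-Pilot" \ - DESCRIPTION="Performance Co-Pilot is a system performance analysis toolkit." \ - VERSION=5 - -RUN dnf install -y --setopt=tsflags=nodocs procps-ng gettext pcp pcp-zeroconf && \ - dnf install -y pcp-doc pcp-gui pcp-system-tools && \ - dnf clean all -RUN systemctl enable pmcd && systemctl disable pmlogger - -COPY config /etc/sysconfig/pmcd - -EXPOSE 44321 -CMD ["/usr/sbin/init"] -
diff --git a/agent/containers/images/pcp-pmcd/config b/agent/containers/images/pcp-pmcd/config deleted file mode 100644 index 9506ba807c..0000000000 --- a/agent/containers/images/pcp-pmcd/config +++ /dev/null @@ -1,30 +0,0 @@ -# Environment variables for the pmcd daemon. Refer also to the -# pmcd.options and pmcd.conf files for additional configuration. - -# Behaviour regarding listening on external-facing interfaces; -# unset PMCD_LOCAL to allow connections from remote hosts. -# A value of 0 permits remote connections, 1 permits local only. -PMCD_LOCAL=0 - -# Max length to which the queue of pending connections may grow -# A value of 5 is the default. -# PMCD_MAXPENDING=5 - -# Default behaviour regarding pmcd's approach to starting PMDAs; -# In cases where pmdaroot is available, setting this variable to -# 1, offloads starting and stopping of agents to pmdaroot. This -# allows pmcd to not require a restart when starting a new PMDA. -PMCD_ROOT_AGENT=1 - -# Default behaviour regarding pmcd's approach to re-starting any -# unresponsive PMDAs; this should only be used with pmdaroot and -# PMCD_ROOT_AGENT=1 as it allows pmcd to attempt to automatically -# restart any exited PMDA that it detects (which usually requires -# privileges not available to pmcd itself). -PMCD_RESTART_AGENTS=1 - -# Default timeout for waiting on pmcd to accept connections; any -# longer than this value and the rc scripts report it as failed. -# The value is a PCPIntro(1) interval in units of seconds and it -# will be passed directly to the pmcd_wait(1) utility.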
-# PMCD_WAIT_TIMEOUT=60
diff --git a/agent/containers/images/pcp-pmlogger/Dockerfile b/agent/containers/images/pcp-pmlogger/Dockerfile deleted file mode 100644 index 52214587cc..0000000000 --- a/agent/containers/images/pcp-pmlogger/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM fedora:33 - -ENV SUMMARY="Performance Co-Pilot" \ - DESCRIPTION="Performance Co-Pilot is a system performance analysis toolkit." \ - VERSION=5 - -RUN dnf install -y --setopt=tsflags=nodocs procps-ng gettext pcp pcp-zeroconf && \ - dnf install -y pcp-doc pcp-gui pcp-system-tools && \ - dnf clean all && \ - rm -rf /etc/pcp/pmlogger/control.d/local -RUN systemctl enable pmlogger && systemctl disable pmcd - -VOLUME ["/var/log/pcp/pmlogger"] -CMD ["/usr/sbin/init"]
diff --git a/agent/containers/images/pcp.repo.j2 b/agent/containers/images/pcp.repo.j2 new file mode 100644 index 0000000000..02a3eeb545 --- /dev/null +++ b/agent/containers/images/pcp.repo.j2 @@ -0,0 +1,6 @@ +[pcp] +name=pcp +baseurl=https://dl.bintray.com/pcp/{{ target }} +gpgcheck=0 +repo_gpgcheck=0 +enabled=1
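For reference, rendering this template for one target (e.g. jinja2 pcp.repo.j2 -D target=el8 -o centos-8-pcp.repo, per the Makefile rules above) produces:

    [pcp]
    name=pcp
    baseurl=https://dl.bintray.com/pcp/el8
    gpgcheck=0
    repo_gpgcheck=0
    enabled=1

diff --git a/agent/containers/images/push b/agent/containers/images/push index c31536c519..af19c6329d 100755 --- a/agent/containers/images/push +++ b/agent/containers/images/push @@ -27,7 +27,7 @@ function pushit { buildah push ${1} ${image_repo}/${1} } -for image in base tools workloads all; do +for image in base tools tool-meister tool-data-sink workloads all; do pushit pbench-agent-${image}-${distro}:${githash} pushit pbench-agent-${image}-${distro}:${ver} if [[ ! -z "${other}" ]]; then
diff --git a/agent/containers/images/tagit b/agent/containers/images/tagit index 28727c9afc..858c0bb6a4 100755 --- a/agent/containers/images/tagit +++ b/agent/containers/images/tagit @@ -16,6 +16,6 @@ function tagit { buildah tag ${1}:${githash} ${1}:${tag} } -for image in base tools workloads all; do +for image in base tools tool-meister tool-data-sink workloads all; do tagit pbench-agent-${image}-${distro} done
diff --git a/agent/containers/images/visualizers/combo.json b/agent/containers/images/visualizers/combo.json index 9c9e4e4591..b08568d7a0 100644 --- a/agent/containers/images/visualizers/combo.json +++ b/agent/containers/images/visualizers/combo.json @@ -123,7 +123,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_gpu_temp", + "expr": "DCGM_FI_DEV_GPU_TEMP", "format": "time_series", "instant": false, "interval": "", @@ -227,7 +227,7 @@ "pluginVersion": "7.1.2", "targets": [ { - "expr": "avg(dcgm_gpu_temp)", + "expr": "avg(DCGM_FI_DEV_GPU_TEMP)", "interval": "", "legendFormat": "", "refId": "A" @@ -286,7 +286,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_power_usage", + "expr": "DCGM_FI_DEV_POWER_USAGE", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -408,7 +408,7 @@ "pluginVersion": "7.1.2", "targets": [ { - "expr": "sum(dcgm_power_usage)", + "expr": "sum(DCGM_FI_DEV_POWER_USAGE)", "instant": true, "interval": "", "legendFormat": "", @@ -471,7 +471,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_sm_clock", + "expr": "DCGM_FI_DEV_SM_CLOCK", "format": "time_series", "instant": false, "interval": "", @@ -523,6 +523,97 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true,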
"show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_MEM_CLOCK", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "interval": "3", + "title": "GPU Memory Clocks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "hertz", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -570,7 +661,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_gpu_utilization", + "expr": "DCGM_FI_DEV_GPU_UTIL", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -618,6 +709,97 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_MEM_COPY_UTIL", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "interval": 3, + "title": "GPU Mem Cpy Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -664,7 +846,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_fb_used", + "expr": "DCGM_FI_DEV_FB_USED", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -759,7 +941,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_fb_free", + "expr": "DCGM_FI_DEV_FB_FREE", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" diff --git a/agent/containers/images/visualizers/dcgm.json b/agent/containers/images/visualizers/dcgm.json index 32f8b16992..2c4106c85e 100644 --- a/agent/containers/images/visualizers/dcgm.json +++ b/agent/containers/images/visualizers/dcgm.json @@ -119,7 +119,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_gpu_temp", + "expr": 
"DCGM_FI_DEV_GPU_TEMP", "format": "time_series", "instant": false, "interval": "", @@ -223,7 +223,7 @@ "pluginVersion": "7.1.2", "targets": [ { - "expr": "avg(dcgm_gpu_temp)", + "expr": "avg(DCGM_FI_DEV_GPU_TEMP)", "interval": "", "legendFormat": "", "refId": "A" @@ -282,7 +282,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_power_usage", + "expr": "DCGM_FI_DEV_POWER_USAGE", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -404,7 +404,7 @@ "pluginVersion": "7.1.2", "targets": [ { - "expr": "sum(dcgm_power_usage)", + "expr": "sum(DCGM_FI_DEV_POWER_USAGE)", "instant": true, "interval": "", "legendFormat": "", @@ -467,7 +467,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_sm_clock", + "expr": "DCGM_FI_DEV_SM_CLOCK", "format": "time_series", "instant": false, "interval": "", @@ -519,6 +519,97 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_MEM_CLOCK", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "interval": "3", + "title": "GPU Memory Clocks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "hertz", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -566,7 +657,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_gpu_utilization", + "expr": "DCGM_FI_DEV_GPU_UTIL", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -614,6 +705,97 @@ "alignLevel": null } }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_MEM_COPY_UTIL", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "interval": 3, + "title": "GPU Mem Cpy 
Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, { "aliasColors": {}, "bars": false, @@ -660,7 +842,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_fb_used", + "expr": "DCGM_FI_DEV_FB_USED", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A" @@ -755,7 +937,7 @@ "steppedLine": false, "targets": [ { - "expr": "dcgm_fb_free", + "expr": "DCGM_FI_DEV_FB_FREE", "interval": "", "legendFormat": "GPU {{gpu}}", "refId": "A"
diff --git a/agent/tool-scripts/dcgm b/agent/tool-scripts/dcgm index 835b4bf899..af21817452 100755 --- a/agent/tool-scripts/dcgm +++ b/agent/tool-scripts/dcgm @@ -15,10 +15,9 @@ import sys if len(sys.argv) == 2 and sys.argv[1] == "--help": help = """Options: ---inst= (required) --interval=# (number of seconds between collections) -For more information on this tool, please Nvidia's "dcgm-exporter" at: +For more information on this tool, please see Nvidia's "dcgm-exporter" at: \thttps://ngc.nvidia.com/catalog/containers/nvidia:k8s:dcgm-exporter """ print(help)
diff --git a/agent/tool-scripts/meta.json b/agent/tool-scripts/meta.json index c69a941a75..edd8dc782e 100644 --- a/agent/tool-scripts/meta.json +++ b/agent/tool-scripts/meta.json @@ -42,7 +42,7 @@ "persistent":{ "node-exporter": {"collector": "prometheus", "port": "9100"}, - "dcgm": {"collector": "prometheus", "port": "8000"}, + "dcgm": {"collector": "prometheus", "port": "9400"}, "pcp": {"collector": "pcp", "port": "44321"} } }
diff --git a/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt b/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt index 335c2c4e4c..8e9a30a8a8 100644 --- a/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt +++ b/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt @@ -1,13 +1,18 @@ +++ Running test-54 pbench-tool-meister-start --help usage: Usage: pbench-tool-meister-start [--sysinfo <sysinfo>] - [-h] [--sysinfo SYSINFO] tool_group + [-h] [--sysinfo SYSINFO] [--redis-server REDIS_SERVER] tool_group positional arguments: - tool_group The tool group of items to be run by the Tool Meisters. + tool_group The tool group name of tools to be run by the Tool + Meisters. optional arguments: - -h, --help show this help message and exit - --sysinfo SYSINFO The list of system information items to be collected. + -h, --help show this help message and exit + --sysinfo SYSINFO The list of system information items to be collected. + --redis-server REDIS_SERVER + Use an existing Redis server specified by + <host>:<port>; implies an existing Tool Data Sink + and Tool Meisters as well.
--- Finished test-54 pbench-tool-meister-start (status=0) +++ pbench tree state /var/tmp/pbench-test-utils/pbench
diff --git a/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt b/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt index a2b7d7cf2b..0f987393ae 100644 --- a/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt +++ b/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt @@ -1,15 +1,21 @@ +++ Running test-55 pbench-tool-meister-stop --help usage: Usage: pbench-tool-meister-stop [--sysinfo <sysinfo>] - [-h] [--sysinfo SYSINFO] [--interrupt] tool_group + [-h] [--sysinfo SYSINFO] [--interrupt] [--redis-server REDIS_SERVER] + tool_group positional arguments: - tool_group The tool group of items being run in the Tool Meisters. + tool_group The tool group name of tools being run in the Tool + Meisters. optional arguments: - -h, --help show this help message and exit - --sysinfo SYSINFO The list of system information items to be collected. - --interrupt Whether or not the stop operation is in response to an - interrupt. + -h, --help show this help message and exit + --sysinfo SYSINFO The list of system information items to be collected. + --interrupt Whether or not the stop operation is in response to an + interrupt. + --redis-server REDIS_SERVER + Use an existing Redis server specified by + <host>:<port>; implies the use of an existing Tool + Data Sink and Tool Meisters as well. --- Finished test-55 pbench-tool-meister-stop (status=0) +++ pbench tree state /var/tmp/pbench-test-utils/pbench
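The new flag pairs up across the two commands; with an externally managed Redis server (and a Tool Data Sink and Tool Meisters already attached to it), a run would look roughly like this, host and port being illustrative:

    pbench-tool-meister-start --redis-server=myhost.example.com:17001 default
    # ... run the benchmark workload ...
    pbench-tool-meister-stop --redis-server=myhost.example.com:17001 default

diff --git a/agent/util-scripts/gold/test-client-tool-meister/test-53.txt b/agent/util-scripts/gold/test-client-tool-meister/test-53.txt index 9c287003d6..0f12446984 100644 --- a/agent/util-scripts/gold/test-client-tool-meister/test-53.txt +++ b/agent/util-scripts/gold/test-client-tool-meister/test-53.txt @@ -1,6 +1,8 @@ +++ Running test-53 test-client-tool-meister "mpstat" tool is now registered for host "testhost.example.com" in group "default" "dcgm" tool is now registered for host "testhost.example.com" in group "default" +pbench-tool-data-sink: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 Collecting system information --- Finished test-53 test-client-tool-meister (status=0) +++ pbench tree state @@ -55,7 +57,6 @@ Collecting system information /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/end/testhost.example.com/tm-sysinfo.out /var/tmp/pbench-test-utils/pbench/mock-run/tm /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err -/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log @@ -139,17 +140,15 @@ install_check_output = mpstat: pbench-sysstat-12.0.3 is installed --- mock-run/metadata.log file contents +++ mock-run/tm/pbench-tool-data-sink.err file contents +INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))... Listening on http://localhost:8080/ Hit Ctrl-C to quit. ---- mock-run/tm/pbench-tool-data-sink.err file contents -+++ mock-run/tm/pbench-tool-data-sink.log file contents -INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ...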
INFO pbench-tool-data-sink execute -- Tool Data Sink terminating INFO pbench-tool-data-sink web_server_run -- Bottle web server exited ---- mock-run/tm/pbench-tool-data-sink.log file contents +--- mock-run/tm/pbench-tool-data-sink.err file contents +++ mock-run/tm/pbench-tool-data-sink.out file contents --- mock-run/tm/pbench-tool-data-sink.out file contents +++ mock-run/tm/redis.conf file contents @@ -173,7 +172,7 @@ port 17001 +++ mock-run/tm/tm-default-testhost.example.com.err file contents INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) default /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com -INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process @@ -196,7 +195,7 @@ INFO pbench-tool-meister __exit__ -- testhost.example.com: terminating pbench-tool-meister-start - verify logging channel up testhost.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel testhost.example.com 0001 INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) default /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com -testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] testhost.example.com 0003 INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two testhost.example.com 0004 INFO pbench-tool-meister stop -- mpstat: stop_tool -- 
/var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two testhost.example.com 0005 INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process @@ -231,10 +230,10 @@ scrape_configs: - job_name: 'testhost.example.com_dcgm' static_configs: - - targets: ['testhost.example.com:8000'] + - targets: ['testhost.example.com:9400'] --- tools-default/prometheus/prometheus.yml file contents +++ tools-default/testhost.example.com/dcgm/dcgm.file file contents -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter --- tools-default/testhost.example.com/dcgm/dcgm.file file contents +++ tools-default/testhost.example.com/dcgm/tm-dcgm-start.err file contents --- tools-default/testhost.example.com/dcgm/tm-dcgm-start.err file contents @@ -242,10 +241,10 @@ scrape_configs: --- tools-default/testhost.example.com/dcgm/tm-dcgm-start.out file contents +++ test-execution.log file contents /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/cp -rL /etc/ssh/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/ +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/end/testhost.example.com block,security_mitigations,sos parallel /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pidof -x mpstat /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pidof -x mpstat /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/prometheus --config.file=/var/tmp/pbench-test-utils/pbench/mock-run/tools-default/prometheus/prometheus.yml --storage.tsdb.path=/var/tmp/pbench-test-utils/pbench/mock-run/tools-default/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py --- test-execution.log file contents diff --git a/agent/util-scripts/gold/test-client-tool-meister/test-56.txt b/agent/util-scripts/gold/test-client-tool-meister/test-56.txt index 4decd73edd..60af8c011f 100644 --- a/agent/util-scripts/gold/test-client-tool-meister/test-56.txt +++ b/agent/util-scripts/gold/test-client-tool-meister/test-56.txt @@ -6,6 +6,11 @@ "node-exporter" tool is now registered for host "remote_b.example.com", with label "blue", in group "lite" "dcgm" tool is now registered for host "remote_c.example.com", with label "red", in group "lite" "pcp" tool is now registered for host "remote_c.example.com", with label "red", in group "lite" +pbench-tool-data-sink: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 Collecting system information --- Finished test-56 test-client-tool-meister (status=0) +++ pbench tree state @@ -142,7 +147,6 @@ Collecting system information 
/var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/end/testhost.example.com/tm-sysinfo.out /var/tmp/pbench-test-utils/pbench/mock-run/tm /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err -/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log @@ -241,7 +245,7 @@ INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating === /var/tmp/pbench-test-utils/pbench/tmp/tm-lite-remote_c.example.com.err: INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com -INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf'] INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp @@ -256,7 +260,7 @@ INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.err: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.out: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/dcgm.file: -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.err: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.out: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/pcp/pmcd.file: @@ -411,12 +415,9 @@ install_check_output = mpstat: pbench-sysstat-12.0.3 is installed --- mock-run/metadata.log file contents +++ mock-run/tm/pbench-tool-data-sink.err file contents + Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))... -Listening on http://localhost:8080/ Hit Ctrl-C to quit. 
- ---- mock-run/tm/pbench-tool-data-sink.err file contents -+++ mock-run/tm/pbench-tool-data-sink.log file contents INFO pbench-tool-data-sink execute -- Tool Data Sink terminating INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_a.example.com HTTP/1.1" 200 0 INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_b.example.com HTTP/1.1" 200 0 @@ -431,7 +432,8 @@ INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /tool-data/9e243dae INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ... INFO pbench-tool-data-sink web_server_run -- Bottle web server exited INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... ---- mock-run/tm/pbench-tool-data-sink.log file contents +Listening on http://localhost:8080/ +--- mock-run/tm/pbench-tool-data-sink.err file contents +++ mock-run/tm/pbench-tool-data-sink.out file contents --- mock-run/tm/pbench-tool-data-sink.out file contents +++ mock-run/tm/redis.conf file contents @@ -455,7 +457,7 @@ port 17001 +++ mock-run/tm/tm-lite-testhost.example.com.err file contents INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com -INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process @@ -511,7 +513,7 @@ remote_b.example.com 0016 INFO pbench-tool-meister _send_directory -- remote_b.e remote_b.example.com 0017 INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating remote_c.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel remote_c.example.com 0001 INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com -remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env 
PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] remote_c.example.com 0003 INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf'] remote_c.example.com 0004 INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm remote_c.example.com 0005 INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp @@ -522,7 +524,7 @@ remote_c.example.com 0009 INFO pbench-tool-meister _send_directory -- remote_c.e remote_c.example.com 0010 INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating testhost.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel testhost.example.com 0001 INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com -testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] testhost.example.com 0003 INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two testhost.example.com 0004 INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two testhost.example.com 0005 INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process @@ -575,15 +577,15 @@ scrape_configs: - job_name: 'remote_c.example.com_dcgm' static_configs: - - targets: ['remote_c.example.com:8000'] + - targets: ['remote_c.example.com:9400'] - job_name: 'testhost.example.com_dcgm' static_configs: - - targets: ['testhost.example.com:8000'] + - targets: ['testhost.example.com:9400'] --- tools-lite/prometheus/prometheus.yml file contents +++ tools-lite/testhost.example.com/dcgm/dcgm.file file contents -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter --- tools-lite/testhost.example.com/dcgm/dcgm.file file contents +++ 
tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents --- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents @@ -591,6 +593,8 @@ scrape_configs: --- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.out file contents +++ test-execution.log file contents /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/cp -rL /etc/ssh/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/ +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42 /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42 /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42 @@ -615,8 +619,6 @@ scrape_configs: /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmlogger --log=- --report -t 3s -c /var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmlogger.conf --host=remote_c.example.com:55677 /var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/pcp/data/red:remote_c.example.com/%Y%m%d.%H.%M /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmproxy --log=- --foreground --timeseries --port=44566 --redishost=localhost --redisport=17001 --config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmproxy.conf /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/prometheus --config.file=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus/prometheus.yml --storage.tsdb.path=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_a.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_a.example.com yes /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_b.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_b.example.com yes /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_c.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_c.example.com yes diff --git a/agent/util-scripts/gold/test-client-tool-meister/test-57.txt b/agent/util-scripts/gold/test-client-tool-meister/test-57.txt index d2ca802e3f..426dace95a 100644 --- a/agent/util-scripts/gold/test-client-tool-meister/test-57.txt +++ b/agent/util-scripts/gold/test-client-tool-meister/test-57.txt @@ -6,6 +6,11 @@ "node-exporter" tool is now registered for host "remote_b.example.com", with label "blue", in group "lite" "dcgm" tool is now registered for host "remote_c.example.com", with label "red", in group "lite" "pcp" tool is now registered for host "remote_c.example.com", with label "red", in group "lite" +pbench-tool-data-sink: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 +pbench-tool-meister: 
connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 system information not collected when --interrupt specified --- Finished test-57 test-client-tool-meister (status=0) +++ pbench tree state @@ -121,7 +126,6 @@ system information not collected when --interrupt specified /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com/tm-sysinfo.out /var/tmp/pbench-test-utils/pbench/mock-run/tm /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err -/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log @@ -216,7 +220,7 @@ INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating === /var/tmp/pbench-test-utils/pbench/tmp/tm-lite-remote_c.example.com.err: INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com -INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf'] INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp @@ -229,7 +233,7 @@ INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.err: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.out: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/dcgm.file: -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.err: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.out: === /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/pcp/pmcd.file: @@ -384,12 +388,9 @@ install_check_output = mpstat: pbench-sysstat-12.0.3 is installed --- mock-run/metadata.log file contents +++ mock-run/tm/pbench-tool-data-sink.err file contents + Bottle v#.##.## server 
starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))... -Listening on http://localhost:8080/ Hit Ctrl-C to quit. - ---- mock-run/tm/pbench-tool-data-sink.err file contents -+++ mock-run/tm/pbench-tool-data-sink.log file contents INFO pbench-tool-data-sink execute -- Tool Data Sink terminating INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_a.example.com HTTP/1.1" 200 0 INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_b.example.com HTTP/1.1" 200 0 @@ -401,7 +402,8 @@ INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /tool-data/9e243dae INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ... INFO pbench-tool-data-sink web_server_run -- Bottle web server exited INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... ---- mock-run/tm/pbench-tool-data-sink.log file contents +Listening on http://localhost:8080/ +--- mock-run/tm/pbench-tool-data-sink.err file contents +++ mock-run/tm/pbench-tool-data-sink.out file contents --- mock-run/tm/pbench-tool-data-sink.out file contents +++ mock-run/tm/redis.conf file contents @@ -425,7 +427,7 @@ port 17001 +++ mock-run/tm/tm-lite-testhost.example.com.err file contents INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com -INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process @@ -475,7 +477,7 @@ remote_b.example.com 0014 INFO pbench-tool-meister wait -- Stopped persistent to remote_b.example.com 0015 INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating remote_c.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel remote_c.example.com 0001 INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com 
-remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] remote_c.example.com 0003 INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf'] remote_c.example.com 0004 INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm remote_c.example.com 0005 INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp @@ -484,7 +486,7 @@ remote_c.example.com 0007 INFO pbench-tool-meister wait -- Stopped persistent to remote_c.example.com 0008 INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating testhost.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel testhost.example.com 0001 INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com -testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py'] +testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter'] testhost.example.com 0003 INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two testhost.example.com 0004 INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two testhost.example.com 0005 INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process @@ -535,15 +537,15 @@ scrape_configs: - job_name: 'remote_c.example.com_dcgm' static_configs: - - targets: ['remote_c.example.com:8000'] + - targets: ['remote_c.example.com:9400'] - job_name: 'testhost.example.com_dcgm' static_configs: - - targets: ['testhost.example.com:8000'] + - targets: ['testhost.example.com:9400'] --- tools-lite/prometheus/prometheus.yml file contents +++ tools-lite/testhost.example.com/dcgm/dcgm.file file contents -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter --- 
tools-lite/testhost.example.com/dcgm/dcgm.file file contents +++ tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents --- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents @@ -551,6 +553,8 @@ scrape_configs: --- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.out file contents +++ test-execution.log file contents /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/cp -rL /etc/ssh/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/ +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter +/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42 /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42 /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42 @@ -571,8 +575,6 @@ scrape_configs: /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmlogger --log=- --report -t 3s -c /var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmlogger.conf --host=remote_c.example.com:55677 /var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/pcp/data/red:remote_c.example.com/%Y%m%d.%H.%M /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmproxy --log=- --foreground --timeseries --port=44566 --redishost=localhost --redisport=17001 --config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmproxy.conf /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/prometheus --config.file=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus/prometheus.yml --storage.tsdb.path=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py -/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_a.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_a.example.com yes /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_b.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_b.example.com yes /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_c.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_c.example.com yes diff --git a/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt b/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt index 969847d2bd..81c79120e9 100644 --- a/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt +++ b/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt @@ -5,7 +5,7 @@ 3. push tool group data and metadata 4. starting tool data sink 5a. starting localhost tool meister -6. waiting for all successfully spawned SSH processes to show up as subscribers +6. waiting for all successfully created Tool Meister processes to show up as subscribers 8. 
Initialize persistent tools channel pbench-agent-cli-to-client payload, '{"action": "end", "kind": "ds", "status": "success"}' channel pbench-agent-cli-to-client payload, '{"action": "init", "kind": "ds", "status": "success"}' @@ -18,6 +18,8 @@ next pbench-agent-cli-to-client payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "end", "kind": "ds", "status": "success"}'} payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "init", "kind": "ds", "status": "success"}'} payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "startup", "kind": "ds", "status": "success"}'} +pbench-tool-data-sink: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 publish end on chan pbench-agent-cli-from-client publish init on chan pbench-agent-cli-from-client publish terminate on chan pbench-agent-cli-from-client @@ -33,7 +35,6 @@ waiting for tool-data-sink (#####) to exit /var/tmp/pbench-test-utils/pbench/mock-run/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/tm /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err -/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log @@ -108,22 +109,20 @@ install_check_output = perf: perf is installed --- mock-run/metadata.log file contents +++ mock-run/tm/pbench-tool-data-sink.err file contents +DEBUG pbench-tool-data-sink daemon -- re-constructing Redis server object +DEBUG pbench-tool-data-sink daemon -- reconstructed Redis server object +DEBUG pbench-tool-data-sink driver -- params_key (tds-default): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'bind_hostname': 'localhost', 'channel_prefix': 'pbench-agent-cli', 'group': 'default', 'optional_md': {'config': '', 'date': '1900-01-01T00:00:00', 'script': 'fake-bm', 'ssh_opts': '-o StrictHostKeyChecking=no'}, 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tool_trigger': None, 'tools': {'testhost.example.com': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}} +INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... 
Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=<class 'pbench.agent.tool_data_sink.DataSinkWsgiRequestHandler'>))... Listening on http://localhost:8080/ Hit Ctrl-C to quit. ---- mock-run/tm/pbench-tool-data-sink.err file contents -+++ mock-run/tm/pbench-tool-data-sink.log file contents -DEBUG pbench-tool-data-sink main -- params_key (tds-default): b'{"benchmark_run_dir": "/var/tmp/pbench-test-utils/pbench/mock-run", "bind_hostname": "localhost", "channel_prefix": "pbench-agent-cli", "group": "default", "optional_md": {"config": "", "date": "1900-01-01T00:00:00", "script": "fake-bm", "ssh_opts": "-o StrictHostKeyChecking=no"}, "tool_metadata": {"persistent": {"dcgm": {"collector": "prometheus", "port": "8000"}, "node-exporter": {"collector": "prometheus", "port": "9100"}, "pcp": {"collector": "pcp", "port": "44321"}}, "transient": {"blktrace": null, "bpftrace": null, "cpuacct": null, "disk": null, "dm-cache": null, "docker": null, "docker-info": null, "external-data-source": null, "haproxy-ocp": null, "iostat": null, "jmap": null, "jstack": null, "kvm-spinlock": null, "kvmstat": null, "kvmtrace": null, "lockstat": null, "mpstat": null, "numastat": null, "oc": null, "openvswitch": null, "perf": null, "pidstat": null, "pprof": null, "proc-interrupts": null, "proc-sched_debug": null, "proc-vmstat": null, "prometheus-metrics": null, "qemu-migrate": null, "rabbit": null, "sar": null, "strace": null, "sysfs": null, "systemtap": null, "tcpdump": null, "turbostat": null, "user-tool": null, "virsh-migrate": null, "vmstat": null}}, "tool_trigger": null, "tools": {"testhost.example.com": {"mpstat": "", "perf": "--record-opts=\\"-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions\\" --report-opts=\\"-I -g\\""}}}' -DEBUG pbench-tool-data-sink main -- Tool Data Sink parameters check out, daemonizing ... -DEBUG pbench-tool-data-sink main -- constructing Redis() object -DEBUG pbench-tool-data-sink main -- constructed Redis() object -INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... DEBUG pbench-tool-data-sink run -- Making tool data sink WSGI server ... +DEBUG pbench-tool-data-sink run -- Successfully created WSGI server DEBUG pbench-tool-data-sink run -- Running tool data sink WSGI server ... DEBUG pbench-tool-data-sink __enter__ -- web server 'run' thread started, processing payloads ... INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ... -DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing logs ... +DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing Tool Meister logs ...
DEBUG pbench-tool-data-sink fetch_message -- next pbench-agent-cli-from-tms DEBUG pbench-tool-data-sink fetch_message -- payload from pbench-agent-cli-from-tms: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-from-tms', 'data': b'{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}'} DEBUG pbench-tool-data-sink fetch_message -- channel pbench-agent-cli-from-tms payload, '{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}' @@ -170,7 +169,7 @@ INFO pbench-tool-data-sink web_server_run -- Bottle web server exited DEBUG pbench-tool-data-sink __exit__ -- Waiting for the web server thread to exit ... DEBUG pbench-tool-data-sink __exit__ -- Waiting for the log capture thread to exit ... DEBUG pbench-tool-data-sink __exit__ -- Exiting Tool Data Sink context ... ---- mock-run/tm/pbench-tool-data-sink.log file contents +--- mock-run/tm/pbench-tool-data-sink.err file contents +++ mock-run/tm/pbench-tool-data-sink.out file contents --- mock-run/tm/pbench-tool-data-sink.out file contents +++ mock-run/tm/redis.conf file contents @@ -194,7 +193,7 @@ port 17001 +++ mock-run/tm/tm-default-testhost.example.com.err file contents DEBUG pbench-tool-meister daemon -- re-constructing Redis server object DEBUG pbench-tool-meister daemon -- re-constructed Redis server object -DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} +DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 
'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms DEBUG pbench-tool-meister driver -- waiting ... @@ -226,7 +225,7 @@ DEBUG pbench-tool-meister _send_client_status -- publish pbench-agent-cli-from-t --- mock-run/tm/tm-default-testhost.example.com.out file contents +++ mock-run/tm/tm.logs file contents pbench-tool-meister-start - verify logging channel up -testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} +testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 
'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} testhost.example.com 0001 DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms testhost.example.com 0002 DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms testhost.example.com 0003 DEBUG pbench-tool-meister driver -- waiting ... diff --git a/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt b/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt index 38cef83404..64cfa744a8 100644 --- a/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt +++ b/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt @@ -5,7 +5,7 @@ 3. push tool group data and metadata 4. starting tool data sink 5a. starting localhost tool meister -6. waiting for all successfully spawned SSH processes to show up as subscribers +6. waiting for all successfully created Tool Meister processes to show up as subscribers 8. Initialize persistent tools channel pbench-agent-cli-to-client payload, '{"action": "end", "kind": "ds", "status": "success"}' channel pbench-agent-cli-to-client payload, '{"action": "init", "kind": "ds", "status": "success"}' @@ -18,6 +18,8 @@ next pbench-agent-cli-to-client payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "end", "kind": "ds", "status": "success"}'} payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "init", "kind": "ds", "status": "success"}'} payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "startup", "kind": "ds", "status": "success"}'} +pbench-tool-data-sink: connected to redis server localhost:17001 +pbench-tool-meister: connected to redis server localhost:17001 publish end on chan pbench-agent-cli-from-client publish init on chan pbench-agent-cli-from-client publish terminate on chan pbench-agent-cli-from-client @@ -33,7 +35,6 @@ waiting for tool-data-sink (#####) to exit /var/tmp/pbench-test-utils/pbench/mock-run/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/tm /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err -/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log /var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf /var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log @@ -108,22 +109,20 @@ install_check_output = perf: perf is installed --- mock-run/metadata.log file contents +++ mock-run/tm/pbench-tool-data-sink.err file 
contents +DEBUG pbench-tool-data-sink daemon -- re-constructing Redis server object +DEBUG pbench-tool-data-sink daemon -- reconstructed Redis server object +DEBUG pbench-tool-data-sink driver -- params_key (tds-mygroup): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'bind_hostname': 'localhost', 'channel_prefix': 'pbench-agent-cli', 'group': 'mygroup', 'optional_md': {'config': '', 'date': '1900-01-01T00:00:00', 'script': 'fake-bm', 'ssh_opts': '-o StrictHostKeyChecking=no'}, 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tool_trigger': None, 'tools': {'testhost.example.com': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}} +INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=<class 'pbench.agent.tool_data_sink.DataSinkWsgiRequestHandler'>))... Listening on http://localhost:8080/ Hit Ctrl-C to quit. ---- mock-run/tm/pbench-tool-data-sink.err file contents -+++ mock-run/tm/pbench-tool-data-sink.log file contents -DEBUG pbench-tool-data-sink main -- params_key (tds-mygroup): b'{"benchmark_run_dir": "/var/tmp/pbench-test-utils/pbench/mock-run", "bind_hostname": "localhost", "channel_prefix": "pbench-agent-cli", "group": "mygroup", "optional_md": {"config": "", "date": "1900-01-01T00:00:00", "script": "fake-bm", "ssh_opts": "-o StrictHostKeyChecking=no"}, "tool_metadata": {"persistent": {"dcgm": {"collector": "prometheus", "port": "8000"}, "node-exporter": {"collector": "prometheus", "port": "9100"}, "pcp": {"collector": "pcp", "port": "44321"}}, "transient": {"blktrace": null, "bpftrace": null, "cpuacct": null, "disk": null, "dm-cache": null, "docker": null, "docker-info": null, "external-data-source": null, "haproxy-ocp": null, "iostat": null, "jmap": null, "jstack": null, "kvm-spinlock": null, "kvmstat": null, "kvmtrace": null, "lockstat": null, "mpstat": null, "numastat": null, "oc": null, "openvswitch": null, "perf": null, "pidstat": null, "pprof": null, "proc-interrupts": null, "proc-sched_debug": null, "proc-vmstat": null, "prometheus-metrics": null, "qemu-migrate": null, "rabbit": null, "sar": null, "strace": null, "sysfs": null, "systemtap": null, "tcpdump": null, "turbostat": null, "user-tool": null, "virsh-migrate": null, "vmstat": null}}, "tool_trigger": null, "tools": {"testhost.example.com": {"mpstat": "", "perf": "--record-opts=\\"-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions\\" --report-opts=\\"-I -g\\""}}}' -DEBUG pbench-tool-data-sink main -- Tool Data Sink parameters check out, daemonizing ...
-DEBUG pbench-tool-data-sink main -- constructing Redis() object -DEBUG pbench-tool-data-sink main -- constructed Redis() object -INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ... DEBUG pbench-tool-data-sink run -- Making tool data sink WSGI server ... +DEBUG pbench-tool-data-sink run -- Successfully created WSGI server DEBUG pbench-tool-data-sink run -- Running tool data sink WSGI server ... DEBUG pbench-tool-data-sink __enter__ -- web server 'run' thread started, processing payloads ... INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ... -DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing logs ... +DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing Tool Meister logs ... DEBUG pbench-tool-data-sink fetch_message -- next pbench-agent-cli-from-tms DEBUG pbench-tool-data-sink fetch_message -- payload from pbench-agent-cli-from-tms: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-from-tms', 'data': b'{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}'} DEBUG pbench-tool-data-sink fetch_message -- channel pbench-agent-cli-from-tms payload, '{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}' @@ -170,7 +169,7 @@ INFO pbench-tool-data-sink web_server_run -- Bottle web server exited DEBUG pbench-tool-data-sink __exit__ -- Waiting for the web server thread to exit ... DEBUG pbench-tool-data-sink __exit__ -- Waiting for the log capture thread to exit ... DEBUG pbench-tool-data-sink __exit__ -- Exiting Tool Data Sink context ... 
---- mock-run/tm/pbench-tool-data-sink.log file contents +--- mock-run/tm/pbench-tool-data-sink.err file contents +++ mock-run/tm/pbench-tool-data-sink.out file contents --- mock-run/tm/pbench-tool-data-sink.out file contents +++ mock-run/tm/redis.conf file contents @@ -194,7 +193,7 @@ port 17001 +++ mock-run/tm/tm-mygroup-testhost.example.com.err file contents DEBUG pbench-tool-meister daemon -- re-constructing Redis server object DEBUG pbench-tool-meister daemon -- re-constructed Redis server object -DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} +DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms DEBUG pbench-tool-meister driver -- waiting ... 
@@ -226,7 +225,7 @@ DEBUG pbench-tool-meister _send_client_status -- publish pbench-agent-cli-from-t --- mock-run/tm/tm-mygroup-testhost.example.com.out file contents +++ mock-run/tm/tm.logs file contents pbench-tool-meister-start - verify logging channel up -testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} +testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}} testhost.example.com 0001 DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms testhost.example.com 0002 DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms testhost.example.com 0003 DEBUG pbench-tool-meister driver -- waiting ... 
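For context before the client changes below: the new PBENCH_REDIS_SERVER environment variable carries a host:port pair, which the client splits on the first colon and validates before connecting. A minimal sketch of that parsing step, assuming only what the hunk below shows (the helper name parse_redis_server is illustrative, not part of the patch):

    import os

    def parse_redis_server(default_port):
        # "host:port" -> (host, port); mirrors the split(":", 1) and
        # int() validation performed by pbench-tool-meister-client.
        spec = os.environ.get("PBENCH_REDIS_SERVER", f"localhost:{default_port}")
        parts = spec.split(":", 1)
        if len(parts) != 2:
            raise ValueError(f"Bad Redis server specified, {spec!r}")
        try:
            port = int(parts[1])
        except ValueError:
            raise ValueError(f"Bad port for Redis server specified in {spec!r}")
        return parts[0], port

For example, PBENCH_REDIS_SERVER=perf42.example.com:17001 (a hypothetical host) yields ("perf42.example.com", 17001).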
diff --git a/agent/util-scripts/pbench-tool-meister-client b/agent/util-scripts/pbench-tool-meister-client index 6168dd2196..3226264591 100755 --- a/agent/util-scripts/pbench-tool-meister-client +++ b/agent/util-scripts/pbench-tool-meister-client @@ -13,7 +13,7 @@ import sys from pbench.agent.constants import ( cli_tm_allowed_actions, cli_tm_channel_prefix, - redis_port, + def_redis_port, ) from pbench.agent.tool_meister_client import Client @@ -47,30 +47,48 @@ def main(argv): try: group = argv[1] except IndexError: - raise Exception("Missing group argument") + logger.error("Missing group argument") + return 1 try: directory = argv[2] except IndexError: - raise Exception("Missing directory argument") + logger.error("Missing directory argument") + return 1 try: action = argv[3] except IndexError: - raise Exception("Missing action argument") + logger.error("Missing action argument") + return 1 else: if action not in cli_tm_allowed_actions: - raise Exception( - f"Unrecognized action, '{action}', allowed actions are:" - f" {cli_tm_allowed_actions}" + logger.error( + "Unrecognized action, '%s', allowed actions are: %s", + action, + cli_tm_allowed_actions, ) + return 1 elif action == "kill": # FIXME: we need to implement the gritty method of killing all the # tool meisters, locally and remotely, and ensuring they are all # properly shut down. return 0 + redis_server = os.environ.get("PBENCH_REDIS_SERVER", f"localhost:{def_redis_port}") + parts = redis_server.split(":", 1) + if len(parts) != 2: + logger.error("Bad Redis server specified, %r", redis_server) + return 1 + try: + redis_port = int(parts[1]) + except Exception: + logger.error("Bad port for Redis server specified in %r", redis_server) + return 1 + else: + redis_host = parts[0] + # The Redis server is always running on the local host with the CLI. with Client( - redis_host="localhost", + redis_host=redis_host, redis_port=redis_port, channel_prefix=cli_tm_channel_prefix, logger=logger, diff --git a/agent/util-scripts/pbench-tool-meister-start b/agent/util-scripts/pbench-tool-meister-start index 214c28c825..7816b01ff1 100755 --- a/agent/util-scripts/pbench-tool-meister-start +++ b/agent/util-scripts/pbench-tool-meister-start @@ -82,9 +82,8 @@ from pathlib import Path import redis -from pbench.agent import PbenchAgentConfig from pbench.agent.constants import ( - redis_port, + def_redis_port, cli_tm_channel_prefix, tm_channel_suffix_to_client, tm_channel_suffix_from_client, @@ -92,16 +91,11 @@ from pbench.agent.constants import ( ) from pbench.agent.redis import RedisChannelSubscriber from pbench.agent.tool_data_sink import main as tds_main +from pbench.agent.tool_group import BadToolGroup, ToolGroup from pbench.agent.tool_meister import main as tm_main from pbench.agent.tool_meister_client import Client from pbench.agent.toolmetadata import ToolMetadata -from pbench.agent.utils import ( - cli_verify_sysinfo, - info_log, - verify_tool_group, - BadToolGroup, -) -from pbench.common.exceptions import BadConfig +from pbench.agent.utils import cli_verify_sysinfo, error_log, info_log # Redis server configuration template for pbench's use @@ -116,120 +110,6 @@ port {redis_port:d} """ -class ToolGroup: - """Provides an in-memory representation of the registered tools as recorded - on-disk. - """ - - def __init__(self, group): - """Construct a ToolGroup object from the on-disk data of the given - tool group.
- - If the given tool group is valid, the contents are read into the three - dictionary structures: - - "toolnames" - each tool name is the key, with separate dictionaries - for each registered host - - "hostnames" - each registered host is the key, with separate - dictionaries for each tool registered on that host - - "labels" - each registered host name, that has a label, is the key, - and the label as the value; if a host is not labeled, it does not - show up in this dictionary - - Raises BadToolGroup via the verify_tool_group() method on error. - """ - self.tg_dir = verify_tool_group(group) - self.group = group - - # __trigger__ - try: - _trigger = (self.tg_dir / "__trigger__").read_text() - except OSError as ex: - if ex.errno != errno.ENOENT: - raise - # Ignore missing trigger file - self.trigger = None - else: - if len(_trigger) == 0: - # Ignore empty trigger file contents - self.trigger = None - else: - self.trigger = _trigger - - # toolnames - Dict with tool name as the key, dictionary with host - # names and parameters for each host - self.toolnames = {} - # hostnames - Dict with host name as the key, dictionary with tool - # names and parameters for each tool - self.hostnames = {} - self.labels = {} - for hdirent in os.listdir(self.tg_dir): - if hdirent == "__trigger__": - # Ignore handled above - continue - if not (self.tg_dir / hdirent).is_dir(): - # Ignore wayward non-directory files - continue - # We assume this directory is a hostname. - host = hdirent - if host not in self.hostnames: - self.hostnames[host] = {} - for tdirent in os.listdir(self.tg_dir / host): - if tdirent == "__label__": - self.labels[host] = ( - (self.tg_dir / host / tdirent).read_text().strip() - ) - continue - if tdirent.endswith("__noinstall__"): - # FIXME: ignore "noinstall" for now, tools are going to be - # in containers so this does not make sense going forward. - continue - # This directory entry is the name of a tool. - tool = tdirent - tool_opts_raw_lines = ( - (self.tg_dir / host / tool).read_text().split("\n") - ) - tool_opts_lines = [] - for line_raw in tool_opts_raw_lines: - line = line_raw.strip() - if not line: - # Ignore blank lines - continue - tool_opts_lines.append(line) - tool_opts = " ".join(tool_opts_lines) - if tool not in self.toolnames: - self.toolnames[tool] = {} - self.toolnames[tool][host] = tool_opts - - def get_tools(self, host): - """get_tools - given a target host, return a dictionary with the list - of tool names as keys, and the values being their options for that - host. - """ - tools = dict() - for tool, opts in self.toolnames.items(): - try: - host_opts = opts[host] - except KeyError: - # This host does not have this tool registered, ignore. - pass - else: - tools[tool] = host_opts - return tools - - def get_label(self, host): - """get_label - given a target host, return the label associated with - that host. - """ - try: - label = self.labels[host] - except KeyError: - label = "" - return label - - def wait_for_tds(chan, logger): """wait_for_tds - Wait for the Tool Data Sink to report back success or failure regarding the Tool Meister environment setup. @@ -260,14 +140,13 @@ def wait_for_tds(chan, logger): class ReturnCode: - """ReturnCode - symbolic return codes for when the main program of + """ReturnCode - symbolic return codes for the main program of pbench-tool-meister-start. 
""" SUCCESS = 0 BADTOOLGROUP = 1 BADAGENTCONFIG = 2 - EXCAGENTCONFIG = 3 MISSINGINSTALLDIR = 4 EXCINSTALLDIR = 5 BADTOOLMETADATA = 6 @@ -290,6 +169,10 @@ class ReturnCode: EXCTOOLGROUPDIR = 23 SYSINFOFAILED = 24 INITFAILED = 25 + TDSSTARTUPTIMEOUT = 26 + TOOLGROUPEXC = 27 + BADREDISARG = 28 + BADREDISPORT = 29 # Kill sub-codes KILL_SUCCESS = 0 @@ -327,7 +210,7 @@ def kill_redis_server(pid_file, ret_val): else: try: pid = int(raw_pid) - except Exception: + except ValueError: # Bad pid value return ReturnCode.kill_ret_code(ReturnCode.KILL_BADPID, ret_val) try: @@ -395,10 +278,10 @@ def main(_prog, cli_params): tool_group = ToolGroup(group) except BadToolGroup as exc: logger.error(str(exc)) - return 1 + return ReturnCode.BADTOOLGROUP except Exception: logger.exception("failed to load tool group data for '%s'", group) - return ReturnCode.BADTOOLGROUP + return ReturnCode.TOOLGROUPEXC else: if not tool_group.hostnames: # If a tool group has no tools registered, then there will be no @@ -411,38 +294,31 @@ def main(_prog, cli_params): # Load the tool metadata try: - inst_dir = PbenchAgentConfig( - os.environ["_PBENCH_AGENT_CONFIG"] - ).pbench_install_dir - except BadConfig as exc: - logger.error("%s", exc) + inst_dir = os.environ["pbench_install_dir"] + except KeyError: + logger.error( + "The required 'pbench_install_dir' environment variable appears to be missing" + ) return ReturnCode.BADAGENTCONFIG - except Exception as exc: + try: + tm_start_path = Path(inst_dir).resolve(strict=True) + except FileNotFoundError: logger.error( - "Unexpected error encountered loading pbench agent configuration: '%s'", exc + "Unable to determine proper installation directory, '%s' not found", + inst_dir, ) - return ReturnCode.EXCAGENTCONFIG + return ReturnCode.MISSINGINSTALLDIR + except Exception as exc: + logger.exception( + "Unexpected error encountered resolving installation directory: '%s'", exc, + ) + return ReturnCode.EXCINSTALLDIR else: try: - tm_start_path = Path(inst_dir).resolve(strict=True) - except FileNotFoundError: - logger.error( - "Unable to determine proper installation directory, '%s' not found", - inst_dir, - ) - return ReturnCode.MISSINGINSTALLDIR - except Exception as exc: - logger.exception( - "Unexpected error encountered resolving installation directory: '%s'", - exc, - ) - return ReturnCode.EXCINSTALLDIR - else: - try: - tool_metadata = ToolMetadata(tm_start_path) - except Exception: - logger.exception("failed to load tool metadata") - return ReturnCode.BADTOOLMETADATA + tool_metadata = ToolMetadata(tm_start_path) + except Exception: + logger.exception("failed to load tool metadata") + return ReturnCode.BADTOOLMETADATA # Load and verify required and optional environment variables. try: @@ -463,7 +339,8 @@ def main(_prog, cli_params): if not full_hostname or not hostname: logger.error( "ERROR - _pbench_hostname ('%s') and _pbench_full_hostname ('%s')" - " environment variables are required", + " environment variables are required to represent the respective" + " hostname strings", hostname, full_hostname, ) @@ -537,33 +414,52 @@ def main(_prog, cli_params): # + # Step 2. 
- Start the Redis Server # - - - # Create the Redis server pbench-specific configuration file - redis_conf = tm_dir / "redis.conf" - params = {"hostnames": hostnames, "tm_dir": tm_dir, "redis_port": redis_port} - try: - with redis_conf.open("w") as fp: - fp.write(redis_conf_tmpl.format(**params)) - except Exception: - logger.exception("failed to create redis server configuration") - return ReturnCode.EXCREDISCONFIG - - # Start the Redis Server itself - redis_srvr = "redis-server" - redis_srvr_path = find_executable(redis_srvr) - redis_pid = tm_dir / f"redis_{redis_port:d}.pid" - logger.debug("2. starting redis server") - try: - retcode = os.spawnl(os.P_WAIT, redis_srvr_path, redis_srvr, redis_conf) - except Exception: - logger.exception("failed to create redis server, daemonized") - return ReturnCode.EXCSPAWNREDIS + if cli_params.redis_server is None: + # Create the Redis server pbench-specific configuration file + redis_conf = tm_dir / "redis.conf" + params = { + "hostnames": hostnames, + "tm_dir": tm_dir, + "redis_port": def_redis_port, + } + try: + with redis_conf.open("w") as fp: + fp.write(redis_conf_tmpl.format(**params)) + except Exception: + logger.exception("failed to create redis server configuration") + return ReturnCode.EXCREDISCONFIG + + # Start the Redis Server itself + redis_srvr = "redis-server" + redis_srvr_path = find_executable(redis_srvr) + redis_pid = tm_dir / f"redis_{def_redis_port:d}.pid" + logger.debug("2. starting redis server") + try: + retcode = os.spawnl(os.P_WAIT, redis_srvr_path, redis_srvr, redis_conf) + except Exception: + logger.exception("failed to create redis server, daemonized") + return ReturnCode.EXCSPAWNREDIS + else: + if retcode != 0: + logger.error( + "failed to create redis server, daemonized; return code: %d", + retcode, + ) + return ReturnCode.REDISFAILED + redis_host = "localhost" + redis_port = def_redis_port else: - if retcode != 0: - logger.error( - "failed to create redis server, daemonized; return code: %d", retcode - ) - return ReturnCode.REDISFAILED + parts = cli_params.redis_server.split(":", 1) + if len(parts) != 2: + logger.error("Bad Redis server specified, '%s'", cli_params.redis_server) + return ReturnCode.BADREDISARG + try: + redis_port = int(parts[1]) + except ValueError: + logger.error("Bad Redis port specified, '%s'", cli_params.redis_server) + return ReturnCode.BADREDISPORT + else: + redis_host = parts[0] # Connect to the Redis Server. # @@ -574,11 +470,11 @@ def main(_prog, cli_params): # listen for responses from the Tool Data Sink. try: to_client_channel = f"{cli_tm_channel_prefix}-{tm_channel_suffix_to_client}" - redis_server = redis.Redis(host="localhost", port=redis_port, db=0) + redis_server = redis.Redis(host=redis_host, port=redis_port, db=0) to_client_chan = RedisChannelSubscriber(redis_server, to_client_channel) except Exception as exc: logger.error( - "Unable to connect to redis server, %s:%d: %r", "localhost", redis_port, exc + "Unable to connect to redis server, %s:%d: %r", redis_host, redis_port, exc ) return kill_redis_server(redis_pid, ReturnCode.REDISCHANFAILED) @@ -601,9 +497,9 @@ def main(_prog, cli_params): controller=_controller, group=group, hostname=host, + label=tool_group.get_label(host), tool_metadata=tool_metadata.getFullData(), tools=tools, - label=tool_group.get_label(host), ) # Create a separate key for the Tool Meister that will be on that host # @@ -624,13 +520,13 @@ def main(_prog, cli_params): # Sink. 
tds_param_key = f"tds-{group}" tds = dict( - channel_prefix=cli_tm_channel_prefix, benchmark_run_dir=str(benchmark_run_dir), bind_hostname=tm_bind_hostname, + channel_prefix=cli_tm_channel_prefix, group=group, + tool_metadata=tool_metadata.getFullData(), tool_trigger=tool_group.trigger, tools=tool_group_data, - tool_metadata=tool_metadata.getFullData(), # The following are optional optional_md=optional_md, ) @@ -646,186 +542,201 @@ def main(_prog, cli_params): # 4. Start the Tool Data Sink process # - - # FIXME: if only one host is registered, and that host is the same as this - # controller, then don't bother starting the Tool Data Sink. - logger.debug("4. starting tool data sink") - try: - pid = os.fork() - if pid == 0: - # In the child! - - # The main() of the Tool Data Sink module will not return here - # since it will daemonize itself and this child pid will be - # replaced by a new pid. - status = tds_main( - [ - PROG.parent / "pbench-tool-data-sink", - "localhost", - str(redis_port), - tds_param_key, - ] - ) - sys.exit(status) - else: - # In the parent! - - # Wait for the child to finish daemonizing itself. - retcode = waitpid(pid) - if retcode != 0: - logger.error( - "failed to create pbench data sink, daemonized; return code: %d", - retcode, - ) - except Exception: - logger.exception("failed to create pbench data sink, daemonized") - return kill_redis_server(redis_pid, ReturnCode.TDSFORKFAILED) - else: - # Wait for logging channel to be up and ready before we start the - # local and remote Tool Meisters. - num_present = 0 - while num_present == 0: - try: - num_present = redis_server.publish( - f"{cli_tm_channel_prefix}-{tm_channel_suffix_to_logging}", - "pbench-tool-meister-start - verify logging channel up", + if cli_params.redis_server is None: + # FIXME: if only one host is registered, and that host is the same as this + # controller, then don't bother starting the Tool Data Sink. + logger.debug("4. starting tool data sink") + try: + pid = os.fork() + if pid == 0: + # In the child! + + # The main() of the Tool Data Sink module will not return here + # since it will daemonize itself and this child pid will be + # replaced by a new pid. + status = tds_main( + [ + PROG.parent / "tool-meister" / "pbench-tool-data-sink", + "localhost", + str(redis_port), + tds_param_key, + "yes", # Request tool-data-sink daemonize itself + ] ) - except Exception: - logger.exception("Failed to verify Tool Data Sink logging sink working") - return kill_redis_server(redis_pid, ReturnCode.TDSLOGPUBFAILED) + sys.exit(status) else: - if num_present == 0: - time.sleep(0.1) + # In the parent! + + # Wait for the child to finish daemonizing itself. + retcode = waitpid(pid) + if retcode != 0: + logger.error( + "failed to create pbench data sink, daemonized; return code: %d", + retcode, + ) + except Exception: + logger.exception("failed to create pbench data sink, daemonized") + return kill_redis_server(redis_pid, ReturnCode.TDSFORKFAILED) + else: + # Wait for logging channel to be up and ready before we start the + # local and remote Tool Meisters. 
+ timeout = time.time() + 60 + num_present = 0 + while num_present == 0: + try: + num_present = redis_server.publish( + f"{cli_tm_channel_prefix}-{tm_channel_suffix_to_logging}", + "pbench-tool-meister-start - verify logging channel up", + ) + except Exception: + logger.exception( + "Failed to verify Tool Data Sink logging sink working" + ) + return kill_redis_server(redis_pid, ReturnCode.TDSLOGPUBFAILED) + else: + if num_present == 0: + if time.time() > timeout: + logger.error( + "The Tool Data Sink failed to start within one minute" + ) + return kill_redis_server( + redis_pid, ReturnCode.TDSSTARTUPTIMEOUT + ) + else: + time.sleep(0.1) # + # 5. Start all the local and remote Tool Meisters # - - failures = 0 - successes = 0 - # NOTE: it is assumed that the location of the pbench-tool-meister command - # is the same on the local host as it is on any remote host. - tool_meister_cmd = PROG.parent / "tool-meister" / "pbench-tool-meister" - ssh_cmd = "ssh" - ssh_path = find_executable(ssh_cmd) - base_args = [ - ssh_cmd, - ] - base_args.extend(shlex.split(ssh_opts)) - args = [ - "", - f"{tool_meister_cmd}-remote", - tm_bind_hostname, - str(redis_port), - "", - "yes", - ] - tms = dict() - tm_count = 0 - for host in tool_group.hostnames.keys(): - tm_count += 1 - tm_param_key = f"tm-{group}-{host}" - if host == full_hostname: - logger.debug("5a. starting localhost tool meister") - try: - pid = os.fork() - if pid == 0: - # In the child! - - # The main() of the Tool Meister module will not return - # here since it will daemonize itself and this child pid - # will be replaced by a new pid. - status = tm_main( - [ - str(tool_meister_cmd), - "localhost", - str(redis_port), - tm_param_key, - "yes", - ] + if cli_params.redis_server is None: + failures = 0 + successes = 0 + # NOTE: it is assumed that the location of the pbench-tool-meister command + # is the same on the local host as it is on any remote host. + tool_meister_cmd = PROG.parent / "tool-meister" / "pbench-tool-meister" + ssh_cmd = "ssh" + ssh_path = find_executable(ssh_cmd) + base_args = [ + ssh_cmd, + ] + base_args.extend(shlex.split(ssh_opts)) + args = [ + "", + f"{tool_meister_cmd}-remote", + tm_bind_hostname, + str(redis_port), + "", + "yes", # Yes, request the tool meister daemonize itself + ] + tms = dict() + tm_count = 0 + for host in tool_group.hostnames.keys(): + tm_count += 1 + tm_param_key = f"tm-{group}-{host}" + if host == full_hostname: + logger.debug("5a. starting localhost tool meister") + try: + pid = os.fork() + if pid == 0: + # In the child! + + # The main() of the Tool Meister module will not return + # here since it will daemonize itself and this child pid + # will be replaced by a new pid. + status = tm_main( + [ + str(tool_meister_cmd), + "localhost", + str(redis_port), + tm_param_key, + "yes", # Yes, daemonize yourself TM ... + ] + ) + sys.exit(status) + else: + # In the parent! + pass + except Exception: + logger.exception( + "failed to create localhost tool meister, daemonized" ) - sys.exit(status) + failures += 1 + tms[host] = {"pid": None, "status": "failed"} else: - # In the parent! - pass - except Exception: - logger.exception("failed to create localhost tool meister, daemonized") - failures += 1 - tms[host] = {"pid": None, "status": "failed"} + # Record the child pid to wait below. + tms[host] = {"pid": pid, "status": "forked"} else: - # Record the child pid to wait below. 
- tms[host] = {"pid": pid, "status": "forked"} - else: - args[0] = host - args[4] = tm_param_key - ssh_args = base_args + args - logger.debug( - "5b. starting remote tool meister, ssh_path=%r ssh_args=%r", - ssh_path, - ssh_args, - ) + args[0] = host + args[4] = tm_param_key + ssh_args = base_args + args + logger.debug( + "5b. starting remote tool meister, ssh_path=%r ssh_args=%r", + ssh_path, + ssh_args, + ) + try: + pid = os.spawnv(os.P_NOWAIT, ssh_path, ssh_args) + except Exception: + logger.exception( + "failed to create a tool meister instance for host %s", host + ) + tms[host] = {"pid": None, "status": "failed"} + else: + # Record the child pid to wait below. + tms[host] = {"pid": pid, "status": "spawned"} + + for host, tm_proc in tms.items(): + if tm_proc["status"] == "failed": + failures += 1 + continue + pid = tm_proc["pid"] try: - pid = os.spawnv(os.P_NOWAIT, ssh_path, ssh_args) + exit_status = waitpid(pid) except Exception: + failures += 1 logger.exception( "failed to create a tool meister instance for host %s", host ) - tms[host] = {"pid": None, "status": "failed"} else: - # Record the child pid to wait below. - tms[host] = {"pid": pid, "status": "spawned"} + if exit_status != 0: + failures += 1 + logger.error( + "failed to start tool meister on remote host '%s'" + " (pid %d), exit status: %d", + host, + pid, + exit_status, + ) + else: + successes += 1 - failures = 0 - for host, tm_proc in tms.items(): - if tm_proc["status"] == "failed": - failures += 1 - continue - pid = tm_proc["pid"] - try: - exit_status = waitpid(pid) - except Exception: - failures += 1 - logger.exception( - "failed to create a tool meister instance for host %s", host - ) - else: - if exit_status != 0: - failures += 1 - logger.error( - "failed to start tool meister on remote host '%s'" - " (pid %d), exit status: %d", - host, - pid, - exit_status, + if failures > 0: + # Don't wait for the Tool Meisters + logger.info("terminating tool meister startup due to failures") + terminate_msg = dict(action="terminate", group=group, directory=None) + try: + ret = redis_server.publish( + f"{cli_tm_channel_prefix}-{tm_channel_suffix_from_client}", + json.dumps(terminate_msg, sort_keys=True), ) + except Exception: + logger.exception("Failed to publish terminate message") else: - successes += 1 + logger.debug("publish('terminate') = %r", ret) + return kill_redis_server(redis_pid, ReturnCode.TMFAILURES) - if failures > 0: - # Don't wait for the Tool Meisters - logger.info("terminating tool meister startup due to failures") - terminate_msg = dict(action="terminate", group=group, directory=None) - try: - ret = redis_server.publish( - f"{cli_tm_channel_prefix}-{tm_channel_suffix_from_client}", - json.dumps(terminate_msg, sort_keys=True), + if successes == 0: + logger.warning( + "unable to successfully start any tool meisters," + " but encountered no failures either: terminating" ) - except Exception: - logger.exception("Failed to publish terminate message") - else: - logger.debug("publish('terminate') = %r", ret) - return kill_redis_server(redis_pid, ReturnCode.TMFAILURES) + return kill_redis_server(redis_pid, ReturnCode.TMNOSUCCESSES) - if successes == 0: - logger.warning( - "unable to successfully start any tool meisters," - " but encountered no failures either: terminating" + assert successes == tm_count, ( + f"Logic Bomb! 
Number of created Tool Meisters, {successes}, does not"
+            f" match the expected number of Tool Meisters, {tm_count}"
+        )
-        return kill_redis_server(redis_pid, ReturnCode.TMNOSUCCESSES)
-
-    assert successes == tm_count, (
-        f"Logic Bomb! Number of created Tool Meisters, {successes}, does not"
-        f" match the expected number of Tool Meisters, {tm_count}"
-    )
     #
+
     # 6. Wait for the TDS to send a message reporting that it, and all the
@@ -835,12 +746,16 @@ def main(_prog, cli_params):
     # If any successes, then we need to wait for them to show up as
     # subscribers.
     logger.debug(
-        "6. waiting for all successfully spawned SSH processes"
+        "6. waiting for all successfully created Tool Meister processes"
         " to show up as subscribers"
     )
     ret_val = wait_for_tds(to_client_chan, logger)
     if ret_val != 0:
-        return kill_redis_server(redis_pid, ReturnCode.TDSWAITFAILURE)
+        if cli_params.redis_server is None:
+            # We created the Redis server, so we should clean it up.
+            return kill_redis_server(redis_pid, ReturnCode.TDSWAITFAILURE)
+        else:
+            return ReturnCode.TDSWAITFAILURE
 
     # Setup a Client API object using our existing to_client_chan object to
     # drive the following client operations ("sysinfo" [optional] and "init"
@@ -856,34 +771,29 @@ def main(_prog, cli_params):
         try:
             sysinfo_path.mkdir(parents=True)
         except Exception:
-            logger.error(
-                "Unable to create sysinfo-dump directory base path: {}",
-                sysinfo_path,
+            error_log(
+                f"Unable to create sysinfo-dump directory base path: {sysinfo_path}"
             )
-            ret_val = ReturnCode.EXCSYSINFODIR
         else:
             logger.debug("7. Collecting system information")
             info_log("Collecting system information")
-            ret_val = client.publish(group, sysinfo_path, "sysinfo", sysinfo)
-            ret_val = (
-                ReturnCode.SUCCESS if ret_val == 0 else ReturnCode.SYSINFOFAILED
-            )
+            # Collecting system information is optional, so we don't gate
+            # the success or failure of the startup on it.
+            client.publish(group, sysinfo_path, "sysinfo", sysinfo)
 
-    if ret_val == ReturnCode.SUCCESS:
-        tool_dir = benchmark_run_dir / f"tools-{group}"
-        try:
-            tool_dir.mkdir(exist_ok=True)
-        except Exception as exc:
-            logger.error(
-                'failed to create tool output directory, "{}": {}', tool_dir, exc
-            )
-            ret_val = ReturnCode.EXCTOOLGROUPDIR
-        else:
-            logger.debug("8. Initialize persistent tools")
-            ret_val = client.publish(group, tool_dir, "init", None)
-            if ret_val != ReturnCode.SUCCESS:
+    tool_dir = benchmark_run_dir / f"tools-{group}"
+    try:
+        tool_dir.mkdir(exist_ok=True)
+    except Exception as exc:
+        error_log(f"failed to create tool output directory, '{tool_dir}': {exc}")
+        return ReturnCode.EXCTOOLGROUPDIR
+    else:
+        logger.debug("8. Initialize persistent tools")
+        ret_val = client.publish(group, tool_dir, "init", None)
+        if ret_val != 0:
+            if cli_params.redis_server is None:
+                # We created the Redis server, so we should clean it up.
                 ret_val = kill_redis_server(redis_pid, ReturnCode.INITFAILED)
-
     return ret_val
 
@@ -900,7 +810,17 @@ if __name__ == "__main__":
         help="The list of system information items to be collected.",
    )
    parser.add_argument(
-        "tool_group", help="The tool group of items to be run by the Tool Meisters."
+        "--redis-server",
+        dest="redis_server",
+        default=os.environ.get("PBENCH_REDIS_SERVER", None),
+        help=(
+            "Use an existing Redis server specified by <host>:<port>;"
+            " implies an existing Tool Data Sink and Tool Meisters as well."
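+            # (An illustrative value is "localhost:17001"; 17001 is the
+            # default def_redis_port from pbench.agent.constants.)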
+        ),
+    )
+    parser.add_argument(
+        "tool_group",
+        help="The tool group name of tools to be run by the Tool Meisters.",
     )
     parsed = parser.parse_args()
     status = main(sys.argv[0], parsed)
diff --git a/agent/util-scripts/pbench-tool-meister-stop b/agent/util-scripts/pbench-tool-meister-stop
index 6a68603dbe..ce4f9b62b9 100755
--- a/agent/util-scripts/pbench-tool-meister-stop
+++ b/agent/util-scripts/pbench-tool-meister-stop
@@ -19,14 +19,10 @@ import time
 from argparse import ArgumentParser
 from pathlib import Path
 
-from pbench.agent.constants import redis_port, cli_tm_channel_prefix
+from pbench.agent.constants import def_redis_port, cli_tm_channel_prefix
+from pbench.agent.tool_group import BadToolGroup, ToolGroup
 from pbench.agent.tool_meister_client import Client
-from pbench.agent.utils import (
-    cli_verify_sysinfo,
-    info_log,
-    verify_tool_group,
-    BadToolGroup,
-)
+from pbench.agent.utils import cli_verify_sysinfo, error_log, info_log
 
 
 def is_running(pid):
@@ -42,6 +38,87 @@
     return True
 
 
+def wait_for_pid(pid):
+    """wait_for_pid - wait for a process to actually stop running.
+    """
+    while is_running(pid):
+        time.sleep(0.1)
+
+
+def graceful_shutdown(
+    benchmark_run_dir, full_hostname, group, redis_server_pid_file, logger
+):
+    # The assumption/assertion here is that the tool meister "stop" command is
+    # run on the same node as the tool meister "start" command ran, creating
+    # the local Tool Data Sink and the optional local Tool Meister. We want to
+    # make sure anything "local" to this stop command is shut down gracefully
+    # before we report back to the user. If Tool Meisters from remote nodes
+    # have already reported that they have received the "terminate" message,
+    # then we trust they will shut down gracefully themselves.
+    try:
+        tds_pid_file = benchmark_run_dir / "tm" / "pbench-tool-data-sink.pid"
+        try:
+            pid_str = tds_pid_file.read_text()
+        except OSError as exc:
+            if exc.errno != errno.ENOENT:
+                raise
+        else:
+            tds_pid = int(pid_str)
+            logger.debug("waiting for tool-data-sink (%d) to exit", tds_pid)
+            wait_for_pid(tds_pid)
+    except Exception:
+        logger.exception("Exception encountered waiting for tool-data-sink")
+        ret_val = 1
+    else:
+        ret_val = 0
+
+    try:
+        ltm_pid_file = benchmark_run_dir / "tm" / f"tm-{group}-{full_hostname}.pid"
+        try:
+            pid_str = ltm_pid_file.read_text()
+        except OSError as exc:
+            if exc.errno != errno.ENOENT:
+                raise
+        else:
+            ltm_pid = int(pid_str)
+            logger.debug("waiting for local tool-meister (%d) to exit", ltm_pid)
+            wait_for_pid(ltm_pid)
+    except Exception:
+        logger.exception("Exception encountered waiting for local tool-meister")
+        ret_val = 1
+
+    # All was good so far, so we can terminate the redis server.
+    try:
+        try:
+            pid_str = redis_server_pid_file.read_text()
+        except OSError as exc:
+            if exc.errno != errno.ENOENT:
+                raise
+        else:
+            redis_server_pid = int(pid_str)
+            pid_exists = True
+            timeout = time.time() + 60
+            while pid_exists:
+                try:
+                    os.kill(redis_server_pid, signal.SIGTERM)
+                except ProcessLookupError:
+                    pid_exists = False
+                else:
+                    if time.time() > timeout:
+                        try:
+                            os.kill(redis_server_pid, signal.SIGKILL)
+                        except ProcessLookupError:
+                            pid_exists = False
+                        except Exception:
+                            raise
+                    time.sleep(0.1)
+    except Exception:
+        logger.exception("Exception encountered terminating Redis server")
+        ret_val = 1
+
+    return ret_val
+
+
 def main(_prog, cli_params):
     """Main program for the tool meister stop CLI interface.
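(A condensed sketch of the Redis-server termination pattern used by
graceful_shutdown() above: keep sending SIGTERM until the process is gone,
escalating to SIGKILL once a deadline passes. The helper name is
illustrative; the 0.1s poll and 60s timeout mirror the code above.)

    import os
    import signal
    import time

    def terminate_with_timeout(pid, timeout_secs=60):
        deadline = time.time() + timeout_secs
        while True:
            # Prefer a graceful SIGTERM; escalate after the deadline passes.
            sig = signal.SIGTERM if time.time() <= deadline else signal.SIGKILL
            try:
                os.kill(pid, sig)
            except ProcessLookupError:
                return  # the process has exited
            time.sleep(0.1)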
@@ -81,7 +158,7 @@ def main(_prog, cli_params): logger.addHandler(sh) try: - verify_tool_group(cli_params.tool_group) + ToolGroup.verify_tool_group(cli_params.tool_group) except BadToolGroup as exc: logger.error(str(exc)) return 1 @@ -103,20 +180,38 @@ def main(_prog, cli_params): full_hostname = os.environ["_pbench_full_hostname"] benchmark_run_dir = Path(os.environ["benchmark_run_dir"]).resolve(strict=True) except Exception: - logger.exception("failed to fetch parameters from the environment") + logger.exception("failed to fetch required parameters from the environment") return 1 - try: - redis_server_pid_file = ( - benchmark_run_dir / "tm" / f"redis_{redis_port:d}.pid" - ).resolve(strict=True) - except FileNotFoundError: - # No Redis server, nothing to do. - return 0 + if cli_params.redis_server is None: + # No Redis server was given, so look locally to see if we can find it. + # If no Redis server locally, we're done. + try: + redis_server_pid_file = ( + benchmark_run_dir / "tm" / f"redis_{def_redis_port:d}.pid" + ).resolve(strict=True) + except FileNotFoundError: + # No Redis server, nothing to do. + return 0 + else: + redis_host = "localhost" + redis_port = def_redis_port + else: + parts = cli_params.redis_server.split(":", 1) + if len(parts) != 2: + logger.error("Bad Redis server specified, '%s'", cli_params.redis_server) + return 1 + try: + redis_port = int(parts[1]) + except Exception: + logger.error("Bad Redis port specified, '%s'", cli_params.redis_server) + return 1 + else: + redis_host = parts[0] # The Redis server is always running on the local host with the CLI. with Client( - redis_host="localhost", + redis_host=redis_host, redis_port=redis_port, channel_prefix=cli_tm_channel_prefix, logger=logger, @@ -126,23 +221,19 @@ def main(_prog, cli_params): try: tool_dir.mkdir(exist_ok=True) except Exception as exc: - logger.error( - 'failed to create tool output directory, "{}": {}', tool_dir, exc - ) + error_log(f"failed to create tool output directory, '{tool_dir}': {exc}") end_ret_val = 1 else: end_ret_val = client.publish(group, tool_dir, "end", None) - - # Next we collect the system configuration information, but only if the - # "end" operation was successful, and if it was requested. + # Next we collect the system configuration information only if we were + # successfully able to end the persistent tools run. if end_ret_val == 0 and sysinfo: sysinfo_path = benchmark_run_dir / "sysinfo" / "end" try: sysinfo_path.mkdir(parents=True) except Exception: - logger.error( - "Unable to create sysinfo-dump directory base path: {}", - sysinfo_path, + error_log( + f"Unable to create sysinfo-dump directory base path: {sysinfo_path}", ) else: logger.info("Collecting system information") @@ -163,65 +254,19 @@ def main(_prog, cli_params): # just return the success/failure of the terminate operation. ret_val = end_ret_val if end_ret_val != 0 else term_ret_val - # The assumption/assertion here is that the tool meister "stop" command is - # run on the same node as the tool meister "start" command ran, creating - # the local Tool Data Sink and the optional local Tool Meister. We want to - # make sure anything "local" to this stop command is shut down gracefully - # before we report back to the user. If Tool Meisters from remote nodes - # have already reported that they have received the "terminate" message, - # then we trust they will shutdown gracefully themselves. 
-    try:
-        tds_pid_file = benchmark_run_dir / "tm" / "pbench-tool-data-sink.pid"
-        try:
-            pid_str = tds_pid_file.read_text()
-        except OSError as exc:
-            if exc.errno != errno.ENOENT:
-                raise
-        else:
-            tds_pid = int(pid_str)
-            logger.debug("waiting for tool-data-sink (%d) to exit", tds_pid)
-            while is_running(tds_pid):
-                time.sleep(0.1)
-    except Exception:
-        logger.exception("Exception encountered waiting for tool-data-sink")
-        ret_val = 1
-
-    try:
-        ltm_pid_file = benchmark_run_dir / "tm" / f"tm-{group}-{full_hostname}.pid"
-        try:
-            pid_str = ltm_pid_file.read_text()
-        except OSError as exc:
-            if exc.errno != errno.ENOENT:
-                raise
-        else:
-            ltm_pid = int(pid_str)
-            logger.debug("waiting for local tool-meister (%d) to exit", ltm_pid)
-            while is_running(ltm_pid):
-                time.sleep(0.1)
-    except Exception:
-        logger.exception("Exception encountered waiting for local tool-meister")
-        ret_val = 1
-
-    # All was good so far, so we can terminate the Redis server.
-    try:
-        try:
-            pid_str = redis_server_pid_file.read_text()
-        except OSError as exc:
-            if exc.errno != errno.ENOENT:
-                raise
-        else:
-            redis_server_pid = int(pid_str)
-            pid_exists = True
-            while pid_exists:
-                try:
-                    os.kill(redis_server_pid, signal.SIGTERM)
-                except ProcessLookupError:
-                    pid_exists = False
-                else:
-                    time.sleep(0.1)
-    except Exception:
-        logger.exception("Exception encountered terminating Redis server")
-        ret_val = 1
+    if cli_params.redis_server is None:
+        # The client operations have finished, successfully or unsuccessfully,
+        # and we were not given an explicit Redis server to use. So the
+        # previous pbench-tool-meister-start must have set up the local Tool
+        # Data Sink, Tool Meister (if registered), and the Redis server. It is
+        # our responsibility to make sure these processes shut down correctly.
+        shutdown_ret_val = graceful_shutdown(
+            benchmark_run_dir, full_hostname, group, redis_server_pid_file, logger
+        )
+        if ret_val == 0:
+            # If client termination was successful, report the status of the
+            # graceful shutdown of the Tool Data Sink and the Redis server.
+            ret_val = shutdown_ret_val
 
     return ret_val
 
@@ -243,7 +288,18 @@
         help="Whether or not the stop operation is in response to an interrupt.",
     )
     parser.add_argument(
-        "tool_group", help="The tool group of items being run in the Tool Meisters."
+        "--redis-server",
+        dest="redis_server",
+        default=os.environ.get("PBENCH_REDIS_SERVER", None),
+        help=(
+            "Use an existing Redis server specified by <host>:<port>;"
+            " implies the use of an existing Tool Data Sink and Tool Meisters"
+            " as well."
+        ),
+    )
+    parser.add_argument(
+        "tool_group",
+        help="The tool group name of tools being run in the Tool Meisters.",
     )
     parsed = parser.parse_args()
     status = main(sys.argv[0], parsed)
diff --git a/agent/util-scripts/test-bin/samples/scripts/dcgm_prometheus.py b/agent/util-scripts/test-bin/dcgm-exporter
similarity index 100%
rename from agent/util-scripts/test-bin/samples/scripts/dcgm_prometheus.py
rename to agent/util-scripts/test-bin/dcgm-exporter
diff --git a/agent/util-scripts/tool-meister/pbench-tool-data-sink b/agent/util-scripts/tool-meister/pbench-tool-data-sink
new file mode 100755
index 0000000000..985f830799
--- /dev/null
+++ b/agent/util-scripts/tool-meister/pbench-tool-data-sink
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+"""Simple command-line wrapper to keep the tool data sink from being in the
+CLI command set, while still allowing it to be invoked by the container entry
+point.
+""" + +import sys + +from pbench.agent.tool_data_sink import main + + +status = main(sys.argv) +sys.exit(status) diff --git a/agent/util-scripts/tool-meister/tool-data-sink-ep b/agent/util-scripts/tool-meister/tool-data-sink-ep new file mode 100755 index 0000000000..47d02d65a5 --- /dev/null +++ b/agent/util-scripts/tool-meister/tool-data-sink-ep @@ -0,0 +1,8 @@ +#!/bin/bash + +_dir="$(dirname ${0})" + +source /etc/profile.d/pbench-agent.sh +source /opt/pbench-agent/base +# Instruct the Tool Data Sink not to daemonize. +/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-data-sink "${REDIS_HOST}" "${REDIS_PORT}" "${PARAM_KEY}" no diff --git a/agent/util-scripts/tool-meister/tool-meister-ep b/agent/util-scripts/tool-meister/tool-meister-ep new file mode 100755 index 0000000000..dce85d64b9 --- /dev/null +++ b/agent/util-scripts/tool-meister/tool-meister-ep @@ -0,0 +1,8 @@ +#!/bin/bash + +_dir="$(dirname ${0})" + +source /etc/profile.d/pbench-agent.sh +source /opt/pbench-agent/base +# Instruct the Tool Meister not to daemonize. +/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister "${REDIS_HOST}" "${REDIS_PORT}" "${PARAM_KEY}" no diff --git a/agent/util-scripts/unittests b/agent/util-scripts/unittests index 66682e7e20..4333c5c425 100755 --- a/agent/util-scripts/unittests +++ b/agent/util-scripts/unittests @@ -528,7 +528,7 @@ function sort_log_file { } function sort_tdslog { - sort_log_file ${_testdir}/mock-run/tm/pbench-tool-data-sink.log + sort_log_file ${_testdir}/mock-run/tm/pbench-tool-data-sink.err } function sort_tmlogs { diff --git a/lib/pbench/agent/base.py b/lib/pbench/agent/base.py index 30c6dfd96b..d2105c5b04 100644 --- a/lib/pbench/agent/base.py +++ b/lib/pbench/agent/base.py @@ -8,6 +8,7 @@ import click from pbench.agent import PbenchAgentConfig +from pbench.agent.tool_group import ToolGroup, BadToolGroup from pbench.agent.utils import setup_logging @@ -108,15 +109,16 @@ def get_path(self, path): def verify_tool_group(self, group): """Ensure we have a tools group directory to work with""" - self.tool_group_dir = self.pbench_run / f"tools-v1-{group}" - if not self.tool_group_dir.exists(): - click.secho( - f'\t{self.name}: invalid --group option ("{group}"), directory not found: {self.tool_group_dir}' - ) + try: + self.tool_group_dir = self.gen_tools_group_dir(group) + except BadToolGroup as exc: + click.echo(f'\t{self.name}: invalid --group option ("{group}"), {exc}') ctxt = click.get_current_context() click.echo(ctxt.get_help()) - return 1 - return 0 + ret_code = 1 + else: + ret_code = 0 + return ret_code def gen_tools_group_dir(self, group): - return self.pbench_run / f"tools-v1-{group}" + return ToolGroup.verify_tool_group(group, pbench_run=self.pbench_run) diff --git a/lib/pbench/agent/constants.py b/lib/pbench/agent/constants.py index a1fde65974..eea0e1eb63 100644 --- a/lib/pbench/agent/constants.py +++ b/lib/pbench/agent/constants.py @@ -2,10 +2,10 @@ """ # Default Redis server port number used is "One Tool" in hex 0x17001 -redis_port = 17001 +def_redis_port = 17001 # Default port number used for the Tool Data Sink -tds_port = 8080 +def_tds_port = 8080 # The amount of time a TM tries to publish its setup message. 
 TDS_RETRY_PERIOD_SECS = 60
diff --git a/lib/pbench/agent/redis.py b/lib/pbench/agent/redis.py
index 86e11a13c9..28388aca17 100644
--- a/lib/pbench/agent/redis.py
+++ b/lib/pbench/agent/redis.py
@@ -220,3 +220,41 @@ def emit(self, record):
                 self.dropped += 1
         finally:
             self.counter += 1
+
+
+def wait_for_conn_and_key(redis_server, key, prog, redis_host, redis_port):
+    """wait_for_conn_and_key - convenience function used by both the Tool
+    Meister and the Tool Data Sink at startup to wait for an initial
+    connection to the Redis server, and for the expected key to show up.
+    """
+    # Loop waiting for the key to show up.
+    connected = None
+    payload = None
+    while payload is None:
+        try:
+            payload = redis_server.get(key)
+        except redis.ConnectionError:
+            if connected is None:
+                print(
+                    f"{prog}: waiting to connect to redis server {redis_host}:{redis_port}",
+                    flush=True,
+                )
+                connected = False
+            elif connected:
+                print(
+                    f"{prog}: disconnected from redis server {redis_host}:{redis_port}",
+                    flush=True,
+                )
+                connected = False
+            time.sleep(1)
+        else:
+            if not connected:
+                print(
+                    f"{prog}: connected to redis server {redis_host}:{redis_port}",
+                    flush=True,
+                )
+                connected = True
+            if payload is None:
+                print(f'{prog}: key, "{key}" does not exist yet', flush=True)
+                time.sleep(1)
+    return payload.decode("utf-8")
diff --git a/lib/pbench/agent/tool_data_sink.py b/lib/pbench/agent/tool_data_sink.py
index f1d9f93970..09db6257bf 100644
--- a/lib/pbench/agent/tool_data_sink.py
+++ b/lib/pbench/agent/tool_data_sink.py
@@ -18,7 +18,6 @@ import subprocess
 import sys
 import tempfile
-import time
 from configparser import ConfigParser, DuplicateSectionError
 from datetime import datetime
@@ -29,14 +28,14 @@
 from threading import Thread, Lock, Condition
 from wsgiref.simple_server import WSGIRequestHandler, make_server
 
-import daemon
 import pidfile
 import redis
 
 from bottle import Bottle, ServerAdapter, request, abort
+from daemon import DaemonContext
 
 from pbench.agent.constants import (
-    tds_port,
+    def_tds_port,
     tm_allowed_actions,
     tm_channel_suffix_from_client,
     tm_channel_suffix_from_tms,
@@ -44,26 +43,27 @@
     tm_channel_suffix_to_logging,
     tm_channel_suffix_to_tms,
 )
-from pbench.agent.redis import RedisChannelSubscriber
+from pbench.agent.redis import RedisChannelSubscriber, wait_for_conn_and_key
 from pbench.agent.toolmetadata import ToolMetadata
 from pbench.agent.utils import collect_local_info
 
+# Logging format string for unit tests
+fmtstr_ut = "%(levelname)s %(name)s %(funcName)s -- %(message)s"
+fmtstr = "%(asctime)s %(levelname)s %(process)s %(thread)s %(name)s %(funcName)s %(lineno)d -- %(message)s"
+
+
 # Read in 64 KB chunks off the wire for HTTP PUT requests.
 _BUFFER_SIZE = 65536
 
 # Maximum size of the tar ball for collected tool data.
 _MAX_TOOL_DATA_SIZE = 2 ** 30
 
-# Executable path of the tar and cp programs.
-tar_path = None
-cp_path = None
-
 
 def _now(when):
-    """_now - An ugly hack to facility testing without the ability to mock.
+    """_now - An ugly hack to facilitate testing without the ability to mock.
 
-    Instead of directly calling `datatime.utcnow().isoformat()`, each call
+    Instead of directly calling `datetime.utcnow().isoformat()`, each call
     site invokes this method with an argument only used during unit testing
     to determine the expected behavior. This allows us to provide a "start"
     time that is one microsecond less than the "end" time.
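(Illustrative use of the wait_for_conn_and_key() helper added above, modeled
on how the Tool Data Sink and Tool Meister mains invoke it; the host, port,
and parameter key shown here are hypothetical:)

    import redis
    from pbench.agent.redis import wait_for_conn_and_key

    redis_server = redis.Redis(host="localhost", port=17001, db=0)
    # Blocks, printing progress, until the server is reachable and the
    # parameter key has been written by pbench-tool-meister-start.
    params_str = wait_for_conn_and_key(
        redis_server, "tds-default", "pbench-tool-data-sink", "localhost", 17001
    )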
@@ -76,9 +76,9 @@ def _now(when):
 
 
 class DataSinkWsgiServer(ServerAdapter):
-    """DataSinkWsgiServer - an re-implementation of Bottle's WSGIRefServer
+    """DataSinkWsgiServer - a re-implementation of Bottle's WSGIRefServer
     where we have access to the underlying WSGIServer instance in order to
-    invoke it's stop() method, and we also provide an WSGIReqeustHandler with
+    invoke its stop() method, and we also provide a WSGIRequestHandler with
     an opinionated logging implementation.
     """
 
@@ -124,25 +124,78 @@ def log_request(self, code="-", size="-"):
         self.options["handler_class"] = DataSinkWsgiRequestHandler
         self._server = None
+        self._err_code = None
+        self._err_text = None
         self._lock = Lock()
         self._cv = Condition(lock=self._lock)
         self._logger = logger
 
-    def run(self, app):
-        assert self._server is None, "'run' method called twice"
-        self._logger.debug("Making tool data sink WSGI server ...")
-        server = make_server(self.host, self.port, app, **self.options)
+    def _do_notify(self, text=None, code=0, server=None):
+        """_do_notify - simple helper method to encapsulate method of notification.
+        """
         with self._lock:
+            self._err_text = text
+            self._err_code = code
             self._server = server
             self._cv.notify()
-        self._logger.debug("Running tool data sink WSGI server ...")
-        self._server.serve_forever()
 
-    def stop(self):
+    def run(self, app):
+        """run - Start the WSGI server, called by the Bottle framework.
+
+        Intended to be run as a separate thread.
+
+        We record the outcome of the `make_server` call for success or failure
+        and notify anybody waiting for this thread to succeed.
+        """
+        assert self._server is None, "'run' method called twice"
+        self._logger.debug("Making tool data sink WSGI server ...")
+        try:
+            server = make_server(self.host, self.port, app, **self.options)
+        except OSError as exc:
+            assert exc.errno != 0, "Logic bomb! OSError exception with no errno value"
+            self._do_notify(str(exc), exc.errno)
+            raise
+        except Exception as exc:
+            self._logger.exception("Unexpected error in WSGI server")
+            self._do_notify(str(exc), -1)
+            raise
+        else:
+            self._logger.debug("Successfully created WSGI server")
+            self._do_notify(server=server)
+            self._logger.debug("Running tool data sink WSGI server ...")
+            server.serve_forever()
+
+    def wait(self):
+        """ wait - wait for the WSGI thread executing the `run` method to start
+        running and successfully create a WSGI server object, or fail trying.
+
+        Returns a tuple of the error text and the error code set by the run()
+        method attempting to create the WSGI server. The error code will be
+        0 on success, an Errno value, or -1 if an unexpected exception was
+        raised.
+        """
         with self._lock:
-            while self._server is None:
+            while self._err_code is None:
                 self._cv.wait()
-        self._server.shutdown()
+        return self._err_text, self._err_code
+
+    def stop(self):
+        """ stop - stop the running WSGI server via the shutdown() method of
+        the WSGI server object.
+        """
+        # We have to wait for the thread to start the server and fill in the
+        # server object first.
+        self.wait()
+        if self._err_code == 0:
+            self._server.shutdown()
+
+
+class ToolDataSinkError(Exception):
+    """ToolDataSinkError - generic exception class for Tool Data Sink related
+    exceptions.
+    """
+
+    pass
 
 
 class BaseCollector:
@@ -159,20 +212,25 @@ def __init__(
         self,
         benchmark_run_dir,
         tool_group,
         host_tools_dict,
         tool_metadata,
+        tar_path,
         logger,
     ):
         """Constructor - responsible for recording the arguments, and creating
         the Environment() for template rendering.
""" self.templates_path = pbench_bin / "templates" + assert ( + self.templates_path.is_dir() + ), f"Logic bomb! {self.templates_path} does not exist as a directory" self.benchmark_run_dir = benchmark_run_dir self.tool_group = tool_group self.host_tools_dict = host_tools_dict self.tool_metadata = tool_metadata + self.tar_path = tar_path self.logger = logger self.run = [] - self.tool_group_dir = self.benchmark_run_dir / f"tools-{self.tool_group}" + self.tool_group_dir = self.benchmark_run_dir.local / f"tools-{self.tool_group}" self.tool_dir = self.tool_group_dir / self.name self.template_dir = Environment( autoescape=False, @@ -253,7 +311,7 @@ def terminate(self): if sts != 0: self.logger.warning("Collector process terminated with %d", sts) if errors > 0: - raise Exception("Failed to terminate all the collector processes") + raise ToolDataSinkError("Failed to terminate all the collector processes") class PromCollector(BaseCollector): @@ -268,7 +326,7 @@ def __init__(self, *args, **kwargs): """ self.prometheus_path = find_executable("prometheus") if self.prometheus_path is None: - raise Exception("External 'prometheus' executable not found") + raise ToolDataSinkError("External 'prometheus' executable not found") super().__init__(*args, **kwargs) self.tool_context = [] @@ -279,7 +337,9 @@ def __init__(self, *args, **kwargs): dict(hostname=f"{host}_{tool}", hostport=f"{host}:{port}") ) if not self.tool_context: - raise Exception("Expected prometheus persistent tool context not found") + raise ToolDataSinkError( + "Expected prometheus persistent tool context not found" + ) def launch(self): """launch - creates the YAML file that directs Prometheus's behavior, @@ -329,7 +389,7 @@ def terminate(self): self.logger.debug("Prometheus terminated") args = [ - tar_path, + self.tar_path, "--remove-files", "-Jcf", f"{self.tool_group_dir}/prometheus_data.tar.xz", @@ -537,7 +597,7 @@ def terminate(self): self.logger.debug("Pmproxy and pmlogger(s) terminated") args = [ - tar_path, + self.tar_path, "--remove-files", "-Jcf", f"{self.tool_group_dir}/pcp_data.tar.xz", @@ -550,6 +610,125 @@ def terminate(self): self.logger.warning("Failed to tar up pmlogger data: %r", args) +class BenchmarkRunDir: + """BenchmarkRunDir - helper class for handling the benchmark_run_dir + directory Redis parameter vs the actual "local" benchmark run directory. + + It is a requirement of the Tool Meister sub-system that the ${pbench_run} + directory is always a prefix of the ${benchmark_run_dir}. + + When the pbench CLI starts the Tool Data Sink directly, the local + benchmark run directory is the same as the value of the benchmark_run_dir + parameter. + + But when the Tool Data Sink runs in a container, the path to the benchmark + run directory inside the container might be different from the parameter + value because the mount point for the external file system has a different + path inside the container. Typically, the container is constructed with + the default pbench installation, where the ${pbench_run} directory is + "/var/lib/pbench-agent". + + The entity responsible for starting the Tool Data Sink container typically + mounts a different directory for /var/lib/pbench-agent via 'podman run + --volume /srv/data/pbench-run-dir:/var/lib/pbench-agent:Z'. This leads to + a conflict where the external ${pbench_run} path is different from the + internal-to-the-container ${pbench_run} path. 
To resolve this, the entity
+    which creates the external pbench run directory creates a ".path" file in
+    that directory containing the full "external" path to the pbench run
+    directory. The Tool Data Sink uses that path to validate that external
+    benchmark_run_dir parameter values are valid.
+
+    This class implements the mechanism that allows the Tool Data Sink code to
+    handle that seamlessly.
+    """
+
+    class Exists(Exception):
+        pass
+
+    class Prefix(Exception):
+        pass
+
+    def __init__(self, ext_benchmark_run_dir, int_pbench_run):
+        self._ext_benchmark_run_dir = Path(ext_benchmark_run_dir)
+        self._ext_pbench_run = self._ext_benchmark_run_dir.parent
+        self._int_pbench_run = Path(int_pbench_run)
+
+        # The Tool Data Sink could be running in a container. If so, then
+        # it'll be using the default benchmark run directory. If the
+        # benchmark_run_dir parameter is valid, there will be a file
+        # called ".path" in the default benchmark run directory which will
+        # match.
+        #
+        # E.g.:
+        #   $ pbench_run="/home/<user>/run-dir"
+        #   $ benchmark_run_dir="${pbench_run}/script_config_<date>"
+        #   $ cat ${pbench_run}/.path
+        #   /home/<user>/run-dir
+        #   $ podman run --volume ${pbench_run}:/var/lib/pbench-agent \
+        #       pbench-agent-tool-data-sink bash
+        #   [ abcdefg /]$ cat /var/lib/pbench-agent/.path
+        #   /home/<user>/run-dir
+        try:
+            benchmark_run_dir_lcl = self._ext_benchmark_run_dir.resolve(strict=True)
+        except Exception:
+            # Might be in a container; let's first construct the
+            # internal-to-the-container benchmark run directory.
+            benchmark_run_dir_lcl = (
+                self._int_pbench_run / self._ext_benchmark_run_dir.name
+            )
+            dot_path = self._int_pbench_run / ".path"
+            try:
+                dot_path_contents = dot_path.read_text().strip()
+            except Exception as exc:
+                # Failed to read ".path" contents, give up.
+                raise ToolDataSinkError(
+                    f"Run directory parameter, '{ext_benchmark_run_dir}', must"
+                    f" be an existing directory ('{self._ext_pbench_run}/"
+                    f".path' not found, '{exc}').",
+                )
+            else:
+                if dot_path_contents != str(self._ext_pbench_run):
+                    raise ToolDataSinkError(
+                        f"Run directory parameter, '{ext_benchmark_run_dir}',"
+                        " must be an existing directory (.path contents"
+                        f" mismatch, .path='{dot_path_contents}' !="
+                        f" '{self._ext_pbench_run}').",
+                    )
+        else:
+            # We can access the benchmark_run_dir directly, no need to
+            # consider contents of ".path" file.
+            pass
+        if not benchmark_run_dir_lcl.is_dir():
+            raise ToolDataSinkError(
+                f"Run directory parameter, '{ext_benchmark_run_dir}', must be"
+                " a real directory.",
+            )
+        self.local = benchmark_run_dir_lcl
+
+    def __str__(self):
+        """__str__ - the string representation of a BenchmarkRunDir object is
+        the original external benchmark run directory string.
+        """
+        return str(self._ext_benchmark_run_dir)
+
+    def validate(self, directory):
+        """validate - check that an external directory has a prefix of the external
+        benchmark run directory.
+        """
+        directory_p = Path(directory)
+        try:
+            # Check that "directory" has a prefix of the external run directory.
+            rel_path = directory_p.relative_to(self._ext_benchmark_run_dir)
+        except ValueError:
+            raise self.Prefix()
+        local_dir = self.local / rel_path
+        if not local_dir.is_dir():
+            # The internal benchmark run directory does not have the same
+            # sub-directory hierarchy.
+            raise self.Exists()
+        return local_dir
+
+
 class ToolDataSink(Bottle):
     """ToolDataSink - sub-class of Bottle representing state for tracking
     data sent from tool meisters via an HTTP PUT method.
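(Illustrative use of the BenchmarkRunDir class above, with hypothetical
paths; this mirrors the unit tests added at the end of this change:)

    # Maps the "external" run directory to its local counterpart, consulting
    # the ${pbench_run}/.path file when running inside a container.
    brd = BenchmarkRunDir(
        "/home/user/run-dir/script_config_ts", "/var/lib/pbench-agent"
    )
    # validate() raises brd.Prefix when the argument is not under the run
    # directory, and brd.Exists when the local counterpart is missing.
    local_dir = brd.validate("/home/user/run-dir/script_config_ts/tools-default")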
@@ -558,20 +737,41 @@ class ToolDataSink(Bottle):
     # The list of actions where we expect Tool Meisters to send data to us.
     _data_actions = frozenset(("send", "sysinfo"))
 
+    @staticmethod
+    def fetch_params(params, pbench_run):
+        try:
+            _benchmark_run_dir = params["benchmark_run_dir"]
+            bind_hostname = params["bind_hostname"]
+            channel_prefix = params["channel_prefix"]
+            tool_group = params["group"]
+            tool_metadata = ToolMetadata.tool_md_from_dict(params["tool_metadata"])
+            tool_trigger = params["tool_trigger"]
+            tools = params["tools"]
+        except KeyError as exc:
+            raise ToolDataSinkError(f"Invalid parameter block, missing key {exc}")
+        else:
+            benchmark_run_dir = BenchmarkRunDir(_benchmark_run_dir, pbench_run)
+            return (
+                benchmark_run_dir,
+                bind_hostname,
+                channel_prefix,
+                tool_group,
+                tool_metadata,
+                tool_trigger,
+                tools,
+            )
+
     def __init__(
         self,
         pbench_bin,
+        pbench_run,
         hostname,
-        bind_hostname,
+        tar_path,
+        cp_path,
         redis_server,
         redis_host,
         redis_port,
-        channel_prefix,
-        benchmark_run_dir,
-        tool_group,
-        tool_trigger,
-        tools,
-        tool_metadata,
+        params,
         optional_md,
         logger,
     ):
@@ -583,16 +783,21 @@ def __init__(
         # Save external state
         self.pbench_bin = pbench_bin
         self.hostname = hostname
-        self.bind_hostname = bind_hostname
+        self.tar_path = tar_path
+        self.cp_path = cp_path
         self.redis_server = redis_server
         self.redis_host = redis_host
         self.redis_port = redis_port
-        self.channel_prefix = channel_prefix
-        self.benchmark_run_dir = benchmark_run_dir
-        self.tool_group = tool_group
-        self.tool_trigger = tool_trigger
-        self.tools = tools
-        self.tool_metadata = tool_metadata
+        ret_val = self.fetch_params(params, pbench_run)
+        (
+            self.benchmark_run_dir,
+            self.bind_hostname,
+            self.channel_prefix,
+            self.tool_group,
+            self.tool_metadata,
+            self.tool_trigger,
+            self.tools,
+        ) = ret_val
         self.optional_md = optional_md
         self.logger = logger
         # Initialize internal state
@@ -615,6 +820,8 @@ def __init__(
         self._lock = Lock()
         self._cv = Condition(lock=self._lock)
         self.web_server_thread = None
+        self._tm_log_capture_thread_cv = Condition(lock=self._lock)
+        self._tm_log_capture_thread_state = None
        self.tm_log_capture_thread = None
 
     def __enter__(self):
@@ -630,13 +837,19 @@ def __enter__(self):
             callback=self.put_document,
         )
         self._server = DataSinkWsgiServer(
-            host=self.bind_hostname, port=tds_port, logger=self.logger
+            host=self.bind_hostname, port=def_tds_port, logger=self.logger
         )
         self.web_server_thread = Thread(target=self.web_server_run)
         self.web_server_thread.start()
-        # FIXME - ugly hack for consistent unit tests; why not just use a
-        # condition variable?
-        time.sleep(0.1)
+        err_text, err_code = self._server.wait()
+        if err_code > 0:
+            # Pass along the OSError with its errno; this lets us handle
+            # EADDRINUSE errors cleanly.
+            raise OSError(err_code, err_text)
+        elif err_code < 0:
+            # All other errors encountered by the WSGI thread are already
+            # logged.
+            raise ToolDataSinkError(f"Failure to create WSGI server - {err_text!r}")
         self.logger.debug("web server 'run' thread started, processing payloads ...")
 
         # Setup the two Redis channels to which the Tool Data Sink subscribes.
@@ -654,10 +867,18 @@ def __enter__(self):
 
         self.tm_log_capture_thread = Thread(target=self.tm_log_capture)
         self.tm_log_capture_thread.start()
-        # FIXME - ugly hack for consistent unit tests; why not just use a
-        # condition variable?
- time.sleep(0.1) - self.logger.debug("'tm_log_capture' thread started, processing logs ...") + with self._lock: + while self._tm_log_capture_thread_state is None: + self._tm_log_capture_thread_cv.wait() + if self._tm_log_capture_thread_state != "started": + self.logger.warning( + "'tm_log_capture' thread failed to start, not processing Tool" + " Meister logs ..." + ) + else: + self.logger.debug( + "'tm_log_capture' thread started, processing Tool Meister logs ..." + ) # The ToolDataSink object itself is the object of the context manager. return self @@ -709,20 +930,18 @@ def tm_log_capture(self): # logs from remote Tool Meisters. logger = logging.getLogger("tm_log_capture_thread") logger.setLevel(logging.WARNING) - tm_log_file = self.benchmark_run_dir / "tm" / "tm.logs" + tm_log_file = self.benchmark_run_dir.local / "tm" / "tm.logs" with tm_log_file.open("w") as fp: try: + with self._lock: + self._tm_log_capture_thread_state = "started" + self._tm_log_capture_thread_cv.notify() for log_msg in self._to_logging_chan.fetch_message(logger): fp.write(f"{log_msg}\n") fp.flush() except redis.ConnectionError: # We don't bother reporting any connection errors. pass - except ValueError as exc: - # FIXME - Why do we need to do this? - if exc.args[0] == "I/O operation on closed file.": - pass - raise except Exception: self.logger.exception("Failed to capture logs from Redis server") @@ -796,9 +1015,9 @@ def record_tms(self, tms): """record_tms - record the Tool Meister data and metadata returned from the startup acknowledgement messages collected in "tms". - The first thing we have to do is setup self._tm_tracking properly, - adding which tools are no-ops, transient, and persistent, and properly - record the initial "posted" state. + The first thing we have to do is to determine which tools are no-ops, + transient, and persistent, and properly record the initial "posted" + state. The second thing we do is record all the data and metadata about the Tool Meisters in the ${benchmark_run_dir}/metadata.log file. @@ -855,7 +1074,7 @@ def record_tms(self, tms): home = os.environ.get("HOME", "") if home: src = str(Path(home) / ".ssh" / "config") - dst = str(self.benchmark_run_dir / "ssh.config") + dst = str(self.benchmark_run_dir.local / "ssh.config") try: shutil.copyfile(src, dst) except FileNotFoundError: @@ -865,7 +1084,7 @@ def record_tms(self, tms): # cp -L /etc/ssh/ssh_config ${dir}/ > /dev/null 2>&1 etc_ssh = Path("/etc") / "ssh" src = str(etc_ssh / "ssh_config") - dst = str(self.benchmark_run_dir / "ssh_config") + dst = str(self.benchmark_run_dir.local / "ssh_config") try: shutil.copyfile(src, dst) except FileNotFoundError: @@ -898,13 +1117,18 @@ def record_tms(self, tms): # # cp -rL /etc/ssh/ssh_config.d ${dir}/ > /dev/null 2>&1 subprocess.run( - [cp_path, "-rL", "/etc/ssh/ssh_config.d", f"{self.benchmark_run_dir}/"], + [ + self.cp_path, + "-rL", + "/etc/ssh/ssh_config.d", + f"{self.benchmark_run_dir.local}/", + ], stdin=subprocess.DEVNULL, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, ) - mdlog_name = self.benchmark_run_dir / "metadata.log" + mdlog_name = self.benchmark_run_dir.local / "metadata.log" mdlog = ConfigParser() try: with mdlog_name.open("r") as fp: @@ -923,7 +1147,7 @@ def record_tms(self, tms): # Users have a funny way of adding '%' characters to the run # directory, so we have to be sure we handle "%" characters in the # directory name metadata properly. 
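     # (ConfigParser treats "%" as interpolation syntax, so a literal "%"
     # must be stored as "%%" to round-trip through metadata.log.)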
- mdlog.set(section, "name", self.benchmark_run_dir.name.replace("%", "%%")) + mdlog.set(section, "name", self.benchmark_run_dir.local.name.replace("%", "%%")) version, seqno, sha1, hostdata = collect_local_info(self.pbench_bin) rpm_version = f"v{version}-{seqno}g{sha1}" mdlog.set(section, "rpm-version", rpm_version) @@ -1050,7 +1274,9 @@ def execute(self): self._to_client_channel, json.dumps(started_msg, sort_keys=True) ) if num_present == 0: - raise Exception("Tool Data Sink started by nobody is listening") + raise ToolDataSinkError( + "Tool Data Sink started, but nobody is listening" + ) self.logger.debug("published %s", self._to_client_channel) for data in self._from_client_chan.fetch_json(self.logger): @@ -1214,7 +1440,7 @@ def execute_action(self, action, directory_str, args, data): # the caller wants to report that it is stopping all the Tool # Meisters due to an interruption (SIGINT or otherwise). # - mdlog_name = self.benchmark_run_dir / "metadata.log" + mdlog_name = self.benchmark_run_dir.local / "metadata.log" mdlog = ConfigParser() try: with (mdlog_name).open("r") as fp: @@ -1233,7 +1459,7 @@ def execute_action(self, action, directory_str, args, data): if args["interrupt"]: # args["interrupt"] == True ==> run / run_interrupted mdlog.set(section, "run_interrupted", "true") - iterations = self.benchmark_run_dir / ".iterations" + iterations = self.benchmark_run_dir.local / ".iterations" try: iterations_val = iterations.read_text() except FileNotFoundError: @@ -1258,26 +1484,27 @@ def execute_action(self, action, directory_str, args, data): self._to_logging_chan.unsubscribe() return - directory = Path(directory_str) - if not directory.is_dir(): + try: + local_dir = self.benchmark_run_dir.validate(directory_str) + except self.benchmark_run_dir.Prefix: self.logger.error( - "action '%s' with non-existent directory, '%s'", action, directory, + "action '%s' with invalid directory, '%s' (not a sub-directory of '%s')", + action, + directory_str, + self.benchmark_run_dir, ) - self._send_client_status(action, "invalid directory") + self._send_client_status(action, "directory not a sub-dir of run directory") return - try: - # Check that "directory" has a prefix of self.benchmark_run_dir - directory.relative_to(self.benchmark_run_dir) - except ValueError: + except self.benchmark_run_dir.Exists: self.logger.error( - "action '%s' with invalid directory," - " '%s' (not a sub-directory of '%s')", + "action '%s' with invalid directory, '%s' (does not exist)", action, - directory, - self.benchmark_run_dir, + directory_str, ) - self._send_client_status(action, "directory not a prefix of run directory") + self._send_client_status(action, "directory does not exist") return + else: + assert local_dir is not None, f"Logic bomb! local_dir = {local_dir!r}" with self._lock: # Handle all actions underneath the lock for consistency. @@ -1326,6 +1553,7 @@ def execute_action(self, action, directory_str, args, data): self.tool_group, prom_tool_dict, self.tool_metadata, + self.tar_path, logger=self.logger, ) self._prom_server.launch() @@ -1340,6 +1568,7 @@ def execute_action(self, action, directory_str, args, data): self.tool_group, pcp_tool_dict, self.tool_metadata, + self.tar_path, redis_host=self.redis_host, redis_port=self.redis_port, logger=self.logger, @@ -1364,7 +1593,7 @@ def execute_action(self, action, directory_str, args, data): # the URL for the PUT method. 
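         # (For example, hashlib.md5(b"/run/dir/tools-default").hexdigest()
         # yields a stable 32-character hex token; the path shown here is
         # hypothetical.)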
         directory_bytes = directory_str.encode("utf-8")
         self.data_ctx = hashlib.md5(directory_bytes).hexdigest()
-        self.directory = Path(directory_str)
+        self.directory = local_dir
 
         # Forward to TMs
         ret_val = self._forward_tms(data)
@@ -1567,7 +1796,7 @@ def put_document(self, data_ctx, hostname):
         # Invoke tar directly for efficiency.
         with o_file.open("w") as ofp, e_file.open("w") as efp:
             cp = subprocess.run(
-                [tar_path, "-xf", host_data_tb_name],
+                [self.tar_path, "-xf", host_data_tb_name],
                 cwd=target_dir,
                 stdin=None,
                 stdout=ofp,
@@ -1609,71 +1838,224 @@ def put_document(self, data_ctx, hostname):
         abort(500, "INTERNAL ERROR")
 
 
-def main(argv):
-    _prog = Path(argv[0])
-    PROG = _prog.name
-    pbench_bin = _prog.parent.parent
+def get_logger(PROG, daemon=False):
+    """get_logger - construct a logger for the Tool Data Sink.
+    If in the unit test environment, just log to the console.
+    If in the non-unit test environment:
+        If daemonized, log to a "{PROG}.log" file.
+        If not daemonized, log to the console.
+    """
     logger = logging.getLogger(PROG)
-    fh = logging.FileHandler(f"{PROG}.log")
-    if os.environ.get("_PBENCH_UNIT_TESTS"):
-        fmtstr = "%(levelname)s %(name)s %(funcName)s -- %(message)s"
-    else:
-        fmtstr = (
-            "%(asctime)s %(levelname)s %(process)s %(thread)s"
-            " %(name)s %(funcName)s %(lineno)d -- %(message)s"
-        )
-    fhf = logging.Formatter(fmtstr)
-    fh.setFormatter(fhf)
     if os.environ.get("_PBENCH_TOOL_DATA_SINK_LOG_LEVEL") == "debug":
         log_level = logging.DEBUG
     else:
         log_level = logging.INFO
-    fh.setLevel(log_level)
-    logger.addHandler(fh)
     logger.setLevel(log_level)
+    unit_tests = bool(os.environ.get("_PBENCH_UNIT_TESTS"))
+    if unit_tests or not daemon:
+        sh = logging.StreamHandler()
+    else:
+        sh = logging.FileHandler(f"{PROG}.log")
+    sh.setLevel(log_level)
+    shf = logging.Formatter(fmtstr_ut if unit_tests else fmtstr)
+    sh.setFormatter(shf)
+    logger.addHandler(sh)
+
+    return logger
+
+
+def driver(
+    PROG,
+    redis_server,
+    redis_host,
+    redis_port,
+    pbench_bin,
+    pbench_run,
+    hostname,
+    tar_path,
+    cp_path,
+    param_key,
+    params,
+    optional_md,
+    logger=None,
+):
+    if logger is None:
+        logger = get_logger(PROG)
+
+    logger.debug("params_key (%s): %r", param_key, params)
+
+    try:
+        with ToolDataSink(
+            pbench_bin,
+            pbench_run,
+            hostname,
+            tar_path,
+            cp_path,
+            redis_server,
+            redis_host,
+            redis_port,
+            params,
+            optional_md,
+            logger,
+        ) as tds_app:
+            tds_app.execute()
+    except OSError as exc:
+        if exc.errno == errno.EADDRINUSE:
+            logger.error(
+                "ERROR - tool data sink failed to start, %s:%s already in use",
+                params["bind_hostname"],
+                def_tds_port,
+            )
+            ret_val = 8
+        else:
+            logger.exception("ERROR - failed to start the tool data sink")
+            ret_val = 9
+    except Exception:
+        logger.exception("ERROR - failed to start the tool data sink")
+        ret_val = 10
+    else:
+        ret_val = 0
+    return ret_val
+
+
+def daemon(
+    PROG,
+    redis_server,
+    redis_host,
+    redis_port,
+    pbench_bin,
+    pbench_run,
+    hostname,
+    tar_path,
+    cp_path,
+    param_key,
+    params,
+    optional_md,
+):
+    # Disconnect any existing connections to the Redis server.
+    redis_server.connection_pool.disconnect()
+    del redis_server
+
+    # Before we daemonize, flush any data written to stdout or stderr.
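+    # (Otherwise buffered output could be lost, or emitted twice, after
+    # DaemonContext forks the process.)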
+ sys.stderr.flush() + sys.stdout.flush() + + pidfile_name = f"{PROG}.pid" + pfctx = pidfile.PIDFile(pidfile_name) + with open(f"{PROG}.out", "w") as sofp, open( + f"{PROG}.err", "w" + ) as sefp, DaemonContext( + stdout=sofp, + stderr=sefp, + working_directory=os.getcwd(), + umask=0o022, + pidfile=pfctx, + ): + logger = get_logger(PROG, daemon=True) + + # We have to re-open the connection to the redis server now that we + # are "daemonized". + logger.debug("re-constructing Redis server object") + try: + redis_server = redis.Redis(host=redis_host, port=redis_port, db=0) + except Exception as e: + logger.error( + "Unable to construct Redis server object, %s:%s: %s", + redis_host, + redis_port, + e, + ) + return 7 + else: + logger.debug("reconstructed Redis server object") + return driver( + PROG, + redis_server, + redis_host, + redis_port, + pbench_bin, + pbench_run, + hostname, + tar_path, + cp_path, + param_key, + params, + optional_md, + logger=logger, + ) + + +def main(argv): + _prog = Path(argv[0]) + PROG = _prog.name + # The Tool Data Sink executable is in: + # ${pbench_bin}/util-scripts/tool-meister/pbench-tool-data-sink + # So .parent at each level is: + # _prog ${pbench_bin}/util-scripts/tool-meister/pbench-tool-data-sink + # .parent ${pbench_bin}/util-scripts/tool-meister + # .parent ${pbench_bin}/util-scripts + # .parent ${pbench_bin} + pbench_bin = _prog.parent.parent.parent + try: redis_host = argv[1] redis_port = argv[2] param_key = argv[3] except IndexError as e: - logger.error("Invalid arguments: %s", e) + print(f"{PROG}: Invalid arguments: {e}", file=sys.stderr) return 1 + else: + if not redis_host or not redis_port or not param_key: + print(f"{PROG}: Invalid arguments: {argv!r}", file=sys.stderr) + return 1 + try: + daemonize = argv[4] + except IndexError: + daemonize = "no" - global tar_path tar_path = find_executable("tar") if tar_path is None: - logger.error("External 'tar' executable not found") + print("External 'tar' executable not found", file=sys.stderr) return 2 - global cp_path cp_path = find_executable("cp") if cp_path is None: - logger.error("External 'cp' executable not found") + print("External 'cp' executable not found", file=sys.stderr) return 2 + try: + pbench_run = os.environ["pbench_run"] + except KeyError: + print( + "Unable to fetch pbench_run environment variable", file=sys.stderr, + ) + return 3 + try: redis_server = redis.Redis(host=redis_host, port=redis_port, db=0) except Exception as e: - logger.error( - "Unable to connect to redis server, %s:%s: %s", redis_host, redis_port, e + print( + f"Unable to connect to redis server, {redis_host}:{redis_port}: {e}", + file=sys.stderr, ) return 4 try: hostname = os.environ["_pbench_full_hostname"] except KeyError: - logger.error("Unable to fetch _pbench_full_hostname environment variable") - return 4 + print( + "Unable to fetch _pbench_full_hostname environment variable", + file=sys.stderr, + ) + return 5 try: - params_raw = redis_server.get(param_key) - if params_raw is None: - logger.error('Parameter key, "%s" does not exist.', param_key) - return 5 - logger.debug("params_key (%s): %r", param_key, params_raw) - params_str = params_raw.decode("utf-8") + # Wait for the parameter key value to show up. + params_str = wait_for_conn_and_key( + redis_server, param_key, PROG, redis_host, redis_port + ) # The expected parameters for this "data-sink" is what "channel" to # subscribe to for the tool meister operational life-cycle. 
The # data-sink listens for the actions, sysinfo | init | start | stop | @@ -1683,89 +2065,29 @@ def main(argv): # E.g. params = '{ "channel_prefix": "some-prefix", # "benchmark_run_dir": "/loo/goo" }' params = json.loads(params_str) - channel_prefix = params["channel_prefix"] - benchmark_run_dir = Path(params["benchmark_run_dir"]).resolve(strict=True) - bind_hostname = params["bind_hostname"] - tool_group = params["group"] - tool_trigger = params["tool_trigger"] - tools = params["tools"] - tool_metadata = ToolMetadata.tool_md_from_dict(params["tool_metadata"]) + ToolDataSink.fetch_params(params, pbench_run) except Exception as ex: - logger.error("Unable to fetch and decode parameter key, %s: %s", param_key, ex) + print( + f"Unable to fetch and decode parameter key, {param_key}: {ex}", + file=sys.stderr, + ) return 6 - else: - if not benchmark_run_dir.is_dir(): - logger.error( - "Run directory argument, %s, must be a real directory.", - benchmark_run_dir, - ) - return 7 - logger.debug("Tool Data Sink parameters check out, daemonizing ...") - redis_server.connection_pool.disconnect() - redis_server = None optional_md = params["optional_md"] - # Before we daemonize, flush any data written to stdout or stderr. - sys.stderr.flush() - sys.stdout.flush() - - pidfile_name = f"{PROG}.pid" - pfctx = pidfile.PIDFile(pidfile_name) - with open(f"{PROG}.out", "w") as sofp, open( - f"{PROG}.err", "w" - ) as sefp, daemon.DaemonContext( - stdout=sofp, - stderr=sefp, - working_directory=os.getcwd(), - umask=0o022, - pidfile=pfctx, - files_preserve=[fh.stream.fileno()], - ): - try: - # We have to re-open the connection to the redis server now that we - # are "daemonized". - logger.debug("constructing Redis() object") - try: - redis_server = redis.Redis(host=redis_host, port=redis_port, db=0) - except Exception as e: - logger.error( - "Unable to connect to redis server, %s:%s: %s", - redis_host, - redis_port, - e, - ) - return 8 - else: - logger.debug("constructed Redis() object") - - with ToolDataSink( - pbench_bin, - hostname, - bind_hostname, - redis_server, - redis_host, - redis_port, - channel_prefix, - benchmark_run_dir, - tool_group, - tool_trigger, - tools, - tool_metadata, - optional_md, - logger, - ) as tds_app: - tds_app.execute() - except OSError as exc: - if exc.errno == errno.EADDRINUSE: - logger.error( - "ERROR - tool data sink failed to start, %s:%s already in use", - bind_hostname, - tds_port, - ) - else: - logger.exception("ERROR - failed to start the tool data sink") - except Exception: - logger.exception("ERROR - failed to start the tool data sink") - - return 0 + func = daemon if daemonize == "yes" else driver + ret_val = func( + PROG, + redis_server, + redis_host, + redis_port, + pbench_bin, + pbench_run, + hostname, + tar_path, + cp_path, + param_key, + params, + optional_md, + ) + return ret_val diff --git a/lib/pbench/agent/tool_group.py b/lib/pbench/agent/tool_group.py new file mode 100644 index 0000000000..4d2c5d7057 --- /dev/null +++ b/lib/pbench/agent/tool_group.py @@ -0,0 +1,157 @@ +import os +import re + +from pathlib import Path + + +class BadToolGroup(Exception): + """Exception representing a tool group that does not exist or is invalid. + """ + + pass + + +class ToolGroup: + """Provides an in-memory representation of the registered tools as recorded + on-disk. + """ + + # Current tool group prefix in use. 
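+    # (Registered tool groups live on disk as ${pbench_run}/tools-v1-<group>;
+    # see verify_tool_group() below.)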
+    TOOL_GROUP_PREFIX = "tools-v1"
+
+    @staticmethod
+    def verify_tool_group(group, pbench_run=None):
+        """verify_tool_group - given a tool group name, verify it exists in the
+        ${pbench_run} directory as a properly prefixed tool group directory
+        name.
+
+        Raises a BadToolGroup exception if the directory is invalid or does not
+        exist, or if the pbench_run argument is None and the environment
+        variable of the same name is missing.
+
+        Returns a Pathlib object of the tool group directory on success.
+        """
+        _pbench_run = os.environ.get("pbench_run") if pbench_run is None else pbench_run
+        if not _pbench_run:
+            raise BadToolGroup(
+                f"Cannot validate tool group, '{group}', 'pbench_run'"
+                " environment variable missing"
+            )
+
+        tg_dir_name = Path(_pbench_run, f"{ToolGroup.TOOL_GROUP_PREFIX}-{group}")
+        try:
+            tg_dir = tg_dir_name.resolve(strict=True)
+        except FileNotFoundError:
+            raise BadToolGroup(
+                f"Bad tool group, '{group}': directory {tg_dir_name} does not exist"
+            )
+        except Exception as exc:
+            raise BadToolGroup(
+                f"Bad tool group, '{group}': error resolving {tg_dir_name} directory"
+            ) from exc
+        else:
+            if not tg_dir.is_dir():
+                raise BadToolGroup(
+                    f"Bad tool group, '{group}': directory {tg_dir_name} not valid"
+                )
+            else:
+                return tg_dir
+
+    def __init__(self, group):
+        """Construct a ToolGroup object from the on-disk data of the given
+        tool group.
+
+        If the given tool group is valid, the contents are read into the three
+        dictionary structures:
+
+        "toolnames" - each tool name is the key, with separate dictionaries
+        for each registered host
+
+        "hostnames" - each registered host is the key, with separate
+        dictionaries for each tool registered on that host
+
+        "labels" - each registered host name, that has a label, is the key,
+        and the label is the value; if a host is not labeled, it does not
+        show up in this dictionary
+
+        Raises BadToolGroup via the verify_tool_group() method on error.
+        """
+        self.tg_dir = self.verify_tool_group(group)
+        self.group = group
+
+        # __trigger__
+        try:
+            _trigger = (self.tg_dir / "__trigger__").read_text()
+        except FileNotFoundError:
+            # Ignore missing trigger file
+            self.trigger = None
+        else:
+            if len(_trigger) == 0:
+                # Ignore empty trigger file contents
+                self.trigger = None
+            else:
+                self.trigger = _trigger
+
+        # toolnames - Dict with tool name as the key, dictionary with host
+        # names and parameters for each host
+        self.toolnames = {}
+        # hostnames - Dict with host name as the key, dictionary with tool
+        # names and parameters for each tool
+        self.hostnames = {}
+        self.labels = {}
+        for hdirent in os.listdir(self.tg_dir):
+            if hdirent == "__trigger__":
+                # Ignore handled above
+                continue
+            if not (self.tg_dir / hdirent).is_dir():
+                # Ignore wayward non-directory files
+                continue
+            # We assume this directory is a hostname.
+            host = hdirent
+            assert (
+                host not in self.hostnames
+            ), f"Logic bomb! {host} in {self.hostnames!r}"
+            self.hostnames[host] = {}
+            for tdirent in os.listdir(self.tg_dir / host):
+                if tdirent == "__label__":
+                    self.labels[host] = (
+                        (self.tg_dir / host / tdirent).read_text().strip()
+                    )
+                    continue
+                if tdirent.endswith("__noinstall__"):
+                    # FIXME: ignore "noinstall" for now, tools are going to be
+                    # in containers so this does not make sense going forward.
+                    continue
+                # This directory entry is the name of a tool.
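+                # (e.g. ${pbench_run}/tools-v1-default/host1/mpstat holds the
+                # registered options for tool "mpstat" on host "host1"; the
+                # names are illustrative.)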
+ tool = tdirent + tool_opts_text = (self.tg_dir / host / tool).read_text().strip() + tool_opts = re.sub(r"\n\s*", " ", tool_opts_text) + if tool not in self.toolnames: + self.toolnames[tool] = {} + self.toolnames[tool][host] = tool_opts + assert ( + tool not in self.hostnames[host] + ), f"Logic bomb! {tool} in {self.hostnames[host]!r}" + self.hostnames[host][tool] = tool_opts + + def get_tools(self, host): + """get_tools - given a target host, return a dictionary with the list + of tool names as keys, and the values being their options for that + host. + """ + tools = dict() + for tool, opts in self.toolnames.items(): + try: + host_opts = opts[host] + except KeyError: + # This host does not have this tool registered, ignore. + pass + else: + tools[tool] = host_opts + return tools + + def get_label(self, host): + """get_label - given a target host, return the label associated with + that host. + """ + return self.labels.get(host, "") diff --git a/lib/pbench/agent/tool_meister.py b/lib/pbench/agent/tool_meister.py index 97e39c9320..19034f552b 100644 --- a/lib/pbench/agent/tool_meister.py +++ b/lib/pbench/agent/tool_meister.py @@ -65,7 +65,11 @@ tm_channel_suffix_to_logging, TDS_RETRY_PERIOD_SECS, ) -from pbench.agent.redis import RedisHandler, RedisChannelSubscriber +from pbench.agent.redis import ( + RedisHandler, + RedisChannelSubscriber, + wait_for_conn_and_key, +) from pbench.agent.toolmetadata import ToolMetadata from pbench.agent.utils import collect_local_info @@ -396,66 +400,20 @@ class DcgmTool(PersistentTool): """DcgmTool - provide specific persistent tool behaviors for the "dcgm" tool. - In particular, the dcgm tool requires the "--inst" option, requires the - PYTHONPATH environment variable be set properly, and must use a python2 - environment. + The only particular behavior is that we find the proper "dcgm-exporter" + executable in our PATH. """ def __init__(self, name, tool_opts, logger=None, **kwargs): super().__init__(name, tool_opts, logger=logger, **kwargs) - # Looking for required "--inst" option, reformatting appropriately if - # found. 
- tool_opts_l = self.tool_opts.split(" ") - for opt in tool_opts_l: - if opt.startswith("--inst="): - if opt[-1] == "\n": - install_path = opt[7:-1] - else: - install_path = opt[7:] - self.install_path = Path(install_path) - self.logger.debug( - "install path for tool %s, %s", name, self.install_path - ) - break - else: - self.install_path = None - self.logger.debug("missing install path") - if self.install_path is None: - self.script_path = None - self.args = None - self.env = None - else: - self.script_path = ( - self.install_path / "samples" / "scripts" / "dcgm_prometheus.py" - ) - if not self.script_path.exists(): - self.logger.error("missing script path, %s", self.script_path) - self.args = None - self.env = None - else: - self.args = ["python2", f"{self.script_path}"] - new_path_l = [ - str(self.install_path / "bindings"), - str(self.install_path / "bindings" / "common"), - ] - unit_tests = bool(os.environ.get("_PBENCH_UNIT_TESTS")) - prev_path = os.environ.get("PYTHONPATH", "") - if prev_path and not unit_tests: - new_path_l.append(prev_path) - self.env = os.environ.copy() - self.env["PYTHONPATH"] = ":".join(new_path_l) + executable = find_executable("dcgm-exporter") + self.args = None if executable is None else [executable] def install(self): - if self.install_path is None: - return (1, "dcgm tool --inst argument missing") - elif self.args is None: - return (1, f"dcgm tool path, '{self.script_path}', not found") + if self.args is None: + return (1, "dcgm-exporter tool not found") return (0, "dcgm tool properly installed") - def start(self): - # The dcgm tool needs PYTHONPATH, and run via the shell. - super().start(env=self.env) - class NodeExporterTool(PersistentTool): """NodeExporterTool - provide specifics for running the "node-exporter" @@ -468,10 +426,7 @@ class NodeExporterTool(PersistentTool): def __init__(self, name, tool_opts, logger=None, **kwargs): super().__init__(name, tool_opts, logger=logger, **kwargs) executable = find_executable("node_exporter") - if executable is None: - self.args = None - else: - self.args = [executable] + self.args = None if executable is None else [executable] def install(self): if self.args is None: @@ -779,7 +734,7 @@ def __enter__(self): num_present = 0 if num_present == 0 and time.time() >= timeout: raise Exception( - "Unable to publish startup ack message, {started_msg!r}" + f"Unable to publish startup ack message, {started_msg!r}" ) self.logger.debug("published %s", self._from_tms_channel) return self @@ -999,13 +954,17 @@ def start_tools(self, data): # Name of the temporary tool data directory to use when invoking # tools. This is a local temporary directory when the Tool Meister is - # remote from the pbench controller. - if self._controller == self._hostname: + # remote from the pbench controller. When the Tool Meister is run in + # a container the "directory" parameter will not map into its + # namespace, so we always consider containerized Tool Meisters as + # remote. + _dir = Path(data["directory"]) + if self._controller == self._hostname and _dir.exists(): # This is the case when the Tool Meister instance is running on # the same host as the controller. We just use the directory # given to us in the `start` message. 
try: - _dir = Path(data["directory"]).resolve(strict=True) + _dir = _dir.resolve(strict=True) except Exception: self.logger.exception( "Failed to access provided result directory, %s", data["directory"] @@ -1705,7 +1664,7 @@ def daemon( redis_server = redis.Redis(host=redis_host, port=redis_port, db=0) except Exception as exc: logger.error( - "Unable to construct to Redis server object, %s:%s: %s", + "Unable to construct Redis server object, %s:%s: %s", redis_host, redis_port, exc, @@ -1755,10 +1714,17 @@ def main(argv): except IndexError as e: print(f"{PROG}: Invalid arguments: {e}", file=sys.stderr) return 1 + else: + if not redis_host or not redis_port or not param_key: + print(f"{PROG}: Invalid arguments: {argv!r}", file=sys.stderr) + return 1 try: daemonize = argv[4] except IndexError: daemonize = "no" + else: + if not daemonize: + daemonize = "no" tar_path = find_executable("tar") if tar_path is None: @@ -1824,13 +1790,10 @@ def main(argv): return 5 try: - params_raw = redis_server.get(param_key) - if params_raw is None: - print( - f'{PROG}: Parameter key, "{param_key}" does not exist.', file=sys.stderr - ) - return 6 - params_str = params_raw.decode("utf-8") + # Wait for the key to show up with a value. + params_str = wait_for_conn_and_key( + redis_server, param_key, PROG, redis_host, redis_port + ) params = json.loads(params_str) # Validate the tool meister parameters without constructing an object # just yet, as we want to make sure we can talk to the redis server @@ -1841,30 +1804,20 @@ def main(argv): f"{PROG}: Unable to fetch and decode parameter key, '{param_key}': {exc}", file=sys.stderr, ) - return 7 + return 6 + func_args = ( + PROG, + tar_path, + sysinfo_dump, + pbench_install_dir, + tmp_dir, + param_key, + params, + redis_server, + ) if daemonize == "yes": - ret_val = daemon( - PROG, - tar_path, - sysinfo_dump, - pbench_install_dir, - tmp_dir, - param_key, - params, - redis_server, - redis_host, - redis_port, - ) + ret_val = daemon(*func_args, redis_host, redis_port) else: - ret_val = driver( - PROG, - tar_path, - sysinfo_dump, - pbench_install_dir, - tmp_dir, - param_key, - params, - redis_server, - ) + ret_val = driver(*func_args) return ret_val diff --git a/lib/pbench/agent/utils.py b/lib/pbench/agent/utils.py index 995f196a7e..95bbbb1e37 100644 --- a/lib/pbench/agent/utils.py +++ b/lib/pbench/agent/utils.py @@ -4,7 +4,6 @@ import sys from datetime import datetime -from pathlib import Path from pbench.agent.constants import ( sysinfo_opts_available, @@ -189,40 +188,3 @@ def collect_local_info(pbench_bin): hostdata[arg] = cp.stdout.strip() if cp.stdout is not None else "" return (version, seqno, sha1, hostdata) - - -class BadToolGroup(Exception): - """Exception representing a tool group that does not exist or is invalid. - """ - - pass - - -# Current tool group prefix in use. -TOOL_GROUP_PREFIX = "tools-v1" - - -def verify_tool_group(group, pbench_run=None): - """verify_tool_group - given a tool group name, verify it exists in the - ${pbench_run} directory as a properly prefixed tool group directory name. - - Raises a BadToolGroup exception if the directory is invalid or does not - exist. - - Returns a Pathlib object of the tool group directory on success. 
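[The wait_for_conn_and_key helper used above is imported from pbench.agent.redis but its implementation is not shown in this diff; presumably it polls until the Redis connection is usable and the key has a value, replacing the old single get() attempt. A plausible shape, strictly illustrative:

    import time

    import redis

    def wait_for_key_sketch(redis_server, key, prog, host, port, timeout=60):
        """Illustrative only; not the pbench.agent.redis implementation."""
        end = time.time() + timeout
        while True:
            try:
                raw = redis_server.get(key)
            except redis.ConnectionError:
                raw = None  # Server not reachable yet; keep retrying.
            if raw is not None:
                return raw.decode("utf-8")
            if time.time() >= end:
                raise Exception(
                    f"{prog}: timed out waiting for key '{key}' on {host}:{port}"
                )
            time.sleep(1)
]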
- """ - _pbench_run = os.environ["pbench_run"] if pbench_run is None else pbench_run - tg_dir_name = Path(_pbench_run, f"{TOOL_GROUP_PREFIX}-{group}") - try: - tg_dir = tg_dir_name.resolve(strict=True) - except FileNotFoundError: - raise BadToolGroup( - f"Bad tool group, '{group}': directory {tg_dir_name} does not exist" - ) - else: - if not tg_dir.is_dir(): - raise BadToolGroup( - f"Bad tool group, '{group}': directory {tg_dir_name} not valid" - ) - else: - return tg_dir diff --git a/lib/pbench/test/unit/agent/test_tool_data_sink.py b/lib/pbench/test/unit/agent/test_tool_data_sink.py new file mode 100644 index 0000000000..50b6527b76 --- /dev/null +++ b/lib/pbench/test/unit/agent/test_tool_data_sink.py @@ -0,0 +1,480 @@ +"""Tests for the Tool Data Sink module. +""" + +import logging +import pytest +import shutil +import time + +from http import HTTPStatus +from io import BytesIO +from pathlib import Path +from threading import Condition, Lock, Thread +from unittest.mock import patch +from wsgiref.simple_server import WSGIRequestHandler + +from pbench.agent import tool_data_sink +from pbench.agent.tool_data_sink import ( + BenchmarkRunDir, + ToolDataSinkError, + DataSinkWsgiServer, +) + + +class TestBenchmarkRunDir: + """Verify the Tool Data Sink BenchmarkRunDir class. + """ + + @pytest.fixture + def cleanup_tmp(self, pytestconfig): + TMP = Path(pytestconfig.cache.get("TMP", None)) + self.int_pb_run = TMP / "pbench-run-int" + self.ext_pb_run = TMP / "pbench-run-ext" + yield + try: + shutil.rmtree(self.int_pb_run) + except Exception as exc: + print(exc) + try: + shutil.rmtree(self.ext_pb_run) + except Exception as exc: + print(exc) + + def test_validate(self, cleanup_tmp): + """test_validate - verify the behavior of the validate() using both an + internal - external difference and when the internal and external + directories are the same. + + This implicitly tests the constructor as well. + """ + self.int_pb_run.mkdir() + ext_bm_rd = self.int_pb_run / "bm-run-dir" + ext_bm_rd.mkdir() + brd = BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run)) + assert str(ext_bm_rd) == str(brd) + + valpre = ext_bm_rd / "valid-prefix" + valpre.mkdir() + obj = brd.validate(str(valpre)) + assert str(valpre) == str(obj) + + with pytest.raises(brd.Prefix): + brd.validate("/not/a/valid-prefix") + + self.ext_pb_run.mkdir() + ext_bm_rd = self.ext_pb_run / "bm-run-dir" + ext_bm_rd.mkdir() + brd = BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run)) + + valpre = ext_bm_rd / "not-a-prefix" + with pytest.raises(brd.Exists): + brd.validate(valpre) + + def test_constructor_errors(self, cleanup_tmp): + """test_constructor_errors - verify errors are properly raised during + the execution of the constructor. + """ + self.int_pb_run.mkdir() + + ext_bm_rd = self.int_pb_run / "bm-run-dir" + ext_bm_rd.write_text("Should be a directory!") + with pytest.raises(ToolDataSinkError) as exc: + BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run)) + exp_err = f"Run directory parameter, '{ext_bm_rd}', must be a real directory." + assert exp_err == str(exc.value) + ext_bm_rd.unlink() + + # NOTE: in a container the "internal" pbench run directory must exist, + # the external pbench run directory does not exist from within the + # container. 
+        ext_bm_rd = self.ext_pb_run / "bm-run-dir"
+        int_bm_rd = self.int_pb_run / "bm-run-dir"
+        int_bm_rd.mkdir()
+        with pytest.raises(ToolDataSinkError) as exc:
+            BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run))
+        exp_err = (
+            f"Run directory parameter, '{ext_bm_rd}', must be an existing"
+            f" directory ('{self.ext_pb_run}/.path' not found, '"
+        )
+        assert str(exc.value).startswith(exp_err)
+
+        self.ext_pb_run.mkdir()
+        dot_path = self.int_pb_run / ".path"
+        dot_path_contents = f"{self.ext_pb_run}-mismatch"
+        dot_path.write_text(dot_path_contents)
+        with pytest.raises(ToolDataSinkError) as exc:
+            BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run))
+        exp_err = (
+            f"Run directory parameter, '{ext_bm_rd}', must be an existing"
+            f" directory (.path contents mismatch, .path='{dot_path_contents}'"
+            f" != '{self.ext_pb_run}')."
+        )
+        assert exp_err == str(exc.value)
+
+
+def _test_app(environ, start_response):
+    start_response(
+        "200 OK",
+        [("Content-Type", "text/plain"), ("Date", "Fri, 12 Feb 2021 23:35:42 UTC")],
+    )
+    return [b"Hello, world! 42"]
+
+
+class TestDataSinkWsgiServer:
+    """Verify the DataSinkWsgiServer wrapper class.
+    """
+
+    def test_constructor(self):
+        """test_constructor - verify the DataSinkWsgiServer constructor.
+        """
+        with pytest.raises(Exception) as exc:
+            DataSinkWsgiServer()
+        assert "DataSinkWsgiServer requires a logger" == str(exc.value)
+
+        wsgi = DataSinkWsgiServer(
+            host="host.example.com", port="42", logger="__logger__"
+        )
+        assert wsgi.options.get("handler_class", "missing") != "missing"
+        klass = wsgi.options.get("handler_class")
+        assert isinstance(klass, type(WSGIRequestHandler))
+        assert wsgi._server is None
+        assert wsgi._err_code is None
+        assert wsgi._err_text is None
+        assert isinstance(wsgi._lock, type(Lock()))
+        assert isinstance(wsgi._cv, type(Condition()))
+        assert wsgi._logger == "__logger__"
+
+    def test_log_methods(self, caplog):
+        logger = logging.getLogger("test_log_methods")
+        wsgi_server = DataSinkWsgiServer(
+            host="host.example.com", port="42", logger=logger
+        )
+        wrh = wsgi_server.options["handler_class"]
+        # This forces the base WSGI methods to not buffer writes.
+        wrh.wbufsize = 1
+
+        class MockBytesIO(BytesIO):
+            def close(self, *args, **kwargs):
+                self._saved_value = self.getvalue()
+                super().close(*args, **kwargs)
+
+        class MockSocket:
+            def getsockname(self):
+                return ("sockname",)
+
+        class MockRequest:
+            _sock = MockSocket()
+
+            def __init__(self, path):
+                self._path = path
+
+            def makefile(self, *args, **kwargs):
+                if args[0] == "rb":
+                    return MockBytesIO(b"GET %s HTTP/1.1" % self._path)
+                elif args[0] == "wb":
+                    return MockBytesIO(b"")
+                else:
+                    raise ValueError(
+                        "MockRequest: unrecognized file type", args, kwargs
+                    )
+
+        class MockServer:
+            def __init__(self):
+                self.base_environ = {}
+
+            def get_app(self):
+                return _test_app
+
+        mock_server = MockServer()
+
+        # We build all of the above mock infrastructure just to get a usable
+        # DataSinkWsgiRequestHandler() object.  The MockRequest() mimics a
+        # single request being handled, whose generated response is captured
+        # in the handler's "wfile" attribute value.  This one request will
+        # also emit one informational log.
+        handler = wrh(MockRequest(b"/"), (0, 0), mock_server)
+        assert handler.wfile._saved_value.startswith(b"HTTP/1.0 200 OK")
+        assert handler.wfile._saved_value.endswith(b"Hello, world! 42")
+        assert caplog.records[0].levelname == "INFO"
+        assert caplog.records[0].message == '0 - - "GET / HTTP/1.1" 200 16'
+
+        # Now that we have this handler object, we can directly invoke the
+        # other logging methods to verify their behavior.
+        handler.log_error("test error %d %s", 42, "43")
+        assert caplog.records[1].levelname == "ERROR"
+        assert caplog.records[1].message == "0 - - test error 42 43"
+        handler.log_message("test msg %d %s", 42, "43")
+        assert caplog.records[2].levelname == "WARNING"
+        assert caplog.records[2].message == "0 - - test msg 42 43"
+        handler.log_request(code=HTTPStatus(404), size=42)
+        assert caplog.records[3].levelname == "INFO"
+        assert caplog.records[3].message == '0 - - "GET / HTTP/1.1" 404 42'
+
+    class MockServer:
+        def __init__(self, host, port, app, *args, **kwargs):
+            self.host = host
+            self.port = port
+            self.app = app
+            self.args = args
+            self.kwargs = kwargs
+            self.serve_forever_called = False
+            self.shutdown_called = False
+            if self.host.startswith("oserror"):
+                raise OSError(42, "oserror")
+            elif self.host.startswith("exception"):
+                raise Exception("exception")
+
+        def shutdown(self):
+            self.shutdown_called = True
+
+        def serve_forever(self):
+            self.serve_forever_called = True
+
+    def test_run(self, caplog):
+        """test_run - verify the code paths of the run method directly.
+
+        NOTE: We are not using threads to do this.  Instead we are mocking
+        out the `make_server` call to create a fake server, under our
+        control, that does nothing when "serve_forever" is called.
+        """
+        logger = logging.getLogger("test_run")
+        wsgi_server = DataSinkWsgiServer(
+            host="host.example.com", port="42", logger=logger
+        )
+        mocked_servers = []
+
+        def mock_make_server(host, port, app, *args, **kwargs):
+            mocked_server = self.MockServer(host, port, app, *args, **kwargs)
+            mocked_servers.append(mocked_server)
+            return mocked_server
+
+        with patch.object(tool_data_sink, "make_server", mock_make_server):
+            # First we invoke the "run" method once to let it execute
+            # normally.
+            try:
+                wsgi_server.run(_test_app)
+            except Exception as exc:
+                pytest.fail(f"WSGI server failed with an exception, {exc}")
+            else:
+                # Retrieve the internal server object that we created, and
+                # verify that it is created as expected, and that
+                # "serve_forever" was called.
+                mock_server = mocked_servers[0]
+                assert wsgi_server._server is mock_server
+                assert wsgi_server._err_code == 0
+                assert wsgi_server._err_text is None
+                assert mock_server.host == "host.example.com"
+                assert mock_server.port == 42
+                assert mock_server.app is _test_app
+                assert mock_server.args == ()
+                klass = mock_server.kwargs.get("handler_class")
+                assert isinstance(klass, type(WSGIRequestHandler))
+                assert mock_server.serve_forever_called
+                # The success path of "run" should have emitted three debug
+                # messages.
+                assert len(caplog.records) == 3
+                assert caplog.records[0].levelname == "DEBUG"
+                assert (
+                    caplog.records[0].message == "Making tool data sink WSGI server ..."
+                )
+                assert caplog.records[1].levelname == "DEBUG"
+                assert caplog.records[1].message == "Successfully created WSGI server"
+                assert caplog.records[2].levelname == "DEBUG"
+                assert (
+                    caplog.records[2].message
+                    == "Running tool data sink WSGI server ..."
+                )
+            with pytest.raises(AssertionError) as exc:
+                # Call it again to verify the assertion fires
+                wsgi_server.run(_test_app)
+            assert "'run' method called twice" in str(exc.value), f"{exc.value}"
+            # No logs should have been emitted.
+            assert len(caplog.records) == 3
+
+    def test_stop_and_wait(self, caplog):
+        """test_stop_and_wait - verify the operation of run() in conjunction
+        with the stop() and wait() methods invoked from separate threads.
+
+        There are a number of scenarios for the order of operations between
+        threads that we need to test.  We list them here using "MainThr" as
+        the name of the "main thread" which _creates_ the WSGI thread, and
+        "WsgiThr" as the name of the created WSGI thread invoking the "run"
+        method.
+
+        References:
+            .wait() called in
+                .stop() method
+                __enter__() method
+            .stop() called in
+                __exit__() method
+
+        Scenario A:
+
+          * MainThr creates WSGI thread (WsgiThr not running)
+          * MainThr calls stop()
+          * WsgiThr starts running
+          * WsgiThr reports err_code == 0
+
+        Scenario B:
+
+          * MainThr creates WSGI thread
+          * WsgiThr starts running
+          * WsgiThr reports err_code == 0
+          * MainThr calls stop()
+
+        Scenario C:
+
+          * MainThr creates WSGI thread (WsgiThr not running)
+          * MainThr calls stop()
+          * WsgiThr starts running
+          * WsgiThr reports err_code > 0
+
+        Scenario D:
+
+          * MainThr creates WSGI thread
+          * WsgiThr starts running
+          * WsgiThr reports err_code > 0
+          * MainThr calls stop()
+
+        Scenario E:
+
+          * MainThr creates WSGI thread (WsgiThr not running)
+          * MainThr calls stop()
+          * WsgiThr starts running
+          * WsgiThr reports err_code < 0
+
+        Scenario F:
+
+          * MainThr creates WSGI thread
+          * WsgiThr starts running
+          * WsgiThr reports err_code < 0
+          * MainThr calls stop()
+        """
+
+        def wsgi_run(scenario, wsgi_server, trace):
+            ret_val = None
+            if scenario in ("A", "C", "E"):
+                time.sleep(0.1)
+            try:
+                trace.append("WsgiThr - run")
+                wsgi_server.run(_test_app)
+            except Exception as exc:
+                ret_val = exc
+            return ret_val
+
+        def do_wait(scenario, wsgi_server, trace):
+            if scenario in ("B", "D", "F"):
+                time.sleep(0.1)
+            trace.append("MainThr - wait")
+            err_text, err_code = wsgi_server.wait()
+            return err_text, err_code
+
+        def do_stop(scenario, wsgi_server, trace):
+            if scenario in ("B", "D", "F"):
+                time.sleep(0.1)
+            trace.append("MainThr - stop")
+            wsgi_server.stop()
+
+        # The host name prefix directs the MockServer class to behave by
+        # raising an OSError or Exception based on the name.
+        hostnames = dict(
+            A="host.example.com",
+            B="host.example.com",
+            C="oserror.example.com",
+            D="oserror.example.com",
+            E="exception.example.com",
+            F="exception.example.com",
+        )
+        caplog_idx = 0
+        logger = logging.getLogger("test_run")
+        for scenario in ["A", "B", "C", "D", "E", "F"]:
+            wsgi_server = DataSinkWsgiServer(
+                host=hostnames[scenario], port="42", logger=logger
+            )
+            mocked_servers = []
+
+            def mock_make_server(host, port, app, *args, **kwargs):
+                mocked_server = self.MockServer(host, port, app, *args, **kwargs)
+                mocked_servers.append(mocked_server)
+                return mocked_server
+
+            with patch.object(tool_data_sink, "make_server", mock_make_server):
+                trace = []
+                wsgithr = Thread(target=wsgi_run, args=(scenario, wsgi_server, trace))
+                wsgithr.start()
+                err_text, err_code = do_wait(scenario, wsgi_server, trace)
+                wsgithr.join()
+            assert caplog.records[caplog_idx].levelname == "DEBUG"
+            assert (
+                caplog.records[caplog_idx].message
+                == "Making tool data sink WSGI server ..."
+ ) + caplog_idx += 1 + if scenario in ("A", "B"): + mock_server = mocked_servers[0] + assert mock_server.serve_forever_called + assert not mock_server.shutdown_called + assert err_code == 0 + assert err_text is None + assert caplog.records[caplog_idx].levelname == "DEBUG" + assert ( + caplog.records[caplog_idx].message + == "Successfully created WSGI server" + ) + caplog_idx += 1 + assert caplog.records[caplog_idx].levelname == "DEBUG" + assert ( + caplog.records[caplog_idx].message + == "Running tool data sink WSGI server ..." + ) + caplog_idx += 1 + elif scenario in ("C", "D"): + assert len(mocked_servers) == 0 + assert err_code == 42 + assert err_text == "[Errno 42] oserror" + # Only 1 log message is emitted when OSErrors are encountered + else: + assert scenario in ("E", "F") + assert len(mocked_servers) == 0 + assert err_code == -1 + assert err_text == "exception" + assert caplog.records[caplog_idx].levelname == "ERROR" + assert ( + caplog.records[caplog_idx].message + == "Unexpected error in WSGI server" + ) + caplog_idx += 1 + assert len(caplog.records) == caplog_idx + + # Now we test two cases for the stop() method + for scenario in ["A", "E"]: + wsgi_server = DataSinkWsgiServer( + host=hostnames[scenario], port="42", logger=logger + ) + mocked_servers = [] + + def mock_make_server(host, port, app, *args, **kwargs): + mocked_server = self.MockServer(host, port, app, *args, **kwargs) + mocked_servers.append(mocked_server) + return mocked_server + + with patch.object(tool_data_sink, "make_server", mock_make_server): + trace = [] + wsgithr = Thread(target=wsgi_run, args=(scenario, wsgi_server, trace)) + wsgithr.start() + do_stop(scenario, wsgi_server, trace) + wsgithr.join() + assert caplog.records[caplog_idx].levelname == "DEBUG" + assert ( + caplog.records[caplog_idx].message + == "Making tool data sink WSGI server ..." + ) + caplog_idx += 1 + if scenario == "A": + mock_server = mocked_servers[0] + assert mock_server.serve_forever_called + assert mock_server.shutdown_called + caplog_idx += 2 + else: + assert scenario == "E" + assert len(mocked_servers) == 0 + caplog_idx += 1 + assert len(caplog.records) == caplog_idx
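[For anyone exercising the new tests in isolation: assuming the repository's usual source layout, the module named in the diff header can be run by itself. One way, staying in Python rather than the shell:

    import pytest

    # Run only the new Tool Data Sink tests, verbosely; the path comes from
    # the diff header above.
    pytest.main(["-v", "lib/pbench/test/unit/agent/test_tool_data_sink.py"])
]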