diff --git a/agent/config/firewalld/pbench-dcgm-exporter.xml b/agent/config/firewalld/pbench-dcgm-exporter.xml
new file mode 100644
index 0000000000..5ae4e1727b
--- /dev/null
+++ b/agent/config/firewalld/pbench-dcgm-exporter.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="utf-8"?>
+<service>
+  <short>pbench-dcgm-exporter</short>
+  <description>Pbench Agent Prometheus dcgm-exporter</description>
+  <port protocol="tcp" port="9400"/>
+</service>
+
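For context: a firewalld service definition like this one is activated by installing it where firewalld can find it and then enabling the named service. A minimal sketch, with the install path and invocation as illustrative assumptions rather than part of this change:

    # Sketch: install the service definition and open the declared port.
    install -m 0644 pbench-dcgm-exporter.xml /etc/firewalld/services/
    firewall-cmd --reload                    # pick up the new service file
    firewall-cmd --permanent --add-service=pbench-dcgm-exporter
    firewall-cmd --reload                    # apply the permanent change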
diff --git a/agent/config/firewalld/pbench-redis.xml b/agent/config/firewalld/pbench-redis.xml
index 7b3f97e0b6..22f6f0d169 100644
--- a/agent/config/firewalld/pbench-redis.xml
+++ b/agent/config/firewalld/pbench-redis.xml
@@ -1,6 +1,6 @@
-  <short>pbench-tool-data-sink</short>
-  <description>Pbench Agent Tool Data Sink</description>
-  <port protocol="tcp" port="8080"/>
+  <short>pbench-redis</short>
+  <description>Pbench Agent Redis Server</description>
+  <port protocol="tcp" port="17001"/>
diff --git a/agent/config/firewalld/pbench-tool-data-sink.xml b/agent/config/firewalld/pbench-tool-data-sink.xml
index 22f6f0d169..7b3f97e0b6 100644
--- a/agent/config/firewalld/pbench-tool-data-sink.xml
+++ b/agent/config/firewalld/pbench-tool-data-sink.xml
@@ -1,6 +1,6 @@
-  <short>pbench-redis</short>
-  <description>Pbench Agent Redis Server</description>
-  <port protocol="tcp" port="17001"/>
+  <short>pbench-tool-data-sink</short>
+  <description>Pbench Agent Tool Data Sink</description>
+  <port protocol="tcp" port="8080"/>
diff --git a/agent/containers/images/Dockerfile.base.j2 b/agent/containers/images/Dockerfile.base.j2
index 436f0bc40a..79c73cca6f 100644
--- a/agent/containers/images/Dockerfile.base.j2
+++ b/agent/containers/images/Dockerfile.base.j2
@@ -13,9 +13,9 @@ RUN \
{{ pkgmgr }} module -y disable python38 && \
{% endif %}
{% if distro_image.startswith('centos') %}
- {{ pkgmgr }} install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ distro_image.split(':', 1)[1] }}.noarch.rpm && \
+ {{ pkgmgr }} install -y --setopt=tsflags=nodocs https://dl.fedoraproject.org/pub/epel/epel-release-latest-{{ distro_image.split(':', 1)[1] }}.noarch.rpm && \
{% endif %}
- {{ pkgmgr }} install -y {% if distro_image == 'centos:8' %}--enablerepo powertools glibc-locale-source {% endif %} pbench-agent && \
+ {{ pkgmgr }} install -y --setopt=tsflags=nodocs {% if distro_image == 'centos:8' %}--enablerepo powertools glibc-locale-source {% endif %} pbench-agent && \
{% if distro_image == 'centos:8' %}
localedef -i en_US -f UTF-8 en_US.UTF-8 && \
{% endif %}
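The --setopt=tsflags=nodocs additions keep package documentation out of the image layers. A quick, hypothetical local check of the effect (the image name and tag below are assumptions):

    # Doc paths stay recorded in RPM metadata but should not exist on disk:
    podman run --rm pbench-agent-base-centos-8:latest \
        sh -c 'rpm -qd pbench-agent | xargs ls 2>&1 | tail -5'
    # Expect "No such file or directory" for the listed documentation files.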
diff --git a/agent/containers/images/Dockerfile.dcgmEX.j2 b/agent/containers/images/Dockerfile.dcgmEX.j2
new file mode 100644
index 0000000000..f307d45d3b
--- /dev/null
+++ b/agent/containers/images/Dockerfile.dcgmEX.j2
@@ -0,0 +1,15 @@
+# NOTE: Must be run with --privileged
+# RECOMMENDED: Use with the fedora image variants for direct compatibility
+FROM pbench-agent-tool-meister-{{ distro }}:{{ tag }}
+
+RUN {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y 'dnf-command(config-manager)' && \
+ {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/{{ distro.split("-")|join("") }}/x86_64/cuda-{{ distro.split("-")|join("") }}.repo && \
+ {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} clean expire-cache && \
+ {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y nvidia-driver-cuda nvidia-modprobe datacenter-gpu-manager-2.1.4 golang && \
+ git clone https://github.com/NVIDIA/gpu-monitoring-tools.git && \
+ (cd gpu-monitoring-tools; git checkout tags/2.1.2 -b build; make binary install) && \
+ {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} -y clean all && \
+ rm -rf /var/cache/{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %}
+
+ENV NVIDIA_DISABLE_REQUIRE="true" \
+ NVIDIA_VISIBLE_DEVICES=all
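Per the NOTE above, a container built from this template needs --privileged so dcgm-exporter can reach the GPUs. A minimal sketch of a local run; the image name, tag, and entrypoint override are assumptions, not part of this change:

    podman run --rm --privileged -p 9400:9400 --entrypoint dcgm-exporter \
        pbench-agent-dcgmex-fedora-33:latest
    # dcgm-exporter serves Prometheus metrics on port 9400 by default:
    curl -s http://localhost:9400/metrics | head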
diff --git a/agent/containers/images/Dockerfile.layered.j2 b/agent/containers/images/Dockerfile.layered.j2
index b7b4685548..a9d91ff4c5 100644
--- a/agent/containers/images/Dockerfile.layered.j2
+++ b/agent/containers/images/Dockerfile.layered.j2
@@ -1,10 +1,14 @@
# {{ distro }} pbench-agent {{ kind }} image
FROM pbench-agent-base-{{ distro }}:{{ tag }}
+{% if kind in ('tools', 'all') %}
+COPY ./{{ distro }}-pcp.repo /etc/yum.repos.d/pcp.repo
+{% endif %}
+
# Install all the RPMs required for this image.
#
# FIXME: this is not exhaustive, it does not include RPMs to support
# Kubernetes or RHV environments.
-RUN {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y {% if distro == 'centos-8' %}--enablerepo powertools {% endif %}{{ rpms }} && \
+RUN {% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} install -y --setopt=tsflags=nodocs {% if distro == 'centos-8' %}--enablerepo powertools {% endif %}{% if kind in ('tools', 'all') %}--enablerepo pcp {% endif %}{{ rpms }} && \
{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %} -y clean all && \
rm -rf /var/cache/{% if distro == 'centos-7' %}yum{% else %}dnf{% endif %}
diff --git a/agent/containers/images/Dockerfile.tds.j2 b/agent/containers/images/Dockerfile.tds.j2
new file mode 100644
index 0000000000..f59e620117
--- /dev/null
+++ b/agent/containers/images/Dockerfile.tds.j2
@@ -0,0 +1,9 @@
+# {{ distro }} pbench-agent-tool-data-sink image
+FROM pbench-agent-tools-{{ distro }}:{{ tag }}
+
+VOLUME /var/lib/pbench-agent
+
+# Port 8080 should be the Bottle server, 9090 the optional Prometheus server,
+# and 44566 the optional pmproxy server.
+EXPOSE 8080 9090 44566
+ENTRYPOINT [ "/opt/pbench-agent/util-scripts/tool-meister/tool-data-sink-ep" ]
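A sketch of how an image built from this template might be run, wiring up the declared volume and ports (the container name, volume name, and tag are illustrative):

    podman run -d --name pbench-tds \
        -v pbench-agent-data:/var/lib/pbench-agent \
        -p 8080:8080 -p 9090:9090 -p 44566:44566 \
        pbench-agent-tool-data-sink-fedora-33:latest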
diff --git a/agent/containers/images/Dockerfile.tm.j2 b/agent/containers/images/Dockerfile.tm.j2
new file mode 100644
index 0000000000..95f9fde6f4
--- /dev/null
+++ b/agent/containers/images/Dockerfile.tm.j2
@@ -0,0 +1,7 @@
+# {{ distro }} pbench-agent-tool-meister image
+FROM pbench-agent-tools-{{ distro }}:{{ tag }}
+
+# Port 9400 should be the optional dcgm tool, 9100 the optional node_exporter
+# tool, and 55677 the pcp (pmcd) tool.
+EXPOSE 9100 9400 55677
+ENTRYPOINT [ "/opt/pbench-agent/util-scripts/tool-meister/tool-meister-ep" ]
diff --git a/agent/containers/images/Makefile b/agent/containers/images/Makefile
index 0d9e79abdf..c1d053fe62 100644
--- a/agent/containers/images/Makefile
+++ b/agent/containers/images/Makefile
@@ -27,9 +27,43 @@ IMAGE_REPO = docker://quay.io/pbench
# Not intended to be overridden with an environment variable.
_REPO_TEMPLATE = ../../ansible/pbench/agent/roles/pbench_repo_install/templates/etc/yum.repos.d/pbench.repo.j2
+# NOTE: Currently we require 5.2.2 of the PCP RPMs because using the 5.2.3
+# version prevents us from integrating with Grafana, see PCP Issue #1183,
+# https://github.com/performancecopilot/pcp/issues/1183.
+# NOTE: We also have to enumerate this many RPMs because on CentOS 7 the RPM
+# dependency resolver does not properly resolve them all to the same version.
+# Once we no longer have to pin v5.2.2, we can just list 3 RPMs: pcp-zeroconf,
+# pcp-system-tools, and pcp-gui.
+_PCP_RPMS = \
+ pcp-doc-5.2.2 \
+ pcp-gui-5.2.2 \
+ pcp-pmda-dm-5.2.2 \
+ pcp-pmda-nfsclient-5.2.2 \
+ pcp-pmda-openmetrics-5.2.2 \
+ pcp-system-tools-5.2.2 \
+ pcp-zeroconf-5.2.2 \
+ python3-pcp-5.2.2
# The list of RPMs which provide the various tools we offer.
# Not intended to be overridden with an environment variable.
-_TOOL_RPMS = prometheus2 node_exporter blktrace bpftrace cpupowerutils golang kernel-tools libvirt-client nmap-ncat numactl pbench-sysstat pcp-system-tools perf procps-ng strace tcpdump trace-cmd
+# Please keep the lists sorted.
+_TOOL_RPMS = \
+ blktrace \
+ bpftrace \
+ cpupowerutils \
+ golang \
+ kernel-tools \
+ libvirt-client \
+ nmap-ncat \
+ node_exporter \
+ numactl \
+ pbench-sysstat \
+ ${_PCP_RPMS} \
+ perf \
+ procps-ng \
+ prometheus2 \
+ strace \
+ tcpdump \
+ trace-cmd
# The list of RPMs for the default workloads we offer.
# Not intended to be overridden with an environment variable.
@@ -41,8 +75,38 @@ _ALL_RPMS = ${_TOOL_RPMS} ${_WORKLOAD_RPMS}
# By default we only build images for the following distributions:
_DISTROS = centos-8 centos-7 fedora-33 fedora-32
+# The "all" targets build every image kind, including the Tool Data Sink and
+# Tool Meister images; the "tds" and "tm" targets build just those two kinds.
all: all-tags $(foreach distro, ${_DISTROS}, ${distro}-all-tagged)
+tds: all-tags $(foreach distro, ${_DISTROS}, ${distro}-tool-data-sink-tagged)
+
+tm: all-tags $(foreach distro, ${_DISTROS}, ${distro}-tool-meister-tagged)
+
+# We also offer targets for each distribution
+centos-8: all-tags centos-8-all-tagged
+
+centos-7: all-tags centos-7-all-tagged
+
+fedora-33: all-tags fedora-33-all-tagged
+
+fedora-32: all-tags fedora-32-all-tagged
+
+centos-8-tds: all-tags centos-8-tool-data-sink-tagged
+
+centos-7-tds: all-tags centos-7-tool-data-sink-tagged
+
+fedora-33-tds: all-tags fedora-33-tool-data-sink-tagged
+
+fedora-32-tds: all-tags fedora-32-tool-data-sink-tagged
+
+centos-8-tm: all-tags centos-8-tool-meister-tagged
+
+centos-7-tm: all-tags centos-7-tool-meister-tagged
+
+fedora-33-tm: all-tags fedora-33-tool-meister-tagged
+
+fedora-32-tm: all-tags fedora-32-tool-meister-tagged
+
#+
# Tagging targets
#-
@@ -97,16 +161,34 @@ push-major-minor: $(foreach distro, ${_DISTROS}, ${distro}-push-major-minor)
%-all-tagged: %-all %-tags.lis
./apply-tags pbench-agent-all-$* $*-tags.lis
-%-all: %-tools-tagged %-workloads-tagged %-all.Dockerfile
+%-all: %-workloads-tagged %-tool-data-sink-tagged %-tool-meister-tagged %-all.Dockerfile
./build-image all $* $*-tags.lis
%-all.Dockerfile: Dockerfile.layered.j2 %-tags.lis
jinja2 Dockerfile.layered.j2 -D distro=$* -D tag="$$(grep -v -E '^v' $*-tags.lis)" -D kind="all" -D rpms="${_ALL_RPMS}" > ./$@
+%-tool-data-sink-tagged: %-tool-data-sink %-tags.lis
+ ./apply-tags pbench-agent-tool-data-sink-$* $*-tags.lis
+
+%-tool-data-sink: %-tools-tagged %-tool-data-sink.Dockerfile
+ ./build-image tool-data-sink $* $*-tags.lis
+
+%-tool-data-sink.Dockerfile: Dockerfile.tds.j2 %-tags.lis
+ jinja2 Dockerfile.tds.j2 -D distro=$* -D tag="$$(grep -v -E '^v' $*-tags.lis)" > ./$@
+
+%-tool-meister-tagged: %-tool-meister %-tags.lis
+ ./apply-tags pbench-agent-tool-meister-$* $*-tags.lis
+
+%-tool-meister: %-tools-tagged %-tool-meister.Dockerfile
+ ./build-image tool-meister $* $*-tags.lis
+
+%-tool-meister.Dockerfile: Dockerfile.tm.j2 %-tags.lis
+ jinja2 Dockerfile.tm.j2 -D distro=$* -D tag="$$(grep -v -E '^v' $*-tags.lis)" > ./$@
+
%-tools-tagged: %-tools %-tags.lis
./apply-tags pbench-agent-tools-$* $*-tags.lis
-%-tools: %-base-tagged %-tools.Dockerfile
+%-tools: %-base-tagged %-tools.Dockerfile %-pcp.repo
./build-image tools $* $*-tags.lis
%-tools.Dockerfile: Dockerfile.layered.j2 %-tags.lis
@@ -204,15 +286,23 @@ fedora-32-base.Dockerfile: Dockerfile.base.j2 fedora-32-pbench.repo
# Helper target to build each distro's ".repo" and ".Dockerfile"
all-dockerfiles: $(foreach distro, ${_DISTROS}, ${distro}-base.Dockerfile ${distro}-tools.Dockerfile ${distro}-workloads.Dockerfile ${distro}-all.Dockerfile)
-# Rule pattern dependencies on non-patterned targets have to be set up
-# separately for some reason.
-%.repo: ${_REPO_TEMPLATE}
+%-pbench.repo: %-pbench.yml ${_REPO_TEMPLATE}
+ jinja2 ${_REPO_TEMPLATE} $*-pbench.yml -o $@
+
+%-pbench.yml: repo.yml.j2
+ jinja2 repo.yml.j2 -D distro=$* -D url_prefix=${URL_PREFIX} -D test_suffix=${_TEST_SUFFIX} -D user=${USER} -o $@
+
+fedora-33-pcp.repo: pcp.repo.j2
+ jinja2 pcp.repo.j2 -D target=f33 -o $@
+
+fedora-32-pcp.repo: pcp.repo.j2
+ jinja2 pcp.repo.j2 -D target=f32 -o $@
-%.repo: %.yml
- jinja2 ${_REPO_TEMPLATE} $*.yml -o $@
+centos-8-pcp.repo: pcp.repo.j2
+ jinja2 pcp.repo.j2 -D target=el8 -o $@
-%.yml: repo.yml.j2
- jinja2 repo.yml.j2 -D distro=${@:-pbench.yml=} -D url_prefix=${URL_PREFIX} -D test_suffix=${_TEST_SUFFIX} -D user=${USER} -o $@
+centos-7-pcp.repo: pcp.repo.j2
+ jinja2 pcp.repo.j2 -D target=el7 -o $@
clean:
rm -f *.Dockerfile *.repo *.yml *-tags.lis
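With these rules in place, the new image kinds can be built per kind or per distribution, for example:

    make tds            # Tool Data Sink images for all default distros
    make tm             # Tool Meister images for all default distros
    make fedora-33-tm   # just the fedora-33 Tool Meister image
    make centos-8-tds   # just the centos-8 Tool Data Sink image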
diff --git a/agent/containers/images/pcp-pmcd/Dockerfile b/agent/containers/images/pcp-pmcd/Dockerfile
deleted file mode 100644
index 78c3b177f0..0000000000
--- a/agent/containers/images/pcp-pmcd/Dockerfile
+++ /dev/null
@@ -1,16 +0,0 @@
-FROM fedora:33
-
-ENV SUMMARY="Performance Co-Pilot" \
- DESCRIPTION="Performance Co-Pilot is a system performance analysis toolkit." \
- VERSION=5
-
-RUN dnf install -y --setopt=tsflags=nodocs procps-ng gettext pcp pcp-zeroconf && \
- dnf install -y pcp-doc pcp-gui pcp-system-tools && \
- dnf clean all
-RUN systemctl enable pmcd && systemctl disable pmlogger
-
-COPY config /etc/sysconfig/pmcd
-
-EXPOSE 44321
-CMD ["/usr/sbin/init"]
-
diff --git a/agent/containers/images/pcp-pmcd/config b/agent/containers/images/pcp-pmcd/config
deleted file mode 100644
index 9506ba807c..0000000000
--- a/agent/containers/images/pcp-pmcd/config
+++ /dev/null
@@ -1,30 +0,0 @@
-# Environment variables for the pmcd daemon. Refer also to the
-# pmcd.options and pmcd.conf files for additional configuration.
-
-# Behaviour regarding listening on external-facing interfaces;
-# unset PMCD_LOCAL to allow connections from remote hosts.
-# A value of 0 permits remote connections, 1 permits local only.
-PMCD_LOCAL=0
-
-# Max length to which the queue of pending connections may grow
-# A value of 5 is the default.
-# PMCD_MAXPENDING=5
-
-# Default behaviour regarding pmcd's approach to starting PMDAs;
-# In cases where pmdaroot is available, setting this variable to
-# 1, offloads starting and stopping of agents to pmdaroot. This
-# allows pmcd to not require a restart when starting a new PMDA.
-PMCD_ROOT_AGENT=1
-
-# Default behaviour regarding pmcd's approach to re-starting any
-# unresponsive PMDAs; this should only be used with pmdaroot and
-# PMCD_ROOT_AGENT=1 as it allows pmcd to attempt to automatically
-# restart any exited PMDA that it detects (which usually requires
-# privileges not available to pmcd itself).
-PMCD_RESTART_AGENTS=1
-
-# Default timeout for waiting on pmcd to accept connections; any
-# longer than this value and the rc scripts report it as failed.
-# The value is a PCPIntro(1) interval in units of seconds and it
-# will be passed directly to the pmcd_wait(1) utility.
-# PMCD_WAIT_TIMEOUT=60
diff --git a/agent/containers/images/pcp-pmlogger/Dockerfile b/agent/containers/images/pcp-pmlogger/Dockerfile
deleted file mode 100644
index 52214587cc..0000000000
--- a/agent/containers/images/pcp-pmlogger/Dockerfile
+++ /dev/null
@@ -1,14 +0,0 @@
-FROM fedora:33
-
-ENV SUMMARY="Performance Co-Pilot" \
- DESCRIPTION="Performance Co-Pilot is a system performance analysis toolkit." \
- VERSION=5
-
-RUN dnf install -y --setopt=tsflags=nodocs procps-ng gettext pcp pcp-zeroconf && \
- dnf install -y pcp-doc pcp-gui pcp-system-tools && \
- dnf clean all && \
- rm -rf /etc/pcp/pmlogger/control.d/local
-RUN systemctl enable pmlogger && systemctl disable pmcd
-
-VOLUME ["/var/log/pcp/pmlogger"]
-CMD ["/usr/sbin/init"]
diff --git a/agent/containers/images/pcp.repo.j2 b/agent/containers/images/pcp.repo.j2
new file mode 100644
index 0000000000..02a3eeb545
--- /dev/null
+++ b/agent/containers/images/pcp.repo.j2
@@ -0,0 +1,6 @@
+[pcp]
+name=pcp
+baseurl=https://dl.bintray.com/pcp/{{ target }}
+gpgcheck=0
+repo_gpgcheck=0
+enabled=1
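The Makefile rules above render this template with the jinja2 CLI, producing one repo file per distribution; for example:

    jinja2 pcp.repo.j2 -D target=el8 -o centos-8-pcp.repo
    # -> a [pcp] repo with baseurl=https://dl.bintray.com/pcp/el8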
diff --git a/agent/containers/images/push b/agent/containers/images/push
index c31536c519..af19c6329d 100755
--- a/agent/containers/images/push
+++ b/agent/containers/images/push
@@ -27,7 +27,7 @@ function pushit {
buildah push ${1} ${image_repo}/${1}
}
-for image in base tools workloads all; do
+for image in base tools tool-meister tool-data-sink workloads all; do
pushit pbench-agent-${image}-${distro}:${githash}
pushit pbench-agent-${image}-${distro}:${ver}
if [[ ! -z "${other}" ]]; then
diff --git a/agent/containers/images/tagit b/agent/containers/images/tagit
index 28727c9afc..858c0bb6a4 100755
--- a/agent/containers/images/tagit
+++ b/agent/containers/images/tagit
@@ -16,6 +16,6 @@ function tagit {
buildah tag ${1}:${githash} ${1}:${tag}
}
-for image in base tools workloads all; do
+for image in base tools tool-meister tool-data-sink workloads all; do
tagit pbench-agent-${image}-${distro}
done
diff --git a/agent/containers/images/visualizers/combo.json b/agent/containers/images/visualizers/combo.json
index 9c9e4e4591..b08568d7a0 100644
--- a/agent/containers/images/visualizers/combo.json
+++ b/agent/containers/images/visualizers/combo.json
@@ -123,7 +123,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_gpu_temp",
+ "expr": "DCGM_FI_DEV_GPU_TEMP",
"format": "time_series",
"instant": false,
"interval": "",
@@ -227,7 +227,7 @@
"pluginVersion": "7.1.2",
"targets": [
{
- "expr": "avg(dcgm_gpu_temp)",
+ "expr": "avg(DCGM_FI_DEV_GPU_TEMP)",
"interval": "",
"legendFormat": "",
"refId": "A"
@@ -286,7 +286,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_power_usage",
+ "expr": "DCGM_FI_DEV_POWER_USAGE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
@@ -408,7 +408,7 @@
"pluginVersion": "7.1.2",
"targets": [
{
- "expr": "sum(dcgm_power_usage)",
+ "expr": "sum(DCGM_FI_DEV_POWER_USAGE)",
"instant": true,
"interval": "",
"legendFormat": "",
@@ -471,7 +471,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_sm_clock",
+ "expr": "DCGM_FI_DEV_SM_CLOCK",
"format": "time_series",
"instant": false,
"interval": "",
@@ -523,6 +523,97 @@
"alignLevel": null
}
},
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 16
+ },
+ "hiddenSeries": false,
+ "id": 4,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_MEM_CLOCK",
+ "interval": "",
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "interval": "3",
+ "title": "GPU Memory Clocks",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "hertz",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
{
"aliasColors": {},
"bars": false,
@@ -570,7 +661,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_gpu_utilization",
+ "expr": "DCGM_FI_DEV_GPU_UTIL",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
@@ -618,6 +709,97 @@
"alignLevel": null
}
},
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 24
+ },
+ "hiddenSeries": false,
+ "id": 8,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_MEM_COPY_UTIL",
+ "interval": "",
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "interval": 3,
+ "title": "GPU Mem Cpy Utilization",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
{
"aliasColors": {},
"bars": false,
@@ -664,7 +846,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_fb_used",
+ "expr": "DCGM_FI_DEV_FB_USED",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
@@ -759,7 +941,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_fb_free",
+ "expr": "DCGM_FI_DEV_FB_FREE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
diff --git a/agent/containers/images/visualizers/dcgm.json b/agent/containers/images/visualizers/dcgm.json
index 32f8b16992..2c4106c85e 100644
--- a/agent/containers/images/visualizers/dcgm.json
+++ b/agent/containers/images/visualizers/dcgm.json
@@ -119,7 +119,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_gpu_temp",
+ "expr": "DCGM_FI_DEV_GPU_TEMP",
"format": "time_series",
"instant": false,
"interval": "",
@@ -223,7 +223,7 @@
"pluginVersion": "7.1.2",
"targets": [
{
- "expr": "avg(dcgm_gpu_temp)",
+ "expr": "avg(DCGM_FI_DEV_GPU_TEMP)",
"interval": "",
"legendFormat": "",
"refId": "A"
@@ -282,7 +282,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_power_usage",
+ "expr": "DCGM_FI_DEV_POWER_USAGE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
@@ -404,7 +404,7 @@
"pluginVersion": "7.1.2",
"targets": [
{
- "expr": "sum(dcgm_power_usage)",
+ "expr": "sum(DCGM_FI_DEV_POWER_USAGE)",
"instant": true,
"interval": "",
"legendFormat": "",
@@ -467,7 +467,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_sm_clock",
+ "expr": "DCGM_FI_DEV_SM_CLOCK",
"format": "time_series",
"instant": false,
"interval": "",
@@ -519,6 +519,97 @@
"alignLevel": null
}
},
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 16
+ },
+ "hiddenSeries": false,
+ "id": 4,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_MEM_CLOCK",
+ "interval": "",
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "interval": "3",
+ "title": "GPU Memory Clocks",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "hertz",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
{
"aliasColors": {},
"bars": false,
@@ -566,7 +657,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_gpu_utilization",
+ "expr": "DCGM_FI_DEV_GPU_UTIL",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
@@ -614,6 +705,97 @@
"alignLevel": null
}
},
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 24
+ },
+ "hiddenSeries": false,
+ "id": 8,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_MEM_COPY_UTIL",
+ "interval": "",
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "interval": 3,
+ "title": "GPU Mem Cpy Utilization",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
{
"aliasColors": {},
"bars": false,
@@ -660,7 +842,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_fb_used",
+ "expr": "DCGM_FI_DEV_FB_USED",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
@@ -755,7 +937,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "dcgm_fb_free",
+ "expr": "DCGM_FI_DEV_FB_FREE",
"interval": "",
"legendFormat": "GPU {{gpu}}",
"refId": "A"
diff --git a/agent/tool-scripts/dcgm b/agent/tool-scripts/dcgm
index 835b4bf899..af21817452 100755
--- a/agent/tool-scripts/dcgm
+++ b/agent/tool-scripts/dcgm
@@ -15,10 +15,9 @@ import sys
if len(sys.argv) == 2 and sys.argv[1] == "--help":
help = """Options:
---inst= (required)
--interval=# (number of seconds between collections)
-For more information on this tool, please Nvidia's "dcgm-exporter" at:
+For more information on this tool, please see Nvidia's "dcgm-exporter" at:
\thttps://ngc.nvidia.com/catalog/containers/nvidia:k8s:dcgm-exporter
"""
print(help)
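With the --inst option removed, dcgm registers like any other persistent pbench tool; a sketch, with the group name and interval value illustrative:

    pbench-register-tool --name=dcgm --group=default -- --interval=3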
diff --git a/agent/tool-scripts/meta.json b/agent/tool-scripts/meta.json
index c69a941a75..edd8dc782e 100644
--- a/agent/tool-scripts/meta.json
+++ b/agent/tool-scripts/meta.json
@@ -42,7 +42,7 @@
"persistent":{
"node-exporter": {"collector": "prometheus", "port": "9100"},
- "dcgm": {"collector": "prometheus", "port": "8000"},
+ "dcgm": {"collector": "prometheus", "port": "9400"},
"pcp": {"collector": "pcp", "port": "44321"}
}
}
diff --git a/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt b/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt
index 335c2c4e4c..8e9a30a8a8 100644
--- a/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt
+++ b/agent/util-scripts/gold/pbench-tool-meister-start/test-54.txt
@@ -1,13 +1,18 @@
+++ Running test-54 pbench-tool-meister-start --help
usage: Usage: pbench-tool-meister-start [--sysinfo ]
- [-h] [--sysinfo SYSINFO] tool_group
+ [-h] [--sysinfo SYSINFO] [--redis-server REDIS_SERVER] tool_group
positional arguments:
- tool_group The tool group of items to be run by the Tool Meisters.
+ tool_group The tool group name of tools to be run by the Tool
+ Meisters.
optional arguments:
- -h, --help show this help message and exit
- --sysinfo SYSINFO The list of system information items to be collected.
+ -h, --help show this help message and exit
+ --sysinfo SYSINFO The list of system information items to be collected.
+ --redis-server REDIS_SERVER
+ Use an existing Redis server specified by
+ <host>:<port>; implies an existing Tool Data Sink
+ and Tool Meisters as well.
--- Finished test-54 pbench-tool-meister-start (status=0)
+++ pbench tree state
/var/tmp/pbench-test-utils/pbench
diff --git a/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt b/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt
index a2b7d7cf2b..0f987393ae 100644
--- a/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt
+++ b/agent/util-scripts/gold/pbench-tool-meister-stop/test-55.txt
@@ -1,15 +1,21 @@
+++ Running test-55 pbench-tool-meister-stop --help
usage: Usage: pbench-tool-meister-stop [--sysinfo ]
- [-h] [--sysinfo SYSINFO] [--interrupt] tool_group
+ [-h] [--sysinfo SYSINFO] [--interrupt] [--redis-server REDIS_SERVER]
+ tool_group
positional arguments:
- tool_group The tool group of items being run in the Tool Meisters.
+ tool_group The tool group name of tools being run in the Tool
+ Meisters.
optional arguments:
- -h, --help show this help message and exit
- --sysinfo SYSINFO The list of system information items to be collected.
- --interrupt Whether or not the stop operation is in response to an
- interrupt.
+ -h, --help show this help message and exit
+ --sysinfo SYSINFO The list of system information items to be collected.
+ --interrupt Whether or not the stop operation is in response to an
+ interrupt.
+ --redis-server REDIS_SERVER
+ Use an existing Redis server specified by
+ <host>:<port>; implies the use of an existing Tool
+ Data Sink and Tool Meisters as well.
--- Finished test-55 pbench-tool-meister-stop (status=0)
+++ pbench tree state
/var/tmp/pbench-test-utils/pbench
diff --git a/agent/util-scripts/gold/test-client-tool-meister/test-53.txt b/agent/util-scripts/gold/test-client-tool-meister/test-53.txt
index 9c287003d6..0f12446984 100644
--- a/agent/util-scripts/gold/test-client-tool-meister/test-53.txt
+++ b/agent/util-scripts/gold/test-client-tool-meister/test-53.txt
@@ -1,6 +1,8 @@
+++ Running test-53 test-client-tool-meister
"mpstat" tool is now registered for host "testhost.example.com" in group "default"
"dcgm" tool is now registered for host "testhost.example.com" in group "default"
+pbench-tool-data-sink: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
Collecting system information
--- Finished test-53 test-client-tool-meister (status=0)
+++ pbench tree state
@@ -55,7 +57,6 @@ Collecting system information
/var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/end/testhost.example.com/tm-sysinfo.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err
-/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log
@@ -139,17 +140,15 @@ install_check_output = mpstat: pbench-sysstat-12.0.3 is installed
--- mock-run/metadata.log file contents
+++ mock-run/tm/pbench-tool-data-sink.err file contents
+INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))...
Listening on http://localhost:8080/
Hit Ctrl-C to quit.
---- mock-run/tm/pbench-tool-data-sink.err file contents
-+++ mock-run/tm/pbench-tool-data-sink.log file contents
-INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ...
INFO pbench-tool-data-sink execute -- Tool Data Sink terminating
INFO pbench-tool-data-sink web_server_run -- Bottle web server exited
---- mock-run/tm/pbench-tool-data-sink.log file contents
+--- mock-run/tm/pbench-tool-data-sink.err file contents
+++ mock-run/tm/pbench-tool-data-sink.out file contents
--- mock-run/tm/pbench-tool-data-sink.out file contents
+++ mock-run/tm/redis.conf file contents
@@ -173,7 +172,7 @@ port 17001
+++ mock-run/tm/tm-default-testhost.example.com.err file contents
INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel
INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) default /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com
-INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two
INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two
INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process
@@ -196,7 +195,7 @@ INFO pbench-tool-meister __exit__ -- testhost.example.com: terminating
pbench-tool-meister-start - verify logging channel up
testhost.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel
testhost.example.com 0001 INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) default /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com
-testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
testhost.example.com 0003 INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two
testhost.example.com 0004 INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-default/testhost.example.com --interval=42 --options=forty-two
testhost.example.com 0005 INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process
@@ -231,10 +230,10 @@ scrape_configs:
- job_name: 'testhost.example.com_dcgm'
static_configs:
- - targets: ['testhost.example.com:8000']
+ - targets: ['testhost.example.com:9400']
--- tools-default/prometheus/prometheus.yml file contents
+++ tools-default/testhost.example.com/dcgm/dcgm.file file contents
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
--- tools-default/testhost.example.com/dcgm/dcgm.file file contents
+++ tools-default/testhost.example.com/dcgm/tm-dcgm-start.err file contents
--- tools-default/testhost.example.com/dcgm/tm-dcgm-start.err file contents
@@ -242,10 +241,10 @@ scrape_configs:
--- tools-default/testhost.example.com/dcgm/tm-dcgm-start.out file contents
+++ test-execution.log file contents
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/cp -rL /etc/ssh/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/end/testhost.example.com block,security_mitigations,sos parallel
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pidof -x mpstat
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pidof -x mpstat
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/prometheus --config.file=/var/tmp/pbench-test-utils/pbench/mock-run/tools-default/prometheus/prometheus.yml --storage.tsdb.path=/var/tmp/pbench-test-utils/pbench/mock-run/tools-default/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
--- test-execution.log file contents
diff --git a/agent/util-scripts/gold/test-client-tool-meister/test-56.txt b/agent/util-scripts/gold/test-client-tool-meister/test-56.txt
index 4decd73edd..60af8c011f 100644
--- a/agent/util-scripts/gold/test-client-tool-meister/test-56.txt
+++ b/agent/util-scripts/gold/test-client-tool-meister/test-56.txt
@@ -6,6 +6,11 @@
"node-exporter" tool is now registered for host "remote_b.example.com", with label "blue", in group "lite"
"dcgm" tool is now registered for host "remote_c.example.com", with label "red", in group "lite"
"pcp" tool is now registered for host "remote_c.example.com", with label "red", in group "lite"
+pbench-tool-data-sink: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
Collecting system information
--- Finished test-56 test-client-tool-meister (status=0)
+++ pbench tree state
@@ -142,7 +147,6 @@ Collecting system information
/var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/end/testhost.example.com/tm-sysinfo.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err
-/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log
@@ -241,7 +245,7 @@ INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating
=== /var/tmp/pbench-test-utils/pbench/tmp/tm-lite-remote_c.example.com.err:
INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel
INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com
-INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf']
INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm
INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp
@@ -256,7 +260,7 @@ INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.err:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.out:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/dcgm.file:
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.err:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.out:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/pcp/pmcd.file:
@@ -411,12 +415,9 @@ install_check_output = mpstat: pbench-sysstat-12.0.3 is installed
--- mock-run/metadata.log file contents
+++ mock-run/tm/pbench-tool-data-sink.err file contents
+
Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))...
-Listening on http://localhost:8080/
Hit Ctrl-C to quit.
-
---- mock-run/tm/pbench-tool-data-sink.err file contents
-+++ mock-run/tm/pbench-tool-data-sink.log file contents
INFO pbench-tool-data-sink execute -- Tool Data Sink terminating
INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_a.example.com HTTP/1.1" 200 0
INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_b.example.com HTTP/1.1" 200 0
@@ -431,7 +432,8 @@ INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /tool-data/9e243dae
INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ...
INFO pbench-tool-data-sink web_server_run -- Bottle web server exited
INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
---- mock-run/tm/pbench-tool-data-sink.log file contents
+Listening on http://localhost:8080/
+--- mock-run/tm/pbench-tool-data-sink.err file contents
+++ mock-run/tm/pbench-tool-data-sink.out file contents
--- mock-run/tm/pbench-tool-data-sink.out file contents
+++ mock-run/tm/redis.conf file contents
@@ -455,7 +457,7 @@ port 17001
+++ mock-run/tm/tm-lite-testhost.example.com.err file contents
INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel
INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com
-INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process
@@ -511,7 +513,7 @@ remote_b.example.com 0016 INFO pbench-tool-meister _send_directory -- remote_b.e
remote_b.example.com 0017 INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating
remote_c.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel
remote_c.example.com 0001 INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com
-remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
remote_c.example.com 0003 INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf']
remote_c.example.com 0004 INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm
remote_c.example.com 0005 INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp
@@ -522,7 +524,7 @@ remote_c.example.com 0009 INFO pbench-tool-meister _send_directory -- remote_c.e
remote_c.example.com 0010 INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating
testhost.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel
testhost.example.com 0001 INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com
-testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
testhost.example.com 0003 INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
testhost.example.com 0004 INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
testhost.example.com 0005 INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process
@@ -575,15 +577,15 @@ scrape_configs:
- job_name: 'remote_c.example.com_dcgm'
static_configs:
- - targets: ['remote_c.example.com:8000']
+ - targets: ['remote_c.example.com:9400']
- job_name: 'testhost.example.com_dcgm'
static_configs:
- - targets: ['testhost.example.com:8000']
+ - targets: ['testhost.example.com:9400']
--- tools-lite/prometheus/prometheus.yml file contents
+++ tools-lite/testhost.example.com/dcgm/dcgm.file file contents
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
--- tools-lite/testhost.example.com/dcgm/dcgm.file file contents
+++ tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents
--- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents
@@ -591,6 +593,8 @@ scrape_configs:
--- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.out file contents
+++ test-execution.log file contents
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/cp -rL /etc/ssh/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42
@@ -615,8 +619,6 @@ scrape_configs:
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmlogger --log=- --report -t 3s -c /var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmlogger.conf --host=remote_c.example.com:55677 /var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/pcp/data/red:remote_c.example.com/%Y%m%d.%H.%M
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmproxy --log=- --foreground --timeseries --port=44566 --redishost=localhost --redisport=17001 --config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmproxy.conf
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/prometheus --config.file=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus/prometheus.yml --storage.tsdb.path=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_a.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_a.example.com yes
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_b.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_b.example.com yes
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_c.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_c.example.com yes
diff --git a/agent/util-scripts/gold/test-client-tool-meister/test-57.txt b/agent/util-scripts/gold/test-client-tool-meister/test-57.txt
index d2ca802e3f..426dace95a 100644
--- a/agent/util-scripts/gold/test-client-tool-meister/test-57.txt
+++ b/agent/util-scripts/gold/test-client-tool-meister/test-57.txt
@@ -6,6 +6,11 @@
"node-exporter" tool is now registered for host "remote_b.example.com", with label "blue", in group "lite"
"dcgm" tool is now registered for host "remote_c.example.com", with label "red", in group "lite"
"pcp" tool is now registered for host "remote_c.example.com", with label "red", in group "lite"
+pbench-tool-data-sink: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
system information not collected when --interrupt specified
--- Finished test-57 test-client-tool-meister (status=0)
+++ pbench tree state
@@ -121,7 +126,6 @@ system information not collected when --interrupt specified
/var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com/tm-sysinfo.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err
-/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log
@@ -216,7 +220,7 @@ INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating
=== /var/tmp/pbench-test-utils/pbench/tmp/tm-lite-remote_c.example.com.err:
INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel
INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com
-INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf']
INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm
INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp
@@ -229,7 +233,7 @@ INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.err:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/blue:remote_b.example.com/node-exporter/tm-node-exporter-start.out:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/dcgm.file:
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.err:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/dcgm/tm-dcgm-start.out:
=== /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com/pcp/pmcd.file:
@@ -384,12 +388,9 @@ install_check_output = mpstat: pbench-sysstat-12.0.3 is installed
--- mock-run/metadata.log file contents
+++ mock-run/tm/pbench-tool-data-sink.err file contents
+
Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))...
-Listening on http://localhost:8080/
Hit Ctrl-C to quit.
-
---- mock-run/tm/pbench-tool-data-sink.err file contents
-+++ mock-run/tm/pbench-tool-data-sink.log file contents
INFO pbench-tool-data-sink execute -- Tool Data Sink terminating
INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_a.example.com HTTP/1.1" 200 0
INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /sysinfo-data/27c00bc325171c4893ef3862b4340952/remote_b.example.com HTTP/1.1" 200 0
@@ -401,7 +402,8 @@ INFO pbench-tool-data-sink log_request -- 127.0.0.1 - - "PUT /tool-data/9e243dae
INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ...
INFO pbench-tool-data-sink web_server_run -- Bottle web server exited
INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
---- mock-run/tm/pbench-tool-data-sink.log file contents
+Listening on http://localhost:8080/
+--- mock-run/tm/pbench-tool-data-sink.err file contents
+++ mock-run/tm/pbench-tool-data-sink.out file contents
--- mock-run/tm/pbench-tool-data-sink.out file contents
+++ mock-run/tm/redis.conf file contents
@@ -425,7 +427,7 @@ port 17001
+++ mock-run/tm/tm-lite-testhost.example.com.err file contents
INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel
INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com
-INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process
@@ -475,7 +477,7 @@ remote_b.example.com 0014 INFO pbench-tool-meister wait -- Stopped persistent to
remote_b.example.com 0015 INFO pbench-tool-meister __exit__ -- remote_b.example.com: terminating
remote_c.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com block,security_mitigations,sos parallel
remote_c.example.com 0001 INFO pbench-tool-meister _send_directory -- remote_c.example.com: PUT sysinfo-data completed lite /var/tmp/pbench-test-utils/pbench/tmp/tm.lite.NNNNN.nnnnnnnn/red:remote_c.example.com
-remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+remote_c.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
remote_c.example.com 0003 INFO pbench-tool-meister start -- Started persistent tool pcp, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmcd', '--foreground', '--socket=./pmcd.socket', '--port=55677', '--config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmcd.conf']
remote_c.example.com 0004 INFO pbench-tool-meister stop -- Terminate issued for persistent tool dcgm
remote_c.example.com 0005 INFO pbench-tool-meister stop -- Terminate issued for persistent tool pcp
@@ -484,7 +486,7 @@ remote_c.example.com 0007 INFO pbench-tool-meister wait -- Stopped persistent to
remote_c.example.com 0008 INFO pbench-tool-meister __exit__ -- remote_c.example.com: terminating
testhost.example.com 0000 INFO pbench-tool-meister sysinfo -- pbench-sysinfo-dump -- /var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pbench-sysinfo-dump /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com block,security_mitigations,sos parallel
testhost.example.com 0001 INFO pbench-tool-meister sysinfo -- testhost.example.com: sysinfo send (no-op) lite /var/tmp/pbench-test-utils/pbench/mock-run/sysinfo/beg/testhost.example.com
-testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, env PYTHONPATH=/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings:/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/bindings/common, args ['python2', '/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py']
+testhost.example.com 0002 INFO pbench-tool-meister start -- Started persistent tool dcgm, ['/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter']
testhost.example.com 0003 INFO pbench-tool-meister start -- mpstat: start_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --start --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
testhost.example.com 0004 INFO pbench-tool-meister stop -- mpstat: stop_tool -- /var/tmp/pbench-test-utils/opt/pbench-agent/tool-scripts/mpstat --stop --dir=/var/tmp/pbench-test-utils/pbench/mock-run/0-iter-zero/sample42/tools-lite/testhost.example.com --interval=42 --options=forty-two
testhost.example.com 0005 INFO pbench-tool-meister wait -- Waiting for transient tool mpstat stop process
@@ -535,15 +537,15 @@ scrape_configs:
- job_name: 'remote_c.example.com_dcgm'
static_configs:
- - targets: ['remote_c.example.com:8000']
+ - targets: ['remote_c.example.com:9400']
- job_name: 'testhost.example.com_dcgm'
static_configs:
- - targets: ['testhost.example.com:8000']
+ - targets: ['testhost.example.com:9400']
--- tools-lite/prometheus/prometheus.yml file contents
+++ tools-lite/testhost.example.com/dcgm/dcgm.file file contents
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
--- tools-lite/testhost.example.com/dcgm/dcgm.file file contents
+++ tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents
--- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.err file contents
@@ -551,6 +553,8 @@ scrape_configs:
--- tools-lite/testhost.example.com/dcgm/tm-dcgm-start.out file contents
+++ test-execution.log file contents
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/cp -rL /etc/ssh/ssh_config.d /var/tmp/pbench-test-utils/pbench/mock-run/
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
+/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/dcgm-exporter
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/mpstat -P ALL --options=forty-two 42
@@ -571,8 +575,6 @@ scrape_configs:
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmlogger --log=- --report -t 3s -c /var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmlogger.conf --host=remote_c.example.com:55677 /var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/pcp/data/red:remote_c.example.com/%Y%m%d.%H.%M
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/pmproxy --log=- --foreground --timeseries --port=44566 --redishost=localhost --redisport=17001 --config=/var/tmp/pbench-test-utils/opt/pbench-agent/templates/pmproxy.conf
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/prometheus --config.file=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus/prometheus.yml --storage.tsdb.path=/var/tmp/pbench-test-utils/pbench/mock-run/tools-lite/prometheus --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
-/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/samples/scripts/dcgm_prometheus.py
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_a.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_a.example.com yes
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_b.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_b.example.com yes
/var/tmp/pbench-test-utils/opt/pbench-agent/unittest-scripts/ssh -o StrictHostKeyChecking=no remote_c.example.com /var/tmp/pbench-test-utils/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister-remote localhost 17001 tm-lite-remote_c.example.com yes
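
The gold output above captures two related changes: the dcgm persistent tool now launches the standalone dcgm-exporter binary instead of the python2 dcgm_prometheus.py sample script (hence the PYTHONPATH settings vanish from the start messages), and the generated Prometheus scrape targets move from port 8000 to dcgm-exporter's default of 9400. A minimal sketch of how scrape_configs entries like those above can be derived from the persistent-tool metadata visible in the params_key dumps; the helper name and metadata shape here are assumptions for illustration, not the Tool Data Sink's actual code:

    # Hypothetical helper: derive Prometheus scrape jobs from persistent-tool
    # metadata of the shape shown in the params_key dumps in this diff.
    def scrape_configs(hosts, persistent_tools):
        jobs = []
        for host in hosts:
            for tool, md in sorted(persistent_tools.items()):
                if md.get("collector") != "prometheus":
                    continue  # e.g. pcp data flows through pmlogger/pmproxy
                jobs.append(
                    {
                        "job_name": f"{host}_{tool}",
                        "static_configs": [{"targets": [f"{host}:{md['port']}"]}],
                    }
                )
        return jobs

    persistent = {
        "dcgm": {"collector": "prometheus", "port": "9400"},
        "node-exporter": {"collector": "prometheus", "port": "9100"},
        "pcp": {"collector": "pcp", "port": "44321"},
    }
    print(scrape_configs(["testhost.example.com"], persistent))
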
diff --git a/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt b/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt
index 969847d2bd..81c79120e9 100644
--- a/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt
+++ b/agent/util-scripts/gold/test-start-stop-tool-meister/test-51.txt
@@ -5,7 +5,7 @@
3. push tool group data and metadata
4. starting tool data sink
5a. starting localhost tool meister
-6. waiting for all successfully spawned SSH processes to show up as subscribers
+6. waiting for all successfully created Tool Meister processes to show up as subscribers
8. Initialize persistent tools
channel pbench-agent-cli-to-client payload, '{"action": "end", "kind": "ds", "status": "success"}'
channel pbench-agent-cli-to-client payload, '{"action": "init", "kind": "ds", "status": "success"}'
@@ -18,6 +18,8 @@ next pbench-agent-cli-to-client
payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "end", "kind": "ds", "status": "success"}'}
payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "init", "kind": "ds", "status": "success"}'}
payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "startup", "kind": "ds", "status": "success"}'}
+pbench-tool-data-sink: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
publish end on chan pbench-agent-cli-from-client
publish init on chan pbench-agent-cli-from-client
publish terminate on chan pbench-agent-cli-from-client
@@ -33,7 +35,6 @@ waiting for tool-data-sink (#####) to exit
/var/tmp/pbench-test-utils/pbench/mock-run/ssh_config.d
/var/tmp/pbench-test-utils/pbench/mock-run/tm
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err
-/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log
@@ -108,22 +109,20 @@ install_check_output = perf: perf is installed
--- mock-run/metadata.log file contents
+++ mock-run/tm/pbench-tool-data-sink.err file contents
+DEBUG pbench-tool-data-sink daemon -- re-constructing Redis server object
+DEBUG pbench-tool-data-sink daemon -- reconstructed Redis server object
+DEBUG pbench-tool-data-sink driver -- params_key (tds-default): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'bind_hostname': 'localhost', 'channel_prefix': 'pbench-agent-cli', 'group': 'default', 'optional_md': {'config': '', 'date': '1900-01-01T00:00:00', 'script': 'fake-bm', 'ssh_opts': '-o StrictHostKeyChecking=no'}, 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tool_trigger': None, 'tools': {'testhost.example.com': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}}
+INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))...
Listening on http://localhost:8080/
Hit Ctrl-C to quit.
---- mock-run/tm/pbench-tool-data-sink.err file contents
-+++ mock-run/tm/pbench-tool-data-sink.log file contents
-DEBUG pbench-tool-data-sink main -- params_key (tds-default): b'{"benchmark_run_dir": "/var/tmp/pbench-test-utils/pbench/mock-run", "bind_hostname": "localhost", "channel_prefix": "pbench-agent-cli", "group": "default", "optional_md": {"config": "", "date": "1900-01-01T00:00:00", "script": "fake-bm", "ssh_opts": "-o StrictHostKeyChecking=no"}, "tool_metadata": {"persistent": {"dcgm": {"collector": "prometheus", "port": "8000"}, "node-exporter": {"collector": "prometheus", "port": "9100"}, "pcp": {"collector": "pcp", "port": "44321"}}, "transient": {"blktrace": null, "bpftrace": null, "cpuacct": null, "disk": null, "dm-cache": null, "docker": null, "docker-info": null, "external-data-source": null, "haproxy-ocp": null, "iostat": null, "jmap": null, "jstack": null, "kvm-spinlock": null, "kvmstat": null, "kvmtrace": null, "lockstat": null, "mpstat": null, "numastat": null, "oc": null, "openvswitch": null, "perf": null, "pidstat": null, "pprof": null, "proc-interrupts": null, "proc-sched_debug": null, "proc-vmstat": null, "prometheus-metrics": null, "qemu-migrate": null, "rabbit": null, "sar": null, "strace": null, "sysfs": null, "systemtap": null, "tcpdump": null, "turbostat": null, "user-tool": null, "virsh-migrate": null, "vmstat": null}}, "tool_trigger": null, "tools": {"testhost.example.com": {"mpstat": "", "perf": "--record-opts=\\"-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions\\" --report-opts=\\"-I -g\\""}}}'
-DEBUG pbench-tool-data-sink main -- Tool Data Sink parameters check out, daemonizing ...
-DEBUG pbench-tool-data-sink main -- constructing Redis() object
-DEBUG pbench-tool-data-sink main -- constructed Redis() object
-INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
DEBUG pbench-tool-data-sink run -- Making tool data sink WSGI server ...
+DEBUG pbench-tool-data-sink run -- Successfully created WSGI server
DEBUG pbench-tool-data-sink run -- Running tool data sink WSGI server ...
DEBUG pbench-tool-data-sink __enter__ -- web server 'run' thread started, processing payloads ...
INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ...
-DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing logs ...
+DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing Tool Meister logs ...
DEBUG pbench-tool-data-sink fetch_message -- next pbench-agent-cli-from-tms
DEBUG pbench-tool-data-sink fetch_message -- payload from pbench-agent-cli-from-tms: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-from-tms', 'data': b'{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}'}
DEBUG pbench-tool-data-sink fetch_message -- channel pbench-agent-cli-from-tms payload, '{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}'
@@ -170,7 +169,7 @@ INFO pbench-tool-data-sink web_server_run -- Bottle web server exited
DEBUG pbench-tool-data-sink __exit__ -- Waiting for the web server thread to exit ...
DEBUG pbench-tool-data-sink __exit__ -- Waiting for the log capture thread to exit ...
DEBUG pbench-tool-data-sink __exit__ -- Exiting Tool Data Sink context ...
---- mock-run/tm/pbench-tool-data-sink.log file contents
+--- mock-run/tm/pbench-tool-data-sink.err file contents
+++ mock-run/tm/pbench-tool-data-sink.out file contents
--- mock-run/tm/pbench-tool-data-sink.out file contents
+++ mock-run/tm/redis.conf file contents
@@ -194,7 +193,7 @@ port 17001
+++ mock-run/tm/tm-default-testhost.example.com.err file contents
DEBUG pbench-tool-meister daemon -- re-constructing Redis server object
DEBUG pbench-tool-meister daemon -- re-constructed Redis server object
-DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
+DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms
DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms
DEBUG pbench-tool-meister driver -- waiting ...
@@ -226,7 +225,7 @@ DEBUG pbench-tool-meister _send_client_status -- publish pbench-agent-cli-from-t
--- mock-run/tm/tm-default-testhost.example.com.out file contents
+++ mock-run/tm/tm.logs file contents
pbench-tool-meister-start - verify logging channel up
-testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
+testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-default-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'default', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
testhost.example.com 0001 DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms
testhost.example.com 0002 DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms
testhost.example.com 0003 DEBUG pbench-tool-meister driver -- waiting ...
diff --git a/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt b/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt
index 38cef83404..64cfa744a8 100644
--- a/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt
+++ b/agent/util-scripts/gold/test-start-stop-tool-meister/test-52.txt
@@ -5,7 +5,7 @@
3. push tool group data and metadata
4. starting tool data sink
5a. starting localhost tool meister
-6. waiting for all successfully spawned SSH processes to show up as subscribers
+6. waiting for all successfully created Tool Meister processes to show up as subscribers
8. Initialize persistent tools
channel pbench-agent-cli-to-client payload, '{"action": "end", "kind": "ds", "status": "success"}'
channel pbench-agent-cli-to-client payload, '{"action": "init", "kind": "ds", "status": "success"}'
@@ -18,6 +18,8 @@ next pbench-agent-cli-to-client
payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "end", "kind": "ds", "status": "success"}'}
payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "init", "kind": "ds", "status": "success"}'}
payload from pbench-agent-cli-to-client: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-to-client', 'data': b'{"action": "startup", "kind": "ds", "status": "success"}'}
+pbench-tool-data-sink: connected to redis server localhost:17001
+pbench-tool-meister: connected to redis server localhost:17001
publish end on chan pbench-agent-cli-from-client
publish init on chan pbench-agent-cli-from-client
publish terminate on chan pbench-agent-cli-from-client
@@ -33,7 +35,6 @@ waiting for tool-data-sink (#####) to exit
/var/tmp/pbench-test-utils/pbench/mock-run/ssh_config.d
/var/tmp/pbench-test-utils/pbench/mock-run/tm
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.err
-/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.log
/var/tmp/pbench-test-utils/pbench/mock-run/tm/pbench-tool-data-sink.out
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.conf
/var/tmp/pbench-test-utils/pbench/mock-run/tm/redis.log
@@ -108,22 +109,20 @@ install_check_output = perf: perf is installed
--- mock-run/metadata.log file contents
+++ mock-run/tm/pbench-tool-data-sink.err file contents
+DEBUG pbench-tool-data-sink daemon -- re-constructing Redis server object
+DEBUG pbench-tool-data-sink daemon -- reconstructed Redis server object
+DEBUG pbench-tool-data-sink driver -- params_key (tds-mygroup): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'bind_hostname': 'localhost', 'channel_prefix': 'pbench-agent-cli', 'group': 'mygroup', 'optional_md': {'config': '', 'date': '1900-01-01T00:00:00', 'script': 'fake-bm', 'ssh_opts': '-o StrictHostKeyChecking=no'}, 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tool_trigger': None, 'tools': {'testhost.example.com': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}}
+INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
Bottle v#.##.## server starting up (using DataSinkWsgiServer(handler_class=.DataSinkWsgiRequestHandler'>))...
Listening on http://localhost:8080/
Hit Ctrl-C to quit.
---- mock-run/tm/pbench-tool-data-sink.err file contents
-+++ mock-run/tm/pbench-tool-data-sink.log file contents
-DEBUG pbench-tool-data-sink main -- params_key (tds-mygroup): b'{"benchmark_run_dir": "/var/tmp/pbench-test-utils/pbench/mock-run", "bind_hostname": "localhost", "channel_prefix": "pbench-agent-cli", "group": "mygroup", "optional_md": {"config": "", "date": "1900-01-01T00:00:00", "script": "fake-bm", "ssh_opts": "-o StrictHostKeyChecking=no"}, "tool_metadata": {"persistent": {"dcgm": {"collector": "prometheus", "port": "8000"}, "node-exporter": {"collector": "prometheus", "port": "9100"}, "pcp": {"collector": "pcp", "port": "44321"}}, "transient": {"blktrace": null, "bpftrace": null, "cpuacct": null, "disk": null, "dm-cache": null, "docker": null, "docker-info": null, "external-data-source": null, "haproxy-ocp": null, "iostat": null, "jmap": null, "jstack": null, "kvm-spinlock": null, "kvmstat": null, "kvmtrace": null, "lockstat": null, "mpstat": null, "numastat": null, "oc": null, "openvswitch": null, "perf": null, "pidstat": null, "pprof": null, "proc-interrupts": null, "proc-sched_debug": null, "proc-vmstat": null, "prometheus-metrics": null, "qemu-migrate": null, "rabbit": null, "sar": null, "strace": null, "sysfs": null, "systemtap": null, "tcpdump": null, "turbostat": null, "user-tool": null, "virsh-migrate": null, "vmstat": null}}, "tool_trigger": null, "tools": {"testhost.example.com": {"mpstat": "", "perf": "--record-opts=\\"-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions\\" --report-opts=\\"-I -g\\""}}}'
-DEBUG pbench-tool-data-sink main -- Tool Data Sink parameters check out, daemonizing ...
-DEBUG pbench-tool-data-sink main -- constructing Redis() object
-DEBUG pbench-tool-data-sink main -- constructed Redis() object
-INFO pbench-tool-data-sink web_server_run -- Running Bottle web server ...
DEBUG pbench-tool-data-sink run -- Making tool data sink WSGI server ...
+DEBUG pbench-tool-data-sink run -- Successfully created WSGI server
DEBUG pbench-tool-data-sink run -- Running tool data sink WSGI server ...
DEBUG pbench-tool-data-sink __enter__ -- web server 'run' thread started, processing payloads ...
INFO pbench-tool-data-sink tm_log_capture -- Running Tool Meister log capture ...
-DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing logs ...
+DEBUG pbench-tool-data-sink __enter__ -- 'tm_log_capture' thread started, processing Tool Meister logs ...
DEBUG pbench-tool-data-sink fetch_message -- next pbench-agent-cli-from-tms
DEBUG pbench-tool-data-sink fetch_message -- payload from pbench-agent-cli-from-tms: {'type': 'message', 'pattern': None, 'channel': b'pbench-agent-cli-from-tms', 'data': b'{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}'}
DEBUG pbench-tool-data-sink fetch_message -- channel pbench-agent-cli-from-tms payload, '{"hostname": "testhost.example.com", "hostname_A": "agent.example.com", "hostname_I": "agent.example.com", "hostname_f": "agent.example.com", "hostname_i": "agent.example.com", "hostname_s": "agent.example.com", "installs": {"mpstat": [0, "mpstat: pbench-sysstat-12.0.3 is installed"], "perf": [0, "perf: perf is installed"]}, "kind": "tm", "label": "", "pid": NNNNN, "seqno": "", "sha1": "(unknown)", "version": "(unknown)"}'
@@ -170,7 +169,7 @@ INFO pbench-tool-data-sink web_server_run -- Bottle web server exited
DEBUG pbench-tool-data-sink __exit__ -- Waiting for the web server thread to exit ...
DEBUG pbench-tool-data-sink __exit__ -- Waiting for the log capture thread to exit ...
DEBUG pbench-tool-data-sink __exit__ -- Exiting Tool Data Sink context ...
---- mock-run/tm/pbench-tool-data-sink.log file contents
+--- mock-run/tm/pbench-tool-data-sink.err file contents
+++ mock-run/tm/pbench-tool-data-sink.out file contents
--- mock-run/tm/pbench-tool-data-sink.out file contents
+++ mock-run/tm/redis.conf file contents
@@ -194,7 +193,7 @@ port 17001
+++ mock-run/tm/tm-mygroup-testhost.example.com.err file contents
DEBUG pbench-tool-meister daemon -- re-constructing Redis server object
DEBUG pbench-tool-meister daemon -- re-constructed Redis server object
-DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
+DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms
DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms
DEBUG pbench-tool-meister driver -- waiting ...
@@ -226,7 +225,7 @@ DEBUG pbench-tool-meister _send_client_status -- publish pbench-agent-cli-from-t
--- mock-run/tm/tm-mygroup-testhost.example.com.out file contents
+++ mock-run/tm/tm.logs file contents
pbench-tool-meister-start - verify logging channel up
-testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '8000'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
+testhost.example.com 0000 DEBUG pbench-tool-meister driver -- params_key (tm-mygroup-testhost.example.com): {'benchmark_run_dir': '/var/tmp/pbench-test-utils/pbench/mock-run', 'channel_prefix': 'pbench-agent-cli', 'controller': 'testhost.example.com', 'group': 'mygroup', 'hostname': 'testhost.example.com', 'label': '', 'tool_metadata': {'persistent': {'dcgm': {'collector': 'prometheus', 'port': '9400'}, 'node-exporter': {'collector': 'prometheus', 'port': '9100'}, 'pcp': {'collector': 'pcp', 'port': '44321'}}, 'transient': {'blktrace': None, 'bpftrace': None, 'cpuacct': None, 'disk': None, 'dm-cache': None, 'docker': None, 'docker-info': None, 'external-data-source': None, 'haproxy-ocp': None, 'iostat': None, 'jmap': None, 'jstack': None, 'kvm-spinlock': None, 'kvmstat': None, 'kvmtrace': None, 'lockstat': None, 'mpstat': None, 'numastat': None, 'oc': None, 'openvswitch': None, 'perf': None, 'pidstat': None, 'pprof': None, 'proc-interrupts': None, 'proc-sched_debug': None, 'proc-vmstat': None, 'prometheus-metrics': None, 'qemu-migrate': None, 'rabbit': None, 'sar': None, 'strace': None, 'sysfs': None, 'systemtap': None, 'tcpdump': None, 'turbostat': None, 'user-tool': None, 'virsh-migrate': None, 'vmstat': None}}, 'tools': {'mpstat': '', 'perf': '--record-opts="-a -freq=100 -g --event=branch-misses --event=cache-misses --event=instructions" --report-opts="-I -g"'}}
testhost.example.com 0001 DEBUG pbench-tool-meister __enter__ -- publish pbench-agent-cli-from-tms
testhost.example.com 0002 DEBUG pbench-tool-meister __enter__ -- published pbench-agent-cli-from-tms
testhost.example.com 0003 DEBUG pbench-tool-meister driver -- waiting ...
diff --git a/agent/util-scripts/pbench-tool-meister-client b/agent/util-scripts/pbench-tool-meister-client
index 6168dd2196..3226264591 100755
--- a/agent/util-scripts/pbench-tool-meister-client
+++ b/agent/util-scripts/pbench-tool-meister-client
@@ -13,7 +13,7 @@ import sys
from pbench.agent.constants import (
cli_tm_allowed_actions,
cli_tm_channel_prefix,
- redis_port,
+ def_redis_port,
)
from pbench.agent.tool_meister_client import Client
@@ -47,30 +47,48 @@ def main(argv):
try:
group = argv[1]
except IndexError:
- raise Exception("Missing group argument")
+ logger.error("Missing group argument")
+ return 1
try:
directory = argv[2]
except IndexError:
- raise Exception("Missing directory argument")
+ logger.error("Missing directory argument")
+ return 1
try:
action = argv[3]
except IndexError:
- raise Exception("Missing action argument")
+ logger.error("Missing action argument")
+ return 1
else:
if action not in cli_tm_allowed_actions:
- raise Exception(
- f"Unrecognized action, '{action}', allowed actions are:"
- f" {cli_tm_allowed_actions}"
+ logger.error(
+ "Unrecognized action, '{}', allowed actions are: {}",
+ action,
+ cli_tm_allowed_actions,
)
+ return 1
elif action == "kill":
# FIXME: we need to implement the gritty method of killing all the
# tool meisters, locally and remotely, and ensuring they are all
# properly shut down.
return 0
+ redis_server = os.environ.get("PBENCH_REDIS_SERVER", f"localhost:{def_redis_port}")
+ parts = redis_server.split(":", 1)
+ if len(parts) != 2:
+ logger.error("Bad Redis server specified, {!r}", redis_server)
+ return 1
+ try:
+ redis_port = int(parts[1])
+    except ValueError:
+        logger.error("Bad port for Redis server specified in %r", redis_server)
+ return 1
+ else:
+ redis_host = parts[0]
+
# The Redis server is always running on the local host with the CLI.
with Client(
- redis_host="localhost",
+ redis_host=redis_host,
redis_port=redis_port,
channel_prefix=cli_tm_channel_prefix,
logger=logger,
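
Both this client (via the PBENCH_REDIS_SERVER environment variable) and pbench-tool-meister-start below (via its new redis-server parameter handling) accept the Redis coordinates as a single "host:port" string and validate the two halves separately. A self-contained sketch of that validation under the same assumptions (a default port of 17001, matching the gold output; the function name is illustrative):

    import os

    DEF_REDIS_PORT = 17001  # assumed default, matching the gold output above

    def parse_redis_server(spec=None):
        """Return (host, port) from a 'host:port' spec, falling back to the
        PBENCH_REDIS_SERVER environment variable and then to the default."""
        if spec is None:
            spec = os.environ.get(
                "PBENCH_REDIS_SERVER", f"localhost:{DEF_REDIS_PORT}"
            )
        parts = spec.split(":", 1)
        if len(parts) != 2:
            raise ValueError(f"Bad Redis server specified, {spec!r}")
        try:
            port = int(parts[1])
        except ValueError:
            raise ValueError(f"Bad port for Redis server specified in {spec!r}")
        return parts[0], port

    print(parse_redis_server("myhost:6379"))  # -> ('myhost', 6379)
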
diff --git a/agent/util-scripts/pbench-tool-meister-start b/agent/util-scripts/pbench-tool-meister-start
index 214c28c825..7816b01ff1 100755
--- a/agent/util-scripts/pbench-tool-meister-start
+++ b/agent/util-scripts/pbench-tool-meister-start
@@ -82,9 +82,8 @@ from pathlib import Path
import redis
-from pbench.agent import PbenchAgentConfig
from pbench.agent.constants import (
- redis_port,
+ def_redis_port,
cli_tm_channel_prefix,
tm_channel_suffix_to_client,
tm_channel_suffix_from_client,
@@ -92,16 +91,11 @@ from pbench.agent.constants import (
)
from pbench.agent.redis import RedisChannelSubscriber
from pbench.agent.tool_data_sink import main as tds_main
+from pbench.agent.tool_group import BadToolGroup, ToolGroup
from pbench.agent.tool_meister import main as tm_main
from pbench.agent.tool_meister_client import Client
from pbench.agent.toolmetadata import ToolMetadata
-from pbench.agent.utils import (
- cli_verify_sysinfo,
- info_log,
- verify_tool_group,
- BadToolGroup,
-)
-from pbench.common.exceptions import BadConfig
+from pbench.agent.utils import cli_verify_sysinfo, error_log, info_log
# Redis server configuration template for pbench's use
@@ -116,120 +110,6 @@ port {redis_port:d}
"""
-class ToolGroup:
- """Provides an in-memory representation of the registered tools as recorded
- on-disk.
- """
-
- def __init__(self, group):
- """Construct a ToolGroup object from the on-disk data of the given
- tool group.
-
- If the given tool group is valid, the contents are read into the three
- dictionary structures:
-
- "toolnames" - each tool name is the key, with separate dictionaries
- for each registered host
-
- "hostnames" - each registered host is the key, with separate
- dictionaries for each tool registered on that host
-
- "labels" - each registered host name, that has a label, is the key,
- and the label as the value; if a host is not labeled, it does not
- show up in this dictionary
-
- Raises BadToolGroup via the verify_tool_group() method on error.
- """
- self.tg_dir = verify_tool_group(group)
- self.group = group
-
- # __trigger__
- try:
- _trigger = (self.tg_dir / "__trigger__").read_text()
- except OSError as ex:
- if ex.errno != errno.ENOENT:
- raise
- # Ignore missing trigger file
- self.trigger = None
- else:
- if len(_trigger) == 0:
- # Ignore empty trigger file contents
- self.trigger = None
- else:
- self.trigger = _trigger
-
- # toolnames - Dict with tool name as the key, dictionary with host
- # names and parameters for each host
- self.toolnames = {}
- # hostnames - Dict with host name as the key, dictionary with tool
- # names and parameters for each tool
- self.hostnames = {}
- self.labels = {}
- for hdirent in os.listdir(self.tg_dir):
- if hdirent == "__trigger__":
- # Ignore handled above
- continue
- if not (self.tg_dir / hdirent).is_dir():
- # Ignore wayward non-directory files
- continue
- # We assume this directory is a hostname.
- host = hdirent
- if host not in self.hostnames:
- self.hostnames[host] = {}
- for tdirent in os.listdir(self.tg_dir / host):
- if tdirent == "__label__":
- self.labels[host] = (
- (self.tg_dir / host / tdirent).read_text().strip()
- )
- continue
- if tdirent.endswith("__noinstall__"):
- # FIXME: ignore "noinstall" for now, tools are going to be
- # in containers so this does not make sense going forward.
- continue
- # This directory entry is the name of a tool.
- tool = tdirent
- tool_opts_raw_lines = (
- (self.tg_dir / host / tool).read_text().split("\n")
- )
- tool_opts_lines = []
- for line_raw in tool_opts_raw_lines:
- line = line_raw.strip()
- if not line:
- # Ignore blank lines
- continue
- tool_opts_lines.append(line)
- tool_opts = " ".join(tool_opts_lines)
- if tool not in self.toolnames:
- self.toolnames[tool] = {}
- self.toolnames[tool][host] = tool_opts
-
- def get_tools(self, host):
- """get_tools - given a target host, return a dictionary with the list
- of tool names as keys, and the values being their options for that
- host.
- """
- tools = dict()
- for tool, opts in self.toolnames.items():
- try:
- host_opts = opts[host]
- except KeyError:
- # This host does not have this tool registered, ignore.
- pass
- else:
- tools[tool] = host_opts
- return tools
-
- def get_label(self, host):
- """get_label - given a target host, return the label associated with
- that host.
- """
- try:
- label = self.labels[host]
- except KeyError:
- label = ""
- return label
-
-
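
The ToolGroup class removed above now lives in pbench.agent.tool_group (imported near the top of this file along with BadToolGroup); its interface is unchanged. A short hedged usage sketch based only on the docstrings above (the group name is made up, and this assumes a pbench-agent installation with tools registered on disk):

    from pbench.agent.tool_group import BadToolGroup, ToolGroup

    try:
        tg = ToolGroup("default")
    except BadToolGroup as exc:
        print(f"invalid tool group: {exc}")
    else:
        for host in tg.hostnames:
            # get_tools() maps tool name -> registered options for this host;
            # get_label() returns '' for hosts without a __label__ file.
            print(host, tg.get_label(host), tg.get_tools(host))
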
def wait_for_tds(chan, logger):
"""wait_for_tds - Wait for the Tool Data Sink to report back success or
failure regarding the Tool Meister environment setup.
@@ -260,14 +140,13 @@ def wait_for_tds(chan, logger):
class ReturnCode:
- """ReturnCode - symbolic return codes for when the main program of
+ """ReturnCode - symbolic return codes for the main program of
pbench-tool-meister-start.
"""
SUCCESS = 0
BADTOOLGROUP = 1
BADAGENTCONFIG = 2
- EXCAGENTCONFIG = 3
MISSINGINSTALLDIR = 4
EXCINSTALLDIR = 5
BADTOOLMETADATA = 6
@@ -290,6 +169,10 @@ class ReturnCode:
EXCTOOLGROUPDIR = 23
SYSINFOFAILED = 24
INITFAILED = 25
+ TDSSTARTUPTIMEOUT = 26
+ TOOLGROUPEXC = 27
+ BADREDISARG = 28
+ BADREDISPORT = 29
# Kill sub-codes
KILL_SUCCESS = 0
@@ -327,7 +210,7 @@ def kill_redis_server(pid_file, ret_val):
else:
try:
pid = int(raw_pid)
- except Exception:
+ except ValueError:
# Bad pid value
return ReturnCode.kill_ret_code(ReturnCode.KILL_BADPID, ret_val)
try:
@@ -395,10 +278,10 @@ def main(_prog, cli_params):
tool_group = ToolGroup(group)
except BadToolGroup as exc:
logger.error(str(exc))
- return 1
+ return ReturnCode.BADTOOLGROUP
except Exception:
logger.exception("failed to load tool group data for '%s'", group)
- return ReturnCode.BADTOOLGROUP
+ return ReturnCode.TOOLGROUPEXC
else:
if not tool_group.hostnames:
# If a tool group has no tools registered, then there will be no
@@ -411,38 +294,31 @@ def main(_prog, cli_params):
# Load the tool metadata
try:
- inst_dir = PbenchAgentConfig(
- os.environ["_PBENCH_AGENT_CONFIG"]
- ).pbench_install_dir
- except BadConfig as exc:
- logger.error("%s", exc)
+ inst_dir = os.environ["pbench_install_dir"]
+ except KeyError:
+ logger.error(
+ "The required 'pbench_install_dir' environment variable appears to be missing"
+ )
return ReturnCode.BADAGENTCONFIG
- except Exception as exc:
+ try:
+ tm_start_path = Path(inst_dir).resolve(strict=True)
+ except FileNotFoundError:
logger.error(
- "Unexpected error encountered loading pbench agent configuration: '%s'", exc
+ "Unable to determine proper installation directory, '%s' not found",
+ inst_dir,
)
- return ReturnCode.EXCAGENTCONFIG
+ return ReturnCode.MISSINGINSTALLDIR
+ except Exception as exc:
+ logger.exception(
+ "Unexpected error encountered resolving installation directory: '%s'", exc,
+ )
+ return ReturnCode.EXCINSTALLDIR
else:
try:
- tm_start_path = Path(inst_dir).resolve(strict=True)
- except FileNotFoundError:
- logger.error(
- "Unable to determine proper installation directory, '%s' not found",
- inst_dir,
- )
- return ReturnCode.MISSINGINSTALLDIR
- except Exception as exc:
- logger.exception(
- "Unexpected error encountered resolving installation directory: '%s'",
- exc,
- )
- return ReturnCode.EXCINSTALLDIR
- else:
- try:
- tool_metadata = ToolMetadata(tm_start_path)
- except Exception:
- logger.exception("failed to load tool metadata")
- return ReturnCode.BADTOOLMETADATA
+ tool_metadata = ToolMetadata(tm_start_path)
+ except Exception:
+ logger.exception("failed to load tool metadata")
+ return ReturnCode.BADTOOLMETADATA
# Load and verify required and optional environment variables.
try:
@@ -463,7 +339,8 @@ def main(_prog, cli_params):
if not full_hostname or not hostname:
logger.error(
"ERROR - _pbench_hostname ('%s') and _pbench_full_hostname ('%s')"
- " environment variables are required",
+ " environment variables are required to represent the respective"
+ " hostname strings",
hostname,
full_hostname,
)
@@ -537,33 +414,52 @@ def main(_prog, cli_params):
# +
# Step 2. - Start the Redis Server
# -
-
- # Create the Redis server pbench-specific configuration file
- redis_conf = tm_dir / "redis.conf"
- params = {"hostnames": hostnames, "tm_dir": tm_dir, "redis_port": redis_port}
- try:
- with redis_conf.open("w") as fp:
- fp.write(redis_conf_tmpl.format(**params))
- except Exception:
- logger.exception("failed to create redis server configuration")
- return ReturnCode.EXCREDISCONFIG
-
- # Start the Redis Server itself
- redis_srvr = "redis-server"
- redis_srvr_path = find_executable(redis_srvr)
- redis_pid = tm_dir / f"redis_{redis_port:d}.pid"
- logger.debug("2. starting redis server")
- try:
- retcode = os.spawnl(os.P_WAIT, redis_srvr_path, redis_srvr, redis_conf)
- except Exception:
- logger.exception("failed to create redis server, daemonized")
- return ReturnCode.EXCSPAWNREDIS
+ if cli_params.redis_server is None:
+ # Create the Redis server pbench-specific configuration file
+ redis_conf = tm_dir / "redis.conf"
+ params = {
+ "hostnames": hostnames,
+ "tm_dir": tm_dir,
+ "redis_port": def_redis_port,
+ }
+ try:
+ with redis_conf.open("w") as fp:
+ fp.write(redis_conf_tmpl.format(**params))
+ except Exception:
+ logger.exception("failed to create redis server configuration")
+ return ReturnCode.EXCREDISCONFIG
+
+ # Start the Redis Server itself
+ redis_srvr = "redis-server"
+ redis_srvr_path = find_executable(redis_srvr)
+ redis_pid = tm_dir / f"redis_{def_redis_port:d}.pid"
+ logger.debug("2. starting redis server")
+ try:
+ retcode = os.spawnl(os.P_WAIT, redis_srvr_path, redis_srvr, redis_conf)
+ except Exception:
+ logger.exception("failed to create redis server, daemonized")
+ return ReturnCode.EXCSPAWNREDIS
+ else:
+ if retcode != 0:
+ logger.error(
+ "failed to create redis server, daemonized; return code: %d",
+ retcode,
+ )
+ return ReturnCode.REDISFAILED
+ redis_host = "localhost"
+ redis_port = def_redis_port
else:
- if retcode != 0:
- logger.error(
- "failed to create redis server, daemonized; return code: %d", retcode
- )
- return ReturnCode.REDISFAILED
+ parts = cli_params.redis_server.split(":", 1)
+ if len(parts) != 2:
+ logger.error("Bad Redis server specified, '%s'", cli_params.redis_server)
+ return ReturnCode.BADREDISARG
+ try:
+ redis_port = int(parts[1])
+ except ValueError:
+ logger.error("Bad Redis port specified, '%s'", cli_params.redis_server)
+ return ReturnCode.BADREDISPORT
+ else:
+ redis_host = parts[0]
# Connect to the Redis Server.
#
@@ -574,11 +470,11 @@ def main(_prog, cli_params):
# listen for responses from the Tool Data Sink.
try:
to_client_channel = f"{cli_tm_channel_prefix}-{tm_channel_suffix_to_client}"
- redis_server = redis.Redis(host="localhost", port=redis_port, db=0)
+ redis_server = redis.Redis(host=redis_host, port=redis_port, db=0)
to_client_chan = RedisChannelSubscriber(redis_server, to_client_channel)
except Exception as exc:
logger.error(
- "Unable to connect to redis server, %s:%d: %r", "localhost", redis_port, exc
+ "Unable to connect to redis server, %s:%d: %r", redis_host, redis_port, exc
)
return kill_redis_server(redis_pid, ReturnCode.REDISCHANFAILED)
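
RedisChannelSubscriber is pbench's own wrapper, but the underlying pattern is a redis-py client plus a pub/sub subscription on the "-to-client" channel. A minimal sketch in plain redis-py (connection details assumed from the gold output; this is not the wrapper's actual implementation):

    import redis

    redis_server = redis.Redis(host="localhost", port=17001, db=0)
    pubsub = redis_server.pubsub()
    pubsub.subscribe("pbench-agent-cli-to-client")

    # get_message() polls; payloads look like the '{"action": ..., "kind":
    # "ds", "status": ...}' messages captured in the gold files above.
    msg = pubsub.get_message(ignore_subscribe_messages=True, timeout=1.0)
    if msg is not None:
        print(msg["channel"], msg["data"])
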
@@ -601,9 +497,9 @@ def main(_prog, cli_params):
controller=_controller,
group=group,
hostname=host,
+ label=tool_group.get_label(host),
tool_metadata=tool_metadata.getFullData(),
tools=tools,
- label=tool_group.get_label(host),
)
# Create a separate key for the Tool Meister that will be on that host
#
@@ -624,13 +520,13 @@ def main(_prog, cli_params):
# Sink.
tds_param_key = f"tds-{group}"
tds = dict(
- channel_prefix=cli_tm_channel_prefix,
benchmark_run_dir=str(benchmark_run_dir),
bind_hostname=tm_bind_hostname,
+ channel_prefix=cli_tm_channel_prefix,
group=group,
+ tool_metadata=tool_metadata.getFullData(),
tool_trigger=tool_group.trigger,
tools=tool_group_data,
- tool_metadata=tool_metadata.getFullData(),
# The following are optional
optional_md=optional_md,
)
@@ -646,186 +542,201 @@ def main(_prog, cli_params):
# 4. Start the Tool Data Sink process
# -
- # FIXME: if only one host is registered, and that host is the same as this
- # controller, then don't bother starting the Tool Data Sink.
- logger.debug("4. starting tool data sink")
- try:
- pid = os.fork()
- if pid == 0:
- # In the child!
-
- # The main() of the Tool Data Sink module will not return here
- # since it will daemonize itself and this child pid will be
- # replaced by a new pid.
- status = tds_main(
- [
- PROG.parent / "pbench-tool-data-sink",
- "localhost",
- str(redis_port),
- tds_param_key,
- ]
- )
- sys.exit(status)
- else:
- # In the parent!
-
- # Wait for the child to finish daemonizing itself.
- retcode = waitpid(pid)
- if retcode != 0:
- logger.error(
- "failed to create pbench data sink, daemonized; return code: %d",
- retcode,
- )
- except Exception:
- logger.exception("failed to create pbench data sink, daemonized")
- return kill_redis_server(redis_pid, ReturnCode.TDSFORKFAILED)
- else:
- # Wait for logging channel to be up and ready before we start the
- # local and remote Tool Meisters.
- num_present = 0
- while num_present == 0:
- try:
- num_present = redis_server.publish(
- f"{cli_tm_channel_prefix}-{tm_channel_suffix_to_logging}",
- "pbench-tool-meister-start - verify logging channel up",
+ if cli_params.redis_server is None:
+ # FIXME: if only one host is registered, and that host is the same as this
+ # controller, then don't bother starting the Tool Data Sink.
+ logger.debug("4. starting tool data sink")
+ try:
+ pid = os.fork()
+ if pid == 0:
+ # In the child!
+
+ # The main() of the Tool Data Sink module will not return here
+ # since it will daemonize itself and this child pid will be
+ # replaced by a new pid.
+ status = tds_main(
+ [
+ PROG.parent / "tool-meister" / "pbench-tool-data-sink",
+ "localhost",
+ str(redis_port),
+ tds_param_key,
+ "yes", # Request tool-data-sink daemonize itself
+ ]
)
- except Exception:
- logger.exception("Failed to verify Tool Data Sink logging sink working")
- return kill_redis_server(redis_pid, ReturnCode.TDSLOGPUBFAILED)
+ sys.exit(status)
else:
- if num_present == 0:
- time.sleep(0.1)
+ # In the parent!
+
+ # Wait for the child to finish daemonizing itself.
+ retcode = waitpid(pid)
+ if retcode != 0:
+ logger.error(
+ "failed to create pbench data sink, daemonized; return code: %d",
+ retcode,
+ )
+ except Exception:
+ logger.exception("failed to create pbench data sink, daemonized")
+ return kill_redis_server(redis_pid, ReturnCode.TDSFORKFAILED)
+ else:
+ # Wait for logging channel to be up and ready before we start the
+ # local and remote Tool Meisters.
+ timeout = time.time() + 60
+ num_present = 0
+ while num_present == 0:
+ try:
+ num_present = redis_server.publish(
+ f"{cli_tm_channel_prefix}-{tm_channel_suffix_to_logging}",
+ "pbench-tool-meister-start - verify logging channel up",
+ )
+ except Exception:
+ logger.exception(
+ "Failed to verify Tool Data Sink logging sink working"
+ )
+ return kill_redis_server(redis_pid, ReturnCode.TDSLOGPUBFAILED)
+ else:
+ if num_present == 0:
+ if time.time() > timeout:
+ logger.error(
+ "The Tool Data Sink failed to start within one minute"
+ )
+ return kill_redis_server(
+ redis_pid, ReturnCode.TDSSTARTUPTIMEOUT
+ )
+ else:
+ time.sleep(0.1)
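
This startup check leans on Redis PUBLISH semantics: the call returns the number of subscribers that received the message, so publishing to the logging channel until the count is non-zero doubles as a readiness probe for the daemonized Tool Data Sink. The same pattern, condensed into a standalone helper (the names are illustrative, not taken from the code):

    import time

    def wait_for_subscriber(redis_server, channel, message, timeout_secs=60):
        """Publish until at least one subscriber receives the message;
        return False if none shows up before the timeout expires."""
        deadline = time.time() + timeout_secs
        while True:
            # redis-py's publish() returns the subscriber count.
            if redis_server.publish(channel, message) > 0:
                return True
            if time.time() > deadline:
                return False
            time.sleep(0.1)
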
# +
# 5. Start all the local and remote Tool Meisters
# -
- failures = 0
- successes = 0
- # NOTE: it is assumed that the location of the pbench-tool-meister command
- # is the same on the local host as it is on any remote host.
- tool_meister_cmd = PROG.parent / "tool-meister" / "pbench-tool-meister"
- ssh_cmd = "ssh"
- ssh_path = find_executable(ssh_cmd)
- base_args = [
- ssh_cmd,
- ]
- base_args.extend(shlex.split(ssh_opts))
- args = [
- "",
- f"{tool_meister_cmd}-remote",
- tm_bind_hostname,
- str(redis_port),
- "",
- "yes",
- ]
- tms = dict()
- tm_count = 0
- for host in tool_group.hostnames.keys():
- tm_count += 1
- tm_param_key = f"tm-{group}-{host}"
- if host == full_hostname:
- logger.debug("5a. starting localhost tool meister")
- try:
- pid = os.fork()
- if pid == 0:
- # In the child!
-
- # The main() of the Tool Meister module will not return
- # here since it will daemonize itself and this child pid
- # will be replaced by a new pid.
- status = tm_main(
- [
- str(tool_meister_cmd),
- "localhost",
- str(redis_port),
- tm_param_key,
- "yes",
- ]
+ if cli_params.redis_server is None:
+ failures = 0
+ successes = 0
+ # NOTE: it is assumed that the location of the pbench-tool-meister command
+ # is the same on the local host as it is on any remote host.
+ tool_meister_cmd = PROG.parent / "tool-meister" / "pbench-tool-meister"
+ ssh_cmd = "ssh"
+ ssh_path = find_executable(ssh_cmd)
+ base_args = [
+ ssh_cmd,
+ ]
+ base_args.extend(shlex.split(ssh_opts))
+ args = [
+ "",
+ f"{tool_meister_cmd}-remote",
+ tm_bind_hostname,
+ str(redis_port),
+ "",
+ "yes", # Yes, request the tool meister daemonize itself
+ ]
+ tms = dict()
+ tm_count = 0
+ for host in tool_group.hostnames.keys():
+ tm_count += 1
+ tm_param_key = f"tm-{group}-{host}"
+ if host == full_hostname:
+ logger.debug("5a. starting localhost tool meister")
+ try:
+ pid = os.fork()
+ if pid == 0:
+ # In the child!
+
+ # The main() of the Tool Meister module will not return
+ # here since it will daemonize itself and this child pid
+ # will be replaced by a new pid.
+ status = tm_main(
+ [
+ str(tool_meister_cmd),
+ "localhost",
+ str(redis_port),
+ tm_param_key,
+ "yes", # Yes, daemonize yourself TM ...
+ ]
+ )
+ sys.exit(status)
+ else:
+ # In the parent!
+ pass
+ except Exception:
+ logger.exception(
+ "failed to create localhost tool meister, daemonized"
)
- sys.exit(status)
+ failures += 1
+ tms[host] = {"pid": None, "status": "failed"}
else:
- # In the parent!
- pass
- except Exception:
- logger.exception("failed to create localhost tool meister, daemonized")
- failures += 1
- tms[host] = {"pid": None, "status": "failed"}
+ # Record the child pid to wait below.
+ tms[host] = {"pid": pid, "status": "forked"}
else:
- # Record the child pid to wait below.
- tms[host] = {"pid": pid, "status": "forked"}
- else:
- args[0] = host
- args[4] = tm_param_key
- ssh_args = base_args + args
- logger.debug(
- "5b. starting remote tool meister, ssh_path=%r ssh_args=%r",
- ssh_path,
- ssh_args,
- )
+ args[0] = host
+ args[4] = tm_param_key
+ ssh_args = base_args + args
+ logger.debug(
+ "5b. starting remote tool meister, ssh_path=%r ssh_args=%r",
+ ssh_path,
+ ssh_args,
+ )
+ try:
+ pid = os.spawnv(os.P_NOWAIT, ssh_path, ssh_args)
+ except Exception:
+ logger.exception(
+ "failed to create a tool meister instance for host %s", host
+ )
+ tms[host] = {"pid": None, "status": "failed"}
+ else:
+ # Record the child pid to wait below.
+ tms[host] = {"pid": pid, "status": "spawned"}
+
+ for host, tm_proc in tms.items():
+ if tm_proc["status"] == "failed":
+ failures += 1
+ continue
+ pid = tm_proc["pid"]
try:
- pid = os.spawnv(os.P_NOWAIT, ssh_path, ssh_args)
+ exit_status = waitpid(pid)
except Exception:
+ failures += 1
logger.exception(
"failed to create a tool meister instance for host %s", host
)
- tms[host] = {"pid": None, "status": "failed"}
else:
- # Record the child pid to wait below.
- tms[host] = {"pid": pid, "status": "spawned"}
+ if exit_status != 0:
+ failures += 1
+ logger.error(
+ "failed to start tool meister on remote host '%s'"
+ " (pid %d), exit status: %d",
+ host,
+ pid,
+ exit_status,
+ )
+ else:
+ successes += 1
- failures = 0
- for host, tm_proc in tms.items():
- if tm_proc["status"] == "failed":
- failures += 1
- continue
- pid = tm_proc["pid"]
- try:
- exit_status = waitpid(pid)
- except Exception:
- failures += 1
- logger.exception(
- "failed to create a tool meister instance for host %s", host
- )
- else:
- if exit_status != 0:
- failures += 1
- logger.error(
- "failed to start tool meister on remote host '%s'"
- " (pid %d), exit status: %d",
- host,
- pid,
- exit_status,
+ if failures > 0:
+ # Don't wait for the Tool Meisters
+ logger.info("terminating tool meister startup due to failures")
+ terminate_msg = dict(action="terminate", group=group, directory=None)
+ try:
+ ret = redis_server.publish(
+ f"{cli_tm_channel_prefix}-{tm_channel_suffix_from_client}",
+ json.dumps(terminate_msg, sort_keys=True),
)
+ except Exception:
+ logger.exception("Failed to publish terminate message")
else:
- successes += 1
+ logger.debug("publish('terminate') = %r", ret)
+ return kill_redis_server(redis_pid, ReturnCode.TMFAILURES)
- if failures > 0:
- # Don't wait for the Tool Meisters
- logger.info("terminating tool meister startup due to failures")
- terminate_msg = dict(action="terminate", group=group, directory=None)
- try:
- ret = redis_server.publish(
- f"{cli_tm_channel_prefix}-{tm_channel_suffix_from_client}",
- json.dumps(terminate_msg, sort_keys=True),
+ if successes == 0:
+ logger.warning(
+ "unable to successfully start any tool meisters,"
+ " but encountered no failures either: terminating"
)
- except Exception:
- logger.exception("Failed to publish terminate message")
- else:
- logger.debug("publish('terminate') = %r", ret)
- return kill_redis_server(redis_pid, ReturnCode.TMFAILURES)
+ return kill_redis_server(redis_pid, ReturnCode.TMNOSUCCESSES)
- if successes == 0:
- logger.warning(
- "unable to successfully start any tool meisters,"
- " but encountered no failures either: terminating"
+ assert successes == tm_count, (
+ f"Logic Bomb! Number of created Tool Meisters, {successes}, does not"
+ f" match the expected number of Tool Meisters, {tm_count}"
)
- return kill_redis_server(redis_pid, ReturnCode.TMNOSUCCESSES)
-
- assert successes == tm_count, (
- f"Logic Bomb! Number of created Tool Meisters, {successes}, does not"
- f" match the expected number of Tool Meisters, {tm_count}"
- )
# +
# 6. Wait for the TDS to send a message reporting that it, and all the
@@ -835,12 +746,16 @@ def main(_prog, cli_params):
# If any successes, then we need to wait for them to show up as
# subscribers.
logger.debug(
- "6. waiting for all successfully spawned SSH processes"
+ "6. waiting for all successfully created Tool Meister processes"
" to show up as subscribers"
)
ret_val = wait_for_tds(to_client_chan, logger)
if ret_val != 0:
- return kill_redis_server(redis_pid, ReturnCode.TDSWAITFAILURE)
+ if cli_params.redis_server is None:
+ # We created the Redis server, so we should clean it up.
+ return kill_redis_server(redis_pid, ReturnCode.TDSWAITFAILURE)
+ else:
+ return ReturnCode.TDSWAITFAILURE
# Setup a Client API object using our existing to_client_chan object to
# drive the following client operations ("sysinfo" [optional] and "init"
@@ -856,34 +771,29 @@ def main(_prog, cli_params):
try:
sysinfo_path.mkdir(parents=True)
except Exception:
- logger.error(
- "Unable to create sysinfo-dump directory base path: {}",
- sysinfo_path,
+ error_log(
+ f"Unable to create sysinfo-dump directory base path: {sysinfo_path}"
)
- ret_val = ReturnCode.EXCSYSINFODIR
else:
logger.debug("7. Collecting system information")
info_log("Collecting system information")
- ret_val = client.publish(group, sysinfo_path, "sysinfo", sysinfo)
- ret_val = (
- ReturnCode.SUCCESS if ret_val == 0 else ReturnCode.SYSINFOFAILED
- )
+ # Collecting system information is optional, so we don't gate
+ # the success or failure of the startup on it.
+ client.publish(group, sysinfo_path, "sysinfo", sysinfo)
- if ret_val == ReturnCode.SUCCESS:
- tool_dir = benchmark_run_dir / f"tools-{group}"
- try:
- tool_dir.mkdir(exist_ok=True)
- except Exception as exc:
- logger.error(
- 'failed to create tool output directory, "{}": {}', tool_dir, exc
- )
- ret_val = ReturnCode.EXCTOOLGROUPDIR
- else:
- logger.debug("8. Initialize persistent tools")
- ret_val = client.publish(group, tool_dir, "init", None)
- if ret_val != ReturnCode.SUCCESS:
+ tool_dir = benchmark_run_dir / f"tools-{group}"
+ try:
+ tool_dir.mkdir(exist_ok=True)
+ except Exception as exc:
+ error_log(f"failed to create tool output directory, '{tool_dir}': {exc}")
+ return ReturnCode.EXCTOOLGROUPDIR
+ else:
+ logger.debug("8. Initialize persistent tools")
+ ret_val = client.publish(group, tool_dir, "init", None)
+ if ret_val != 0:
+ if cli_params.redis_server is None:
+ # We created the Redis server, so we should clean it up.
ret_val = kill_redis_server(redis_pid, ReturnCode.INITFAILED)
-
return ret_val
@@ -900,7 +810,17 @@ if __name__ == "__main__":
help="The list of system information items to be collected.",
)
parser.add_argument(
- "tool_group", help="The tool group of items to be run by the Tool Meisters."
+ "--redis-server",
+ dest="redis_server",
+ default=os.environ.get("PBENCH_REDIS_SERVER", None),
+ help=(
+ "Use an existing Redis server specified by :;"
+ " implies an existing Tool Data Sink and Tool Meisters as well."
+ ),
+ )
+ parser.add_argument(
+ "tool_group",
+ help="The tool group name of tools to be run by the Tool Meisters.",
)
parsed = parser.parse_args()
status = main(sys.argv[0], parsed)
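
The start sequence relies on a fork-then-daemonize handshake: the child invokes a main() that daemonizes itself, so the parent reaps only the short-lived intermediate child, whose exit status reports whether daemonization succeeded. A minimal standalone sketch of that handshake (`start_daemonized` and `child_main` are illustrative names, not part of this patch):

```python
import os
import sys


def start_daemonized(child_main):
    # Fork; the child runs child_main(), which is expected to daemonize
    # itself and exit, so its status reflects only the daemonization step.
    pid = os.fork()
    if pid == 0:
        # In the child!
        sys.exit(child_main())
    # In the parent: reap the intermediate child; the daemon proper has
    # been re-parented and is never waited on here.
    _, status = os.waitpid(pid, 0)
    return os.WEXITSTATUS(status) if os.WIFEXITED(status) else 1
```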
diff --git a/agent/util-scripts/pbench-tool-meister-stop b/agent/util-scripts/pbench-tool-meister-stop
index 6a68603dbe..ce4f9b62b9 100755
--- a/agent/util-scripts/pbench-tool-meister-stop
+++ b/agent/util-scripts/pbench-tool-meister-stop
@@ -19,14 +19,10 @@ import time
from argparse import ArgumentParser
from pathlib import Path
-from pbench.agent.constants import redis_port, cli_tm_channel_prefix
+from pbench.agent.constants import def_redis_port, cli_tm_channel_prefix
+from pbench.agent.tool_group import BadToolGroup, ToolGroup
from pbench.agent.tool_meister_client import Client
-from pbench.agent.utils import (
- cli_verify_sysinfo,
- info_log,
- verify_tool_group,
- BadToolGroup,
-)
+from pbench.agent.utils import cli_verify_sysinfo, error_log, info_log
def is_running(pid):
@@ -42,6 +38,87 @@ def is_running(pid):
return True
+def wait_for_pid(pid):
+ """wait_for_pid - wait for a process to actually stop running.
+ """
+ while is_running(pid):
+ time.sleep(0.1)
+
+
+def graceful_shutdown(
+ benchmark_run_dir, full_hostname, group, redis_server_pid_file, logger
+):
+ # The assumption/assertion here is that the tool meister "stop" command is
+ # run on the same node as the tool meister "start" command ran, creating
+ # the local Tool Data Sink and the optional local Tool Meister. We want to
+ # make sure anything "local" to this stop command is shut down gracefully
+ # before we report back to the user. If Tool Meisters from remote nodes
+ # have already reported that they have received the "terminate" message,
+ # then we trust they will shutdown gracefully themselves.
+ try:
+ tds_pid_file = benchmark_run_dir / "tm" / "pbench-tool-data-sink.pid"
+ try:
+ pid_str = tds_pid_file.read_text()
+ except OSError as exc:
+ if exc.errno != errno.ENOENT:
+ raise
+ else:
+ tds_pid = int(pid_str)
+ logger.debug("waiting for tool-data-sink (%d) to exit", tds_pid)
+ wait_for_pid(tds_pid)
+ except Exception:
+ logger.exception("Exception encountered waiting for tool-data-sink")
+ ret_val = 1
+ else:
+ ret_val = 0
+
+ try:
+ ltm_pid_file = benchmark_run_dir / "tm" / f"tm-{group}-{full_hostname}.pid"
+ try:
+ pid_str = ltm_pid_file.read_text()
+ except OSError as exc:
+ if exc.errno != errno.ENOENT:
+ raise
+ else:
+ ltm_pid = int(pid_str)
+ logger.debug("waiting for local tool-meister (%d) to exit", ltm_pid)
+ wait_for_pid(ltm_pid)
+ except Exception:
+ logger.exception("Exception encountered waiting for local tool-meister")
+ ret_val = 1
+
+ # All was good so far, so we can terminate the redis server.
+ try:
+ try:
+ pid_str = redis_server_pid_file.read_text()
+ except OSError as exc:
+ if exc.errno != errno.ENOENT:
+ raise
+ else:
+ redis_server_pid = int(pid_str)
+ pid_exists = True
+ timeout = time.time() + 60
+ while pid_exists:
+ try:
+ os.kill(redis_server_pid, signal.SIGTERM)
+ except ProcessLookupError:
+ pid_exists = False
+ else:
+ if time.time() > timeout:
+ try:
+ os.kill(redis_server_pid, signal.SIGKILL)
+ except ProcessLookupError:
+ pid_exists = False
+ except Exception:
+ raise
+ time.sleep(0.1)
+ except Exception:
+ logger.exception("Exception encountered terminating Redis server")
+ ret_val = 1
+
+ return ret_val
+
+
def main(_prog, cli_params):
"""Main program for the tool meister stop CLI interface.
@@ -81,7 +158,7 @@ def main(_prog, cli_params):
logger.addHandler(sh)
try:
- verify_tool_group(cli_params.tool_group)
+ ToolGroup.verify_tool_group(cli_params.tool_group)
except BadToolGroup as exc:
logger.error(str(exc))
return 1
@@ -103,20 +180,38 @@ def main(_prog, cli_params):
full_hostname = os.environ["_pbench_full_hostname"]
benchmark_run_dir = Path(os.environ["benchmark_run_dir"]).resolve(strict=True)
except Exception:
- logger.exception("failed to fetch parameters from the environment")
+ logger.exception("failed to fetch required parameters from the environment")
return 1
- try:
- redis_server_pid_file = (
- benchmark_run_dir / "tm" / f"redis_{redis_port:d}.pid"
- ).resolve(strict=True)
- except FileNotFoundError:
- # No Redis server, nothing to do.
- return 0
+ if cli_params.redis_server is None:
+ # No Redis server was given, so look locally to see if we can find it.
+ # If no Redis server locally, we're done.
+ try:
+ redis_server_pid_file = (
+ benchmark_run_dir / "tm" / f"redis_{def_redis_port:d}.pid"
+ ).resolve(strict=True)
+ except FileNotFoundError:
+ # No Redis server, nothing to do.
+ return 0
+ else:
+ redis_host = "localhost"
+ redis_port = def_redis_port
+ else:
+ parts = cli_params.redis_server.split(":", 1)
+ if len(parts) != 2:
+ logger.error("Bad Redis server specified, '%s'", cli_params.redis_server)
+ return 1
+ try:
+ redis_port = int(parts[1])
+ except Exception:
+ logger.error("Bad Redis port specified, '%s'", cli_params.redis_server)
+ return 1
+ else:
+ redis_host = parts[0]
-    # The Redis server is always running on the local host with the CLI.
+    # Connect to the Redis server, which may be local or remote.
with Client(
- redis_host="localhost",
+ redis_host=redis_host,
redis_port=redis_port,
channel_prefix=cli_tm_channel_prefix,
logger=logger,
@@ -126,23 +221,19 @@ def main(_prog, cli_params):
try:
tool_dir.mkdir(exist_ok=True)
except Exception as exc:
- logger.error(
- 'failed to create tool output directory, "{}": {}', tool_dir, exc
- )
+ error_log(f"failed to create tool output directory, '{tool_dir}': {exc}")
end_ret_val = 1
else:
end_ret_val = client.publish(group, tool_dir, "end", None)
-
- # Next we collect the system configuration information, but only if the
- # "end" operation was successful, and if it was requested.
+ # Next we collect the system configuration information only if we were
+ # successfully able to end the persistent tools run.
if end_ret_val == 0 and sysinfo:
sysinfo_path = benchmark_run_dir / "sysinfo" / "end"
try:
sysinfo_path.mkdir(parents=True)
except Exception:
- logger.error(
- "Unable to create sysinfo-dump directory base path: {}",
- sysinfo_path,
+ error_log(
+ f"Unable to create sysinfo-dump directory base path: {sysinfo_path}",
)
else:
logger.info("Collecting system information")
@@ -163,65 +254,19 @@ def main(_prog, cli_params):
# just return the success/failure of the terminate operation.
ret_val = end_ret_val if end_ret_val != 0 else term_ret_val
- # The assumption/assertion here is that the tool meister "stop" command is
- # run on the same node as the tool meister "start" command ran, creating
- # the local Tool Data Sink and the optional local Tool Meister. We want to
- # make sure anything "local" to this stop command is shut down gracefully
- # before we report back to the user. If Tool Meisters from remote nodes
- # have already reported that they have received the "terminate" message,
- # then we trust they will shutdown gracefully themselves.
- try:
- tds_pid_file = benchmark_run_dir / "tm" / "pbench-tool-data-sink.pid"
- try:
- pid_str = tds_pid_file.read_text()
- except OSError as exc:
- if exc.errno != errno.ENOENT:
- raise
- else:
- tds_pid = int(pid_str)
- logger.debug("waiting for tool-data-sink (%d) to exit", tds_pid)
- while is_running(tds_pid):
- time.sleep(0.1)
- except Exception:
- logger.exception("Exception encountered waiting for tool-data-sink")
- ret_val = 1
-
- try:
- ltm_pid_file = benchmark_run_dir / "tm" / f"tm-{group}-{full_hostname}.pid"
- try:
- pid_str = ltm_pid_file.read_text()
- except OSError as exc:
- if exc.errno != errno.ENOENT:
- raise
- else:
- ltm_pid = int(pid_str)
- logger.debug("waiting for local tool-meister (%d) to exit", ltm_pid)
- while is_running(ltm_pid):
- time.sleep(0.1)
- except Exception:
- logger.exception("Exception encountered waiting for local tool-meister")
- ret_val = 1
-
- # All was good so far, so we can terminate the Redis server.
- try:
- try:
- pid_str = redis_server_pid_file.read_text()
- except OSError as exc:
- if exc.errno != errno.ENOENT:
- raise
- else:
- redis_server_pid = int(pid_str)
- pid_exists = True
- while pid_exists:
- try:
- os.kill(redis_server_pid, signal.SIGTERM)
- except ProcessLookupError:
- pid_exists = False
- else:
- time.sleep(0.1)
- except Exception:
- logger.exception("Exception encountered terminating Redis server")
- ret_val = 1
+ if cli_params.redis_server is None:
+ # The client operations have finished, successful or unsuccessfully,
+ # and we were not given an explicit Redis server to use. So the
+ # previous pbench-tool-meister-start must have set up the local Tool
+ # Data Sink, Tool Meister (if registered), and the Redis server. It is
+ # our responsibility to make sure these processes shut down correctly.
+ shutdown_ret_val = graceful_shutdown(
+ benchmark_run_dir, full_hostname, group, redis_server_pid_file, logger
+ )
+ if ret_val == 0:
+ # If client termination was successful, report the status of the
+ # graceful shutdown of the Tool Data Sink and the Redis server.
+ ret_val = shutdown_ret_val
return ret_val
@@ -243,7 +288,18 @@ if __name__ == "__main__":
help="Whether or not the stop operation is in response to an interrupt.",
)
parser.add_argument(
- "tool_group", help="The tool group of items being run in the Tool Meisters."
+ "--redis-server",
+ dest="redis_server",
+ default=os.environ.get("PBENCH_REDIS_SERVER", None),
+ help=(
+ "Use an existing Redis server specified by :;"
+ " implies the use of an existing Tool Data Sink and Tool Meisters"
+ " as well."
+ ),
+ )
+ parser.add_argument(
+ "tool_group",
+ help="The tool group name of tools being run in the Tool Meisters.",
)
parsed = parser.parse_args()
status = main(sys.argv[0], parsed)
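
The graceful_shutdown() logic added above bounds Redis server termination with a one-minute deadline before escalating. A condensed sketch of that escalation loop (`terminate_with_timeout` is an illustrative name):

```python
import os
import signal
import time


def terminate_with_timeout(pid, timeout_secs=60):
    # Deliver SIGTERM repeatedly; once the deadline passes, escalate to
    # SIGKILL. ProcessLookupError means the process is gone.
    deadline = time.time() + timeout_secs
    while True:
        sig = signal.SIGKILL if time.time() > deadline else signal.SIGTERM
        try:
            os.kill(pid, sig)
        except ProcessLookupError:
            return
        time.sleep(0.1)
```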
diff --git a/agent/util-scripts/test-bin/samples/scripts/dcgm_prometheus.py b/agent/util-scripts/test-bin/dcgm-exporter
similarity index 100%
rename from agent/util-scripts/test-bin/samples/scripts/dcgm_prometheus.py
rename to agent/util-scripts/test-bin/dcgm-exporter
diff --git a/agent/util-scripts/tool-meister/pbench-tool-data-sink b/agent/util-scripts/tool-meister/pbench-tool-data-sink
new file mode 100755
index 0000000000..985f830799
--- /dev/null
+++ b/agent/util-scripts/tool-meister/pbench-tool-data-sink
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+
+"""Simple command-line wrapper to keep the tool data sink from being in the
+CLI command set, while still allowing it to be invoked by the container entry
+point.
+"""
+
+import sys
+
+from pbench.agent.tool_data_sink import main
+
+
+status = main(sys.argv)
+sys.exit(status)
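
For orientation, here is a hedged example of invoking this wrapper directly; the argument order (Redis host, Redis port, parameter key, daemonize flag) follows tool_data_sink.main(), and the host, port, and key values are illustrative:

```python
import subprocess

subprocess.run(
    [
        "/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-data-sink",
        "localhost",    # Redis host (illustrative)
        "17001",        # Redis port (the default, def_redis_port)
        "tds-default",  # parameter key holding the JSON parameter block
        "no",           # "no" keeps the Tool Data Sink in the foreground
    ],
    check=True,
)
```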
diff --git a/agent/util-scripts/tool-meister/tool-data-sink-ep b/agent/util-scripts/tool-meister/tool-data-sink-ep
new file mode 100755
index 0000000000..47d02d65a5
--- /dev/null
+++ b/agent/util-scripts/tool-meister/tool-data-sink-ep
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+_dir="$(dirname ${0})"
+
+source /etc/profile.d/pbench-agent.sh
+source /opt/pbench-agent/base
+# Instruct the Tool Data Sink not to daemonize.
+/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-data-sink "${REDIS_HOST}" "${REDIS_PORT}" "${PARAM_KEY}" no
diff --git a/agent/util-scripts/tool-meister/tool-meister-ep b/agent/util-scripts/tool-meister/tool-meister-ep
new file mode 100755
index 0000000000..dce85d64b9
--- /dev/null
+++ b/agent/util-scripts/tool-meister/tool-meister-ep
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+_dir="$(dirname ${0})"
+
+source /etc/profile.d/pbench-agent.sh
+source /opt/pbench-agent/base
+# Instruct the Tool Meister not to daemonize.
+/opt/pbench-agent/util-scripts/tool-meister/pbench-tool-meister "${REDIS_HOST}" "${REDIS_PORT}" "${PARAM_KEY}" no
diff --git a/agent/util-scripts/unittests b/agent/util-scripts/unittests
index 66682e7e20..4333c5c425 100755
--- a/agent/util-scripts/unittests
+++ b/agent/util-scripts/unittests
@@ -528,7 +528,7 @@ function sort_log_file {
}
function sort_tdslog {
- sort_log_file ${_testdir}/mock-run/tm/pbench-tool-data-sink.log
+ sort_log_file ${_testdir}/mock-run/tm/pbench-tool-data-sink.err
}
function sort_tmlogs {
diff --git a/lib/pbench/agent/base.py b/lib/pbench/agent/base.py
index 30c6dfd96b..d2105c5b04 100644
--- a/lib/pbench/agent/base.py
+++ b/lib/pbench/agent/base.py
@@ -8,6 +8,7 @@
import click
from pbench.agent import PbenchAgentConfig
+from pbench.agent.tool_group import ToolGroup, BadToolGroup
from pbench.agent.utils import setup_logging
@@ -108,15 +109,16 @@ def get_path(self, path):
def verify_tool_group(self, group):
"""Ensure we have a tools group directory to work with"""
- self.tool_group_dir = self.pbench_run / f"tools-v1-{group}"
- if not self.tool_group_dir.exists():
- click.secho(
- f'\t{self.name}: invalid --group option ("{group}"), directory not found: {self.tool_group_dir}'
- )
+ try:
+ self.tool_group_dir = self.gen_tools_group_dir(group)
+ except BadToolGroup as exc:
+ click.echo(f'\t{self.name}: invalid --group option ("{group}"), {exc}')
ctxt = click.get_current_context()
click.echo(ctxt.get_help())
- return 1
- return 0
+ ret_code = 1
+ else:
+ ret_code = 0
+ return ret_code
def gen_tools_group_dir(self, group):
- return self.pbench_run / f"tools-v1-{group}"
+ return ToolGroup.verify_tool_group(group, pbench_run=self.pbench_run)
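
With this change, gen_tools_group_dir() both validates and resolves the tool group directory in one step: ToolGroup.verify_tool_group() either returns the resolved path or raises BadToolGroup. A brief sketch of the calling convention (the group name and pbench_run value are illustrative):

```python
from pbench.agent.tool_group import BadToolGroup, ToolGroup

try:
    tg_dir = ToolGroup.verify_tool_group(
        "default", pbench_run="/var/lib/pbench-agent"
    )
except BadToolGroup as exc:
    print(f"invalid tool group: {exc}")
else:
    print(f"tool group directory: {tg_dir}")
```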
diff --git a/lib/pbench/agent/constants.py b/lib/pbench/agent/constants.py
index a1fde65974..eea0e1eb63 100644
--- a/lib/pbench/agent/constants.py
+++ b/lib/pbench/agent/constants.py
@@ -2,10 +2,10 @@
"""
# Default Redis server port number used is "One Tool" in hex 0x17001
-redis_port = 17001
+def_redis_port = 17001
# Default port number used for the Tool Data Sink
-tds_port = 8080
+def_tds_port = 8080
# The amount of time a TM tries to publish its setup message.
TDS_RETRY_PERIOD_SECS = 60
diff --git a/lib/pbench/agent/redis.py b/lib/pbench/agent/redis.py
index 86e11a13c9..28388aca17 100644
--- a/lib/pbench/agent/redis.py
+++ b/lib/pbench/agent/redis.py
@@ -220,3 +220,41 @@ def emit(self, record):
self.dropped += 1
finally:
self.counter += 1
+
+
+def wait_for_conn_and_key(redis_server, key, prog, redis_host, redis_port):
+ """wait_for_conn_and_key - convenience method of both the Tool Meister and
+ the Tool Data Sink to startup and wait for an initial connection to the
+ Redis server, and for the expected key to show up.
+ """
+ # Loop waiting for the key to show up.
+ connected = None
+ payload = None
+ while payload is None:
+ try:
+ payload = redis_server.get(key)
+ except redis.ConnectionError:
+ if connected is None:
+ print(
+ f"{prog}: waiting to connect to redis server {redis_host}:{redis_port}",
+ flush=True,
+ )
+ connected = False
+ elif connected:
+ print(
+ f"{prog}: disconnected from redis server {redis_host}:{redis_port}",
+ flush=True,
+ )
+ connected = False
+ time.sleep(1)
+ else:
+ if not connected:
+ print(
+ f"{prog}: connected to redis server {redis_host}:{redis_port}",
+ flush=True,
+ )
+ connected = True
+ if payload is None:
+ print(f'{prog}: key, "{key}" does not exist yet', flush=True)
+ time.sleep(1)
+ return payload.decode("utf-8")
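
A short usage sketch of the helper above; the program name, key, and connection details are illustrative:

```python
import json

import redis

from pbench.agent.redis import wait_for_conn_and_key

redis_server = redis.Redis(host="localhost", port=17001, db=0)
# Blocks, printing progress, until the server is reachable and the key
# exists; returns the decoded payload string.
params_str = wait_for_conn_and_key(
    redis_server, "tds-default", "example-prog", "localhost", 17001
)
params = json.loads(params_str)
```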
diff --git a/lib/pbench/agent/tool_data_sink.py b/lib/pbench/agent/tool_data_sink.py
index f1d9f93970..09db6257bf 100644
--- a/lib/pbench/agent/tool_data_sink.py
+++ b/lib/pbench/agent/tool_data_sink.py
@@ -18,7 +18,6 @@
import subprocess
import sys
import tempfile
-import time
from configparser import ConfigParser, DuplicateSectionError
from datetime import datetime
@@ -29,14 +28,14 @@
from threading import Thread, Lock, Condition
from wsgiref.simple_server import WSGIRequestHandler, make_server
-import daemon
import pidfile
import redis
from bottle import Bottle, ServerAdapter, request, abort
+from daemon import DaemonContext
from pbench.agent.constants import (
- tds_port,
+ def_tds_port,
tm_allowed_actions,
tm_channel_suffix_from_client,
tm_channel_suffix_from_tms,
@@ -44,26 +43,27 @@
tm_channel_suffix_to_logging,
tm_channel_suffix_to_tms,
)
-from pbench.agent.redis import RedisChannelSubscriber
+from pbench.agent.redis import RedisChannelSubscriber, wait_for_conn_and_key
from pbench.agent.toolmetadata import ToolMetadata
from pbench.agent.utils import collect_local_info
+# Logging format string for unit tests
+fmtstr_ut = "%(levelname)s %(name)s %(funcName)s -- %(message)s"
+fmtstr = "%(asctime)s %(levelname)s %(process)s %(thread)s %(name)s %(funcName)s %(lineno)d -- %(message)s"
+
+
# Read in 64 KB chunks off the wire for HTTP PUT requests.
_BUFFER_SIZE = 65536
# Maximum size of the tar ball for collected tool data.
_MAX_TOOL_DATA_SIZE = 2 ** 30
-# Executable path of the tar and cp programs.
-tar_path = None
-cp_path = None
-
def _now(when):
- """_now - An ugly hack to facility testing without the ability to mock.
+ """_now - An ugly hack to facilitate testing without the ability to mock.
- Instead of directly calling `datatime.utcnow().isoformat()`, each call
+ Instead of directly calling `datetime.utcnow().isoformat()`, each call
site invokes this method with an argument only used during unit testing
to determine the expected behavior. This allows us to provide a "start"
time that is one microsecond less than the "end" time.
@@ -76,9 +76,9 @@ def _now(when):
class DataSinkWsgiServer(ServerAdapter):
- """DataSinkWsgiServer - an re-implementation of Bottle's WSGIRefServer
+ """DataSinkWsgiServer - a re-implementation of Bottle's WSGIRefServer
where we have access to the underlying WSGIServer instance in order to
- invoke it's stop() method, and we also provide an WSGIReqeustHandler with
+    invoke its stop() method, and we also provide a WSGIRequestHandler with
an opinionated logging implementation.
"""
@@ -124,25 +124,78 @@ def log_request(self, code="-", size="-"):
self.options["handler_class"] = DataSinkWsgiRequestHandler
self._server = None
+ self._err_code = None
+ self._err_text = None
self._lock = Lock()
self._cv = Condition(lock=self._lock)
self._logger = logger
- def run(self, app):
- assert self._server is None, "'run' method called twice"
- self._logger.debug("Making tool data sink WSGI server ...")
- server = make_server(self.host, self.port, app, **self.options)
+ def _do_notify(self, text=None, code=0, server=None):
+ """_do_notify - simple helper method to encapsulate method of notification.
+ """
with self._lock:
+ self._err_text = text
+ self._err_code = code
self._server = server
self._cv.notify()
- self._logger.debug("Running tool data sink WSGI server ...")
- self._server.serve_forever()
- def stop(self):
+ def run(self, app):
+ """run - Start the WSGI server, called by the Bottle framework.
+
+ Intended to be run as a separate thread.
+
+ We record the outcome of the `make_server` call for success or failure
+ and notify anybody waiting for this thread to succeed.
+ """
+ assert self._server is None, "'run' method called twice"
+ self._logger.debug("Making tool data sink WSGI server ...")
+ try:
+ server = make_server(self.host, self.port, app, **self.options)
+ except OSError as exc:
+ assert exc.errno != 0, "Logic bomb! OSError exception with no errno value"
+ self._do_notify(str(exc), exc.errno)
+ raise
+ except Exception as exc:
+ self._logger.exception("Unexpected error in WSGI server")
+ self._do_notify(str(exc), -1)
+ raise
+ else:
+ self._logger.debug("Successfully created WSGI server")
+ self._do_notify(server=server)
+ self._logger.debug("Running tool data sink WSGI server ...")
+ server.serve_forever()
+
+ def wait(self):
+ """ wait - wait for the WSGI thread executing the `run` method to start
+ running and successfully create a WSGI server object, or fail trying.
+
+        Returns a tuple of the error text and the error code set by the run()
+        method attempting to create the WSGI server. The error code will be
+        0 on success, an errno value, or -1 if an unexpected exception was
+        raised.
+ """
with self._lock:
- while self._server is None:
+ while self._err_code is None:
self._cv.wait()
- self._server.shutdown()
+ return self._err_text, self._err_code
+
+ def stop(self):
+ """ stop - stop the running WSGI server via the shutdown() method of
+ the WSGI server object.
+ """
+ # We have to wait for the thread to start the server and fill in the
+ # server object first.
+ self.wait()
+ if self._err_code == 0:
+ self._server.shutdown()
+
+
+class ToolDataSinkError(Exception):
+ """ToolDataSinkError - generic exception class for Tool Data Sink related
+ exceptions.
+ """
+
+ pass
class BaseCollector:
@@ -159,20 +212,25 @@ def __init__(
tool_group,
host_tools_dict,
tool_metadata,
+ tar_path,
logger,
):
"""Constructor - responsible for recording the arguments, and creating
the Environment() for template rendering.
"""
self.templates_path = pbench_bin / "templates"
+ assert (
+ self.templates_path.is_dir()
+ ), f"Logic bomb! {self.templates_path} does not exist as a directory"
self.benchmark_run_dir = benchmark_run_dir
self.tool_group = tool_group
self.host_tools_dict = host_tools_dict
self.tool_metadata = tool_metadata
+ self.tar_path = tar_path
self.logger = logger
self.run = []
- self.tool_group_dir = self.benchmark_run_dir / f"tools-{self.tool_group}"
+ self.tool_group_dir = self.benchmark_run_dir.local / f"tools-{self.tool_group}"
self.tool_dir = self.tool_group_dir / self.name
self.template_dir = Environment(
autoescape=False,
@@ -253,7 +311,7 @@ def terminate(self):
if sts != 0:
self.logger.warning("Collector process terminated with %d", sts)
if errors > 0:
- raise Exception("Failed to terminate all the collector processes")
+ raise ToolDataSinkError("Failed to terminate all the collector processes")
class PromCollector(BaseCollector):
@@ -268,7 +326,7 @@ def __init__(self, *args, **kwargs):
"""
self.prometheus_path = find_executable("prometheus")
if self.prometheus_path is None:
- raise Exception("External 'prometheus' executable not found")
+ raise ToolDataSinkError("External 'prometheus' executable not found")
super().__init__(*args, **kwargs)
self.tool_context = []
@@ -279,7 +337,9 @@ def __init__(self, *args, **kwargs):
dict(hostname=f"{host}_{tool}", hostport=f"{host}:{port}")
)
if not self.tool_context:
- raise Exception("Expected prometheus persistent tool context not found")
+ raise ToolDataSinkError(
+ "Expected prometheus persistent tool context not found"
+ )
def launch(self):
"""launch - creates the YAML file that directs Prometheus's behavior,
@@ -329,7 +389,7 @@ def terminate(self):
self.logger.debug("Prometheus terminated")
args = [
- tar_path,
+ self.tar_path,
"--remove-files",
"-Jcf",
f"{self.tool_group_dir}/prometheus_data.tar.xz",
@@ -537,7 +597,7 @@ def terminate(self):
self.logger.debug("Pmproxy and pmlogger(s) terminated")
args = [
- tar_path,
+ self.tar_path,
"--remove-files",
"-Jcf",
f"{self.tool_group_dir}/pcp_data.tar.xz",
@@ -550,6 +610,125 @@ def terminate(self):
self.logger.warning("Failed to tar up pmlogger data: %r", args)
+class BenchmarkRunDir:
+ """BenchmarkRunDir - helper class for handling the benchmark_run_dir
+ directory Redis parameter vs the actual "local" benchmark run directory.
+
+ It is a requirement of the Tool Meister sub-system that the ${pbench_run}
+ directory is always a prefix of the ${benchmark_run_dir}.
+
+ When the pbench CLI starts the Tool Data Sink directly, the local
+ benchmark run directory is the same as the value of the benchmark_run_dir
+ parameter.
+
+ But when the Tool Data Sink runs in a container, the path to the benchmark
+ run directory inside the container might be different from the parameter
+ value because the mount point for the external file system has a different
+ path inside the container. Typically, the container is constructed with
+ the default pbench installation, where the ${pbench_run} directory is
+ "/var/lib/pbench-agent".
+
+ The entity responsible for starting the Tool Data Sink container typically
+ mounts a different directory for /var/lib/pbench-agent via 'podman run
+ --volume /srv/data/pbench-run-dir:/var/lib/pbench-agent:Z'. This leads to
+ a conflict where the external ${pbench_run} path is different from the
+ internal-to-the-container ${pbench_run} path. To resolve this, the entity
+ which creates the external pbench run directory creates a ".path" file in
+ that directory containing the full "external" path to the pbench run
+    directory. The Tool Data Sink uses that recorded path to validate
+    external benchmark_run_dir parameter values.
+
+ This class implements the mechanism that allows the Tool Data Sink code to
+ handle that seamlessly.
+ """
+
+ class Exists(Exception):
+ pass
+
+ class Prefix(Exception):
+ pass
+
+ def __init__(self, ext_benchmark_run_dir, int_pbench_run):
+ self._ext_benchmark_run_dir = Path(ext_benchmark_run_dir)
+ self._ext_pbench_run = self._ext_benchmark_run_dir.parent
+ self._int_pbench_run = Path(int_pbench_run)
+
+        # The Tool Data Sink could be running in a container. If so, it
+        # will be using the default pbench run directory. If the
+        # benchmark_run_dir parameter is valid, there will be a file
+        # called ".path" in the default pbench run directory whose
+        # contents match the external pbench run directory.
+ #
+ # E.g.:
+        # $ pbench_run="/home/<user>/run-dir"
+        # $ benchmark_run_dir="${pbench_run}/script_config_<date>"
+        # $ cat ${pbench_run}/.path
+        # /home/<user>/run-dir
+        # $ podman run --volume ${pbench_run}:/var/lib/pbench-agent \
+        # pbench-agent-tool-data-sink bash
+        # [ abcdefg /]$ cat /var/lib/pbench-agent/.path
+        # /home/<user>/run-dir
+ try:
+ benchmark_run_dir_lcl = self._ext_benchmark_run_dir.resolve(strict=True)
+ except Exception:
+ # Might be in a container; let's first construct the
+ # internal-to-the-container benchmark run directory.
+ benchmark_run_dir_lcl = (
+ self._int_pbench_run / self._ext_benchmark_run_dir.name
+ )
+ dot_path = self._int_pbench_run / ".path"
+ try:
+ dot_path_contents = dot_path.read_text().strip()
+ except Exception as exc:
+ # Failed to read ".path" contents, give up.
+ raise ToolDataSinkError(
+ f"Run directory parameter, '{ext_benchmark_run_dir}', must"
+ f" be an existing directory ('{self._ext_pbench_run}/"
+ f".path' not found, '{exc}').",
+ )
+ else:
+ if dot_path_contents != str(self._ext_pbench_run):
+ raise ToolDataSinkError(
+ f"Run directory parameter, '{ext_benchmark_run_dir}',"
+ " must be an existing directory (.path contents"
+ f" mismatch, .path='{dot_path_contents}' !="
+ f" '{self._ext_pbench_run}').",
+ )
+ else:
+ # We can access the benchmark_run_dir directly, no need to
+ # consider contents of ".path" file.
+ pass
+ if not benchmark_run_dir_lcl.is_dir():
+ raise ToolDataSinkError(
+ f"Run directory parameter, '{ext_benchmark_run_dir}', must be"
+ " a real directory.",
+ )
+ self.local = benchmark_run_dir_lcl
+
+ def __str__(self):
+ """__str__ - the string representation of a BenchmarkRunDir object is
+ the original external benchmark run directory string.
+ """
+ return str(self._ext_benchmark_run_dir)
+
+ def validate(self, directory):
+ """validate - check that an external directory has a prefix of the external
+ benchmark run directory.
+ """
+ directory_p = Path(directory)
+ try:
+ # Check that "directory" has a prefix of
+ rel_path = directory_p.relative_to(self._ext_benchmark_run_dir)
+ except ValueError:
+ raise self.Prefix()
+ local_dir = self.local / rel_path
+ if not local_dir.is_dir():
+ # The internal benchmark run directory does not have the same
+ # sub-directory hierarchy.
+ raise self.Exists()
+ return local_dir
+
+
class ToolDataSink(Bottle):
"""ToolDataSink - sub-class of Bottle representing state for tracking data
sent from tool meisters via an HTTP PUT method.
@@ -558,20 +737,41 @@ class ToolDataSink(Bottle):
# The list of actions where we expect Tool Meisters to send data to us.
_data_actions = frozenset(("send", "sysinfo"))
+ @staticmethod
+ def fetch_params(params, pbench_run):
+ try:
+ _benchmark_run_dir = params["benchmark_run_dir"]
+ bind_hostname = params["bind_hostname"]
+ channel_prefix = params["channel_prefix"]
+ tool_group = params["group"]
+ tool_metadata = ToolMetadata.tool_md_from_dict(params["tool_metadata"])
+ tool_trigger = params["tool_trigger"]
+ tools = params["tools"]
+ except KeyError as exc:
+ raise ToolDataSinkError(f"Invalid parameter block, missing key {exc}")
+ else:
+ benchmark_run_dir = BenchmarkRunDir(_benchmark_run_dir, pbench_run)
+ return (
+ benchmark_run_dir,
+ bind_hostname,
+ channel_prefix,
+ tool_group,
+ tool_metadata,
+ tool_trigger,
+ tools,
+ )
+
def __init__(
self,
pbench_bin,
+ pbench_run,
hostname,
- bind_hostname,
+ tar_path,
+ cp_path,
redis_server,
redis_host,
redis_port,
- channel_prefix,
- benchmark_run_dir,
- tool_group,
- tool_trigger,
- tools,
- tool_metadata,
+ params,
optional_md,
logger,
):
@@ -583,16 +783,21 @@ def __init__(
# Save external state
self.pbench_bin = pbench_bin
self.hostname = hostname
- self.bind_hostname = bind_hostname
+ self.tar_path = tar_path
+ self.cp_path = cp_path
self.redis_server = redis_server
self.redis_host = redis_host
self.redis_port = redis_port
- self.channel_prefix = channel_prefix
- self.benchmark_run_dir = benchmark_run_dir
- self.tool_group = tool_group
- self.tool_trigger = tool_trigger
- self.tools = tools
- self.tool_metadata = tool_metadata
+ ret_val = self.fetch_params(params, pbench_run)
+ (
+ self.benchmark_run_dir,
+ self.bind_hostname,
+ self.channel_prefix,
+ self.tool_group,
+ self.tool_metadata,
+ self.tool_trigger,
+ self.tools,
+ ) = ret_val
self.optional_md = optional_md
self.logger = logger
# Initialize internal state
@@ -615,6 +820,8 @@ def __init__(
self._lock = Lock()
self._cv = Condition(lock=self._lock)
self.web_server_thread = None
+ self._tm_log_capture_thread_cv = Condition(lock=self._lock)
+ self._tm_log_capture_thread_state = None
self.tm_log_capture_thread = None
def __enter__(self):
@@ -630,13 +837,19 @@ def __enter__(self):
callback=self.put_document,
)
self._server = DataSinkWsgiServer(
- host=self.bind_hostname, port=tds_port, logger=self.logger
+ host=self.bind_hostname, port=def_tds_port, logger=self.logger
)
self.web_server_thread = Thread(target=self.web_server_run)
self.web_server_thread.start()
- # FIXME - ugly hack for consistent unit tests; why not just use a
- # condition variable?
- time.sleep(0.1)
+ err_text, err_code = self._server.wait()
+ if err_code > 0:
+ # Pass along the OSError with its errno, let's us handle cleanly
+ # EADDRINUSE errors.
+ raise OSError(err_code, err_text)
+ elif err_code < 0:
+ # All other errors encountered by the WSGI thread are already
+ # logged.
+ raise ToolDataSinkError(f"Failure to create WSGI server - {err_text!r}")
self.logger.debug("web server 'run' thread started, processing payloads ...")
# Setup the two Redis channels to which the Tool Data Sink subscribes.
@@ -654,10 +867,18 @@ def __enter__(self):
self.tm_log_capture_thread = Thread(target=self.tm_log_capture)
self.tm_log_capture_thread.start()
- # FIXME - ugly hack for consistent unit tests; why not just use a
- # condition variable?
- time.sleep(0.1)
- self.logger.debug("'tm_log_capture' thread started, processing logs ...")
+ with self._lock:
+ while self._tm_log_capture_thread_state is None:
+ self._tm_log_capture_thread_cv.wait()
+ if self._tm_log_capture_thread_state != "started":
+ self.logger.warning(
+ "'tm_log_capture' thread failed to start, not processing Tool"
+ " Meister logs ..."
+ )
+ else:
+ self.logger.debug(
+ "'tm_log_capture' thread started, processing Tool Meister logs ..."
+ )
# The ToolDataSink object itself is the object of the context manager.
return self
@@ -709,20 +930,18 @@ def tm_log_capture(self):
# logs from remote Tool Meisters.
logger = logging.getLogger("tm_log_capture_thread")
logger.setLevel(logging.WARNING)
- tm_log_file = self.benchmark_run_dir / "tm" / "tm.logs"
+ tm_log_file = self.benchmark_run_dir.local / "tm" / "tm.logs"
with tm_log_file.open("w") as fp:
try:
+ with self._lock:
+ self._tm_log_capture_thread_state = "started"
+ self._tm_log_capture_thread_cv.notify()
for log_msg in self._to_logging_chan.fetch_message(logger):
fp.write(f"{log_msg}\n")
fp.flush()
except redis.ConnectionError:
# We don't bother reporting any connection errors.
pass
- except ValueError as exc:
- # FIXME - Why do we need to do this?
- if exc.args[0] == "I/O operation on closed file.":
- pass
- raise
except Exception:
self.logger.exception("Failed to capture logs from Redis server")
@@ -796,9 +1015,9 @@ def record_tms(self, tms):
"""record_tms - record the Tool Meister data and metadata returned from
the startup acknowledgement messages collected in "tms".
- The first thing we have to do is setup self._tm_tracking properly,
- adding which tools are no-ops, transient, and persistent, and properly
- record the initial "posted" state.
+ The first thing we have to do is to determine which tools are no-ops,
+ transient, and persistent, and properly record the initial "posted"
+ state.
The second thing we do is record all the data and metadata about the
Tool Meisters in the ${benchmark_run_dir}/metadata.log file.
@@ -855,7 +1074,7 @@ def record_tms(self, tms):
home = os.environ.get("HOME", "")
if home:
src = str(Path(home) / ".ssh" / "config")
- dst = str(self.benchmark_run_dir / "ssh.config")
+ dst = str(self.benchmark_run_dir.local / "ssh.config")
try:
shutil.copyfile(src, dst)
except FileNotFoundError:
@@ -865,7 +1084,7 @@ def record_tms(self, tms):
# cp -L /etc/ssh/ssh_config ${dir}/ > /dev/null 2>&1
etc_ssh = Path("/etc") / "ssh"
src = str(etc_ssh / "ssh_config")
- dst = str(self.benchmark_run_dir / "ssh_config")
+ dst = str(self.benchmark_run_dir.local / "ssh_config")
try:
shutil.copyfile(src, dst)
except FileNotFoundError:
@@ -898,13 +1117,18 @@ def record_tms(self, tms):
#
# cp -rL /etc/ssh/ssh_config.d ${dir}/ > /dev/null 2>&1
subprocess.run(
- [cp_path, "-rL", "/etc/ssh/ssh_config.d", f"{self.benchmark_run_dir}/"],
+ [
+ self.cp_path,
+ "-rL",
+ "/etc/ssh/ssh_config.d",
+ f"{self.benchmark_run_dir.local}/",
+ ],
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
- mdlog_name = self.benchmark_run_dir / "metadata.log"
+ mdlog_name = self.benchmark_run_dir.local / "metadata.log"
mdlog = ConfigParser()
try:
with mdlog_name.open("r") as fp:
@@ -923,7 +1147,7 @@ def record_tms(self, tms):
# Users have a funny way of adding '%' characters to the run
# directory, so we have to be sure we handle "%" characters in the
# directory name metadata properly.
- mdlog.set(section, "name", self.benchmark_run_dir.name.replace("%", "%%"))
+ mdlog.set(section, "name", self.benchmark_run_dir.local.name.replace("%", "%%"))
version, seqno, sha1, hostdata = collect_local_info(self.pbench_bin)
rpm_version = f"v{version}-{seqno}g{sha1}"
mdlog.set(section, "rpm-version", rpm_version)
@@ -1050,7 +1274,9 @@ def execute(self):
self._to_client_channel, json.dumps(started_msg, sort_keys=True)
)
if num_present == 0:
- raise Exception("Tool Data Sink started by nobody is listening")
+ raise ToolDataSinkError(
+ "Tool Data Sink started, but nobody is listening"
+ )
self.logger.debug("published %s", self._to_client_channel)
for data in self._from_client_chan.fetch_json(self.logger):
@@ -1214,7 +1440,7 @@ def execute_action(self, action, directory_str, args, data):
# the caller wants to report that it is stopping all the Tool
# Meisters due to an interruption (SIGINT or otherwise).
#
- mdlog_name = self.benchmark_run_dir / "metadata.log"
+ mdlog_name = self.benchmark_run_dir.local / "metadata.log"
mdlog = ConfigParser()
try:
with (mdlog_name).open("r") as fp:
@@ -1233,7 +1459,7 @@ def execute_action(self, action, directory_str, args, data):
if args["interrupt"]:
# args["interrupt"] == True ==> run / run_interrupted
mdlog.set(section, "run_interrupted", "true")
- iterations = self.benchmark_run_dir / ".iterations"
+ iterations = self.benchmark_run_dir.local / ".iterations"
try:
iterations_val = iterations.read_text()
except FileNotFoundError:
@@ -1258,26 +1484,27 @@ def execute_action(self, action, directory_str, args, data):
self._to_logging_chan.unsubscribe()
return
- directory = Path(directory_str)
- if not directory.is_dir():
+ try:
+ local_dir = self.benchmark_run_dir.validate(directory_str)
+ except self.benchmark_run_dir.Prefix:
self.logger.error(
- "action '%s' with non-existent directory, '%s'", action, directory,
+ "action '%s' with invalid directory, '%s' (not a sub-directory of '%s')",
+ action,
+ directory_str,
+ self.benchmark_run_dir,
)
- self._send_client_status(action, "invalid directory")
+ self._send_client_status(action, "directory not a sub-dir of run directory")
return
- try:
- # Check that "directory" has a prefix of self.benchmark_run_dir
- directory.relative_to(self.benchmark_run_dir)
- except ValueError:
+ except self.benchmark_run_dir.Exists:
self.logger.error(
- "action '%s' with invalid directory,"
- " '%s' (not a sub-directory of '%s')",
+ "action '%s' with invalid directory, '%s' (does not exist)",
action,
- directory,
- self.benchmark_run_dir,
+ directory_str,
)
- self._send_client_status(action, "directory not a prefix of run directory")
+ self._send_client_status(action, "directory does not exist")
return
+ else:
+ assert local_dir is not None, f"Logic bomb! local_dir = {local_dir!r}"
with self._lock:
# Handle all actions underneath the lock for consistency.
@@ -1326,6 +1553,7 @@ def execute_action(self, action, directory_str, args, data):
self.tool_group,
prom_tool_dict,
self.tool_metadata,
+ self.tar_path,
logger=self.logger,
)
self._prom_server.launch()
@@ -1340,6 +1568,7 @@ def execute_action(self, action, directory_str, args, data):
self.tool_group,
pcp_tool_dict,
self.tool_metadata,
+ self.tar_path,
redis_host=self.redis_host,
redis_port=self.redis_port,
logger=self.logger,
@@ -1364,7 +1593,7 @@ def execute_action(self, action, directory_str, args, data):
# the URL for the PUT method.
directory_bytes = directory_str.encode("utf-8")
self.data_ctx = hashlib.md5(directory_bytes).hexdigest()
- self.directory = Path(directory_str)
+ self.directory = local_dir
# Forward to TMs
ret_val = self._forward_tms(data)
@@ -1567,7 +1796,7 @@ def put_document(self, data_ctx, hostname):
# Invoke tar directly for efficiency.
with o_file.open("w") as ofp, e_file.open("w") as efp:
cp = subprocess.run(
- [tar_path, "-xf", host_data_tb_name],
+ [self.tar_path, "-xf", host_data_tb_name],
cwd=target_dir,
stdin=None,
stdout=ofp,
@@ -1609,71 +1838,224 @@ def put_document(self, data_ctx, hostname):
abort(500, "INTERNAL ERROR")
-def main(argv):
- _prog = Path(argv[0])
- PROG = _prog.name
- pbench_bin = _prog.parent.parent
+def get_logger(PROG, daemon=False):
+ """get_logger - construct a logger for a Tool Meister instance.
+ If in the Unit Test environment, just log to console.
+ If in non-unit test environment:
+ If daemonized, log to syslog and log back to Redis.
+ If not daemonized, log to console AND log back to Redis
+ """
logger = logging.getLogger(PROG)
- fh = logging.FileHandler(f"{PROG}.log")
- if os.environ.get("_PBENCH_UNIT_TESTS"):
- fmtstr = "%(levelname)s %(name)s %(funcName)s -- %(message)s"
- else:
- fmtstr = (
- "%(asctime)s %(levelname)s %(process)s %(thread)s"
- " %(name)s %(funcName)s %(lineno)d -- %(message)s"
- )
- fhf = logging.Formatter(fmtstr)
- fh.setFormatter(fhf)
if os.environ.get("_PBENCH_TOOL_DATA_SINK_LOG_LEVEL") == "debug":
log_level = logging.DEBUG
else:
log_level = logging.INFO
- fh.setLevel(log_level)
- logger.addHandler(fh)
logger.setLevel(log_level)
+ unit_tests = bool(os.environ.get("_PBENCH_UNIT_TESTS"))
+ if unit_tests or not daemon:
+ sh = logging.StreamHandler()
+ else:
+ sh = logging.FileHandler(f"{PROG}.log")
+ sh.setLevel(log_level)
+ shf = logging.Formatter(fmtstr_ut if unit_tests else fmtstr)
+ sh.setFormatter(shf)
+ logger.addHandler(sh)
+
+ return logger
+
+
+def driver(
+ PROG,
+ redis_server,
+ redis_host,
+ redis_port,
+ pbench_bin,
+ pbench_run,
+ hostname,
+ tar_path,
+ cp_path,
+ param_key,
+ params,
+ optional_md,
+ logger=None,
+):
+ if logger is None:
+ logger = get_logger(PROG)
+
+ logger.debug("params_key (%s): %r", param_key, params)
+
+ try:
+ with ToolDataSink(
+ pbench_bin,
+ pbench_run,
+ hostname,
+ tar_path,
+ cp_path,
+ redis_server,
+ redis_host,
+ redis_port,
+ params,
+ optional_md,
+ logger,
+ ) as tds_app:
+ tds_app.execute()
+ except OSError as exc:
+ if exc.errno == errno.EADDRINUSE:
+ logger.error(
+ "ERROR - tool data sink failed to start, %s:%s already in use",
+ params["bind_hostname"],
+ def_tds_port,
+ )
+ ret_val = 8
+ else:
+ logger.exception("ERROR - failed to start the tool data sink")
+ ret_val = 9
+ except Exception:
+ logger.exception("ERROR - failed to start the tool data sink")
+ ret_val = 10
+ else:
+ ret_val = 0
+ return ret_val
+
+
+def daemon(
+ PROG,
+ redis_server,
+ redis_host,
+ redis_port,
+ pbench_bin,
+ pbench_run,
+ hostname,
+ tar_path,
+ cp_path,
+ param_key,
+ params,
+ optional_md,
+):
+ # Disconnect any existing connections to the Redis server.
+ redis_server.connection_pool.disconnect()
+ del redis_server
+
+ # Before we daemonize, flush any data written to stdout or stderr.
+ sys.stderr.flush()
+ sys.stdout.flush()
+
+ pidfile_name = f"{PROG}.pid"
+ pfctx = pidfile.PIDFile(pidfile_name)
+ with open(f"{PROG}.out", "w") as sofp, open(
+ f"{PROG}.err", "w"
+ ) as sefp, DaemonContext(
+ stdout=sofp,
+ stderr=sefp,
+ working_directory=os.getcwd(),
+ umask=0o022,
+ pidfile=pfctx,
+ ):
+ logger = get_logger(PROG, daemon=True)
+
+ # We have to re-open the connection to the redis server now that we
+ # are "daemonized".
+ logger.debug("re-constructing Redis server object")
+ try:
+ redis_server = redis.Redis(host=redis_host, port=redis_port, db=0)
+ except Exception as e:
+ logger.error(
+ "Unable to construct Redis server object, %s:%s: %s",
+ redis_host,
+ redis_port,
+ e,
+ )
+ return 7
+ else:
+ logger.debug("reconstructed Redis server object")
+ return driver(
+ PROG,
+ redis_server,
+ redis_host,
+ redis_port,
+ pbench_bin,
+ pbench_run,
+ hostname,
+ tar_path,
+ cp_path,
+ param_key,
+ params,
+ optional_md,
+ logger=logger,
+ )
+
+
+def main(argv):
+ _prog = Path(argv[0])
+ PROG = _prog.name
+ # The Tool Data Sink executable is in:
+ # ${pbench_bin}/util-scripts/tool-meister/pbench-tool-data-sink
+ # So .parent at each level is:
+ # _prog ${pbench_bin}/util-scripts/tool-meister/pbench-tool-data-sink
+ # .parent ${pbench_bin}/util-scripts/tool-meister
+ # .parent ${pbench_bin}/util-scripts
+ # .parent ${pbench_bin}
+ pbench_bin = _prog.parent.parent.parent
+
try:
redis_host = argv[1]
redis_port = argv[2]
param_key = argv[3]
except IndexError as e:
- logger.error("Invalid arguments: %s", e)
+ print(f"{PROG}: Invalid arguments: {e}", file=sys.stderr)
return 1
+ else:
+ if not redis_host or not redis_port or not param_key:
+ print(f"{PROG}: Invalid arguments: {argv!r}", file=sys.stderr)
+ return 1
+ try:
+ daemonize = argv[4]
+ except IndexError:
+ daemonize = "no"
- global tar_path
tar_path = find_executable("tar")
if tar_path is None:
- logger.error("External 'tar' executable not found")
+ print("External 'tar' executable not found", file=sys.stderr)
return 2
- global cp_path
cp_path = find_executable("cp")
if cp_path is None:
- logger.error("External 'cp' executable not found")
+ print("External 'cp' executable not found", file=sys.stderr)
return 2
+ try:
+ pbench_run = os.environ["pbench_run"]
+ except KeyError:
+ print(
+ "Unable to fetch pbench_run environment variable", file=sys.stderr,
+ )
+ return 3
+
try:
redis_server = redis.Redis(host=redis_host, port=redis_port, db=0)
except Exception as e:
- logger.error(
- "Unable to connect to redis server, %s:%s: %s", redis_host, redis_port, e
+ print(
+ f"Unable to connect to redis server, {redis_host}:{redis_port}: {e}",
+ file=sys.stderr,
)
return 4
try:
hostname = os.environ["_pbench_full_hostname"]
except KeyError:
- logger.error("Unable to fetch _pbench_full_hostname environment variable")
- return 4
+ print(
+ "Unable to fetch _pbench_full_hostname environment variable",
+ file=sys.stderr,
+ )
+ return 5
try:
- params_raw = redis_server.get(param_key)
- if params_raw is None:
- logger.error('Parameter key, "%s" does not exist.', param_key)
- return 5
- logger.debug("params_key (%s): %r", param_key, params_raw)
- params_str = params_raw.decode("utf-8")
+ # Wait for the parameter key value to show up.
+ params_str = wait_for_conn_and_key(
+ redis_server, param_key, PROG, redis_host, redis_port
+ )
# The expected parameters for this "data-sink" is what "channel" to
# subscribe to for the tool meister operational life-cycle. The
# data-sink listens for the actions, sysinfo | init | start | stop |
@@ -1683,89 +2065,29 @@ def main(argv):
# E.g. params = '{ "channel_prefix": "some-prefix",
# "benchmark_run_dir": "/loo/goo" }'
params = json.loads(params_str)
- channel_prefix = params["channel_prefix"]
- benchmark_run_dir = Path(params["benchmark_run_dir"]).resolve(strict=True)
- bind_hostname = params["bind_hostname"]
- tool_group = params["group"]
- tool_trigger = params["tool_trigger"]
- tools = params["tools"]
- tool_metadata = ToolMetadata.tool_md_from_dict(params["tool_metadata"])
+        # Validate the parameter block before we potentially daemonize.
+        ToolDataSink.fetch_params(params, pbench_run)
except Exception as ex:
- logger.error("Unable to fetch and decode parameter key, %s: %s", param_key, ex)
+ print(
+ f"Unable to fetch and decode parameter key, {param_key}: {ex}",
+ file=sys.stderr,
+ )
return 6
- else:
- if not benchmark_run_dir.is_dir():
- logger.error(
- "Run directory argument, %s, must be a real directory.",
- benchmark_run_dir,
- )
- return 7
- logger.debug("Tool Data Sink parameters check out, daemonizing ...")
- redis_server.connection_pool.disconnect()
- redis_server = None
optional_md = params["optional_md"]
- # Before we daemonize, flush any data written to stdout or stderr.
- sys.stderr.flush()
- sys.stdout.flush()
-
- pidfile_name = f"{PROG}.pid"
- pfctx = pidfile.PIDFile(pidfile_name)
- with open(f"{PROG}.out", "w") as sofp, open(
- f"{PROG}.err", "w"
- ) as sefp, daemon.DaemonContext(
- stdout=sofp,
- stderr=sefp,
- working_directory=os.getcwd(),
- umask=0o022,
- pidfile=pfctx,
- files_preserve=[fh.stream.fileno()],
- ):
- try:
- # We have to re-open the connection to the redis server now that we
- # are "daemonized".
- logger.debug("constructing Redis() object")
- try:
- redis_server = redis.Redis(host=redis_host, port=redis_port, db=0)
- except Exception as e:
- logger.error(
- "Unable to connect to redis server, %s:%s: %s",
- redis_host,
- redis_port,
- e,
- )
- return 8
- else:
- logger.debug("constructed Redis() object")
-
- with ToolDataSink(
- pbench_bin,
- hostname,
- bind_hostname,
- redis_server,
- redis_host,
- redis_port,
- channel_prefix,
- benchmark_run_dir,
- tool_group,
- tool_trigger,
- tools,
- tool_metadata,
- optional_md,
- logger,
- ) as tds_app:
- tds_app.execute()
- except OSError as exc:
- if exc.errno == errno.EADDRINUSE:
- logger.error(
- "ERROR - tool data sink failed to start, %s:%s already in use",
- bind_hostname,
- tds_port,
- )
- else:
- logger.exception("ERROR - failed to start the tool data sink")
- except Exception:
- logger.exception("ERROR - failed to start the tool data sink")
-
- return 0
+ func = daemon if daemonize == "yes" else driver
+ ret_val = func(
+ PROG,
+ redis_server,
+ redis_host,
+ redis_port,
+ pbench_bin,
+ pbench_run,
+ hostname,
+ tar_path,
+ cp_path,
+ param_key,
+ params,
+ optional_md,
+ )
+ return ret_val
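
The time.sleep() startup hacks removed above are replaced throughout with a condition-variable handshake: the worker thread reports its outcome exactly once, and the starter blocks until that report arrives. A standalone sketch of the pattern (`StartupGate` is an illustrative name):

```python
from threading import Condition, Lock, Thread


class StartupGate:
    def __init__(self):
        self._lock = Lock()
        self._cv = Condition(lock=self._lock)
        self._outcome = None

    def report(self, outcome):
        # Called once by the worker thread with its startup result.
        with self._lock:
            self._outcome = outcome
            self._cv.notify()

    def wait(self):
        # Block until the worker has reported, then return its result.
        with self._lock:
            while self._outcome is None:
                self._cv.wait()
            return self._outcome


gate = StartupGate()
worker = Thread(target=lambda: gate.report("started"))
worker.start()
assert gate.wait() == "started"
worker.join()
```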
diff --git a/lib/pbench/agent/tool_group.py b/lib/pbench/agent/tool_group.py
new file mode 100644
index 0000000000..4d2c5d7057
--- /dev/null
+++ b/lib/pbench/agent/tool_group.py
@@ -0,0 +1,157 @@
+import os
+import re
+
+from pathlib import Path
+
+
+class BadToolGroup(Exception):
+ """Exception representing a tool group that does not exist or is invalid.
+ """
+
+ pass
+
+
+class ToolGroup:
+ """Provides an in-memory representation of the registered tools as recorded
+ on-disk.
+ """
+
+ # Current tool group prefix in use.
+ TOOL_GROUP_PREFIX = "tools-v1"
+
+ @staticmethod
+ def verify_tool_group(group, pbench_run=None):
+ """verify_tool_group - given a tool group name, verify it exists in the
+ ${pbench_run} directory as a properly prefixed tool group directory
+ name.
+
+ Raises a BadToolGroup exception if the directory is invalid or does not
+ exist, or if the pbench_run argument is None and the environment
+ variable of the same name is missing.
+
+        Returns a pathlib.Path object of the tool group directory on success.
+ """
+ _pbench_run = os.environ.get("pbench_run") if pbench_run is None else pbench_run
+ if not _pbench_run:
+ raise BadToolGroup(
+ "Cannot validate tool group, '{group}', 'pbench_run'"
+ " environment variable missing"
+ )
+
+ tg_dir_name = Path(_pbench_run, f"{ToolGroup.TOOL_GROUP_PREFIX}-{group}")
+ try:
+ tg_dir = tg_dir_name.resolve(strict=True)
+ except FileNotFoundError:
+ raise BadToolGroup(
+ f"Bad tool group, '{group}': directory {tg_dir_name} does not exist"
+ )
+ except Exception as exc:
+ raise BadToolGroup(
+ f"Bad tool group, '{group}': error resolving {tg_dir_name} directory"
+ ) from exc
+ else:
+ if not tg_dir.is_dir():
+ raise BadToolGroup(
+ f"Bad tool group, '{group}': directory {tg_dir_name} not valid"
+ )
+ else:
+ return tg_dir
+
+ def __init__(self, group):
+ """Construct a ToolGroup object from the on-disk data of the given
+ tool group.
+
+ If the given tool group is valid, the contents are read into the three
+ dictionary structures:
+
+ "toolnames" - each tool name is the key, with separate dictionaries
+ for each registered host
+
+ "hostnames" - each registered host is the key, with separate
+ dictionaries for each tool registered on that host
+
+ "labels" - each registered host name, that has a label, is the key,
+ and the label is the value; if a host is not labeled, it does not
+ show up in this dictionary
+
+ Raises BadToolGroup via the verify_tool_group() method on error.
+ """
+ self.tg_dir = self.verify_tool_group(group)
+ self.group = group
+
+ # __trigger__
+ try:
+ _trigger = (self.tg_dir / "__trigger__").read_text()
+ except FileNotFoundError:
+ # Ignore missing trigger file
+ self.trigger = None
+ else:
+ if len(_trigger) == 0:
+ # Ignore empty trigger file contents
+ self.trigger = None
+ else:
+ self.trigger = _trigger
+
+ # toolnames - Dict with tool name as the key, dictionary with host
+ # names and parameters for each host
+ self.toolnames = {}
+ # hostnames - Dict with host name as the key, dictionary with tool
+ # names and parameters for each tool
+ self.hostnames = {}
+ self.labels = {}
+ for hdirent in os.listdir(self.tg_dir):
+ if hdirent == "__trigger__":
+ # Ignore handled above
+ continue
+ if not (self.tg_dir / hdirent).is_dir():
+ # Ignore wayward non-directory files
+ continue
+ # We assume this directory is a hostname.
+ host = hdirent
+ assert (
+ host not in self.hostnames
+ ), f"Logic bomb! {host} in {self.hostnames!r}"
+ self.hostnames[host] = {}
+ for tdirent in os.listdir(self.tg_dir / host):
+ if tdirent == "__label__":
+ self.labels[host] = (
+ (self.tg_dir / host / tdirent).read_text().strip()
+ )
+ continue
+ if tdirent.endswith("__noinstall__"):
+ # FIXME: ignore "noinstall" for now, tools are going to be
+ # in containers so this does not make sense going forward.
+ continue
+ # This directory entry is the name of a tool.
+ tool = tdirent
+ tool_opts_text = (self.tg_dir / host / tool).read_text().strip()
+ tool_opts = re.sub(r"\n\s*", " ", tool_opts_text)
+ if tool not in self.toolnames:
+ self.toolnames[tool] = {}
+ self.toolnames[tool][host] = tool_opts
+ assert (
+ tool not in self.hostnames[host]
+ ), f"Logic bomb! {tool} in {self.hostnames[host]!r}"
+ self.hostnames[host][tool] = tool_opts
+
+ def get_tools(self, host):
+ """get_tools - given a target host, return a dictionary with the list
+ of tool names as keys, and the values being their options for that
+ host.
+ """
+ tools = dict()
+ for tool, opts in self.toolnames.items():
+ try:
+ host_opts = opts[host]
+ except KeyError:
+ # This host does not have this tool registered, ignore.
+ pass
+ else:
+ tools[tool] = host_opts
+ return tools
+
+ def get_label(self, host):
+ """get_label - given a target host, return the label associated with
+ that host.
+ """
+ return self.labels.get(host, "")
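
A usage sketch of the new ToolGroup class; the "default" group name and the
pbench_run value below are illustrative assumptions, not part of the change:

    import os

    from pbench.agent.tool_group import BadToolGroup, ToolGroup

    # The constructor consults the "pbench_run" environment variable when
    # no explicit directory is given (assumed path for illustration).
    os.environ.setdefault("pbench_run", "/var/lib/pbench-agent")

    try:
        tg = ToolGroup("default")
    except BadToolGroup as exc:
        print(f"invalid tool group: {exc}")
    else:
        for host in tg.hostnames:
            label = tg.get_label(host)  # "" when the host is unlabeled
            for tool, opts in tg.get_tools(host).items():
                print(f"{host} ({label or 'no label'}): {tool} {opts}")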
diff --git a/lib/pbench/agent/tool_meister.py b/lib/pbench/agent/tool_meister.py
index 97e39c9320..19034f552b 100644
--- a/lib/pbench/agent/tool_meister.py
+++ b/lib/pbench/agent/tool_meister.py
@@ -65,7 +65,11 @@
tm_channel_suffix_to_logging,
TDS_RETRY_PERIOD_SECS,
)
-from pbench.agent.redis import RedisHandler, RedisChannelSubscriber
+from pbench.agent.redis import (
+ RedisHandler,
+ RedisChannelSubscriber,
+ wait_for_conn_and_key,
+)
from pbench.agent.toolmetadata import ToolMetadata
from pbench.agent.utils import collect_local_info
@@ -396,66 +400,20 @@ class DcgmTool(PersistentTool):
"""DcgmTool - provide specific persistent tool behaviors for the "dcgm"
tool.
- In particular, the dcgm tool requires the "--inst" option, requires the
- PYTHONPATH environment variable be set properly, and must use a python2
- environment.
+ The only particular behavior is that we find the proper "dcgm-exporter"
+ executable in our PATH.
"""
def __init__(self, name, tool_opts, logger=None, **kwargs):
super().__init__(name, tool_opts, logger=logger, **kwargs)
- # Looking for required "--inst" option, reformatting appropriately if
- # found.
- tool_opts_l = self.tool_opts.split(" ")
- for opt in tool_opts_l:
- if opt.startswith("--inst="):
- if opt[-1] == "\n":
- install_path = opt[7:-1]
- else:
- install_path = opt[7:]
- self.install_path = Path(install_path)
- self.logger.debug(
- "install path for tool %s, %s", name, self.install_path
- )
- break
- else:
- self.install_path = None
- self.logger.debug("missing install path")
- if self.install_path is None:
- self.script_path = None
- self.args = None
- self.env = None
- else:
- self.script_path = (
- self.install_path / "samples" / "scripts" / "dcgm_prometheus.py"
- )
- if not self.script_path.exists():
- self.logger.error("missing script path, %s", self.script_path)
- self.args = None
- self.env = None
- else:
- self.args = ["python2", f"{self.script_path}"]
- new_path_l = [
- str(self.install_path / "bindings"),
- str(self.install_path / "bindings" / "common"),
- ]
- unit_tests = bool(os.environ.get("_PBENCH_UNIT_TESTS"))
- prev_path = os.environ.get("PYTHONPATH", "")
- if prev_path and not unit_tests:
- new_path_l.append(prev_path)
- self.env = os.environ.copy()
- self.env["PYTHONPATH"] = ":".join(new_path_l)
+ executable = find_executable("dcgm-exporter")
+ self.args = None if executable is None else [executable]
def install(self):
- if self.install_path is None:
- return (1, "dcgm tool --inst argument missing")
- elif self.args is None:
- return (1, f"dcgm tool path, '{self.script_path}', not found")
+ if self.args is None:
+ return (1, "dcgm-exporter tool not found")
return (0, "dcgm tool properly installed")
- def start(self):
- # The dcgm tool needs PYTHONPATH, and run via the shell.
- super().start(env=self.env)
-
class NodeExporterTool(PersistentTool):
"""NodeExporterTool - provide specifics for running the "node-exporter"
@@ -468,10 +426,7 @@ class NodeExporterTool(PersistentTool):
def __init__(self, name, tool_opts, logger=None, **kwargs):
super().__init__(name, tool_opts, logger=logger, **kwargs)
executable = find_executable("node_exporter")
- if executable is None:
- self.args = None
- else:
- self.args = [executable]
+ self.args = None if executable is None else [executable]
def install(self):
if self.args is None:
@@ -779,7 +734,7 @@ def __enter__(self):
num_present = 0
if num_present == 0 and time.time() >= timeout:
raise Exception(
- "Unable to publish startup ack message, {started_msg!r}"
+ f"Unable to publish startup ack message, {started_msg!r}"
)
self.logger.debug("published %s", self._from_tms_channel)
return self
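
The one-character fix above is easy to miss in review: without the "f"
prefix the braces are emitted literally rather than interpolated. A quick
demonstration:

    started_msg = "tds-started"
    print("Unable to publish startup ack message, {started_msg!r}")
    # -> Unable to publish startup ack message, {started_msg!r}
    print(f"Unable to publish startup ack message, {started_msg!r}")
    # -> Unable to publish startup ack message, 'tds-started'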
@@ -999,13 +954,17 @@ def start_tools(self, data):
# Name of the temporary tool data directory to use when invoking
# tools. This is a local temporary directory when the Tool Meister is
- # remote from the pbench controller.
- if self._controller == self._hostname:
+ # remote from the pbench controller. When the Tool Meister is run in
+    # a container, the "directory" parameter will not map into its
+ # namespace, so we always consider containerized Tool Meisters as
+ # remote.
+ _dir = Path(data["directory"])
+ if self._controller == self._hostname and _dir.exists():
# This is the case when the Tool Meister instance is running on
# the same host as the controller. We just use the directory
# given to us in the `start` message.
try:
- _dir = Path(data["directory"]).resolve(strict=True)
+ _dir = _dir.resolve(strict=True)
except Exception:
self.logger.exception(
"Failed to access provided result directory, %s", data["directory"]
@@ -1705,7 +1664,7 @@ def daemon(
redis_server = redis.Redis(host=redis_host, port=redis_port, db=0)
except Exception as exc:
logger.error(
- "Unable to construct to Redis server object, %s:%s: %s",
+ "Unable to construct Redis server object, %s:%s: %s",
redis_host,
redis_port,
exc,
@@ -1755,10 +1714,17 @@ def main(argv):
except IndexError as e:
print(f"{PROG}: Invalid arguments: {e}", file=sys.stderr)
return 1
+ else:
+ if not redis_host or not redis_port or not param_key:
+ print(f"{PROG}: Invalid arguments: {argv!r}", file=sys.stderr)
+ return 1
try:
daemonize = argv[4]
except IndexError:
daemonize = "no"
+ else:
+ if not daemonize:
+ daemonize = "no"
tar_path = find_executable("tar")
if tar_path is None:
@@ -1824,13 +1790,10 @@ def main(argv):
return 5
try:
- params_raw = redis_server.get(param_key)
- if params_raw is None:
- print(
- f'{PROG}: Parameter key, "{param_key}" does not exist.', file=sys.stderr
- )
- return 6
- params_str = params_raw.decode("utf-8")
+ # Wait for the key to show up with a value.
+ params_str = wait_for_conn_and_key(
+ redis_server, param_key, PROG, redis_host, redis_port
+ )
params = json.loads(params_str)
# Validate the tool meister parameters without constructing an object
# just yet, as we want to make sure we can talk to the redis server
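
For reference, a rough sketch of what a wait-for-connection-and-key helper
can look like; the real wait_for_conn_and_key() lives in pbench.agent.redis
and may differ in its retry policy and error reporting:

    import time

    import redis

    def wait_for_key(redis_server, key, timeout=60, delay=1):
        # Poll until the Redis server is reachable and the key has a value.
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                raw = redis_server.get(key)
            except redis.ConnectionError:
                raw = None  # server not accepting connections yet; retry
            if raw is not None:
                return raw.decode("utf-8")
            time.sleep(delay)
        raise TimeoutError(f"key {key!r} never appeared")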
@@ -1841,30 +1804,20 @@ def main(argv):
f"{PROG}: Unable to fetch and decode parameter key, '{param_key}': {exc}",
file=sys.stderr,
)
- return 7
+ return 6
+ func_args = (
+ PROG,
+ tar_path,
+ sysinfo_dump,
+ pbench_install_dir,
+ tmp_dir,
+ param_key,
+ params,
+ redis_server,
+ )
if daemonize == "yes":
- ret_val = daemon(
- PROG,
- tar_path,
- sysinfo_dump,
- pbench_install_dir,
- tmp_dir,
- param_key,
- params,
- redis_server,
- redis_host,
- redis_port,
- )
+ ret_val = daemon(*func_args, redis_host, redis_port)
else:
- ret_val = driver(
- PROG,
- tar_path,
- sysinfo_dump,
- pbench_install_dir,
- tmp_dir,
- param_key,
- params,
- redis_server,
- )
+ ret_val = driver(*func_args)
return ret_val
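
Collecting the shared positional arguments into a single tuple keeps the two
call sites from drifting apart; only the daemon path appends the extra Redis
coordinates. The same pattern in miniature, with hypothetical functions:

    def driver(prog, tar_path, params):
        # Hypothetical stand-in for the foreground entry point.
        return 0

    def daemon(prog, tar_path, params, redis_host, redis_port):
        # Hypothetical stand-in for the daemonizing entry point.
        return driver(prog, tar_path, params)

    func_args = ("tool-meister", "/usr/bin/tar", {"key": "value"})
    daemonize = "yes"
    if daemonize == "yes":
        ret_val = daemon(*func_args, "localhost", 6379)
    else:
        ret_val = driver(*func_args)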
diff --git a/lib/pbench/agent/utils.py b/lib/pbench/agent/utils.py
index 995f196a7e..95bbbb1e37 100644
--- a/lib/pbench/agent/utils.py
+++ b/lib/pbench/agent/utils.py
@@ -4,7 +4,6 @@
import sys
from datetime import datetime
-from pathlib import Path
from pbench.agent.constants import (
sysinfo_opts_available,
@@ -189,40 +188,3 @@ def collect_local_info(pbench_bin):
hostdata[arg] = cp.stdout.strip() if cp.stdout is not None else ""
return (version, seqno, sha1, hostdata)
-
-
-class BadToolGroup(Exception):
- """Exception representing a tool group that does not exist or is invalid.
- """
-
- pass
-
-
-# Current tool group prefix in use.
-TOOL_GROUP_PREFIX = "tools-v1"
-
-
-def verify_tool_group(group, pbench_run=None):
- """verify_tool_group - given a tool group name, verify it exists in the
- ${pbench_run} directory as a properly prefixed tool group directory name.
-
- Raises a BadToolGroup exception if the directory is invalid or does not
- exist.
-
- Returns a Pathlib object of the tool group directory on success.
- """
- _pbench_run = os.environ["pbench_run"] if pbench_run is None else pbench_run
- tg_dir_name = Path(_pbench_run, f"{TOOL_GROUP_PREFIX}-{group}")
- try:
- tg_dir = tg_dir_name.resolve(strict=True)
- except FileNotFoundError:
- raise BadToolGroup(
- f"Bad tool group, '{group}': directory {tg_dir_name} does not exist"
- )
- else:
- if not tg_dir.is_dir():
- raise BadToolGroup(
- f"Bad tool group, '{group}': directory {tg_dir_name} not valid"
- )
- else:
- return tg_dir
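
Call sites migrate from the free function in pbench.agent.utils to the
static method on the new class; the group name and run directory below are
illustrative:

    # Before:
    #     from pbench.agent.utils import verify_tool_group
    #     tg_dir = verify_tool_group("default")
    # After:
    from pbench.agent.tool_group import ToolGroup

    tg_dir = ToolGroup.verify_tool_group(
        "default", pbench_run="/var/lib/pbench-agent"
    )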
diff --git a/lib/pbench/test/unit/agent/test_tool_data_sink.py b/lib/pbench/test/unit/agent/test_tool_data_sink.py
new file mode 100644
index 0000000000..50b6527b76
--- /dev/null
+++ b/lib/pbench/test/unit/agent/test_tool_data_sink.py
@@ -0,0 +1,480 @@
+"""Tests for the Tool Data Sink module.
+"""
+
+import logging
+import pytest
+import shutil
+import time
+
+from http import HTTPStatus
+from io import BytesIO
+from pathlib import Path
+from threading import Condition, Lock, Thread
+from unittest.mock import patch
+from wsgiref.simple_server import WSGIRequestHandler
+
+from pbench.agent import tool_data_sink
+from pbench.agent.tool_data_sink import (
+ BenchmarkRunDir,
+ ToolDataSinkError,
+ DataSinkWsgiServer,
+)
+
+
+class TestBenchmarkRunDir:
+ """Verify the Tool Data Sink BenchmarkRunDir class.
+ """
+
+ @pytest.fixture
+ def cleanup_tmp(self, pytestconfig):
+ TMP = Path(pytestconfig.cache.get("TMP", None))
+ self.int_pb_run = TMP / "pbench-run-int"
+ self.ext_pb_run = TMP / "pbench-run-ext"
+ yield
+ try:
+ shutil.rmtree(self.int_pb_run)
+ except Exception as exc:
+ print(exc)
+ try:
+ shutil.rmtree(self.ext_pb_run)
+ except Exception as exc:
+ print(exc)
+
+ def test_validate(self, cleanup_tmp):
+ """test_validate - verify the behavior of the validate() using both an
+ internal - external difference and when the internal and external
+ directories are the same.
+
+ This implicitly tests the constructor as well.
+ """
+ self.int_pb_run.mkdir()
+ ext_bm_rd = self.int_pb_run / "bm-run-dir"
+ ext_bm_rd.mkdir()
+ brd = BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run))
+ assert str(ext_bm_rd) == str(brd)
+
+ valpre = ext_bm_rd / "valid-prefix"
+ valpre.mkdir()
+ obj = brd.validate(str(valpre))
+ assert str(valpre) == str(obj)
+
+ with pytest.raises(brd.Prefix):
+ brd.validate("/not/a/valid-prefix")
+
+ self.ext_pb_run.mkdir()
+ ext_bm_rd = self.ext_pb_run / "bm-run-dir"
+ ext_bm_rd.mkdir()
+ brd = BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run))
+
+ valpre = ext_bm_rd / "not-a-prefix"
+ with pytest.raises(brd.Exists):
+ brd.validate(valpre)
+
+ def test_constructor_errors(self, cleanup_tmp):
+ """test_constructor_errors - verify errors are properly raised during
+ the execution of the constructor.
+ """
+ self.int_pb_run.mkdir()
+
+ ext_bm_rd = self.int_pb_run / "bm-run-dir"
+ ext_bm_rd.write_text("Should be a directory!")
+ with pytest.raises(ToolDataSinkError) as exc:
+ BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run))
+ exp_err = f"Run directory parameter, '{ext_bm_rd}', must be a real directory."
+ assert exp_err == str(exc.value)
+ ext_bm_rd.unlink()
+
+ # NOTE: in a container the "internal" pbench run directory must exist,
+ # the external pbench run directory does not exist from within the
+ # container.
+ ext_bm_rd = self.ext_pb_run / "bm-run-dir"
+ int_bm_rd = self.int_pb_run / "bm-run-dir"
+ int_bm_rd.mkdir()
+ with pytest.raises(ToolDataSinkError) as exc:
+ BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run))
+ exp_err = (
+ f"Run directory parameter, '{ext_bm_rd}', must be an existing"
+ f" directory ('{self.ext_pb_run}/.path' not found, '"
+ )
+ assert str(exc.value).startswith(exp_err)
+
+ self.ext_pb_run.mkdir()
+ dot_path = self.int_pb_run / ".path"
+ dot_path_contents = f"{self.ext_pb_run}-mismatch"
+ dot_path.write_text(dot_path_contents)
+ with pytest.raises(ToolDataSinkError) as exc:
+ BenchmarkRunDir(str(ext_bm_rd), str(self.int_pb_run))
+ exp_err = (
+ f"Run directory parameter, '{ext_bm_rd}', must be an existing"
+ f" directory (.path contents mismatch, .path='{dot_path_contents}'"
+ f" != '{self.ext_pb_run}')."
+ )
+ assert exp_err == str(exc.value)
+
+
+def _test_app(environ, start_response):
+ start_response(
+ "200 OK",
+ [("Content-Type", "text/plain"), ("Date", "Fri, 12 Feb 2021 23:35:42 UTC")],
+ )
+ return [b"Hello, world! 42"]
+
+
+class TestDataSinkWsgiServer:
+ """Verify the DataSinkWsgiServer wrapper class.
+ """
+
+ def test_constructor(self):
+ """test_constructor - verify the DataSinkWsgiServer constructor.
+ """
+ with pytest.raises(Exception) as exc:
+ DataSinkWsgiServer()
+ assert "DataSinkWsgiServer requires a logger" == str(exc.value)
+
+ wsgi = DataSinkWsgiServer(
+ host="host.example.com", port="42", logger="__logger__"
+ )
+ assert wsgi.options.get("handler_class", "missing") != "missing"
+ klass = wsgi.options.get("handler_class")
+ assert isinstance(klass, type(WSGIRequestHandler))
+ assert wsgi._server is None
+ assert wsgi._err_code is None
+ assert wsgi._err_text is None
+ assert isinstance(wsgi._lock, type(Lock()))
+ assert isinstance(wsgi._cv, type(Condition()))
+ assert wsgi._logger == "__logger__"
+
+ def test_log_methods(self, caplog):
+ logger = logging.getLogger("test_log_methods")
+ wsgi_server = DataSinkWsgiServer(
+ host="host.example.com", port="42", logger=logger
+ )
+ wrh = wsgi_server.options["handler_class"]
+ # This forces the base WSGI methods to not buffer writes.
+ wrh.wbufsize = 1
+
+ class MockBytesIO(BytesIO):
+ def close(self, *args, **kwargs):
+ self._saved_value = self.getvalue()
+ super().close(*args, **kwargs)
+
+ class MockSocket:
+ def getsockname(self):
+ return ("sockname",)
+
+ class MockRequest:
+ _sock = MockSocket()
+
+ def __init__(self, path):
+ self._path = path
+
+ def makefile(self, *args, **kwargs):
+ if args[0] == "rb":
+ return MockBytesIO(b"GET %s HTTP/1.1" % self._path)
+ elif args[0] == "wb":
+ return MockBytesIO(b"")
+ else:
+ raise ValueError(
+ "MockRequest: unrecognized file type", args, kwargs
+ )
+
+ class MockServer:
+ def __init__(self):
+ self.base_environ = {}
+
+ def get_app(self):
+ return _test_app
+
+ mock_server = MockServer()
+
+        # We build all of the above mock infrastructure just to get a usable
+        # DataSinkWsgiRequestHandler() object. The MockRequest() mimics a
+        # single request being handled, with the generated response captured
+        # in the handler's "wfile" attribute value. This one request will
+        # also emit one informational log message.
+ handler = wrh(MockRequest(b"/"), (0, 0), mock_server)
+ assert handler.wfile._saved_value.startswith(b"HTTP/1.0 200 OK")
+ assert handler.wfile._saved_value.endswith(b"Hello, world! 42")
+ assert caplog.records[0].levelname == "INFO"
+ assert caplog.records[0].message == '0 - - "GET / HTTP/1.1" 200 16'
+
+ # Now that we have this handler object, we can directly invoke the
+ # other logging methods to verify their behavior.
+ handler.log_error("test error %d %s", 42, "43")
+ assert caplog.records[1].levelname == "ERROR"
+ assert caplog.records[1].message == "0 - - test error 42 43"
+ handler.log_message("test msg %d %s", 42, "43")
+ assert caplog.records[2].levelname == "WARNING"
+ assert caplog.records[2].message == "0 - - test msg 42 43"
+ handler.log_request(code=HTTPStatus(404), size=42)
+ assert caplog.records[3].levelname == "INFO"
+ assert caplog.records[3].message == '0 - - "GET / HTTP/1.1" 404 42'
+
+ class MockServer:
+ def __init__(self, host, port, app, *args, **kwargs):
+ self.host = host
+ self.port = port
+ self.app = app
+ self.args = args
+ self.kwargs = kwargs
+ self.serve_forever_called = False
+ self.shutdown_called = False
+ if self.host.startswith("oserror"):
+ raise OSError(42, "oserror")
+ elif self.host.startswith("exception"):
+ raise Exception("exception")
+
+ def shutdown(self):
+ self.shutdown_called = True
+
+ def serve_forever(self):
+ self.serve_forever_called = True
+
+ def test_run(self, caplog):
+ """test_run - verify code paths of run method directly.
+
+        NOTE: We are not using threads to do this. Instead we mock out the
+        `make_server` call to create a fake server, under our control, which
+        does nothing when "serve_forever" is called.
+ """
+ logger = logging.getLogger("test_run")
+ wsgi_server = DataSinkWsgiServer(
+ host="host.example.com", port="42", logger=logger
+ )
+ mocked_servers = []
+
+ def mock_make_server(host, port, app, *args, **kwargs):
+ mocked_server = self.MockServer(host, port, app, *args, **kwargs)
+ mocked_servers.append(mocked_server)
+ return mocked_server
+
+ with patch.object(tool_data_sink, "make_server", mock_make_server):
+ # First we invoke the "run" method once to let it execute normally.
+ try:
+ wsgi_server.run(_test_app)
+ except Exception as exc:
+ pytest.fail(f"WSGI server failed with an exception, {exc}")
+ else:
+ # Retrieve the internal server object that we created, and
+ # verify that it is created as expected, and that
+ # "serve_forever" was called.
+ mock_server = mocked_servers[0]
+ assert wsgi_server._server is mock_server
+ assert wsgi_server._err_code == 0
+ assert wsgi_server._err_text is None
+ assert mock_server.host == "host.example.com"
+ assert mock_server.port == 42
+ assert mock_server.app is _test_app
+ assert mock_server.args == ()
+ klass = mock_server.kwargs.get("handler_class")
+ assert isinstance(klass, type(WSGIRequestHandler))
+ assert mock_server.serve_forever_called
+ # The success path of "run" should have emitted three debug
+ # messages.
+ assert len(caplog.records) == 3
+ assert caplog.records[0].levelname == "DEBUG"
+ assert (
+ caplog.records[0].message == "Making tool data sink WSGI server ..."
+ )
+ assert caplog.records[1].levelname == "DEBUG"
+ assert caplog.records[1].message == "Successfully created WSGI server"
+ assert caplog.records[2].levelname == "DEBUG"
+ assert (
+ caplog.records[2].message
+ == "Running tool data sink WSGI server ..."
+ )
+ with pytest.raises(AssertionError) as exc:
+ # Call it again to verify the assertion fires
+ wsgi_server.run(_test_app)
+ assert "'run' method called twice" in str(exc.value), f"{exc.value}"
+            # No additional log records should have been emitted.
+ assert len(caplog.records) == 3
+
+ def test_stop_and_wait(self, caplog):
+ """test_stop_and_wait - verify the operation of run() in conjunction
+ with stop() and wait() methods from separate threads.
+
+ There are a number of scenarios for the order of operations between
+ threads that we need to test. We list them here using "MainThr" as the
+ name of the "main thread" which _creates_ the WSGI thread, and "WsgiThr"
+ as the name of the created WSGI thread invoking the "run" method.
+
+        References:
+            .wait() is called in the .stop() and __enter__() methods
+            .stop() is called in the __exit__() method
+
+ Scenario A:
+
+ * MainThr creates WSGI thread (WsgiThr not running)
+ * MainThr calls stop()
+ * WsgiThr starts running
+ * WsgiThr reports err_code == 0
+
+ Scenario B:
+
+ * MainThr creates WSGI thread
+ * WsgiThr starts running
+ * WsgiThr reports err_code == 0
+ * MainThr calls stop()
+
+ Scenario C:
+
+ * MainThr creates WSGI thread (WsgiThr not running)
+ * MainThr calls stop()
+ * WsgiThr starts running
+ * WsgiThr reports err_code > 0
+
+ Scenario D:
+
+ * MainThr creates WSGI thread
+ * WsgiThr starts running
+ * WsgiThr reports err_code > 0
+ * MainThr calls stop()
+
+ Scenario E:
+
+ * MainThr creates WSGI thread (WsgiThr not running)
+ * MainThr calls stop()
+ * WsgiThr starts running
+ * WsgiThr reports err_code < 0
+
+ Scenario F:
+
+ * MainThr creates WSGI thread
+ * WsgiThr starts running
+ * WsgiThr reports err_code < 0
+ * MainThr calls stop()
+ """
+
+ def wsgi_run(scenario, wsgi_server, trace):
+ ret_val = None
+ if scenario in ("A", "C", "E"):
+ time.sleep(0.1)
+ try:
+ trace.append("WsgiThr - run")
+ wsgi_server.run(_test_app)
+ except Exception as exc:
+ ret_val = exc
+ return ret_val
+
+ def do_wait(scenario, wsgi_server, trace):
+ if scenario in ("B", "D", "F"):
+ time.sleep(0.1)
+ trace.append("MainThr - wait")
+ err_text, err_code = wsgi_server.wait()
+ return err_text, err_code
+
+ def do_stop(scenario, wsgi_server, trace):
+ if scenario in ("B", "D", "F"):
+ time.sleep(0.1)
+ trace.append("MainThr - stop")
+ wsgi_server.stop()
+
+        # The host name prefix directs the MockServer class to raise an
+        # OSError or a generic Exception, based on the name.
+ hostnames = dict(
+ A="host.example.com",
+ B="host.example.com",
+ C="oserror.example.com",
+ D="oserror.example.com",
+ E="exception.example.com",
+ F="exception.example.com",
+ )
+ caplog_idx = 0
+ logger = logging.getLogger("test_run")
+ for scenario in ["A", "B", "C", "D", "E", "F"]:
+ wsgi_server = DataSinkWsgiServer(
+ host=hostnames[scenario], port="42", logger=logger
+ )
+ mocked_servers = []
+
+ def mock_make_server(host, port, app, *args, **kwargs):
+ mocked_server = self.MockServer(host, port, app, *args, **kwargs)
+ mocked_servers.append(mocked_server)
+ return mocked_server
+
+ with patch.object(tool_data_sink, "make_server", mock_make_server):
+ trace = []
+ wsgithr = Thread(target=wsgi_run, args=(scenario, wsgi_server, trace))
+ wsgithr.start()
+ err_text, err_code = do_wait(scenario, wsgi_server, trace)
+ wsgithr.join()
+ assert caplog.records[caplog_idx].levelname == "DEBUG"
+ assert (
+ caplog.records[caplog_idx].message
+ == "Making tool data sink WSGI server ..."
+ )
+ caplog_idx += 1
+ if scenario in ("A", "B"):
+ mock_server = mocked_servers[0]
+ assert mock_server.serve_forever_called
+ assert not mock_server.shutdown_called
+ assert err_code == 0
+ assert err_text is None
+ assert caplog.records[caplog_idx].levelname == "DEBUG"
+ assert (
+ caplog.records[caplog_idx].message
+ == "Successfully created WSGI server"
+ )
+ caplog_idx += 1
+ assert caplog.records[caplog_idx].levelname == "DEBUG"
+ assert (
+ caplog.records[caplog_idx].message
+ == "Running tool data sink WSGI server ..."
+ )
+ caplog_idx += 1
+ elif scenario in ("C", "D"):
+ assert len(mocked_servers) == 0
+ assert err_code == 42
+ assert err_text == "[Errno 42] oserror"
+ # Only 1 log message is emitted when OSErrors are encountered
+ else:
+ assert scenario in ("E", "F")
+ assert len(mocked_servers) == 0
+ assert err_code == -1
+ assert err_text == "exception"
+ assert caplog.records[caplog_idx].levelname == "ERROR"
+ assert (
+ caplog.records[caplog_idx].message
+ == "Unexpected error in WSGI server"
+ )
+ caplog_idx += 1
+ assert len(caplog.records) == caplog_idx
+
+ # Now we test two cases for the stop() method
+ for scenario in ["A", "E"]:
+ wsgi_server = DataSinkWsgiServer(
+ host=hostnames[scenario], port="42", logger=logger
+ )
+ mocked_servers = []
+
+ def mock_make_server(host, port, app, *args, **kwargs):
+ mocked_server = self.MockServer(host, port, app, *args, **kwargs)
+ mocked_servers.append(mocked_server)
+ return mocked_server
+
+ with patch.object(tool_data_sink, "make_server", mock_make_server):
+ trace = []
+ wsgithr = Thread(target=wsgi_run, args=(scenario, wsgi_server, trace))
+ wsgithr.start()
+ do_stop(scenario, wsgi_server, trace)
+ wsgithr.join()
+ assert caplog.records[caplog_idx].levelname == "DEBUG"
+ assert (
+ caplog.records[caplog_idx].message
+ == "Making tool data sink WSGI server ..."
+ )
+ caplog_idx += 1
+ if scenario == "A":
+ mock_server = mocked_servers[0]
+ assert mock_server.serve_forever_called
+ assert mock_server.shutdown_called
+ caplog_idx += 2
+ else:
+ assert scenario == "E"
+ assert len(mocked_servers) == 0
+ caplog_idx += 1
+ assert len(caplog.records) == caplog_idx
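
The six scenarios above all reduce to one question: does wait() observe the
status posted by run() regardless of which thread gets there first? A
minimal sketch of that Condition-based handshake, assuming internals that
the real DataSinkWsgiServer may implement differently:

    import threading

    class StoppableServer:
        """Sketch of the run()/wait() handshake exercised above; the
        attribute names mirror the assertions in test_constructor, but
        the bodies are assumptions."""

        def __init__(self):
            self._lock = threading.Lock()
            self._cv = threading.Condition(self._lock)
            self._err_code = None  # None until run() posts a status

        def run(self):
            # Post a status code: 0 success, > 0 an OSError errno, < 0
            # any other failure.
            with self._cv:
                self._err_code = 0
                self._cv.notify_all()

        def wait(self):
            # Block until run() has posted a status, in whichever order
            # the threads happen to be scheduled.
            with self._cv:
                while self._err_code is None:
                    self._cv.wait()
                return self._err_code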