Skip to content

Commit acbea8d

Browse files
committed
HDDS-1708. Add container scrubber metrics.
Contributed by Hrishikesh Gadre.
1 parent 0ccf4b0 commit acbea8d

File tree

8 files changed

+442
-39
lines changed

8 files changed

+442
-39
lines changed

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/keyvalue/impl/ChunkManagerFactory.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020

2121
import com.google.common.base.Preconditions;
2222
import org.apache.hadoop.conf.Configuration;
23-
import org.apache.hadoop.hdds.HddsConfigKeys;
2423
import org.apache.hadoop.ozone.container.keyvalue.interfaces.ChunkManager;
2524
import org.slf4j.Logger;
2625
import org.slf4j.LoggerFactory;

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerDataScanner.java

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919

2020
import java.io.IOException;
2121
import java.util.Iterator;
22+
import java.util.concurrent.TimeUnit;
2223

24+
import com.google.common.annotations.VisibleForTesting;
25+
import org.apache.hadoop.conf.Configuration;
2326
import org.apache.hadoop.hdfs.util.Canceler;
2427
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
2528
import org.apache.hadoop.ozone.container.common.interfaces.Container;
@@ -42,6 +45,7 @@ public class ContainerDataScanner extends Thread {
4245
private final ContainerController controller;
4346
private final DataTransferThrottler throttler;
4447
private final Canceler canceler;
48+
private final ContainerDataScrubberMetrics metrics;
4549

4650
/**
4751
* True if the thread is stopping.<p/>
@@ -50,12 +54,15 @@ public class ContainerDataScanner extends Thread {
5054
private volatile boolean stopping = false;
5155

5256

53-
public ContainerDataScanner(ContainerController controller,
57+
public ContainerDataScanner(Configuration conf,
58+
ContainerController controller,
5459
HddsVolume volume, long bytesPerSec) {
5560
this.controller = controller;
5661
this.volume = volume;
57-
this.throttler = new DataTransferThrottler(bytesPerSec);
62+
this.throttler = new HddsDataTransferThrottler(bytesPerSec);
5863
this.canceler = new Canceler();
64+
this.metrics = ContainerDataScrubberMetrics.create(conf,
65+
volume.toString());
5966
setName("ContainerDataScanner(" + volume + ")");
6067
setDaemon(true);
6168
}
@@ -65,26 +72,54 @@ public void run() {
6572
LOG.trace("{}: thread starting.", this);
6673
try {
6774
while (!stopping) {
68-
Iterator<Container> itr = controller.getContainers(volume);
69-
while (!stopping && itr.hasNext()) {
70-
Container c = itr.next();
71-
try {
72-
if (c.shouldScanData()) {
73-
if(!c.scanData(throttler, canceler)) {
74-
controller.markContainerUnhealthy(
75-
c.getContainerData().getContainerID());
76-
}
77-
}
78-
} catch (IOException ex) {
79-
long containerId = c.getContainerData().getContainerID();
80-
LOG.warn("Unexpected exception while scanning container "
81-
+ containerId, ex);
82-
}
83-
}
75+
runIteration();
76+
metrics.resetNumContainersScanned();
77+
metrics.resetNumUnhealthyContainers();
8478
}
8579
LOG.info("{} exiting.", this);
8680
} catch (Throwable e) {
8781
LOG.error("{} exiting because of exception ", this, e);
82+
} finally {
83+
if (metrics != null) {
84+
metrics.unregister();
85+
}
86+
}
87+
}
88+
89+
@VisibleForTesting
90+
public void runIteration() {
91+
long startTime = System.nanoTime();
92+
Iterator<Container> itr = controller.getContainers(volume);
93+
while (!stopping && itr.hasNext()) {
94+
Container c = itr.next();
95+
if (c.shouldScanData()) {
96+
try {
97+
if (!c.scanData(throttler, canceler)) {
98+
metrics.incNumUnHealthyContainers();
99+
controller.markContainerUnhealthy(
100+
c.getContainerData().getContainerID());
101+
}
102+
} catch (IOException ex) {
103+
long containerId = c.getContainerData().getContainerID();
104+
LOG.warn("Unexpected exception while scanning container "
105+
+ containerId, ex);
106+
} finally {
107+
metrics.incNumContainersScanned();
108+
}
109+
}
110+
}
111+
long totalDuration = System.nanoTime() - startTime;
112+
if (!stopping) {
113+
metrics.incNumScanIterations();
114+
LOG.info("Completed an iteration of container data scrubber in" +
115+
" {} minutes." +
116+
" Number of iterations (since the data-node restart) : {}" +
117+
", Number of containers scanned in this iteration : {}" +
118+
", Number of unhealthy containers found in this iteration : {}",
119+
TimeUnit.NANOSECONDS.toMinutes(totalDuration),
120+
metrics.getNumScanIterations(),
121+
metrics.getNumContainersScanned(),
122+
metrics.getNumUnHealthyContainers());
88123
}
89124
}
90125

@@ -100,9 +135,32 @@ public synchronized void shutdown() {
100135
}
101136
}
102137

138+
@VisibleForTesting
139+
public ContainerDataScrubberMetrics getMetrics() {
140+
return metrics;
141+
}
142+
103143
@Override
104144
public String toString() {
105145
return "ContainerDataScanner(" + volume +
106146
", " + volume.getStorageID() + ")";
107147
}
148+
149+
private class HddsDataTransferThrottler extends DataTransferThrottler {
150+
HddsDataTransferThrottler(long bandwidthPerSec) {
151+
super(bandwidthPerSec);
152+
}
153+
154+
@Override
155+
public synchronized void throttle(long numOfBytes) {
156+
ContainerDataScanner.this.metrics.incNumBytesScanned(numOfBytes);
157+
super.throttle(numOfBytes);
158+
}
159+
160+
@Override
161+
public synchronized void throttle(long numOfBytes, Canceler c) {
162+
ContainerDataScanner.this.metrics.incNumBytesScanned(numOfBytes);
163+
super.throttle(numOfBytes, c);
164+
}
165+
}
108166
}
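hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerDataScrubberMetrics.java

Lines changed: 115 additions & 0 deletions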
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.ozone.container.ozoneimpl;
19+
20+
import org.apache.hadoop.classification.InterfaceAudience;
21+
import org.apache.hadoop.conf.Configuration;
22+
import org.apache.hadoop.metrics2.MetricsSystem;
23+
import org.apache.hadoop.metrics2.annotation.Metric;
24+
import org.apache.hadoop.metrics2.annotation.Metrics;
25+
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
26+
import org.apache.hadoop.metrics2.lib.MutableCounterInt;
27+
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
28+
import org.apache.hadoop.metrics2.lib.MutableRate;
29+
30+
import java.util.concurrent.ThreadLocalRandom;
31+
32+
/**
33+
* This class captures the container data scrubber metrics on the data-node.
34+
**/
35+
@InterfaceAudience.Private
36+
@Metrics(about="DataNode container data scrubber metrics", context="dfs")
37+
public final class ContainerDataScrubberMetrics {
38+
private final String name;
39+
private final MetricsSystem ms;
40+
@Metric("number of containers scanned in the current iteration")
41+
private MutableGaugeInt numContainersScanned;
42+
@Metric("number of unhealthy containers found in the current iteration")
43+
private MutableGaugeInt numUnHealthyContainers;
44+
@Metric("number of iterations of scanner completed since the restart")
45+
private MutableCounterInt numScanIterations;
46+
@Metric("disk bandwidth used by the container data scrubber per volume")
47+
private MutableRate numBytesScanned;
48+
49+
public int getNumContainersScanned() {
50+
return numContainersScanned.value();
51+
}
52+
53+
public void incNumContainersScanned() {
54+
numContainersScanned.incr();
55+
}
56+
57+
public void resetNumContainersScanned() {
58+
numContainersScanned.decr(getNumContainersScanned());
59+
}
60+
61+
public int getNumUnHealthyContainers() {
62+
return numUnHealthyContainers.value();
63+
}
64+
65+
public void incNumUnHealthyContainers() {
66+
numUnHealthyContainers.incr();
67+
}
68+
69+
public void resetNumUnhealthyContainers() {
70+
numUnHealthyContainers.decr(getNumUnHealthyContainers());
71+
}
72+
73+
public int getNumScanIterations() {
74+
return numScanIterations.value();
75+
}
76+
77+
public void incNumScanIterations() {
78+
numScanIterations.incr();
79+
}
80+
81+
public double getNumBytesScannedMean() {
82+
return numBytesScanned.lastStat().mean();
83+
}
84+
85+
public long getNumBytesScannedSampleCount() {
86+
return numBytesScanned.lastStat().numSamples();
87+
}
88+
89+
public double getNumBytesScannedStdDev() {
90+
return numBytesScanned.lastStat().stddev();
91+
}
92+
93+
public void incNumBytesScanned(long bytes) {
94+
numBytesScanned.add(bytes);
95+
}
96+
97+
public void unregister() {
98+
ms.unregisterSource(name);
99+
}
100+
101+
private ContainerDataScrubberMetrics(String name, MetricsSystem ms) {
102+
this.name = name;
103+
this.ms = ms;
104+
}
105+
106+
public static ContainerDataScrubberMetrics create(final Configuration conf,
107+
final String volumeName) {
108+
MetricsSystem ms = DefaultMetricsSystem.instance();
109+
String name = "ContainerDataScrubberMetrics-"+ (volumeName.isEmpty()
110+
? "UndefinedDataNodeVolume"+ ThreadLocalRandom.current().nextInt()
111+
: volumeName.replace(':', '-'));
112+
113+
return ms.register(name, null, new ContainerDataScrubberMetrics(name, ms));
114+
}
115+
}

hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/ContainerMetadataScanner.java

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
package org.apache.hadoop.ozone.container.ozoneimpl;
1919

2020
import com.google.common.annotations.VisibleForTesting;
21+
import org.apache.hadoop.conf.Configuration;
2122
import org.apache.hadoop.ozone.container.common.interfaces.Container;
2223
import org.slf4j.Logger;
2324
import org.slf4j.LoggerFactory;
@@ -36,16 +37,19 @@ public class ContainerMetadataScanner extends Thread {
3637

3738
private final ContainerController controller;
3839
private final long metadataScanInterval;
40+
private final ContainerMetadataScrubberMetrics metrics;
3941
/**
4042
* True if the thread is stopping.<p/>
4143
* Protected by this object's lock.
4244
*/
4345
private boolean stopping = false;
4446

45-
public ContainerMetadataScanner(ContainerController controller,
47+
public ContainerMetadataScanner(Configuration conf,
48+
ContainerController controller,
4649
long metadataScanInterval) {
4750
this.controller = controller;
4851
this.metadataScanInterval = metadataScanInterval;
52+
this.metrics = ContainerMetadataScrubberMetrics.create(conf);
4953
setName("ContainerMetadataScanner");
5054
setDaemon(true);
5155
}
@@ -58,46 +62,67 @@ public void run() {
5862
LOG.info("Background ContainerMetadataScanner starting up");
5963
while (!stopping) {
6064
long start = System.nanoTime();
61-
scrub();
62-
long interval = TimeUnit.NANOSECONDS.toMillis(System.nanoTime()-start);
63-
// ensure to delay next metadata scan with respect to user config.
64-
if (!stopping && interval < metadataScanInterval) {
65-
try {
66-
Thread.sleep(metadataScanInterval - interval);
67-
} catch (InterruptedException e) {
68-
LOG.info("Background ContainerMetadataScanner interrupted." +
69-
" Going to exit");
70-
}
65+
runIteration();
66+
if(!stopping) {
67+
metrics.resetNumUnhealthyContainers();
68+
metrics.resetNumContainersScanned();
7169
}
7270
}
7371
}
7472

75-
private void scrub() {
73+
@VisibleForTesting
74+
public void runIteration() {
75+
long start = System.nanoTime();
7676
Iterator<Container> containerIt = controller.getContainers();
77-
long count = 0;
78-
7977
while (!stopping && containerIt.hasNext()) {
8078
Container container = containerIt.next();
8179
try {
8280
scrub(container);
8381
} catch (IOException e) {
8482
LOG.info("Unexpected error while scrubbing container {}",
8583
container.getContainerData().getContainerID());
84+
} finally {
85+
metrics.incNumContainersScanned();
86+
}
87+
}
88+
long interval = System.nanoTime()-start;
89+
if (!stopping) {
90+
metrics.incNumScanIterations();
91+
LOG.info("Completed an iteration of container metadata scrubber in" +
92+
" {} minutes." +
93+
" Number of iterations (since the data-node restart) : {}" +
94+
", Number of containers scanned in this iteration : {}" +
95+
", Number of unhealthy containers found in this iteration : {}",
96+
TimeUnit.NANOSECONDS.toMinutes(interval),
97+
metrics.getNumScanIterations(),
98+
metrics.getNumContainersScanned(),
99+
metrics.getNumUnHealthyContainers());
100+
// ensure to delay next metadata scan with respect to user config.
101+
if (interval < metadataScanInterval) {
102+
try {
103+
Thread.sleep(metadataScanInterval - interval);
104+
} catch (InterruptedException e) {
105+
LOG.info("Background ContainerMetadataScanner interrupted." +
106+
" Going to exit");
107+
}
86108
}
87-
count++;
88109
}
89-
90-
LOG.debug("iterator ran integrity checks on {} containers", count);
91110
}
92111

93112
@VisibleForTesting
94113
public void scrub(Container container) throws IOException {
95114
if (!container.scanMetaData()) {
115+
metrics.incNumUnHealthyContainers();
96116
controller.markContainerUnhealthy(
97117
container.getContainerData().getContainerID());
98118
}
99119
}
100120

121+
@VisibleForTesting
122+
public ContainerMetadataScrubberMetrics getMetrics() {
123+
return metrics;
124+
}
125+
101126
public synchronized void shutdown() {
102127
this.stopping = true;
103128
this.interrupt();

0 commit comments

Comments
 (0)