Skip to content

Commit dee6dc2

Browse files
authored
YARN-10893. Adding metrics for getClusterMetrics and getApplications APIs in FederationClientInterceptor (#3325)
1 parent 99cb2b6 commit dee6dc2

File tree

3 files changed

+104
-6
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterMetrics.java

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,8 @@ public final class RouterMetrics {
5353
private MutableGaugeInt numMultipleAppsFailedRetrieved;
5454
@Metric("# of applicationAttempt reports failed to be retrieved")
5555
private MutableGaugeInt numAppAttemptsFailedRetrieved;
56+
@Metric("# of getClusterMetrics failed to be retrieved")
57+
private MutableGaugeInt numGetClusterMetricsFailedRetrieved;
5658

5759
// Aggregate metrics are shared, and don't have to be looked up per call
5860
@Metric("Total number of successful Submitted apps and latency(ms)")
@@ -69,6 +71,9 @@ public final class RouterMetrics {
6971
@Metric("Total number of successful Retrieved " +
7072
"appAttempt reports and latency(ms)")
7173
private MutableRate totalSucceededAppAttemptsRetrieved;
74+
@Metric("Total number of successful Retrieved getClusterMetrics and "
75+
+ "latency(ms)")
76+
private MutableRate totalSucceededGetClusterMetricsRetrieved;
7277

7378

7479
/**
@@ -80,6 +85,7 @@ public final class RouterMetrics {
8085
private MutableQuantiles getApplicationReportLatency;
8186
private MutableQuantiles getApplicationsReportLatency;
8287
private MutableQuantiles getApplicationAttemptReportLatency;
88+
private MutableQuantiles getClusterMetricsLatency;
8389

8490
private static volatile RouterMetrics INSTANCE = null;
8591
private static MetricsRegistry registry;
@@ -103,6 +109,9 @@ private RouterMetrics() {
103109
registry.newQuantiles("getApplicationAttemptReportLatency",
104110
"latency of get applicationattempt " +
105111
"report", "ops", "latency", 10);
112+
getClusterMetricsLatency =
113+
registry.newQuantiles("getClusterMetricsLatency",
114+
"latency of get cluster metrics", "ops", "latency", 10);
106115
}
107116

108117
public static RouterMetrics getMetrics() {
@@ -154,6 +163,11 @@ public long getNumSucceededMultipleAppsRetrieved() {
154163
return totalSucceededMultipleAppsRetrieved.lastStat().numSamples();
155164
}
156165

166+
@VisibleForTesting
167+
public long getNumSucceededGetClusterMetricsRetrieved(){
168+
return totalSucceededGetClusterMetricsRetrieved.lastStat().numSamples();
169+
}
170+
157171
@VisibleForTesting
158172
public double getLatencySucceededAppsCreated() {
159173
return totalSucceededAppsCreated.lastStat().mean();
@@ -184,6 +198,11 @@ public double getLatencySucceededMultipleGetAppReport() {
184198
return totalSucceededMultipleAppsRetrieved.lastStat().mean();
185199
}
186200

201+
@VisibleForTesting
202+
public double getLatencySucceededGetClusterMetricsRetrieved() {
203+
return totalSucceededGetClusterMetricsRetrieved.lastStat().mean();
204+
}
205+
187206
@VisibleForTesting
188207
public int getAppsFailedCreated() {
189208
return numAppsFailedCreated.value();
@@ -214,6 +233,11 @@ public int getMultipleAppsFailedRetrieved() {
214233
return numMultipleAppsFailedRetrieved.value();
215234
}
216235

236+
@VisibleForTesting
237+
public int getClusterMetricsFailedRetrieved() {
238+
return numGetClusterMetricsFailedRetrieved.value();
239+
}
240+
217241
public void succeededAppsCreated(long duration) {
218242
totalSucceededAppsCreated.add(duration);
219243
getNewApplicationLatency.add(duration);
@@ -244,6 +268,11 @@ public void succeededAppAttemptsRetrieved(long duration) {
244268
getApplicationAttemptReportLatency.add(duration);
245269
}
246270

271+
public void succeededGetClusterMetricsRetrieved(long duration) {
272+
totalSucceededGetClusterMetricsRetrieved.add(duration);
273+
getClusterMetricsLatency.add(duration);
274+
}
275+
247276
public void incrAppsFailedCreated() {
248277
numAppsFailedCreated.incr();
249278
}
@@ -268,4 +297,8 @@ public void incrAppAttemptsFailedRetrieved() {
268297
numAppAttemptsFailedRetrieved.incr();
269298
}
270299

300+
public void incrGetClusterMetricsFailedRetrieved() {
301+
numGetClusterMetricsFailedRetrieved.incr();
302+
}
303+
271304
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -628,18 +628,29 @@ public GetApplicationReportResponse getApplicationReport(
628628
public GetApplicationsResponse getApplications(GetApplicationsRequest request)
629629
throws YarnException, IOException {
630630
if (request == null) {
631+
routerMetrics.incrMultipleAppsFailedRetrieved();
631632
RouterServerUtil.logAndThrowException(
632633
"Missing getApplications request.",
633634
null);
634635
}
636+
long startTime = clock.getTime();
635637
Map<SubClusterId, SubClusterInfo> subclusters =
636638
federationFacade.getSubClusters(true);
637639
ClientMethod remoteMethod = new ClientMethod("getApplications",
638640
new Class[] {GetApplicationsRequest.class}, new Object[] {request});
639-
Map<SubClusterId, GetApplicationsResponse> applications =
640-
invokeConcurrent(subclusters.keySet(), remoteMethod,
641-
GetApplicationsResponse.class);
641+
Map<SubClusterId, GetApplicationsResponse> applications;
642+
643+
try {
644+
applications = invokeConcurrent(subclusters.keySet(), remoteMethod,
645+
GetApplicationsResponse.class);
642646

647+
} catch (Exception ex) {
648+
routerMetrics.incrMultipleAppsFailedRetrieved();
649+
LOG.error("Unable to get applications due to exception.", ex);
650+
throw ex;
651+
}
652+
long stopTime = clock.getTime();
653+
routerMetrics.succeededMultipleAppsRetrieved(stopTime - startTime);
643654
// Merge the Application Reports
644655
return RouterYarnClientUtils.mergeApplications(applications.values(),
645656
returnPartialReport);
@@ -648,14 +659,26 @@ public GetApplicationsResponse getApplications(GetApplicationsRequest request)
648659
@Override
649660
public GetClusterMetricsResponse getClusterMetrics(
650661
GetClusterMetricsRequest request) throws YarnException, IOException {
662+
long startTime = clock.getTime();
651663
Map<SubClusterId, SubClusterInfo> subclusters =
652664
federationFacade.getSubClusters(true);
653665
ClientMethod remoteMethod = new ClientMethod("getClusterMetrics",
654666
new Class[] {GetClusterMetricsRequest.class}, new Object[] {request});
655667
ArrayList<SubClusterId> clusterList = new ArrayList<>(subclusters.keySet());
656-
Map<SubClusterId, GetClusterMetricsResponse> clusterMetrics =
657-
invokeConcurrent(clusterList, remoteMethod,
658-
GetClusterMetricsResponse.class);
668+
Map<SubClusterId, GetClusterMetricsResponse> clusterMetrics;
669+
670+
try {
671+
clusterMetrics = invokeConcurrent(clusterList, remoteMethod,
672+
GetClusterMetricsResponse.class);
673+
674+
} catch (Exception ex) {
675+
routerMetrics.incrGetClusterMetricsFailedRetrieved();
676+
LOG.error("Unable to get cluster metrics due to exception.", ex);
677+
throw ex;
678+
}
679+
680+
long stopTime = clock.getTime();
681+
routerMetrics.succeededGetClusterMetricsRetrieved(stopTime - startTime);
659682
return RouterYarnClientUtils.merge(clusterMetrics.values());
660683
}
661684

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/TestRouterMetrics.java

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,37 @@ public void testMulipleAppsReportFailed() {
279279
metrics.getMultipleAppsFailedRetrieved());
280280
}
281281

282+
/**
283+
* This test validates the correctness of the metric: Retrieved getClusterMetrics
284+
* multiple times successfully.
285+
*/
286+
@Test
287+
public void testSucceededGetClusterMetrics() {
288+
long totalGoodBefore = metrics.getNumSucceededGetClusterMetricsRetrieved();
289+
goodSubCluster.getClusterMetrics(100);
290+
Assert.assertEquals(totalGoodBefore + 1,
291+
metrics.getNumSucceededGetClusterMetricsRetrieved());
292+
Assert.assertEquals(100, metrics.getLatencySucceededGetClusterMetricsRetrieved(),
293+
0);
294+
goodSubCluster.getClusterMetrics(200);
295+
Assert.assertEquals(totalGoodBefore + 2,
296+
metrics.getNumSucceededGetClusterMetricsRetrieved());
297+
Assert.assertEquals(150, metrics.getLatencySucceededGetClusterMetricsRetrieved(),
298+
0);
299+
}
300+
301+
/**
302+
* This test validates the correctness of the metric: Failed to
303+
* retrieve getClusterMetrics.
304+
*/
305+
@Test
306+
public void testGetClusterMetricsFailed() {
307+
long totalBadbefore = metrics.getClusterMetricsFailedRetrieved();
308+
badSubCluster.getClusterMetrics();
309+
Assert.assertEquals(totalBadbefore + 1,
310+
metrics.getClusterMetricsFailedRetrieved());
311+
}
312+
282313
// Records failures for all calls
283314
private class MockBadSubCluster {
284315
public void getNewApplication() {
@@ -310,6 +341,11 @@ public void getApplicationsReport() {
310341
LOG.info("Mocked: failed getApplicationsReport call");
311342
metrics.incrMultipleAppsFailedRetrieved();
312343
}
344+
345+
public void getClusterMetrics() {
346+
LOG.info("Mocked: failed getClusterMetrics call");
347+
metrics.incrGetClusterMetricsFailedRetrieved();
348+
}
313349
}
314350

315351
// Records successes for all calls
@@ -350,5 +386,11 @@ public void getApplicationsReport(long duration) {
350386
duration);
351387
metrics.succeededMultipleAppsRetrieved(duration);
352388
}
389+
390+
public void getClusterMetrics(long duration){
391+
LOG.info("Mocked: successful getClusterMetrics call with duration {}",
392+
duration);
393+
metrics.succeededGetClusterMetricsRetrieved(duration);
394+
}
353395
}
354396
}

0 commit comments

Comments
 (0)