Skip to content

Commit ed65aa2

Browse files
YARN-11067. Resource overcommitment due to incorrect resource normalisation logical order. Contributed by Andras Gyori
1 parent 481da19 commit ed65aa2

File tree

2 files changed

+64
-20
lines changed
  • hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src

2 files changed

+64
-20
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/ParentQueue.java

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1294,26 +1294,32 @@ public boolean hasChildQueues() {
12941294

12951295
private void calculateEffectiveResourcesAndCapacity(String label,
12961296
Resource clusterResource) {
1297+
// Update effective resources for myself.
1298+
if (rootQueue) {
1299+
Resource resourceByLabel = labelManager.getResourceByLabel(label, clusterResource);
1300+
usageTracker.getQueueResourceQuotas().setEffectiveMinResource(label, resourceByLabel);
1301+
usageTracker.getQueueResourceQuotas().setEffectiveMaxResource(label, resourceByLabel);
1302+
} else {
1303+
super.updateEffectiveResources(clusterResource);
1304+
}
1305+
1306+
recalculateEffectiveMinRatio(label, clusterResource);
1307+
}
1308+
1309+
private void recalculateEffectiveMinRatio(String label, Resource clusterResource) {
12971310
// For root queue, ensure that max/min resource is updated to latest
12981311
// cluster resource.
1299-
Resource resourceByLabel = labelManager.getResourceByLabel(label,
1300-
clusterResource);
1301-
1302-
/*
1303-
* == Below logic are added to calculate effectiveMinRatioPerResource ==
1304-
*/
1312+
Resource resourceByLabel = labelManager.getResourceByLabel(label, clusterResource);
13051313

1306-
// Total configured min resources of direct children of this given parent
1307-
// queue
1314+
// Total configured min resources of direct children of this given parent queue
13081315
Resource configuredMinResources = Resource.newInstance(0L, 0);
13091316
for (CSQueue childQueue : getChildQueues()) {
13101317
Resources.addTo(configuredMinResources,
13111318
childQueue.getQueueResourceQuotas().getConfiguredMinResource(label));
13121319
}
13131320

13141321
// Factor to scale down effective resource: When cluster has sufficient
1315-
// resources, effective_min_resources will be same as configured
1316-
// min_resources.
1322+
// resources, effective_min_resources will be same as configured min_resources.
13171323
Resource numeratorForMinRatio = null;
13181324
if (getQueuePath().equals("root")) {
13191325
if (!resourceByLabel.equals(Resources.none()) && Resources.lessThan(resourceCalculator,
@@ -1324,21 +1330,12 @@ private void calculateEffectiveResourcesAndCapacity(String label,
13241330
if (Resources.lessThan(resourceCalculator, clusterResource,
13251331
usageTracker.getQueueResourceQuotas().getEffectiveMinResource(label),
13261332
configuredMinResources)) {
1327-
numeratorForMinRatio = usageTracker.getQueueResourceQuotas()
1328-
.getEffectiveMinResource(label);
1333+
numeratorForMinRatio = usageTracker.getQueueResourceQuotas().getEffectiveMinResource(label);
13291334
}
13301335
}
13311336

13321337
effectiveMinResourceRatio.put(label, getEffectiveMinRatio(
13331338
configuredMinResources, numeratorForMinRatio));
1334-
1335-
// Update effective resources for my self;
1336-
if (rootQueue) {
1337-
usageTracker.getQueueResourceQuotas().setEffectiveMinResource(label, resourceByLabel);
1338-
usageTracker.getQueueResourceQuotas().setEffectiveMaxResource(label, resourceByLabel);
1339-
} else{
1340-
super.updateEffectiveResources(clusterResource);
1341-
}
13421339
}
13431340

13441341
private Map<String, Float> getEffectiveMinRatio(

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestAbsoluteResourceConfiguration.java

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
3131
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
3232
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
33+
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeRemovedSchedulerEvent;
3334
import org.apache.hadoop.yarn.util.resource.Resources;
3435
import org.junit.Assert;
3536
import org.junit.Test;
@@ -100,6 +101,21 @@ public class TestAbsoluteResourceConfiguration {
100101
private static Set<String> resourceTypes = new HashSet<>(
101102
Arrays.asList("memory", "vcores"));
102103

104+
private CapacitySchedulerConfiguration setupNormalizationConfiguration() {
105+
CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration();
106+
csConf.setQueues(CapacitySchedulerConfiguration.ROOT,
107+
new String[]{QUEUEA, QUEUEB});
108+
csConf.setQueues(QUEUEA_FULL.getFullPath(), new String[]{QUEUEA1, QUEUEA2});
109+
110+
// Total configured minimum for root's children (QUEUEA + QUEUEB): 60 GB memory, 28 vcores
111+
csConf.setMinimumResourceRequirement("", QUEUEA_FULL, Resource.newInstance(50 * GB, 20));
112+
csConf.setMinimumResourceRequirement("", QUEUEA1_FULL, Resource.newInstance(30 * GB, 15));
113+
csConf.setMinimumResourceRequirement("", QUEUEA2_FULL, Resource.newInstance(20 * GB, 5));
114+
csConf.setMinimumResourceRequirement("", QUEUEB_FULL, Resource.newInstance(10 * GB, 8));
115+
116+
return csConf;
117+
}
118+
103119
private CapacitySchedulerConfiguration setupSimpleQueueConfiguration(
104120
boolean isCapacityNeeded) {
105121
CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration();
@@ -292,6 +308,37 @@ public void testSimpleMinMaxResourceConfigurartionPerQueue()
292308
rm.close();
293309
}
294310

311+
@Test
312+
public void testNormalizationAfterNodeRemoval() throws Exception {
313+
CapacitySchedulerConfiguration csConf = setupNormalizationConfiguration();
314+
csConf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class,
315+
ResourceScheduler.class);
316+
317+
MockRM rm = new MockRM(csConf);
318+
319+
rm.start();
320+
rm.registerNode("h1:1234", 8 * GB, 4);
321+
rm.registerNode("h2:1234", 8 * GB, 4);
322+
rm.registerNode("h3:1234", 8 * GB, 4);
323+
MockNM nm = rm.registerNode("h4:1234", 8 * GB, 4);
324+
rm.registerNode("h5:1234", 28 * GB, 12);
325+
326+
// Send a removal event to CS. MockRM#unregisterNode does not reflect the real-world scenario,
327+
// so we need to invoke this removal event manually.
328+
CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
329+
cs.handle(new NodeRemovedSchedulerEvent(rm.getRMContext().getRMNodes().get(nm.getNodeId())));
330+
331+
Resource res = Resources.add(
332+
cs.getQueue(QUEUEA1_FULL.getFullPath()).getEffectiveCapacity(""),
333+
cs.getQueue(QUEUEA2_FULL.getFullPath()).getEffectiveCapacity(""));
334+
Resource resParent = cs.getQueue(QUEUEA_FULL.getFullPath()).getEffectiveCapacity("");
335+
336+
// Check that the child queues together do not overcommit their parent's effective resources
337+
Assert.assertTrue(String.format("Summarized resource %s of all children is greater than " +
338+
"their parent's %s", res, resParent),
339+
Resources.lessThan(cs.getResourceCalculator(), cs.getClusterResource(), res, resParent));
340+
}
341+
295342
@Test
296343
public void testEffectiveMinMaxResourceConfigurartionPerQueue()
297344
throws Exception {

0 commit comments

Comments
 (0)