Skip to content

Commit 18c514c

Browse files
author
zhangyizhong
committed
Support automatically killing container when the node load is high
1 parent 2a425d0 commit 18c514c

File tree

11 files changed

+176
-10
lines changed

11 files changed

+176
-10
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerExitStatus.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,4 +83,9 @@ public class ContainerExitStatus {
8383
*/
8484
public static final int KILLED_FOR_EXCESS_LOGS = -109;
8585

86+
/**
87+
* Container was terminated because it exceeded its CPU limit.
88+
*/
89+
public static final int KILLED_EXCEEDED_PCORE = -110;
90+
8691
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1652,6 +1652,21 @@ public static boolean isAclEnabled(Configuration conf) {
16521652
+ "elastic-memory-control.enabled";
16531653
public static final boolean DEFAULT_NM_ELASTIC_MEMORY_CONTROL_ENABLED = false;
16541654

1655+
/** Specifies whether physical core check is enabled. */
1656+
public static final String NM_PCORE_CHECK_ENABLED = NM_PREFIX
1657+
+ "pcore-check-enabled";
1658+
public static final boolean DEFAULT_NM_PCORE_CHECK_ENABLED = true;
1659+
1660+
/** Specifies the maximum pcore usage ratio a container may reach before it is killed (1.0 means 100%). */
1661+
public static final String NM_PCORE_LIMIT_RATIO = NM_PREFIX
1662+
+ "pcore-limit-ratio";
1663+
public static final float DEFAULT_NM_PCORE_LIMIT_RATIO = 1.0f;
1664+
1665+
/** Specifies how many consecutive checks a container may exceed the pcore ratio limit before it is killed. */
1666+
public static final String NM_PCORE_LIMIT_TIMES = NM_PREFIX
1667+
+ "pcore-limit-times";
1668+
public static final int DEFAULT_NM_PCORE_LIMIT_TIMES = 3;
1669+
16551670
/** Specifies the OOM handler code. */
16561671
public static final String NM_ELASTIC_MEMORY_CONTROL_OOM_HANDLER = NM_PREFIX
16571672
+ "elastic-memory-control.oom-handler";

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ResourceView.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,7 @@ public interface ResourceView {
3636

3737
boolean isPmemCheckEnabled();
3838

39+
boolean isPcoreCheckEnabled();
40+
3941
long getVCoresAllocatedForContainers();
4042
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

Lines changed: 114 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252

5353
import java.util.Arrays;
5454
import java.io.File;
55+
import java.util.HashMap;
5556
import java.util.Map;
5657
import java.util.List;
5758
import java.util.Map.Entry;
@@ -109,6 +110,11 @@ public class ContainersMonitorImpl extends AbstractService implements
109110
private static final long UNKNOWN_MEMORY_LIMIT = -1L;
110111
private int nodeCpuPercentageForYARN;
111112

113+
private boolean pcoreCheckEnabled;
114+
private float cpuLimitRatio;
115+
private int cpuLimitTimes;
116+
private Map<ContainerId,Integer> cpuExceedTimesMap = new HashMap<>();
117+
112118
/**
113119
* Type of container metric.
114120
*/
@@ -210,6 +216,16 @@ protected void serviceInit(Configuration myConf) throws Exception {
210216
LOG.info("Elastic memory control enabled: {}", elasticMemoryEnforcement);
211217
LOG.info("Strict memory control enabled: {}", strictMemoryEnforcement);
212218

219+
pcoreCheckEnabled = conf.getBoolean(YarnConfiguration.NM_PCORE_CHECK_ENABLED,
220+
YarnConfiguration.DEFAULT_NM_PCORE_CHECK_ENABLED);
221+
222+
// if cpuLimitRatio is 3.0, it means currentPcoreUsagePercentage shouldn't exceed 300
223+
cpuLimitRatio = conf.getFloat(YarnConfiguration.NM_PCORE_LIMIT_RATIO,
224+
YarnConfiguration.DEFAULT_NM_PCORE_LIMIT_RATIO);
225+
cpuLimitTimes = conf.getInt(YarnConfiguration.NM_PCORE_LIMIT_TIMES,
226+
YarnConfiguration.DEFAULT_NM_PCORE_LIMIT_TIMES);
227+
LOG.info("Physical core check enabled: " + pcoreCheckEnabled);
228+
213229
if (elasticMemoryEnforcement) {
214230
if (!CGroupElasticMemoryController.isAvailable()) {
215231
// Test for availability outside the constructor
@@ -463,20 +479,51 @@ private boolean isProcessTreeOverLimit(String containerId,
463479

464480
if (currentMemUsage > (2 * memLimit)) {
465481
LOG.warn("Process tree for container: {} running over twice "
466-
+ "the configured limit. Limit={}, current usage = {}",
482+
+ "the configured memory limit. Limit={}, current memory usage = {}",
467483
containerId, memLimit, currentMemUsage);
468484
isOverLimit = true;
469485
} else if (curMemUsageOfAgedProcesses > memLimit) {
470486
LOG.warn("Process tree for container: {} has processes older than 1 "
471-
+ "iteration running over the configured limit. "
472-
+ "Limit={}, current usage = {}",
487+
+ "iteration running over the configured memory limit. "
488+
+ "Limit={}, current memory usage = {}",
473489
containerId, memLimit, curMemUsageOfAgedProcesses);
474490
isOverLimit = true;
475491
}
476492

477493
return isOverLimit;
478494
}
479495

496+
/**
497+
* Container exceeding cpu limit `cpuLimitRatio` for more than
498+
* yarn.nodemanager.pcore-limit-times * yarn.nodemanager.container-monitor.interval-ms,
499+
* will be killed automatically.
500+
*/
501+
boolean isProcessTreeOverLimit(
502+
ContainerId containerId, float currentPcoreUsagePercentage) {
503+
boolean isOverLimit = false;
504+
if (currentPcoreUsagePercentage > 100 * cpuLimitRatio) {
505+
int cpuExceedTimes =
506+
cpuExceedTimesMap.getOrDefault(containerId, 0);
507+
cpuExceedTimes++;
508+
LOG.warn("Process tree for container: " + containerId
509+
+ " running over " + "the configured CPU limit. Limit="
510+
+ 100 * cpuLimitRatio + ", current usage = "
511+
+ currentPcoreUsagePercentage + ", cpuExceedTimes ="
512+
+ cpuExceedTimes);
513+
if (cpuExceedTimes >= cpuLimitTimes) {
514+
isOverLimit = true;
515+
cpuExceedTimesMap.remove(containerId);
516+
LOG.warn("Container " + containerId +
517+
" meets the max cpu limit times " + cpuLimitTimes);
518+
} else {
519+
cpuExceedTimesMap.put(containerId, cpuExceedTimes);
520+
}
521+
} else {
522+
cpuExceedTimesMap.remove(containerId);
523+
}
524+
return isOverLimit;
525+
}
526+
480527
// method provided just for easy testing purposes
481528
boolean isProcessTreeOverLimit(ResourceCalculatorProcessTree pTree,
482529
String containerId, long limit) {
@@ -537,6 +584,8 @@ public void run() {
537584
pTree.updateProcessTree(); // update process-tree
538585
long currentVmemUsage = pTree.getVirtualMemorySize();
539586
long currentPmemUsage = pTree.getRssMemorySize();
587+
float currentPcoreUsagePercentage =
588+
pTree.getCpuUsagePercent() / ptInfo.getCpuVcores();
540589
if (currentVmemUsage < 0 || currentPmemUsage < 0) {
541590
// YARN-6862/YARN-5021 If the container just exited or for
542591
// another reason the physical/virtual memory is UNAVAILABLE (-1)
@@ -556,6 +605,12 @@ public void run() {
556605
LOG.info("Skipping monitoring container {} since "
557606
+ "CPU usage is not yet available.", containerId);
558607
continue;
608+
} else {
609+
LOG.info(String.format(
610+
"CPU usage of ProcessTree %s for container-id %s: ",
611+
pId, containerId.toString()) +
612+
String.format("%s of %s per physical core used; ",
613+
currentPcoreUsagePercentage, 100 * cpuLimitRatio));
559614
}
560615

561616
recordUsage(containerId, pId, pTree, ptInfo, currentVmemUsage,
@@ -599,6 +654,10 @@ public void run() {
599654
trackedContainersUtilization.getCPU());
600655
}
601656

657+
// Remove exceed-counters of containers that are no longer tracked
658+
cpuExceedTimesMap.entrySet().removeIf(
659+
e -> !trackingContainers.containsKey(e.getKey()));
660+
602661
try {
603662
Thread.sleep(monitoringInterval);
604663
} catch (InterruptedException e) {
@@ -752,6 +811,7 @@ private void checkLimit(ContainerId containerId, String pId,
752811
return;
753812
}
754813
boolean isMemoryOverLimit = false;
814+
boolean isCpuOverLimit = false;
755815
String msg = "";
756816
int containerExitStatus = ContainerExitStatus.INVALID;
757817

@@ -761,6 +821,9 @@ private void checkLimit(ContainerId containerId, String pId,
761821
// are processes more than 1 iteration old.
762822
long curMemUsageOfAgedProcesses = pTree.getVirtualMemorySize(1);
763823
long curRssMemUsageOfAgedProcesses = pTree.getRssMemorySize(1);
824+
float currentPcoreUsagePercentage =
825+
pTree.getCpuUsagePercent() / ptInfo.getCpuVcores() *
826+
maxVCoresAllottedForContainers / resourceCalculatorPlugin.getNumProcessors();
764827
if (isVmemCheckEnabled()
765828
&& isProcessTreeOverLimit(containerId.toString(),
766829
currentVmemUsage, curMemUsageOfAgedProcesses, vmemLimit)) {
@@ -771,7 +834,7 @@ && isProcessTreeOverLimit(containerId.toString(),
771834
// Container (the root process) is still alive and overflowing
772835
// memory.
773836
// Dump the process-tree and then clean it up.
774-
msg = formatErrorMessage("virtual",
837+
msg = formatMemoryErrorMessage("virtual",
775838
formatUsageString(currentVmemUsage, vmemLimit,
776839
currentPmemUsage, pmemLimit),
777840
pId, containerId, pTree, delta);
@@ -788,19 +851,28 @@ && isProcessTreeOverLimit(containerId.toString(),
788851
// Container (the root process) is still alive and overflowing
789852
// memory.
790853
// Dump the process-tree and then clean it up.
791-
msg = formatErrorMessage("physical",
854+
msg = formatMemoryErrorMessage("physical",
792855
formatUsageString(currentVmemUsage, vmemLimit,
793856
currentPmemUsage, pmemLimit),
794857
pId, containerId, pTree, delta);
795858
isMemoryOverLimit = true;
796859
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PMEM;
860+
} else if (isPcoreCheckEnabled()
861+
&& isProcessTreeOverLimit(containerId,
862+
currentPcoreUsagePercentage)) {
863+
// Container (the root process) is still alive and exceeding its CPU limit.
864+
// Dump the process-tree and then clean it up.
865+
msg = formatCpuErrorMessage(currentPcoreUsagePercentage, cpuLimitRatio,
866+
pId, containerId, pTree);
867+
isCpuOverLimit = true;
868+
containerExitStatus = ContainerExitStatus.KILLED_EXCEEDED_PCORE;
797869
}
798870

799-
if (isMemoryOverLimit
871+
872+
if ((isMemoryOverLimit || isCpuOverLimit)
800873
&& trackingContainers.remove(containerId) != null) {
801-
// Virtual or physical memory over limit. Fail the container and
802-
// remove
803-
// the corresponding process tree
874+
// Virtual or physical memory or physical core over limit.
875+
// Fail the container and remove the corresponding process tree.
804876
LOG.warn(msg);
805877
// warn if not a leader
806878
if (!pTree.checkPidPgrpidForMatch()) {
@@ -846,7 +918,7 @@ private void reportResourceUsage(ContainerId containerId,
846918
* @param pTree process tree to dump full resource utilization graph
847919
* @return formatted resource usage information
848920
*/
849-
private String formatErrorMessage(String memTypeExceeded,
921+
private String formatMemoryErrorMessage(String memTypeExceeded,
850922
String usageString, String pId, ContainerId containerId,
851923
ResourceCalculatorProcessTree pTree, long delta) {
852924
return
@@ -859,6 +931,28 @@ private String formatErrorMessage(String memTypeExceeded,
859931
pTree.getProcessTreeDump();
860932
}
861933

934+
/**
935+
* Format string when the CPU limit has been exceeded.
936+
* @param currentPcoreUsagePercentage current pcore usage
937+
* @param pcoreLimitRatio pcore limit threshold
938+
* @param pId process id
939+
* @param containerId container id
940+
* @param pTree process tree to dump full resource utilization graph
941+
* @return formatted resource usage information
942+
*/
943+
private String formatCpuErrorMessage(
944+
float currentPcoreUsagePercentage, float pcoreLimitRatio,
945+
String pId, ContainerId containerId, ResourceCalculatorProcessTree pTree) {
946+
return
947+
String.format("Container [pid=%s,containerID=%s] is running beyond %s cpu limits. ",
948+
pId, containerId, pcoreLimitRatio * 100) +
949+
"Current usage: " + currentPcoreUsagePercentage +
950+
". Killing container.\n" +
951+
"Dump of the process-tree for " + containerId + " :\n" +
952+
pTree.getProcessTreeDump();
953+
}
954+
955+
862956
/**
863957
* Format memory usage string for reporting.
864958
* @param currentVmemUsage virtual memory usage
@@ -1013,6 +1107,16 @@ public boolean isPmemCheckEnabled() {
10131107
return this.pmemCheckEnabled;
10141108
}
10151109

1110+
/**
1111+
* Is the total physical core check enabled?
1112+
*
1113+
* @return true if total physical core check is enabled.
1114+
*/
1115+
@Override
1116+
public boolean isPcoreCheckEnabled() {
1117+
return this.pcoreCheckEnabled;
1118+
}
1119+
10161120
@Override
10171121
public long getPmemAllocatedForContainers() {
10181122
return this.maxPmemAllottedForContainers;

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMContainerWebSocket.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,11 @@ public boolean isVmemCheckEnabled() {
101101
public boolean isPmemCheckEnabled() {
102102
return true;
103103
}
104+
105+
@Override
106+
public boolean isPcoreCheckEnabled() {
107+
return false;
108+
}
104109
};
105110
conf.set(YarnConfiguration.NM_LOCAL_DIRS, TESTROOTDIR.getAbsolutePath());
106111
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServer.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,11 @@ public boolean isVmemCheckEnabled() {
108108
public boolean isPmemCheckEnabled() {
109109
return true;
110110
}
111+
112+
@Override
113+
public boolean isPcoreCheckEnabled() {
114+
return false;
115+
}
111116
};
112117
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
113118
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());
@@ -171,6 +176,11 @@ public boolean isVmemCheckEnabled() {
171176
public boolean isPmemCheckEnabled() {
172177
return true;
173178
}
179+
180+
@Override
181+
public boolean isPcoreCheckEnabled() {
182+
return false;
183+
}
174184
};
175185
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
176186
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServices.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,11 @@ public boolean isVmemCheckEnabled() {
174174
public boolean isPmemCheckEnabled() {
175175
return true;
176176
}
177+
178+
@Override
179+
public boolean isPcoreCheckEnabled() {
180+
return false;
181+
}
177182
};
178183
nmWebApp = new NMWebApp(resourceView, aclsManager, dirsHandler);
179184
bind(JAXBContextResolver.class);

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesApps.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,11 @@ public boolean isVmemCheckEnabled() {
141141
public boolean isPmemCheckEnabled() {
142142
return true;
143143
}
144+
145+
@Override
146+
public boolean isPcoreCheckEnabled() {
147+
return false;
148+
}
144149
};
145150
nmWebApp = new NMWebApp(resourceView, aclsManager, dirsHandler);
146151
bind(JAXBContextResolver.class);

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesAuxServices.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ public boolean isVmemCheckEnabled() {
120120
public boolean isPmemCheckEnabled() {
121121
return true;
122122
}
123+
124+
@Override
125+
public boolean isPcoreCheckEnabled() {
126+
return false;
127+
}
123128
};
124129
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
125130
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestNMWebServicesContainers.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,11 @@ public boolean isVmemCheckEnabled() {
125125
public boolean isPmemCheckEnabled() {
126126
return true;
127127
}
128+
129+
@Override
130+
public boolean isPcoreCheckEnabled() {
131+
return false;
132+
}
128133
};
129134
conf.set(YarnConfiguration.NM_LOCAL_DIRS, testRootDir.getAbsolutePath());
130135
conf.set(YarnConfiguration.NM_LOG_DIRS, testLogDir.getAbsolutePath());

0 commit comments

Comments
 (0)