Skip to content

Commit 7fc91d0

Browse files
CDPD-75272 YARN-11709. NodeManager should be marked unhealthy on localizer config issues (apache#7043) (apache#159)
(cherry picked from commit d1311e5) Co-authored-by: Benjamin Teke <[email protected]> (cherry picked from commit 468b7fcda0d8c3e8dd1c28cb913d51dd730c83ff)
1 parent 1493e57 commit 7fc91d0

File tree

4 files changed

+87
-8
lines changed

4 files changed

+87
-8
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -164,9 +164,10 @@ public Path localizeClasspathJar(Path jarPath, Path target, String owner)
164164
* for starting a localizer.
165165
* @throws IOException for most application init failures
166166
* @throws InterruptedException if application init thread is halted by NM
167+
* @throws ConfigurationException if config error was found
167168
*/
168169
public abstract void startLocalizer(LocalizerStartContext ctx)
169-
throws IOException, InterruptedException;
170+
throws IOException, InterruptedException, ConfigurationException;
170171

171172
/**
172173
* Prepare the container prior to the launch environment being written.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

Lines changed: 21 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -344,7 +344,7 @@ public void init(Context context) throws IOException {
344344

345345
@Override
346346
public void startLocalizer(LocalizerStartContext ctx)
347-
throws IOException, InterruptedException {
347+
throws IOException, InterruptedException, ConfigurationException {
348348
Path nmPrivateContainerTokensPath = ctx.getNmPrivateContainerTokens();
349349
InetSocketAddress nmAddr = ctx.getNmAddr();
350350
String user = ctx.getUser();
@@ -395,9 +395,9 @@ public void startLocalizer(LocalizerStartContext ctx)
395395
localizerArgs = replaceWithContainerLogDir(localizerArgs, containerLogDir);
396396

397397
initializeContainerOp.appendArgs(localizerArgs);
398+
Configuration conf = super.getConf();
398399

399400
try {
400-
Configuration conf = super.getConf();
401401
PrivilegedOperationExecutor privilegedOperationExecutor =
402402
getPrivilegedOperationExecutor();
403403

@@ -409,6 +409,25 @@ public void startLocalizer(LocalizerStartContext ctx)
409409
LOG.warn("Exit code from container " + locId + " startLocalizer is : "
410410
+ exitCode, e);
411411

412+
if (exitCode ==
413+
ExitCode.INVALID_CONTAINER_EXEC_PERMISSIONS.getExitCode() ||
414+
exitCode == ExitCode.INVALID_CONFIG_FILE.getExitCode()) {
415+
throw new ConfigurationException("Application " + appId + " initialization failed" +
416+
" (exitCode=" + exitCode + ") with an unrecoverable config error. " +
417+
"Output: " + e.getOutput(), e);
418+
}
419+
420+
// Check if the failure was due to a missing container-executor binary
421+
Throwable cause = e.getCause() != null ? e.getCause() : e;
422+
if (cause instanceof IOException) {
423+
IOException io = (IOException) cause;
424+
if (io.getMessage().contains("No such file or directory")) {
425+
throw new ConfigurationException("Application " + appId + " initialization failed" +
426+
"(exitCode=" + exitCode + "). Container executor not found at "
427+
+ getContainerExecutorExecutablePath(conf), e);
428+
}
429+
}
430+
412431
throw new IOException("Application " + appId + " initialization failed" +
413432
" (exitCode=" + exitCode + ") with output: " + e.getOutput(), e);
414433
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import static org.apache.hadoop.fs.CreateFlag.CREATE;
2121
import static org.apache.hadoop.fs.CreateFlag.OVERWRITE;
2222

23+
import org.apache.hadoop.yarn.exceptions.ConfigurationException;
2324
import org.apache.hadoop.yarn.server.nodemanager.recovery.RecoveryIterator;
2425
import org.slf4j.Logger;
2526
import org.slf4j.LoggerFactory;
@@ -1260,10 +1261,13 @@ public void run() {
12601261
throw new IOException("All disks failed. "
12611262
+ dirsHandler.getDisksHealthReport(false));
12621263
}
1263-
// TODO handle ExitCodeException separately?
1264-
} catch (FSError fe) {
1265-
exception = fe;
1266-
} catch (Exception e) {
1264+
// TODO handle ExitCodeException separately?
1265+
} catch (ConfigurationException e) {
1266+
exception = e;
1267+
LOG.error("Failed to launch localizer for " + localizerId + ", due to configuration error. " +
1268+
"Marking the node unhealthy.", e);
1269+
nmContext.getNodeStatusUpdater().reportException(e);
1270+
} catch (Exception | FSError e) {
12671271
exception = e;
12681272
} finally {
12691273
if (exception != null) {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutorWithMocks.java

Lines changed: 56 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -335,7 +335,7 @@ public void testStartLocalizer() throws IOException {
335335
Assert.assertEquals(result.get(22), "localhost");
336336
Assert.assertEquals(result.get(23), "8040");
337337

338-
} catch (InterruptedException e) {
338+
} catch (ConfigurationException | InterruptedException e) {
339339
LOG.error("Error:"+e.getMessage(),e);
340340
Assert.fail();
341341
}
@@ -642,6 +642,61 @@ protected PrivilegedOperationExecutor getPrivilegedOperationExecutor() {
642642
e.getMessage().contains("exitCode"));
643643
}
644644

645+
final int[] exitCodesToThrow = {
646+
LinuxContainerExecutor.ExitCode.INVALID_CONTAINER_EXEC_PERMISSIONS.getExitCode(),
647+
LinuxContainerExecutor.ExitCode.INVALID_CONFIG_FILE.getExitCode(),
648+
};
649+
650+
for (int i = 0; i < exitCodesToThrow.length; i++) {
651+
int exitCode = exitCodesToThrow[i];
652+
doThrow(new PrivilegedOperationException("invalid config", exitCode, null, null))
653+
.when(spyPrivilegedExecutor).executePrivilegedOperation(
654+
any(), any(PrivilegedOperation.class),
655+
any(), any(), anyBoolean(), anyBoolean());
656+
657+
try {
658+
lce.startLocalizer(new LocalizerStartContext.Builder()
659+
.setNmPrivateContainerTokens(nmPrivateCTokensPath)
660+
.setNmAddr(address)
661+
.setUser(appSubmitter)
662+
.setAppId(appId.toString())
663+
.setLocId("12345")
664+
.setDirsHandler(dirService)
665+
.build());
666+
Assert.fail("startLocalizer should have thrown a ConfigurationException");
667+
} catch (ConfigurationException e) {
668+
assertTrue("Unexpected exception " + e,
669+
e.getMessage().contains("exitCode=" + exitCode));
670+
}
671+
}
672+
673+
doThrow(new PrivilegedOperationException("IO error",
674+
new IOException("No such file or directory")))
675+
.when(spyPrivilegedExecutor).executePrivilegedOperation(
676+
any(), any(PrivilegedOperation.class),
677+
any(), any(), anyBoolean(), anyBoolean());
678+
679+
try {
680+
lce.startLocalizer(new LocalizerStartContext.Builder()
681+
.setNmPrivateContainerTokens(nmPrivateCTokensPath)
682+
.setNmAddr(address)
683+
.setUser(appSubmitter)
684+
.setAppId(appId.toString())
685+
.setLocId("12345")
686+
.setDirsHandler(dirService)
687+
.build());
688+
Assert.fail("startLocalizer should have thrown a ConfigurationException");
689+
} catch (ConfigurationException e) {
690+
assertTrue("Unexpected exception " + e,
691+
e.getMessage().contains("Container executor not found"));
692+
}
693+
694+
695+
doThrow(new PrivilegedOperationException("interrupted"))
696+
.when(spyPrivilegedExecutor).executePrivilegedOperation(
697+
any(), any(PrivilegedOperation.class),
698+
any(), any(), anyBoolean(), anyBoolean());
699+
645700
lce.activateContainer(cid, new Path(workDir, "pid.txt"));
646701
lce.launchContainer(new ContainerStartContext.Builder()
647702
.setContainer(container)

0 commit comments

Comments
 (0)