Commit 4cf4cb6

Fix crash issue in RM unit tests.
1 parent dd8cd2e commit 4cf4cb6

11 files changed, +140 -79 lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/ParameterizedSchedulerTestBase.java

Lines changed: 15 additions & 8 deletions
@@ -31,18 +31,20 @@
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair
     .allocationfile.AllocationFileWriter;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.io.TempDir;

 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.stream.Collectors;

 public abstract class ParameterizedSchedulerTestBase {
-  protected final static String TEST_DIR =
-      new File(System.getProperty("test.build.data", "/tmp")).getAbsolutePath();
-  private final static String FS_ALLOC_FILE =
-      new File(TEST_DIR, "test-fs-queues.xml").getAbsolutePath();
+  @TempDir
+  Path tempDir;
+  private File fsAllocFile;

   public enum SchedulerType {
     CAPACITY, FAIR
@@ -100,17 +102,22 @@ protected void configureFairScheduler(YarnConfiguration configuration) {
             .minSharePreemptionTimeout(120)
             .fairSharePreemptionThreshold(.5)
             .build())
-        .writeToFile(FS_ALLOC_FILE);
+        .writeToFile(fsAllocFile.getAbsolutePath());

     configuration.set(FairSchedulerConfiguration.ALLOCATION_FILE,
-        FS_ALLOC_FILE);
+        fsAllocFile.getAbsolutePath());
     configuration.setLong(FairSchedulerConfiguration.UPDATE_INTERVAL_MS, 10);
   }

-  @AfterEach
+  @BeforeEach
+  public void setUp() throws IOException {
+    fsAllocFile = tempDir.resolve("test-fs-queues.xml").toFile();
+  }
+
+  @AfterEach
   public void tearDown() {
     if (schedulerType == SchedulerType.FAIR) {
-      (new File(FS_ALLOC_FILE)).delete();
+      fsAllocFile.delete();
     }
   }
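The change above is the core of the fix: the shared static path under test.build.data (typically /tmp) is replaced by a JUnit 5 @TempDir, so each run gets an isolated directory that JUnit creates before the test and deletes recursively afterwards, and repeated or parallel runs can no longer collide on the same file. A minimal sketch of the mechanism — the test class and file name here are illustrative, not part of this patch:

    import java.io.File;
    import java.io.IOException;
    import java.nio.file.Path;
    import org.junit.jupiter.api.Test;
    import org.junit.jupiter.api.io.TempDir;
    import static org.junit.jupiter.api.Assertions.assertTrue;

    class TempDirSketchTest {
      // JUnit injects a fresh directory here and removes it, recursively,
      // once the test finishes.
      @TempDir
      Path tempDir;

      @Test
      void writesIntoIsolatedDirectory() throws IOException {
        // Resolve files against the managed directory instead of a fixed /tmp path.
        File allocFile = tempDir.resolve("test-fs-queues.xml").toFile();
        assertTrue(allocFile.createNewFile());
      }
    }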

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestApplicationACLs.java

Lines changed: 1 addition & 0 deletions
@@ -189,6 +189,7 @@ public void tearDown() {
     if(resourceManager != null) {
       resourceManager.stop();
     }
+    super.tearDown();
   }

   @Override

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRM.java

Lines changed: 1 addition & 0 deletions
@@ -107,6 +107,7 @@ public void tearDown() {
     ClusterMetrics.destroy();
     QueueMetrics.clearQueueMetrics();
     DefaultMetricsSystem.shutdown();
+    super.tearDown();
   }

   @ParameterizedTest(name = "{0}")
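Several of the one-line additions in this commit are the same fix: subclasses of ParameterizedSchedulerTestBase override tearDown() without invoking the base version, so the base class's cleanup (deleting the generated fair-scheduler allocation file) never runs. Schematically, the corrected shape is:

    @AfterEach
    public void tearDown() {
      // subclass-specific cleanup first ...
      DefaultMetricsSystem.shutdown();
      // ... then chain to the base class so its own cleanup runs too.
      super.tearDown();
    }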

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java

Lines changed: 12 additions & 5 deletions
@@ -41,6 +41,8 @@
 import java.io.IOException;
 import java.net.InetSocketAddress;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.EnumSet;
@@ -139,6 +141,7 @@
 import org.apache.hadoop.thirdparty.com.google.common.collect.ImmutableMap;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Timeout;
+import org.junit.jupiter.api.io.TempDir;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.MethodSource;
 import org.slf4j.Logger;
@@ -148,9 +151,11 @@
 public class TestRMRestart extends ParameterizedSchedulerTestBase {
   private static final Logger LOG =
       LoggerFactory.getLogger(TestRMRestart.class);
-  private final static File TEMP_DIR = new File(System.getProperty(
-      "test.build.data", "/tmp"), "decommision");
-  private File hostFile = new File(TEMP_DIR + File.separator + "hostFile.txt");
+  @TempDir
+  Path tempDir;
+  private Path decommissionDir;
+  private File hostFile;
+
   private YarnConfiguration conf;

   // Fake rmAddr for token-renewal
@@ -163,6 +168,9 @@ public void initTestRMRestart(SchedulerType type) throws IOException {
   }

   public void setup() throws IOException {
+
+    decommissionDir = Files.createDirectory(tempDir.resolve("decommission"));
+    hostFile = decommissionDir.resolve("hostFile.txt").toFile();
     conf = getConf();
     GenericTestUtils.setRootLogLevel(Level.DEBUG);
     UserGroupInformation.setConfiguration(conf);
@@ -182,7 +190,7 @@ public void tearDown() {
     }
     rms.clear();

-    TEMP_DIR.delete();
+    super.tearDown();
   }

   /**
@@ -2407,7 +2415,6 @@ protected void serviceStart() throws Exception {

   private void writeToHostsFile(String... hosts) throws IOException {
     if (!hostFile.exists()) {
-      TEMP_DIR.mkdirs();
       hostFile.createNewFile();
     }
     FileOutputStream fStream = null;

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestWorkPreservingRMRestart.java

Lines changed: 1 addition & 0 deletions
@@ -152,6 +152,7 @@ public void tearDown() {
       rm2.stop();
     }
     conf = null;
+    super.tearDown();
   }

   // Test common scheduler state including SchedulerAttempt, SchedulerNode,

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/reservation/TestReservationSystem.java

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ public void tearDown() {
     scheduler = null;
     clearRMContext();
     QueueMetrics.clearQueueMetrics();
+    super.tearDown();
   }

   @ParameterizedTest(name = "{0}")

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/resourcetracker/TestNMReconnect.java

Lines changed: 4 additions & 1 deletion
@@ -130,7 +130,10 @@ public void setUp() {

   @AfterEach
   public void tearDown() {
-    resourceTrackerService.stop();
+    if(resourceTrackerService!=null) {
+      resourceTrackerService.stop();
+    }
+    super.tearDown();
   }

   @ParameterizedTest(name = "{0}")

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestApplicationLifetimeMonitor.java

Lines changed: 81 additions & 64 deletions
@@ -64,6 +64,7 @@
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueuePath;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
 import org.apache.hadoop.yarn.util.Times;
+import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.MethodSource;
 import org.slf4j.event.Level;
@@ -96,6 +97,11 @@ private void initTestApplicationLifetimeMonitor(Class schedulerParameter)
     setup();
   }

+  @AfterEach
+  public void tearDown() {
+    conf = null;
+  }
+
   public void setup() throws IOException {
     if (scheduler.equals(CapacityScheduler.class)) {
       // Since there is limited lifetime monitoring support in fair scheduler
@@ -233,6 +239,7 @@ public void testApplicationLifetimeMonitor(Class schedulerParameter)
         assertTrue(totalTimeRun < ((maxLifetime + 10L) * 1000),
             "Application killed before lifetime value " + totalTimeRun);
       }
+      nm1.unRegisterNode();
     } finally {
       stopRM(rm);
     }
@@ -248,72 +255,78 @@ public void testApplicationLifetimeOnRMRestart(Class schedulerParameter) throws
         true);
     conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());

-    MockRM rm1 = new MockRM(conf);
-    MemoryRMStateStore memStore = (MemoryRMStateStore) rm1.getRMStateStore();
-    rm1.start();
-    MockNM nm1 =
-        new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
-    nm1.registerNode();
-    nm1.nodeHeartbeat(true);
-
-    long appLifetime = 30L;
-    Map<ApplicationTimeoutType, Long> timeouts =
-        new HashMap<ApplicationTimeoutType, Long>();
-    timeouts.put(ApplicationTimeoutType.LIFETIME, appLifetime);
-    RMApp app1 = MockRMAppSubmitter.submit(rm1,
-        MockRMAppSubmissionData.Builder.createWithMemory(200, rm1)
-            .withAppPriority(Priority.newInstance(0))
-            .withApplicationTimeouts(timeouts)
-            .build());
-    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
-
-    // Re-start RM
-    MockRM rm2 = new MockRM(conf, memStore);
-
-    // make sure app has been unregistered with old RM else both will trigger
-    // Expire event
-    rm1.getRMContext().getRMAppLifetimeMonitor().unregisterApp(
-        app1.getApplicationId(), ApplicationTimeoutType.LIFETIME);
-
-    rm2.start();
-    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
-
-    // recover app
-    RMApp recoveredApp1 =
+    MockRM rm1 = null;
+    MockRM rm2 = null;
+    try {
+      rm1 = new MockRM(conf);
+      MemoryRMStateStore memStore = (MemoryRMStateStore) rm1.getRMStateStore();
+      rm1.start();
+      MockNM nm1 =
+          new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
+      nm1.registerNode();
+      nm1.nodeHeartbeat(true);
+
+      long appLifetime = 30L;
+      Map<ApplicationTimeoutType, Long> timeouts = new HashMap<ApplicationTimeoutType, Long>();
+      timeouts.put(ApplicationTimeoutType.LIFETIME, appLifetime);
+      RMApp app1 = MockRMAppSubmitter.submit(rm1,
+          MockRMAppSubmissionData.Builder.createWithMemory(200, rm1)
+              .withAppPriority(Priority.newInstance(0))
+              .withApplicationTimeouts(timeouts)
+              .build());
+      MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+
+      // Re-start RM
+      rm2 = new MockRM(conf, memStore);
+
+      // make sure app has been unregistered with old RM else both will trigger
+      // Expire event
+      rm1.getRMContext().getRMAppLifetimeMonitor().unregisterApp(
+          app1.getApplicationId(), ApplicationTimeoutType.LIFETIME);
+
+      rm2.start();
+      nm1.setResourceTrackerService(rm2.getResourceTrackerService());
+
+      // recover app
+      RMApp recoveredApp1 =
           rm2.getRMContext().getRMApps().get(app1.getApplicationId());

-    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(
-        am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
-    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(
-        am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
-
-    nm1.registerNode(Arrays.asList(amContainer, runningContainer), null);
-
-    // Wait for RM to settle down on recovering containers;
-    TestWorkPreservingRMRestart.waitForNumContainersToRecover(2, rm2,
-        am1.getApplicationAttemptId());
-    Set<ContainerId> launchedContainers =
-        ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId()))
-            .getLaunchedContainers();
-    assertTrue(launchedContainers.contains(amContainer.getContainerId()));
-    assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
-
-    // check RMContainers are re-recreated and the container state is correct.
-    rm2.waitForState(nm1, amContainer.getContainerId(),
-        RMContainerState.RUNNING);
-    rm2.waitForState(nm1, runningContainer.getContainerId(),
-        RMContainerState.RUNNING);
-
-    // re register attempt to rm2
-    rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.ACCEPTED);
-    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
-    am1.registerAppAttempt();
-    rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.RUNNING);
-
-    // wait for app life time and application to be in killed state.
-    rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.KILLED);
-    assertTrue(recoveredApp1.getFinishTime() > (recoveredApp1.getSubmitTime()
-        + appLifetime * 1000), "Application killed before lifetime value");
+      NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(
+          am1.getApplicationAttemptId(), 1, ContainerState.RUNNING);
+      NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(
+          am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
+
+      nm1.registerNode(Arrays.asList(amContainer, runningContainer), null);
+
+      // Wait for RM to settle down on recovering containers;
+      TestWorkPreservingRMRestart.waitForNumContainersToRecover(2, rm2,
+          am1.getApplicationAttemptId());
+      Set<ContainerId> launchedContainers =
+          ((RMNodeImpl) rm2.getRMContext().getRMNodes().get(nm1.getNodeId()))
+              .getLaunchedContainers();
+      assertTrue(launchedContainers.contains(amContainer.getContainerId()));
+      assertTrue(launchedContainers.contains(runningContainer.getContainerId()));
+
+      // check RMContainers are re-recreated and the container state is correct.
+      rm2.waitForState(nm1, amContainer.getContainerId(),
+          RMContainerState.RUNNING);
+      rm2.waitForState(nm1, runningContainer.getContainerId(),
+          RMContainerState.RUNNING);
+
+      // re register attempt to rm2
+      rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.ACCEPTED);
+      am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
+      am1.registerAppAttempt();
+      rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.RUNNING);
+
+      // wait for app life time and application to be in killed state.
+      rm2.waitForState(recoveredApp1.getApplicationId(), RMAppState.KILLED);
+      assertTrue(recoveredApp1.getFinishTime() > (recoveredApp1.getSubmitTime()
+          + appLifetime * 1000), "Application killed before lifetime value");
+    } finally {
+      stopRM(rm1);
+      stopRM(rm2);
+    }
   }

   @Timeout(value = 60)
@@ -394,6 +407,7 @@ public synchronized void updateApplicationStateInternal(
       // verify for app killed with updated lifetime
       assertTrue(app1.getFinishTime() > afterUpdate,
           "Application killed before lifetime value");
+      nm1.unRegisterNode();
     } finally {
       stopRM(rm1);
     }
@@ -458,6 +472,7 @@ public void testInheritAppLifetimeFromParentQueue(Class schedulerParameter) thro
        assertTrue(totalTimeRun < (maxRootLifetime * 1000),
            "Application killed after max lifetime value " + totalTimeRun);
      }
+      nm1.unRegisterNode();
     } finally {
       stopRM(rm);
     }
@@ -519,6 +534,7 @@ public void testOverrideParentQueueMaxAppLifetime(Class schedulerParameter) thro
           "Child queue max lifetime should have overridden"
               + " parent value");
       }
+      nm1.unRegisterNode();
     } finally {
       stopRM(rm);
     }
@@ -584,6 +600,7 @@ public void testOverrideParentQueueDefaultAppLifetime(
           "Child queue default lifetime should have"
               + " overridden parent value");
       }
+      nm1.unRegisterNode();
     } finally {
       stopRM(rm);
     }
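Two cleanup conventions run through the changes above: MockRM instances are declared before the try block and stopped in finally, so a failed assertion cannot leak a running ResourceManager into the next test, and each MockNM unregisters before teardown so stale node state does not linger. A condensed sketch of that shape — stopRM is the test class's existing helper, and the refactor relies on it tolerating a null argument:

    MockRM rm1 = null;
    MockRM rm2 = null;
    try {
      rm1 = new MockRM(conf);
      rm1.start();
      // ... drive the scenario; rm2 may be started from rm1's state store ...
    } finally {
      // Null-safe stop runs even when an assertion above throws.
      stopRM(rm1);
      stopRM(rm2);
    }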

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestAbstractYarnScheduler.java

Lines changed: 9 additions & 0 deletions
@@ -1221,6 +1221,15 @@ public void testResourceRequestRestoreWhenRMContainerIsAtAllocated(SchedulerType
       ContainerId containerId4 =
           ContainerId.newContainerId(am1.getApplicationAttemptId(), 4);
       rm1.waitForState(nm2, containerId4, RMContainerState.RUNNING);
+
+      if (nm1 != null) {
+        nm1.unRegisterNode();
+      }
+
+      if (nm2 != null) {
+        nm2.unRegisterNode();
+      }
+
     } finally {
       rm1.stop();
     }

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/TestSchedulerOvercommit.java

Lines changed: 6 additions & 0 deletions
@@ -170,6 +170,12 @@ public void cleanup() throws Exception {
       am.unregisterAppAttempt();
       am = null;
     }
+
+    if(nm != null) {
+      nm.unRegisterNode();
+      nm = null;
+    }
+
     if (rm != null) {
       rm.drainEvents();
       rm.stop();
