Skip to content

Commit 825db8f

Browse files
Prabhu Joseph
authored and committed
YARN-10107. Fix GpuResourcePlugin#getNMResourceInfo to honor Auto Discovery Enabled
Contributed by Szilard Nemeth.
1 parent e578e52 commit 825db8f

File tree

3 files changed

+91
-18
lines changed

3 files changed

+91
-18
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -136,7 +136,7 @@ public synchronized GpuDeviceInformation getGpuDeviceInformation()
136136
return lastDiscoveredGpuInformation;
137137
}
138138

139-
private boolean isAutoDiscoveryEnabled() {
139+
boolean isAutoDiscoveryEnabled() {
140140
String allowedDevicesStr = getConf().get(
141141
YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
142142
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java

Lines changed: 17 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -94,27 +94,29 @@ public DockerCommandPlugin getDockerCommandPluginInstance() {
9494

9595
@Override
9696
public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
97-
GpuDeviceInformation gpuDeviceInformation;
98-
99-
//At this point the gpu plugin is already enabled
100-
checkGpuResourceHandler();
101-
102-
checkErrorCount();
103-
try{
104-
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
105-
numOfErrorExecutionSinceLastSucceed = 0;
106-
} catch (YarnException e) {
107-
LOG.error(e.getMessage(), e);
108-
numOfErrorExecutionSinceLastSucceed++;
109-
throw e;
97+
final GpuDeviceInformation gpuDeviceInformation;
98+
99+
if (gpuDiscoverer.isAutoDiscoveryEnabled()) {
100+
//At this point the gpu plugin is already enabled
101+
checkGpuResourceHandler();
102+
103+
checkErrorCount();
104+
try{
105+
gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
106+
numOfErrorExecutionSinceLastSucceed = 0;
107+
} catch (YarnException e) {
108+
LOG.error(e.getMessage(), e);
109+
numOfErrorExecutionSinceLastSucceed++;
110+
throw e;
111+
}
112+
} else {
113+
gpuDeviceInformation = null;
110114
}
111-
112115
GpuResourceAllocator gpuResourceAllocator =
113116
gpuResourceHandler.getGpuAllocator();
114117
List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
115118
List<AssignedGpuDevice> assignedGpuDevices =
116119
gpuResourceAllocator.getAssignedGpus();
117-
118120
return new NMGpuResourceInfo(gpuDeviceInformation, totalGpus,
119121
assignedGpuDevices);
120122
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuResourcePlugin.java

Lines changed: 73 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,15 +19,38 @@
1919
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
2020

2121
import static org.mockito.Mockito.mock;
22+
import static org.mockito.Mockito.when;
2223

24+
import com.google.common.collect.Lists;
2325
import org.apache.hadoop.yarn.exceptions.YarnException;
26+
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
27+
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
28+
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.PerGpuDeviceInformation;
29+
import org.junit.Assert;
2430
import org.junit.Test;
31+
import java.util.List;
2532

2633
public class TestGpuResourcePlugin {
2734

35+
private GpuDiscoverer createMockDiscoverer() throws YarnException {
36+
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
37+
when(gpuDiscoverer.isAutoDiscoveryEnabled()).thenReturn(true);
38+
39+
PerGpuDeviceInformation gpu =
40+
new PerGpuDeviceInformation();
41+
gpu.setProductName("testGpu");
42+
List<PerGpuDeviceInformation> gpus = Lists.newArrayList();
43+
gpus.add(gpu);
44+
45+
GpuDeviceInformation gpuDeviceInfo = new GpuDeviceInformation();
46+
gpuDeviceInfo.setGpus(gpus);
47+
when(gpuDiscoverer.getGpuDeviceInformation()).thenReturn(gpuDeviceInfo);
48+
return gpuDiscoverer;
49+
}
50+
2851
@Test(expected = YarnException.class)
2952
public void testResourceHandlerNotInitialized() throws YarnException {
30-
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
53+
GpuDiscoverer gpuDiscoverer = createMockDiscoverer();
3154
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
3255
mock(GpuNodeResourceUpdateHandler.class);
3356

@@ -39,7 +62,7 @@ public void testResourceHandlerNotInitialized() throws YarnException {
3962

4063
@Test
4164
public void testResourceHandlerIsInitialized() throws YarnException {
42-
GpuDiscoverer gpuDiscoverer = mock(GpuDiscoverer.class);
65+
GpuDiscoverer gpuDiscoverer = createMockDiscoverer();
4366
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
4467
mock(GpuNodeResourceUpdateHandler.class);
4568

@@ -51,4 +74,52 @@ public void testResourceHandlerIsInitialized() throws YarnException {
5174
//Not throwing any exception
5275
target.getNMResourceInfo();
5376
}
77+
78+
@Test
79+
public void testGetNMResourceInfoAutoDiscoveryEnabled()
80+
throws YarnException {
81+
GpuDiscoverer gpuDiscoverer = createMockDiscoverer();
82+
83+
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
84+
mock(GpuNodeResourceUpdateHandler.class);
85+
86+
GpuResourcePlugin target =
87+
new GpuResourcePlugin(gpuNodeResourceUpdateHandler, gpuDiscoverer);
88+
89+
target.createResourceHandler(null, null, null);
90+
91+
NMGpuResourceInfo resourceInfo =
92+
(NMGpuResourceInfo) target.getNMResourceInfo();
93+
Assert.assertNotNull("GpuDeviceInformation should not be null",
94+
resourceInfo.getGpuDeviceInformation());
95+
96+
List<PerGpuDeviceInformation> gpus =
97+
resourceInfo.getGpuDeviceInformation().getGpus();
98+
Assert.assertNotNull("List of PerGpuDeviceInformation should not be null",
99+
gpus);
100+
101+
Assert.assertEquals("List of PerGpuDeviceInformation should have a " +
102+
"size of 1", 1, gpus.size());
103+
Assert.assertEquals("Product name of GPU does not match",
104+
"testGpu", gpus.get(0).getProductName());
105+
}
106+
107+
@Test
108+
public void testGetNMResourceInfoAutoDiscoveryDisabled()
109+
throws YarnException {
110+
GpuDiscoverer gpuDiscoverer = createMockDiscoverer();
111+
when(gpuDiscoverer.isAutoDiscoveryEnabled()).thenReturn(false);
112+
113+
GpuNodeResourceUpdateHandler gpuNodeResourceUpdateHandler =
114+
mock(GpuNodeResourceUpdateHandler.class);
115+
116+
GpuResourcePlugin target =
117+
new GpuResourcePlugin(gpuNodeResourceUpdateHandler, gpuDiscoverer);
118+
119+
target.createResourceHandler(null, null, null);
120+
121+
NMGpuResourceInfo resourceInfo =
122+
(NMGpuResourceInfo) target.getNMResourceInfo();
123+
Assert.assertNull(resourceInfo.getGpuDeviceInformation());
124+
}
54125
}

0 commit comments

Comments
 (0)