Skip to content

Commit de15a66

Browse files
committed
YARN-9265. FPGA plugin fails to recognize Intel Processing Accelerator Card. Contributed by Peter Bacsko.
1 parent fb851c9 commit de15a66

File tree

12 files changed

+699
-87
lines changed

12 files changed

+699
-87
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1715,6 +1715,15 @@ public static boolean isAclEnabled(Configuration conf) {
17151715
public static final String DEFAULT_NM_FPGA_VENDOR_PLUGIN =
17161716
"org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.IntelFpgaOpenclPlugin";
17171717

1718+
@Private
1719+
public static final String NM_FPGA_DEVICE_DISCOVERY_SCRIPT =
1720+
NM_FPGA_RESOURCE_PREFIX + "device-discovery-script";
1721+
1722+
@Private
1723+
public static final String NM_FPGA_AVAILABLE_DEVICES =
1724+
NM_FPGA_RESOURCE_PREFIX + "available-devices";
1725+
1726+
17181727
public static final String NM_NETWORK_TAG_PREFIX = NM_PREFIX
17191728
+ "network-tagging";
17201729

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3960,6 +3960,33 @@
39603960
<value>auto</value>
39613961
</property>
39623962

3963+
<property>
3964+
<description>
3965+
Absolute path to a script or executable that returns the available FPGA cards.
3966+
The returned string must be a single line and follow the format:
3967+
"deviceA/N:M,deviceB/X:Y". Example: "acl0/243:0,acl1/243:1". The numbers after
3968+
the "/" character are the device major and minor numbers.
3969+
3970+
When the script is enabled, auto-discovery is disabled and the "aocl" command is not
3971+
invoked to verify the available cards.
3972+
</description>
3973+
<name>yarn.nodemanager.resource-plugins.fpga.device-discovery-script</name>
3974+
<value></value>
3975+
</property>
3976+
3977+
<property>
3978+
<description>
3979+
List of available FPGA devices on the given node.
3980+
The value must follow the format: "deviceA/N:M,deviceB/X:Y".
3981+
Example: "acl0/243:0,acl1/243:1". The numbers after
3982+
the "/" character are the device major and minor numbers.
3983+
3984+
When this property is used, both auto-discovery and external script are ignored.
3985+
</description>
3986+
<name>yarn.nodemanager.resource-plugins.fpga.available-devices</name>
3987+
<value></value>
3988+
</property>
3989+
39633990
<property>
39643991
<description>The http address of the timeline reader web application.</description>
39653992
<name>yarn.timeline-service.reader.webapp.address</name>

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaDiscoverer.java

Lines changed: 107 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,34 @@
1919

2020
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga;
2121

22-
import com.google.common.annotations.VisibleForTesting;
22+
import java.io.File;
23+
import java.io.IOException;
24+
import java.util.List;
25+
import java.util.Optional;
26+
import java.util.Set;
27+
import java.util.function.Function;
28+
import java.util.stream.Collectors;
29+
2330
import org.apache.hadoop.conf.Configuration;
31+
import org.apache.hadoop.fs.FileUtil;
32+
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
2433
import org.apache.hadoop.yarn.conf.YarnConfiguration;
2534
import org.apache.hadoop.yarn.exceptions.YarnException;
2635
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
2736
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator;
37+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.AoclOutputBasedDiscoveryStrategy;
38+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.FPGADiscoveryStrategy;
39+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.ScriptBasedFPGADiscoveryStrategy;
40+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery.SettingsBasedFPGADiscoveryStrategy;
2841
import org.slf4j.Logger;
2942
import org.slf4j.LoggerFactory;
3043

31-
import java.util.Iterator;
32-
import java.util.List;
44+
import com.google.common.annotations.VisibleForTesting;
45+
import com.google.common.collect.ImmutableList;
46+
import com.google.common.collect.Sets;
3347

3448
public class FpgaDiscoverer {
35-
36-
public static final Logger LOG = LoggerFactory.getLogger(
49+
private static final Logger LOG = LoggerFactory.getLogger(
3750
FpgaDiscoverer.class);
3851

3952
private static FpgaDiscoverer instance;
@@ -44,8 +57,10 @@ public class FpgaDiscoverer {
4457

4558
private List<FpgaResourceAllocator.FpgaDevice> currentFpgaInfo = null;
4659

60+
private Function<String, Optional<String>> scriptRunner = this::runScript;
61+
4762
// shell command timeout
48-
private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
63+
public static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
4964

5065
static {
5166
instance = new FpgaDiscoverer();
@@ -56,31 +71,41 @@ public static FpgaDiscoverer getInstance() {
5671
}
5772

5873
@VisibleForTesting
59-
public synchronized static FpgaDiscoverer setInstance(FpgaDiscoverer newInstance) {
74+
void setScriptRunner(Function<String, Optional<String>> scriptRunner) {
75+
this.scriptRunner = scriptRunner;
76+
}
77+
78+
@VisibleForTesting
79+
static void reset() {
80+
instance = new FpgaDiscoverer();
81+
}
82+
83+
@VisibleForTesting
84+
public static FpgaDiscoverer setInstance(FpgaDiscoverer newInstance) {
6085
instance = newInstance;
6186
return instance;
6287
}
6388

6489
@VisibleForTesting
65-
public synchronized void setConf(Configuration conf) {
66-
this.conf = conf;
90+
public void setConf(Configuration configuration) {
91+
this.conf = configuration;
6792
}
6893

6994
public List<FpgaResourceAllocator.FpgaDevice> getCurrentFpgaInfo() {
7095
return currentFpgaInfo;
7196
}
7297

73-
public synchronized void setResourceHanderPlugin(AbstractFpgaVendorPlugin plugin) {
74-
this.plugin = plugin;
98+
public void setResourceHanderPlugin(AbstractFpgaVendorPlugin vendorPlugin) {
99+
this.plugin = vendorPlugin;
75100
}
76101

77-
public synchronized boolean diagnose() {
102+
public boolean diagnose() {
78103
return this.plugin.diagnose(MAX_EXEC_TIMEOUT_MS);
79104
}
80105

81-
public synchronized void initialize(Configuration conf) throws YarnException {
82-
this.conf = conf;
83-
this.plugin.initPlugin(conf);
106+
public void initialize(Configuration config) throws YarnException {
107+
this.conf = config;
108+
this.plugin.initPlugin(config);
84109
// Try to diagnose FPGA
85110
LOG.info("Trying to diagnose FPGA information ...");
86111
if (!diagnose()) {
@@ -91,40 +116,45 @@ public synchronized void initialize(Configuration conf) throws YarnException {
91116
/**
92117
* Get the available devices' minor numbers from the toolchain or static configuration.
93118
* */
94-
public synchronized List<FpgaResourceAllocator.FpgaDevice> discover() throws ResourceHandlerException {
119+
public List<FpgaResourceAllocator.FpgaDevice> discover()
120+
throws ResourceHandlerException {
95121
List<FpgaResourceAllocator.FpgaDevice> list;
96122
String allowed = this.conf.get(YarnConfiguration.NM_FPGA_ALLOWED_DEVICES);
97-
// whatever static or auto discover, we always needs
98-
// the vendor plugin to discover. For instance, IntelFpgaOpenclPlugin need to
99-
// setup a mapping of <major:minor> to <aliasDevName>
100-
list = this.plugin.discover(MAX_EXEC_TIMEOUT_MS);
101-
if (0 == list.size()) {
102-
throw new ResourceHandlerException("No FPGA devices detected!");
123+
124+
String availableDevices = conf.get(
125+
YarnConfiguration.NM_FPGA_AVAILABLE_DEVICES);
126+
String discoveryScript = conf.get(
127+
YarnConfiguration.NM_FPGA_DEVICE_DISCOVERY_SCRIPT);
128+
129+
FPGADiscoveryStrategy discoveryStrategy;
130+
if (availableDevices != null) {
131+
discoveryStrategy =
132+
new SettingsBasedFPGADiscoveryStrategy(
133+
plugin.getFpgaType(), availableDevices);
134+
} else if (discoveryScript != null) {
135+
discoveryStrategy =
136+
new ScriptBasedFPGADiscoveryStrategy(
137+
plugin.getFpgaType(), scriptRunner, discoveryScript);
138+
} else {
139+
discoveryStrategy = new AoclOutputBasedDiscoveryStrategy(plugin);
103140
}
104-
currentFpgaInfo = list;
105-
if (allowed.equalsIgnoreCase(
141+
142+
list = discoveryStrategy.discover();
143+
144+
if (allowed == null || allowed.equalsIgnoreCase(
106145
YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
107-
return list;
146+
return list;
108147
} else if (allowed.matches("(\\d,)*\\d")){
109-
String[] minors = allowed.split(",");
110-
Iterator<FpgaResourceAllocator.FpgaDevice> iterator = list.iterator();
111-
// remove the non-configured minor numbers
112-
FpgaResourceAllocator.FpgaDevice t;
113-
while (iterator.hasNext()) {
114-
boolean valid = false;
115-
t = iterator.next();
116-
for (String minorNumber : minors) {
117-
if (t.getMinor().toString().equals(minorNumber)) {
118-
valid = true;
119-
break;
120-
}
121-
}
122-
if (!valid) {
123-
iterator.remove();
124-
}
125-
}
148+
Set<String> minors = Sets.newHashSet(allowed.split(","));
149+
150+
// Replace list with a filtered one
151+
list = list
152+
.stream()
153+
.filter(dev -> minors.contains(dev.getMinor().toString()))
154+
.collect(Collectors.toList());
155+
126156
// warn if the number of user-configured minor numbers differs from the devices actually found
127-
if (list.size() != minors.length) {
157+
if (list.size() != minors.size()) {
128158
LOG.warn("We continue although there're mistakes in user's configuration " +
129159
YarnConfiguration.NM_FPGA_ALLOWED_DEVICES +
130160
"user configured:" + allowed + ", while the real:" + list.toString());
@@ -133,7 +163,41 @@ public synchronized List<FpgaResourceAllocator.FpgaDevice> discover() throws Res
133163
throw new ResourceHandlerException("Invalid value configured for " +
134164
YarnConfiguration.NM_FPGA_ALLOWED_DEVICES + ":\"" + allowed + "\"");
135165
}
166+
167+
currentFpgaInfo = ImmutableList.copyOf(list);
168+
136169
return list;
137170
}
138171

172+
private Optional<String> runScript(String path) {
173+
if (path == null || path.trim().isEmpty()) {
174+
LOG.error("Undefined script");
175+
return Optional.empty();
176+
}
177+
178+
File f = new File(path);
179+
if (!f.exists()) {
180+
LOG.error("Script does not exist");
181+
return Optional.empty();
182+
}
183+
184+
if (!FileUtil.canExecute(f)) {
185+
LOG.error("Script is not executable");
186+
return Optional.empty();
187+
}
188+
189+
ShellCommandExecutor shell = new ShellCommandExecutor(
190+
new String[] {path},
191+
null,
192+
null,
193+
MAX_EXEC_TIMEOUT_MS);
194+
try {
195+
shell.execute();
196+
String output = shell.getOutput();
197+
return Optional.of(output);
198+
} catch (IOException e) {
199+
LOG.error("Cannot execute script", e);
200+
return Optional.empty();
201+
}
202+
}
139203
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/FpgaNodeResourceUpdateHandler.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@
2020
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga;
2121

2222

23+
import static org.apache.hadoop.yarn.api.records.ResourceInformation.FPGA_URI;
24+
25+
import java.util.LinkedList;
26+
import java.util.List;
27+
import java.util.Map;
28+
2329
import org.apache.hadoop.yarn.api.records.Resource;
2430
import org.apache.hadoop.yarn.api.records.ResourceInformation;
2531
import org.apache.hadoop.yarn.conf.YarnConfiguration;
@@ -30,13 +36,6 @@
3036
import org.slf4j.Logger;
3137
import org.slf4j.LoggerFactory;
3238

33-
import java.util.ArrayList;
34-
import java.util.LinkedList;
35-
import java.util.List;
36-
import java.util.Map;
37-
38-
import static org.apache.hadoop.yarn.api.records.ResourceInformation.FPGA_URI;
39-
4039
public class FpgaNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
4140
private static final Logger LOG = LoggerFactory.getLogger(
4241
FpgaNodeResourceUpdateHandler.class);

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/fpga/IntelFpgaOpenclPlugin.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ public boolean initPlugin(Configuration conf) {
100100
if (this.initialized) {
101101
return true;
102102
}
103+
103104
// Find the proper toolchain, mainly aocl
104105
String pluginDefaultBinaryName = getDefaultBinaryName();
105106
String pathToExecutable = conf.get(YarnConfiguration.NM_FPGA_PATH_TO_EXEC,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
20+
package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.discovery;
21+
22+
import java.util.List;
23+
24+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
25+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.fpga.FpgaResourceAllocator.FpgaDevice;
26+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.AbstractFpgaVendorPlugin;
27+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.FpgaDiscoverer;
28+
29+
/**
30+
* FPGA device discovery strategy which invokes the "aocl" SDK command
31+
* to retrieve the list of available FPGA cards.
32+
*/
33+
public class AoclOutputBasedDiscoveryStrategy
34+
implements FPGADiscoveryStrategy {
35+
36+
private final AbstractFpgaVendorPlugin plugin;
37+
38+
public AoclOutputBasedDiscoveryStrategy(AbstractFpgaVendorPlugin fpgaPlugin) {
39+
this.plugin = fpgaPlugin;
40+
}
41+
42+
@Override
43+
public List<FpgaDevice> discover() throws ResourceHandlerException {
44+
List<FpgaDevice> list =
45+
plugin.discover(FpgaDiscoverer.MAX_EXEC_TIMEOUT_MS);
46+
if (list.isEmpty()) {
47+
throw new ResourceHandlerException("No FPGA devices detected!");
48+
}
49+
50+
return list;
51+
}
52+
}

0 commit comments

Comments
 (0)