forked from apache/flink
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[FLINK-17408] Introduce GPUDriver and discovery script
This closes apache#11920.
- Loading branch information
1 parent
1297992
commit 29a224d
Showing
16 changed files
with
879 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
flink-external-resources/flink-external-resource-gpu/pom.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!-- | ||
Licensed to the Apache Software Foundation (ASF) under one | ||
or more contributor license agreements. See the NOTICE file | ||
distributed with this work for additional information | ||
regarding copyright ownership. The ASF licenses this file | ||
to you under the Apache License, Version 2.0 (the | ||
"License"); you may not use this file except in compliance | ||
with the License. You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, | ||
software distributed under the License is distributed on an | ||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
KIND, either express or implied. See the License for the | ||
specific language governing permissions and limitations | ||
under the License. | ||
--> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> | ||
|
||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<parent> | ||
<artifactId>flink-external-resources</artifactId> | ||
<groupId>org.apache.flink</groupId> | ||
<version>1.11-SNAPSHOT</version> | ||
<relativePath>..</relativePath> | ||
</parent> | ||
|
||
<artifactId>flink-external-resource-gpu</artifactId> | ||
<name>flink-external-resource-gpu</name> | ||
|
||
<packaging>jar</packaging> | ||
|
||
<dependencies> | ||
<dependency> | ||
<groupId>org.apache.flink</groupId> | ||
<artifactId>flink-core</artifactId> | ||
<version>${project.version}</version> | ||
<scope>provided</scope> | ||
</dependency> | ||
|
||
<dependency> | ||
<groupId>org.apache.flink</groupId> | ||
<artifactId>flink-test-utils-junit</artifactId> | ||
<version>${project.version}</version> | ||
<scope>test</scope> | ||
</dependency> | ||
</dependencies> | ||
|
||
</project> |
146 changes: 146 additions & 0 deletions
146
...-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriver.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.flink.externalresource.gpu; | ||
|
||
import org.apache.flink.annotation.VisibleForTesting; | ||
import org.apache.flink.api.common.externalresource.ExternalResourceDriver; | ||
import org.apache.flink.configuration.ConfigConstants; | ||
import org.apache.flink.configuration.ConfigOption; | ||
import org.apache.flink.configuration.Configuration; | ||
import org.apache.flink.configuration.ExternalResourceOptions; | ||
import org.apache.flink.configuration.IllegalConfigurationException; | ||
import org.apache.flink.util.FlinkException; | ||
import org.apache.flink.util.Preconditions; | ||
import org.apache.flink.util.StringUtils; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.BufferedReader; | ||
import java.io.File; | ||
import java.io.FileNotFoundException; | ||
import java.io.InputStreamReader; | ||
import java.nio.file.Path; | ||
import java.nio.file.Paths; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.concurrent.TimeoutException; | ||
|
||
import static org.apache.flink.configuration.ConfigOptions.key; | ||
|
||
/** | ||
* Driver takes the responsibility to discover GPU resources and provide the GPU resource information. | ||
* It retrieves the GPU information by executing a user-defined discovery script. | ||
*/ | ||
class GPUDriver implements ExternalResourceDriver { | ||
|
||
private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class); | ||
|
||
private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000; | ||
|
||
@VisibleForTesting | ||
static final ConfigOption<String> DISCOVERY_SCRIPT_PATH = | ||
key("discovery-script.path") | ||
.stringType() | ||
.defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS)); | ||
|
||
@VisibleForTesting | ||
static final ConfigOption<String> DISCOVERY_SCRIPT_ARG = | ||
key("discovery-script.args") | ||
.stringType() | ||
.noDefaultValue(); | ||
|
||
private final File discoveryScriptFile; | ||
private final String args; | ||
|
||
GPUDriver(Configuration config) throws Exception { | ||
final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH); | ||
if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) { | ||
throw new IllegalConfigurationException( | ||
String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key()))); | ||
} | ||
|
||
Path discoveryScriptPath = Paths.get(discoveryScriptPathStr); | ||
if (!discoveryScriptPath.isAbsolute()) { | ||
discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr); | ||
} | ||
discoveryScriptFile = discoveryScriptPath.toFile(); | ||
|
||
if (!discoveryScriptFile.exists()) { | ||
throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath())); | ||
} | ||
if (!discoveryScriptFile.canExecute()) { | ||
throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath())); | ||
} | ||
|
||
args = config.getString(DISCOVERY_SCRIPT_ARG); | ||
} | ||
|
||
@Override | ||
public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception { | ||
Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive when retrieving the GPU resource information."); | ||
|
||
final Set<GPUInfo> gpuResources = new HashSet<>(); | ||
String output = executeDiscoveryScript(discoveryScriptFile, gpuAmount, args); | ||
if (!output.isEmpty()) { | ||
String[] indexes = output.split(","); | ||
for (String index : indexes) { | ||
if (!StringUtils.isNullOrWhitespaceOnly(index)) { | ||
gpuResources.add(new GPUInfo(index.trim())); | ||
} | ||
} | ||
} | ||
LOG.info("Discover GPU resources: {}.", gpuResources); | ||
return Collections.unmodifiableSet(gpuResources); | ||
} | ||
|
||
private String executeDiscoveryScript(File discoveryScript, long gpuAmount, String args) throws Exception { | ||
final String cmd = discoveryScript.getAbsolutePath() + " " + gpuAmount + " " + args; | ||
final Process process = Runtime.getRuntime().exec(cmd); | ||
try (final BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream())); | ||
final BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream()))) { | ||
final boolean hasProcessTerminated = process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS); | ||
if (!hasProcessTerminated) { | ||
throw new TimeoutException(String.format("The discovery script executed for over %d ms.", DISCOVERY_SCRIPT_TIMEOUT_MS)); | ||
} | ||
|
||
final int exitVal = process.exitValue(); | ||
if (exitVal != 0) { | ||
final String stdout = stdoutReader.lines().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(); | ||
final String stderr = stderrReader.lines().collect(StringBuilder::new, StringBuilder::append, StringBuilder::append).toString(); | ||
LOG.warn("Discovery script exit with {}.\\nSTDOUT: {}\\nSTDERR: {}", exitVal, stdout, stderr); | ||
throw new FlinkException(String.format("Discovery script exit with non-zero return code: %s.", exitVal)); | ||
} | ||
Object[] stdout = stdoutReader.lines().toArray(); | ||
if (stdout.length > 1) { | ||
LOG.warn( | ||
"The output of the discovery script should only contain one single line. Finding {} lines with content: {}. Will only keep the first line.", stdout.length, Arrays.toString(stdout)); | ||
} | ||
if (stdout.length == 0) { | ||
return ""; | ||
} | ||
return (String) stdout[0]; | ||
} finally { | ||
process.destroyForcibly(); | ||
} | ||
} | ||
} |
33 changes: 33 additions & 0 deletions
33
...al-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUDriverFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.flink.externalresource.gpu; | ||
|
||
import org.apache.flink.api.common.externalresource.ExternalResourceDriver; | ||
import org.apache.flink.api.common.externalresource.ExternalResourceDriverFactory; | ||
import org.apache.flink.configuration.Configuration; | ||
|
||
/** | ||
* Factory for creating {@link GPUDriver}. | ||
*/ | ||
public class GPUDriverFactory implements ExternalResourceDriverFactory { | ||
@Override | ||
public ExternalResourceDriver createExternalResourceDriver(Configuration config) throws Exception { | ||
return new GPUDriver(config); | ||
} | ||
} |
77 changes: 77 additions & 0 deletions
77
...nk-external-resource-gpu/src/main/java/org/apache/flink/externalresource/gpu/GPUInfo.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.flink.externalresource.gpu; | ||
|
||
import org.apache.flink.api.common.externalresource.ExternalResourceInfo; | ||
import org.apache.flink.util.Preconditions; | ||
import org.apache.flink.util.StringUtils; | ||
|
||
import java.util.Collection; | ||
import java.util.Collections; | ||
import java.util.Optional; | ||
|
||
/** | ||
* Information for GPU resource. Currently only including the GPU index. | ||
*/ | ||
public class GPUInfo implements ExternalResourceInfo { | ||
|
||
private static final String PROPERTY_KEY_INDEX = "index"; | ||
|
||
private final String index; | ||
|
||
GPUInfo(String index) { | ||
Preconditions.checkArgument(!StringUtils.isNullOrWhitespaceOnly(index)); | ||
this.index = index; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return String.format("GPU Device(%s)", index); | ||
} | ||
|
||
@Override | ||
public int hashCode() { | ||
return index.hashCode(); | ||
} | ||
|
||
@Override | ||
public boolean equals(Object obj) { | ||
if (obj == this) { | ||
return true; | ||
} else if (obj instanceof GPUInfo) { | ||
final GPUInfo other = (GPUInfo) obj; | ||
return this.index.equals(other.index); | ||
} | ||
return false; | ||
} | ||
|
||
@Override | ||
public Optional<String> getProperty(String key) { | ||
if (key.equals(PROPERTY_KEY_INDEX)) { | ||
return Optional.of(index); | ||
} else { | ||
return Optional.empty(); | ||
} | ||
} | ||
|
||
@Override | ||
public Collection<String> getKeys() { | ||
return Collections.singleton(PROPERTY_KEY_INDEX); | ||
} | ||
} |
16 changes: 16 additions & 0 deletions
16
...A-INF/services/org.apache.flink.api.common.externalresource.ExternalResourceDriverFactory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one or more | ||
# contributor license agreements. See the NOTICE file distributed with | ||
# this work for additional information regarding copyright ownership. | ||
# The ASF licenses this file to You under the Apache License, Version 2.0 | ||
# (the "License"); you may not use this file except in compliance with | ||
# the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
org.apache.flink.externalresource.gpu.GPUDriverFactory |
Oops, something went wrong.