Skip to content

Commit

Permalink
[FLINK-17408] Introduce GPUDriver and discovery script
Browse files Browse the repository at this point in the history
This closes apache#11920.
  • Loading branch information
KarmaGYZ authored and tillrohrmann committed May 17, 2020
1 parent 1297992 commit 29a224d
Show file tree
Hide file tree
Showing 16 changed files with 879 additions and 1 deletion.
8 changes: 8 additions & 0 deletions flink-dist/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,14 @@ under the License.
they are not included into the 'flink-dist' uber jar.
-->

<!-- start optional Flink external resource drivers -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-external-resource-gpu</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>

<!-- start optional Flink metrics reporters -->
<dependency>
<groupId>org.apache.flink</groupId>
Expand Down
22 changes: 22 additions & 0 deletions flink-dist/src/main/assemblies/opt.xml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,28 @@
<fileMode>0644</fileMode>
</file>

<!-- External Resource -->
<file>
<source>../flink-external-resources/flink-external-resource-gpu/target/flink-external-resource-gpu-${project.version}.jar</source>
<outputDirectory>opt/external-resource-gpu/</outputDirectory>
<destName>flink-external-resource-gpu-${project.version}.jar</destName>
<fileMode>0644</fileMode>
</file>

<file>
<source>../flink-external-resources/flink-external-resource-gpu/src/main/resources/gpu-discovery-common.sh</source>
<outputDirectory>opt/external-resource-gpu/</outputDirectory>
<destName>gpu-discovery-common.sh</destName>
<fileMode>0755</fileMode>
</file>

<file>
<source>../flink-external-resources/flink-external-resource-gpu/src/main/resources/nvidia-gpu-discovery.sh</source>
<outputDirectory>opt/external-resource-gpu/</outputDirectory>
<destName>nvidia-gpu-discovery.sh</destName>
<fileMode>0755</fileMode>
</file>

<!-- Metrics -->
<file>
<source>../flink-metrics/flink-metrics-graphite/target/flink-metrics-graphite-${project.version}.jar</source>
Expand Down
53 changes: 53 additions & 0 deletions flink-external-resources/flink-external-resource-gpu/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <parent>
        <artifactId>flink-external-resources</artifactId>
        <groupId>org.apache.flink</groupId>
        <version>1.11-SNAPSHOT</version>
        <relativePath>..</relativePath>
    </parent>

    <artifactId>flink-external-resource-gpu</artifactId>
    <name>flink-external-resource-gpu</name>

    <packaging>jar</packaging>

    <dependencies>
        <!-- Compile-time only ('provided' scope): not packaged into this module's jar. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${project.version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- Test-only utilities. -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-test-utils-junit</artifactId>
            <version>${project.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.externalresource.gpu;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ExternalResourceOptions;
import org.apache.flink.configuration.IllegalConfigurationException;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.StringUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import static org.apache.flink.configuration.ConfigOptions.key;

/**
 * Driver takes the responsibility to discover GPU resources and provide the GPU resource information.
 * It retrieves the GPU information by executing a user-defined discovery script.
 *
 * <p>The script is invoked as {@code <script> <gpuAmount> [args...]}. The first line it prints to
 * stdout is interpreted as a comma-separated list of GPU indexes.
 */
class GPUDriver implements ExternalResourceDriver {

    private static final Logger LOG = LoggerFactory.getLogger(GPUDriver.class);

    /** Maximum time the discovery script may run before it is considered hung. */
    private static final long DISCOVERY_SCRIPT_TIMEOUT_MS = 10000;

    /** Path of the discovery script; relative paths are resolved against the Flink home directory. */
    @VisibleForTesting
    static final ConfigOption<String> DISCOVERY_SCRIPT_PATH =
        key("discovery-script.path")
            .stringType()
            .defaultValue(String.format("%s/external-resource-gpu/nvidia-gpu-discovery.sh", ConfigConstants.DEFAULT_FLINK_PLUGINS_DIRS));

    /** Optional extra arguments appended to the discovery script invocation. */
    @VisibleForTesting
    static final ConfigOption<String> DISCOVERY_SCRIPT_ARG =
        key("discovery-script.args")
            .stringType()
            .noDefaultValue();

    private final File discoveryScriptFile;

    /** Raw, whitespace-separated extra script arguments; {@code null} when not configured. */
    private final String args;

    /**
     * Creates the driver and validates the configured discovery script.
     *
     * @param config configuration holding {@link #DISCOVERY_SCRIPT_PATH} and {@link #DISCOVERY_SCRIPT_ARG}
     * @throws IllegalConfigurationException if the script path is empty
     * @throws FileNotFoundException if the script file does not exist
     * @throws FlinkException if the script file is not executable
     */
    GPUDriver(Configuration config) throws Exception {
        final String discoveryScriptPathStr = config.getString(DISCOVERY_SCRIPT_PATH);
        if (StringUtils.isNullOrWhitespaceOnly(discoveryScriptPathStr)) {
            throw new IllegalConfigurationException(
                String.format("GPU discovery script ('%s') is not configured.", ExternalResourceOptions.genericKeyWithSuffix(DISCOVERY_SCRIPT_PATH.key())));
        }

        Path discoveryScriptPath = Paths.get(discoveryScriptPathStr);
        if (!discoveryScriptPath.isAbsolute()) {
            // Relative paths are resolved against FLINK_HOME, falling back to the working directory.
            discoveryScriptPath = Paths.get(System.getenv().getOrDefault(ConfigConstants.ENV_FLINK_HOME_DIR, "."), discoveryScriptPathStr);
        }
        discoveryScriptFile = discoveryScriptPath.toFile();

        if (!discoveryScriptFile.exists()) {
            throw new FileNotFoundException(String.format("The gpu discovery script does not exist in path %s.", discoveryScriptFile.getAbsolutePath()));
        }
        if (!discoveryScriptFile.canExecute()) {
            throw new FlinkException(String.format("The discovery script %s is not executable.", discoveryScriptFile.getAbsolutePath()));
        }

        args = config.getString(DISCOVERY_SCRIPT_ARG);
    }

    /**
     * Runs the discovery script and parses its output into GPU descriptors.
     *
     * @param gpuAmount number of GPUs requested; must be positive
     * @return unmodifiable set of the discovered GPUs (possibly empty)
     * @throws Exception if the script times out, exits with a non-zero code or cannot be started
     */
    @Override
    public Set<GPUInfo> retrieveResourceInfo(long gpuAmount) throws Exception {
        Preconditions.checkArgument(gpuAmount > 0, "The gpuAmount should be positive when retrieving the GPU resource information.");

        final Set<GPUInfo> gpuResources = new HashSet<>();
        String output = executeDiscoveryScript(discoveryScriptFile, gpuAmount, args);
        if (!output.isEmpty()) {
            String[] indexes = output.split(",");
            for (String index : indexes) {
                // Tolerate stray separators such as "0,,1" or a trailing comma.
                if (!StringUtils.isNullOrWhitespaceOnly(index)) {
                    gpuResources.add(new GPUInfo(index.trim()));
                }
            }
        }
        LOG.info("Discover GPU resources: {}.", gpuResources);
        return Collections.unmodifiableSet(gpuResources);
    }

    /**
     * Executes the discovery script and returns the first line of its stdout.
     *
     * @param discoveryScript the script to execute
     * @param gpuAmount number of GPUs requested, passed as the first script argument
     * @param args optional whitespace-separated extra arguments; may be {@code null} or blank
     * @return the first stdout line, or an empty string if the script printed nothing
     * @throws TimeoutException if the script does not finish within {@link #DISCOVERY_SCRIPT_TIMEOUT_MS}
     * @throws FlinkException if the script exits with a non-zero code
     */
    private String executeDiscoveryScript(File discoveryScript, long gpuAmount, String args) throws Exception {
        final Process process = Runtime.getRuntime().exec(buildCommand(discoveryScript, gpuAmount, args));
        try (final BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream()));
            final BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream()))) {
            // NOTE: the output is only read after the process has exited, so this relies on the
            // script's output being small enough to fit into the OS pipe buffer.
            final boolean hasProcessTerminated = process.waitFor(DISCOVERY_SCRIPT_TIMEOUT_MS, TimeUnit.MILLISECONDS);
            if (!hasProcessTerminated) {
                throw new TimeoutException(String.format("The discovery script executed for over %d ms.", DISCOVERY_SCRIPT_TIMEOUT_MS));
            }

            final int exitVal = process.exitValue();
            if (exitVal != 0) {
                final String stdout = readAll(stdoutReader);
                final String stderr = readAll(stderrReader);
                LOG.warn("Discovery script exit with {}.\nSTDOUT: {}\nSTDERR: {}", exitVal, stdout, stderr);
                throw new FlinkException(String.format("Discovery script exit with non-zero return code: %s.", exitVal));
            }
            final String[] stdout = stdoutReader.lines().toArray(String[]::new);
            if (stdout.length > 1) {
                LOG.warn(
                    "The output of the discovery script should only contain one single line. Finding {} lines with content: {}. Will only keep the first line.", stdout.length, Arrays.toString(stdout));
            }
            if (stdout.length == 0) {
                return "";
            }
            return stdout[0];
        } finally {
            // Also reaps the process on the timeout path.
            process.destroyForcibly();
        }
    }

    /**
     * Builds the command array {@code [script, gpuAmount, args...]}.
     *
     * <p>Passing an array keeps a script path containing whitespace intact, and an unset or
     * blank {@code args} contributes no arguments instead of the literal string "null".
     */
    private static String[] buildCommand(File discoveryScript, long gpuAmount, String args) {
        final String[] extraArgs = StringUtils.isNullOrWhitespaceOnly(args)
            ? new String[0]
            : args.trim().split("\\s+");
        final String[] command = new String[2 + extraArgs.length];
        command[0] = discoveryScript.getAbsolutePath();
        command[1] = String.valueOf(gpuAmount);
        System.arraycopy(extraArgs, 0, command, 2, extraArgs.length);
        return command;
    }

    /** Drains the reader and returns its remaining lines joined by newlines. */
    private static String readAll(BufferedReader reader) {
        return String.join("\n", reader.lines().toArray(String[]::new));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.externalresource.gpu;

import org.apache.flink.api.common.externalresource.ExternalResourceDriver;
import org.apache.flink.api.common.externalresource.ExternalResourceDriverFactory;
import org.apache.flink.configuration.Configuration;

/**
 * {@link ExternalResourceDriverFactory} implementation that produces {@link GPUDriver} instances.
 */
public class GPUDriverFactory implements ExternalResourceDriverFactory {

    /**
     * Instantiates a {@link GPUDriver} from the supplied configuration.
     *
     * @param config configuration handed to the {@link GPUDriver} constructor
     * @return the newly created driver
     * @throws Exception if the {@link GPUDriver} constructor fails
     */
    @Override
    public ExternalResourceDriver createExternalResourceDriver(Configuration config) throws Exception {
        final ExternalResourceDriver gpuDriver = new GPUDriver(config);
        return gpuDriver;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.flink.externalresource.gpu;

import org.apache.flink.api.common.externalresource.ExternalResourceInfo;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.StringUtils;

import java.util.Collection;
import java.util.Collections;
import java.util.Optional;

/**
 * Information for GPU resource. Currently only including the GPU index.
 *
 * <p>Two instances are equal if and only if their indexes are equal.
 */
public class GPUInfo implements ExternalResourceInfo {

    /** The only property key exposed by this class. */
    private static final String PROPERTY_KEY_INDEX = "index";

    private final String index;

    /**
     * Creates the descriptor for one GPU.
     *
     * @param index index of the GPU; must be neither {@code null} nor blank
     * @throws IllegalArgumentException if {@code index} is {@code null} or whitespace only
     */
    GPUInfo(String index) {
        Preconditions.checkArgument(!StringUtils.isNullOrWhitespaceOnly(index));
        this.index = index;
    }

    @Override
    public String toString() {
        return String.format("GPU Device(%s)", index);
    }

    @Override
    public int hashCode() {
        return index.hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        } else if (obj instanceof GPUInfo) {
            final GPUInfo other = (GPUInfo) obj;
            return this.index.equals(other.index);
        }
        return false;
    }

    /**
     * Looks up a property of this GPU.
     *
     * @param key property name; only {@code "index"} is known
     * @return the GPU index for the key {@code "index"}, otherwise an empty {@link Optional}
     *     (including for a {@code null} key)
     */
    @Override
    public Optional<String> getProperty(String key) {
        // Constant-first comparison so that a null key yields "absent" instead of an NPE.
        if (PROPERTY_KEY_INDEX.equals(key)) {
            return Optional.of(index);
        } else {
            return Optional.empty();
        }
    }

    /**
     * Lists the property keys understood by {@link #getProperty(String)}.
     *
     * @return an immutable collection containing only {@code "index"}
     */
    @Override
    public Collection<String> getKeys() {
        return Collections.singleton(PROPERTY_KEY_INDEX);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

org.apache.flink.externalresource.gpu.GPUDriverFactory
Loading

0 comments on commit 29a224d

Please sign in to comment.