Skip to content

Commit

Permalink
Merge pull request #1132 from Kaggle/tpu-1vm-image
Browse files Browse the repository at this point in the history
Get tensorflow, jax, and pytorch working on TPU1VM
  • Loading branch information
djherbis committed May 16, 2022
2 parents 389cecd + ba96747 commit cdba7d1
Show file tree
Hide file tree
Showing 9 changed files with 158 additions and 9 deletions.
46 changes: 45 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,22 @@ pipeline {
'''
}
}
// Builds the `tpu-tensorflow` package with packages/build_package and pushes
// the result (`--push`). The base image tag and the TensorFlow version are
// shell variables defined by sourcing tpu/config.txt.
stage('tensorflow TPU') {
options {
// Abort the stage if it runs for more than 180 minutes.
timeout(time: 180, unit: 'MINUTES')
}
steps {
// The script is a single-quoted Groovy string, so ${BASE_IMAGE_TAG} and
// $TENSORFLOW_VERSION are expanded by bash, not by Groovy.
sh '''#!/bin/bash
set -exo pipefail
source tpu/config.txt
cd packages/
./build_package --base-image gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} \
--package tpu-tensorflow \
--version $TENSORFLOW_VERSION \
--push
'''
}
}
}
}
stage('Build/Test/Diff') {
Expand Down Expand Up @@ -150,7 +166,34 @@ pipeline {
}
}
}
}
}
// TPU VM pipeline: build the TPU VM image, push it under the pretest tag,
// then diff the pushed image.
stage('TPU VM') {
stages {
// Runs ./tpu/build (output piped through `ts`) and pushes the result with
// `./push --tpu ${PRETEST_TAG}`.
stage('Build Tensorflow TPU Image') {
options {
// Abort the stage if it runs for more than 20 minutes.
timeout(time: 20, unit: 'MINUTES')
}
steps {
// `pipefail` makes a failure of ./tpu/build fail the step even though
// its output is piped into `ts`.
sh '''#!/bin/bash
set -exo pipefail
./tpu/build | ts
./push --tpu ${PRETEST_TAG}
'''
}
}
// Pulls the image pushed under ${PRETEST_TAG} and runs ./diff in TPU mode
// with that image as the target.
stage('Diff TPU VM Image') {
steps {
sh '''#!/bin/bash
set -exo pipefail
docker pull gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
./diff --tpu --target gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
'''
}
}
}
}
}
}

Expand All @@ -161,6 +204,7 @@ pipeline {
gcloud container images add-tag gcr.io/kaggle-images/python:${PRETEST_TAG} gcr.io/kaggle-images/python:${STAGING_TAG}
gcloud container images add-tag gcr.io/kaggle-private-byod/python:${PRETEST_TAG} gcr.io/kaggle-private-byod/python:${STAGING_TAG}
gcloud container images add-tag gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG} gcr.io/kaggle-private-byod/python-tpuvm:${STAGING_TAG}
'''
}
}
Expand Down
4 changes: 4 additions & 0 deletions diff
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ while :; do
BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python:latest'
TARGET_IMAGE_TAG='kaggle/python-gpu-build'
;;
-x|--tpu)
BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python-tpuvm:latest'
TARGET_IMAGE_TAG='kaggle/python-tpuvm-build'
;;
-b|--base)
if [[ -z "$2" ]]; then
usage
Expand Down
2 changes: 2 additions & 0 deletions packages/build_package
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ fi

# Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80`
TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//}
# Keep only `python:v108` in `gcr.io/kaggle-images/python:v108`
TAG=${TAG/gcr.io\/kaggle-images\//}
# Replace the `:` in `tf2-gpu.2-6:m80` by `-`
TAG=${TAG/:/-}
# Append the package version
Expand Down
27 changes: 22 additions & 5 deletions tpu/tensorflow.Dockerfile → packages/tpu-tensorflow.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
ARG BASE_IMAGE_TAG
ARG BASE_IMAGE

FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} AS builder
FROM ${BASE_IMAGE} AS builder

ARG PACKAGE_VERSION

# Use Bazelisk to ensure the proper bazel version is used.
RUN cd /usr/local/src && \
Expand All @@ -12,12 +14,12 @@ RUN cd /usr/local/src && \
RUN cd /usr/local/src && \
git clone https://github.com/tensorflow/tensorflow && \
cd tensorflow && \
git checkout tags/v${TENSORFLOW_VERSION} && \
git checkout tags/v${PACKAGE_VERSION} && \
# TODO(rosbo): Is it really needed?
pip install keras_applications --no-deps && \
pip install keras_preprocessing --no-deps

# Create a TensorFlow wheel for CPU
# Create a TensorFlow wheel for TPU
RUN cd /usr/local/src/tensorflow && \
cat /dev/null | ./configure && \
bazel build \
Expand All @@ -32,7 +34,22 @@ RUN cd /usr/local/src/tensorflow && \
RUN cd /usr/local/src/tensorflow && \
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg

# TODO(b/152075195): Will likely need to install custom build for TFA & tensorflow-gcs-config
# Build TensorFlow addons library against TensorFlow CPU.
#RUN cd /usr/local/src/ && \
# git clone https://github.com/tensorflow/addons && \
# cd addons && \
# git checkout tags/v0.12.1 && \
# python ./configure.py && \
# bazel build --enable_runfiles build_pip_pkg && \
# bazel-bin/build_pip_pkg /tmp/tfa_cpu && \
# bazel clean

# Build tensorflow_gcs_config library against TensorFlow CPU.
#ADD tensorflow-gcs-config /usr/local/src/tensorflow_gcs_config/
#RUN cd /usr/local/src/tensorflow_gcs_config && \
# apt-get install -y libcurl4-openssl-dev && \
# python setup.py bdist_wheel -d /tmp/tensorflow_gcs_config && \
# bazel clean

# Use multi-stage builds to minimize image output size.
FROM alpine:latest
Expand Down
5 changes: 5 additions & 0 deletions push
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Push a newly-built image with the given LABEL to gcr.io and DockerHub.
Options:
-g, --gpu Push the image with GPU support.
-t, --tpu Push the image with TPU support.
-s, --source-image IMAGE Tag for the source image.
EOF
}
Expand All @@ -26,6 +27,10 @@ while :; do
SOURCE_IMAGE_TAG='kaggle/python-gpu-build:latest'
TARGET_IMAGE='gcr.io/kaggle-private-byod/python'
;;
-t|--tpu)
SOURCE_IMAGE_TAG='kaggle/python-tpuvm-build:latest'
TARGET_IMAGE='gcr.io/kaggle-private-byod/python-tpuvm'
;;
-s|--source-image)
if [[ -z $2 ]]; then
usage
Expand Down
1 change: 1 addition & 0 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
import unittest

# Skip decorators for hardware-specific tests: a decorated test is skipped
# when the corresponding environment variable is unset or empty.
gpu_test = unittest.skipIf(not os.environ.get('CUDA_VERSION', ''), 'Not running GPU tests')
tpu_test = unittest.skipIf(not os.environ.get('ISTPUVM', ''), 'Not running TPU tests')
20 changes: 17 additions & 3 deletions tpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
ARG BASE_IMAGE_TAG
ARG LIBTPU_IMAGE_TAG
ARG TENSORFLOW_WHL_IMAGE_TAG
ARG TENSORFLOW_VERSION
ARG TORCH_VERSION

FROM gcr.io/cloud-tpu-v2-images/libtpu:${LIBTPU_IMAGE_TAG} as libtpu
FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:${TENSORFLOW_WHL_IMAGE_TAG} AS tensorflow_whl
FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:python-${BASE_IMAGE_TAG}-${TENSORFLOW_VERSION} AS tensorflow_whl
FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}

ENV ISTPUVM=1

COPY --from=libtpu /libtpu.so /lib

COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
RUN pip install /tmp/tensorflow_pkg/tensorflow*.whl && \
rm -rf /tmp/tensorflow_pkg
rm -rf /tmp/tensorflow_pkg && \
/tmp/clean-layer.sh

# Replace the torch build that is currently installed with the pinned
# TORCH_VERSION and install the matching torch_xla TPU VM wheel.
# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version
#
# An ARG declared before the first FROM is only visible to FROM lines. It must
# be redeclared inside the stage, otherwise ${TORCH_VERSION} expands to an
# empty string in the RUN below. The redeclaration carries no default, so the
# value still comes from the `--build-arg TORCH_VERSION=...` passed to the build.
ARG TORCH_VERSION
# Requirement specifiers are quoted so the shell never treats `[tpuvm]` as a
# glob pattern. The cleanup script runs in the same RUN as the installs.
RUN pip uninstall -y torch && \
    pip install "torch==${TORCH_VERSION}" && \
    pip install "torch_xla[tpuvm]" -f "https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION}-cp37-cp37m-linux_x86_64.whl" && \
    /tmp/clean-layer.sh

# Install JAX with its `tpu` extra (any version >= 0.2.16), letting pip look up
# additional packages on the libtpu releases page passed via `-f`.
# /tmp/clean-layer.sh runs in the same RUN as the install.
# NOTE(review): clean-layer.sh is not defined in this file; presumably it
# removes package-manager caches - confirm against the base image.
# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
RUN pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
/tmp/clean-layer.sh
57 changes: 57 additions & 0 deletions tpu/build
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
#
# Build the Python TPU 1VM Docker image from the Dockerfile that sits next to
# this script, tagging the result `kaggle/python-tpuvm-build`.
#
# Build arguments passed to `docker build`:
#   * GIT_COMMIT - current HEAD commit of the checkout.
#   * BUILD_DATE - timestamp of this invocation.
#   * one `--build-arg` per KEY=VALUE line in config.txt (same directory).
#
# The build context is the current working directory (`.`).
#
# Exit status: 0 on success or after --help, 1 on an unknown option, otherwise
# the status of the first failing command (`set -e`).
set -e

# Print the command-line help to stdout.
usage() {
cat << EOF
Usage: $0 [OPTIONS]
Build a new Python TPU 1VM Docker image.
Options:
-c, --use-cache Use layer cache when building a new image.
EOF
}

CACHE_FLAG='--no-cache'
DOCKERFILE='Dockerfile'
IMAGE_TAG='kaggle/python-tpuvm-build'
# Kept as an array so every build arg reaches docker as exactly one word.
BUILD_ARGS=()

while :; do
    case "$1" in
        -h|--help)
            usage
            exit
            ;;
        -c|--use-cache)
            CACHE_FLAG=''
            ;;
        -?*)
            usage
            printf 'ERROR: Unknown option: %s\n' "$1" >&2
            # A bare `exit` would return printf's status (0) and hide the
            # error from callers such as CI.
            exit 1
            ;;
        *)
            break
            ;;
    esac

    shift
done

BUILD_ARGS+=(--build-arg "GIT_COMMIT=$(git rev-parse HEAD)")
BUILD_ARGS+=(--build-arg "BUILD_DATE=$(date '+%Y%m%d-%H%M%S')")

# Read build args from config.txt file, one KEY=VALUE per line.
# Blank lines and `#` comment lines are skipped: word-splitting the whole file
# would turn every word of a comment into its own bogus --build-arg.
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
SKIP_LINE_RE='^[[:space:]]*(#|$)'
# `|| [[ -n ... ]]` keeps a final line that has no trailing newline.
while IFS= read -r config_line || [[ -n "$config_line" ]]; do
    if [[ "$config_line" =~ $SKIP_LINE_RE ]]; then
        continue
    fi
    BUILD_ARGS+=(--build-arg "$config_line")
done < "${SRCDIR}/config.txt"

readonly CACHE_FLAG
readonly DOCKERFILE
readonly IMAGE_TAG
readonly BUILD_ARGS

DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"

set -x
# ${CACHE_FLAG:+...} drops the argument entirely when the cache is enabled,
# instead of passing an empty string to docker.
docker build --rm --pull ${CACHE_FLAG:+"$CACHE_FLAG"} -t "$IMAGE_TAG" -f "$DOCKERFILE_PATH" "${BUILD_ARGS[@]}" .
5 changes: 5 additions & 0 deletions tpu/config.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# TODO(b/213335159): Use ci-pretest for BASE_IMAGE_TAG once stable.
BASE_IMAGE_TAG=v108
LIBTPU_IMAGE_TAG=libtpu_1.1.0_RC00
TENSORFLOW_VERSION=2.8.0
TORCH_VERSION=1.11.0

0 comments on commit cdba7d1

Please sign in to comment.