Merge pull request #1132 from Kaggle/tpu-1vm-image

Get tensorflow, jax, and pytorch working on TPU1VM
Kaggle · May 16, 2022 · cdba7d1 · cdba7d1
2 parents 389cecd + ba96747
commit cdba7d1
Show file tree

Hide file tree

Showing 9 changed files with 158 additions and 9 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -56,6 +56,22 @@ pipeline {
  '''
  }
  }
+ stage('tensorflow TPU') {
+ options {
+ timeout(time: 180, unit: 'MINUTES')
+ }
+ steps {
+ sh '''#!/bin/bash
+ set -exo pipefail
+ source tpu/config.txt
+ cd packages/
+ ./build_package --base-image gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} \
+ --package tpu-tensorflow \
+ --version $TENSORFLOW_VERSION \
+ --push
+ '''
+ }
+ }
  }
  }
  stage('Build/Test/Diff') {
@@ -150,7 +166,34 @@ pipeline {
  }
  }
  }
- } 
+ }
+ stage('TPU VM') {
+ stages {
+ stage('Build Tensorflow TPU Image') {
+ options {
+ timeout(time: 20, unit: 'MINUTES')
+ }
+ steps {
+ sh '''#!/bin/bash
+ set -exo pipefail
+
+ ./tpu/build | ts
+ ./push --tpu ${PRETEST_TAG}
+ '''
+ }
+ }
+ stage('Diff TPU VM Image') {
+ steps {
+ sh '''#!/bin/bash
+ set -exo pipefail
+
+ docker pull gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
+ ./diff --tpu --target gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG}
+ '''
+ }
+ }
+ }
+ }
  }
  }
 
@@ -161,6 +204,7 @@ pipeline {
 
  gcloud container images add-tag gcr.io/kaggle-images/python:${PRETEST_TAG} gcr.io/kaggle-images/python:${STAGING_TAG}
  gcloud container images add-tag gcr.io/kaggle-private-byod/python:${PRETEST_TAG} gcr.io/kaggle-private-byod/python:${STAGING_TAG}
+ gcloud container images add-tag gcr.io/kaggle-private-byod/python-tpuvm:${PRETEST_TAG} gcr.io/kaggle-private-byod/python-tpuvm:${STAGING_TAG}
  '''
  }
  }

diff --git a/diff b/diff
@@ -32,6 +32,10 @@ while :; do
  BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python:latest'
  TARGET_IMAGE_TAG='kaggle/python-gpu-build'
  ;;
+ -x|--tpu)
+ BASE_IMAGE_TAG='gcr.io/kaggle-private-byod/python-tpuvm:latest'
+ TARGET_IMAGE_TAG='kaggle/python-tpuvm-build'
+ ;;
  -b|--base)
  if [[ -z "$2" ]]; then
  usage

diff --git a/packages/build_package b/packages/build_package
@@ -117,6 +117,8 @@ fi
 
 # Keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80` 
 TAG=${BASE_IMAGE/gcr.io\/deeplearning-platform-release\//}
+# Keep only `python:v108` in `gcr.io/kaggle-images/python:v108`
+TAG=${TAG/gcr.io\/kaggle-images\//}
 # Replace the `:` in `tf2-gpu.2-6:m80` by `-`
 TAG=${TAG/:/-}
 # Append the package version

diff --git a/tpu/tensorflow.Dockerfile → packages/tpu-tensorflow.Dockerfile b/tpu/tensorflow.Dockerfile → packages/tpu-tensorflow.Dockerfile
@@ -1,6 +1,8 @@
-ARG BASE_IMAGE_TAG
+ARG BASE_IMAGE
 
-FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} AS builder
+FROM ${BASE_IMAGE} AS builder
+
+ARG PACKAGE_VERSION
 
 # Use Bazelisk to ensure the proper bazel version is used.
 RUN cd /usr/local/src && \
@@ -12,12 +14,12 @@ RUN cd /usr/local/src && \
 RUN cd /usr/local/src && \
  git clone https://github.com/tensorflow/tensorflow && \
  cd tensorflow && \
- git checkout tags/v${TENSORFLOW_VERSION} && \
+ git checkout tags/v${PACKAGE_VERSION} && \
  # TODO(rosbo): Is it really needed?
  pip install keras_applications --no-deps && \
  pip install keras_preprocessing --no-deps
 
-# Create a TensorFlow wheel for CPU
+# Create a TensorFlow wheel for TPU
 RUN cd /usr/local/src/tensorflow && \
  cat /dev/null | ./configure && \
  bazel build \
@@ -32,7 +34,22 @@ RUN cd /usr/local/src/tensorflow && \
 RUN cd /usr/local/src/tensorflow && \
  bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
 
-# TODO(b/152075195): Will likely need to install custom build for TFA & tensorflow-gcs-config
+# Build TensorFlow addons library against TensorFlow CPU.
+#RUN cd /usr/local/src/ && \
+# git clone https://github.com/tensorflow/addons && \
+# cd addons && \
+# git checkout tags/v0.12.1 && \
+# python ./configure.py && \
+# bazel build --enable_runfiles build_pip_pkg && \
+# bazel-bin/build_pip_pkg /tmp/tfa_cpu && \
+# bazel clean
+
+# Build tensorflow_gcs_config library against TensorFlow CPU.
+#ADD tensorflow-gcs-config /usr/local/src/tensorflow_gcs_config/
+#RUN cd /usr/local/src/tensorflow_gcs_config && \
+# apt-get install -y libcurl4-openssl-dev && \
+# python setup.py bdist_wheel -d /tmp/tensorflow_gcs_config && \
+# bazel clean
 
 # Use multi-stage builds to minimize image output size.
 FROM alpine:latest

diff --git a/push b/push
@@ -8,6 +8,7 @@ Push a newly-built image with the given LABEL to gcr.io and DockerHub.
 
 Options:
  -g, --gpu Push the image with GPU support.
+ -t, --tpu Push the image with TPU support.
  -s, --source-image IMAGE Tag for the source image. 
 EOF
 }
@@ -26,6 +27,10 @@ while :; do
  SOURCE_IMAGE_TAG='kaggle/python-gpu-build:latest'
  TARGET_IMAGE='gcr.io/kaggle-private-byod/python'
  ;;
+ -t|--tpu)
+ SOURCE_IMAGE_TAG='kaggle/python-tpuvm-build:latest'
+ TARGET_IMAGE='gcr.io/kaggle-private-byod/python-tpuvm'
+ ;;
  -s|--source-image)
  if [[ -z $2 ]]; then
  usage

diff --git a/tests/common.py b/tests/common.py
@@ -4,3 +4,4 @@
 import unittest
 
 gpu_test = unittest.skipIf(len(os.environ.get('CUDA_VERSION', '')) == 0, 'Not running GPU tests')
+tpu_test = unittest.skipIf(len(os.environ.get('ISTPUVM', '')) == 0, 'Not running TPU tests')
diff --git a/tpu/Dockerfile b/tpu/Dockerfile
@@ -1,13 +1,27 @@
 ARG BASE_IMAGE_TAG
 ARG LIBTPU_IMAGE_TAG
-ARG TENSORFLOW_WHL_IMAGE_TAG
+ARG TENSORFLOW_VERSION
+ARG TORCH_VERSION
 
 FROM gcr.io/cloud-tpu-v2-images/libtpu:${LIBTPU_IMAGE_TAG} as libtpu
-FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:${TENSORFLOW_WHL_IMAGE_TAG} AS tensorflow_whl
+FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:python-${BASE_IMAGE_TAG}-${TENSORFLOW_VERSION} AS tensorflow_whl
 FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}
 
+ENV ISTPUVM=1
+
 COPY --from=libtpu /libtpu.so /lib
 
 COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
 RUN pip install /tmp/tensorflow_pkg/tensorflow*.whl && \
- rm -rf /tmp/tensorflow_pkg
+ rm -rf /tmp/tensorflow_pkg && \
+ /tmp/clean-layer.sh
+
+# Replace the base image's torch with the pinned TORCH_VERSION and install the
+# torch_xla TPU VM wheel built for that same version.
+# https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version
+#
+# An ARG declared before the first FROM is only in scope for FROM instructions.
+# It has to be redeclared inside the stage, otherwise ${TORCH_VERSION} expands
+# to an empty string in the RUN below (`torch==` and a malformed wheel URL).
+ARG TORCH_VERSION
+RUN pip uninstall -y torch && \
+    pip install torch==${TORCH_VERSION} && \
+    pip install "torch_xla[tpuvm]" -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION}-cp37-cp37m-linux_x86_64.whl && \
+    /tmp/clean-layer.sh
+
+# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
+RUN pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
+ /tmp/clean-layer.sh
diff --git a/tpu/build b/tpu/build
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -e
+
+usage() {
+cat << EOF
+Usage: $0 [OPTIONS]
+Build a new Python TPU 1VM Docker image.
+
+Options:
+ -c, --use-cache Use layer cache when building a new image.
+EOF
+}
+
+CACHE_FLAG='--no-cache'
+DOCKERFILE='Dockerfile'
+IMAGE_TAG='kaggle/python-tpuvm-build'
+BUILD_ARGS=''
+
+while :; do
+ case "$1" in 
+ -h|--help)
+ usage
+ exit
+ ;;
+ -c|--use-cache)
+ CACHE_FLAG=''
+ ;;
+ -?*)
+ usage
+ printf 'ERROR: Unknown option: %s\n' "$1" >&2
+ exit
+ ;;
+ *) 
+ break
+ esac
+
+ shift
+done
+
+BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)"
+BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"
+
+# Read build args from config.txt file.
+SRCDIR=$(dirname "${BASH_SOURCE[0]}")
+for l in `cat ${SRCDIR}/config.txt`; do
+ BUILD_ARGS+=" --build-arg $l"
+done
+
+readonly CACHE_FLAG
+readonly DOCKERFILE
+readonly IMAGE_TAG
+readonly BUILD_ARGS
+
+DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"
+
+set -x
+docker build --rm --pull $CACHE_FLAG -t "$IMAGE_TAG" -f "$DOCKERFILE_PATH" $BUILD_ARGS .
diff --git a/tpu/config.txt b/tpu/config.txt
@@ -0,0 +1,5 @@
+# TODO(b/213335159): Use ci-pretest for BASE_IMAGE_TAG once stable.
+BASE_IMAGE_TAG=v108
+LIBTPU_IMAGE_TAG=libtpu_1.1.0_RC00
+TENSORFLOW_VERSION=2.8.0
+TORCH_VERSION=1.11.0