Skip to content

Commit

Permalink
Make some fixes to long running stress tests. (#5056)
Browse files Browse the repository at this point in the history
  • Loading branch information
robertnishihara authored and pcmoritz committed Jun 28, 2019
1 parent 4ccb7b0 commit bcc3795
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 19 deletions.
15 changes: 11 additions & 4 deletions ci/long_running_tests/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,17 @@ intended to run forever until they fail.
Running the Workloads
---------------------

To run the workloads, run ``./start_workloads.sh``. This will start one EC2
instance per workload and will start the workloads running (one per instance).
Running the ``./start_workloads.sh`` script again will clean up any state from
the previous runs and will start the workloads again.
To run the workloads, run

.. code-block:: bash

    ./start_workloads.sh <ray-branch> <ray-version> <ray-commit>

using the appropriate values of ``<ray-branch>``, ``<ray-version>``, and
``<ray-commit>``. This will start one EC2 instance per workload and will start
the workloads running (one per instance). Running the ``./start_workloads.sh``
script again will clean up any state from the previous runs and will start the
workloads again.

Check Workload Statuses
-----------------------
Expand Down
11 changes: 6 additions & 5 deletions ci/long_running_tests/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,22 +38,23 @@ worker_nodes:

# List of shell commands to run to set up nodes.
setup_commands:
- sudo apt-get update
- sudo apt-get install -y build-essential curl unzip
# Install Anaconda.
- wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true
- bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true
- echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc
- echo 'termcapinfo xterm* ti@:te@' >> ~/.screenrc
# Some Python dependencies.
- pip install boto3==1.4.8 cython==0.29.0
# # Uncomment the following if you wish to install Ray instead.
# - sudo apt-get update
# - sudo apt-get install -y build-essential curl unzip
# Uncomment the following if you wish to build and install Ray from source
# instead of installing the prebuilt wheel below.
# - git clone https://github.com/ray-project/ray || true
# - ray/ci/travis/install-bazel.sh
# - cd ray/python; git checkout master; git pull; pip install -e . --verbose
# Install the Ray wheel built for the requested branch, commit, and version.
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/<<RAY_RELEASE_HASH>>/ray-<<RAY_RELEASE_VERSION>>-cp36-cp36m-manylinux1_x86_64.whl
- pip install ray[rllib] ray[debug] tensorflow
- wget https://s3-us-west-2.amazonaws.com/ray-wheels/<<<RAY_BRANCH>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
- pip install ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl[rllib,debug]
- pip install tensorflow
- pip install -U dask # fix error importing lz4

# Custom commands that will be run on the head node after common setup.
Expand Down
52 changes: 42 additions & 10 deletions ci/long_running_tests/start_workloads.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,60 @@ set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

# Validate the three required positional arguments (branch, version, commit).
# Each one is substituted into the cluster config to build the wheel URL, so
# the script refuses to continue if any of them is missing or empty.
# Diagnostics go to stderr so they are not mixed into regular output.
if [[ -z "$1" ]]; then
  echo "ERROR: The first argument must be the Ray branch to test (e.g., 'master')." >&2
  exit 1
fi
RAY_BRANCH=$1

if [[ -z "$2" ]]; then
  echo "ERROR: The second argument must be the Ray version to test (e.g., '0.8.0.dev1')." >&2
  exit 1
fi
RAY_VERSION=$2

if [[ -z "$3" ]]; then
  echo "ERROR: The third argument must be the Ray commit to test (e.g., '62e4b591e3d6443ce25b0f05cc32b43d5e2ebb3d')." >&2
  exit 1
fi
RAY_COMMIT=$3

echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT."
# The wheel path is <branch>/<commit>/, matching the URL in config.yaml; the
# version only appears in the wheel file name.
echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/$RAY_BRANCH/$RAY_COMMIT/"


pushd "$ROOT_DIR"

# Escape the characters that are special on the replacement side of a sed
# 's|...|...|' command: backslash, '&' (whole-match reference), and the '|'
# delimiter itself.
# Arguments: $1 - raw replacement text
# Outputs:   escaped text on stdout
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

# Substitute the Ray branch, version, and commit placeholders in a config
# template and write the result to a new file.
# Globals:   RAY_BRANCH, RAY_VERSION, RAY_COMMIT (read)
# Arguments: $1 - template path, $2 - output path
# Returns:   sed's exit status
render_cluster_config() {
  local template=$1
  local output=$2
  local branch version commit
  branch=$(sed_escape_replacement "$RAY_BRANCH")
  version=$(sed_escape_replacement "$RAY_VERSION")
  commit=$(sed_escape_replacement "$RAY_COMMIT")
  # '|' is used as the delimiter so that branch names containing '/'
  # (e.g. 'releases/0.7.2') do not terminate the sed expression early.
  sed -e "s|<<<RAY_BRANCH>>>|$branch|g" \
      -e "s|<<<RAY_VERSION>>>|$version|g" \
      -e "s|<<<RAY_COMMIT>>>|$commit|g" \
      "$template" > "$output"
}

# Substitute in the appropriate Ray branch, version, and commit in the config
# file and store it in a temporary file.
CLUSTER_CONFIG="config_temporary.yaml"
render_cluster_config config.yaml "$CLUSTER_CONFIG"

# Start one instance per workload.
for workload_file in "$ROOT_DIR"/workloads/*; do
file_name=$(basename -- $workload_file)
file_name=$(basename -- "$workload_file")
workload_name="${file_name%.*}"
ray up -y config.yaml --cluster-name="$workload_name" &
ray up -y $CLUSTER_CONFIG --cluster-name="$workload_name" &
done
# Wait for all of the nodes to be up.
wait

# Start the workloads running.
for workload_file in "$ROOT_DIR"/workloads/*; do
file_name=$(basename -- $workload_file)
file_name=$(basename -- "$workload_file")
workload_name="${file_name%.*}"
(
# Copy the workload to the cluster.
ray rsync_up config.yaml --cluster-name="$workload_name" "$workload_file" "$file_name"
ray rsync_up $CLUSTER_CONFIG --cluster-name="$workload_name" "$workload_file" "$file_name"
# Clean up previous runs if relevant.
ray exec config.yaml --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server | true"
ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server | true"
# Start the workload.
ray exec config.yaml --cluster-name="$workload_name" "python $file_name" --tmux
ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "python $file_name" --tmux
) &
done
# Wait for child processes to finish.
Expand All @@ -41,9 +73,9 @@ echo ""
echo "To kill the instances, use the following commands."
echo ""
for workload_file in "$ROOT_DIR"/workloads/*; do
file_name=$(basename -- $workload_file)
file_name=$(basename -- "$workload_file")
workload_name="${file_name%.*}"
echo " ray down -y $ROOT_DIR/config.yaml --cluster-name=$workload_name"
echo " ray down -y $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name"
done

echo ""
Expand All @@ -52,9 +84,9 @@ echo ""
echo "Use the following commands to attach to the relevant drivers."
echo ""
for workload_file in "$ROOT_DIR"/workloads/*; do
file_name=$(basename -- $workload_file)
file_name=$(basename -- "$workload_file")
workload_name="${file_name%.*}"
echo " ray attach $ROOT_DIR/config.yaml --cluster-name=$workload_name --tmux"
echo " ray attach $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name --tmux"
done

echo ""
Expand Down

0 comments on commit bcc3795

Please sign in to comment.