Skip to content

Commit

Permalink
Pull Plasma from Apache Arrow and remove Plasma store from Ray. (ray-…
Browse files Browse the repository at this point in the history
…project#692)

* Rebase Ray on top of Plasma in Apache Arrow

* add thirdparty building scripts

* use rebased arrow

* fix

* fix build

* fix python visibility

* comment out C tests for now

* fix multithreading

* fix

* reduce logging

* fix plasma manager multithreading

* make sure old and new object IDs can coexist peacefully

* more rebasing

* update

* fixes

* fix

* install pyarrow

* install cython

* fix

* install newer cmake

* fix

* rebase on top of latest arrow

* getting runtest.py run locally (needed to comment out a test for that to work)

* work on plasma tests

* more fixes

* fix local scheduler tests

* fix global scheduler test

* more fixes

* fix python 3 bytes vs string

* fix manager tests valgrind

* fix documentation building

* fix linting

* fix c++ linting

* fix linting

* add tests back in

* Install without sudo.

* Set PKG_CONFIG_PATH in build.sh so that Ray can find plasma.

* Install pkg-config

* Link -lpthread, note that find_package(Threads) doesn't seem to work reliably.

* Comment in testGPUIDs in runtest.py.

* Set PKG_CONFIG_PATH when building pyarrow.

* Pull apache/arrow and not pcmoritz/arrow.

* Fix installation in docker image.

* adapt to changes of the plasma api

* Fix installation of pyarrow module.

* Fix linting.

* Use correct python executable to build pyarrow.
  • Loading branch information
pcmoritz authored and robertnishihara committed Aug 1, 2017
1 parent dfcd399 commit c3b39b4
Show file tree
Hide file tree
Showing 64 changed files with 474 additions and 5,765 deletions.
16 changes: 8 additions & 8 deletions .travis/install-dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@ fi

if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y cmake build-essential autoconf curl libtool python-dev python-numpy python-pip libboost-all-dev unzip
sudo apt-get install -y cmake pkg-config build-essential autoconf curl libtool python-dev python-numpy python-pip libboost-all-dev unzip
# Install miniconda.
wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install numpy cloudpickle funcsigs click colorama psutil redis tensorflow flatbuffers
pip install numpy cloudpickle cython cmake funcsigs click colorama psutil redis tensorflow flatbuffers
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y cmake python-dev python-numpy build-essential autoconf curl libtool libboost-all-dev unzip
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool libboost-all-dev unzip
# Install miniconda.
wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install numpy cloudpickle funcsigs click colorama psutil redis tensorflow flatbuffers
pip install numpy cloudpickle cython cmake funcsigs click colorama psutil redis tensorflow flatbuffers
elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
Expand All @@ -43,12 +43,12 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
echo "Updating brew."
brew update
fi
brew install cmake automake autoconf libtool boost
brew install cmake pkg-config automake autoconf libtool boost
# Install miniconda.
wget https://repo.continuum.io/miniconda/Miniconda2-latest-MacOSX-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install numpy cloudpickle funcsigs click colorama psutil redis tensorflow flatbuffers
pip install numpy cloudpickle cython cmake funcsigs click colorama psutil redis tensorflow flatbuffers
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
Expand All @@ -59,12 +59,12 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
echo "Updating brew."
brew update
fi
brew install cmake automake autoconf libtool boost
brew install cmake pkg-config automake autoconf libtool boost
# Install miniconda.
wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install numpy cloudpickle funcsigs click colorama psutil redis tensorflow flatbuffers
pip install numpy cloudpickle cython cmake funcsigs click colorama psutil redis tensorflow flatbuffers
elif [[ "$LINT" == "1" ]]; then
sudo apt-get update
sudo apt-get install -y cmake build-essential autoconf curl libtool libboost-all-dev unzip
Expand Down
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@ cmake_minimum_required(VERSION 2.8)

project(ray)

set(ARROW_DIR "${CMAKE_CURRENT_LIST_DIR}/src/thirdparty/arrow/"
CACHE STRING "Path of the arrow source directory")

include_directories("${ARROW_DIR}/cpp/src/")

add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/common/)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/plasma/)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src/local_scheduler/)
Expand Down
25 changes: 21 additions & 4 deletions build.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
#!/usr/bin/env bash

set -x

# Cause the script to exit if a single command fails.
set -e

ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)

if [[ -z "$1" ]]; then
PYTHON_EXECUTABLE=`which python`
else
PYTHON_EXECUTABLE=$1
fi
echo "Using Python executable $PYTHON_EXECUTABLE."

# Determine how many parallel jobs to use for make based on the number of cores
unamestr="$(uname)"
if [[ "$unamestr" == "Linux" ]]; then
Expand All @@ -20,17 +29,25 @@ pushd "$ROOT_DIR/src/common/thirdparty/"
bash build-redis.sh
popd

bash "$ROOT_DIR/src/numbuf/thirdparty/download_thirdparty.sh"
bash "$ROOT_DIR/src/numbuf/thirdparty/build_thirdparty.sh"
bash "$ROOT_DIR/src/thirdparty/download_thirdparty.sh"
bash "$ROOT_DIR/src/thirdparty/build_thirdparty.sh" $PYTHON_EXECUTABLE

# Now build everything.
pushd "$ROOT_DIR/python/ray/core"
# We use these variables to set PKG_CONFIG_PATH, which is important so that
# in cmake, pkg-config can find plasma.
TP_DIR=$ROOT_DIR/src/thirdparty
ARROW_HOME=$TP_DIR/arrow/cpp/build/cpp-install
if [ "$VALGRIND" = "1" ]
then
cmake -DCMAKE_BUILD_TYPE=Debug ../../..
PKG_CONFIG_PATH=$ARROW_HOME/lib/pkgconfig cmake -DCMAKE_BUILD_TYPE=Debug ../../..
else
cmake -DCMAKE_BUILD_TYPE=Release ../../..
PKG_CONFIG_PATH=$ARROW_HOME/lib/pkgconfig cmake -DCMAKE_BUILD_TYPE=Release ../../..
fi
make clean
make -j${PARALLEL}
popd

# Move stuff from Arrow to Ray.

mv $ROOT_DIR/src/thirdparty/arrow/cpp/build/release/plasma_store $ROOT_DIR/python/ray/core/src/plasma/
4 changes: 3 additions & 1 deletion doc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

# These lines added to enable Sphinx to work without installing Ray.
import mock
MOCK_MODULES = ["ray.numbuf",
MOCK_MODULES = ["pyarrow",
"pyarrow.plasma",
"ray.numbuf",
"ray.local_scheduler",
"ray.plasma",
"ray.core.generated.TaskInfo",
Expand Down
2 changes: 1 addition & 1 deletion doc/source/install-on-macosx.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ To build Ray, first install the following dependencies. We recommend using
.. code-block:: bash
brew update
brew install cmake automake autoconf libtool boost wget
brew install cmake pkg-config automake autoconf libtool boost wget
pip install numpy cloudpickle funcsigs click colorama psutil redis flatbuffers --ignore-installed six
Expand Down
2 changes: 1 addition & 1 deletion doc/source/install-on-ubuntu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ To build Ray, first install the following dependencies. We recommend using
.. code-block:: bash
sudo apt-get update
sudo apt-get install -y cmake build-essential autoconf curl libtool libboost-all-dev unzip
sudo apt-get install -y cmake pkg-config build-essential autoconf curl libtool libboost-all-dev unzip
# If you are not using Anaconda, you need the following.
sudo apt-get install python-dev # For Python 2.
Expand Down
2 changes: 1 addition & 1 deletion docker/base-deps/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
FROM ubuntu:xenial
RUN apt-get update \
&& apt-get install -y vim git wget \
&& apt-get install -y cmake build-essential autoconf curl libtool libboost-all-dev unzip
&& apt-get install -y cmake pkg-config build-essential autoconf curl libtool libboost-all-dev unzip
RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh \
&& wget --quiet 'https://repo.continuum.io/archive/Anaconda2-4.2.0-Linux-x86_64.sh' -O /tmp/anaconda.sh \
&& /bin/bash /tmp/anaconda.sh -b -p /opt/conda \
Expand Down
2 changes: 1 addition & 1 deletion python/ray/experimental/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def task_profiles(self, start=None, end=None, num_tasks=None, fwd=True):
**params)

for (event, score) in event_list:
event_dict = json.loads(event)
event_dict = json.loads(event.decode())
task_id = ""
for event in event_dict:
if "task_id" in event[3]:
Expand Down
22 changes: 12 additions & 10 deletions python/ray/global_scheduler/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import time
import unittest

import pyarrow as pa
import ray.global_scheduler as global_scheduler
import ray.local_scheduler as local_scheduler
import ray.plasma as plasma
Expand Down Expand Up @@ -87,8 +88,8 @@ def setUp(self):
self.plasma_manager_pids.append(p3)
plasma_address = "{}:{}".format(self.node_ip_address,
plasma_manager_port)
plasma_client = plasma.PlasmaClient(plasma_store_name,
plasma_manager_name)
plasma_client = pa.plasma.connect(plasma_store_name,
plasma_manager_name, 64)
self.plasma_clients.append(plasma_client)
# Start the local scheduler.
local_scheduler_name, p4 = local_scheduler.start_local_scheduler(
Expand Down Expand Up @@ -203,9 +204,10 @@ def test_integration_single_task(self):
# Sleep before submitting task to local scheduler.
time.sleep(0.1)
# Submit a task to Redis.
task = local_scheduler.Task(random_driver_id(), random_function_id(),
[local_scheduler.ObjectID(object_dep)],
num_return_vals[0], random_task_id(), 0)
task = local_scheduler.Task(
random_driver_id(), random_function_id(),
[local_scheduler.ObjectID(object_dep.binary())],
num_return_vals[0], random_task_id(), 0)
self.local_scheduler_clients[0].submit(task)
time.sleep(0.1)
# There should now be a task in Redis, and it should get assigned to
Expand Down Expand Up @@ -256,11 +258,11 @@ def integration_many_tasks_helper(self, timesync=True):
# Give 10ms for object info handler to fire (long enough to
# yield CPU).
time.sleep(0.010)
task = local_scheduler.Task(random_driver_id(),
random_function_id(),
[local_scheduler.ObjectID(object_dep)],
num_return_vals[0], random_task_id(),
0)
task = local_scheduler.Task(
random_driver_id(),
random_function_id(),
[local_scheduler.ObjectID(object_dep.binary())],
num_return_vals[0], random_task_id(), 0)
self.local_scheduler_clients[0].submit(task)
# Check that there are the correct number of tasks in Redis and that
# they all get assigned to the local scheduler.
Expand Down
27 changes: 14 additions & 13 deletions python/ray/local_scheduler/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import ray.local_scheduler as local_scheduler
import ray.plasma as plasma
import pyarrow as pa

USE_VALGRIND = False
ID_SIZE = 20
Expand Down Expand Up @@ -41,8 +42,7 @@ class TestLocalSchedulerClient(unittest.TestCase):
def setUp(self):
# Start Plasma store.
plasma_store_name, self.p1 = plasma.start_plasma_store()
self.plasma_client = plasma.PlasmaClient(plasma_store_name,
release_delay=0)
self.plasma_client = pa.plasma.connect(plasma_store_name, "", 0)
# Start a local scheduler.
scheduler_name, self.p2 = local_scheduler.start_local_scheduler(
plasma_store_name, use_valgrind=USE_VALGRIND)
Expand Down Expand Up @@ -72,8 +72,8 @@ def test_submit_and_get_task(self):
# Create and seal the objects in the object store so that we can
# schedule all of the subsequent tasks.
for object_id in object_ids:
self.plasma_client.create(object_id.id(), 0)
self.plasma_client.seal(object_id.id())
self.plasma_client.create(pa.plasma.ObjectID(object_id.id()), 0)
self.plasma_client.seal(pa.plasma.ObjectID(object_id.id()))
# Define some arguments to use for the tasks.
args_list = [
[],
Expand Down Expand Up @@ -153,8 +153,8 @@ def get_task():
time.sleep(0.1)
# Create and seal the object ID in the object store. This should
# trigger a scheduling event.
self.plasma_client.create(object_id.id(), 0)
self.plasma_client.seal(object_id.id())
self.plasma_client.create(pa.plasma.ObjectID(object_id.id()), 0)
self.plasma_client.seal(pa.plasma.ObjectID(object_id.id()))
# Wait until the thread finishes so that we know the task was
# scheduled.
t.join()
Expand All @@ -175,8 +175,8 @@ def get_task():
t.start()

# Make one of the dependencies available.
buf = self.plasma_client.create(object_id1.id(), 1)
self.plasma_client.seal(object_id1.id())
buf = self.plasma_client.create(pa.plasma.ObjectID(object_id1.id()), 1)
self.plasma_client.seal(pa.plasma.ObjectID(object_id1.id()))
# Release the object.
del buf
# Check that the thread is still waiting for a task.
Expand All @@ -188,23 +188,24 @@ def get_task():
time.sleep(0.1)
self.assertTrue(t.is_alive())
# Check that the first object dependency was evicted.
object1 = self.plasma_client.get([object_id1.id()], timeout_ms=0)
object1 = self.plasma_client.get([pa.plasma.ObjectID(object_id1.id())],
timeout_ms=0)
self.assertEqual(object1, [None])
# Check that the thread is still waiting for a task.
time.sleep(0.1)
self.assertTrue(t.is_alive())

# Create the second dependency.
self.plasma_client.create(object_id2.id(), 1)
self.plasma_client.seal(object_id2.id())
self.plasma_client.create(pa.plasma.ObjectID(object_id2.id()), 1)
self.plasma_client.seal(pa.plasma.ObjectID(object_id2.id()))
# Check that the thread is still waiting for a task.
time.sleep(0.1)
self.assertTrue(t.is_alive())

# Create the first dependency again. Both dependencies are now
# available.
self.plasma_client.create(object_id1.id(), 1)
self.plasma_client.seal(object_id1.id())
self.plasma_client.create(pa.plasma.ObjectID(object_id1.id()), 1)
self.plasma_client.seal(pa.plasma.ObjectID(object_id1.id()))

# Wait until the thread finishes so that we know the task was
# scheduled.
Expand Down
9 changes: 2 additions & 7 deletions python/ray/plasma/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,8 @@
from __future__ import division
from __future__ import print_function

from ray.plasma.plasma import (PlasmaBuffer, buffers_equal, PlasmaClient,
start_plasma_store, start_plasma_manager,
plasma_object_exists_error,
plasma_out_of_memory_error,
from ray.plasma.plasma import (start_plasma_store, start_plasma_manager,
DEFAULT_PLASMA_STORE_MEMORY)

__all__ = ["PlasmaBuffer", "buffers_equal", "PlasmaClient",
"start_plasma_store", "start_plasma_manager",
"plasma_object_exists_error", "plasma_out_of_memory_error",
__all__ = ["start_plasma_store", "start_plasma_manager",
"DEFAULT_PLASMA_STORE_MEMORY"]
Loading

0 comments on commit c3b39b4

Please sign in to comment.