Skip to content

Commit

Permalink
Improve error messages when nodes can't communicate with each other. (#223)
Browse files Browse the repository at this point in the history

* Good error messages when nodes can't communicate with each other

* Print more information when starting the head node.

* Change retries back to 5.
  • Loading branch information
richardliaw authored and pcmoritz committed Jan 22, 2017
1 parent 7151ed5 commit 4575cd8
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 2 deletions.
1 change: 1 addition & 0 deletions python/ray/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ def wait_for_redis_to_start(redis_host, redis_port, num_retries=5):
while counter < num_retries:
try:
# Run some random command and see if it worked.
print("Waiting for redis server at {}:{} to respond...".format(redis_host, redis_port))
redis_client.client_list()
except redis.ConnectionError as e:
# Wait a little bit.
Expand Down
4 changes: 3 additions & 1 deletion python/ray/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,9 @@ def get_address_info_from_redis(redis_address, node_ip_address, num_retries=5):
if counter == num_retries:
raise
# Some of the information may not be in Redis yet, so wait a little bit.
print("Some processes that the driver needs to connect to have not registered with Redis, so retrying.")
print("Some processes that the driver needs to connect to have not "
"registered with Redis, so retrying. Have you run "
"./scripts/start_ray.sh on this node?")
time.sleep(1)
counter += 1

Expand Down
23 changes: 22 additions & 1 deletion scripts/start_ray.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,22 @@ def check_no_existing_redis_clients(node_ip_address, redis_address):
num_workers=args.num_workers,
cleanup=False,
redirect_output=True)
print(address_info)
print("\nStarted Ray with {} workers on this node. A different number of "
"workers can be set with the --num-workers flag (but you have to "
"first terminate the existing cluster). You can add additional nodes "
"to the cluster by calling\n\n"
" ./scripts/start_ray.sh --redis-address {}\n\n"
"from the node you wish to add. You can connect a driver to the "
"cluster from Python by running\n\n"
" import ray\n"
" ray.init(redis_address=\"{}\")\n\n"
"If you have trouble connecting from a different machine, check that "
"your firewall is configured properly. If you wish to terminate the "
"processes that have been started, run\n\n"
" ./scripts/stop_ray.sh".format(args.num_workers,
address_info["redis_address"],
address_info["redis_address"]))
else:
# Start Ray on a non-head node.
if args.redis_address is None:
Expand All @@ -74,4 +90,9 @@ def check_no_existing_redis_clients(node_ip_address, redis_address):
num_workers=args.num_workers,
cleanup=False,
redirect_output=True)
print(address_info)
print(address_info)
print("\nStarted {} workers on this node. A different number of workers "
"can be set with the --num-workers flag (but you have to first "
"terminate the existing cluster). If you wish to terminate the "
"processes that have been started, run\n\n"
" ./scripts/stop_ray.sh".format(args.num_workers))

0 comments on commit 4575cd8

Please sign in to comment.