[Train] Llama 2 workspace template release tests #37871

Merged 51 commits on Jul 28, 2023
Commits (51, all by kouroshHakha)

3da40e9  [Train] LLM fine-tuning workspace template fix custom resources (#37745)  (Jul 25, 2023)
c259150  added llama-2 70b scripts  (Jul 26, 2023)
cf999c4  wip  (Jul 26, 2023)
4d39318  Merge branch 'master' of github.com:ray-project/ray into llama-70b-ft  (Jul 26, 2023)
dc25e03  added release tests for 7 and 13B  (Jul 26, 2023)
16ae886  updated README  (Jul 26, 2023)
12f4b37  updated readme  (Jul 26, 2023)
16be8c1  wip  (Jul 26, 2023)
d88f97a  updated scripts  (Jul 26, 2023)
7485d3f  wip  (Jul 26, 2023)
055b065  wip  (Jul 26, 2023)
88cac09  better lamma-7b settings  (Jul 26, 2023)
32def4f  1. Fix readme typo, 2. fixed evaluation  (Jul 26, 2023)
485168c  fixed typo in release tests  (Jul 26, 2023)
9744a13  update readme  (Jul 26, 2023)
4055100  updating cluster_end  (Jul 26, 2023)
cc709dc  fixing release test  (Jul 27, 2023)
159241b  temp changing concurrency group  (Jul 27, 2023)
db52ef6  test the shell changes  (Jul 27, 2023)
53d5514  added other shells  (Jul 27, 2023)
05b0a0f  reverting activating release tests  (Jul 27, 2023)
57f402c  reverting concurrency  (Jul 27, 2023)
4b46723  lint  (Jul 27, 2023)
c6cef39  updated docker  (Jul 27, 2023)
6929bb8  reverting the random stuff  (Jul 27, 2023)
aa673cc  lint  (Jul 27, 2023)
45b59a7  update the shell to one  (Jul 27, 2023)
f61b019  code format  (Jul 27, 2023)
b30c4e4  format  (Jul 27, 2023)
e7d6c0e  Revert "reverting activating release tests"  (Jul 27, 2023)
f8330d2  Revert "reverting the random stuff"  (Jul 27, 2023)
25c7db9  Revert "reverting concurrency"  (Jul 27, 2023)
270facd  moved the testing cluster env  (Jul 27, 2023)
04c974d  removed cloud ids from the compute configs  (Jul 27, 2023)
e0481e5  added testing compute configs that include cloud_ids  (Jul 27, 2023)
49f2477  compute configs repointed  (Jul 27, 2023)
1dacda9  Merge branch 'master' into llama-2-release-test  (Jul 27, 2023)
35f76ba  white space removal  (Jul 27, 2023)
59b927a  testing the path stuff  (Jul 27, 2023)
558213e  byod switching  (Jul 27, 2023)
ae2d09e  updated the compiled byod stuff  (Jul 27, 2023)
a7192bb  wip  (Jul 28, 2023)
54c70c9  wip  (Jul 28, 2023)
5702c96  wip  (Jul 28, 2023)
6100736  lint  (Jul 28, 2023)
6f1a5e6  wip  (Jul 28, 2023)
b61a352  wip  (Jul 28, 2023)
49f0b13  wip  (Jul 28, 2023)
2035342  reverting concurrency  (Jul 28, 2023)
123b77e  1. cu117->cu118 2. team: train->ml  (Jul 28, 2023)
fc680c9  lint  (Jul 28, 2023)
Files changed
@@ -517,7 +517,8 @@ def main():
         "env_vars": {
             "HF_HOME": "/mnt/local_storage/.cache/huggingface",
             "TUNE_RESULT_DIR": os.environ["TUNE_RESULT_DIR"],
-        }
+        },
+        "working_dir": ".",
     }
 )
@@ -63,7 +63,10 @@ def download_model_files_on_all_nodes(hf_model_id: str):
 if __name__ == "__main__":

     ray.init(
-        runtime_env={"env_vars": {"HF_HOME": "/mnt/local_storage/.cache/huggingface"}}
+        runtime_env={
+            "env_vars": {"HF_HOME": "/mnt/local_storage/.cache/huggingface"},
+            "working_dir": ".",
+        }
     )

     pargs = _parse_args()
@@ -37,7 +37,7 @@ def run(cmd: str):
     parser.add_argument("args", nargs="*", type=str, help="string args to function")
     args = parser.parse_args()

-    ray.init()
+    ray.init(runtime_env={"working_dir": "."})
     if args.function not in globals():
         raise ValueError(f"{args.function} doesn't exist")
     fn = globals()[args.function]
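
For context on what the working_dir changes above do: setting working_dir in the runtime_env tells Ray to upload the local project directory to the cluster and use it as the working directory for every task and actor, so code that reads files by relative path behaves the same on remote nodes as it does locally. A minimal sketch of the behavior (not code from this PR; the file name is hypothetical):

import ray

# Upload the current directory to the cluster; each worker runs inside the
# uploaded copy, so relative paths resolve identically on every node.
ray.init(runtime_env={"working_dir": "."})

@ray.remote
def read_local_file() -> str:
    # train_config.yaml stands in for any file shipped alongside the script.
    with open("train_config.yaml") as f:
        return f.read()

print(ray.get(read_local_file.remote()))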
@@ -1,12 +1,9 @@
 # See https://hub.docker.com/r/anyscale/ray for full list of
 # available Ray, Python, and CUDA versions.
-base_image: "anyscale/ray:2.6.1-py39-cu117"
+base_image: anyscale/ray:nightly-py39-cu118

 env_vars: {}

-debian_packages: [
-  libaio1
-]
+debian_packages:
+  - libaio1

 python:
   pip_packages: [
@@ -30,4 +27,7 @@ python:
   ]
   conda_packages: []

-post_build_cmds: []
+post_build_cmds:
+  # Install Ray
+  - pip3 uninstall -y ray || true && pip3 install -U {{ env["RAY_WHEELS"] | default("ray") }}
+  - {{ env["RAY_WHEELS_SANITY_CHECK"] | default("echo No Ray wheels sanity check") }}
@@ -0,0 +1,23 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
name: head_node_type
instance_type: g5.48xlarge
resources:
custom_resources:
large_cpu_mem: 1

worker_node_types:
- name: gpu_worker
instance_type: g5.48xlarge
min_workers: 3
max_workers: 3
use_spot: false

aws:
TagSpecifications:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Collaborator:

nit: new line
@@ -0,0 +1,29 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
name: head_node_type
instance_type: g5.48xlarge
resources:
custom_resources:
large_cpu_mem: 1

worker_node_types:
- name: large_gpu_worker
instance_type: g5.48xlarge
min_workers: 2
max_workers: 2
use_spot: false

- name: medium_gpu_worker
instance_type: g5.24xlarge
min_workers: 2
max_workers: 2
use_spot: false

aws:
TagSpecifications:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
Collaborator:

nit: new line
@@ -0,0 +1,27 @@
# 1 g5.16xlarge + 15 g5.4xlarge --> 16 GPUs, 256G RAM on trainer and 64G RAM on workers
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west-2

head_node_type:
name: head_node
instance_type: g5.16xlarge
resources:
custom_resources:
large_cpu_mem: 1

worker_node_types:
- name: worker_node
instance_type: g5.4xlarge
min_workers: 15
max_workers: 15
use_spot: false
resources:
custom_resources:
medium_cpu_mem: 1

aws:
TagSpecifications:
- ResourceType: "instance"
Tags:
- Key: ttl-hours
Value: '24'
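
The large_cpu_mem and medium_cpu_mem entries in these compute configs are Ray custom resources: each node type advertises the label, and any task or actor that requests it is scheduled only onto a matching node. A minimal sketch of how a script can target them (not code from this PR; the function is hypothetical):

import ray

ray.init()

# Requesting a sliver of large_cpu_mem pins this task to the head node,
# the only node type above that advertises that custom resource.
@ray.remote(num_cpus=1, resources={"large_cpu_mem": 0.001})
def run_on_big_memory_node() -> str:
    import socket
    return socket.gethostname()

print(ray.get(run_on_big_memory_node.remote()))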
@@ -0,0 +1,21 @@
cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
region: us-west1
allowed_azs:
- us-west1-b

head_node_type:
name: head_node_type
instance_type: n1-highmem-64-nvidia-k80-12gb-1
resources:
custom_resources:
large_cpu_mem: 1

worker_node_types:
- name: gpu_worker
instance_type: n1-standard-16-nvidia-k80-12gb-1
min_workers: 15
max_workers: 15
use_spot: false
resources:
custom_resources:
medium_cpu_mem: 1
release/ray_release/byod/requirements_debian_byod.txt (2 additions, 1 deletion)
@@ -10,4 +10,5 @@ libjemalloc-dev
 libosmesa6-dev
 patchelf
 unzip
-zip
+zip
+libaio1
release/ray_release/byod/requirements_ml_byod_3.9.in (8 additions)
@@ -13,3 +13,11 @@ transformers
 torch
 torchtext
 torchvision
+bitsandbytes
+wandb
+pytorch-lightning
+protobuf<3.21.0
+torchmetrics
+lm_eval
+tiktoken
+sentencepiece
Contributor (on lines +16 to +23):

@can-anyscale So, are all release tests running with the same Docker image now? If I want to add a package to one release test, do I need to add it to the common BYOD requirements file? Is there a doc where I can read about this?

Contributor Author:

Yes, though on a PR it would still use your specified cluster_env; builds from master currently use the BYOD image. @can-anyscale will deprecate the use of cluster envs soon for both PRs and master builds.
BYOD has a couple of benefits:

  1. You don't have to build the envs when launching release tests, which means a faster time to failure, if any.
  2. It makes tests more reliable because versions stay consistent. The best-case scenario is if we don't pin anything in the BYOD requirements files.
