Skip to content

Commit

Permalink
update docker + readme
Browse files Browse the repository at this point in the history
  • Loading branch information
sdtblck committed Oct 17, 2021
1 parent a077464 commit 1eff7a2
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 9 deletions.
8 changes: 7 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ RUN apt-get update -y && \
git python3 python3-dev libpython3-dev python3-pip sudo pdsh \
htop llvm-9-dev tmux zstd software-properties-common build-essential autotools-dev \
nfs-common pdsh cmake g++ gcc curl wget vim less unzip htop iftop iotop ca-certificates ssh \
rsync iputils-ping net-tools libcupti-dev && \
rsync iputils-ping net-tools libcupti-dev nano && \
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
update-alternatives --install /usr/bin/pip pip /usr/bin/pip3 1 && \
pip install --upgrade pip && \
Expand All @@ -29,6 +29,7 @@ RUN echo 'password' >> password.txt && \
echo "Set disable_coredump false" >> /etc/sudo.conf && \
# Clean up
rm password.txt

# Expose SSH port
EXPOSE 22

Expand All @@ -46,9 +47,11 @@ RUN mkdir -p /build && \
test -f /usr/local/mpi/bin/mpic++ && \
cd ~ && \
rm -rf /build

# Needs to be in docker PATH if compiling other items & bashrc PATH (later)
ENV PATH=/usr/local/mpi/bin:${PATH} \
LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}

# Create a wrapper for OpenMPI to allow running as root by default
RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
Expand Down Expand Up @@ -76,6 +79,9 @@ COPY requirements/requirements-onebitadam.txt .
COPY requirements/requirements-sparseattention.txt .
RUN pip install -r requirements.txt && pip install -r requirements-onebitadam.txt && pip install -r requirements-sparseattention.txt && pip cache purge

# install fused kernels
RUN sudo python setup.py install

## Install APEX
RUN pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@a651e2c24ecf97cbf367fd3f330df36760e1c597

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ Coming soon: a colab notebook for trying out the model.
First make sure you are in an environment with Python 3.8 or later and `torch>=1.8` installed. Then run `pip install -r requirements/requirements.txt`.
You may need to change the version of `cupy-cudaxxx` to match your machine's CUDA version.

Some features rely on apex, which you can install with the command below:
NVIDIA's apex is an optional extra (used only for FusedAdam, which may offer some performance improvement):

```bash
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" git+https://github.com/NVIDIA/apex.git@e2083df5eb96643c61613b9df48dd4eea6b07690
Expand Down
15 changes: 8 additions & 7 deletions megatron/neox_arguments/neox_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ class NeoXArgsModel(NeoXArgsTemplate):
Maximum number of position embeddings to use. This is the size of position embedding.
"""

norm: Literal["layernorm", "rmsnorm", "scalenorm", "apexlayernorm"] = "layernorm"
norm: Literal["layernorm", "rmsnorm", "scalenorm"] = "layernorm"
"""
Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm", "apexlayernorm".
Normalization layer to use. Choose from "layernorm", "rmsnorm", "scalenorm".
"""

layernorm_epsilon: float = 1.0e-5
Expand All @@ -124,7 +124,9 @@ class NeoXArgsModel(NeoXArgsTemplate):
Scalenorm epsilon
"""

pos_emb: Literal['learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'] = "learned"
pos_emb: Literal[
"learned", "rotary", "sinusoidal", "rpe", "alibi", "none"
] = "learned"
"""
Type of positional embedding to use - choose from 'learned', 'rotary', 'sinusoidal', 'rpe', 'alibi', 'none'
"""
Expand Down Expand Up @@ -205,7 +207,7 @@ class NeoXArgsModel(NeoXArgsTemplate):
Pad the vocab size to be divisible by this value. This is added for computational efficiency reasons.
"""

activation : Literal["gelu", "geglu", "relu", "softsign", "swish", "mish"] = "gelu"
activation: Literal["gelu", "geglu", "relu", "softsign", "swish", "mish"] = "gelu"
"""
Activation function to use - choose from ["gelu", "geglu", "relu", "softsign", "swish", "mish"]
"""
Expand Down Expand Up @@ -301,7 +303,7 @@ class NeoXArgsModel(NeoXArgsTemplate):
If None - gmlp model doesn't use attention.
"""

gpt_j_residual : bool = False
gpt_j_residual: bool = False
"""
If false, we use the conventional residual path:
x = x + attn(ln1(x))
Expand All @@ -310,7 +312,7 @@ class NeoXArgsModel(NeoXArgsTemplate):
x = ln(x)
x = x + attn(x) + mlp(x)
"""

soft_prompt_tuning: dict = None
"""
Dictionary configuring the soft prompt tuning parameters.
Expand Down Expand Up @@ -487,7 +489,6 @@ class NeoXArgsLogging(NeoXArgsTemplate):
"""



@dataclass
class NeoXArgsOther(NeoXArgsTemplate):
"""
Expand Down

0 comments on commit 1eff7a2

Please sign in to comment.