This repository has been archived by the owner on Aug 11, 2022. It is now read-only.

Commit

samples
vpj committed May 18, 2022
1 parent c5f0a48 commit 0548572
Showing 4 changed files with 120 additions and 8 deletions.
8 changes: 5 additions & 3 deletions src/neox/samples/fine_tune_biases.py
@@ -21,7 +21,7 @@

from labml import tracker, experiment, monit
from neox.data import get_training_data
-from neox.utils import load_layers, balance_layers
+from neox.utils import balance_layers, LayerGenerator
from neox.utils.training import train, get_trainable_params, train_biases_only

# List of layers to load. This is used for testing.
@@ -39,7 +39,9 @@ def main():
experiment.create(name='finetune_neox_biases', comment='Pipeline parallel', writers={'screen', 'web_api'})

# Load layers
-layers = load_layers(LAYERS)
+layers = list(LayerGenerator(is_clone_layers=True,
+                             filter_layers=LAYERS,
+                             ).load())

# Mark `requires_grad=True` for biases using a [helper function](../utils/training.html).
train_biases_only(layers)
@@ -67,7 +69,7 @@ def main():
sampler=RandomSampler(dataset, replacement=True))

# Initialize optimizer
-optimizer = optim.Adam(get_trainable_params(pipe_model), lr=1e-6)
+optimizer = optim.Adam(get_trainable_params(pipe_model), lr=3e-4)

# Train the model using the [helper function](../utils/training.html)
with experiment.start():
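For reference, a minimal sketch of what bias-only fine-tuning amounts to for a generic stack of PyTorch layers. The two helpers below are illustrative stand-ins, not the repository's actual train_biases_only / get_trainable_params implementations, and they assume bias parameters are named with a trailing 'bias', as they are for standard nn.Linear and nn.LayerNorm modules.

from torch import nn

def train_biases_only_sketch(layers):
    # Freeze everything, then re-enable gradients for bias parameters only.
    for layer in layers:
        for name, param in layer.named_parameters():
            param.requires_grad = name.endswith('bias')

def get_trainable_params_sketch(model: nn.Module):
    # Hand the optimizer only the parameters that will receive gradients,
    # so frozen weights carry no optimizer state.
    return [p for p in model.parameters() if p.requires_grad]

With helpers like these, optim.Adam(get_trainable_params_sketch(model), lr=3e-4) updates only the biases, which keeps optimizer memory small while still adapting the model.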
101 changes: 101 additions & 0 deletions src/neox/samples/generating_gpu.py
@@ -0,0 +1,101 @@
"""
---
title: Generate Text with GPT-NeoX by Evaluating Layer by Layer
summary: >
Generate Text with GPT-NeoX by evaluating layer by layer
---
# Generate Text with GPT-NeoX by Evaluating Layer by Layer
This shows how to generate text from GPT-NeoX with a single GPU.
This needs a GPU with more than 45GB of memory.
"""

# Imports
from typing import List

import torch
from torch import nn

from labml import monit
from neox.utils import get_tokens, print_tokens, LayerGenerator
from neox.utils.cache import get_cache

# List of layers to load. This is used for testing.
# You can assign a subset of layers like `{0, 1}` so that it only loads
# the first two transformer layers.
LAYERS = None

# Prompt to complete
PROMPT = 'Einstein was born in the German Empire, but moved to Switzerland in 1895, forsaking his German'


def infer(model: nn.Module, ids: List[int], device: torch.device):
    """
    ### Predict the next token

    :param model: is the model (the layers wrapped in an `nn.Sequential`)
    :param ids: are the input token ids
    :param device: is the device of the model
    """

    with torch.no_grad():
        # Get the tokens
        x = torch.tensor(ids)[None, :].to(device)
        # Eval model
        x = model(x)

    # Return predicted token
    return x[0].max(dim=-1)[1].tolist()


def generate():
    """
    ## Generate text
    """

    # Setup [cache](../utils/cache.html) to cache intermediate key/value pairs for faster generation
    cache = get_cache()
    cache.set('use_cache', True)

    # Device
    device = torch.device('cuda:0')

    # Load layers
    layers = list(LayerGenerator(is_clone_layers=True,
                                 filter_layers=LAYERS,
                                 dtype=torch.float16,
                                 device=device,
                                 ).load())

    model = nn.Sequential(*layers)

    # Get token ids
    ids = get_tokens(PROMPT)

    # Run the model
    cache.set('state_ids', (None, 1))
    with monit.section('Infer'):
        next_token = infer(model, ids, device)[-1]

    # Append the predicted token
    ids += [next_token]

    # Predict 100 tokens
    for i in range(1, 100):
        # Set the state to use cached activations
        cache.set('state_ids', (i, i + 1))
        # Get next token. Note that we only feed the last token to the model because
        # we cache the key/value pairs of previous tokens.
        with monit.section('Infer'):
            next_token = infer(model, [next_token], device)[-1]
        # Append the predicted token
        ids += [next_token]
        # Print
        print_tokens(ids, [ids])


#
if __name__ == '__main__':
    generate()
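The loop above feeds only the newest token back into the model because the layers cache the key/value pairs of earlier tokens (controlled here through cache.set('state_ids', ...)). The real cache lives in neox/utils/cache.py and is not shown in this diff; the following is only a self-contained illustration of the general pattern an attention layer uses to reuse cached keys and values:

import torch
from torch import nn

class CachedSelfAttention(nn.Module):
    # Illustrative single-head attention that appends new keys/values to a cache.
    def __init__(self, d_model: int):
        super().__init__()
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.k_cache = None
        self.v_cache = None

    def forward(self, x: torch.Tensor):
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        if self.k_cache is not None:
            # Reuse keys/values computed for earlier tokens, so only the new
            # token's projections need to be computed on this step.
            k = torch.cat([self.k_cache, k], dim=1)
            v = torch.cat([self.v_cache, v], dim=1)
        self.k_cache, self.v_cache = k, v
        attn = torch.softmax(q @ k.transpose(-2, -1) / k.shape[-1] ** 0.5, dim=-1)
        return attn @ v

This is why, after the first full pass over the prompt, the script can call infer(model, [next_token], device) with a single token id.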
6 changes: 4 additions & 2 deletions src/neox/samples/generating_pipe.py
@@ -18,7 +18,7 @@
from torch import nn

from labml import monit
-from neox.utils import load_layers, get_tokens, print_tokens, balance_layers
+from neox.utils import load_layers, get_tokens, print_tokens, balance_layers, LayerGenerator
from neox.utils.cache import get_cache

# List of layers to load. This is used for testing.
@@ -58,7 +58,9 @@ def generate():
cache.set('use_cache', True)

# Load layers
-layers = load_layers(LAYERS)
+layers = list(LayerGenerator(is_clone_layers=True,
+                             filter_layers=LAYERS,
+                             ).load())

# Create pipeline parallel model
with monit.section('Pipe'):
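The pipeline-parallel model itself is built inside monit.section('Pipe'), which this diff does not show. As a rough sketch of the idea, assuming fairscale's Pipe is used to spread the layers over several GPUs (build_pipe_sketch below is an illustrative helper, not the repository's balance_layers or pipeline setup):

import torch
from torch import nn
from fairscale.nn import Pipe  # assumption: fairscale provides the pipeline wrapper

def build_pipe_sketch(layers, n_gpus: int, chunks: int = 8):
    # Split the layer list as evenly as possible across the GPUs.
    n = len(layers)
    balance = [n // n_gpus + (1 if i < n % n_gpus else 0) for i in range(n_gpus)]
    devices = [torch.device(f'cuda:{i}') for i in range(n_gpus)]
    # Pipe places each partition on its device and splits the batch into micro-batches.
    return Pipe(nn.Sequential(*layers), balance=balance, devices=devices, chunks=chunks)

Each GPU then holds only its share of the layers, so the full model does not have to fit on a single device.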
13 changes: 10 additions & 3 deletions (fourth changed file)
@@ -7,7 +7,11 @@
# Generate Text with GPT-NeoX by Evaluating Layer by Layer
-This shows how to generate text from GPT-NeoX with a single GPU.
+This shows how to generate text from GPT-NeoX with a small GPU.
+It will first load all layers into memory and then move them layer by layer to the GPU for inference.
+This requires enough memory on the computer to load the entire model.
+Even a small GPU is enough, since we load only a single layer at a time onto the GPU.
"""

# Imports
@@ -17,7 +21,7 @@
from torch import nn

from labml import monit
-from neox.utils import load_layers, get_tokens, print_tokens
+from neox.utils import get_tokens, print_tokens, LayerGenerator
from neox.utils.cache import get_cache

# List of layers to load. This is used for testing.
@@ -70,7 +74,10 @@ def generate():
cache.set('use_cache', True)

# Load layers
-layers = load_layers(LAYERS)
+layers = list(LayerGenerator(is_clone_layers=True,
+                             filter_layers=LAYERS,
+                             dtype=torch.float16,
+                             ).load())

# Device
device = torch.device('cuda:0')
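This sample keeps the full model in CPU memory and only moves one layer at a time onto the GPU. A minimal sketch of that evaluation strategy (illustrative only; the sample's actual implementation is not fully shown in this diff):

import torch

def infer_layer_by_layer(layers, x: torch.Tensor, device: torch.device):
    # Keep the layers on the CPU; move each one to the GPU only while it runs.
    with torch.no_grad():
        x = x.to(device)
        for layer in layers:
            layer.to(device)   # load this layer onto the small GPU
            x = layer(x)       # run it
            layer.to('cpu')    # move it back to free GPU memory for the next layer
    return x

The trade-off is speed: every forward pass pays the cost of copying each layer's weights to the GPU, but peak GPU memory stays at roughly the size of a single layer plus activations.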
