Updates for OPT
zphang committed Jul 23, 2022
1 parent 0709327 commit 7c25eed
Showing 3 changed files with 25 additions and 0 deletions.
7 changes: 7 additions & 0 deletions configs/neox_arguments.md
@@ -308,6 +308,13 @@ Model Arguments
    T5 relative positional encoding max distance, default 128.


- **opt_pos_emb_offset**: int

    Default = 0

    Learned position embedding offset (only used by OPT, where it should be set to 2).



- **no_weight_tying**: bool

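In practice, the new argument pairs with learned absolute position embeddings. Below is a minimal sketch of the relevant settings, written as a Python dict for brevity: the `pos_emb` key comes from the same `NeoXArgsModel` class but is not part of this commit, and the surrounding config format is an assumption rather than something taken from the diff.

```python
# Illustrative only: the keys mirror NeoXArgsModel fields; a real GPT-NeoX run
# would set them in its YAML/JSON config file rather than in a Python dict.
opt_style_settings = {
    "pos_emb": "learned",       # assumed key: OPT uses learned absolute position embeddings
    "opt_pos_emb_offset": 2,    # shift position ids by 2, as OPT expects
}
```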
13 changes: 13 additions & 0 deletions megatron/model/word_embeddings.py
@@ -91,6 +91,10 @@ def __init__(

        # Embeddings dropout
        self.embedding_dropout = torch.nn.Dropout(embedding_dropout_prob)
        self.opt_pos_emb_offset = neox_args.opt_pos_emb_offset

        # For ticking position ids forward
        self.layer_past = None

    def add_tokentype_embeddings(self, num_tokentypes):
        """Add token-type embedding. This function is provided so we can add
@@ -114,6 +118,15 @@ def forward(self, input_ids, position_ids, tokentype_ids=None):
        # Embeddings.
        words_embeddings = self.word_embeddings(input_ids)
        if self.use_pos_emb and self.embedding_type in ["learned", "sinusoidal"]:
            if self.layer_past is not None:
                position_ids = position_ids + self.layer_past + 1

            self.layer_past = position_ids[:, -1]

            # OPT always offsets position ids by 2, matching the HF implementation
            if self.opt_pos_emb_offset:
                position_ids = position_ids + self.opt_pos_emb_offset
            position_embeddings = self.position_embeddings(position_ids)
            embeddings = words_embeddings + position_embeddings
        else:
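To make the new logic concrete, here is a self-contained sketch of an OPT-style learned position embedding in the spirit of the HF implementation the comment refers to. The class and variable names below are illustrative, not part of this commit; in the NeoX change itself, the offset and the `layer_past` ticking are applied to `position_ids` just before the existing `self.position_embeddings` lookup rather than inside a separate module.

```python
import torch
import torch.nn as nn


class OPTStyleLearnedPositions(nn.Module):
    """Illustrative sketch (not the NeoX module): learned position embeddings
    whose lookups are shifted by a fixed offset, with position ids ticked
    forward across incremental decoding steps."""

    def __init__(self, max_positions: int, hidden_size: int, offset: int = 2):
        super().__init__()
        self.offset = offset
        # The table needs `offset` extra rows because every lookup is shifted.
        self.emb = nn.Embedding(max_positions + offset, hidden_size)
        self.layer_past = None  # last position seen, for incremental decoding

    def forward(self, position_ids: torch.Tensor) -> torch.Tensor:
        # Tick positions forward when generating token by token.
        if self.layer_past is not None:
            position_ids = position_ids + self.layer_past + 1
        self.layer_past = position_ids[:, -1]
        # OPT-style offset: positions 0..n index rows offset..offset+n.
        return self.emb(position_ids + self.offset)


pos = OPTStyleLearnedPositions(max_positions=2048, hidden_size=8, offset=2)
prompt = pos(torch.arange(4).unsqueeze(0))       # positions 0..3 -> rows 2..5
step = pos(torch.zeros(1, 1, dtype=torch.long))  # ticked to position 4 -> row 6
print(prompt.shape, step.shape)  # torch.Size([1, 4, 8]) torch.Size([1, 1, 8])
```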
5 changes: 5 additions & 0 deletions megatron/neox_arguments/neox_args.py
@@ -141,6 +141,11 @@ class NeoXArgsModel(NeoXArgsTemplate):
    T5 relative positional encoding max distance, default 128.
    """

    opt_pos_emb_offset: int = 0
    """
    Learned position embedding offset (only used by OPT, where it should be set to 2).
    """

    no_weight_tying: bool = False
    """
    Disables weight tying between embedding weights and final Linear layer
