Deal with _encode_pair() / Llama token 29871 / SPIECE_UNDERLINE better #1322

Status: Draft, wants to merge 32 commits into base: main
Changes shown from 1 of 32 commits

Commits:
3824828
first stab at wrap_chat_template
daniel-furman Jan 7, 2024
a784417
first stab at wrap_chat_template, strip error fix
daniel-furman Jan 7, 2024
53c68db
first stab at wrap_chat_template, rfind continuation fix
daniel-furman Jan 7, 2024
3e27f9d
first stab at wrap_chat_template, formatting in function
daniel-furman Jan 7, 2024
87dff8b
first stab at wrap_chat_template, print statements in loglikelihood f…
daniel-furman Jan 7, 2024
5c4d9c7
first stab at wrap_chat_template, remove system for now
daniel-furman Jan 7, 2024
e689727
first stab at wrap_chat_template, remove special chars from continuation
daniel-furman Jan 10, 2024
337c084
first stab at wrap_chat_template, remove special chars tab indenting …
daniel-furman Jan 10, 2024
6c68fd1
Merge branch 'EleutherAI:main' into main
daniel-furman Jan 10, 2024
34b32f7
first stab at wrap_chat_template, various
daniel-furman Jan 10, 2024
59e3b17
first stab at wrap_chat_template, various
daniel-furman Jan 10, 2024
7191904
first stab at wrap_chat_template, arc conversation test
daniel-furman Jan 10, 2024
9949e4f
first stab at wrap_chat_template, arc conversation test
daniel-furman Jan 10, 2024
2d3c835
first stab at wrap_chat_template, remove arc experiment
daniel-furman Jan 10, 2024
49f43f9
first stab at wrap_chat_template, various
daniel-furman Jan 10, 2024
021232b
llama test
daniel-furman Jan 11, 2024
b6c75ed
llama test
daniel-furman Jan 11, 2024
047dde8
llama test
daniel-furman Jan 11, 2024
c38b9d2
llama test
daniel-furman Jan 11, 2024
1ea8470
llama test
daniel-furman Jan 11, 2024
2e27053
llama test
daniel-furman Jan 11, 2024
43dee06
llama test
daniel-furman Jan 13, 2024
39a11d0
llama test
daniel-furman Jan 13, 2024
bbcdffb
remove system
daniel-furman Jan 13, 2024
2b40017
Merge branch 'main' into add-chat-templating
haileyschoelkopf Jan 15, 2024
c47de8b
update Instance.args setter
haileyschoelkopf Jan 15, 2024
6ca8ab1
clean up wrap_chat_template + add TODOs
haileyschoelkopf Jan 15, 2024
b8bda47
Merge branch 'main' into add-chat-templating
haileyschoelkopf Jan 15, 2024
68c30aa
push most recent code
haileyschoelkopf Jan 16, 2024
d03c9fd
add the hack (works for Mistral/Llama, destroys performance for GPT2)
haileyschoelkopf Jan 19, 2024
42d54f8
add the hack (works for Mistral/Llama, destroys performance for GPT2)
haileyschoelkopf Jan 19, 2024
787c99e
Merge branch 'fix-len0-continuations' of https://github.com/EleutherA…
haileyschoelkopf Jan 19, 2024
add the hack (works for Mistral/Llama, destroys performance for GPT2)
haileyschoelkopf committed Jan 19, 2024
commit d03c9fdeec7cce9fa95cc3048211e0f35d3b7f1f
20 changes: 20 additions & 0 deletions lm_eval/models/huggingface.py
@@ -792,6 +792,26 @@ def _encode_pair(
# context_enc = self.tok_encode(context, add_special_tokens=False)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]

# quite the hack, but what this does:
# circumvents the addition of an extraneous sentencepiece underline token
# that is produced when passing " <word>" into the Llama / Mistral tokenizer.
# if we instead pass "<word>" in, we don't get this extra token (29871 for Llama),
# which would hurt performance if included.
if (
len(continuation.lstrip()) + 1 == len(continuation)
and continuation.startswith(" ")
) or (len(continuation_enc) == 0):
context_enc_2 = context_enc
continuation_enc_2 = self.tok_encode(
continuation[1:], add_special_tokens=False
)

# assert context_enc == context_enc_2
# assert continuation_enc == continuation_enc_2, f"{continuation_enc},{continuation_enc_2}"

return context_enc_2, continuation_enc_2

return context_enc, continuation_enc

def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
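The branch taken in the hunk above hinges on one condition: the continuation starts with exactly one leading space, or it tokenized to nothing. The sketch below is a minimal, self-contained reimplementation of that logic for illustration; the function name `encode_pair_with_space_hack` and the injectable `encode` callable are hypothetical stand-ins (the real code calls `self.tok_encode`, and a real Llama/Mistral tokenizer is not assumed to be available here):

```python
from typing import Callable, List, Tuple


def encode_pair_with_space_hack(
    context: str,
    continuation: str,
    context_enc: List[int],
    continuation_enc: List[int],
    encode: Callable[[str], List[int]],
) -> Tuple[List[int], List[int]]:
    """Mirror of the PR's hack: if the continuation begins with exactly one
    space, or its encoding came out empty, re-encode it with the first
    character dropped to avoid an extraneous SPIECE_UNDERLINE token
    (id 29871 for Llama)."""
    has_single_leading_space = (
        continuation.startswith(" ")
        and len(continuation.lstrip()) + 1 == len(continuation)
    )
    if has_single_leading_space or len(continuation_enc) == 0:
        # Re-encode without the leading character, as in the original hunk.
        return context_enc, encode(continuation[1:])
    return context_enc, continuation_enc
```

Note that a continuation with two or more leading spaces does not trigger the branch (`len(continuation.lstrip()) + 1` no longer equals `len(continuation)`), so only the single-space case that produces the stray underline token is rewritten.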