Skip to content

Commit

Permalink
add prefix space
Browse files: browse the repository at this point in the history
  • Loading branch information
AlexTMallen committed Oct 27, 2023
1 parent 70a3290 commit 3bc99b2
Showing 1 changed file with 11 additions and 2 deletions.
13 changes: 11 additions & 2 deletions elk/extraction/extraction.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -199,12 +199,21 @@ def tokenize_dataset(
assert len(answer_choices) == 2
answer_ids = []
for choice in answer_choices:
a_id = tokenizer.encode(choice, add_special_tokens=False)
a_id = tokenizer.encode(" " + choice, add_special_tokens=False)

# the Llama tokenizer splits off leading spaces
if tokenizer.decode(a_id[0]).strip() == "":
a_id_without_space = tokenizer.encode(
choice, add_special_tokens=False
)
assert a_id_without_space == a_id[1:]
a_id = a_id_without_space

if len(a_id) > 1:
print(
f"WARNING: answer choice '{choice}' is more than one "
"token, LM probabilities will be calculated using the "
"first token only."
f"first token only ({tokenizer.decode(a_id[0])})"
)
answer_ids.append(a_id[0])
else:
Expand Down

0 comments on commit 3bc99b2

Please sign in to comment.