Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix how unknown tokens are reported in gpt_tokenize #801

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
18 changes: 1 addition & 17 deletions examples/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,38 +320,22 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri

// find the longest token that forms each word in words:
std::vector<gpt_vocab::id> tokens;
// unknown token
std::vector<char> unknown;
unknown.clear();
for (const auto & word : words) {
for (int i = 0; i < (int) word.size(); ){
for (int j = word.size() - 1; j >= i; j--){
auto cand = word.substr(i, j-i+1);
auto it = vocab.token_to_id.find(cand);
if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
if (!unknown.empty()){
unknown.push_back(0); // terminator
std::string unkstr(unknown.begin(), unknown.end());
fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
unknown.clear();
}
tokens.push_back(it->second);
i = j + 1;
break;
}
else if (j == i){ // word.substr(i, 1) has no matching
auto unk = word.substr(i, 1).data();
unknown.push_back(*unk);
fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
i++;
}
}
}
if (!unknown.empty()){
unknown.push_back(0); // terminator
std::string unkstr(unknown.begin(), unknown.end());
fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
unknown.clear();
}
}

return tokens;
Expand Down