# generate.py
import torch
import torch.nn.functional as F


def apply_temperature(scores, temperature):
    # Scale logits by the sampling temperature: values below 1 sharpen the
    # distribution, values above 1 flatten it. A non-positive temperature
    # leaves the scores unchanged.
    if temperature > 0:
        scores = scores / temperature
    return scores

def apply_top_p(scores, top_p, filter_value=-float("Inf"), min_tokens_to_keep=1):
    # Nucleus (top-p) filtering: keep the smallest set of tokens whose
    # cumulative probability exceeds top_p; mask everything else.
    if 0 < top_p < 1:
        sorted_logits, sorted_indices = torch.sort(scores, descending=False)
        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
        # With an ascending sort, tokens whose cumulative probability is at
        # most (1 - top_p) lie outside the nucleus and are removed.
        sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
        if min_tokens_to_keep > 1:
            # Always keep at least min_tokens_to_keep tokens.
            sorted_indices_to_remove[..., -min_tokens_to_keep:] = 0
        # Scatter the mask back to the original (unsorted) vocabulary order.
        indices_to_remove = sorted_indices_to_remove.scatter(
            1, sorted_indices, sorted_indices_to_remove
        )
        scores = scores.masked_fill(indices_to_remove, filter_value)
    return scores

def apply_top_k(logits, top_k):
    top_k = min(top_k, logits.size(-1))  # Safety check: k cannot exceed vocab size.
    if top_k > 0:
        # Mask every token whose logit is below the k-th largest logit.
        indices_to_remove = logits < torch.topk(logits.float(), top_k)[0][..., -1, None]
        logits[indices_to_remove] = -float("Inf")
    return logits
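
# Illustration (not in the original file; values computed by hand on toy
# logits [[2.0, 1.0, 0.5, 0.1, -1.0]]): apply_top_k(..., top_k=3) masks the
# two smallest entries to -inf, while apply_top_p(..., top_p=0.9) masks only
# the -1.0 entry, because the other four tokens are all needed to cover 90%
# of the probability mass.
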
def apply_advanced_repetition_penalty(
    input_ids, scores, penalty_range, penalty_slope, penalty
):
    # Penalise tokens that already occur in the last penalty_range positions
    # of the context. With a non-zero slope, the penalty ramps up from ~1 at
    # the oldest position in the window to the full value at the newest.
    penalty_range = int(penalty_range)
    clipped_penalty_range = min(input_ids.shape[-1], penalty_range)
    if penalty != 1.0:
        if penalty_range > 0:
            if clipped_penalty_range < input_ids.shape[1]:
                input_ids = input_ids[..., -clipped_penalty_range:]
            if penalty_slope != 0:
                # Map window positions to [-1, 1], pass them through a
                # sigmoid-like curve controlled by penalty_slope, then
                # rescale the result to the interval [1, penalty].
                _penalty = (
                    torch.arange(
                        penalty_range, dtype=scores.dtype, device=scores.device
                    )
                    / (penalty_range - 1)
                ) * 2.0 - 1
                _penalty = (penalty_slope * _penalty) / (
                    1 + torch.abs(_penalty) * (penalty_slope - 1)
                )
                _penalty = 1 + ((_penalty + 1) / 2).unsqueeze(0) * (penalty - 1)
                penalty = _penalty[..., -clipped_penalty_range:]
        # Multiply negative scores and divide positive ones, so the penalty
        # always pushes repeated tokens towards lower probability.
        score = torch.gather(scores, 1, input_ids)
        score = torch.where(score <= 0, score * penalty, score / penalty)
        scores.scatter_(1, input_ids, score)
    return scores
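
# Illustration (not in the original file): with penalty=1.2 and
# penalty_slope=3.0, the per-position multiplier runs from 1.0 for the oldest
# token in the window up to 1.2 for the most recent one, since the curve above
# maps the window endpoints -1 and 1 to exactly 1 and penalty respectively.
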
class LmGeneration:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate(self, args, prompts, cut_off=None, cut_off_times=1):
        if cut_off is not None:
            # Give every prompt its own countdown of allowed cut_off hits.
            cut_off_times = [cut_off_times for _ in range(len(prompts))]
        batch = len(prompts)
        assert batch <= args.batch_size
        prompt_tokens = [
            self.tokenizer.encode(x, bos=True, eos=False) for x in prompts
        ]
        min_prompt_len = min(len(x) for x in prompt_tokens)
        total_len = args.seq_length
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Right-pad every prompt with pad_id up to the full sequence length.
        tokens = torch.full((batch, total_len), self.tokenizer.pad_id).to(device).long()
        for idx, t in enumerate(prompt_tokens):
            tokens[idx, : len(t)] = torch.tensor(t).long()
        # mask marks the positions that hold real prompt tokens.
        mask = tokens != self.tokenizer.pad_id
        start_pos = min_prompt_len
        prev_pos = 0
        continue_examples = list(range(batch))
        with torch.no_grad():
            for cur_pos in range(start_pos, total_len):
                logits = self.model.forward(
                    tokens[continue_examples, prev_pos:cur_pos],
                    prev_pos,
                    continue_examples,
                ).float()
                # Filter and re-weight the next-token distribution.
                next_token_scores = apply_top_k(logits, top_k=args.top_k)
                next_token_scores = apply_top_p(next_token_scores, args.top_p)
                next_token_scores = apply_temperature(next_token_scores, args.temperature)
                next_token_scores = apply_advanced_repetition_penalty(
                    tokens[continue_examples, :cur_pos],
                    next_token_scores,
                    args.repetition_penalty_range,
                    args.repetition_penalty_slope,
                    args.repetition_penalty,
                )
                scores = F.softmax(next_token_scores, dim=-1)
                next_token = torch.multinomial(scores, num_samples=1).squeeze(1)
                next_token = next_token.reshape(-1)
                # Prompts have different lengths: where a position still holds
                # a real prompt token, keep it instead of the sampled token.
                next_token = torch.where(
                    mask[continue_examples, cur_pos],
                    tokens[continue_examples, cur_pos],
                    next_token,
                )
                tokens[continue_examples, cur_pos] = next_token
                prev_pos = cur_pos
                # Drop finished examples: those that produced eos, or that hit
                # the cut_off string the allowed number of times.
                continue_examples = []
                for i, t in enumerate(tokens.tolist()):
                    try:
                        t.index(self.tokenizer.eos_id)
                    except ValueError:
                        if cut_off is not None:
                            if cut_off == self.tokenizer.decode(t[: cur_pos + 1])[-len(cut_off):]:
                                if cut_off_times[i] == 1:
                                    continue
                                else:
                                    cut_off_times[i] -= 1
                        continue_examples.append(i)
                if len(continue_examples) == 0:
                    break
        # Decode every example, truncating at the first pad or eos token.
        decoded = []
        for t in tokens.tolist():
            t = t[: args.seq_length]
            try:
                t = t[: t.index(self.tokenizer.pad_id)]
            except ValueError:
                pass
            try:
                t = t[: t.index(self.tokenizer.eos_id)]
            except ValueError:
                pass
            decoded.append(self.tokenizer.decode(t))
        return decoded
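
# Usage sketch (not part of the original file; `model` and `tokenizer` must be
# provided by the surrounding project, and the flag names below are simply the
# attributes that `generate` reads from `args`; the values are illustrative):
#
#     from argparse import Namespace
#
#     args = Namespace(
#         batch_size=2, seq_length=512,
#         top_k=40, top_p=0.95, temperature=0.8,
#         repetition_penalty=1.2, repetition_penalty_range=1024,
#         repetition_penalty_slope=0.7,
#     )
#     lm = LmGeneration(model, tokenizer)
#     outputs = lm.generate(args, ["Once upon a time"], cut_off="\n")
#     print(outputs[0])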