base_ngram.py
import os
os.environ['TRANSFORMERS_CACHE'] = '/home/UserData/huggingface_cache'
os.environ['http_proxy'] = 'http://192.168.3.206:20171/'
os.environ['https_proxy'] = 'http://192.168.3.206:20171/'
os.environ['HTTP_PROXY'] = 'http://192.168.3.206:20171/'
os.environ['HTTPS_PROXY'] = 'http://192.168.3.206:20171/'
import pickle
from collections import defaultdict, deque
import random
import argparse
import torch
from modeling_llama import LlamaForCausalLM
from modeling_opt import OPTForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM
from modeling_chatglm import ChatGLMForConditionalGeneration
from decoding_ml_ngram import clip_input, infer_input_ids
from rouge_score import rouge_scorer
import numpy as np
import transformers
from accelerate import infer_auto_device_map, init_empty_weights
import deepspeed
from datasets import load_dataset

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}
instruction = "Summarize the following articles."
class NGramModel:
    """A word-level n-gram model: predicts the next word from the last n-1 words."""

    def __init__(self, text, n=2):
        # model[context_tuple][next_word] -> occurrence count
        self.model = defaultdict(lambda: defaultdict(int))
        self.total_counts = defaultdict(int)
        self.prev_words = deque(maxlen=n - 1)
        self.n = n
        self._process_text(text)
        self.default_token_id = "1"

    def _process_text(self, text):
        tokens = text.split()
        # Initialize the deque with the beginning of the text
        self.prev_words.extend(tokens[:self.n - 1])
        for i in range(self.n - 1, len(tokens)):
            self.update(tokens[i])

    def update(self, new_word):
        prev_words_tuple = tuple(self.prev_words)
        if prev_words_tuple:
            self.model[prev_words_tuple][new_word] += 1
            self.total_counts[prev_words_tuple] += 1
        self.prev_words.append(new_word)
    def forward(self, current_words, method='greedy', topk=1, seed=None):
        if seed is not None:
            random.seed(seed)
        # Only consider the last (n-1) words as the context
        current_words_tuple = tuple(current_words[-(self.n - 1):])
        # Handle the case when the context is not in the model
        if current_words_tuple not in self.model:
            return [self.default_token_id] * topk
        next_word_candidates = list(self.model[current_words_tuple].keys())
        next_word_probabilities = list(self.model[current_words_tuple].values())
        # Handle the case when there are no candidates
        if not next_word_candidates:
            return [self.default_token_id] * topk
        if method == 'greedy':
            # Get indices of the top-k counts, reversed so that next_words[0]
            # is the most frequent candidate
            topk_indices = np.argsort(next_word_probabilities)[-topk:][::-1]
            next_words = [next_word_candidates[i] for i in topk_indices]
            """
            np.argsort orders equal values by their position in the original
            list, so among words with the same count, the one that appears
            earlier is selected first. By contrast,
            list.index(max(next_word_probabilities)) returns only the index of
            the first maximum, i.e. the earliest word with the highest count.
            """
        elif method == 'random':
            next_words = random.choices(
                next_word_candidates,
                weights=next_word_probabilities,
                k=topk)
"""
# 将概率列表转换成PyTorch Tensor
probabilities_tensor = torch.tensor(next_word_probabilities, dtype=torch.float)
# 使用torch.multinomial进行采样
# 第一个参数是概率权重,第二个参数是采样的数量
# replacement=True表示可以取相同的元素多次
indices = torch.multinomial(probabilities_tensor, topk, replacement=True)
# 根据indices获取对应的词
next_words = [next_word_candidates[i] for i in indices]
# 假设 next_word_probabilities 是一个与候选词对应的概率列表
# 并且它们的和为1
# topk 是你想要采样的次数
# 将概率列表转换成NumPy数组,并确保其和为1
probabilities = np.array(next_word_probabilities)
probabilities /= probabilities.sum()
# 使用numpy.random.multinomial进行采样
# 第一个参数是每次试验中每个事件发生的次数,这里我们每次只抽取一次样本
# 第二个参数是概率列表
# 第三个参数是试验的次数
samples = np.random.multinomial(1, probabilities, size=topk)
# 根据samples获取对应的词
# samples 每一行代表一次试验的结果,非零索引即为抽取的词的索引
next_words_indices = samples.nonzero()[1]
next_words = [next_word_candidates[i] for i in next_words_indices]
"""
        else:
            raise ValueError("Invalid method. Choose 'greedy' or 'random'.")
        # Pad with the default token so the caller always receives topk words
        if len(next_words) < topk:
            next_words.extend([self.default_token_id] * (topk - len(next_words)))
        return next_words
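# Usage sketch (illustrative, not part of the original script; the toy corpus
# below is hypothetical):
#
#   ngram = NGramModel("the cat sat on the mat the cat ran", n=2)
#   ngram.forward(["the"], method='greedy', topk=2)          # ['cat', 'mat']: 'cat' follows 'the' twice
#   ngram.forward(["the"], method='random', topk=1, seed=0)  # weighted draw from {'cat': 2, 'mat': 1}
#   ngram.forward(["unseen"], topk=1)                        # ['1']: the default_token_id fallback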
class MultiLevelNgram:
    """Backoff over n-gram orders from max_n down to 2."""

    def __init__(self, text, max_n=3):
        # Build each model empty and feed the text exactly once through
        # _process_text; passing the full text to every constructor as well
        # would count each n-gram twice and add spurious end-of-text ->
        # start-of-text transitions.
        self.models = [NGramModel("", n=i) for i in range(2, max_n + 1)]
        self.max_n = max_n
        self._process_text(text)
    def _process_text(self, text):
        tokens = text.split()
        for word in tokens:
            self.update(word)

    def update(self, new_word):
        for model in self.models:
            model.update(new_word)

    def forward(self, current_words, method='greedy', topk=1, seed=None):
        for model in reversed(self.models):  # Start from the highest-order n-gram
            next_words = model.forward(current_words, method=method, topk=topk, seed=seed)
            if next_words[0] != model.default_token_id:
                return next_words
        # If no model returns a valid word, return the default token
        return [self.models[0].default_token_id] * topk
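# Usage sketch (illustrative, not part of the original script; the corpus is
# hypothetical): with max_n=3 the backoff tries the trigram model first and
# falls back to the bigram model when the two-word context is unseen.
#
#   ml = MultiLevelNgram("a b c a b d a b c", max_n=3)
#   ml.forward(["a", "b"], topk=1)  # trigram context ('a', 'b') is known -> ['c']
#   ml.forward(["x", "b"], topk=1)  # ('x', 'b') unseen as a trigram context,
#                                   # but the bigram model knows ('b',) -> ['c']
#   ml.forward(["x", "y"], topk=1)  # nothing matches -> ['1'] (default token)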