from transformers import AutoTokenizer
pretrained_model_name_or_path = "Yi-34B-Chat"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)  # the fast tokenizer is loaded by default
print(tokenizer)
loading file tokenizer.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
LlamaTokenizerFast(name_or_path='Yi-34B-Chat', vocab_size=64000, model_max_length=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={
'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True),
'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True),
'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True),
'pad_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True),
'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|im_sep|>']
}, clean_up_tokenization_spaces=False)
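The vocabulary dump below can be reproduced by mapping each ID back to its token string with the standard convert_ids_to_tokens API; the range of 200 simply matches what is listed here:

# Print the first 200 vocabulary entries, one "id token" pair per line.
for token_id in range(200):
    print(token_id, tokenizer.convert_ids_to_tokens(token_id))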
Vocabulary (IDs 0–199):
0 <unk>
1 <s>
2 </s>
3 <|Human|>
4 <|Assistant|>
5 <|System|>
6 <|im_start|>
7 <|im_end|>
8 <|im_sep|>
9 <|reserved003|>
10 <|reserved004|>
11 <|reserved005|>
12 <|reserved006|>
13 <|reserved007|>
14 <fim_prefix>
15 <fim_middle>
16 <fim_suffix>
17 <fim_pad>
18 <filename>
19 <gh_stars>
20 <issue_start>
21 <issue_comment>
22 <issue_closed>
23 <jupyter_start>
24 <jupyter_text>
25 <jupyter_code>
26 <jupyter_output>
27 <empty_output>
28 <commit_before>
29 <commit_msg>
30 <commit_after>
31 <reponame>
32 <h1>
33 <h1/>
34 </h1>
35 <h2>
36 <h2/>
37 </h2>
38 <h3>
39 <h3/>
40 </h3>
41 <h4>
42 <h4/>
43 </h4>
44 <h5>
45 <h5/>
46 </h5>
47 <br>
48 <br/>
49 </br>
50 <strong>
51 <strong/>
52 </strong>
53 <p>
54 <p/>
55 </p>
56 <table>
57 <table/>
58 </table>
59 <li>
60 <li/>
61 </li>
62 <tr>
63 <tr/>
64 </tr>
65 <tbody>
66 <tbody/>
67 </tbody>
68 <img>
69 <img/>
70 </img>
71 <b>
72 <b/>
73 </b>
74 <td>
75 <td/>
76 </td>
77 0
78 1
79 2
80 3
81 4
82 5
83 6
84 7
85 8
86 9
87 ０
88 １
89 ２
90 ３
91 ４
92 ５
93 ６
94 ７
95 ８
96 ９
97 ,
98 .
99 !
100 ?
101 ，
102 。
103 ！
104 ？
105 、
106 :
107 ¥
108 《
109 》
110 【
111 】
112 『
113 』
114 ```
115 <!--
116 -->
117 ---
118 <!DOCTYPE>
119 – 144   (these 26 tokens do not render as visible text here; most likely whitespace tokens)
145 <|unused000|>
146 <|unused001|>
147 <|unused002|>
148 <|unused003|>
149 <|unused004|>
150 <|unused005|>
151 <|unused006|>
152 <|unused007|>
153 <|unused008|>
154 <|unused009|>
155 <|unused010|>
156 <|unused011|>
157 <|unused012|>
158 <|unused013|>
159 <|unused014|>
160 <|unused015|>
161 <|unused016|>
162 <|unused017|>
163 <|unused018|>
164 <|unused019|>
165 <|unused020|>
166 <|unused021|>
167 <|unused022|>
168 <|unused023|>
169 <|unused024|>
170 <|unused025|>
171 <|unused026|>
172 <|unused027|>
173 <|unused028|>
174 <|unused029|>
175 <|unused030|>
176 <|unused031|>
177 <|unused032|>
178 <|unused033|>
179 <|unused034|>
180 <|unused035|>
181 <|unused036|>
182 <|unused037|>
183 <|unused038|>
184 <|unused039|>
185 <|unused040|>
186 <|unused041|>
187 <|unused042|>
188 <|unused043|>
189 <|unused044|>
190 <|unused045|>
191 <|unused046|>
192 <|unused047|>
193 <|unused048|>
194 <|unused049|>
195 <|unused050|>
196 <|unused051|>
197 <|unused052|>
198 <|unused053|>
199 <|unused054|>
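The additional_special_tokens (<|im_start|>, <|im_end|>, <|im_sep|>) show that Yi-34B-Chat speaks the ChatML dialog format. As a minimal sketch, assuming the checkpoint ships a ChatML chat template in its tokenizer_config.json (Yi-34B-Chat does), the standard apply_chat_template API renders a conversation with these tokens:

# Render a one-turn conversation with the tokenizer's built-in chat template.
messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
# Expected ChatML-style output:
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant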