Count the characters in a document and tally how often each one occurs; this is the basis for converting text into a numeric representation.
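To see the idea in isolation before the full class, here is a minimal sketch on a hypothetical toy string; the real converter below adds frequency counting, truncation to max_vocab, and an <unk> fallback.

# Minimal character <-> id round trip on toy data
toy = '床前明月光'
stoi = {ch: i for i, ch in enumerate(sorted(set(toy)))}    # char -> id
itos = {i: ch for ch, i in stoi.items()}                   # id -> char
encoded = [stoi[ch] for ch in toy]                         # five distinct chars -> ids 0..4
print(encoded)
print(''.join(itos[i] for i in encoded))                   # round-trips back to the text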
import numpy as np
import re
import torch
class TextConverter(object):
    def __init__(self, text_path, max_vocab=5000):
        """
        Build a character <-> index converter; the core job is constructing a vocabulary.
        :param text_path: path to the text file
        :param max_vocab: maximum vocabulary size
        """
        with open(text_path, 'r', encoding='utf-8') as f:
            text_file = f.readlines()
        # print('peek at the data', text_file[:100])
        # Strip newlines, spaces and ideographic spaces, then turn the Chinese
        # comma and full stop into plain spaces
        text_file = [re.sub(r'\n', '', _) for _ in text_file]
        text_file = [re.sub(r' ', '', _) for _ in text_file]
        text_file = [re.sub(r'\u3000', '', _) for _ in text_file]
        text_file = [_.replace('\r', ' ').replace(',', ' ').replace('。', ' ') for _ in text_file]
        # Match Chinese characters only. Note: test_file is never used below,
        # so this filter is effectively dead code kept from an earlier draft.
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        test_file = [pattern.findall(_) for _ in text_file]
        # Flatten the lines into a list of single characters
        word_list = [v for s in text_file for v in s]
        # print('{} characters in total'.format(len(word_list)))
        # Vocabulary: the set of distinct characters
        vocab = set(word_list)
        # Count how often each character occurs; if the vocabulary exceeds the
        # size limit, the least frequent characters are dropped
        vocab_count = {}
        for word in vocab:
            vocab_count[word] = 0
        for word in word_list:
            vocab_count[word] += 1
        # Convert the dict to a list and sort by frequency, descending
        vocab_list = [[key, value] for key, value in vocab_count.items()]
        vocab_list.sort(key=lambda x: x[1], reverse=True)
        # Truncate if larger than the maximum vocabulary size
        if len(vocab_list) > max_vocab:
            vocab_list = vocab_list[:max_vocab]
        self.word_to_int_table = {c[0]: i for i, c in enumerate(vocab_list)}
        self.int_to_word_table = {i: c[0] for i, c in enumerate(vocab_list)}
        self.vocab = vocab_list
    def vocab_size(self):
        # Number of output classes: every vocabulary character plus one <unk> slot
        return len(self.vocab) + 1
    def int_to_word(self, index):
        # Map an index back to its character
        if index == len(self.vocab):
            return '<unk>'
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('index out of range')
    def word_to_int(self, word):
        # Map a character to its index; out-of-vocabulary characters map to len(vocab)
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)
    def text_to_arr(self, text):
        # Convert a text string into an array of indices
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)
    def arr_to_text(self, arr):
        # Convert an array of indices back into a text string
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return ''.join(words)
if __name__ == '__main__':
    # Build a character converter
    convert = TextConverter('./poetry.txt', max_vocab=1000)
    # print('vocabulary', convert.vocab)
    print('vocabulary size', convert.vocab_size())
    with open('./poetry.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    txt = txt.replace('\n', ' ').replace('\r', ' ').replace(',', ' ').replace('。', ' ')
    # txt_char = txt[:11]
    # print('original string: ', txt_char)
    # arr = convert.text_to_arr(txt_char)   # to indices
    # print('as indices: ', arr)
    # txt_char = convert.arr_to_text(arr)   # and back to characters
    # print('back to characters: ', txt_char)
    # Build sequential training samples
    n_step = 20  # sequence length: each sequence holds 20 characters
    # Total number of sequences
    num_sep = int(len(txt) / n_step)
    # Drop the tail that does not fill a whole sequence
    txt = txt[:num_sep * n_step]
    print('{} sequences, each containing {} characters'.format(num_sep, n_step))
    print('first sequence: ', txt[:20])
    print('as indices: ', convert.text_to_arr(txt[:20]))
    # Convert the whole text to indices and reshape into a (num_sep, n_step) matrix
    arr = convert.text_to_arr(txt)
    arr = arr.reshape((num_sep, -1))
    arr = torch.from_numpy(arr)
    print('arr shape', arr.shape)
    print('first sequence', arr[0, :])
    # Convert back to characters
    arr_text = convert.arr_to_text(np.array(arr[0, :]))
    print('first sequence as characters', arr_text)
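The reshape used above is plain chunking: a flat array of num_sep * n_step ids becomes one row per sequence. A tiny sketch with hypothetical ids:

import numpy as np
flat = np.arange(12)           # 12 hypothetical character ids
print(flat.reshape((3, -1)))   # 3 sequences of length 4, one row each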
import numpy as np
import re
import torch
class TextConverter(object):
    def __init__(self, text_path, max_vocab=5000):
        """
        Build a character <-> index converter; the core job is constructing a vocabulary.
        :param text_path: path to the text file
        :param max_vocab: maximum vocabulary size
        """
        with open(text_path, 'r', encoding='utf-8') as f:
            text_file = f.readlines()
        # print('peek at the data', text_file[:100])
        # Strip newlines, spaces and ideographic spaces, then turn the Chinese
        # comma and full stop into plain spaces
        text_file = [re.sub(r'\n', '', _) for _ in text_file]
        text_file = [re.sub(r' ', '', _) for _ in text_file]
        text_file = [re.sub(r'\u3000', '', _) for _ in text_file]
        text_file = [_.replace('\r', ' ').replace(',', ' ').replace('。', ' ') for _ in text_file]
        # Match Chinese characters only. Note: test_file is never used below,
        # so this filter is effectively dead code kept from an earlier draft.
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        test_file = [pattern.findall(_) for _ in text_file]
        # Flatten the lines into a list of single characters
        word_list = [v for s in text_file for v in s]
        # print('{} characters in total'.format(len(word_list)))
        # Vocabulary: the set of distinct characters
        vocab = set(word_list)
        # Count how often each character occurs; if the vocabulary exceeds the
        # size limit, the least frequent characters are dropped
        vocab_count = {}
        for word in vocab:
            vocab_count[word] = 0
        for word in word_list:
            vocab_count[word] += 1
        # Convert the dict to a list and sort by frequency, descending
        vocab_list = [[key, value] for key, value in vocab_count.items()]
        vocab_list.sort(key=lambda x: x[1], reverse=True)
        # Truncate if larger than the maximum vocabulary size
        if len(vocab_list) > max_vocab:
            vocab_list = vocab_list[:max_vocab]
        self.word_to_int_table = {c[0]: i for i, c in enumerate(vocab_list)}
        self.int_to_word_table = {i: c[0] for i, c in enumerate(vocab_list)}
        self.vocab = vocab_list
    def vocab_size(self):
        # Number of output classes: every vocabulary character plus one <unk> slot
        return len(self.vocab) + 1
    def int_to_word(self, index):
        # Map an index back to its character
        if index == len(self.vocab):
            return '<unk>'
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('index out of range')
    def word_to_int(self, word):
        # Map a character to its index; out-of-vocabulary characters map to len(vocab)
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)
    def text_to_arr(self, text):
        # Convert a text string into an array of indices
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)
    def arr_to_text(self, arr):
        # Convert an array of indices back into a text string
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return ''.join(words)
class TextDataset(object):
    """
    arr: the numeric representation of the whole text, one sequence per row
    """
    def __init__(self, arr):
        self.arr = arr
    def __getitem__(self, item):
        x = self.arr[item, :]
        # Build the label: each position's target is the next character, and the
        # first character wraps around as the target of the last position.
        # zeros_like keeps the integer dtype that CrossEntropyLoss expects.
        y = torch.zeros_like(x)
        y[:-1], y[-1] = x[1:], x[0]
        return x, y
    def __len__(self):
        return self.arr.shape[0]
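# The label construction in __getitem__ is easiest to see on a toy row: each
# position's target is the next character, and the first character wraps around
# as the target of the last position. A sketch with hypothetical indices:
#   x = torch.tensor([10, 20, 30, 40])
#   y = torch.zeros_like(x)
#   y[:-1], y[-1] = x[1:], x[0]
#   y is now tensor([20, 30, 40, 10]) -- next-character targets, wrapped around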
if __name__ == '__main__':
    # Build a character converter
    convert = TextConverter('./poetry.txt', max_vocab=1000)
    # print('vocabulary', convert.vocab)
    print('vocabulary size', convert.vocab_size())
    with open('./poetry.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    txt = txt.replace('\n', ' ').replace('\r', ' ').replace(',', ' ').replace('。', ' ')
    # Build sequential training samples
    n_step = 10  # sequence length: each sequence holds 10 characters
    # Total number of sequences
    num_sep = int(len(txt) / n_step)
    # Drop the tail that does not fill a whole sequence
    txt = txt[:num_sep * n_step]
    print('{} sequences, each containing {} characters'.format(num_sep, n_step))
    # Convert the whole text to indices and reshape into a (num_sep, n_step) matrix
    arr = convert.text_to_arr(txt)
    arr = arr.reshape((num_sep, -1))
    arr = torch.from_numpy(arr)
    # Build the dataset
    train_set = TextDataset(arr)
    x, y = train_set[0]
    print(x.numpy(), convert.arr_to_text(x.numpy()))
    print(y.numpy(), convert.arr_to_text(y.numpy()))
import torch
from torch import nn
class CharRNN(nn.Module):
    """
    num_classes: number of output classes; for text generation this is the
        vocabulary size, so the model emits a score for every character
    embed_dim: nn.Embedding maps each character to an embed_dim-dimensional vector
    hidden_size: size of the hidden state
    num_layers: number of RNN layers
    dropout: accepted in the constructor signature but unused in this version
        (nn.RNN only applies dropout between layers, and the demo uses one layer)
    """
    def __init__(self, num_classes, embed_dim, hidden_size, num_layers, dropout):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.word_to_vec = nn.Embedding(num_classes, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size, num_layers, batch_first=True)
        self.project = nn.Linear(hidden_size, num_classes)
    def forward(self, x, hs=None):
        batch = x.shape[0]
        if hs is None:
            # Default initial hidden state of zeros (Variable is deprecated)
            hs = torch.zeros(self.num_layers, batch, self.hidden_size)
        word_embed = self.word_to_vec(x)    # (batch, seq_len, embed_dim)
        out, h0 = self.rnn(word_embed, hs)  # (batch, seq_len, hidden) since batch_first=True
        batch, seq_len, hid_dim = out.shape
        out = out.contiguous().view(batch * seq_len, hid_dim)
        out = self.project(out)
        out = out.view(batch, seq_len, -1)
        return out.view(-1, out.shape[2]), h0
if __name__=='__main__':
num_classes=1000
model=CharRNN(num_classes=num_classes,embed_dim=100,hidden_size=30,num_layers=1,dropout=0.5)
print(model)
x=torch.randint(0,100,(10,5))
y,h=model(x)
print(y.shape)
print(h.shape)
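The flattening at the end of forward exists because nn.CrossEntropyLoss expects (N, num_classes) scores paired with (N,) integer targets. A small shape check, assuming the same toy sizes as the demo above:

import torch
batch, seq_len, num_classes = 10, 5, 1000
scores = torch.randn(batch * seq_len, num_classes)                    # what forward returns
targets = torch.randint(0, num_classes, (batch, seq_len)).view(-1)    # flattened labels
loss = torch.nn.CrossEntropyLoss()(scores, targets)
print(loss.item())   # one scalar averaged over all batch*seq_len positions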
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
class TextConverter(object):
    def __init__(self, text_path, max_vocab=5000):
        """
        Build a character <-> index converter; the core job is constructing a vocabulary.
        :param text_path: path to the text file
        :param max_vocab: maximum vocabulary size
        """
        with open(text_path, 'r', encoding='utf-8') as f:
            text_file = f.readlines()
        # print('peek at the data', text_file[:100])
        # Strip newlines, spaces and ideographic spaces, then turn the Chinese
        # comma and full stop into plain spaces
        text_file = [re.sub(r'\n', '', _) for _ in text_file]
        text_file = [re.sub(r' ', '', _) for _ in text_file]
        text_file = [re.sub(r'\u3000', '', _) for _ in text_file]
        text_file = [_.replace('\r', ' ').replace(',', ' ').replace('。', ' ') for _ in text_file]
        # Match Chinese characters only. Note: test_file is never used below,
        # so this filter is effectively dead code kept from an earlier draft.
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        test_file = [pattern.findall(_) for _ in text_file]
        # Flatten the lines into a list of single characters
        word_list = [v for s in text_file for v in s]
        # print('{} characters in total'.format(len(word_list)))
        # Vocabulary: the set of distinct characters
        vocab = set(word_list)
        # Count how often each character occurs; if the vocabulary exceeds the
        # size limit, the least frequent characters are dropped
        vocab_count = {}
        for word in vocab:
            vocab_count[word] = 0
        for word in word_list:
            vocab_count[word] += 1
        # Convert the dict to a list and sort by frequency, descending
        vocab_list = [[key, value] for key, value in vocab_count.items()]
        vocab_list.sort(key=lambda x: x[1], reverse=True)
        # Truncate if larger than the maximum vocabulary size
        if len(vocab_list) > max_vocab:
            vocab_list = vocab_list[:max_vocab]
        self.word_to_int_table = {c[0]: i for i, c in enumerate(vocab_list)}
        self.int_to_word_table = {i: c[0] for i, c in enumerate(vocab_list)}
        self.vocab = vocab_list
    def vocab_size(self):
        # Number of output classes: every vocabulary character plus one <unk> slot
        return len(self.vocab) + 1
    def int_to_word(self, index):
        # Map an index back to its character; accept Python ints as well as
        # numpy scalars / 0-d arrays coming from arr_to_text(x.numpy())
        if hasattr(index, 'ndim'):
            index = int(np.squeeze(index))
        if index == len(self.vocab):
            return '<unk>'
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('index out of range')
    def word_to_int(self, word):
        # Map a character to its index; out-of-vocabulary characters map to len(vocab)
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)
    def text_to_arr(self, text):
        # Convert a text string into an array of indices
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)
    def arr_to_text(self, arr):
        # Convert an array of indices back into a text string
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return ''.join(words)
class TextDataset(object):
    """
    arr: the numeric representation of the whole text, one sequence per row
    """
    def __init__(self, arr):
        self.arr = arr
    def __getitem__(self, item):
        x = self.arr[item, :]
        # Build the label: each position's target is the next character, and the
        # first character wraps around as the target of the last position.
        # zeros_like keeps the integer dtype that CrossEntropyLoss expects.
        y = torch.zeros_like(x)
        y[:-1], y[-1] = x[1:], x[0]
        return x, y
    def __len__(self):
        return self.arr.shape[0]
class CharRNN(nn.Module):
    def __init__(self, num_classes, embed_dim, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers    # number of GRU layers
        self.hidden_size = hidden_size  # hidden state size
        # num_classes tokens in the vocabulary, each embedded as an embed_dim vector
        self.word_to_vec = nn.Embedding(num_classes, embed_dim)
        self.rnn = nn.GRU(embed_dim, hidden_size, num_layers)
        self.project = nn.Linear(hidden_size, num_classes)
    def forward(self, x, hs=None):
        batch = x.shape[0]
        if hs is None:
            # Default initial hidden state of zeros (Variable is deprecated)
            hs = torch.zeros(self.num_layers, batch, self.hidden_size)
        word_embed = self.word_to_vec(x)          # (batch, seq_len, embed)
        word_embed = word_embed.permute(1, 0, 2)  # (seq_len, batch, embed)
        out, h0 = self.rnn(word_embed, hs)        # (seq_len, batch, hidden)
        seq_len, batch, hid_dim = out.shape
        out = out.view(seq_len * batch, hid_dim)
        out = self.project(out)
        out = out.view(seq_len, batch, -1)
        out = out.permute(1, 0, 2).contiguous()   # back to (batch, seq_len, num_classes)
        return out.view(-1, out.shape[2]), h0
if __name__ == '__main__':
    # Build a character converter
    convert = TextConverter('./poetry.txt', max_vocab=1000)
    # print('vocabulary', convert.vocab)
    print('vocabulary size', convert.vocab_size())
    with open('./poetry.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    txt = txt.replace('\n', ' ').replace('\r', ' ').replace(',', ' ').replace('。', ' ')
    # Build sequential training samples
    n_step = 10  # sequence length: each sequence holds 10 characters
    # Total number of sequences
    num_sep = int(len(txt) / n_step)
    # Drop the tail that does not fill a whole sequence
    txt = txt[:num_sep * n_step]
    print('{} sequences, each containing {} characters'.format(num_sep, n_step))
    # Convert the whole text to indices and reshape into a (num_sep, n_step) matrix
    arr = convert.text_to_arr(txt)
    arr = arr.reshape((num_sep, -1))
    print('max index', np.max(arr))
    arr = torch.from_numpy(arr)
    # Build the dataset
    train_set = TextDataset(arr)
    x, y = train_set[0]
    print(x.numpy(), convert.arr_to_text(x.numpy()))
    print(y.numpy(), convert.arr_to_text(y.numpy()))
    # Build a dataloader
    batchsize = 128
    train_data = DataLoader(train_set, batchsize, shuffle=True, num_workers=4)
    # for batch in train_data:
    #     x, y = batch
    #     print(x.shape, y.shape)
    #     break
    # Build the model
    model = CharRNN(convert.vocab_size(), 512, 512, 1)
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # Training loop
    epochs = 20
    for e in range(epochs):
        train_loss = 0
        for data in train_data:
            x, y = data
            # Forward pass: score has shape (batch*seq_len, num_classes) and the
            # flattened integer labels have shape (batch*seq_len,), exactly what
            # CrossEntropyLoss expects. Re-wrapping score in a new tensor here
            # would detach it from the autograd graph, so feed it in directly.
            score, _ = model(x)
            y = y.view(-1)
            loss = criterion(score, y)
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            # Clip gradients to stabilize RNN training
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            train_loss += loss.item()
        # Perplexity is exp of the average cross-entropy
        print('epoch: {} perplexity: {:.3f}'.format(e, np.exp(train_loss / len(train_data))))
        # Save the model each epoch (saving state_dict would be lighter-weight)
        torch.save(model, 'model{}.pth'.format(e))
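The perplexity printed each epoch is exp of the average per-character cross-entropy, and reads as an effective branching factor. A quick sanity check with a hypothetical loss value:

import numpy as np
mean_loss = 4.0            # hypothetical average cross-entropy
print(np.exp(mean_loss))   # ~54.6: as uncertain as a uniform pick among ~55 characters
# A uniform guess over 1000 classes would score perplexity 1000, so values far
# below that indicate the model has learned real structure.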
import numpy as np
import re
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
class TextConverter(object):
    def __init__(self, text_path, max_vocab=5000):
        """
        Build a character <-> index converter; the core job is constructing a vocabulary.
        :param text_path: path to the text file
        :param max_vocab: maximum vocabulary size
        """
        with open(text_path, 'r', encoding='utf-8') as f:
            text_file = f.readlines()
        # print('peek at the data', text_file[:100])
        # Strip newlines, spaces and ideographic spaces, then turn the Chinese
        # comma and full stop into plain spaces
        text_file = [re.sub(r'\n', '', _) for _ in text_file]
        text_file = [re.sub(r' ', '', _) for _ in text_file]
        text_file = [re.sub(r'\u3000', '', _) for _ in text_file]
        text_file = [_.replace('\r', ' ').replace(',', ' ').replace('。', ' ') for _ in text_file]
        # Match Chinese characters only. Note: test_file is never used below,
        # so this filter is effectively dead code kept from an earlier draft.
        pattern = re.compile(r'[\u4e00-\u9fa5]+')
        test_file = [pattern.findall(_) for _ in text_file]
        # Flatten the lines into a list of single characters
        word_list = [v for s in text_file for v in s]
        # print('{} characters in total'.format(len(word_list)))
        # Vocabulary: the set of distinct characters
        vocab = set(word_list)
        # Count how often each character occurs; if the vocabulary exceeds the
        # size limit, the least frequent characters are dropped
        vocab_count = {}
        for word in vocab:
            vocab_count[word] = 0
        for word in word_list:
            vocab_count[word] += 1
        # Convert the dict to a list and sort by frequency, descending
        vocab_list = [[key, value] for key, value in vocab_count.items()]
        vocab_list.sort(key=lambda x: x[1], reverse=True)
        # Truncate if larger than the maximum vocabulary size
        if len(vocab_list) > max_vocab:
            vocab_list = vocab_list[:max_vocab]
        self.word_to_int_table = {c[0]: i for i, c in enumerate(vocab_list)}
        self.int_to_word_table = {i: c[0] for i, c in enumerate(vocab_list)}
        self.vocab = vocab_list
    def vocab_size(self):
        # Number of output classes: every vocabulary character plus one <unk> slot
        return len(self.vocab) + 1
    def int_to_word(self, index):
        # Map an index back to its character; accept Python ints as well as
        # numpy scalars (pick_top_n below returns numpy integers)
        if hasattr(index, 'ndim'):
            index = int(np.squeeze(index))
        if index == len(self.vocab):
            return '<unk>'
        elif index < len(self.vocab):
            return self.int_to_word_table[index]
        else:
            raise Exception('index out of range')
    def word_to_int(self, word):
        # Map a character to its index; out-of-vocabulary characters map to len(vocab)
        if word in self.word_to_int_table:
            return self.word_to_int_table[word]
        else:
            return len(self.vocab)
    def text_to_arr(self, text):
        # Convert a text string into an array of indices
        arr = []
        for word in text:
            arr.append(self.word_to_int(word))
        return np.array(arr)
    def arr_to_text(self, arr):
        # Convert an array of indices back into a text string
        words = []
        for index in arr:
            words.append(self.int_to_word(index))
        return ''.join(words)
class TextDataset(object):
    """
    arr: the numeric representation of the whole text, one sequence per row
    """
    def __init__(self, arr):
        self.arr = arr
    def __getitem__(self, item):
        x = self.arr[item, :]
        # Build the label: each position's target is the next character, and the
        # first character wraps around as the target of the last position.
        # zeros_like keeps the integer dtype that CrossEntropyLoss expects.
        y = torch.zeros_like(x)
        y[:-1], y[-1] = x[1:], x[0]
        return x, y
    def __len__(self):
        return self.arr.shape[0]
class CharRNN(nn.Module):
    def __init__(self, num_classes, embed_dim, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers    # number of GRU layers
        self.hidden_size = hidden_size  # hidden state size
        # num_classes tokens in the vocabulary, each embedded as an embed_dim vector
        self.word_to_vec = nn.Embedding(num_classes, embed_dim)
        self.rnn = nn.GRU(embed_dim, hidden_size, num_layers)
        self.project = nn.Linear(hidden_size, num_classes)
    def forward(self, x, hs=None):
        batch = x.shape[0]
        if hs is None:
            # Default initial hidden state of zeros (Variable is deprecated)
            hs = torch.zeros(self.num_layers, batch, self.hidden_size)
        word_embed = self.word_to_vec(x)          # (batch, seq_len, embed)
        word_embed = word_embed.permute(1, 0, 2)  # (seq_len, batch, embed)
        out, h0 = self.rnn(word_embed, hs)        # (seq_len, batch, hidden)
        seq_len, batch, hid_dim = out.shape
        out = out.view(seq_len * batch, hid_dim)
        out = self.project(out)
        out = out.view(seq_len, batch, -1)
        out = out.permute(1, 0, 2).contiguous()   # back to (batch, seq_len, num_classes)
        return out.view(-1, out.shape[2]), h0
def pick_top_n(preds, top_n=5):
    # Turn logits into probabilities, keep the top_n candidates, renormalize,
    # and sample one index among them. The softmax is needed because the model
    # outputs raw logits, which may be negative.
    probs = torch.softmax(preds, dim=1)
    top_pred_prob, top_pred_label = torch.topk(probs, top_n, 1)
    top_pred_prob /= torch.sum(top_pred_prob)
    top_pred_prob = top_pred_prob.squeeze(0).cpu().numpy()
    top_pred_label = top_pred_label.squeeze(0).cpu().numpy()
    c = np.random.choice(top_pred_label, size=1, p=top_pred_prob)
    return c
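# A quick sanity check of pick_top_n on hypothetical logits. The softmax step
# above matters: raw logits can be negative, and np.random.choice rejects
# negative values when they are passed as probabilities.
#   logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0, 0.0]])
#   pick_top_n(logits, top_n=3)  # samples one of the 3 highest-scoring ids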
if __name__ == '__main__':
    # Build a character converter
    convert = TextConverter('./poetry.txt', max_vocab=1000)
    # print('vocabulary', convert.vocab)
    print('vocabulary size', convert.vocab_size())
    with open('./poetry.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    txt = txt.replace('\n', ' ').replace('\r', ' ').replace(',', ' ').replace('。', ' ')
    # Load the trained model (the checkpoint saved at epoch 5 by the training script)
    model = torch.load('model5.pth')
    print(model)
    begin = '天青色等烟雨'  # seed text to condition on
    text_len = 30          # number of characters to generate
    model = model.eval()
    # Encode the seed and run it through the model to warm up the hidden state
    samples = [convert.word_to_int(c) for c in begin]
    input_txt = torch.LongTensor(samples)[None]
    _, init_state = model(input_txt)
    result = samples
    # Feed the last seed character as the first generation input
    model_input = input_txt[:, -1][:, None]
    for i in range(text_len):
        out, init_state = model(model_input, init_state)
        pred = pick_top_n(out.data)
        model_input = torch.LongTensor(pred)[None]
        result.append(pred[0])
    text = convert.arr_to_text(result)
    print('Generated text is: {}'.format(text))