Published: 2023-07-03 19:00
Copyright: Jingmin Wei, Pattern Recognition and Intelligent System, School of Artificial Intelligence, Huazhong University of Science and Technology
PyTorch Tutorial Column (link)
This tutorial is for learning and reference only and may not be used commercially; please contact the author before reposting.
For a detailed explanation of the LSTM architecture, see the previous article in this tutorial series.
This article uses a long short-term memory (LSTM) network for news category classification. You can also swap the model for the GRU covered in the next article and compare the two networks (a minimal GRU sketch is given right after the LSTM model definition below).
The THUCNews dataset is used for classification. It contains 10 classes of text with 6500 documents per class, split into a training set (5000×10), a validation set (500×10) and a test set (1000×10).
Dataset download link: http://thuctc.thunlp.org/
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import copy
import time
import os
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as Data
import jieba
from torchtext import data
from torchtext.vocab import Vectors
# Font for displaying Chinese characters in figures
from matplotlib.font_manager import FontProperties
fonts = FontProperties(fname = 'C:/windows/Fonts/STXIHEI.TTF')
# Select the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
cuda
1
GeForce MX250
The dataset splitting code below is adapted from: https://github.com/gaussic/text-classification-cnn-rnn
def _read_file(filename):
    """Read a single file and collapse its content into one line."""
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read().replace('\n', '').replace('\t', '').replace('\u3000', '')

def save_file(dirname):
    """Merge the per-category files into three files: train, test and validation."""
    f_train = open('data/cnews1/cnews.train.txt', 'w', encoding='utf-8')
    f_test = open('data/cnews1/cnews.test.txt', 'w', encoding='utf-8')
    f_val = open('data/cnews1/cnews.val.txt', 'w', encoding='utf-8')
    for category in os.listdir(dirname):  # one sub-directory per category
        cat_dir = os.path.join(dirname, category)
        if not os.path.isdir(cat_dir):
            continue
        files = os.listdir(cat_dir)
        count = 0
        for cur_file in files:
            filename = os.path.join(cat_dir, cur_file)
            content = _read_file(filename)
            if count < 5000:
                f_train.write(category + '\t' + content + '\n')
            elif count < 6000:
                f_test.write(category + '\t' + content + '\n')
            else:
                f_val.write(category + '\t' + content + '\n')
            count += 1
        print('Finished:', category)
    f_train.close()
    f_test.close()
    f_val.close()

save_file('data/thucnews')
print(len(open('data/cnews1/cnews.train.txt', 'r', encoding='utf-8').readlines()))
print(len(open('data/cnews1/cnews.test.txt', 'r', encoding='utf-8').readlines()))
print(len(open('data/cnews1/cnews.val.txt', 'r', encoding='utf-8').readlines()))
The texts need to be tokenized and the stopwords removed.
Stopword list: https://github.com/goto456/stopwords
# Read the training, validation and test sets
train_df = pd.read_csv('./data/cnews1/cnews.train.txt', sep = '\t',
header = None, names = ['label', 'text'])
val_df = pd.read_csv('./data/cnews1/cnews.val.txt', sep = '\t',
header = None, names = ['label', 'text'])
test_df = pd.read_csv('./data/cnews1/cnews.test.txt', sep = '\t',
header = None, names = ['label', 'text'])
stop_words = pd.read_csv('./data/cnews1/stopwords-master/hit_stopwords.txt',
                         header = None, names = ['text'], quoting=csv.QUOTE_NONE)  # treat the double quote character as ordinary text
# Preprocess the Chinese text: remove unwanted characters, tokenize, and drop stopwords
def chinese_pre(text_data):
    # Convert letters to lower case and remove digits
    text_data = text_data.lower()
    text_data = re.sub(r'\d+', '', text_data)
    # Tokenize with jieba in accurate mode
    text_data = list(jieba.cut(text_data, cut_all = False))
    # Remove stopwords and surrounding whitespace
    text_data = [word.strip() for word in text_data if word not in stop_words.text.values]
    # Join the remaining tokens into a space-separated string
    text_data = ' '.join(text_data)
    return text_data
# Apply the preprocessing to the three datasets
train_df['cutword'] = train_df.text.apply(chinese_pre)
val_df['cutword'] = val_df.text.apply(chinese_pre)
test_df['cutword'] = test_df.text.apply(chinese_pre)
Building prefix dict from the default dictionary ...
Loading model from cache E:\TEMP\jieba.cache
Loading model cost 0.741 seconds.
Prefix dict has been built successfully.
# Inspect the first few tokenized rows
train_df.cutword.head()
0 马晓旭 意外 受伤 国奥 警惕 无奈 大雨 格外 青睐 殷家 军 记者 傅亚雨 沈阳 报道...
1 商瑞华 首战 复仇 心切 中国 玫瑰 美国 方式 攻克 瑞典 多曼来 瑞典 商瑞华 首战 ...
2 冠军 球队 迎新 欢乐 派对 黄旭获 大奖 张军 赢 下 pk 赛 新浪 体育讯 月 日 ...
3 辽足 签约 危机 引 注册 难关 高层 威逼利诱 合同 笑里藏刀 新浪 体育讯 月 日 辽...
4 揭秘 谢亚龙 带走 总局 电话 骗局 复制 南杨 轨迹 体坛周报 特约记者 张锐 北京 报...
Name: cutword, dtype: object
The original sentences have now been split into individual words. After preprocessing the text, the 10 class labels are re-encoded as the integers 0-9:
A map is used to convert the label column of each dataset to the values 0-9, stored in a new labelcode column.
labelMap = {'体育': 0, '娱乐': 1, '家居': 2, '房产': 3, '教育': 4, '时尚': 5, '时政': 6, '游戏': 7, '科技': 8, '财经': 9}
train_df['labelcode'] = train_df['label'].map(labelMap)
val_df['labelcode'] = val_df['label'].map(labelMap)
test_df['labelcode'] = test_df['label'].map(labelMap)
# Save the preprocessed data
train_df[['labelcode', 'cutword']].to_csv('./data/cnews1/cnews_train2.csv', index = False)
val_df[['labelcode', 'cutword']].to_csv('./data/cnews1/cnews_val2.csv', index = False)
test_df[['labelcode', 'cutword']].to_csv('./data/cnews1/cnews_test2.csv', index = False)
# Prepare the data with the torchtext library
mytokenize = lambda x: x.split()  # tokenizer: split on whitespace
TEXT = data.Field(sequential = True, tokenize = mytokenize,
include_lengths = True, use_vocab = True,
batch_first = True, fix_length = 400)
LABEL = data.Field(sequential = False, use_vocab = False,
pad_token = None, unk_token = None)
# Map the columns of the CSV files to the fields defined above
text_data_fields = [
    ('labelcode', LABEL),  # how to process the label column
    ('cutword', TEXT)      # how to process the text column
]
# Read the three CSV files into torchtext datasets
traindata, valdata, testdata = data.TabularDataset.splits(
path = 'data/cnews1', format = 'csv',
train = 'cnews_train2.csv', fields = text_data_fields,
validation = 'cnews_val2.csv',
test = 'cnews_test2.csv', skip_header = True
)
len(traindata), len(valdata), len(testdata)
(50000, 5000, 10000)
# Build the vocabulary from the training data (no pre-trained word vectors)
TEXT.build_vocab(traindata, max_size = 20000, vectors = None)
LABEL.build_vocab(traindata)
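build_vocab keeps at most max_size of the most frequent training tokens and, by default, torchtext adds two special tokens (<unk> and <pad>); this is where the 20002 rows of the embedding layer created later come from. A quick check (the values in the comments are expectations based on torchtext defaults):
# Expected: 20002 = 20000 most frequent words + <unk> + <pad>
print(len(TEXT.vocab))
# Expected: ['<unk>', '<pad>']
print(TEXT.vocab.itos[:2])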
# Visualize the 50 most frequent words in the training set
matplotlib.rcParams['font.family'] = 'SimHei'
word_fre = TEXT.vocab.freqs.most_common(n = 50)
word_fre = pd.DataFrame(data = word_fre, columns = ['word', 'fre'])
word_fre.plot(x = 'word', y = 'fre', kind = 'bar', legend = False, figsize = (12, 7))
plt.xticks(rotation = 90, size = 10)
plt.show()
The datasets are wrapped in data loaders that feed 64 samples per batch.
# Define loaders that batch examples of similar length together
BATCH_SIZE = 64
train_iter = data.BucketIterator(traindata, batch_size = BATCH_SIZE)
val_iter = data.BucketIterator(valdata, batch_size = BATCH_SIZE)
test_iter = data.BucketIterator(testdata, batch_size = BATCH_SIZE)
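The comment above mentions batching examples of similar length, but BucketIterator only does that when it is given a sort_key. A minimal sketch of that variant (assuming the same legacy torchtext API; note that with fix_length = 400 every sequence is already padded to the same length, so this mainly matters if fix_length is removed):
# Optional variant: bucket training examples by token count to reduce padding
train_iter = data.BucketIterator(traindata, batch_size = BATCH_SIZE,
                                 sort_key = lambda ex: len(ex.cutword),
                                 sort_within_batch = True, shuffle = True)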
class LSTMNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim):
        '''
        vocab_size: size of the vocabulary
        embedding_dim: dimension of the word embeddings
        hidden_dim: number of LSTM hidden units
        layer_dim: number of LSTM layers
        output_dim: output dimension (number of classes)
        '''
        super(LSTMNet, self).__init__()
        self.hidden_dim = hidden_dim  # number of LSTM hidden units
        self.layer_dim = layer_dim    # number of LSTM layers
        # Embedding layer that maps token indices to word vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM followed by a fully connected layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, layer_dim, batch_first = True)
        self.fc1 = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        embeds = self.embedding(x)
        # r_out shape: (batch, time_step, hidden_size)
        # h_n shape:   (n_layers, batch, hidden_size)
        # h_c shape:   (n_layers, batch, hidden_size)
        r_out, (h_n, h_c) = self.lstm(embeds, None)  # None: the hidden state is zero-initialized
        # Use only the output at the last time step
        out = self.fc1(r_out[:, -1, :])
        return out
# Instantiate the LSTM text classifier
vocab_size = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 128
layer_dim = 1
output_dim = 10
mylstm = LSTMNet(vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim)
# Move the model to the GPU
mylstm.to(device)
mylstm
LSTMNet(
(embedding): Embedding(20002, 100)
(lstm): LSTM(100, 128, batch_first=True)
(fc1): Linear(in_features=128, out_features=10, bias=True)
)
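As suggested at the beginning of the article, the same pipeline can also be run with a GRU for comparison. A minimal sketch of such a variant (not trained here); only the recurrent layer and its return values change:
class GRUNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim):
        super(GRUNet, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.GRU takes the same arguments as nn.LSTM but returns a single hidden state
        self.gru = nn.GRU(embedding_dim, hidden_dim, layer_dim, batch_first = True)
        self.fc1 = nn.Linear(hidden_dim, output_dim)
    def forward(self, x):
        embeds = self.embedding(x)
        r_out, h_n = self.gru(embeds, None)  # None: zero-initialized hidden state
        return self.fc1(r_out[:, -1, :])

# mygru = GRUNet(vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim).to(device)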
# Define the training procedure
def train_model(model, traindataloader, valdataloader, criterion, optimizer, num_epochs = 25):
    train_loss_all = []
    train_acc_all = []
    val_loss_all = []
    val_acc_all = []
    since = time.time()
    for epoch in range(num_epochs):
        print('-' * 10)
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        train_loss = 0.0
        train_corrects = 0
        train_num = 0
        val_loss = 0.0
        val_corrects = 0
        val_num = 0
        # Training phase
        model.train()
        for step, batch in enumerate(traindataloader):
            textdata, target = batch.cutword[0], batch.labelcode.view(-1)
            textdata, target = textdata.to(device), target.to(device)
            out = model(textdata)
            pre_lab = torch.argmax(out, 1)  # predicted labels
            loss = criterion(out, target)   # loss value
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(target)
            train_corrects += torch.sum(pre_lab == target.data)
            train_num += len(target)
        # Loss and accuracy on the training set for this epoch
        train_loss_all.append(train_loss / train_num)
        train_acc_all.append(train_corrects.double().item() / train_num)
        print('{} Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch, train_loss_all[-1], train_acc_all[-1]))
        # Validation phase
        model.eval()
        for step, batch in enumerate(valdataloader):
            textdata, target = batch.cutword[0], batch.labelcode.view(-1)
            textdata, target = textdata.to(device), target.to(device)
            out = model(textdata)
            pre_lab = torch.argmax(out, 1)
            loss = criterion(out, target)
            val_loss += loss.item() * len(target)
            val_corrects += torch.sum(pre_lab == target.data)
            val_num += len(target)
        # Loss and accuracy on the validation set for this epoch
        val_loss_all.append(val_loss / val_num)
        val_acc_all.append(val_corrects.double().item() / val_num)
        print('{} Val Loss: {:.4f} Val Acc: {:.4f}'.format(epoch, val_loss_all[-1], val_acc_all[-1]))
    train_process = pd.DataFrame(
        data={'epoch': range(num_epochs),
              'train_loss_all': train_loss_all,
              'train_acc_all': train_acc_all,
              'val_loss_all': val_loss_all,
              'val_acc_all': val_acc_all})
    return model, train_process
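The validation pass above is run under model.eval() but still builds the autograd graph. An optional helper (a sketch, not part of the original code) that wraps the evaluation in torch.no_grad() to save memory could look like this:
def evaluate(model, dataloader, criterion):
    # Hypothetical helper: compute loss and accuracy without tracking gradients
    model.eval()
    total_loss, corrects, total = 0.0, 0, 0
    with torch.no_grad():
        for batch in dataloader:
            textdata, target = batch.cutword[0], batch.labelcode.view(-1)
            textdata, target = textdata.to(device), target.to(device)
            out = model(textdata)
            total_loss += criterion(out, target).item() * len(target)
            corrects += (torch.argmax(out, 1) == target).sum().item()
            total += len(target)
    return total_loss / total, corrects / total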
optimizer = optim.Adam(mylstm.parameters(), lr=0.0003)  # optimizer
loss_func = nn.CrossEntropyLoss().to(device)  # loss function
# Train the model for 20 epochs
mylstm, train_process = train_model(mylstm, train_iter, val_iter, loss_func, optimizer, num_epochs = 20)
----------
Epoch 0/19
0 Train Loss: 2.0642 Train Acc: 0.2393
0 Val Loss: 2.3405 Val Acc: 0.1372
----------
Epoch 1/19
1 Train Loss: 1.9179 Train Acc: 0.3016
1 Val Loss: 1.9523 Val Acc: 0.2386
----------
Epoch 2/19
2 Train Loss: 1.6840 Train Acc: 0.3910
2 Val Loss: 1.9267 Val Acc: 0.2402
----------
Epoch 3/19
3 Train Loss: 1.3907 Train Acc: 0.5084
3 Val Loss: 1.4880 Val Acc: 0.4932
----------
Epoch 4/19
4 Train Loss: 1.0219 Train Acc: 0.6447
4 Val Loss: 1.2360 Val Acc: 0.5664
----------
Epoch 5/19
5 Train Loss: 0.7950 Train Acc: 0.7398
5 Val Loss: 0.9895 Val Acc: 0.7082
----------
Epoch 6/19
6 Train Loss: 0.6296 Train Acc: 0.7970
6 Val Loss: 0.9012 Val Acc: 0.6896
----------
Epoch 7/19
7 Train Loss: 0.5226 Train Acc: 0.8317
7 Val Loss: 0.7032 Val Acc: 0.7874
----------
Epoch 8/19
8 Train Loss: 0.4181 Train Acc: 0.8619
8 Val Loss: 0.6257 Val Acc: 0.7990
----------
Epoch 9/19
9 Train Loss: 0.3411 Train Acc: 0.8937
9 Val Loss: 0.5337 Val Acc: 0.8308
----------
Epoch 10/19
10 Train Loss: 0.2597 Train Acc: 0.9260
10 Val Loss: 0.4045 Val Acc: 0.8702
----------
Epoch 11/19
11 Train Loss: 0.2061 Train Acc: 0.9424
11 Val Loss: 0.3729 Val Acc: 0.8888
----------
Epoch 12/19
12 Train Loss: 0.1720 Train Acc: 0.9527
12 Val Loss: 0.4396 Val Acc: 0.8738
----------
Epoch 13/19
13 Train Loss: 0.1485 Train Acc: 0.9606
13 Val Loss: 0.3064 Val Acc: 0.9162
----------
Epoch 14/19
14 Train Loss: 0.1227 Train Acc: 0.9671
14 Val Loss: 0.3010 Val Acc: 0.9162
----------
Epoch 15/19
15 Train Loss: 0.1029 Train Acc: 0.9726
15 Val Loss: 0.3119 Val Acc: 0.9078
----------
Epoch 16/19
16 Train Loss: 0.0861 Train Acc: 0.9773
16 Val Loss: 0.3149 Val Acc: 0.9074
----------
Epoch 17/19
17 Train Loss: 0.0733 Train Acc: 0.9803
17 Val Loss: 0.2583 Val Acc: 0.9312
----------
Epoch 18/19
18 Train Loss: 0.0620 Train Acc: 0.9840
18 Val Loss: 0.2877 Val Acc: 0.9232
----------
Epoch 19/19
19 Train Loss: 0.0517 Train Acc: 0.9866
19 Val Loss: 0.2865 Val Acc: 0.9320
# Visualize the training process
plt.figure(figsize = (18, 6))
plt.subplot(1, 2, 1)
plt.plot(train_process.epoch, train_process.train_loss_all, 'r.-', label = 'Train Loss')
plt.plot(train_process.epoch, train_process.val_loss_all, 'bs-', label = 'Val Loss')
plt.legend()
plt.xlabel('Epoch number', size = 13)
plt.ylabel('Loss value', size = 13)
plt.subplot(1, 2, 2)
plt.plot(train_process.epoch, train_process.train_acc_all, 'r.-', label = 'Train Acc')
plt.plot(train_process.epoch, train_process.val_acc_all, 'bs-', label = 'Val Acc')
plt.legend()
plt.xlabel('Epoch number', size = 13)
plt.ylabel('Acc', size = 13)
plt.show()
The curves show that the loss decreases on both the training and validation sets and then levels off; in the last few epochs the validation loss rises slightly, indicating that further training would start to overfit. The prediction accuracy on both sets first rises quickly and then stays within a stable range, indicating that the network has been trained sufficiently.
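Since train_process records the per-epoch metrics, the epoch with the lowest validation loss can be read off directly, for example to decide where further training stops paying off:
best_epoch = int(train_process.val_loss_all.idxmin())
print('Epoch with the lowest validation loss:', best_epoch)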
# Save the trained network
torch.save(mylstm, './model/mylstm.pkl')
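torch.save(mylstm, ...) pickles the whole module object, which ties the saved file to this exact class definition. A more portable alternative (shown only as a sketch; the file name here is made up) is to save just the parameters:
# Save only the parameters, then rebuild the model and load them back in
torch.save(mylstm.state_dict(), './model/mylstm_state.pkl')
# mylstm2 = LSTMNet(vocab_size, embedding_dim, hidden_dim, layer_dim, output_dim)
# mylstm2.load_state_dict(torch.load('./model/mylstm_state.pkl'))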
Finally, apply the trained network to the test set and predict its labels.
# Predict on the test set and compute the accuracy
mylstm.eval()
test_y_all = torch.LongTensor().to(device)
pre_lab_all = torch.LongTensor().to(device)
for step, batch in enumerate(test_iter):
    textdata, target = batch.cutword[0], batch.labelcode.view(-1)
    textdata, target = textdata.to(device), target.to(device)
    out = mylstm(textdata)
    pre_lab = torch.argmax(out, 1)
    test_y_all = torch.cat((test_y_all, target))     # ground-truth labels of the test set
    pre_lab_all = torch.cat((pre_lab_all, pre_lab))  # predicted labels of the test set
acc = accuracy_score(test_y_all.cpu(), pre_lab_all.cpu())
print('Accuracy on the test set:', acc)
Accuracy on the test set: 0.9337
# Compute the confusion matrix and visualize it
class_label = ['体育', '娱乐', '家居', '房产', '教育',
'时尚', '时政', '游戏', '科技', '财经']
conf_mat = confusion_matrix(test_y_all.cpu(), pre_lab_all.cpu())
df_cm = pd.DataFrame(conf_mat, index = class_label, columns = class_label)
heatmap = sns.heatmap(df_cm, annot = True, fmt = 'd', cmap = 'YlGnBu')
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation = 0, ha = 'right', fontproperties = fonts)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation = 45, ha = 'right', fontproperties = fonts)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()
The heatmap shows that the 家居 (home) and 房产 (real estate) classes are most easily confused with each other, and the recognition accuracy for the 家居 class is comparatively low.
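To put numbers on this observation, a per-class report can be printed alongside the confusion matrix (a small supplement, not in the original code):
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the test set
print(classification_report(test_y_all.cpu().numpy(), pre_lab_all.cpu().numpy(), target_names = class_label))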
Next, t-SNE is used to reduce the dimensionality of the word embeddings and visualize their distribution in two-dimensional space.
from sklearn.manifold import TSNE
mylstm = torch.load('./model/mylstm.pkl')
# Extract the learned word embeddings
word2vec = mylstm.embedding.weight.cpu()
# The word corresponding to each embedding row
words = TEXT.vocab.itos
# Reduce the embeddings to 2-D with t-SNE and visualize the distribution of all words
tsne = TSNE(n_components = 2, random_state = 123)
word2vec_tsne = tsne.fit_transform(word2vec.data.numpy())
# Scatter plot of the 2-D embedding of every word
plt.figure(figsize = (10, 8))
plt.scatter(word2vec_tsne[:, 0], word2vec_tsne[:, 1], s = 4)
plt.title('所有词向量的分布情况', fontproperties = fonts, size = 15)
plt.show()
The plot shows that the word vectors form a roughly circular cloud that becomes sparser towards its edge.
Next, a few high-frequency words of interest are selected and visualized to examine the relationships between them:
# Visualize the distribution of selected words of interest
vis_word = ['中国', '市场', '公司', '美国', '记者', '学生', '游戏', '北京',
'投资', '电影', '银行', '工作', '留学', '大学', '经济', '产品',
'设计', '方面', '玩家', '学校', '学习', '放假', '专家', '楼市']
# Look up the vocabulary index of each selected word
vis_word_index = [words.index(ii) for ii in vis_word]
plt.figure(figsize = (10, 8))
for ii, index in enumerate(vis_word_index):
    plt.scatter(word2vec_tsne[index, 0], word2vec_tsne[index, 1])
    plt.text(word2vec_tsne[index, 0], word2vec_tsne[index, 1], vis_word[ii], fontproperties = fonts)
plt.title('词向量的分布情况', fontproperties = fonts, size = 15)
plt.show()
The figure shows that words which are closely related in real life lie close together: 房价 (housing prices) and 投资 (investment) are very close; 学生 (student) is near 玩家 (player), 设计 (design) and 电影 (movie); and 游戏 (game), 公司 (company) and 方面 (aspect) are close to one another.
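This visual impression can also be checked numerically by computing the cosine similarity between the learned embeddings of a few of the plotted words (the pairs below are picked only as examples):
pairs = [('楼市', '投资'), ('学生', '学校'), ('游戏', '玩家')]
for w1, w2 in pairs:
    v1 = word2vec.data[words.index(w1)]
    v2 = word2vec.data[words.index(w2)]
    print(w1, w2, round(F.cosine_similarity(v1, v2, dim = 0).item(), 3))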