Task description:
Use the movie reviews in the NLTK corpus to train a Naive Bayes sentiment classifier.
Importing the reviews:
from nltk.corpus import movie_reviews
The corpus lives under …\nltk_data\corpora\movie_reviews and is already labeled: 1000 negative and 1000 positive reviews.
Steps:
Run text preprocessing (noise removal, sentence splitting, tokenization, stop-word removal, stemming, trimming) and feature selection to build a feature word list, then train a Naive Bayes model. (Preferably add a comment for each step.)
Use the first 80% (i.e., the first 800 negative and the first 800 positive reviews) as the training set and the remaining 20% as the test set.
Output:
Accuracy, Precision, Recall, and F1, rounded to two decimal places,
where F1 = (2 * Precision * Recall) / (Precision + Recall).
For example:
Accuracy = 0.98
Precision = 0.67
Recall = 0.32
F1 = 0.43
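If the corpus or the stop-word list is not yet installed under nltk_data, both can be fetched once with nltk.download; a minimal setup sketch (the resource names are the standard NLTK identifiers):
import nltk
# One-time downloads of the resources used below (standard NLTK identifiers).
nltk.download('movie_reviews')   # the labeled review corpus
nltk.download('stopwords')       # English stop-word list used during preprocessing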
from nltk.corpus import movie_reviews
all_words = movie_reviews.words()
# all_words
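A quick optional sanity check confirms the two labels and the 1000 + 1000 split described in the task:
# Optional sanity check: two categories, 1000 files each.
print(movie_reviews.categories())          # ['neg', 'pos']
print(len(movie_reviews.fileids('neg')))   # 1000
print(len(movie_reviews.fileids('pos')))   # 1000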
Normalize the text: lowercase everything, strip punctuation, remove stop words, and stem.
import re
import string
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

def prepro(token):
    # Lowercase every token
    text_lower = [word.lower() for word in token]
    # Strip punctuation
    x = re.compile('[%s]' % re.escape(string.punctuation))
    text_punc = []
    for tok in text_lower:
        new_token = x.sub(u'', tok)
        if new_token != u'':
            text_punc.append(new_token)
    # Remove stop words
    stops = set(stopwords.words('english'))
    text_norm = [w for w in text_punc if w not in stops]
    # Stemming (create the stemmer once instead of per word)
    stemmer = PorterStemmer()
    text_stem = [stemmer.stem(word) for word in text_norm]
    return text_stem
pro_words = prepro(all_words)
# pro_words
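As a quick illustration (a hypothetical toy input, not taken from the corpus), the pipeline lowercases, strips punctuation and stop words, and stems:
# Hypothetical toy input, just to illustrate prepro:
print(prepro(['This', 'movie', ',', 'was', 'amazing', '!']))
# expected: ['movi', 'amaz']   (stop words 'this'/'was' and punctuation removed, then stemmed)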
import nltk
# Feature selection: keep the most frequent words (top 5% by frequency, or alternatively the top 2000).
freq_words = nltk.FreqDist(pro_words)
n = int(0.05 * len(freq_words))
words_features = freq_words.most_common(n)
words_features = [x[0] for x in words_features]
# words_features = list(freq_words.keys())[:2000]
# words_features
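It can be worth eyeballing the selected vocabulary; a small inspection sketch (the exact words and counts depend on the corpus and the preprocessing above):
# Inspect the selected vocabulary (results depend on the corpus/preprocessing).
print(n)                           # number of feature words kept
print(freq_words.most_common(10))  # ten most frequent stems with their counts
print(words_features[:10])         # the corresponding feature words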
def doc_features(doc):
    # Deduplicate the document's tokens
    doc_words = set(doc)
    features = {}
    for word in words_features:
        # Vectorize: does the document contain this feature word?
        features['contains(%s)' % word] = (word in doc_words)
    return features
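For instance (a hypothetical two-token "document"), doc_features returns one boolean per feature word:
# Hypothetical example: only feature words present in the document map to True.
print(doc_features(['film', 'bad'])['contains(film)'])   # True, assuming 'film' is among the feature words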
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()     # the label ('neg' or 'pos')
             for fileid in movie_reviews.fileids(category)]  # the review's file id
featuresets = [(doc_features(d), c) for (d, c) in documents]
featuresets
Output:
[({'contains(film)': True,
'contains(movi)': False,
'contains(one)': True,
'contains(like)': True,
'contains(charact)': False,
'contains(make)': True,
'contains(get)': True,
'contains(time)': False,
'contains(scene)': False,
'contains(even)': True,
...
'contains(fairli)': False,
'contains(seven)': False,
...},
'neg'),
...]
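Note that words_features holds stems produced by prepro, while doc_features above is applied to the raw document tokens, so stemmed entries such as 'contains(movi)' can only match if each document is preprocessed the same way. A variant along those lines is sketched below (the results reported later were obtained with the raw-token version above):
# Variant (not used for the results below): preprocess each document with the
# same pipeline before extracting features, so that stems like 'movi' can match.
# featuresets = [(doc_features(prepro(d)), c) for (d, c) in documents]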
train_set = featuresets[0:800] + featuresets[1000:1800]
test_set = featuresets[800:1000] + featuresets[1800:2000]
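This slicing assumes featuresets is ordered with all 1000 negative reviews first and all 1000 positive reviews second, which follows from iterating movie_reviews.categories() (['neg', 'pos']) in the list comprehension above; a quick assertion makes the assumption explicit:
# The 80/20 split relies on the neg-then-pos ordering of featuresets.
assert [c for (_, c) in featuresets[:1000]] == ['neg'] * 1000
assert [c for (_, c) in featuresets[1000:]] == ['pos'] * 1000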
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist, sum_logs
from collections import defaultdict
"""
A classifier based on the Naive Bayes algorithm. In order to find the
probability for a label, this algorithm first uses the Bayes rule to
express P(label|features) in terms of P(label) and P(features|label):
| P(label) * P(features|label)
| P(label|features) = ------------------------------
| P(features)
The algorithm then makes the 'naive' assumption that all features are
independent, given the label:
| P(label) * P(f1|label) * ... * P(fn|label)
| P(label|features) = --------------------------------------------
| P(features)
Rather than computing P(features) explicitly, the algorithm just
calculates the numerator for each label, and normalizes them so they
sum to one:
| P(label) * P(f1|label) * ... * P(fn|label)
| P(label|features) = --------------------------------------------
| SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) )
"""
def custome_train(labeled_featuresets, estimator=ELEProbDist):
    """
    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.
    """
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()
    # Count up how many times each feature value occurred, given
    # the label and featurename.
    for featureset, label in labeled_featuresets:
        label_freqdist[label] += 1
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname][fval] += 1
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)
    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.' This loop
    # counts up the number of 'missing' feature values for each
    # (label, fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            # Only add a None key when necessary, i.e. if there are
            # any samples with feature 'fname' missing.
            if num_samples - count > 0:
                feature_freqdist[label, fname][None] += num_samples - count
                feature_values[fname].add(None)
    # Create the P(label) distribution.
    label_probdist = estimator(label_freqdist)
    # Create the P(fval|label, fname) distribution.
    feature_probdist = {}
    for ((label, fname), freqdist) in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist
    return label_probdist, feature_probdist
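The default estimator, ELEProbDist, is NLTK's expected-likelihood estimate, i.e. Lidstone smoothing with gamma = 0.5, so every observed count is effectively incremented by 0.5 before probabilities are formed; a toy illustration of the smoothing:
# Toy FreqDist: feature value True seen 3 times, False once, for some (label, fname).
fd = FreqDist({True: 3, False: 1})
pd = ELEProbDist(fd, bins=2)
print(pd.prob(True))    # (3 + 0.5) / (4 + 0.5 * 2) = 0.7
print(pd.prob(False))   # (1 + 0.5) / (4 + 0.5 * 2) = 0.3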
def custome_prob_classify(featureset, _label_probdist, _feature_probdist):
    # Discard any feature names that we've never seen before.
    # Otherwise, we'll just assign a probability of 0 to
    # everything.
    _labels = list(_label_probdist.samples())
    featureset = featureset.copy()
    for fname in list(featureset.keys()):
        for label in _labels:
            if (label, fname) in _feature_probdist:
                break
        else:
            # print('Ignoring unseen feature %s' % fname)
            del featureset[fname]
    # Find the log probability of each label, given the features.
    # Start with the log probability of the label itself.
    logprob = {}
    for label in _labels:
        logprob[label] = _label_probdist.logprob(label)
    # Then add in the log probability of features given labels.
    for label in _labels:
        for (fname, fval) in featureset.items():
            if (label, fname) in _feature_probdist:
                feature_probs = _feature_probdist[label, fname]
                logprob[label] += feature_probs.logprob(fval)
            else:
                # nb: This case will never come up if the
                # classifier was created by
                # NaiveBayesClassifier.train().
                logprob[label] += sum_logs([])  # = -INF.
    return DictionaryProbDist(logprob, normalize=True, log=True)
def custome_classify(featureset, label_probdist, feature_probdist):
    return custome_prob_classify(featureset, label_probdist, feature_probdist).max()
_label_probdist, _feature_probdist = custome_train(train_set)
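With the trained distributions in hand, they can be probed directly before the full evaluation; a small sketch (exact values depend on the training data, and 'contains(film)' is just one of the feature names generated above):
# Prior over labels; with 800 + 800 training documents both priors are ~0.5.
print(_label_probdist.prob('pos'))
# Conditional feature probability: keys of _feature_probdist are (label, feature_name) pairs.
print(_feature_probdist[('pos', 'contains(film)')].prob(True))
# Classify a single held-out review with the custom functions defined above.
print(custome_classify(test_set[0][0], _label_probdist, _feature_probdist))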
import collections
from nltk import precision, recall, f_measure
# Accuracy
print("Accuracy = %.2f" % nltk.classify.accuracy(nltk.NaiveBayesClassifier(_label_probdist, _feature_probdist), test_set))
# Precision, recall, and F1
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = custome_classify(feats, _label_probdist, _feature_probdist)
    testsets[observed].add(i)
# Positive class
print('-' * 20)
print('Positive class scores')
print('Precision = %.2f' % precision(refsets['pos'], testsets['pos']))
print('Recall = %.2f' % recall(refsets['pos'], testsets['pos']))
print('F1 = %.2f' % f_measure(refsets['pos'], testsets['pos']))
# Negative class
print('-' * 20)
print('Negative class scores')
print('Precision = %.2f' % precision(refsets['neg'], testsets['neg']))
print('Recall = %.2f' % recall(refsets['neg'], testsets['neg']))
print('F1 = %.2f' % f_measure(refsets['neg'], testsets['neg']))
Output:
Accuracy = 0.78
--------------------
Positive class scores
Precision = 0.80
Recall = 0.76
F1 = 0.78
--------------------
Negative class scores
Precision = 0.77
Recall = 0.81
F1 = 0.79