依次需要使用pip命令安装numpy、scipy、scikit-learn、matplotlib模块。
python代码:
import os
import sys
import codecs
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
# Numeric class ids fed to the classifier: 0 = ham (normal), 1 = spam.
LABEL_IDS = {"ham": 0, "spam": 1}

# 1-based line number (the header is line 1) of the last training record;
# every later line is held out as test data.
TRAIN_LINE_LIMIT = 5550


def parse_sms_line(line):
    """Split one ``label,text`` record into ``(label_id, sentence)``.

    Only the first comma separates the label from the message, so commas
    inside the message are preserved. The trailing line ending is removed.

    Returns ``None`` when the line has no comma or the label is neither
    ``ham`` nor ``spam``, so callers can skip the record instead of ending
    up with a sentence that has no matching label.
    """
    label, sep, sentence = line.partition(",")
    if not sep:
        return None
    label_id = LABEL_IDS.get(label.strip())
    if label_id is None:
        return None
    return label_id, sentence.rstrip("\r\n")


def load_sms_corpus(path, train_line_limit=TRAIN_LINE_LIMIT):
    """Read the SMS file and split it into training and test sets.

    The first line is treated as a header and skipped. Records on lines up
    to and including ``train_line_limit`` go to the training set; records
    on later lines go to the test set only, so the test data is never used
    for training.

    Returns ``(corpus, labels, corpus_test, labels_test)``.
    """
    corpus, labels = [], []
    corpus_test, labels_test = [], []
    # Decode as UTF-8 up front (CountVectorizer's default encoding) so the
    # same code handles text on both Python 2 and Python 3; the context
    # manager guarantees the file is closed.
    with codecs.open(path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, 1):
            if line_no == 1:
                continue  # header row
            record = parse_sms_line(line)
            if record is None:
                continue  # malformed line or unknown label
            label_id, sentence = record
            if line_no > train_line_limit:
                corpus_test.append(sentence)
                labels_test.append(label_id)
            else:
                corpus.append(sentence)
                labels.append(label_id)
    return corpus, labels, corpus_test, labels_test


if __name__ == '__main__':
    corpus, labels, corpus_test, labels_test = load_sms_corpus(
        "i:/ML/sms_spam.text")

    # Learn the vocabulary from the training messages only.
    vectorizer = CountVectorizer()
    fea_train = vectorizer.fit_transform(corpus)
    print(vectorizer.get_feature_names())
    print(fea_train.toarray())

    # Encode the test messages with the already-fitted vocabulary; the test
    # set must not be fitted again.
    fea_test = vectorizer.transform(corpus_test)
    print(vectorizer.get_feature_names())
    print(fea_test.toarray())

    # Create the Multinomial Naive Bayes classifier.
    # alpha = 1: Laplace smoothing, i.e. add 1 to every word count.
    clf = MultinomialNB(alpha=1)
    clf.fit(fea_train, labels)

    # Classify the held-out messages and report each prediction.
    pred = clf.predict(fea_test)
    for p in pred:
        if p == 0:
            print("正常邮件")
        else:
            print("垃圾邮件")
/** Trains, evaluates and persists a Spark MLlib Naive Bayes classifier.
  *
  * Each input line is expected in the form `label,f1 f2 f3 ...`: a numeric
  * label, a comma, then space-separated numeric features.
  */
object Naive_bayes {
  import org.apache.spark.{SparkConf, SparkContext}
  import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
  import org.apache.spark.mllib.linalg.Vectors
  import org.apache.spark.mllib.regression.LabeledPoint

  def main(args: Array[String]): Unit = {
    // 1. Build the Spark context (local mode).
    val conf = new SparkConf().setAppName("Naive_bayes").setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // Read the sample data and parse every line into a LabeledPoint.
      val data = sc.textFile("sample_naive_bayes_data.txt")
      val parsedData = data.map { line =>
        val parts = line.split(',')
        LabeledPoint(
          parts(0).toDouble,
          Vectors.dense(parts(1).split(' ').map(_.toDouble)))
      }

      // Split the samples into training (90%) and test (10%) sets.
      val splits = parsedData.randomSplit(Array(0.9, 0.1), seed = 11L)
      val training = splits(0)
      val test = splits(1)

      // Train the Naive Bayes model; lambda is the Laplace smoothing term.
      val model = NaiveBayes.train(training, lambda = 1.0)

      // Predict on the test set and print up to 10020 (prediction, label) pairs.
      val predictionAndLabel =
        test.map(p => (model.predict(p.features), p.label))
      val print_predict = predictionAndLabel.take(10020)
      println("prediction" + "\t" + "label")
      print_predict.foreach { case (prediction, label) =>
        println(s"$prediction\t$label")
      }

      // Accuracy = correctly classified test samples / all test samples.
      val correct = predictionAndLabel.filter(x => x._1 == x._2).count()
      val accuracy = 1.0 * correct / test.count()
      println(accuracy)

      // Save the model, then load it back from the same path.
      val ModelPath = "naive_bayes_model"
      model.save(sc, ModelPath)
      val sameModel = NaiveBayesModel.load(sc, ModelPath)
    } finally {
      // Release the Spark context even if a step above fails.
      sc.stop()
    }
  }
}