from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, roc_auc_score
import joblib
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
# Switch the working directory to the dataset folder: every CSV, pickle and
# joblib file name used below is relative, so all inputs are read from and all
# outputs are written to this directory.
# NOTE(review): machine-specific absolute path, and the chdir runs at import
# time -- confirm or adjust before running on another machine.
path = '/Users/xinghuatianying/data/DataSets/im_cheat/'
os.chdir(path)
# Training sample demo (columns: label, msg):
# label msg
# 1 全网最低价完善数据 手把手教你引流 详细+V:vd12388
# 1 耍.微.店等.级评.价销.量回头.率?15314268311
# 1 我收姐妹+我QQ2877613260
# 1 违规啥啊 佳651815289?我q
# 0 温和洁面膏+红粉爽肤水
# 0 小卡盲盒改地址
# 0 20杯(每个口味各4杯)
def load_data():
    """Load the labelled messages and split them into train and test sets.

    Reads ``msg_train.csv`` (tab-separated, no header row, columns ``label``
    and ``msg``) from the current working directory, shuffles the rows with a
    fixed seed, rewrites every message as space-separated characters and
    holds out 30% of the rows for evaluation.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``. The ``x_*`` items are
        pandas Series of character-separated messages; the ``y_*`` items are
        the matching ``label`` Series.
    """
    data = pd.read_csv(
        'msg_train.csv',
        sep="\t",
        names=['label', 'msg'],
        # Force text: without this, digits-only messages can be parsed as
        # numbers, and ' '.join() below only accepts strings.
        dtype={'msg': str},
    )

    # A row without a message (empty field, or text pandas treats as NA)
    # would be NaN, which ' '.join() cannot iterate; such rows carry no text
    # to learn from, so drop them instead of crashing.
    data = data.dropna(subset=['msg'])

    # Shuffle the rows reproducibly.
    # Original author's note: the 0/1 labels are roughly balanced.
    data = data.sample(frac=1, random_state=42)

    # Split each message into single characters separated by spaces, so a
    # whitespace/word-level tokenizer sees one token per character.
    data['msg'] = data['msg'].apply(' '.join)

    x_train, x_test, y_train, y_test = train_test_split(
        data['msg'],
        data['label'],
        test_size=0.3,
        random_state=42,
    )
    return x_train, x_test, y_train, y_test
def train_model(x_train, x_test, y_train, y_test):
    """Fit a TF-IDF + logistic-regression classifier and print its accuracy.

    Side effects: pickles the fitted vectorizer to ``tfidf_model.pkl``, dumps
    the fitted classifier with joblib to ``lr_word_ngram.pkl`` (both in the
    current working directory), and prints the vocabulary size followed by
    the accuracy on the held-out set.

    Args:
        x_train: Training messages (text; ``load_data`` in this file supplies
            them as space-separated characters).
        x_test: Held-out messages in the same format.
        y_train: Labels for ``x_train``.
        y_test: Labels for ``x_test``.

    Returns:
        None.
    """
    vectorizer_word = TfidfVectorizer(
        max_features=800000,
        # Unlike scikit-learn's default pattern, this one keeps
        # single-character tokens.
        token_pattern=r"(?u)\b\w+\b",
        min_df=1,
        analyzer='word',
        ngram_range=(1, 5),
    )

    # Learn the vocabulary and vectorize the training set in a single pass
    # (same result as fit() followed by transform(), without tokenising the
    # training corpus twice).
    tfidf_train = vectorizer_word.fit_transform(x_train)

    # Persist the fitted vectorizer so prediction can reuse the vocabulary.
    with open('tfidf_model.pkl', 'wb') as f:
        pickle.dump(vectorizer_word, f)

    tfidf_test = vectorizer_word.transform(x_test)

    # Report the vocabulary size.
    print(len(vectorizer_word.vocabulary_))

    # Train the logistic-regression classifier.
    lr_word = LogisticRegression(solver='sag', verbose=2)
    lr_word.fit(tfidf_train, y_train)

    # Persist the classifier for later prediction runs.
    joblib.dump(lr_word, 'lr_word_ngram.pkl')

    # Evaluate on the held-out set.
    y_pred_word_1 = lr_word.predict(tfidf_test)
    print(accuracy_score(y_test, y_pred_word_1))
def predcit_main():
    """Score the messages in ``test_msg.csv`` with the saved model.

    Reads one message per row from ``test_msg.csv``, converts each message to
    space-separated characters, vectorizes it with the vectorizer pickled in
    ``tfidf_model.pkl``, scores it with the classifier stored in
    ``lr_word_ngram.pkl`` and writes the class-1 probability next to the
    (character-separated) message to the tab-separated file
    ``predict_test.csv``. All paths are relative to the current working
    directory. Also prints the shape and the first rows of the input.

    The function name is misspelled ("predcit"); it is kept because callers
    use it.

    Returns:
        None.
    """
    # Read every value as literal text: dtype=str stops digits-only messages
    # from becoming numbers, and keep_default_na=False stops messages such as
    # "NA" or "null" from becoming NaN -- either would make ' '.join() below
    # raise TypeError.
    # NOTE(review): the default ',' separator is used here, so a message
    # containing an ASCII comma is split across columns -- confirm the format
    # of test_msg.csv.
    data = pd.read_csv(
        'test_msg.csv',
        names=['msg'],
        dtype={'msg': str},
        keep_default_na=False,
    )

    # Shuffle the rows reproducibly (this also fixes the output row order).
    data = data.sample(frac=1, random_state=42)
    print(data.shape)
    print(data.head(10))

    # Split each message into single characters separated by spaces.
    data['msg'] = data['msg'].apply(' '.join)
    x_test = data['msg']

    # Load the fitted vectorizer. pickle executes code on load, so only load
    # files produced by a trusted training run.
    with open('tfidf_model.pkl', 'rb') as f:
        tfidf_model = pickle.load(f)
    print(x_test[:10])
    tfidf_test = tfidf_model.transform(x_test)

    # Load the trained classifier.
    lr_model = joblib.load(filename="lr_word_ngram.pkl")

    # Probability of class 1 for every message.
    y_pred_word = lr_model.predict_proba(tfidf_test)[:, 1]
    predict_df = pd.DataFrame({'y_pred_word': y_pred_word, 'x_test': x_test})

    # Save the scores as a tab-separated file.
    predict_df.to_csv('predict_test.csv', index=False, sep="\t")
if __name__ == '__main__':
    # Load and split the training data, then train and evaluate the model.
    train_model(*load_data())
    # Score the messages in the prediction file with the saved model.
    predcit_main()