一个使用pyqt的word文档查重工具

发布时间：2024年01月24日

一个使用pyqt的word文档查重工具

使用场景
代码
使用截图
打包好的软件下载链接
结尾

使用场景

有时我们在借鉴一篇文档之后，不希望与原文有太多重复，这时可以使用这个工具对两个 Word 文档进行对比。

代码

import sys
from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton, QVBoxLayout, QWidget, QLabel, QFileDialog
from docx import Document
import re, datetime


class WordComparerApp(QMainWindow):
    """Main window of a tool that looks for shared text in two .docx files.

    The user selects two Word documents and presses the compare button.
    Every paragraph of the first document is then compared with every
    paragraph of the second one, and the findings are printed to standard
    output.
    """

    def __init__(self):
        super().__init__()

        # Paths of the selected documents; they stay None until the user
        # has picked a file in the corresponding dialog.
        self.file1 = None
        self.file2 = None

        self.initUI()

    def initUI(self):
        """Build the window: two file pickers and the compare button."""
        self.setWindowTitle('Word 文档比较器')
        self.setGeometry(100, 100, 400, 200)

        self.centralWidget = QWidget(self)
        self.setCentralWidget(self.centralWidget)

        self.layout = QVBoxLayout()

        self.file1_label = QLabel('选择文件1:')
        self.layout.addWidget(self.file1_label)

        self.file1_button = QPushButton('选择文件1')
        self.file1_button.clicked.connect(self.openFile1)
        self.layout.addWidget(self.file1_button)

        self.file2_label = QLabel('选择文件2:')
        self.layout.addWidget(self.file2_label)

        self.file2_button = QPushButton('选择文件2')
        self.file2_button.clicked.connect(self.openFile2)
        self.layout.addWidget(self.file2_button)

        self.compare_button = QPushButton('开始比较')
        self.compare_button.clicked.connect(self.compareFiles)
        self.layout.addWidget(self.compare_button)

        self.centralWidget.setLayout(self.layout)

    def openFile1(self):
        """Ask for the first document and remember its path if one is chosen."""
        options = QFileDialog.Options()
        file1, _ = QFileDialog.getOpenFileName(self, "选择文件1", "", "Word Files (*.docx)", options=options)
        if file1:
            self.file1_label.setText(f'选择文件1: {file1}')
            self.file1 = file1

    def openFile2(self):
        """Ask for the second document and remember its path if one is chosen."""
        options = QFileDialog.Options()
        file2, _ = QFileDialog.getOpenFileName(self, "选择文件2", "", "Word Files (*.docx)", options=options)
        if file2:
            self.file2_label.setText(f'选择文件2: {file2}')
            self.file2 = file2

    def compareFiles(self):
        """Compare each paragraph of file 1 with each paragraph of file 2.

        Progress and timing information is printed to standard output.
        Nothing is compared unless both files have been selected.
        """
        # getattr keeps this safe even on an instance whose __init__ was
        # bypassed and therefore never created the attributes.
        file1 = getattr(self, 'file1', None)
        file2 = getattr(self, 'file2', None)
        if not file1 or not file2:
            # Tell the user why the button did nothing instead of staying silent.
            print('请先选择两个要比较的文件')
            return

        doc1 = self.readDocx(file1)
        doc2 = self.readDocx(file2)

        print('开始比对...'.center(80, '*'))
        t1 = datetime.datetime.now()
        total = len(doc1)
        for i in range(total):
            # Report progress once every 100 paragraphs of the first file.
            if i % 100 == 0:
                print('处理进行中，已处理段落 {0:>4d} (总数 {1:0>4d} ） '.format(i, total))
            for j in range(len(doc2)):
                self.compareParagraph(doc1, i, doc2, j)
        t2 = datetime.datetime.now()
        print('\n比对完成，总用时: ', t2 - t1)

    def getText(self, wordname):
        """Return the text of every paragraph in the .docx file *wordname*."""
        d = Document(wordname)
        return [para.text for para in d.paragraphs]

    def msplit(self, s, separators=r',|\.|\?|，|。|？|！'):
        """Split *s* on the regular expression *separators*.

        The default pattern matches ASCII comma, period and question mark
        as well as the full-width comma, period, question mark and
        exclamation mark. It is a raw string so that the escaped ``.`` and
        ``?`` are not treated as (invalid) string escape sequences.
        """
        return re.split(separators, s)

    def readDocx(self, docfile):
        """Load *docfile* and return its paragraphs as lists of short segments.

        Each paragraph is split with ``msplit``; pieces of two characters
        or fewer are dropped and spaces are removed from the rest.
        Paragraphs left without any segment are omitted. Loading time and
        summary statistics are printed.
        """
        print('*' * 80)
        print('文件', docfile, '加载中……')
        t1 = datetime.datetime.now()
        paras = self.getText(docfile)
        segs = []
        for p in paras:
            # The length test is applied before the spaces are removed.
            temp = [s.replace(' ', "") for s in self.msplit(p) if len(s) > 2]
            if temp:
                segs.append(temp)
        t2 = datetime.datetime.now()
        print('加载完成，用时: ', t2 - t1)
        self.showInfo(segs, docfile)
        return segs

    def showInfo(self, doc, filename='filename'):
        """Print paragraph, segment and character counts for *doc*.

        *doc* is a list of paragraphs, each a list of segment strings.
        *filename* is accepted but not used in the output.
        """
        segs = sum(len(p) for p in doc)
        chars = sum(len(s) for p in doc for s in p)
        print('段落数: {0:>8d} 个。'.format(len(doc)))
        print('短句数: {0:>8d} 句。'.format(segs))
        print('字符数: {0:>8d} 个。'.format(chars))

    def compareParagraph(self, doc1, i, doc2, j, min_segment=5):
        """Find segments shared by paragraph *i* of *doc1* and *j* of *doc2*.

        Two segments match when one is a substring of the other; the
        shorter (contained) one is recorded. Segments shorter than
        *min_segment* characters are ignored, and paragraphs with fewer
        than 10 characters in total are skipped altogether.

        Returns the list of matched segments (empty when a paragraph was
        skipped). A report is printed when more than 10 characters match
        and they make up more than 10% of the shorter paragraph.
        """
        p1 = doc1[i]
        p2 = doc2[j]
        len1 = sum(len(s) for s in p1)
        len2 = sum(len(s) for s in p2)
        if len1 < 10 or len2 < 10:
            return []

        # Filter the second paragraph once instead of on every outer pass.
        candidates = [s for s in p2 if len(s) >= min_segment]

        lst = []
        for s1 in p1:
            if len(s1) < min_segment:
                continue
            for s2 in candidates:
                if s2 in s1:
                    lst.append(s2)
                elif s1 in s2:
                    lst.append(s1)

        # NOTE: a match is appended for every matching pair, so the same
        # text can be counted more than once and the ratio may exceed 100%.
        count = sum(len(s) for s in lst)
        # The divisor is at least 10 because of the early return above.
        ratio = count / min(len1, len2)
        if count > 10 and ratio > 0.1:
            print(' 发现相同内容 '.center(80, '*'))
            print('文件1第{0:0>4d}段内容：{1}'.format(i + 1, p1))
            print('文件2第{0:0>4d}段内容：{1}'.format(j + 1, p2))
            print('相同内容：', lst)
            print('相同字符比：{1:.2f}%\n相同字符数： {0}\n'.format(count, ratio * 100))
        return lst


def main():
    """Create the Qt application, show the comparer window and run the event loop.

    The process exits with the status returned by the event loop.
    """
    application = QApplication(sys.argv)
    window = WordComparerApp()
    window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)


# Launch the GUI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

使用截图

在这里插入图片描述

打包好的软件下载链接

文档查重器

结尾

如果觉得文章对你有用请点赞、关注 ->> 你的点赞对我太有用了
群内交流更多技术
130856474 <-- 在这里

文章来源:https://blog.csdn.net/Silver__Wolf/article/details/135813816
本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若内容造成侵权/违法违规/事实不符，请联系我的编程经验分享网邮箱：chenni525@qq.com进行投诉反馈，一经查实，立即删除！