Python Scraper Case Study: Semi-Automatic Collection of WeChat Official Account Articles

Published: January 18, 2024

Today I am sharing a semi-automatic scraper for WeChat Official Account articles. Since my skills are limited and I could not fully automate the process, I manually copy each article's permanent link from the Official Account platform and save it to an Excel file; the script then reads those permanent links from Excel and collects the articles, making the workflow semi-automatic.

1. Manually Obtaining the Permanent Links

Here we need to manually copy each article's permanent link into an Excel sheet, in the following format:
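A hypothetical layout (the column name url is an assumption, chosen to match the url_dict['url'] lookup in the code below) is a single header row followed by one permanent link per row:

url
https://mp.weixin.qq.com/s/xxxxxxxxxxxx
https://mp.weixin.qq.com/s/yyyyyyyyyyyy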

2. Scraping the Articles

The Python code then reads the links from the Excel file and loops over them, collecting the title, publish time, and article body for each one.
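Before the full script, here is a minimal standalone sketch of the same idea that does not rely on the WeChat_spider project modules. The file name links.xlsx, the 'url' column name, and the use of plain requests are assumptions; note that some fields on WeChat pages (for example the publish time) are injected by JavaScript, which the full script handles by rendering the page with a headless browser (Drive_GET with headless=True).

# Minimal standalone sketch (assumes links.xlsx has a header row with a 'url' column;
# the XPath expressions mirror the ones used in the full script below).
import openpyxl
import requests
from lxml import etree

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
}

def read_links(path):
    """Read permanent links from the first worksheet; expects a 'url' column header."""
    wb = openpyxl.load_workbook(path, read_only=True)
    rows = wb.active.iter_rows(values_only=True)
    header = next(rows)
    url_col = header.index('url')
    return [row[url_col] for row in rows if row[url_col]]

def fetch_article(url):
    """Fetch one article page and return its title and plain-text body."""
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    tree = etree.HTML(resp.text)
    title = (tree.xpath('//h1[@id="activity-name"]/text()') or tree.xpath('//title/text()'))[0].strip()
    texts = tree.xpath('//div[@id="js_content"]//text()')
    body = '\n'.join(t.strip() for t in texts if t.strip())
    return {'url': url, 'title': title, 'content': body}

if __name__ == '__main__':
    for link in read_links('links.xlsx'):
        article = fetch_article(link)
        print(article['title'], len(article['content']))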

The full code is below, for reference only:

__author__ = "dengxinyan"

import copy
import datetime
import html
import logging

from lxml import etree

# Project-specific helpers (Excel, Drive_GET, is_save, save_item, clean_title,
# WeChat_img_replace, supp_url, remove_html_tag, format_date_time, ...) come from
# the author's WeChat_spider package via the wildcard imports below.
from WeChat_spider.items import NewsspiderItem
from WeChat_spider.settings import *
from WeChat_spider.pipelines import *
from WeChat_spider.download import *
from WeChat_spider.Common3.Common3 import *
from WeChat_spider.Common3.general import *
from WeChat_spider.Excel.Excel2 import *
# Generic WeChat article scraper: reads permanent links from Excel and collects each article
def common_wechat_spider():
    spider_name = 'common_wechat_spider'
    # Request headers copied from a browser session (including the Cookie string)
    header = {
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Cookie": "_tc_unionid=063b4d4f-c82c-433c-908a-d3535349f6ab; rewardsn=; wxtokenkey=777",
        "Upgrade-Insecure-Requests": "1",
    }

    # Read the source Excel file of permanent links
    excel = Excel(r'E:\Python\CISDI\WeChat_spider\data\20240117-20240118微信公众号整理模版本.xlsx')
    excel.open()
    excel_url_list = excel.excelToDict()

    excel.close()

    url_list = []
    for url_dict in excel_url_list:
        temp_dict = {
            'url': url_dict['url'],
            'clazz': '微信公众号',  # category label: WeChat Official Account
            'spider_name': spider_name,
            'header': header,
            # 'source': source,
        }
        url_list.append(temp_dict)

    for url_info in url_list:
        for page in range(1, 2):  # permanent links are single pages, so only page 1 is requested
            temp_url_info = copy.deepcopy(url_info)
            # .format() fills a {} page placeholder if present; a plain permanent link is left unchanged
            temp_url_info['url'] = temp_url_info['url'].format(str(page))

            # Skip articles that have already been collected
            if not is_save(temp_url_info):
                get_content(temp_url_info)

# Fetch and parse the detail page of one article
def get_content(url_info):
    try:
        # Drive_GET renders the page with a headless browser and waits for the body container to load
        response_html = Drive_GET(url=url_info['url'], headers=url_info['header'], spider_name=url_info['spider_name'], wait_xpath='//div[@id="img-content" or @id="js_content"]', headless=True)
        # html_str = html_encode(response_html)
        html_str = WeChat_img_replace(response_html)    # normalize WeChat image URLs in the HTML
        html_str = supp_url(url_info['url'], html_str)  # complete relative URLs against the article URL
        html_tree = etree.HTML(html_str)
        item = NewsspiderItem()

        # Parse the metadata: title, category, source account, publish time
        try:
            item['title'] = clean_title(html_tree.xpath('//h1[@id="activity-name"]/text()')[0])
        except Exception:
            # Fall back to the <title> tag when the h1 heading is missing
            item['title'] = clean_title(html_tree.xpath('//title/text()')[0])
        item['clazz'] = url_info['clazz']
        try:
            item['source'] = html_tree.xpath('//a[@id="js_name"]/text()')[0].strip()
        except Exception:
            # Fall back to the alternative account-name node used on some pages
            item['source'] = html_tree.xpath('//strong[@id="js_account_nickname"]/text()')[0].strip()
        item['spider_name'] = url_info['spider_name']
        item['title_href'] = url_info['url']
        item['time'] = format_date_time(html_tree.xpath('//*[@id="publish_time"]/text()')[0].strip())

        # Extract the article body (either the #img-content or #js_content container)
        content = etree.tostring(html_tree.xpath('//div[@id="img-content" or @id="js_content"]')[0], pretty_print=True, method='html').decode('utf-8')
        content = remove_html_tag(str(content).replace('src="//', 'src="https://').replace(r'\r', '').replace(r'\n', ''))
        item['content'] = html.unescape(content)
        item['html'] = html_str
        item['insert_time'] = str(datetime.datetime.now())

        # Save the parsed item through the project's pipeline
        save_item(item)

    except Exception as e:
        print('Error in spider:', url_info['spider_name'])
        logging.exception('Error - spider name: {}, error message: {}'.format(url_info['spider_name'], str(e)))

if __name__ == "__main__":
    common_wechat_spider()
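
save_item comes from the author's WeChat_spider.pipelines module, which is not shown in the post. As a hypothetical stand-in for readers who want to run something end to end, a minimal pipeline could simply append each parsed item to a CSV file (the field list mirrors the item fields filled in above):

# Hypothetical stand-in for the author's save_item pipeline: append each item to a CSV file.
import csv
import os

def save_item_csv(item, path='wechat_articles.csv'):
    """Append one article record to a CSV file, writing the header row on first use."""
    fields = ['title', 'clazz', 'source', 'spider_name', 'title_href', 'time', 'content', 'insert_time']
    write_header = not os.path.exists(path)
    with open(path, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
        if write_header:
            writer.writeheader()
        writer.writerow({k: item.get(k, '') for k in fields})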

Source: https://blog.csdn.net/Dxy1239310216/article/details/135667276