Today I'm sharing a semi-automated crawler for WeChat Official Account articles. My skills are limited, so I can't fully automate the process: instead, I manually copy each article's permanent link from the official account and save it to an Excel file, and the crawler then reads those permanent links from the Excel file to collect the articles semi-automatically.
1. Manually obtaining the permanent links
Here we need to manually copy the articles' permanent links from the official account into an Excel file, in the following format:
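The original screenshot of the spreadsheet is not reproduced here. As a minimal sketch, assuming a single column whose header is url (the code below reads each row as a dict and takes url_dict['url']), the file looks roughly like this, with one permanent link per row (the link IDs below are placeholders):

url
https://mp.weixin.qq.com/s/xxxxxxxxxxxxxxxxxxxxxx
https://mp.weixin.qq.com/s/yyyyyyyyyyyyyyyyyyyyyy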
2. Crawling the articles
Next, the Python code reads the links from the Excel file and iterates over all of them to collect the data. Here we mainly collect the title, the publish time, and the article body.
The full code is below, for reference only:
__author__ = "dengxinyan"
import copy
import datetime
import html
import logging

from lxml import etree

from WeChat_spider.items import NewsspiderItem
from WeChat_spider.settings import *
from WeChat_spider.pipelines import *
from WeChat_spider.download import *
from WeChat_spider.Common3.Common3 import *
from WeChat_spider.Common3.general import *
from WeChat_spider.Excel.Excel2 import *

# General-purpose WeChat article collector
def common_wechat_spider():
    spider_name = 'common_wechat_spider'
    header = {
        "Host": "mp.weixin.qq.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Cookie": "_tc_unionid=063b4d4f-c82c-433c-908a-d3535349f6ab; rewardsn=; wxtokenkey=777",
        "Upgrade-Insecure-Requests": "1",
    }

    # Read the data-source Excel file (each row must provide a 'url' column)
    excel = Excel(r'E:\Python\CISDI\WeChat_spider\data\20240117-20240118微信公众号整理模版本.xlsx')
    excel.open()
    excel_url_list = excel.excelToDict()
    excel.close()

    # Build the task list from the permanent links
    url_list = []
    for url_dict in excel_url_list:
        temp_dict = {
            'url': url_dict['url'],
            'clazz': '微信公众号',
            'spider_name': spider_name,
            'header': header,
            # 'source': source,
        }
        url_list.append(temp_dict)

    for url_info in url_list:
        # Only one page per link is requested (range(1, 2) yields just page 1)
        for page in range(1, 2):
            temp_url_info = copy.deepcopy(url_info)
            temp_url_info['url'] = temp_url_info['url'].format(str(page))
            # Skip articles that have already been collected
            if not is_save(temp_url_info):
                get_content(temp_url_info)
# Fetch and parse the article detail page
def get_content(url_info):
    try:
        # Drive_GET is the project's browser-based downloader; wait_xpath is the
        # node it waits for before returning the rendered page source
        response_html = Drive_GET(url=url_info['url'], headers=url_info['header'], spider_name=url_info['spider_name'], wait_xpath='//div[@id="img-content" or @id="js_content"]', headless=True)
        # html_str = html_encode(response_html)
        html_str = WeChat_img_replace(response_html)
        html_str = supp_url(url_info['url'], html_str)
        html_tree = etree.HTML(html_str)

        item = NewsspiderItem()
        # Parse the article metadata
        try:
            item['title'] = clean_title(html_tree.xpath('//h1[@id="activity-name"]/text()')[0])
        except Exception:
            item['title'] = clean_title(html_tree.xpath('//title/text()')[0])
        item['clazz'] = url_info['clazz']
        try:
            item['source'] = html_tree.xpath('//a[@id="js_name"]/text()')[0].strip()
        except Exception:
            item['source'] = html_tree.xpath('//strong[@id="js_account_nickname"]/text()')[0].strip()
        item['spider_name'] = url_info['spider_name']
        item['title_href'] = url_info['url']
        item['time'] = format_date_time(html_tree.xpath('//*[@id="publish_time"]/text()')[0].strip())

        # Extract the article body
        content = etree.tostring(html_tree.xpath('//div[@id="img-content" or @id="js_content"]')[0], pretty_print=True, method='html').decode('utf-8')
        content = remove_html_tag(str(content).replace('src="//', 'src="https://').replace(r'\r', '').replace(r'\n', ''))
        item['content'] = html.unescape(content)
        item['html'] = html_str
        item['insert_time'] = str(datetime.datetime.now())

        # Save the item
        save_item(item)
    except Exception as e:
        print('Error:', url_info['spider_name'])
        logging.exception('Error: spider name: {}, message: {}'.format(url_info['spider_name'], str(e)))
if __name__ == "__main__":
    common_wechat_spider()
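The Excel class used above (with open() and excelToDict()) comes from the project's own WeChat_spider.Excel.Excel2 module and is not shown here. As a minimal sketch of that step using the third-party openpyxl library instead, and assuming the spreadsheet has a header row containing a url column, the links could be read like this; read_urls_from_excel is just an illustrative name, not part of the project:

import openpyxl

def read_urls_from_excel(path):
    # Open the workbook read-only and use the first (active) sheet
    wb = openpyxl.load_workbook(path, read_only=True)
    ws = wb.active
    rows = ws.iter_rows(values_only=True)
    headers = list(next(rows))           # header row, e.g. ('url',)
    url_index = headers.index('url')     # assumes a column literally named 'url'
    urls = [row[url_index] for row in rows if row and row[url_index]]
    wb.close()
    return urls

Each URL returned this way can then be wrapped into the same temp_dict structure that common_wechat_spider builds before calling get_content.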