The code for collecting the article links is as follows:
def parse_links():
    """Fetch each listing page and collect (title, absolute_url) pairs."""
    link_list = []
    links_xpath_1 = '//div[@class="box_left"]//li//a/@href'
    titles_xpath_1 = '//div[@class="box_left"]//li//a/@title'
    for url in URL_LIST:
        try:
            res = requests.get(url, timeout=20)
            sel = etree.HTML(res.text)
            # Keep only the first 10 entries on each listing page
            links = sel.xpath(links_xpath_1)[:10]
            titles = sel.xpath(titles_xpath_1)[:10]
            assert len(links) == len(titles)
            if links and titles:
                for l, t in zip(links, titles):
                    # Resolve relative hrefs against the page URL
                    join_l = urljoin(url, l)
                    link_list.append((t, join_l))
        except Exception as e:
            logger.error("get links error: %s", e, exc_info=True)
    return link_list
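To see what the two XPaths pull out, here is a self-contained sketch run against a hand-written HTML fragment. The fragment is hypothetical and only mirrors the box_left/li/a structure these XPaths expect; the real page markup may differ.

from lxml import etree
from urllib.parse import urljoin

# Hypothetical fragment shaped like the listing markup the XPaths target
html = '''
<div class="box_left">
  <ul>
    <li><a href="/stock/a1.html" title="News one">News one</a></li>
    <li><a href="/stock/a2.html" title="News two">News two</a></li>
  </ul>
</div>
'''
sel = etree.HTML(html)
links = sel.xpath('//div[@class="box_left"]//li//a/@href')
titles = sel.xpath('//div[@class="box_left"]//li//a/@title')
for l, t in zip(links, titles):
    # urljoin turns each relative href into an absolute URL
    print(t, urljoin('http://stock.stcn.com/', l))
# News one http://stock.stcn.com/stock/a1.html
# News two http://stock.stcn.com/stock/a2.html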
The code for parsing the article details is as follows:
def parse_details(link_list):
    """Visit each article and extract (title, url, publ_date, author, content)."""
    publ_date_xpath = '//div[@class="info"]/text()'
    author_xpath = '//div[@class="info"]/span[1]/text()'
    content_xpath = '//div[@class="txt_con"]/p//text()'
    res_list = []
    count = 0  # number of links that failed to parse

    def convert(s):
        return s.strip()

    for t, l in link_list:
        try:
            res = requests.get(l, timeout=20)
            sel = etree.HTML(res.text)
            publ_date = sel.xpath(publ_date_xpath)
            if not publ_date:
                continue
            publ_date = datetime.strptime(publ_date[0].strip(), '%Y-%m-%d %H:%M')
            # Skip articles published more than two days ago
            if publ_date < datetime.now() - timedelta(days=2):
                continue
            author = sel.xpath(author_xpath)
            # The span text is a "label:name" pair; keep the part after the colon
            author = author[0].split(':')[1].strip()
            content = sel.xpath(content_xpath)
            content = ' '.join(map(convert, content))
        except Exception as e:
            logger.error("failed to parse detail: %s", e, exc_info=True)
            count += 1
        else:
            res_list.append((t, l, publ_date, author, content))
            print(t, l, publ_date, author, content)
    # Report failure only when every link failed to parse
    if count == len(link_list) and len(link_list) > 0:
        return [], False
    return res_list, True
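The freshness check in parse_details is just strptime plus a timedelta comparison. A standalone sketch of that logic, with the format string and two-day window taken from the code above (the is_fresh helper name is purely illustrative):

from datetime import datetime, timedelta

def is_fresh(date_str, days=2, fmt='%Y-%m-%d %H:%M'):
    # True when the parsed timestamp falls within the last `days` days
    return datetime.strptime(date_str, fmt) >= datetime.now() - timedelta(days=days)

print(is_fresh('2020-01-01 08:30'))  # False: far outside the two-day window
print(is_fresh(datetime.now().strftime('%Y-%m-%d %H:%M')))  # True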
The complete code is shared below:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, unicode_literals
import logging
from datetime import datetime, timedelta

import requests
from lxml import etree

# Compatible with both Python 2 and Python 3
try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

logger = logging.getLogger(__name__)

URL_LIST = [
    'http://stock.stcn.com/'
]


def parse_links():
    """Fetch each listing page and collect (title, absolute_url) pairs."""
    link_list = []
    links_xpath_1 = '//div[@class="box_left"]//li//a/@href'
    titles_xpath_1 = '//div[@class="box_left"]//li//a/@title'
    for url in URL_LIST:
        try:
            res = requests.get(url, timeout=20)
            sel = etree.HTML(res.text)
            # Keep only the first 10 entries on each listing page
            links = sel.xpath(links_xpath_1)[:10]
            titles = sel.xpath(titles_xpath_1)[:10]
            assert len(links) == len(titles)
            if links and titles:
                for l, t in zip(links, titles):
                    # Resolve relative hrefs against the page URL
                    join_l = urljoin(url, l)
                    link_list.append((t, join_l))
        except Exception as e:
            logger.error("get links error: %s", e, exc_info=True)
    return link_list


def parse_details(link_list):
    """Visit each article and extract (title, url, publ_date, author, content)."""
    publ_date_xpath = '//div[@class="info"]/text()'
    author_xpath = '//div[@class="info"]/span[1]/text()'
    content_xpath = '//div[@class="txt_con"]/p//text()'
    res_list = []
    count = 0  # number of links that failed to parse

    def convert(s):
        return s.strip()

    for t, l in link_list:
        try:
            res = requests.get(l, timeout=20)
            sel = etree.HTML(res.text)
            publ_date = sel.xpath(publ_date_xpath)
            if not publ_date:
                continue
            publ_date = datetime.strptime(publ_date[0].strip(), '%Y-%m-%d %H:%M')
            # Skip articles published more than two days ago
            if publ_date < datetime.now() - timedelta(days=2):
                continue
            author = sel.xpath(author_xpath)
            # The span text is a "label:name" pair; keep the part after the colon
            author = author[0].split(':')[1].strip()
            content = sel.xpath(content_xpath)
            content = ' '.join(map(convert, content))
        except Exception as e:
            logger.error("failed to parse detail: %s", e, exc_info=True)
            count += 1
        else:
            res_list.append((t, l, publ_date, author, content))
            print(t, l, publ_date, author, content)
    # Report failure only when every link failed to parse
    if count == len(link_list) and len(link_list) > 0:
        return [], False
    return res_list, True


def process():
    link_list = parse_links()
    res_list, ret = parse_details(link_list)
    if res_list:
        logger.info('get %d news', len(res_list))
    if ret:
        logger.info('successfully saved news')


if __name__ == "__main__":
    process()
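One caveat: the script never configures the logging module, so the logger.info calls in process() produce no output by default (Python's last-resort handler only emits WARNING and above). A minimal tweak, assuming console output is wanted, is to call logging.basicConfig in the __main__ block before process():

if __name__ == "__main__":
    # Send INFO-level records and above to stderr so the log lines are visible
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(name)s: %(message)s',
    )
    process()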