import requests
from bs4 import BeautifulSoup

url = 'https://www.hnu.edu.cn/xysh/xshd.htm'
r = requests.get(url)
r.encoding = 'utf-8'
jzxx = []
# code start
# code end
f1 = open("jzxx2.txt", "w")
for xx in jzxx:
    f1.write(",".join(xx) + "\n")
f1.close()
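The graded answer between the code-start/code-end markers is not shown above; a minimal sketch of what could fill jzxx, assuming the lecture entries sit in li/a tags (the selector here is a guess, not the page's verified structure):

# Hypothetical fill-in for the marked section; the selector is an assumption.
soup = BeautifulSoup(r.text, "html.parser")
for li in soup.select("div ul li"):
    a = li.find("a")
    if a is not None:
        jzxx.append([a.get_text(strip=True)])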
Level 6: Scraping information from multiple div tags across multiple pages
# Copyright: Chen Juan, College of Information Science and Engineering, Hunan University
import requests
from bs4 import BeautifulSoup

f1 = open("jz.txt", "w", encoding="utf8")
# code start
# code end
f1.close()
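Again the marked section is left blank; a sketch of what might sit between the markers (before f1.close()), assuming paginated URLs and a list div (both the URL pattern and the class name are placeholders, not the site's real structure):

# Hypothetical multi-page loop; URL pattern and class name are assumptions.
for page in range(1, 4):
    r = requests.get(f"https://www.hnu.edu.cn/xysh/xshd/{page}.htm")  # placeholder pattern
    r.encoding = "utf-8"
    soup = BeautifulSoup(r.text, "html.parser")
    for div in soup.find_all("div", class_="list"):  # placeholder class name
        f1.write(div.get_text(strip=True) + "\n")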
Scrapy Crawler Basics
Level 1: Scrapy installation and project creation
scrapy genspider Hello www.educoder.net
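For context, this genspider call normally follows installation and project creation; the standard Scrapy CLI sequence would be (the project name HelloProject is a placeholder, not from the exercise):

pip install scrapy
scrapy startproject HelloProject
cd HelloProject
scrapy genspider Hello www.educoder.net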
Level 2: Scrapy core principles
# -*- coding: utf-8 -*-
import scrapy

class WorldSpider(scrapy.Spider):
    name = 'world'
    allowed_domains = ['www.baidu.com']
    start_urls = ['http://www.baidu.com/']

    def parse(self, response):
        # ********** Begin **********
        # Persist the fetched page source to a local file
        baidu = response.url.split(".")[1] + '.html'  # e.g. 'baidu.html'
        with open(baidu, 'wb') as f:
            f.write(response.body)
        # ********** End **********
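To run this spider, Scrapy's standard crawl command is used from inside the project directory, referring to the spider by its name attribute:

scrapy crawl world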
Web Page Data Parsing
Level 1: Parsing web pages with XPath
import urllib.request
from lxml import etree

def get_data(url):
    '''
    :param url: the request URL
    :return: None
    '''
    response = urllib.request.urlopen(url=url)
    html = response.read().decode("utf-8")
    # *************** Begin *************** #
    parse = etree.HTML(html)
    item_list = parse.xpath("//div[@class='left']/ul/li/span/a/text()")
    # *************** End ***************** #
    print(item_list)
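The XPath expression can be verified offline against a made-up fragment that mirrors its structure (the snippet below is invented for illustration):

from lxml import etree

snippet = "<div class='left'><ul><li><span><a>静夜思</a></span></li></ul></div>"
tree = etree.HTML(snippet)
print(tree.xpath("//div[@class='left']/ul/li/span/a/text()"))  # ['静夜思']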
Level 2: Parsing web pages with BeautifulSoup
import requests
from bs4 import BeautifulSoup

def get_data(url, headers):
    '''
    Two parameters
    :param url: uniform resource locator, the request URL
    :param headers: the request headers
    :return data: all the poem texts, as a list
    '''
    # ***************** Begin ******************** #
    obj = requests.get(url, headers=headers)  # pass the headers through
    soup = BeautifulSoup(obj.content, "lxml", from_encoding="utf-8")
    # data = soup.find("div", class_="left").find('p')
    data = soup.find("div", class_='left').ul.find_all("li")
    data = [i.p.text for i in data]
    # ****************** End ********************* #
    return data
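The same selection logic can be checked offline; the fragment below is made up to mirror what the function expects (html.parser is used so no extra dependency is needed):

from bs4 import BeautifulSoup

snippet = "<div class='left'><ul><li><p>Poem A</p></li><li><p>Poem B</p></li></ul></div>"
soup = BeautifulSoup(snippet, "html.parser")
print([li.p.text for li in soup.find("div", class_="left").ul.find_all("li")])
# ['Poem A', 'Poem B']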
requests Crawler
Level 1: requests basics
import requests

def get_html(url):
    '''
    One parameter
    :param url: uniform resource locator, the request URL
    :return: html
    '''
    # ***************** Begin ******************** #
    # Fill in the request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'}
    # GET the page
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    # Get the page text
    html = res.text
    # ***************** End ******************** #
    return html
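A quick sanity check of the function (the URL is only an example; any reachable page works):

page = get_html("https://www.educoder.net")
print(page[:200])  # first 200 characters of the source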
Level 2: requests advanced
import requests

def get_html(url):
    '''
    One parameter
    :param url: uniform resource locator, the request URL
    :return html: the page source
    :return sess: the created session
    '''
    # ***************** Begin ******************** #
    # Fill in the request headers
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"}
    # Create a Session; POST the form data, then GET the page through the same session
    sess = requests.session()
    data = {"name": "hblgysl", "password": "hblgzsx"}
    res = sess.post(url, headers=headers, data=data)
    res1 = sess.get(url)
    html = res1.text
    # ****************** End ********************* #
    return html, sess
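The point of returning sess is cookie persistence: cookies set by the POST (for example a login cookie) ride along on every later request made through the same Session. A usage sketch with placeholder URLs:

html, sess = get_html("https://www.educoder.net/login")   # placeholder login URL
profile = sess.get("https://www.educoder.net/profile")    # same session, same cookies
print(profile.status_code)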