Focused crawler: crawls only the specified content within a page, rather than the whole page. (Built on top of a general-purpose crawler.)
Data parsing approaches: regular expressions, BeautifulSoup (bs4), and xpath (lxml) — each is covered below.
How data parsing works in general: the values we want live either between tags or inside tag attributes, so parsing boils down to locating the target tags and then pulling out the text or attribute values they hold.
Commonly used regular expression pieces: single-character matches such as . and \d, quantifiers such as *, +, ? and {m,n}, character classes [], capture groups (), and the re.S flag that lets . match newlines.
Regular expression practice:
import re
# extract python
key = 'javapythonc++php'
print(re.findall('python', key))
# extract hello world
key = '<html><h1>hello world</h1></html>'
print(re.findall('hello world', key)[0])
print(re.findall('<h1>(.*)</h1>', key))
# extract 185
key = '我想要185大帅哥'
print(re.findall(r'\d+', key))
# extract http:// and https://
key = 'http://www.baidu.com and https://boob.com'
print(re.findall('https?://', key))
# extract hello
key = 'lalala<hTml>hello</HtMl>hahaha'
print(re.findall('<[Hh][Tt][Mm][Ll]>(.*)</[Hh][Tt][Mm][Ll]>', key))
# extract hit
key = 'bobo@hit.edu.com'
print(re.findall(r'h(.*?)\.', key))
# match sas and saas
key = 'saas and sas and saaas'
print(re.findall('s[a]{1,2}s', key))
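One flag worth calling out before the image-scraping example below: re.S (DOTALL) makes . also match newlines, which is what lets a pattern like <div class="thumb">.*?</div> span several lines of HTML. A minimal sketch with a made-up snippet:
import re
html = '<div class="thumb">\n  <img src="/pic/demo.jpg" alt="demo">\n</div>'
# without re.S the dot stops at the newline and nothing matches
print(re.findall('<div class="thumb">.*?src="(.*?)"', html))        # []
# with re.S the dot also matches '\n', so the multi-line block is captured
print(re.findall('<div class="thumb">.*?src="(.*?)"', html, re.S))  # ['/pic/demo.jpg']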
The idea is simply this: find the image addresses in the page source, extract them with a regular expression that matches their format, then send a second request for each image, and finally persist the images to disk.
import requests
import re
import os
if __name__ == '__main__':
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
    }
    # create a folder to store all the images
    if not os.path.exists('./qiutuLibs'):
        os.mkdir('./qiutuLibs')
    # generic URL template
    url = 'https://www.qiushibaike.com/pic/page%d/?s=5184961'
    # paginate
    for pageNum in range(1,36):
        # URL for the current page number
        new_url = url % pageNum
        # use a general-purpose crawl to fetch the whole page for this URL
        page_text = requests.get(url=new_url,headers=headers).text
        # parse out the image addresses with a regular expression
        ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        img_src_list = re.findall(ex,page_text,re.S)
        for src in img_src_list:
            # build the full image URL
            src = 'https:'+src
            # fetch the image's binary data
            img_data = requests.get(url=src,headers=headers).content
            # derive the image file name from the URL
            img_name = src.split('/')[-1]
            with open('./qiutuLibs/'+img_name,'wb') as fp:
                fp.write(img_data)
pip install bs4
pip install lxml
# Option 1: load a local HTML document into the BeautifulSoup object
fp = open('./皮影界面.html','r',encoding='utf-8')
soup = BeautifulSoup(fp,'lxml')
# Option 2: load page source fetched from the web into the BeautifulSoup object
response = requests.get(url='https://www.taobao.com',headers=headers).text
soup = BeautifulSoup(response,'lxml')
The code here is admittedly quite messy!
from bs4 import BeautifulSoup
import requests
#load the data from a local HTML document into the object
fp = open('./皮影界面.html','r',encoding='utf-8')
soup = BeautifulSoup(fp,'lxml')
# print(soup)
#load page source fetched from the internet into the object
# headers = {
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
#                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
# }
# response = requests.get(url = 'https://www.taobao.com',headers=headers).text
# soup = BeautifulSoup(response,'lxml')
# print(soup)
#soup's main attributes and methods
print(soup.title)#<title>百度图片-发现多彩世界</title>
print(soup.find_all('link'))
#[<link href="//img1.bdstatic.com/static/common/img/icon_cf1b905.png" rel="shortcut icon" type="image/x-icon"/>,
# <link href="//img1.bdstatic.com/static/common/img/icon_cf1b905.png" rel="icon" sizes="any"/>,
# <link href="//img1.bdstatic.com/static/common/pkg/co_14c8d6b.css" rel="stylesheet" type="text/css"/>,
# <link href="//img1.bdstatic.com/static/albumsdetail/pkg/albumsdetail_46fde24.css" rel="stylesheet" type="text/css"/>,
# <link href="//img0.bdstatic.com/static/common/widget/ui/slider/slider_ecce195.css" rel="stylesheet" type="text/css"/>,
# <link href="//img1.bdstatic.com/static/common/widget/ui/userInfo/userInfo_81fda3f.css" rel="stylesheet" type="text/css"/>]
print(soup.find('link',href='//img1.bdstatic.com/static/common/pkg/co_14c8d6b.css'))
#<link href="//img1.bdstatic.com/static/common/pkg/co_14c8d6b.css" rel="stylesheet" type="text/css"/>
print(soup.find_all('link',href='//img1.bdstatic.com/static/common/img/icon_cf1b905.png'))
#[<link href="//img1.bdstatic.com/static/common/img/icon_cf1b905.png" rel="shortcut icon" type="image/x-icon"/>,
# <link href="//img1.bdstatic.com/static/common/img/icon_cf1b905.png" rel="icon" sizes="any"/>]
print(soup.link['href'])
#//img1.bdstatic.com/static/common/img/icon_cf1b905.png
print(soup.title.text)
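The 三国演义 case below also relies on select() with a CSS selector, plus .string and attribute access on the resulting tags. A minimal sketch against a made-up piece of markup:
from bs4 import BeautifulSoup
html = '''
<div class="book-mulu"><ul>
  <li><a href="/book/1.html">Chapter 1</a></li>
  <li><a href="/book/2.html">Chapter 2</a></li>
</ul></div>
'''
soup = BeautifulSoup(html, 'lxml')
# select() takes a CSS selector and returns a list of matching tags
for li in soup.select('.book-mulu > ul > li'):
    # .string gives the tag's direct text; ['href'] reads an attribute
    print(li.a.string, li.a['href'])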
Case study: https://www.shicimingju.com/book/sanguoyanyi.html — crawl every chapter title of 三国演义 together with its content.
import requests
from bs4 import BeautifulSoup
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                 +'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
url = 'https://shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(url=url,headers=headers).text
#instantiate a BeautifulSoup object
soup = BeautifulSoup(page_text,'lxml')
#parse out the chapter titles and the detail-page URLs
li_list = soup.select('.book-mulu>ul>li')
fp = open('./sanguo.txt','w',encoding='utf-8')
for li in li_list:
    title = li.a.string
    detail_url = 'https://www.shicimingju.com'+li.a['href']
    #request the detail page and parse out the chapter content
    detail_text = requests.get(url=detail_url,headers=headers).text
    detail_soup = BeautifulSoup(detail_text,'lxml')
    content = detail_soup.find('div',class_='chapter_content').text
    fp.write(title+':'+content)
    print(title+' scraped successfully')
fp.close()
How xpath parsing works: instantiate an etree object, load the HTML source into it, then locate tags and extract data by calling xpath() with XPath expressions.
pip install lxml
etree.parse(filePath)       # load a local HTML document
etree.HTML(page_text)       # load page source fetched from the web
xpath('XPath expression')   # called on the etree object to locate tags / extract data
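A minimal sketch of the etree.HTML() entry point (the markup string is made up for illustration); etree.parse() works the same way except it reads a local file:
from lxml import etree
page_text = '<html><body><div id="main"><a href="/a.html">first</a><a href="/b.html">second</a></div></body></html>'
# etree.HTML() builds the tree from a source string (e.g. requests(...).text)
tree = etree.HTML(page_text)
# // searches at any depth, [@attr="value"] filters by attribute,
# /text() extracts text nodes and /@href extracts attribute values
print(tree.xpath('//div[@id="main"]/a/text()'))    # ['first', 'second']
print(tree.xpath('//div[@id="main"]/a[1]/@href'))  # ['/a.html']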
A few more examples, this time against locally saved pages:
from lxml import etree
tree = etree.parse('三国演义.html',etree.HTMLParser())
# text of every title tag anywhere under html
r = tree.xpath('/html//title/text()')
# href of the a inside the first div under div#top_right
r = tree.xpath('//div[@id="top_right"]/div[1]/a/@href')
# src of every img inside the card list under div#main
r = tree.xpath('//div[@id="main"]//div[@class="card bookmark-list"]//img/@src')
print(r)
from lxml import etree
import requests
tree = etree.parse('北京二手房.html',etree.HTMLParser(encoding='utf-8'))  # set the encoding so Chinese text is not garbled
house_titles = tree.xpath('//h3/text()')
print(house_titles)
with open('北京二手房名字.txt','w',encoding='utf-8') as fp:
    for house_title in house_titles:
        fp.write(house_title+'\n')
print('finished')
Fixing garbled Chinese text in crawler responses
import requests
response = requests.get(url,headers=headers)
#manually set the encoding of the response data
response.encoding = 'utf-8'
#a general fix for garbled Chinese (applied only to the piece of data you need)
img_name = img_name.encode('iso-8859-1').decode('gbk')
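As an alternative to hard-coding 'utf-8' (not from the original notes, just a convenience worth knowing): requests can guess the charset from the response body via apparent_encoding, which handles GBK pages as well:
import requests
headers = {'User-Agent': 'Mozilla/5.0'}  # minimal UA for illustration
response = requests.get('https://example.com', headers=headers)
# apparent_encoding is detected from the body; set it before reading .text
response.encoding = response.apparent_encoding
page_text = response.text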
import requests
from lxml import etree
import os
# url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0'
}
# response = requests.get(url=url,headers=headers)
# # change the encoding of the response data
# response.encoding = 'utf-8'
# page_text = response.text
# with open('图片.html','w',encoding='utf-8') as fp:
#     fp.write(page_text)
# print('finished')
# tree = etree.HTML('page_text')
# li_list = tree.xpath('//div[@class="slist"]/ul/li')
# print(tree)
# print(li_list)
# Still couldn't extract the data by parsing the fetched response directly — presumably because what came back wasn't the real page source. (Note: etree.HTML above was passed the literal string 'page_text' rather than the variable, which would also explain the empty result; see the sketch after this script.)
tree = etree.parse('图片.html',etree.HTMLParser(encoding='utf-8'))
li_list = tree.xpath('//div[@class="slist"]/ul/li')
if not os.path.exists('./tupianLib'):
    os.mkdir('./tupianLib')
for li in li_list:
    img_src = 'https://pic.netbian.com'+li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0]+'.jpg'
    #fix the garbled Chinese in the image name
    img_name = img_name.encode('iso-8859-1').decode('gbk')
    #request the image and persist it to disk
    img_data = requests.get(url=img_src,headers=headers).content
    with open('./tupianLib/'+img_name,'wb') as fp:
        fp.write(img_data)
print(li_list)
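For reference, a sketch of how the direct-fetch variant could look (my assumption, not from the original notes: the site appears to declare GBK, and etree.HTML() must receive the page_text variable rather than the quoted string 'page_text'):
import requests
from lxml import etree
url = 'http://pic.netbian.com/4kmeinv/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
response = requests.get(url=url, headers=headers)
# the page declares GBK, so set the encoding before reading .text
response.encoding = 'gbk'
page_text = response.text
# pass the variable, not a quoted string, to etree.HTML()
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="slist"]/ul/li')
print(len(li_list))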