其实在当今社会,网络上充斥着大量有用的数据,我们只需要耐心地观察,再加上一些技术手段,就可以获取到大量有价值的数据。这里的“技术手段”就是网络爬虫。今天就给大家分享一篇爬虫基础知识和入门教程:
import requests
from bs4 import BeautifulSoup
import csv
import matplotlib.pyplot as plt
def getHTMLText(url):
    """Fetch *url* and return its decoded HTML text, or None on failure.

    A 30-second timeout guards against hanging connections, and the
    response encoding is switched to the content-sniffed one so Chinese
    pages decode correctly.

    :param url: page URL to download
    :return: HTML text on success, ``None`` on any request failure
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn HTTP 4xx/5xx into an exception
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being silently swallowed.
        print("爬取失败")
        return None
def fillUnivList(ulist, html):
    """Parse the ranking page *html* and append one
    ``[rank, name, province, type, score]`` row per university to *ulist*
    (mutated in place).

    Prints a diagnostic and returns early when the expected table
    structure is missing.

    :param ulist: list to receive the parsed rows
    :param html: HTML text of the ranking page
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_='rk-table')
    if table is None:
        print("未找到排名表格")
        return
    tbody = table.find('tbody')
    if tbody is None:
        print("未找到<tbody>标签")
        return
    for tr in tbody.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 5:  # skip incomplete rows
            continue
        # get_text(strip=True) never returns None, unlike .string, which is
        # None whenever a cell contains child elements — the original code
        # could raise AttributeError on .strip() there.
        name_tag = tds[1].find('a')
        name = (name_tag or tds[1]).get_text(strip=True)
        ulist.append([
            tds[0].get_text(strip=True),
            name,
            tds[2].get_text(strip=True),
            tds[3].get_text(strip=True),
            tds[4].get_text(strip=True),
        ])
def printUnivList(ulist, num, file_name="../第六周/大学排行.csv"):
    """Write the first *num* rows of *ulist* to *file_name* as CSV and echo
    each row to stdout.

    :param ulist: parsed rows, each ``[rank, name, province, type, score]``
    :param num: maximum number of rows to export
    :param file_name: CSV output path (defaults to the original hard-coded
        location, so existing callers are unaffected)
    """
    # Clamp so we never index past the end when fewer than *num* rows
    # were scraped — the original `range(num)` raised IndexError then.
    count = min(num, len(ulist))
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "大学名称", "省市", "类型", "总分"])
        for u in ulist[:count]:
            writer.writerow(u)
            print(f"排名:{u[0]}\t大学名称:{u[1]}\t省市:{u[2]}\t类型:{u[3]}\t总分:{u[4]}")
def main():
    """Entry point: scrape the 2023 university ranking page and export the
    top 30 rows to CSV (via printUnivList)."""
    ulist = []
    url = 'https://www.shanghairanking.cn/rankings/bcur/202311.html'
    html = getHTMLText(url)
    if html is not None:
        fillUnivList(ulist, html)
        printUnivList(ulist, 30)


if __name__ == "__main__":
    # Guard so importing this module does not trigger network I/O;
    # the original called main() unconditionally at import time.
    main()
# NOTE(review): this is an exact duplicate of the printUnivList defined
# earlier in the file; it rebinds the name after main() has already run,
# so it has no effect on the script — consider deleting one copy.
def printUnivList(ulist, num):
    """Export the first *num* entries of *ulist* to a fixed CSV path and
    print each exported row."""
    out_path = "../第六周/大学排行.csv"
    with open(out_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["排名", "大学名称", "省市", "类型", "总分"])
        for idx in range(num):
            row = ulist[idx]
            csv_writer.writerow(row)
            print(f"排名:{row[0]}\t大学名称:{row[1]}\t省市:{row[2]}\t类型:{row[3]}\t总分:{row[4]}")
在进行爬虫编码的过程中要格外注意网址的可用性:有的网址指向的可能是早已下线的旧网站,根本访问不到,希望大家注意。
这是我们 Python 学习的实践作业,希望可以帮助到大家。