今天爬取的是一本小说,使用 Selenium 逐章抓取标题和正文,并保存到本地文本文件。
代码如下:
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType
import random
import time
from selenium.webdriver.common.by import By
def check():
    """Crawl the novel chapter by chapter, following each page's next link.

    Starts a Chrome session that ignores certificate errors, opens the
    hard-coded first chapter URL, and for chapter numbers 267 through 444
    loads the current page and hands the driver to ``get_text``, which
    saves the chapter and returns the URL of the next one.

    The loop stops early when ``get_text`` returns no next URL, and the
    browser is always shut down, even if a page load or element lookup
    raises.
    """
    option = webdriver.ChromeOptions()
    option.add_argument('--ignore-certificate-errors')
    driver = webdriver.Chrome(options=option)
    url = "https://www.fd80.com/305/305890/2099286.html"
    try:
        for i in range(267, 445):
            print(f"正在爬取第{i}章")
            driver.get(url)
            # Fixed pause after navigation; presumably gives the page time
            # to finish rendering before the elements are looked up.
            time.sleep(1)
            url = get_text(driver)
            if not url:
                # No next-chapter link: nothing left to navigate to.
                break
        print("爬取完成")
    finally:
        # Release the browser and driver processes on success and on error.
        driver.quit()
def get_text(driver):
    """Save the chapter currently loaded in ``driver`` and return the next URL.

    Looks up the chapter body, the chapter title and the next-chapter link
    by XPath, appends the title and body text to ``无敌六皇子.txt`` (UTF-8),
    and returns the ``href`` attribute of the next-chapter link.
    """
    body_xpath = '//*[@id="novelcontent"]/div'
    title_xpath = '//*[@id="chaptertitle"]'
    next_xpath = '//*[@id="next_url"]'

    body_node = driver.find_element(By.XPATH, body_xpath)
    title_node = driver.find_element(By.XPATH, title_xpath)
    next_node = driver.find_element(By.XPATH, next_xpath)

    # Link to the following chapter.
    next_link = next_node.get_attribute('href')

    # Append this chapter to the output file: title line, then the body
    # followed by a blank line.
    with open('无敌六皇子.txt', 'a', encoding='utf-8') as out:
        out.write(f"{title_node.text}\n")
        out.write(f"{body_node.text}\n\n")

    return next_link
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    check()
接着写一个网页来展示爬取到的文本内容(此段代码由陈同学提供,不方便展示),效果如下:
最近新开了公众号,请大家关注一下。