Selenium + bs4 爬虫案例:爬取 TapTap 游戏帖子

发布时间:2024年01月24日
import os
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


class CrawlTapTap:
    """Crawl TapTap forum posts with Selenium and parse them with BeautifulSoup.

    For every post the list page is reloaded, the post is opened by clicking
    its header, and the resulting page is parsed. Text fields are appended to
    ``taptap.txt`` and images whose URL contains "moment" are downloaded, both
    inside ``save_path``.

    Args:
        url: URL of the post list page.
        save_path: Directory that receives ``taptap.txt`` and the images. It
            is created on demand; a trailing slash is optional.
        end_point: Number of posts to crawl; posts with a 0-based index below
            this value are visited.
    """

    # Posts handled per batch; one extra scroll is performed for each batch
    # already completed.
    # NOTE(review): 10 is the original batch size and presumably matches how
    # many posts the site loads per scroll -- confirm against the live page.
    __PAGE_SIZE = 10
    # Name of the text file that collects the post data.
    __TXT_NAME = "taptap.txt"
    # Seconds to wait for an image download before giving up.
    __REQUEST_TIMEOUT = 10
    # Characters that are not allowed in file names on common file systems.
    __BAD_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self, url, save_path, end_point):
        self.url = url
        self.save_path = save_path
        self.end_point = end_point
        self.__driver = self.__create_driver()
        # Half-open index range [start, end) of the batch crawled next.
        self.__start = 0
        self.__end = self.__PAGE_SIZE
        # How many times the list page is scrolled to the bottom before a
        # post of the current batch is looked up.
        self.__move_times = 0

    def __create_driver(self):
        """Start a Chrome driver with the "detach" experimental option set."""
        option = webdriver.ChromeOptions()
        # "detach" leaves the browser window open when the script finishes;
        # for that reason the driver is never quit by this class.
        option.add_experimental_option("detach", True)
        return webdriver.Chrome(options=option)

    @staticmethod
    def __safe_name(name):
        """Return ``name`` with path separators, reserved characters and
        control characters replaced by ``_`` so it can be used in a file name.
        """
        bad = CrawlTapTap.__BAD_FILENAME_CHARS
        return "".join("_" if ch in bad or ch < " " else ch for ch in name)

    def __ensure_dir(self):
        """Create ``save_path`` (and missing parents) if it does not exist."""
        # An empty save_path means the current directory; makedirs("") fails.
        if self.save_path:
            os.makedirs(self.save_path, exist_ok=True)

    def __txt_path(self):
        """Return the path of the text file inside ``save_path``."""
        return os.path.join(self.save_path, self.__TXT_NAME)

    def __create_file(self):
        """Make sure the save directory and the text file exist.

        The file is opened in append mode so data written by an earlier run
        is kept.
        """
        self.__ensure_dir()
        with open(self.__txt_path(), "a", encoding="utf-8"):
            pass

    def __save_data(self, data):
        """Download the post's images and append its text to the text file.

        Args:
            data: Mapping with the keys ``blogger``, ``title``,
                ``create_time``, ``watched``, ``tag``, ``content`` and
                ``block_imgs`` (a sequence of objects offering ``.get("src")``).
        """
        self.__ensure_dir()
        stem = self.__safe_name(data["blogger"])
        for j, img in enumerate(data["block_imgs"], start=1):
            img_url = img.get("src")
            # Skip images without a src and images that are not post pictures.
            if not img_url or "moment" not in img_url:
                continue
            try:
                resp = requests.get(img_url, timeout=self.__REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # One unreachable image must not abort the whole crawl.
                print("图片获取失败,异常:", exc)
                continue
            print(f"获取第{j}张图片")
            if resp.status_code == 200:
                img_path = os.path.join(self.save_path, f"{stem}_{j}.jpg")
                with open(img_path, 'wb') as f:
                    f.write(resp.content)
            else:
                print("图片获取失败,状态码:", resp.status_code)
        with open(self.__txt_path(), "a", encoding="utf-8") as f:
            f.write(data["blogger"] + "\n")
            f.write(data["title"] + "\n")
            f.write(str((data["create_time"], data["watched"], data["tag"])) + "\n")
            f.write(data["content"] + "\n")
            f.write("**" * 22 + "\n")

    def __get_into_page(self, url):
        """Crawl the next batch of posts from the list page at ``url``.

        Returns:
            True when another batch should be crawled, False when the end
            point was reached or the list page offers no further posts.
        """
        # Never run past the requested number of posts.
        stop = min(self.__end, self.end_point)
        for i in range(self.__start, stop):
            # Opening a post leaves the list page, so reload it every time.
            self.__driver.get(url)
            for _ in range(self.__move_times):
                time.sleep(2)
                self.__driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")
            print('正在加载主界面。。。')
            time.sleep(2)
            blocks = self.__driver.find_elements(By.XPATH,
                                                 "//div[@class='tap-router moment-list-item moment-card-new']")
            if i >= len(blocks):
                # Fewer posts were loaded than expected: nothing left to open.
                print('没有更多帖子了!')
                return False
            header = blocks[i].find_element(By.CLASS_NAME, "moment-card__header")
            header.click()
            print(f'进入第{i + 1}条帖子。。。')
            time.sleep(2)
            self.__process_page(i)
        # Advance to the next batch by one page and scroll once more for it.
        self.__start = stop
        self.__end = stop + self.__PAGE_SIZE
        self.__move_times += 1
        if self.__start >= self.end_point:
            print('已到结束点!')
            return False
        return True

    def __process_page(self, i):
        """Parse the page currently shown by the driver and save its data.

        Args:
            i: 0-based index of the post, used only for progress output.
        """
        soup = BeautifulSoup(self.__driver.page_source, "html.parser")
        block_header = soup.find('div', attrs={'class': "layout-main__top"})
        block_title = soup.find('h1', attrs={'itemprop': "name"})
        block_time = soup.find('span', attrs={'itemprop': "dateCreated"})
        block_watched = soup.find('span', attrs={'class': "moment-status-tag topic-detail__space-right"})
        block_tag = soup.find('span',
                              attrs={'class': "tap-text tap-text__one-line moment-detail__labels moment-status-tag "
                                              "topic-detail__space-right"})
        block_content = soup.find('div', attrs={'itemprop': "text"})
        block_imgs = soup.find_all('img', attrs={'class': 'tap-image', "alt": 'TapTap', "loading": 'eager'})
        # Only the header text before '关注' is kept as the blogger name
        # (presumably the author name precedes the follow button text).
        blogger = block_header.text.split('关注')[0] if block_header else "None"
        data = {
            "blogger": blogger,
            "title": block_title.text if block_title else "None",
            "create_time": block_time.text if block_time else "None",
            "watched": block_watched.text if block_watched else 0,
            "tag": block_tag.text if block_tag else "None",
            "content": block_content.text if block_content else "None",
            "block_imgs": block_imgs,
        }
        self.__save_data(data)
        print(f"完成第{i + 1}条帖子!")
        print("**" * 22)
        time.sleep(2)

    def run(self):
        """Crawl posts batch by batch until ``end_point`` posts were visited
        or the list page has no more posts.
        """
        self.__create_file()
        while self.__get_into_page(self.url):
            pass


if __name__ == "__main__":
    # Script entry point: crawl the newest-first topic list of TapTap app
    # 16734 into the local "taptap/" directory, with an end point of 21.
    CrawlTapTap(
        url="https://www.taptap.cn/app/16734/topic?sort=created",
        save_path="taptap/",
        end_point=21,
    ).run()

文章来源:https://blog.csdn.net/weixin_71370467/article/details/135824173
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。