1. Create a crawler program
import requests

# 50 front-page URLs of cnblogs; the f-string fills in the page number
urls = [
    f'https://www.cnblogs.com/#p{page}'
    for page in range(1, 50 + 1)
]

def craw(url):
    # Fetch one page and print its URL plus the size of the response body
    r = requests.get(url)
    print(url, len(r.text))

craw(urls[0])
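Save this snippet as blob_spider.py, since the import in step 2 below expects that module name; running it directly should print the first URL together with the length of the HTML it returns.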
2. Define single-threaded and multi-threaded versions
import blob_spider
import threading
import time

def single_thread():
    # Crawl every URL sequentially on the main thread
    print("single_thread begin")
    for url in blob_spider.urls:
        blob_spider.craw(url)
    print("single_thread end")

def multi_thread():
    # Spawn one thread per URL, start them all, then wait for all to finish
    print("multi_thread begin")
    threads = []
    for url in blob_spider.urls:
        threads.append(
            threading.Thread(target=blob_spider.craw, args=(url,))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")

if __name__ == "__main__":
    start = time.time()
    single_thread()
    end = time.time()
    print("single_thread took: {}".format(end - start))

    start = time.time()
    multi_thread()
    end = time.time()
    print("multi_thread took: {}".format(end - start))