多线程爬取表情包


通过threading库加上queue的安全队列进行了一次对表情包网的爬取。
代码如下:
使用了消费者和生产者的多线程爬虫。

import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree
class Producer(threading.Thread):
    """Producer thread.

    Pulls listing-page URLs off ``page_queue``, scrapes every non-gif image
    from each page, and pushes ``(image_url, filename)`` pairs onto
    ``img_queue`` for Consumer threads to download.
    """

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36"
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = img_queue

    def run(self):
        # Use a non-blocking get() instead of empty()-then-get(): with
        # several producer threads, empty() can report False and another
        # thread can drain the queue before our get(), which would then
        # block this thread forever.
        while True:
            try:
                url = self.page_queue.get(block=False)
            except Empty:
                break
            self.spider(url)

    def spider(self, url):
        """Download one listing page and enqueue every non-gif image on it."""
        try:
            resp = requests.get(url, headers=self.headers)
            resp.raise_for_status()
        except requests.RequestException:
            # A single failed page should not kill the thread; skip it.
            return
        html = etree.HTML(resp.text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for i in imgs:
            img = i.get('data-original')
            if not img:
                # Some <img> tags carry no data-original attribute; the
                # original code crashed on None here.
                continue
            suffix = os.path.splitext(img)[1]
            # Strip characters that are awkward/illegal in filenames;
            # alt may be missing, so default it to an empty string.
            alt = re.sub(r'[\?\.?。,,!!\*]', '', i.get('alt') or '')
            filename = alt + suffix
            print('/images/%s' % filename)
            self.image_queue.put((img, filename))
class Consumer(threading.Thread):
    """Consumer thread.

    Pulls ``(image_url, filename)`` pairs off ``img_queue`` and saves each
    image into the local ``images/`` directory. Exits once both queues are
    empty (all pages scraped and all images downloaded).
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty() and self.img_queue.empty():
                break
            # A timed get() fixes the original race: after the emptiness
            # check passed, producers could finish and the image queue
            # drain, leaving a plain blocking get() stuck forever.
            try:
                img_url, filename = self.img_queue.get(timeout=1)
            except Empty:
                continue
            try:
                request.urlretrieve(img_url, "images/" + filename)
            except OSError:
                # One bad URL or write error must not kill the thread.
                continue
            print(filename + "  下载完成!")
def main():
    """Seed the 100 listing-page URLs and start 5 producer and 5 consumer
    threads to scrape and download the images."""
    # Consumer writes into images/ via urlretrieve, which fails with
    # FileNotFoundError if the directory does not exist — create it first.
    os.makedirs("images", exist_ok=True)
    page_queue = Queue(100)
    img_queue = Queue(1000)
    base_url = "http://www.doutula.com/photo/list/?page={}"
    for page in range(1, 101):
        page_queue.put(base_url.format(page))
    for _ in range(5):
        Producer(page_queue, img_queue).start()
    for _ in range(5):
        Consumer(page_queue, img_queue).start()


if __name__ == '__main__':
    main()
