# Stdlib only (Python 2: Queue / urllib2 / urllib).
import threading
import Queue as queue
import re
import urllib2
import urllib
import time

# Shared FIFO of image URLs: filled by the scraper thread, drained by the
# downloader thread.
urlqueue = queue.Queue()

# Present a desktop-browser User-Agent so the image search serves normal
# HTML/JSON instead of a bot response.
headers = ("User-Agent","Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0")
opener = urllib2.build_opener()
opener.addheaders = [headers]

# NOTE(review): never read or written below — presumably a leftover; kept so
# module attributes are unchanged.
listurl = []
class getURLThread(threading.Thread):
    """Producer thread: scrape Baidu image-search result pages and feed every
    discovered ``objURL`` into the shared URL queue.

    Fixes over the original:
      * ``__init__`` stored the queue in ``self.proxy`` and never kept the
        real proxy or the queue; both are now stored correctly.
      * ``run`` looped over the page's URL list twice nested, enqueuing each
        URL once per URL on the page; each URL is now put exactly once.
      * ``task_done()`` was called by the producer right after ``put()``;
        acknowledging items is the consumer's job, so it is removed here.
      * ``opener.open`` sat outside the ``try``, so the URLError handler
        could never fire for fetch failures; the fetch is now inside it.
    """

    def __init__(self, key, pagestart, pageend, proxy, urlqueue):
        threading.Thread.__init__(self)
        self.key = key              # search keyword (NOTE(review): the request
                                    # URL hard-codes its own word= term — confirm intent)
        self.pagestart = pagestart  # first result page, 1-based
        self.pageend = pageend      # last result page, inclusive
        self.proxy = proxy          # proxy address (currently not applied to the opener)
        self.urlqueue = urlqueue    # shared queue consumed by the downloader thread

    def run(self):
        for page in range(self.pagestart, self.pageend + 1):
            try:
                # 60 results per page; pn is the absolute result offset.
                url = 'http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew&ie=utf-8&word=%E8%BE%B9%E7%89%A7&rn=60&pn=' + str(60 * page)
                data1 = opener.open(url).read()
                # "objURL" carries the original (non-thumbnail) image address.
                listurlpat = '"objURL":"(.+?)",'
                urlpage = re.compile(listurlpat, re.S).findall(data1)
                for urlj in urlpage:
                    print(urlj)
                    self.urlqueue.put(urlj)
                    time.sleep(7)  # throttle requests so the site does not block us
            except urllib2.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)  # back off after a network/HTTP error
            except Exception as e:
                print('exception:' + str(e))
                time.sleep(1)
class getContent(threading.Thread):
    """Consumer thread: pull image URLs off the shared queue and save each
    one as ``Baidu_Dog/<n>.jpg`` (the directory must already exist).

    Fixes over the original:
      * printed the undefined name ``url`` — a NameError on the very first
        successful download; now prints the actual ``imageurl``.
      * read from the global ``urlqueue`` instead of ``self.urlqueue``.
      * never acknowledged queue items; ``task_done()`` is now called for
        every ``get()`` so the queue's unfinished-task count stays correct.
    """

    def __init__(self, urlqueue, proxy):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue  # queue fed by getURLThread
        self.proxy = proxy        # proxy address (currently unused)

    def run(self):
        i = 1  # sequential file number for downloaded images
        while True:
            try:
                imagename = 'Baidu_Dog/' + str(i) + '.jpg'
                imageurl = self.urlqueue.get()
                try:
                    urllib.urlretrieve(imageurl, imagename)
                    print('get image' + imageurl)
                    i += 1
                finally:
                    # Acknowledge the item even if the download raised.
                    self.urlqueue.task_done()
            except urllib2.URLError as e:
                if hasattr(e, "code"):
                    print(e.code)
                if hasattr(e, "reason"):
                    print(e.reason)
                time.sleep(10)  # back off after a network/HTTP error
            except Exception as e:
                print('exception:' + str(e))
                time.sleep(1)
class control(threading.Thread):
    """Watchdog thread: report progress once a minute and stop the run once
    the shared URL queue has drained."""

    def __init__(self, urlqueue):
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue  # queue whose emptiness signals completion

    def run(self):
        # Always report and wait one full minute before each emptiness check,
        # exactly like the original polling loop.
        while True:
            print('process~ing')
            time.sleep(60)
            if self.urlqueue.empty():
                break
        print('finished!')
        # NOTE(review): exit() raises SystemExit in this thread only, ending
        # the watchdog but not the whole process — preserved as-is.
        exit()
# Crawl configuration.
key = 'AI'                   # search keyword handed to the scraper thread
proxy = '119.6.136.122:80'   # HTTP proxy address passed to both worker threads
proxy2 = ''                  # spare proxy slot, unused
pagestart = 1                # first result page (60 images per page)
pageend = 40                 # last result page, inclusive
# Wire up the pipeline, starting each thread as soon as it is built:
# producer (URL scraper), consumer (downloader), then the watchdog.
t1 = getURLThread(key, pagestart, pageend, proxy, urlqueue)
t1.start()

t2 = getContent(urlqueue, proxy)
t2.start()

t3 = control(urlqueue)
t3.start()