-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathManager.py
More file actions
129 lines (110 loc) · 3.75 KB
/
Copy pathManager.py
File metadata and controls
129 lines (110 loc) · 3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Manager.py
# -*- coding: utf-8 -*-
import re
import urllib
import urllib2
import cookielib
import Queue,time
from multiprocessing.managers import BaseManager
# Queue that stores the question links waiting to be crawled
questionQueue = Queue.Queue()
# Queue that stores the download results
resultQueue = Queue.Queue()
# "Has the job started" flag, kept in a queue.
# NOTE: the name isStart is rebound further down in this module to the
# object returned by manager.get_flag().
isStart = Queue.Queue()
class QueueManager(BaseManager):
    """BaseManager subclass on which this module registers its queue getters."""
# Register the queues on the manager so they can be reached over the network.
# Each callable simply hands back the corresponding module-level queue.
QueueManager.register('get_question_queue',callable=lambda:questionQueue)
QueueManager.register('get_result_queue',callable=lambda:resultQueue)
QueueManager.register('get_flag',callable=lambda:isStart)
# Bind the port and set the authentication key.
# NOTE(review): the authkey is hard-coded in source and the host part of the
# address is empty - confirm this exposure is intended.
manager = QueueManager(address=('',5111),authkey='IamSpiderMan')
manager.start()
# Fetch the manager-served data objects for the registered queues.
questionNum = manager.get_question_queue()
resultNum = manager.get_result_queue()
# Rebinds isStart: from here on the name refers to the object returned by
# the manager, no longer to the Queue created above.
isStart = manager.get_flag()
class Login(object):
    """Log in to Zhihu with an account id and password.

    ``result`` is the HTML text of a page containing the hidden ``_xsrf``
    form field; the token is read from it and posted with the credentials.
    """

    # Matches the hidden _xsrf form field and captures its value.  The value
    # group stops at the first closing quote, so other attributes or tags on
    # the same line are never swallowed into the token (a greedy ``.*`` would
    # run on to the last quote on the line).
    _XSRF_PATTERN = re.compile(r'name="_xsrf" value="([^"]*)"')

    def __init__(self, id, password, result):
        self.id = id
        self.password = password
        self.result = result
        # Browser-like headers attached to every request made by the opener.
        # NOTE(review): 'Accept-Encodeing' is misspelled.  It is kept as-is on
        # purpose: correcting the name would ask the server for gzip/deflate
        # bodies, and login() returns the body without decompressing it.
        self.header = {
            'Connection':'Keep-Alive',
            'Accept':'text/html,application/xhtml+xml,*/*',
            'Accept-Language':'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent':'Mozilla/5.0(Windows NT 6.3;WOW64;Trident/7.0;rv:11.0) like Gecko',
            'Accept-Encodeing':'gzip,deflate',
            'Host':'www.zhihu.com',
            'DNT':'1'
        }

    def getXSRF(self):
        """Return the ``_xsrf`` token found in ``self.result``.

        Raises:
            IndexError: if the page contains no ``_xsrf`` field.  This is the
                same exception type the former ``findall(...)[0]`` raised.
        """
        match = self._XSRF_PATTERN.search(self.result)
        if match is None:
            raise IndexError('no _xsrf field found in the page')
        return match.group(1)

    def getOpener(self):
        """Return a cookie-aware urllib2 opener that sends ``self.header``."""
        cookieJar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
        # addheaders takes a list of (name, value) tuples rather than a dict.
        opener.addheaders = list(self.header.items())
        return opener

    def login(self, loginUrl):
        """POST the credentials to ``loginUrl`` and return the response body."""
        postData = {
            '_xsrf': self.getXSRF(),
            'email': self.id,
            'password': self.password,
            'rememberme': 'y'
        }
        opener = self.getOpener()
        requestBody = urllib.urlencode(postData).encode()
        response = opener.open(loginUrl, requestBody)
        # Close the response once the body has been read so the connection
        # is released even if read() fails.
        try:
            return response.read()
        finally:
            response.close()
def putUrl(data, queue=None):
    """Find the question links in ``data`` and put each question id on a queue.

    Args:
        data: HTML text to scan.  A falsy value (``None`` or ``''``) is ignored.
        queue: object with a ``put`` method that receives each id.  Defaults
            to the module-level ``questionNum`` queue.
    """
    if not data:
        return
    if queue is None:
        queue = questionNum
    # Capture only up to the closing quote of the href.  A greedy ``.*`` runs
    # on to the last ``">`` on the line and merges several links into one id.
    questionPattern = re.compile(
        '<a class="question_link" target="_blank" href="/question/([^"]*)">')
    for questionId in questionPattern.findall(data):
        queue.put(questionId)
# 爬虫主进程
class QuestManager():
def __init__(self,id,password,num):
self.id = id
self.password = password
self.count = num
def tStart(self):
hosturl = 'http://www.zhihu.com/'
result = urllib2.urlopen(hosturl).read()
zhihu = Login(self.id,self.password,result)
data = zhihu.login(hosturl + 'login')
putUrl(data)
isStart.put(1)
print "任务初始化完成等待爬虫仔报告......."
time.sleep(3)
i = 0
while i < self.count:
print "报!",resultNum.get(),"已完成下载"
i += 1
isStart.get()
print "主人任务已完成!"
time.sleep(3)
manager.shutdown()
# Script entry: ask for the Zhihu account, the password and the number of
# pages to wait for (space separated), then run the crawl manager.
# The previous code passed the names `account` and `password`, which are not
# defined anywhere in this file, so the script stopped with a NameError.
mess = raw_input("请输入知乎账号 密码 以及要爬的网页数(空格隔开)\n").split()
if len(mess) < 2:
    # The manager was already started above; stop it before bailing out.
    manager.shutdown()
    raise SystemExit('expected input: <account> <password> [page count]')
# The page count is optional; 20 is the value this script used before.
pageCount = int(mess[2]) if len(mess) > 2 else 20
test = QuestManager(mess[0], mess[1], pageCount)
test.tStart()