
Commit a0548f8

Merge pull request egrcc#4 from Eureka22/master
add cookies support
2 parents: 917bb40 + 4cc8627

File tree

6 files changed: +78 -15 lines


README.rst

Lines changed: 6 additions & 0 deletions

@@ -53,6 +53,12 @@ zhihu-python is written in python2.7 and makes it easy to fetch all kinds of content from Zhihu
     $ pip install beautifulsoup4
     $ pip install html2text
 
+Or use
+
+.. code-block:: bash
+
+    $ pip install -r requirements.txt
+
 **Note**: If both python3 and python2 are installed, the pip command may install the python3 version of a package by default, so replace pip in the commands above with pip2 (this is the case on my Ubuntu, for example).
 
 Quick Start
config.ini

Lines changed: 11 additions & 0 deletions

@@ -1,3 +1,14 @@
 [info]
 email =
 password =
+
+[cookies]
+z_c0 =
+q_c1 =
+c_c =
+_xsrf =
+__utmz =
+__utmv =
+__utmc =
+__utmb =
+__utma =
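
The new [cookies] section holds session cookies copied from a logged-in browser: z_c0 and _xsrf carry Zhihu's login session and CSRF token, and the __utm* entries are standard Google Analytics cookies. Below is a minimal sketch, not part of this commit, of how such a section can be read and handed to requests, which is essentially what the zhihu.py hunks further down do; it uses the public cf.items() call where the commit reaches for the private cf._sections attribute.

# -*- coding: utf-8 -*-
# Minimal sketch (not part of this commit): load the [cookies] section
# from config.ini and send it with a request. The commit itself uses the
# private cf._sections['cookies']; cf.items("cookies") is the public
# equivalent and returns the same option/value pairs.
import ConfigParser
import requests

cf = ConfigParser.ConfigParser()
cf.read("config.ini")
cookies = dict(cf.items("cookies"))

# requests accepts a plain dict as the cookie jar
r = requests.get("http://www.zhihu.com", cookies=cookies)
print r.status_code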
Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+# How wonderful can reality be?
+## Author: 田浩 Upvotes: 11000
+A drug maker, caught by us in the small hours. His home was full of "drug-making raw materials", plus a distillation machine and a few plastic buckets; when we went in he was still stirring a bucket with a stick. The moment he was caught he broke down, saying he was just a carpenter
+who had gone to Myanmar in a moment of folly to buy the machines and raw materials. He admitted guilt, promised to cooperate fully, and asked whether he could be spared the death penalty. Honestly, given his quantities, several dozen kilograms at the very least, we went along with him, but privately we figured he was done for.
+
+After we took the man and his gear back and had the drugs tested, the amazing twist was that his raw materials and machines were all fake. He had been scammed abroad.
+
+I forget whether it ended up counted as criminal preparation or an attempted crime, but in any case he was out after less than a year.
+
+For him, that reality was simply wonderful.
+
+#### Original link: http://www.zhihu.com/question/24269892/answer/29960616

requirements.txt

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+requests
+beautifulsoup4
+html2text
Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+How wonderful can reality be?
+
+Author: 田浩 Upvotes: 11000
+
+A drug maker, caught by us in the small hours. His home was full of "drug-making raw materials", plus a distillation machine and a few plastic buckets; when we went in he was still stirring a bucket with a stick. The moment he was caught he broke down, saying he was just a carpenter who had gone to Myanmar in a moment of folly to buy the machines and raw materials. He admitted guilt, promised to cooperate fully, and asked whether he could be spared the death penalty. Honestly, given his quantities, several dozen kilograms at the very least, we went along with him, but privately we figured he was done for. After we took the man and his gear back and had the drugs tested, the amazing twist was that his raw materials and machines were all fake. He had been scammed abroad. I forget whether it ended up counted as criminal preparation or an attempted crime, but in any case he was out after less than a year. For him, that reality was simply wonderful.
+
+
+
+
+
+
+
+
+
+Original link: http://www.zhihu.com/question/24269892/answer/29960616

zhihu.py

Lines changed: 31 additions & 15 deletions

@@ -9,17 +9,30 @@
 import html2text
 import ConfigParser
 from bs4 import BeautifulSoup
+import sys
 
+reload(sys)
+sys.setdefaultencoding('utf8')
 session = None
 
+cookies = {}
+
 def create_session():
 
     global session
-
+    global cookies
     cf = ConfigParser.ConfigParser()
     cf.read("config.ini")
+
+    cookies = cf._sections['cookies']
+
     email = cf.get("info", "email")
     password = cf.get("info", "password")
+
+
+    cookies = dict(cookies)
+    print cookies
+
     s = requests.session()
     login_data = {"email": email, "password": password}
     header = {
@@ -28,9 +41,12 @@ def create_session():
         'Referer': "http://www.zhihu.com/",
         'X-Requested-With': "XMLHttpRequest"
     }
+
     r = s.post('http://www.zhihu.com/login', data = login_data, headers = header)
     if r.json()["r"] == 1:
-        raise Exception("login failed.")
+        print "Login Failed, reason is:"
+        print r.text
+        print "Use cookies"
     session = s
 
 
@@ -67,7 +83,7 @@ def parser(self):
         if session == None:
             create_session()
         s = session
-        r = s.get(self.url)
+        r = s.get(self.url,cookies = cookies)
         soup = BeautifulSoup(r.content)
         self.soup = soup
 
@@ -239,7 +255,7 @@ def get_all_answers(self):
                     'Host': "www.zhihu.com",
                     'Referer': self.url
                 }
-                r = s.post(post_url, data = data, headers = header)
+                r = s.post(post_url, data = data, headers = header, cookies = cookies)
                 answer_list = r.json()["msg"]
                 for j in range(min(answers_num - i * 50, 50)):
                     soup = BeautifulSoup(self.soup.encode("utf-8"))
@@ -332,7 +348,7 @@ def parser(self):
         if session == None:
             create_session()
         s = session
-        r = s.get(self.user_url)
+        r = s.get(self.user_url,cookies = cookies)
         soup = BeautifulSoup(r.content)
         self.soup = soup
 
@@ -458,7 +474,7 @@ def get_followees(self):
             create_session()
         s = session
         followee_url = self.user_url + "/followees"
-        r = s.get(followee_url)
+        r = s.get(followee_url,cookies = cookies)
         soup = BeautifulSoup(r.content)
         for i in range((followees_num - 1) / 20 + 1):
             if i == 0:
@@ -481,7 +497,7 @@ def get_followees(self):
                     'Host': "www.zhihu.com",
                     'Referer': followee_url
                 }
-                r_post = s.post(post_url, data = data, headers = header)
+                r_post = s.post(post_url, data = data, headers = header, cookies = cookies)
                 followee_list = r_post.json()["msg"]
                 for j in range(min(followees_num - i * 20, 20)):
                     followee_soup = BeautifulSoup(followee_list[j])
@@ -506,7 +522,7 @@ def get_followers(self):
            create_session()
         s = session
         follower_url = self.user_url + "/followers"
-        r = s.get(follower_url)
+        r = s.get(follower_url,cookies = cookies)
         soup = BeautifulSoup(r.content)
         for i in range((followers_num - 1) / 20 + 1):
             if i == 0:
@@ -529,7 +545,7 @@ def get_followers(self):
                     'Host': "www.zhihu.com",
                     'Referer': follower_url
                 }
-                r_post = s.post(post_url, data = data, headers = header)
+                r_post = s.post(post_url, data = data, headers = header, cookies = cookies)
                 follower_list = r_post.json()["msg"]
                 for j in range(min(followers_num - i * 20, 20)):
                     follower_soup = BeautifulSoup(follower_list[j])
@@ -555,7 +571,7 @@ def get_asks(self):
         else:
             for i in range((asks_num - 1) / 20 + 1):
                 ask_url = self.user_url + "/asks?page=" + str(i + 1)
-                r = s.get(ask_url)
+                r = s.get(ask_url,cookies = cookies)
                 soup = BeautifulSoup(r.content)
                 for question in soup.find_all("a", class_ = "question_link"):
                     url = "http://www.zhihu.com" + question["href"]
@@ -581,7 +597,7 @@ def get_answers(self):
         else:
             for i in range((answers_num - 1) / 20 + 1):
                 answer_url = self.user_url + "/answers?page=" + str(i + 1)
-                r = s.get(answer_url)
+                r = s.get(answer_url,cookies = cookies)
                 soup = BeautifulSoup(r.content)
                 for answer in soup.find_all("a", class_ = "question_link"):
                     question_url = "http://www.zhihu.com" + answer["href"][0:18]
@@ -608,7 +624,7 @@ def get_collections(self):
         else:
             for i in range((collections_num - 1) / 20 + 1):
                 collection_url = self.user_url + "/collections?page=" + str(i + 1)
-                r = s.get(collection_url)
+                r = s.get(collection_url,cookies = cookies)
                 soup = BeautifulSoup(r.content)
                 for collection in soup.find_all("div", class_ = "zm-profile-section-item zg-clear"):
                     url = "http://www.zhihu.com" + \
@@ -652,7 +668,7 @@ def parser(self):
         if session == None:
             create_session()
         s = session
-        r = s.get(self.answer_url)
+        r = s.get(self.answer_url,cookies = cookies)
         soup = BeautifulSoup(r.content)
         self.soup = soup
 
@@ -905,7 +921,7 @@ def parser(self):
         if session == None:
             create_session()
         s = session
-        r = s.get(self.url)
+        r = s.get(self.url,cookies = cookies)
         soup = BeautifulSoup(r.content)
         self.soup = soup
 
@@ -972,7 +988,7 @@ def get_all_answers(self):
         i = 2
         s = session
         while True:
-            r = s.get(self.url + "?page=" + str(i))
+            r = s.get(self.url + "?page=" + str(i),cookies = cookies)
             answer_soup = BeautifulSoup(r.content)
             answer_list = answer_soup.find_all("div", class_ = "zm-item")
             if len(answer_list) == 0:
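
Taken together, the zhihu.py changes do three things: load the new [cookies] section into a module-level cookies dict, turn a rejected password login from a fatal exception into a printed warning ("Use cookies"), and pass cookies = cookies on every subsequent GET and POST. A minimal usage sketch under those assumptions follows; Question and get_all_answers() are names visible in this diff, but the import path and exact call pattern are assumptions taken from the project's examples rather than part of the commit, and the question URL is the one the added example files point to.

# -*- coding: utf-8 -*-
# Usage sketch, not part of the diff: once config.ini carries either a
# working email/password pair or browser cookies (or both), the library's
# HTTP calls include the configured cookies, so scraping still works when
# the login POST is rejected. The import path and class name are assumed.
from zhihu import Question

question = Question("http://www.zhihu.com/question/24269892")

# get_all_answers() now posts with cookies = cookies
# (see the @@ -239,7 +255,7 @@ hunk above)
for answer in question.get_all_answers():
    print answer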
