99import html2text
1010import ConfigParser
1111from bs4 import BeautifulSoup
12+ import sys
1213
14+ reload (sys )
15+ sys .setdefaultencoding ('utf8' )
1316session = None
1417
18+ cookies = {}
19+
1520def create_session ():
1621
1722 global session
18-
23+ global cookies
1924 cf = ConfigParser .ConfigParser ()
2025 cf .read ("config.ini" )
26+
27+ cookies = cf ._sections ['cookies' ]
28+
2129 email = cf .get ("info" , "email" )
2230 password = cf .get ("info" , "password" )
31+
32+
33+ cookies = dict (cookies )
34+ print cookies
35+
2336 s = requests .session ()
2437 login_data = {"email" : email , "password" : password }
2538 header = {
@@ -28,9 +41,12 @@ def create_session():
2841 'Referer' : "http://www.zhihu.com/" ,
2942 'X-Requested-With' : "XMLHttpRequest"
3043 }
44+
3145 r = s .post ('http://www.zhihu.com/login' , data = login_data , headers = header )
3246 if r .json ()["r" ] == 1 :
33- raise Exception ("login failed." )
47+ print "Login Failed, reason is:"
48+ print r .text
49+ print "Use cookies"
3450 session = s
3551
3652
@@ -67,7 +83,7 @@ def parser(self):
6783 if session == None :
6884 create_session ()
6985 s = session
70- r = s .get (self .url )
86+ r = s .get (self .url , cookies = cookies )
7187 soup = BeautifulSoup (r .content )
7288 self .soup = soup
7389
@@ -239,7 +255,7 @@ def get_all_answers(self):
239255 'Host' : "www.zhihu.com" ,
240256 'Referer' : self .url
241257 }
242- r = s .post (post_url , data = data , headers = header )
258+ r = s .post (post_url , data = data , headers = header , cookies = cookies )
243259 answer_list = r .json ()["msg" ]
244260 for j in range (min (answers_num - i * 50 , 50 )):
245261 soup = BeautifulSoup (self .soup .encode ("utf-8" ))
@@ -332,7 +348,7 @@ def parser(self):
332348 if session == None :
333349 create_session ()
334350 s = session
335- r = s .get (self .user_url )
351+ r = s .get (self .user_url , cookies = cookies )
336352 soup = BeautifulSoup (r .content )
337353 self .soup = soup
338354
@@ -458,7 +474,7 @@ def get_followees(self):
458474 create_session ()
459475 s = session
460476 followee_url = self .user_url + "/followees"
461- r = s .get (followee_url )
477+ r = s .get (followee_url , cookies = cookies )
462478 soup = BeautifulSoup (r .content )
463479 for i in range ((followees_num - 1 ) / 20 + 1 ):
464480 if i == 0 :
@@ -481,7 +497,7 @@ def get_followees(self):
481497 'Host' : "www.zhihu.com" ,
482498 'Referer' : followee_url
483499 }
484- r_post = s .post (post_url , data = data , headers = header )
500+ r_post = s .post (post_url , data = data , headers = header , cookies = cookies )
485501 followee_list = r_post .json ()["msg" ]
486502 for j in range (min (followees_num - i * 20 , 20 )):
487503 followee_soup = BeautifulSoup (followee_list [j ])
@@ -506,7 +522,7 @@ def get_followers(self):
506522 create_session ()
507523 s = session
508524 follower_url = self .user_url + "/followers"
509- r = s .get (follower_url )
525+ r = s .get (follower_url , cookies = cookies )
510526 soup = BeautifulSoup (r .content )
511527 for i in range ((followers_num - 1 ) / 20 + 1 ):
512528 if i == 0 :
@@ -529,7 +545,7 @@ def get_followers(self):
529545 'Host' : "www.zhihu.com" ,
530546 'Referer' : follower_url
531547 }
532- r_post = s .post (post_url , data = data , headers = header )
548+ r_post = s .post (post_url , data = data , headers = header , cookies = cookies )
533549 follower_list = r_post .json ()["msg" ]
534550 for j in range (min (followers_num - i * 20 , 20 )):
535551 follower_soup = BeautifulSoup (follower_list [j ])
@@ -555,7 +571,7 @@ def get_asks(self):
555571 else :
556572 for i in range ((asks_num - 1 ) / 20 + 1 ):
557573 ask_url = self .user_url + "/asks?page=" + str (i + 1 )
558- r = s .get (ask_url )
574+ r = s .get (ask_url , cookies = cookies )
559575 soup = BeautifulSoup (r .content )
560576 for question in soup .find_all ("a" , class_ = "question_link" ):
561577 url = "http://www.zhihu.com" + question ["href" ]
@@ -581,7 +597,7 @@ def get_answers(self):
581597 else :
582598 for i in range ((answers_num - 1 ) / 20 + 1 ):
583599 answer_url = self .user_url + "/answers?page=" + str (i + 1 )
584- r = s .get (answer_url )
600+ r = s .get (answer_url , cookies = cookies )
585601 soup = BeautifulSoup (r .content )
586602 for answer in soup .find_all ("a" , class_ = "question_link" ):
587603 question_url = "http://www.zhihu.com" + answer ["href" ][0 :18 ]
@@ -608,7 +624,7 @@ def get_collections(self):
608624 else :
609625 for i in range ((collections_num - 1 ) / 20 + 1 ):
610626 collection_url = self .user_url + "/collections?page=" + str (i + 1 )
611- r = s .get (collection_url )
627+ r = s .get (collection_url , cookies = cookies )
612628 soup = BeautifulSoup (r .content )
613629 for collection in soup .find_all ("div" , class_ = "zm-profile-section-item zg-clear" ):
614630 url = "http://www.zhihu.com" + \
@@ -652,7 +668,7 @@ def parser(self):
652668 if session == None :
653669 create_session ()
654670 s = session
655- r = s .get (self .answer_url )
671+ r = s .get (self .answer_url , cookies = cookies )
656672 soup = BeautifulSoup (r .content )
657673 self .soup = soup
658674
@@ -905,7 +921,7 @@ def parser(self):
905921 if session == None :
906922 create_session ()
907923 s = session
908- r = s .get (self .url )
924+ r = s .get (self .url , cookies = cookies )
909925 soup = BeautifulSoup (r .content )
910926 self .soup = soup
911927
@@ -972,7 +988,7 @@ def get_all_answers(self):
972988 i = 2
973989 s = session
974990 while True :
975- r = s .get (self .url + "?page=" + str (i ))
991+ r = s .get (self .url + "?page=" + str (i ), cookies = cookies )
976992 answer_soup = BeautifulSoup (r .content )
977993 answer_list = answer_soup .find_all ("div" , class_ = "zm-item" )
978994 if len (answer_list ) == 0 :
0 commit comments