2727import org .apache .http .impl .client .HttpClients ;
2828import org .apache .http .message .BasicNameValuePair ;
2929import org .apache .http .util .EntityUtils ;
30+ import org .apache .log4j .Logger ;
31+ import org .apache .log4j .PropertyConfigurator ;
3032
3133import us .codecraft .webmagic .selector .XpathSelector ;
3234
@@ -40,38 +42,43 @@ public class HttpClientDownloader implements Downloader {
4042 private static final String problemCodeURL = ".*/submissions/detail/.*" ; //检验是否是题目代码页面
4143 private static final String problemListURL = ".*/problemset/algorithms/$" ; //检验是否是题目列表页面
4244
43- private static final String problemLinkPath = "//table[@class='table table-striped table-centered']/tbody/tr/td/a/@href" ; //进入问题描述 Xpath路径
44- private static final String getProblemLinkPath = "//table[@class='table table-striped table-centered']/tbody/tr/td/a/text()" ; //得到题目名称Xpath路径
45- private static final String submissionLinkPath = "//div[@class='row']/div/div/a/@href" ; //进入题目提交页面Xpath路径
46- private static final String codePagePath = "//table[@id='result_testcases']/tbody/tr/td/a/@href" ; //进入题目代码页面Xpath路径
47- private static final String codePageStatusPath = "//table[@id='result_testcases]/t"
48- + "body/tr/td/a[@class='status-accepted text-success']/strong/text()" ; //题目提交状态Xpath路径
49-
5045 private static final String problemNamePath = "//div[@class='col-md-12']/h4/a/text()" ; //题目名称Xpath路径
5146 private static final String codePath = "//div[@class='ace_content']/text()" ; //得到代码Xpath路径
5247 private static CloseableHttpClient httpClient ;
5348 private static final HttpClientDownloader downloader = new HttpClientDownloader ();
49+ private static Logger myLog ;
5450
5551 public static HttpClientDownloader getInstance () {
5652 return downloader ;
5753 }
54+
55+ /**
56+ *
57+ * init(初始化httpclient,以单例模式保存httpclient)
58+ * @param username
59+ * @param password
60+ *void
61+ * @exception
62+ * @since 1.0.0
63+ */
5864 public static void init (String username , String password ) {
65+ PropertyConfigurator .configure ("test.log" );
66+ myLog = Logger .getLogger (HttpClientDownloader .class );
5967 httpClient = HttpClients .createDefault ();
6068 HttpGet httpGet = new HttpGet (INDEX_URL );
6169 CloseableHttpResponse response1 ;
6270 String cookieString = null ;
6371 HttpEntity entity1 ;
6472 try {
6573 response1 = httpClient .execute (httpGet );
66- printResponse (response1 );
74+ //printResponse(response1);
75+ myLog .debug ("第一次登录,试图获取cookie值,response状态为:" + response1 .getStatusLine ());
6776 cookieString = getCookie (response1 );
6877 entity1 = response1 .getEntity ();
6978 EntityUtils .consume (entity1 );
7079 } catch (ClientProtocolException e1 ) {
71- // TODO Auto-generated catch block
7280 e1 .printStackTrace ();
7381 } catch (IOException e1 ) {
74- // TODO Auto-generated catch block
7582 e1 .printStackTrace ();
7683 }
7784
@@ -89,7 +96,8 @@ public static void init(String username, String password) {
8996 try {
9097 httpPost .setEntity (new UrlEncodedFormEntity (getParam (map ), "UTF-8" ));
9198 response1 = httpClient .execute (httpPost );
92- printResponse (response1 );
99+ //printResponse(response1);
100+ myLog .debug ("试图登录leetcode网站, response状态为: " + response1 .getStatusLine ());
93101 entity1 = response1 .getEntity ();
94102 EntityUtils .consume (entity1 );
95103 } catch (UnsupportedEncodingException e ) {
@@ -99,115 +107,112 @@ public static void init(String username, String password) {
99107 } catch (IOException e ) {
100108 System .out .println (e .getMessage ());
101109 }
110+ myLog .debug ("HttpClientDownloader初始化完成" );
102111 }
103112
104113 public void setTread (Thread thread ) {
105- // TODO Auto-generated method stub
114+
115+ }
116+
117+ /**
118+ * 获取问题列表
119+ */
120+ public List <String > problemListDownloader (String pattern ) {
121+ myLog .debug ("尝试获取问题列表" );
122+ return doDispatcher (ALGORITHMS , pattern );
123+ }
106124
125+ /**
126+ * 根据问题问题描述信息所在的url,爬取问题提交列表对应的url
127+ */
128+ public List <String > problemDescriptionDownloader (String url , String pattern ) {
129+ myLog .debug ("尝试获取问题提交记录列表" );
130+ return doDispatcher (url , pattern );
131+ }
132+
133+ public List <String > submissionListDownloader (String url , String pattern ) {
134+ myLog .debug ("尝试获取代码页面url" );
135+ return doDispatcher (url , pattern );
107136 }
108137
109- public void download (String url ) {
110- CloseableHttpClient httpClient = HttpClients .createDefault ();
111- HttpGet httpGet = new HttpGet ("https://leetcode.com" );
112- CloseableHttpResponse response1 ;
113- String cookieString = null ;
114- HttpEntity entity1 ;
115- try {
116- response1 = httpClient .execute (httpGet );
117- printResponse (response1 );
118- cookieString = getCookie (response1 );
119- entity1 = response1 .getEntity ();
120- EntityUtils .consume (entity1 );
121- } catch (ClientProtocolException e1 ) {
122- System .out .println (e1 .getMessage ());
123- } catch (IOException e1 ) {
124- System .out .println (e1 .getMessage ());
125- }
138+ public List <String > codePageDownloader (String url , String pattern ) {
139+ myLog .debug ("尝试获取代码" );
140+ //爬取代码
141+ return null ;
142+ }
126143
127- HttpPost httpPost = new HttpPost ("https://leetcode.com/accounts/login/" );
128- httpPost .addHeader (
129- "User-Agent" ,
130- "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6)Gecko/20091201 Firefox/3.5.6" );
131- httpPost .addHeader ("Referer" , "https://leetcode.com/accounts/login/" );
132- httpPost .addHeader ("Origin" , "https://leetcode.com" );
133- Map <String , String > map = new HashMap <String , String >();
134- map .put ("login" , "tanghaodong25@163.com" );
135- map .put ("password" , "thd04180015" );
136- map .put ("csrfmiddlewaretoken" , cookieString );
137- map .put ("remember" , "on" );
144+ /**
145+ *
146+ * doDispatcher()
147+ * (适用于获取问题详情url和获取问题提交列表url)
148+ * @param url
149+ * @param blockingQueue
150+ *void
151+ * @exception
152+ * @since 1.0.0
153+ */
154+ public List <String > doDispatcher (String url , String pattern ) {
155+ HttpGet httpGet1 = new HttpGet (url );
156+ myLog .debug ("进入doDispatcher函数,准备对" +url +"进行解析" );
138157 try {
139- httpPost .setEntity (new UrlEncodedFormEntity (getParam (map ), "UTF-8" ));
140- response1 = httpClient .execute (httpPost );
141- printResponse (response1 );
142- entity1 = response1 .getEntity ();
158+ HttpResponse response1 = httpClient .execute (httpGet1 );
159+ myLog .debug ("正在对" +url +"进行访问,response状态为:" + response1 .getStatusLine ());
160+ myLog .debug ("试图对" +url +"进行解析,其中pattern为:" +pattern );
161+ XpathSelector xpathSelector = new XpathSelector (pattern );
162+ List <String > listTmp = xpathSelector .selectList (getHtml (response1 ));
163+ HttpEntity entity1 = response1 .getEntity ();
143164 EntityUtils .consume (entity1 );
144- } catch (UnsupportedEncodingException e ) {
145- System .out .println (e .getMessage ());
165+ return listTmp ;
146166 } catch (ClientProtocolException e ) {
147167 System .out .println (e .getMessage ());
148168 } catch (IOException e ) {
149169 System .out .println (e .getMessage ());
150170 }
171+ return null ;
151172 }
152-
153- public static List <NameValuePair > getParam (Map parameterMap ) {
154- List <NameValuePair > param = new ArrayList <NameValuePair >();
155- Iterator it = parameterMap .entrySet ().iterator ();
156- while (it .hasNext ()) {
157- Entry parmEntry = (Entry ) it .next ();
158- param .add (new BasicNameValuePair ((String ) parmEntry .getKey (),
159- (String ) parmEntry .getValue ()));
160- }
161- return param ;
162- }
163-
173+
174+ /**
175+ *
176+ * printResponse(打印response信息)
177+ * @param httpResponse
178+ * @throws ParseException
179+ * @throws IOException
180+ *void
181+ * @exception
182+ * @since 1.0.0
183+ */
164184 public static void printResponse (HttpResponse httpResponse )
165185 throws ParseException , IOException {
166186 // 获取响应消息实体
167187 HttpEntity entity = httpResponse .getEntity ();
168188 // 响应状态
169189 System .out .println ("status:" + httpResponse .getStatusLine ());
170190 System .out .println ("headers:" );
171- HeaderIterator iterator = httpResponse .headerIterator ();
172- while (iterator .hasNext ()) {
173- System .out .println ("\t " + iterator .next ());
174- }
175- // 判断响应实体是否为空
176- if (entity != null ) {
177- String responseString = EntityUtils .toString (entity );
178- System .out .println ("response length:" + responseString .length ());
179- System .out .println ("response content:"
180- + responseString .replace ("\r \n " , "" ));
181- }
182- }
183-
184- public static String getCookie (HttpResponse httpResponse ) {
185- Header [] headers = httpResponse .getAllHeaders ();
186- for (Header value : headers ) {
187- if (value .getName ().equals ("Set-Cookie" )) {
188- return value .getValue ().split (";" )[0 ].split ("=" )[1 ];
189- }
190- }
191- return null ;
191+ // HeaderIterator iterator = httpResponse.headerIterator();
192+ // while (iterator.hasNext()) {
193+ // System.out.println("\t" + iterator.next());
194+ // }
195+ // // 判断响应实体是否为空
196+ // if (entity != null) {
197+ // String responseString = EntityUtils.toString(entity);
198+ // System.out.println("response length:" + responseString.length());
199+ // System.out.println("response content:"
200+ // + responseString.replace("\r\n", ""));
201+ // }
202+ PropertyConfigurator .configure ("test.log" );
203+ Logger m_log = Logger .getLogger (HttpClientDownloader .class );
204+ m_log .debug ("hello world" );
192205 }
193206
194- public List <String > problemListDownloader () {
195- return doDispatcher (ALGORITHMS );
196- }
197-
198- public List <String > problemDescriptionDownloader (String url ) {
199- return doDispatcher (url );
200- }
201-
202- public List <String > submissionListDownloader (String url , String name ) {
203- return doDispatcher (url , name , httpClient );
204- }
205-
206- public List <String > codePageDownloader (String url ) {
207- //爬取代码
208- return null ;
209- }
210-
207+ /**
208+ *
209+ * getHtml(辅助方法,根据response信息返回String类型的html内容)
210+ * @param response
211+ * @return
212+ *String
213+ * @exception
214+ * @since 1.0.0
215+ */
211216 public String getHtml (HttpResponse response ) {
212217 BufferedReader br = null ;
213218 try {
@@ -233,61 +238,43 @@ public String getHtml(HttpResponse response) {
233238 }
234239 return null ;
235240 }
241+
236242 /**
237243 *
238- * doDispatcher()
239- * (适用于获取问题详情url和获取问题提交列表url)
240- * @param url
241- * @param blockingQueue
242- *void
244+ * getCookie(获取leetcode网站的Set-Cookie值)
245+ * @param httpResponse
246+ * @return
247+ *String
243248 * @exception
244249 * @since 1.0.0
245250 */
246- public List <String > doDispatcher (String url ) {
247- HttpGet httpGet1 = new HttpGet (url );
248- try {
249- HttpResponse response1 = httpClient .execute (httpGet1 );
250- printResponse (response1 );
251-
252- XpathSelector xpathSelector = new XpathSelector (problemLinkPath );
253- List <String > listTmp = xpathSelector .selectList (getHtml (response1 ));
254- HttpEntity entity1 = response1 .getEntity ();
255- EntityUtils .consume (entity1 );
256- return listTmp ;
257- } catch (ClientProtocolException e ) {
258- System .out .println (e .getMessage ());
259- } catch (IOException e ) {
260- System .out .println (e .getMessage ());
251+ public static String getCookie (HttpResponse httpResponse ) {
252+ Header [] headers = httpResponse .getAllHeaders ();
253+ for (Header value : headers ) {
254+ if (value .getName ().equals ("Set-Cookie" )) {
255+ return value .getValue ().split (";" )[0 ].split ("=" )[1 ];
256+ }
261257 }
262258 return null ;
263259 }
260+
264261 /**
265262 *
266- * doDispatcher()
267- * (用于获取accept代码url)
268- * @param url
269- * @param blockingQueue
270- * @param name
271- *void
263+ * getParam(辅助方法)
264+ * @param parameterMap
265+ * @return
266+ *List<NameValuePair>
272267 * @exception
273268 * @since 1.0.0
274269 */
275- public List <String > doDispatcher (String url , String name , CloseableHttpClient httpClient ) {
276- HttpGet httpGet1 = new HttpGet (url );
277- try {
278- HttpResponse response1 = httpClient .execute (httpGet1 );
279- printResponse (response1 );
280-
281- XpathSelector xpathSelector = new XpathSelector (problemLinkPath );
282- List <String > listTmp = xpathSelector .selectList (getHtml (response1 ));
283- HttpEntity entity1 = response1 .getEntity ();
284- EntityUtils .consume (entity1 );
285- return listTmp ;
286- } catch (ClientProtocolException e ) {
287- System .out .println (e .getMessage ());
288- } catch (IOException e ) {
289- System .out .println (e .getMessage ());
270+ public static List <NameValuePair > getParam (Map parameterMap ) {
271+ List <NameValuePair > param = new ArrayList <NameValuePair >();
272+ Iterator it = parameterMap .entrySet ().iterator ();
273+ while (it .hasNext ()) {
274+ Entry parmEntry = (Entry ) it .next ();
275+ param .add (new BasicNameValuePair ((String ) parmEntry .getKey (),
276+ (String ) parmEntry .getValue ()));
290277 }
291- return null ;
278+ return param ;
292279 }
293280}
0 commit comments