Skip to content

Commit d31d30f

Browse files
committed
Multispider作为程序的入口,模仿webmagic的入口方式
1 parent a398338 commit d31d30f

File tree

5 files changed

+62
-73
lines changed

5 files changed

+62
-73
lines changed

src/main/java/MultiSpider.java

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,43 @@
11

2+
import java.util.concurrent.Callable;
3+
import java.util.concurrent.ExecutorService;
4+
import java.util.concurrent.Executors;
5+
import java.util.concurrent.Future;
26

3-
import org.apache.http.impl.client.CloseableHttpClient;
7+
import scheduler.DispatchScheduler;
8+
import downloader.HttpClientDownloader;
49

5-
public class MultiSpider implements Runnable {
6-
private CloseableHttpClient httpClient;
7-
public MultiSpider(CloseableHttpClient httpClient) {
8-
this.httpClient = httpClient;
10+
public class MultiSpider {
11+
12+
private int threadNum = 1;
13+
private ExecutorService service;
14+
private String username;
15+
private String password;
16+
private DispatchScheduler scheduler;
17+
18+
public MultiSpider create(String username, String password) {
19+
this.username = username;
20+
this.password = password;
21+
HttpClientDownloader.init(username, password);
22+
scheduler = new DispatchScheduler();
23+
scheduler.startProcess();
24+
return new MultiSpider();
925
}
26+
1027
public void run() {
28+
service = Executors.newFixedThreadPool(this.threadNum);
29+
Future future = service.submit(new Callable() {
30+
31+
public Object call() throws Exception {
32+
scheduler.taskProcceed();
33+
return null;
34+
}
1135

36+
});
1237
}
1338

39+
public MultiSpider thread(int num) {
40+
this.threadNum = num;
41+
return this;
42+
}
1443
}

src/main/java/container/URLContainer.java

Lines changed: 0 additions & 26 deletions
This file was deleted.

src/main/java/downloader/Downloader.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,25 @@ public interface Downloader {
1919
/**
2020
* 爬取题目列表
2121
* @param url
22+
* @return
2223
*/
23-
public void problemListDownloader(String url);
24+
public List<String> problemListDownloader();
2425

2526
/**
2627
* 进入题目描述页面,爬取题目提交url列表
2728
* @param url
2829
*/
29-
public void problemDescriptionDownloader(String url);
30+
public List<String> problemDescriptionDownloader(String url);
3031

3132
/**
3233
* 进入题目提交列表页面,爬取code所在页面url
3334
* @param url
3435
*/
35-
public void submissionListDownloader(String url, String name);
36+
public List<String> submissionListDownloader(String url, String name);
3637

3738
/**
3839
* 进入code所在页面,爬取code
3940
* @param url
4041
*/
41-
public void codePageDownloader(String url);
42+
public List<String> codePageDownloader(String url);
4243
}

src/main/java/downloader/HttpClientDownloader.java

Lines changed: 23 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import java.util.List;
1212
import java.util.Map;
1313
import java.util.Map.Entry;
14-
import java.util.concurrent.BlockingQueue;
1514

1615
import org.apache.http.Header;
1716
import org.apache.http.HeaderIterator;
@@ -29,10 +28,10 @@
2928
import org.apache.http.message.BasicNameValuePair;
3029
import org.apache.http.util.EntityUtils;
3130

32-
import container.URLContainer;
3331
import us.codecraft.webmagic.selector.XpathSelector;
3432

35-
public class HttpClientDownloader extends URLContainer implements Downloader {
33+
public class HttpClientDownloader implements Downloader {
34+
private static final String ALGORITHMS = "https://www.leetcode.com/problemset/algorithms/";
3635
private static final String INDEX_URL = "https://leetcode.com"; //首页的url地址
3736
private static final String LOGIN_URL = "https://leetcode.com/accounts/login/"; //登录页面url地址
3837
private static final String PASSWORD = "thd04180015"; //登录密码
@@ -51,16 +50,12 @@ public class HttpClientDownloader extends URLContainer implements Downloader {
5150
private static final String problemNamePath = "//div[@class='col-md-12']/h4/a/text()"; //题目名称Xpath路径
5251
private static final String codePath = "//div[@class='ace_content']/text()"; //得到代码Xpath路径
5352
private static CloseableHttpClient httpClient;
54-
/**
55-
*
56-
* 创建一个新的实例 HttpClientDownloader.
57-
*
58-
*/
59-
public HttpClientDownloader() {
60-
init();
61-
}
53+
private static final HttpClientDownloader downloader = new HttpClientDownloader();
6254

63-
public void init() {
55+
public static HttpClientDownloader getInstance() {
56+
return downloader;
57+
}
58+
public static void init(String username, String password) {
6459
httpClient = HttpClients.createDefault();
6560
HttpGet httpGet = new HttpGet(INDEX_URL);
6661
CloseableHttpResponse response1;
@@ -87,8 +82,8 @@ public void init() {
8782
httpPost.addHeader("Referer", LOGIN_URL);
8883
httpPost.addHeader("Origin", INDEX_URL);
8984
Map<String, String> map = new HashMap<String, String>();
90-
map.put("login", USER_NAME);
91-
map.put("password", PASSWORD);
85+
map.put("login", username);
86+
map.put("password", password);
9287
map.put("csrfmiddlewaretoken", cookieString);
9388
map.put("remember", "on");
9489
try {
@@ -196,20 +191,21 @@ public static String getCookie(HttpResponse httpResponse) {
196191
return null;
197192
}
198193

199-
public void problemListDownloader(String url) {
200-
doDispatcher(url, problemQueue);
194+
public List<String> problemListDownloader() {
195+
return doDispatcher(ALGORITHMS);
201196
}
202197

203-
public void problemDescriptionDownloader(String url) {
204-
doDispatcher(url, problemSubmission);
198+
public List<String> problemDescriptionDownloader(String url) {
199+
return doDispatcher(url);
205200
}
206201

207-
public void submissionListDownloader(String url, String name) {
208-
doDispatcher(url, problemCodePage, name);
202+
public List<String> submissionListDownloader(String url, String name) {
203+
return doDispatcher(url, name, httpClient);
209204
}
210205

211-
public void codePageDownloader(String url) {
206+
public List<String> codePageDownloader(String url) {
212207
//爬取代码
208+
return null;
213209
}
214210

215211
public String getHtml(HttpResponse response) {
@@ -247,26 +243,23 @@ public String getHtml(HttpResponse response) {
247243
* @exception
248244
* @since 1.0.0
249245
*/
250-
public void doDispatcher(String url, BlockingQueue<String> blockingQueue) {
246+
public List<String> doDispatcher(String url) {
251247
HttpGet httpGet1 = new HttpGet(url);
252248
try {
253249
HttpResponse response1 = httpClient.execute(httpGet1);
254250
printResponse(response1);
255251

256252
XpathSelector xpathSelector = new XpathSelector(problemLinkPath);
257253
List<String> listTmp = xpathSelector.selectList(getHtml(response1));
258-
for (String string: listTmp) {
259-
blockingQueue.put(string);
260-
}
261254
HttpEntity entity1 = response1.getEntity();
262255
EntityUtils.consume(entity1);
256+
return listTmp;
263257
} catch (ClientProtocolException e) {
264258
System.out.println(e.getMessage());
265259
} catch (IOException e) {
266260
System.out.println(e.getMessage());
267-
} catch (InterruptedException e) {
268-
System.out.println(e.getMessage());
269261
}
262+
return null;
270263
}
271264
/**
272265
*
@@ -279,25 +272,22 @@ public void doDispatcher(String url, BlockingQueue<String> blockingQueue) {
279272
* @exception
280273
* @since 1.0.0
281274
*/
282-
public void doDispatcher(String url, BlockingQueue<String> blockingQueue, String name) {
275+
public List<String> doDispatcher(String url, String name, CloseableHttpClient httpClient) {
283276
HttpGet httpGet1 = new HttpGet(url);
284277
try {
285278
HttpResponse response1 = httpClient.execute(httpGet1);
286279
printResponse(response1);
287280

288281
XpathSelector xpathSelector = new XpathSelector(problemLinkPath);
289282
List<String> listTmp = xpathSelector.selectList(getHtml(response1));
290-
for (String string: listTmp) {
291-
blockingQueue.put(string);
292-
}
293283
HttpEntity entity1 = response1.getEntity();
294284
EntityUtils.consume(entity1);
285+
return listTmp;
295286
} catch (ClientProtocolException e) {
296287
System.out.println(e.getMessage());
297288
} catch (IOException e) {
298289
System.out.println(e.getMessage());
299-
} catch (InterruptedException e) {
300-
System.out.println(e.getMessage());
301290
}
291+
return null;
302292
}
303293
}

src/main/java/scheduler/DispatcherScheduler.java

Lines changed: 0 additions & 5 deletions
This file was deleted.

0 commit comments

Comments
 (0)