Skip to content

Commit aa3317e

Browse files
committed
update
0 parents  commit aa3317e

File tree

12 files changed

+375
-0
lines changed

12 files changed

+375
-0
lines changed

.classpath

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<classpath>
3+
<classpathentry kind="src" output="target/classes" path="src/main/java">
4+
<attributes>
5+
<attribute name="optional" value="true"/>
6+
<attribute name="maven.pomderived" value="true"/>
7+
</attributes>
8+
</classpathentry>
9+
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
10+
<attributes>
11+
<attribute name="optional" value="true"/>
12+
<attribute name="maven.pomderived" value="true"/>
13+
</attributes>
14+
</classpathentry>
15+
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5">
16+
<attributes>
17+
<attribute name="maven.pomderived" value="true"/>
18+
</attributes>
19+
</classpathentry>
20+
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
21+
<attributes>
22+
<attribute name="maven.pomderived" value="true"/>
23+
</attributes>
24+
</classpathentry>
25+
<classpathentry kind="lib" path="lib/org.json-20120521.jar"/>
26+
<classpathentry kind="output" path="target/classes"/>
27+
</classpath>

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/target/
2+
/target/

.project

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<projectDescription>
3+
<name>leetcode2github</name>
4+
<comment></comment>
5+
<projects>
6+
</projects>
7+
<buildSpec>
8+
<buildCommand>
9+
<name>org.eclipse.jdt.core.javabuilder</name>
10+
<arguments>
11+
</arguments>
12+
</buildCommand>
13+
<buildCommand>
14+
<name>org.eclipse.m2e.core.maven2Builder</name>
15+
<arguments>
16+
</arguments>
17+
</buildCommand>
18+
</buildSpec>
19+
<natures>
20+
<nature>org.eclipse.jdt.core.javanature</nature>
21+
<nature>org.eclipse.m2e.core.maven2Nature</nature>
22+
</natures>
23+
</projectDescription>
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
eclipse.preferences.version=1
2+
encoding//src/main/java=UTF-8
3+
encoding//src/test/java=UTF-8
4+
encoding/<project>=UTF-8
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
eclipse.preferences.version=1
2+
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
3+
org.eclipse.jdt.core.compiler.compliance=1.5
4+
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
5+
org.eclipse.jdt.core.compiler.source=1.5
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
activeProfiles=
2+
eclipse.preferences.version=1
3+
resolveWorkspaceProjects=true
4+
version=1

lib/org.json-20120521.jar

39.4 KB
Binary file not shown.

pom.xml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2+
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3+
<modelVersion>4.0.0</modelVersion>
4+
5+
<groupId>haodong.net.cn</groupId>
6+
<artifactId>leetcode2github</artifactId>
7+
<version>0.0.1-SNAPSHOT</version>
8+
<packaging>jar</packaging>
9+
10+
<name>leetcode2github</name>
11+
<url>http://maven.apache.org</url>
12+
13+
<properties>
14+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
15+
</properties>
16+
17+
<dependencies>
18+
<dependency>
19+
<groupId>junit</groupId>
20+
<artifactId>junit</artifactId>
21+
<version>3.8.1</version>
22+
<scope>test</scope>
23+
</dependency>
24+
<!-- webmagic模块 -->
25+
<dependency>
26+
<groupId>us.codecraft</groupId>
27+
<artifactId>webmagic-core</artifactId>
28+
<version>0.4.2</version>
29+
</dependency>
30+
<dependency>
31+
<groupId>us.codecraft</groupId>
32+
<artifactId>webmagic-extension</artifactId>
33+
<version>0.4.2</version>
34+
</dependency>
35+
</dependencies>
36+
</project>
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
package haodong.net.cn.leetcode2github;
2+
3+
import java.util.List;
4+
5+
import us.codecraft.webmagic.Page;
6+
import us.codecraft.webmagic.Site;
7+
import us.codecraft.webmagic.Spider;
8+
import us.codecraft.webmagic.processor.PageProcessor;
9+
10+
/**
11+
* Hello world!
12+
*
13+
*/
14+
public class App implements PageProcessor {
15+
protected Site site = Site.me().setRetryTimes(3).setSleepTime(1000)
16+
.setTimeOut(15000);
17+
//submission 列表url
18+
protected String submissionList;
19+
//problem 代码url
20+
protected String problemCode;
21+
//problem 描述url
22+
protected String problemDescription;
23+
//problem 列表url
24+
protected String problemList;
25+
//初始访问页面
26+
protected String url;
27+
//前往problem列表的path
28+
protected String problemLinkPath;
29+
//获得problem列表的path
30+
protected String getProblemLinkPath;
31+
//前往submission列表的path
32+
protected String submissionLinkPath;
33+
//前往code页面的path
34+
protected String codePagePath;
35+
//获得code的path
36+
protected String codePath;
37+
38+
// process是定制爬虫逻辑的核心接口,在这里编写抽取逻辑
39+
public void process(Page page) {
40+
if (url.contains(problemList)) {
41+
getLinks(page);
42+
} else if (url.contains(problemCode)) {
43+
getCode(page);
44+
} else {
45+
getSubmission(page);
46+
}
47+
}
48+
49+
public void getLinks(Page page) {
50+
page.putField("name", page.getHtml().xpath(getProblemLinkPath).all());
51+
forwards(page, problemLinkPath);
52+
}
53+
54+
public void getSubmission(Page page) {
55+
forwards(page, submissionLinkPath);
56+
}
57+
58+
public void getCode(Page page) {
59+
60+
}
61+
62+
public void forwards(Page page, String xpath) {
63+
List<String> list = page.getHtml().links().xpath(xpath).all();
64+
page.addTargetRequests(list);
65+
}
66+
67+
public Site getSite() {
68+
return site;
69+
}
70+
71+
public static void main(String[] args) {
72+
73+
Spider.create(new App())
74+
// 从"https://github.com/code4craft"开始抓
75+
.addUrl("https://leetcode.com/problemset/algorithms/")
76+
// 开启5个线程抓取
77+
.thread(5)
78+
// 启动爬虫
79+
.run();
80+
}
81+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package haodong.net.cn.leetcode2github;
2+
3+
import us.codecraft.webmagic.Page;
4+
5+
public class CodeProcesser extends App {
6+
7+
public CodeProcesser(String url) {
8+
submissionList = "/submissions/";
9+
problemCode = "/submissions/detail/";
10+
problemList = "/problemset/algorithms/";
11+
problemLinkPath = "//table[@class='table table-striped table-centered']/tbody/tr/td/a/@href";
12+
submissionLinkPath = "//div[@class='question-title']/a/@href";
13+
this.url = url;
14+
}
15+
@Override
16+
public void process(Page page) {
17+
super.process(page);
18+
}
19+
}

0 commit comments

Comments
 (0)