你所需要的,不仅仅是一个好用的代理。
置顶:Spiderman2 最新的预览版本已经出炉啦!简洁,更高性能,采集状态持久化,分布式,支持 JS 脚本,赶紧来体验一把吧!PS: 后面稳定版本会更新到这里
Spiderman 是一个Java开源Web数据抽取工具。它能够收集指定的Web页面并从这些页面中提取有用的数据。 Spiderman主要是运用了像XPath,正则表达式等这些技术来实现数据抽取。
1
2
Spiderman 是一个Java开源Web数据抽取工具。它能够收集指定的Web页面并从这些页面中提取有用的数据。
Spiderman主要是运用了像XPath,正则表达式等这些技术来实现数据抽取。
* spiderman-core 内核 * spiderman-plugin 插件
1
2
* spiderman-core 内核
* spiderman-plugin 插件
* 微内核+插件式架构、灵活、可扩展性强 * 无需编写程序代码即可完成数据抽取 * 多线程保证性能
1
2
3
* 微内核+插件式架构、灵活、可扩展性强
* 无需编写程序代码即可完成数据抽取
* 多线程保证性能
Spiderman Sample | 案例
这是使用 Spiderman 的代码:
public class TestSpider { private final Object mutex = new Object(); @Test public void test() throws Exception { String err = EWeb4JConfig.start(); if (err != null) throw new Exception(err); SpiderListener listener = new SpiderListenerAdaptor(){ public void afterScheduleCancel(){ //调度结束回调 } /** * 每次调度执行前回调此方法 * @date 2013-4-1 下午03:33:11 * @param theLastTimeScheduledAt 上一次调度时间 */ public void beforeEveryScheduleExecute(Date theLastTimeScheduledAt){ System.err.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [LAST_SCHEDULE_AT] ~ "); System.err.println("at -> " + CommonUtil.formatTime(theLastTimeScheduledAt)); } public void onFetch(Thread thread, Task task, FetchResult result) { System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [FETCH] ~ "); System.out.println("fetch result ->" + result + " from -> " + task.sourceUrl); } public void onNewUrls(Thread thread, Task task, Collection<String> newUrls) { System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [DIG] ~ "); System.out.println(newUrls); } public void onDupRemoval(Thread currentThread, Task task, Collection<Task> validTasks) { // for (Task t : validTasks){ // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [DUPREMOVE] ~ "); // System.out.println(t.url+" from->"+t.sourceUrl); // } } public void onTaskSort(Thread currentThread, Task task, Collection<Task> afterSortTasks) { // for (Task t : afterSortTasks){ // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [SORT] ~ "); // System.out.println(t.url+" from->"+t.sourceUrl); // } } public void onNewTasks(Thread thread, Task task, Collection<Task> newTasks) { // for (Task t : newTasks){ // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [NEWTASK] ~ "); // System.out.println(t.sort + ",,,," + t.url+" from->"+t.sourceUrl); // } } public void onTargetPage(Thread thread, Task task, Page page) { // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [TARGET] ~ "); 
// System.out.println(page.getUrl()); } public void onInfo(Thread thread, Task task, String info) { System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [INFO] ~ "); System.out.println(info); } public void onError(Thread thread, Task task, String err, Throwable e) { System.err.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [ERROR] ~ "); e.printStackTrace(); } public void onParse(Thread thread, Task task, List<Map<String, Object>> models) { final String projectRoot = FileUtil.getTopClassPath(TestSpider.class); final File dir = new File(projectRoot+"/Data/"+task.site.getName()+"/"+task.target.getName()); try { if (!dir.exists()) dir.mkdirs(); for (int i = 0; i < models.size(); i++) { Map<String, Object> map = models.get(i); String fileName = dir + "/count_" + task.site.counter.getCount() + i; StringBuilder sb = new StringBuilder(); for (Iterator<Entry<String,Object>> it = map.entrySet().iterator(); it.hasNext();){ Entry<String,Object> e = it.next(); boolean isBlank = false; if (e.getValue() == null) isBlank = true; else if (e.getValue() instanceof String && ((String)e.getValue()).trim().length() == 0) isBlank = true; else if (e.getValue() instanceof List && ((ArrayList<?>)e.getValue()).isEmpty()) isBlank = true; else if (e.getValue() instanceof List && !((ArrayList<?>)e.getValue()).isEmpty()) { if (((ArrayList<?>)e.getValue()).size() == 1 && String.valueOf(((ArrayList<?>)e.getValue()).get(0)).trim().length() == 0) isBlank = true; } if (isBlank){ if (sb.length() > 0) sb.append("_"); sb.append(e.getKey()); } } String content = CommonUtil.toJson(map); if (sb.length() > 0) fileName = fileName + "_no_"+sb.toString()+"_"; File file = new File(fileName+".json"); FileUtil.writeFile(file, content); System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [INFO] ~ "); System.out.println(fileName + " create finished..."); } } catch (Exception e) { e.printStackTrace(); } } }; //启动爬虫 Spiderman.me() .init(listener)//初始化 .startup()//启动 
.keepStrict("2h");//存活时间,过了存活时间后马上关闭 //启动爬虫 + 调度定时重启 //Spiderman.me() //.listen(listener)//设置监听器 //.schedule("10s")//调度,爬虫运行10s //.delay("2s")//每隔 10 + 2 秒后重启爬虫 //.times(3)//调度 3 次 //.startup()//启动 //.blocking();//阻塞直到所有调度完成 } }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/**
 * Sample driver for the Spiderman crawler.
 *
 * <p>Starts the EWeb4J framework, registers a {@link SpiderListener} that logs
 * every crawl lifecycle event, and writes each parsed data model as a JSON
 * file under {@code <projectRoot>/Data/<siteName>/<targetName>/}.
 */
public class TestSpider {

    @Test
    public void test() throws Exception {
        // Boot EWeb4J; a non-null return value is an error message.
        String err = EWeb4JConfig.start();
        if (err != null)
            throw new Exception(err);

        SpiderListener listener = new SpiderListenerAdaptor() {

            public void afterScheduleCancel() {
                // Invoked once the crawl schedule has been cancelled; nothing to do here.
            }

            /**
             * Invoked before every scheduled crawl run.
             *
             * @param theLastTimeScheduledAt when the previous run was scheduled
             */
            public void beforeEveryScheduleExecute(Date theLastTimeScheduledAt) {
                System.err.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [LAST_SCHEDULE_AT] ~ ");
                System.err.println("at -> " + CommonUtil.formatTime(theLastTimeScheduledAt));
            }

            public void onFetch(Thread thread, Task task, FetchResult result) {
                System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [FETCH] ~ ");
                System.out.println("fetch result ->" + result + " from -> " + task.sourceUrl);
            }

            public void onNewUrls(Thread thread, Task task, Collection<String> newUrls) {
                System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [DIG] ~ ");
                System.out.println(newUrls);
            }

            public void onDupRemoval(Thread currentThread, Task task, Collection<Task> validTasks) {
                // Intentionally silent; enable for per-task duplicate-removal tracing.
                // for (Task t : validTasks){
                //     System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [DUPREMOVE] ~ ");
                //     System.out.println(t.url+" from->"+t.sourceUrl);
                // }
            }

            public void onTaskSort(Thread currentThread, Task task, Collection<Task> afterSortTasks) {
                // Intentionally silent; enable for task-sorting tracing.
                // for (Task t : afterSortTasks){
                //     System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [SORT] ~ ");
                //     System.out.println(t.url+" from->"+t.sourceUrl);
                // }
            }

            public void onNewTasks(Thread thread, Task task, Collection<Task> newTasks) {
                // Intentionally silent; enable for new-task tracing.
                // for (Task t : newTasks){
                //     System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [NEWTASK] ~ ");
                //     System.out.println(t.sort + ",,,," + t.url+" from->"+t.sourceUrl);
                // }
            }

            public void onTargetPage(Thread thread, Task task, Page page) {
                // Intentionally silent; enable to log every matched target page.
                // System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [TARGET] ~ ");
                // System.out.println(page.getUrl());
            }

            public void onInfo(Thread thread, Task task, String info) {
                System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [INFO] ~ ");
                System.out.println(info);
            }

            public void onError(Thread thread, Task task, String err, Throwable e) {
                System.err.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [ERROR] ~ ");
                // Fix: the error message used to be silently dropped; print it before the trace.
                System.err.println(err);
                e.printStackTrace();
            }

            /**
             * Persists every parsed model as a JSON file. Keys whose values are
             * blank are collected into the file name (suffix {@code _no_<keys>_})
             * so incomplete records are easy to spot on disk.
             */
            public void onParse(Thread thread, Task task, List<Map<String, Object>> models) {
                final String projectRoot = FileUtil.getTopClassPath(TestSpider.class);
                final File dir = new File(projectRoot+"/Data/"+task.site.getName()+"/"+task.target.getName());
                try {
                    if (!dir.exists())
                        dir.mkdirs();
                    for (int i = 0; i < models.size(); i++) {
                        Map<String, Object> map = models.get(i);
                        String fileName = dir + "/count_" + task.site.counter.getCount() + i;
                        StringBuilder sb = new StringBuilder();
                        for (Entry<String, Object> entry : map.entrySet()) {
                            Object value = entry.getValue();
                            boolean isBlank = false;
                            if (value == null) {
                                isBlank = true;
                            } else if (value instanceof String && ((String) value).trim().length() == 0) {
                                isBlank = true;
                            } else if (value instanceof List) {
                                // Fix: cast to the List interface, not ArrayList — the old
                                // (ArrayList<?>) cast threw ClassCastException for any other
                                // List implementation. Empty list, or a single blank element,
                                // counts as blank.
                                List<?> list = (List<?>) value;
                                isBlank = list.isEmpty()
                                        || (list.size() == 1
                                            && String.valueOf(list.get(0)).trim().length() == 0);
                            }
                            if (isBlank) {
                                if (sb.length() > 0)
                                    sb.append("_");
                                sb.append(entry.getKey());
                            }
                        }
                        String content = CommonUtil.toJson(map);
                        if (sb.length() > 0)
                            fileName = fileName + "_no_"+sb.toString()+"_";
                        File file = new File(fileName+".json");
                        FileUtil.writeFile(file, content);
                        System.out.print("[SPIDERMAN] "+CommonUtil.getNowTime("HH:mm:ss")+" [INFO] ~ ");
                        System.out.println(fileName + " create finished...");
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        };

        // Start the crawler and keep it alive for 2 hours, then shut down hard.
        Spiderman.me()
            .init(listener)   // initialize with the listener above
            .startup()        // start crawling
            .keepStrict("2h");// lifetime; shut down immediately once elapsed

        // Alternative: start the crawler with scheduled periodic restarts.
        //Spiderman.me()
        //.listen(listener)//register the listener
        //.schedule("10s")//each scheduled run crawls for 10s
        //.delay("2s")//restart 10 + 2 seconds later
        //.times(3)//run the schedule 3 times
        //.startup()//start
        //.blocking();//block until all scheduled runs complete
    }
}
下面详细看看这个 sample 的配置文件:
首先有一个初始化配置文件 spiderman.properties,它就放在 #{ClassPath} 目录下
#网站配置文件放置目录 website.xml.folder=#{ClassPath}/WebSites #网站已访问url数据库存储目录 website.visited.folder=#{ClassPath}/dbEnv #http抓取失败重试次数 http.fetch.retry=3 #http连接超时,支持单位 s秒 m分 h时 d天,不写单位则表示s秒 http.fetch.timeout=5s
1
2
3
4
5
6
7
8
#网站配置文件放置目录
website.xml.folder=#{ClassPath}/WebSites
#网站已访问url数据库存储目录
website.visited.folder=#{ClassPath}/dbEnv
#http抓取失败重试次数
http.fetch.retry=3
#http连接超时,支持单位 s秒 m分 h时 d天,不写单位则表示s秒
http.fetch.timeout=5s
然后在 #{ClassPath}/WebSites 目录下有一份 oschina.xml
<span class="cp"><?xml version="1.0" encoding="UTF-8"?></span> <span class="c"><!-- | Spiderman Java开源垂直网络爬虫 | 项目主页: https://gitcafe.com/laiweiwei/Spiderman | author: l.weiwei@163.com | blog: http://laiweiweihi.iteye.com,http://my.oschina.net/laiweiwei | qq: 493781187 | email: l.weiwei@163.com | create: 2013-01-08 16:12 | update: 2013-04-10 18:06 --></span> <span class="nt"><beans></span> <span class="c"><!-- | name:名称 | url:种子链接 | skipStatusCode:设置哪些状态码需要忽略,多个用逗号隔开 | userAgent:设置爬虫标识 | includeHttps:0|1是否抓取https页 | isDupRemovalStrict:0|1是否严格去掉重复的TargetUrl,即已访问过一次的TargetUrl不会再被访问,若否,就算是重复的TargetUrl,只要它的来源URL不同,都会被访问 | isFollowRedirects:0|1是否递归跟随30X返回的location继续抓取 | reqDelay:{n}s|{n}m|{n}h|n每次请求之前延缓时间 | enable:0|1是否开启本网站的抓取 | charset:网站字符集 | schedule:调度时间,每隔多长时间重新从种子链接抓取 | thread:分配给本网站爬虫的线程数 | waitQueue:当任务队列空的时候爬虫等待多长时间再索取任务 | timeout:HTTP请求超时 --></span> <span class="nt"><site</span> <span class="na">name=</span><span class="s">"oschina"</span> <span class="na">includeHttps=</span><span class="s">"1"</span> <span class="na">url=</span><span class="s">"http://www.oschina.net/question?catalog=1&show=&p=1"</span> <span class="na">reqDelay=</span><span class="s">"1s"</span> <span class="na">enable=</span><span class="s">"0"</span> <span class="na">charset=</span><span class="s">"utf-8"</span> <span class="na">schedule=</span><span class="s">"1h"</span> <span class="na">thread=</span><span class="s">"2"</span> <span class="na">waitQueue=</span><span class="s">"10s"</span><span class="nt">></span> <span class="c"><!-- | 配置多个种子链接 | name:种子名称 | url:种子链接 --></span> <span class="c"><!--seeds> <seed name="" url="" /> </seeds--></span> <span class="c"><!-- | 告诉爬虫仅抓取以下这些host的链接,多数是应对二级或多级域名的情况 --></span> <span class="c"><!--validHosts> <validHost value="demo.eweb4j.org" /> <validHost value="wwww.eweb4j.org" /> </validHosts--></span> <span class="c"><!-- | HTTP Header <headers> <header name="" value="" /> </headers>--></span> <span class="c"><!-- | HTTP Cookie <cookies> <cookie 
name="" value="" host="" path="" /> </cookies>--></span> <span class="c"><!-- | 进入任务队列的URL规则 | policy:多个rule的策略,and | or --></span> <span class="nt"><queueRules</span> <span class="na">policy=</span><span class="s">"and"</span><span class="nt">></span> <span class="c"><!-- | 规则 | type:规则类型,包括 regex | equal | start | end | contains 所有规则可以在前面添加 "!" 表示取反 | value:值 --></span> <span class="nt"><rule</span> <span class="na">type=</span><span class="s">"!regex"</span> <span class="na">value=</span><span class="s">"^.*\.(jpg|png|gif)$"</span> <span class="nt">/></span> <span class="nt"></queueRules></span> <span class="c"><!-- | 抓取目标 --></span> <span class="nt"><targets></span> <span class="c"><!-- | 限制目标URL的来源,一般来说,对应的就是网站的频道页,例如某个分类下的新闻列表页 --></span> <span class="nt"><sourceRules</span> <span class="na">policy=</span><span class="s">"and"</span><span class="nt">></span> <span class="nt"><rule</span> <span class="na">type=</span><span class="s">"regex"</span> <span class="na">value=</span><span class="s">"http://www\.oschina\.net/question\?catalog=1&show=&p=\d+"</span><span class="nt">></span> <span class="c"><!-- | 定义如何在来源页面上挖掘新的 URL | 这个节点跟 <model> 节点是一样的结构,只不过名称不叫model而是叫做digUrls而已 --></span> <span class="nt"><digUrls></span> <span class="nt"><field</span> <span class="na">name=</span><span class="s">"page_url"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span> <span class="nt"><parsers></span> <span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='QuestionList']//ul[@class='pager']//li[@class='page']//a[@href]"</span> <span class="na">attribute=</span><span class="s">"href"</span> <span class="nt">/></span> <span class="nt"><parser</span> <span class="na">exp=</span><span class="s">"'http://www.oschina.net/question'+$this"</span> <span class="nt">/></span> <span class="nt"></parsers></span> <span class="nt"></field></span> <span class="nt"><field</span> <span class="na">name=</span><span 
class="s">"target_url"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span> <span class="nt"><parsers></span> <span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='QuestionList']//ul//li[@class='question']//div[@class='qbody']/h2[1]//a[@href]"</span> <span class="na">attribute=</span><span class="s">"href"</span> <span class="nt">/></span> <span class="nt"></parsers></span> <span class="nt"></field></span> <span class="nt"></digUrls></span> <span class="nt"></rule></span> <span class="nt"></sourceRules></span> <span class="c"><!-- | name:目标名称 --></span> <span class="nt"><target</span> <span class="na">name=</span><span class="s">"question"</span><span class="nt">></span> <span class="c"><!-- | 目标URL的规则 --></span> <span class="nt"><urlRules</span> <span class="na">policy=</span><span class="s">"and"</span><span class="nt">></span> <span class="nt"><rule</span> <span class="na">type=</span><span class="s">"regex"</span> <span class="na">value=</span><span class="s">"http://www\.oschina\.net/question/\d+_\d+"</span> <span class="nt">/></span> <span class="nt"></urlRules></span> <span class="c"><!-- | 目标网页的数据模型 | cType: 目标网页的contentType | isForceUseXmlParser:0|1 是否强制使用XML的解析器来解析目标网页,此选项可以让HTML页面支持XPath2.0 | isIgnoreComments:0|1 是否忽略注释 | isArray:0|1 目标网页是否有多个数据模型,一般一些RSS XML页面上就会有很多个数据模型需要解析,即在一个xml页面上解析多个Model对象 | xpath: 搭配 isArray 来使用,可选 --></span> <span class="nt"><model></span> <span class="c"><!-- | 目标网页的命名空间配置,一般用于xml页面 | prefix: 前缀 | uri: 关联的URI <namespaces> <namespace prefix="" uri="" /> </namespaces> --></span> <span class="c"><!-- | 属性的配置 | name:属性名称 | isArray:0|1 是否是多值 | isMergeArray:0|1 是否将多值合并,搭配isArray使用 | isParam:0|1 是否作为参数提供给别的field节点使用,如果是,则生命周期不会保持到最后 | isFinal:0|1 是否是不可变的参数,搭配isParam使用,如果是,第一次赋值之后不会再被改变 | isAlsoParseInNextPage:0|1 是否在分页的下一页里继续解析,用于目标网页有分页的情况 | isTrim:0|1 是否去掉前后空格 --></span> <span class="nt"><field</span> <span class="na">name=</span><span 
class="s">"title"</span><span class="nt">></span> <span class="nt"><parsers></span> <span class="c"><!-- | xpath: XPath规则,如果目标页面是XML,则可以使用2.0语法,否则HTML的话暂时只能1.0 | attribute:当使用XPath解析后的内容不是文本而是一个Node节点对象的时候,可以给定一个属性名获取其属性值例如<img src="" /> | regex:当使用XPath(包括attribute)规则获取到的文本内容不满足需求时,可以继续设置regex正则表达式进行解析 | exp:当使用XPath获取的文本(如果获取的不是文本则会先执行exp而不是regex否则先执行regex)不满足需求时,可以继续这是exp表达式进行解析 | exp表达式有几个内置对象和方法: | $output(Node): 这个是内置的output函数,作用是输出某个XML节点的结构内容。参数是一个XML节点对象,可以通过XPath获得 | $this: 当使用XPath获取到的是Node节点时,这个表示节点对象,否则表示Java的字符串对象,可以调用Java字符串API进行处理 | $Tags: 这个是内置的用于过滤标签的工具类 | $Tags.xml($output($this)).rm('p').ok() | $Tags.xml($this).rm('p').empty().ok() | $Attrs: 这个是内置的用于过滤属性的工具类 | $Attrs.xml($this).rm('style').ok() | $Attrs.xml($this).tag('img').rm('src').ok() | | $Tags和$Attrs可以一起使用: | $Tags.xml($this).rm('p').Attrs().rm('style').ok() | $Attrs.xml($this).rm('style').Tags().rm('p').ok() | skipErr:0|1 是否忽略错误消息 | skipRgxFail:0|1 是否忽略正则匹配失败,如果是,则会取失败前的值 --></span> <span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='QTitle']/h1/text()"</span><span class="nt">/></span> <span class="nt"></parsers></span> <span class="nt"></field></span> <span class="nt"><field</span> <span class="na">name=</span><span class="s">"content"</span><span class="nt">></span> <span class="nt"><parsers></span> <span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='Content']//div[@class='detail']"</span> <span class="na">exp=</span><span class="s">"$output($this)"</span> <span class="nt">/></span> <span class="c"><!--attribute 黑名单--></span> <span class="nt"><parser</span> <span class="na">exp=</span><span class="s">"$Attrs.xml($this).rm('class').rm('style').rm('width').rm('height').rm('usemap').rm('align').rm('border').rm('title').rm('alt').ok()"</span> <span class="nt">/></span> <span class="c"><!--tag 黑名单,去掉内嵌内容--></span> <span class="nt"><parser</span> <span class="na">exp=</span><span 
class="s">"$Tags.xml($this).rm('map').rm('iframe').rm('object').empty().ok()"</span> <span class="nt">/></span> <span class="c"><!--tag 白名单,保留的标签,除此之外都要删除(不删除其他标签内嵌内容)--></span> <span class="nt"><parser</span> <span class="na">exp=</span><span class="s">"$Tags.xml($this).kp('br').kp('h1').kp('h2').kp('h3').kp('h4').kp('h5').kp('h6').kp('table').kp('th').kp('tr').kp('td').kp('img').kp('p').kp('a').kp('ul').kp('ol').kp('li').kp('td').kp('em').kp('i').kp('u').kp('er').kp('b').kp('strong').ok()"</span> <span class="nt">/></span> <span class="c"><!--其他--></span> <span class="nt"></parsers></span> <span class="nt"></field></span> <span class="nt"><field</span> <span class="na">name=</span><span class="s">"author"</span><span class="nt">></span> <span class="nt"><parsers></span> <span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='stat']//a[@target='_blank']/text()"</span><span class="nt">/></span> <span class="nt"></parsers></span> <span class="nt"></field></span> <span class="nt"><field</span> <span class="na">name=</span><span class="s">"tags"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span> <span class="nt"><parsers></span> <span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='Tags']//a/text()"</span><span class="nt">/></span> <span class="nt"></parsers></span> <span class="nt"></field></span> <span class="nt"><field</span> <span class="na">name=</span><span class="s">"answers"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span> <span class="nt"><parsers></span> <span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//li[@class='Answer']//div[@class='detail']/text()"</span> <span class="nt">/></span> <span class="nt"></parsers></span> <span class="nt"></field></span> <span class="nt"></model></span> <span class="nt"></target></span> <span class="nt"></targets></span> <span class="c"><!-- 
| 插件 --></span> <span class="nt"><plugins></span> <span class="c"><!-- | enable:是否开启 | name:插件名 | version:插件版本 | desc:插件描述 --></span> <span class="nt"><plugin</span> <span class="na">enable=</span><span class="s">"1"</span> <span class="na">name=</span><span class="s">"spider_plugin"</span> <span class="na">version=</span><span class="s">"0.0.1"</span> <span class="na">desc=</span><span class="s">"这是一个官方实现的默认插件,实现了所有扩展点。"</span><span class="nt">></span> <span class="c"><!-- | 每个插件包含了对若干扩展点的实现 --></span> <span class="nt"><extensions></span> <span class="c"><!-- | point:扩展点名它们包括 task_poll, begin, fetch, dig, dup_removal, task_sort, task_push, target, parse, pojo, end --></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"task_poll"</span><span class="nt">></span> <span class="c"><!-- | 扩展点实现类 | type: 如何获取实现类 ,默认通过无参构造器实例化给定的类名,可以设置为ioc,这样就会从EWeb4J的IOC容器里获取 | value: 当时type=ioc的时候填写IOC的bean_id,否则填写完整类名 | sort: 排序,同一个扩展点有多个实现类,这些实现类会以责任链的方式进行执行,因此它们的执行顺序将变得很重要 --></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TaskPollPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"begin"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.BeginPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"fetch"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span 
class="s">"org.eweb4j.spiderman.plugin.impl.FetchPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"dig"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.DigPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"dup_removal"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.DupRemovalPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"task_sort"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TaskSortPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"task_push"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TaskPushPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span 
class="s">"target"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TargetPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"parse"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.ParsePointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"><extension</span> <span class="na">point=</span><span class="s">"end"</span><span class="nt">></span> <span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.EndPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span> <span class="nt"></extension></span> <span class="nt"></extensions></span> <span class="nt"><providers></span> <span class="nt"><provider></span> <span class="nt"><orgnization</span> <span class="na">name=</span><span class="s">"CFuture"</span> <span class="na">website=</span><span class="s">"http://lurencun.com"</span> <span class="na">desc=</span><span class="s">"Color your future"</span><span class="nt">></span> <span class="nt"><author</span> <span class="na">name=</span><span class="s">"weiwei"</span> <span class="na">website=</span><span class="s">"http://laiweiweihi.iteye.com | http://my.oschina.net/laiweiwei"</span> <span class="na">email=</span><span class="s">"l.weiwei@163.com"</span> <span class="na">weibo=</span><span class="s">"http://weibo.com/weiweimiss"</span> <span class="na">desc=</span><span 
class="s">"一个喜欢自由、音乐、绘画的IT老男孩"</span> <span class="nt">/></span> <span class="nt"></orgnization></span> <span class="nt"></provider></span> <span class="nt"></providers></span> <span class="nt"></plugin></span> <span class="nt"></plugins></span> <span class="nt"></site></span> <span class="nt"></beans></span>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
<span class="cp"><?xml version="1.0" encoding="UTF-8"?></span>
<span class="c"><!--
| Spiderman Java开源垂直网络爬虫
| 项目主页: https://gitcafe.com/laiweiwei/Spiderman
| author: l.weiwei@163.com
| blog: http://laiweiweihi.iteye.com,http://my.oschina.net/laiweiwei
| qq: 493781187
| email: l.weiwei@163.com
| create: 2013-01-08 16:12
| update: 2013-04-10 18:06
--></span>
<span class="nt"><beans></span>
<span class="c"><!--
| name:名称
| url:种子链接
| skipStatusCode:设置哪些状态码需要忽略,多个用逗号隔开
| userAgent:设置爬虫标识
| includeHttps:0|1是否抓取https页
| isDupRemovalStrict:0|1是否严格去掉重复的TargetUrl,即已访问过一次的TargetUrl不会再被访问,若否,就算是重复的TargetUrl,只要它的来源URL不同,都会被访问
| isFollowRedirects:0|1是否递归跟随30X返回的location继续抓取
| reqDelay:{n}s|{n}m|{n}h|n每次请求之前延缓时间
| enable:0|1是否开启本网站的抓取
| charset:网站字符集
| schedule:调度时间,每隔多长时间重新从种子链接抓取
| thread:分配给本网站爬虫的线程数
| waitQueue:当任务队列空的时候爬虫等待多长时间再索取任务
| timeout:HTTP请求超时
--></span>
<span class="nt"><site</span> <span class="na">name=</span><span class="s">"oschina"</span> <span class="na">includeHttps=</span><span class="s">"1"</span> <span class="na">url=</span><span class="s">"http://www.oschina.net/question?catalog=1&show=&p=1"</span> <span class="na">reqDelay=</span><span class="s">"1s"</span> <span class="na">enable=</span><span class="s">"0"</span> <span class="na">charset=</span><span class="s">"utf-8"</span> <span class="na">schedule=</span><span class="s">"1h"</span> <span class="na">thread=</span><span class="s">"2"</span> <span class="na">waitQueue=</span><span class="s">"10s"</span><span class="nt">></span>
<span class="c"><!--
| 配置多个种子链接
| name:种子名称
| url:种子链接
--></span>
<span class="c"><!--seeds>
<seed name="" url="" />
</seeds--></span>
<span class="c"><!--
| 告诉爬虫仅抓取以下这些host的链接,多数是应对二级或多级域名的情况
--></span>
<span class="c"><!--validHosts>
<validHost value="demo.eweb4j.org" />
<validHost value="wwww.eweb4j.org" />
</validHosts--></span>
<span class="c"><!--
| HTTP Header
<headers>
<header name="" value="" />
</headers>--></span>
<span class="c"><!--
| HTTP Cookie
<cookies>
<cookie name="" value="" host="" path="" />
</cookies>--></span>
<span class="c"><!--
| 进入任务队列的URL规则
| policy:多个rule的策略,and | or
--></span>
<span class="nt"><queueRules</span> <span class="na">policy=</span><span class="s">"and"</span><span class="nt">></span>
<span class="c"><!--
| 规则
| type:规则类型,包括 regex | equal | start | end | contains 所有规则可以在前面添加 "!" 表示取反
| value:值
--></span>
<span class="nt"><rule</span> <span class="na">type=</span><span class="s">"!regex"</span> <span class="na">value=</span><span class="s">"^.*\.(jpg|png|gif)$"</span> <span class="nt">/></span>
<span class="nt"></queueRules></span>
<span class="c"><!--
| 抓取目标
--></span>
<span class="nt"><targets></span>
<span class="c"><!--
| 限制目标URL的来源,一般来说,对应的就是网站的频道页,例如某个分类下的新闻列表页
--></span>
<span class="nt"><sourceRules</span> <span class="na">policy=</span><span class="s">"and"</span><span class="nt">></span>
<span class="nt"><rule</span> <span class="na">type=</span><span class="s">"regex"</span> <span class="na">value=</span><span class="s">"http://www\.oschina\.net/question\?catalog=1&show=&p=\d+"</span><span class="nt">></span>
<span class="c"><!--
| 定义如何在来源页面上挖掘新的 URL
| 这个节点跟 <model> 节点是一样的结构,只不过名称不叫model而是叫做digUrls而已
--></span>
<span class="nt"><digUrls></span>
<span class="nt"><field</span> <span class="na">name=</span><span class="s">"page_url"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span>
<span class="nt"><parsers></span>
<span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='QuestionList']//ul[@class='pager']//li[@class='page']//a[@href]"</span> <span class="na">attribute=</span><span class="s">"href"</span> <span class="nt">/></span>
<span class="nt"><parser</span> <span class="na">exp=</span><span class="s">"'http://www.oschina.net/question'+$this"</span> <span class="nt">/></span>
<span class="nt"></parsers></span>
<span class="nt"></field></span>
<span class="nt"><field</span> <span class="na">name=</span><span class="s">"target_url"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span>
<span class="nt"><parsers></span>
<span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='QuestionList']//ul//li[@class='question']//div[@class='qbody']/h2[1]//a[@href]"</span> <span class="na">attribute=</span><span class="s">"href"</span> <span class="nt">/></span>
<span class="nt"></parsers></span>
<span class="nt"></field></span>
<span class="nt"></digUrls></span>
<span class="nt"></rule></span>
<span class="nt"></sourceRules></span>
<span class="c"><!--
| name:目标名称
--></span>
<span class="nt"><target</span> <span class="na">name=</span><span class="s">"question"</span><span class="nt">></span>
<span class="c"><!--
| 目标URL的规则
--></span>
<span class="nt"><urlRules</span> <span class="na">policy=</span><span class="s">"and"</span><span class="nt">></span>
<span class="nt"><rule</span> <span class="na">type=</span><span class="s">"regex"</span> <span class="na">value=</span><span class="s">"http://www\.oschina\.net/question/\d+_\d+"</span> <span class="nt">/></span>
<span class="nt"></urlRules></span>
<span class="c"><!--
| 目标网页的数据模型
| cType: 目标网页的contentType
| isForceUseXmlParser:0|1 是否强制使用XML的解析器来解析目标网页,此选项可以让HTML页面支持XPath2.0
| isIgnoreComments:0|1 是否忽略注释
| isArray:0|1 目标网页是否有多个数据模型,一般一些RSS XML页面上就会有很多个数据模型需要解析,即在一个xml页面上解析多个Model对象
| xpath: 搭配 isArray 来使用,可选
--></span>
<span class="nt"><model></span>
<span class="c"><!--
| 目标网页的命名空间配置,一般用于xml页面
| prefix: 前缀
| uri: 关联的URI
<namespaces>
<namespace prefix="" uri="" />
</namespaces>
--></span>
<span class="c"><!--
| 属性的配置
| name:属性名称
| isArray:0|1 是否是多值
| isMergeArray:0|1 是否将多值合并,搭配isArray使用
| isParam:0|1 是否作为参数提供给别的field节点使用,如果是,则生命周期不会保持到最后
| isFinal:0|1 是否是不可变的参数,搭配isParam使用,如果是,第一次赋值之后不会再被改变
| isAlsoParseInNextPage:0|1 是否在分页的下一页里继续解析,用于目标网页有分页的情况
| isTrim:0|1 是否去掉前后空格
--></span>
<span class="nt"><field</span> <span class="na">name=</span><span class="s">"title"</span><span class="nt">></span>
<span class="nt"><parsers></span>
<span class="c"><!--
| xpath: XPath规则,如果目标页面是XML,则可以使用2.0语法,否则HTML的话暂时只能1.0
| attribute:当使用XPath解析后的内容不是文本而是一个Node节点对象的时候,可以给定一个属性名获取其属性值,例如<img src="" />
| regex:当使用XPath(包括attribute)规则获取到的文本内容不满足需求时,可以继续设置regex正则表达式进行解析
| exp:当使用XPath获取的文本(如果获取的不是文本,则会先执行exp而不是regex,否则先执行regex)不满足需求时,可以继续设置exp表达式进行解析
| exp表达式有几个内置对象和方法:
| $output(Node): 这个是内置的output函数,作用是输出某个XML节点的结构内容。参数是一个XML节点对象,可以通过XPath获得
| $this: 当使用XPath获取到的是Node节点时,这个表示节点对象,否则表示Java的字符串对象,可以调用Java字符串API进行处理
| $Tags: 这个是内置的用于过滤标签的工具类
| $Tags.xml($output($this)).rm('p').ok()
| $Tags.xml($this).rm('p').empty().ok()
| $Attrs: 这个是内置的用于过滤属性的工具类
| $Attrs.xml($this).rm('style').ok()
| $Attrs.xml($this).tag('img').rm('src').ok()
|
| $Tags和$Attrs可以一起使用:
| $Tags.xml($this).rm('p').Attrs().rm('style').ok()
| $Attrs.xml($this).rm('style').Tags().rm('p').ok()
| skipErr:0|1 是否忽略错误消息
| skipRgxFail:0|1 是否忽略正则匹配失败,如果是,则会取失败前的值
--></span>
<span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='QTitle']/h1/text()"</span><span class="nt">/></span>
<span class="nt"></parsers></span>
<span class="nt"></field></span>
<span class="nt"><field</span> <span class="na">name=</span><span class="s">"content"</span><span class="nt">></span>
<span class="nt"><parsers></span>
<span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='Content']//div[@class='detail']"</span> <span class="na">exp=</span><span class="s">"$output($this)"</span> <span class="nt">/></span>
<span class="c"><!--attribute 黑名单--></span>
<span class="nt"><parser</span> <span class="na">exp=</span><span class="s">"$Attrs.xml($this).rm('class').rm('style').rm('width').rm('height').rm('usemap').rm('align').rm('border').rm('title').rm('alt').ok()"</span> <span class="nt">/></span>
<span class="c"><!--tag 黑名单,去掉内嵌内容--></span>
<span class="nt"><parser</span> <span class="na">exp=</span><span class="s">"$Tags.xml($this).rm('map').rm('iframe').rm('object').empty().ok()"</span> <span class="nt">/></span>
<span class="c"><!--tag 白名单,保留的标签,除此之外都要删除(不删除其他标签内嵌内容)--></span>
<span class="nt"><parser</span> <span class="na">exp=</span><span class="s">"$Tags.xml($this).kp('br').kp('h1').kp('h2').kp('h3').kp('h4').kp('h5').kp('h6').kp('table').kp('th').kp('tr').kp('td').kp('img').kp('p').kp('a').kp('ul').kp('ol').kp('li').kp('td').kp('em').kp('i').kp('u').kp('er').kp('b').kp('strong').ok()"</span> <span class="nt">/></span>
<span class="c"><!--其他--></span>
<span class="nt"></parsers></span>
<span class="nt"></field></span>
<span class="nt"><field</span> <span class="na">name=</span><span class="s">"author"</span><span class="nt">></span>
<span class="nt"><parsers></span>
<span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='stat']//a[@target='_blank']/text()"</span><span class="nt">/></span>
<span class="nt"></parsers></span>
<span class="nt"></field></span>
<span class="nt"><field</span> <span class="na">name=</span><span class="s">"tags"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span>
<span class="nt"><parsers></span>
<span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//div[@class='Tags']//a/text()"</span><span class="nt">/></span>
<span class="nt"></parsers></span>
<span class="nt"></field></span>
<span class="nt"><field</span> <span class="na">name=</span><span class="s">"answers"</span> <span class="na">isArray=</span><span class="s">"1"</span><span class="nt">></span>
<span class="nt"><parsers></span>
<span class="nt"><parser</span> <span class="na">xpath=</span><span class="s">"//li[@class='Answer']//div[@class='detail']/text()"</span> <span class="nt">/></span>
<span class="nt"></parsers></span>
<span class="nt"></field></span>
<span class="nt"></model></span>
<span class="nt"></target></span>
<span class="nt"></targets></span>
<span class="c"><!--
| 插件
--></span>
<span class="nt"><plugins></span>
<span class="c"><!--
| enable:是否开启
| name:插件名
| version:插件版本
| desc:插件描述
--></span>
<span class="nt"><plugin</span> <span class="na">enable=</span><span class="s">"1"</span> <span class="na">name=</span><span class="s">"spider_plugin"</span> <span class="na">version=</span><span class="s">"0.0.1"</span> <span class="na">desc=</span><span class="s">"这是一个官方实现的默认插件,实现了所有扩展点。"</span><span class="nt">></span>
<span class="c"><!--
| 每个插件包含了对若干扩展点的实现
--></span>
<span class="nt"><extensions></span>
<span class="c"><!--
| point:扩展点名,它们包括 task_poll, begin, fetch, dig, dup_removal, task_sort, task_push, target, parse, pojo, end
--></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"task_poll"</span><span class="nt">></span>
<span class="c"><!--
| 扩展点实现类
| type: 如何获取实现类 ,默认通过无参构造器实例化给定的类名,可以设置为ioc,这样就会从EWeb4J的IOC容器里获取
| value: 当type=ioc的时候填写IOC的bean_id,否则填写完整类名
| sort: 排序,同一个扩展点有多个实现类,这些实现类会以责任链的方式进行执行,因此它们的执行顺序将变得很重要
--></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TaskPollPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"begin"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.BeginPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"fetch"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.FetchPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"dig"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.DigPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"dup_removal"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.DupRemovalPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"task_sort"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TaskSortPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"task_push"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TaskPushPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"target"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.TargetPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"parse"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.ParsePointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"><extension</span> <span class="na">point=</span><span class="s">"end"</span><span class="nt">></span>
<span class="nt"><impl</span> <span class="na">type=</span><span class="s">""</span> <span class="na">value=</span><span class="s">"org.eweb4j.spiderman.plugin.impl.EndPointImpl"</span> <span class="na">sort=</span><span class="s">"0"</span><span class="nt">/></span>
<span class="nt"></extension></span>
<span class="nt"></extensions></span>
<span class="nt"><providers></span>
<span class="nt"><provider></span>
<span class="nt"><orgnization</span> <span class="na">name=</span><span class="s">"CFuture"</span> <span class="na">website=</span><span class="s">"http://lurencun.com"</span> <span class="na">desc=</span><span class="s">"Color your future"</span><span class="nt">></span>
<span class="nt"><author</span> <span class="na">name=</span><span class="s">"weiwei"</span> <span class="na">website=</span><span class="s">"http://laiweiweihi.iteye.com | http://my.oschina.net/laiweiwei"</span> <span class="na">email=</span><span class="s">"l.weiwei@163.com"</span> <span class="na">weibo=</span><span class="s">"http://weibo.com/weiweimiss"</span> <span class="na">desc=</span><span class="s">"一个喜欢自由、音乐、绘画的IT老男孩"</span> <span class="nt">/></span>
<span class="nt"></orgnization></span>
<span class="nt"></provider></span>
<span class="nt"></providers></span>
<span class="nt"></plugin></span>
<span class="nt"></plugins></span>
<span class="nt"></site></span>
<span class="nt"></beans></span>
抓取过于频繁时,服务器会返回429。这个时候需要切换代理IP了,推荐使用阿布云代理。阿布云代理IP提供高匿代理、爬虫代理。