Skip to content

Commit

Permalink
动态代理
Browse files Browse the repository at this point in the history
  • Loading branch information
xuxueli committed Nov 8, 2017
1 parent 1cd9ba7 commit ad3ec0c
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 26 deletions.
19 changes: 19 additions & 0 deletions doc/XXL-CRAWLER官方文档.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,11 @@ setParamMap | 请求参数
setCookieMap | 请求Cookie
setTimeoutMillis | 超时时间,毫秒
setPauseMillis | 停顿时间,爬虫线程处理完页面之后进行主动停顿,避免过于频繁被拦截;
setProxyMaker | 代理生成器,支持设置代理IP,同时支持调整代理池实现动态代理;
setThreadCount | 爬虫并发线程数
setPageParser | 页面解析器
start | 运行爬虫,可通过入参控制同步或异步方式运行
stop | 终止爬虫

### 3.4 核心注解:PageSelect

Expand All @@ -157,6 +160,22 @@ selectType | jquery 数据抽取方式,如 ".html()/.text()/.val()/.attr()"等
selectVal | jquery 数据抽取参数,SelectType=ATTR 时有效,如 ".attr("abs:src")"
datePattern | 时间格式化,日期类型数据有效

### 3.6 多线程
以线程池方式并行运行,提供对应API(可参考"章节3.3")调整线程池大小,提高运行效率;

### 3.7 异步
支持同步、异步两种方式启动运行。

- 同步:将会阻塞业务逻辑,爬虫爬取完全部页面后才会继续执行后续逻辑。
- 异步:不会阻塞业务逻辑,爬虫逻辑以异步方式运行。

### 3.8 动态代理
ProxyMaker(代理生成器):实现代理支持的组件。支持设置代理IP,同时支持调整代理池实现动态代理;

系统已经提供了两种策略实现;
- RoundProxyMaker(循环代理生成器): 以循环方式获取代理池中代理;
- RandomProxyMaker(随机代理生成器): 以随机方式获取代理池中代理;


## 四、版本更新日志
### 版本 V1.0.0,新特性[2017-09-13]
Expand Down
19 changes: 11 additions & 8 deletions src/main/java/com/xuxueli/crawler/proxy/ProxyMaker.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package com.xuxueli.crawler.proxy;

import java.net.Proxy;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;

/**
* proxy maker
Expand All @@ -11,18 +11,21 @@
*/
public abstract class ProxyMaker {

protected List<Proxy> proxyList = new ArrayList<Proxy>(); // 请求代理池,对抗反采集策略规则WAF
protected List<Proxy> proxyList = new CopyOnWriteArrayList<Proxy>(); // 请求代理池,对抗反采集策略规则WAF

public ProxyMaker(List<Proxy> proxyList) {
this.proxyList = proxyList;
public ProxyMaker addProxy(Proxy proxy) {
this.proxyList.add(proxy);
return this;
}

public void setProxyList(List<Proxy> proxyList) {
this.proxyList = proxyList;
public ProxyMaker addProxyList(List<Proxy> proxyList) {
this.proxyList.addAll(proxyList);
return this;
}

public void addProxyList(List<Proxy> proxyList) {
this.proxyList.addAll(proxyList);
public ProxyMaker clear() {
this.proxyList.clear();
return this;
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
package com.xuxueli.crawler.proxy.impl;
package com.xuxueli.crawler.proxy.strategy;

import com.xuxueli.crawler.proxy.ProxyMaker;

import java.net.Proxy;
import java.util.List;
import java.util.Random;

/**
Expand All @@ -15,16 +14,17 @@ public class RandomProxyMaker extends ProxyMaker {

private Random random = new Random();

public RandomProxyMaker(List<Proxy> proxyList) {
super(proxyList);
}

@Override
public Proxy make() {
if (super.proxyList!=null && super.proxyList.size()>0) {
return super.proxyList.get(random.nextInt(super.proxyList.size()));
if (super.proxyList==null || super.proxyList.size()==0) {
return null;
}
return null;

if (super.proxyList.size() == 1) {
super.proxyList.get(0);
}

return super.proxyList.get(random.nextInt(super.proxyList.size()));
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package com.xuxueli.crawler.proxy.strategy;

import com.xuxueli.crawler.proxy.ProxyMaker;

import java.net.Proxy;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Round-robin proxy maker: each call to {@link #make()} hands out the next
 * proxy of the pool inherited from {@link ProxyMaker}, wrapping around to the
 * first proxy once the end of the pool is reached.
 */
public class RoundProxyMaker extends ProxyMaker {

    /** Number of rotations handed out so far; only ever advanced atomically. */
    private final AtomicInteger count = new AtomicInteger(0);

    /**
     * Picks the next proxy in rotation.
     *
     * @return the next proxy of the pool, or {@code null} when the pool is
     *         {@code null} or empty
     */
    @Override
    public Proxy make() {
        if (super.proxyList == null) {
            return null;
        }

        // Work on a private copy so that the emptiness check, the modulo and
        // the element access all see the same pool contents, even if the pool
        // is changed (e.g. cleared) by another thread in the meantime.
        Proxy[] snapshot = super.proxyList.toArray(new Proxy[0]);
        if (snapshot.length == 0) {
            return null;
        }
        if (snapshot.length == 1) {
            // Nothing to rotate over; the original computed this value but forgot to return it.
            return snapshot[0];
        }

        // A single atomic getAndIncrement replaces the former non-atomic
        // "increment, compare with 100000, reset" sequence. Masking the sign
        // bit keeps the index non-negative after the int counter overflows.
        int ticket = count.getAndIncrement() & Integer.MAX_VALUE;
        return snapshot[ticket % snapshot.length];
    }

}
16 changes: 7 additions & 9 deletions src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest04.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import com.xuxueli.crawler.XxlCrawler;
import com.xuxueli.crawler.parser.PageParser;
import com.xuxueli.crawler.proxy.ProxyMaker;
import com.xuxueli.crawler.proxy.impl.RandomProxyMaker;
import com.xuxueli.crawler.proxy.strategy.RoundProxyMaker;
import org.jsoup.nodes.Document;

import java.net.InetSocketAddress;
Expand All @@ -21,16 +21,14 @@ public class XxlCrawlerTest04 {

public static void main(String[] args) {

// 设置代理池
ProxyMaker proxyMaker = new RandomProxyMaker(Arrays.asList(
new Proxy(Proxy.Type.HTTP, new InetSocketAddress("---", 8080)),
new Proxy(Proxy.Type.HTTP, new InetSocketAddress("---", 8080))
));
// 设置代理池 (免费代理可从ip181或kxdaili获取,免费代理不稳定可以多试几个;仅供学习测试使用,如有侵犯请联系删除; )
ProxyMaker proxyMaker = new RoundProxyMaker()
.addProxy(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("124.88.67.63", 80)));

// 构造爬虫
// 构造爬虫 (爬取页面为IP地址查询网IP138,可从页面响应确认代理是否生效)
XxlCrawler crawler = new XxlCrawler.Builder()
.setUrls(new HashSet<String>(Arrays.asList("https://my.oschina.net/xuxueli/blog")))
.setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("https://my\\.oschina\\.net/xuxueli/blog/\\d+")))
.setUrls(new HashSet<String>(Arrays.asList("http://2017.ip138.com/ic.asp")))
.setAllowSpread(false)
.setProxyMaker(proxyMaker)
.setPageParser(new PageParser<Object>() {
@Override
Expand Down

0 comments on commit ad3ec0c

Please sign in to comment.