Crawler Builder low-level API optimization
xuxueli committed Nov 9, 2017
1 parent 78f9253 commit 300d028
Showing 7 changed files with 33 additions and 26 deletions.
2 changes: 2 additions & 0 deletions doc/XXL-CRAWLER官方文档.md
@@ -201,6 +201,8 @@ ProxyMaker (proxy maker): the component implementing proxy support. Supports setting…
  - 6、Dynamic proxy: supports adjusting the proxy pool dynamically at runtime, plus custom proxy-pool routing strategies;

  ### Version V1.1.1, new features [in progress]
+ - 1、Crawler Builder low-level API optimization;
+ - 1、Support for setting request Headers;

  ### TODO LIST
  - 1、Extend SelectType;
38 changes: 21 additions & 17 deletions src/main/java/com/xuxueli/crawler/XxlCrawler.java
@@ -21,19 +21,19 @@ public class XxlCrawler {
      private static Logger logger = LoggerFactory.getLogger(XxlCrawler.class);

      // url
-     private volatile LinkedBlockingQueue<String> unVisitedUrlQueue = new LinkedBlockingQueue<String>();   // URLs not yet visited
-     private volatile Set<String> visitedUrlSet = Collections.synchronizedSet(new HashSet<String>());;     // URLs already visited
-     private volatile boolean allowSpread = true;    // allow spread crawling: the crawler spreads out from the existing URLs to crawl the entire site
-     private Set<String> whiteUrlRegexs;             // URL whitelist regexes; when non-empty, pages are filtered by the whitelist
+     private volatile LinkedBlockingQueue<String> unVisitedUrlQueue = new LinkedBlockingQueue<String>();   // URLs not yet visited
+     private volatile Set<String> visitedUrlSet = Collections.synchronizedSet(new HashSet<String>());      // URLs already visited
+     private volatile boolean allowSpread = true;                                                          // allow spread crawling: the crawler spreads out from the existing URLs to crawl the entire site
+     private Set<String> whiteUrlRegexs = Collections.synchronizedSet(new HashSet<String>());              // URL whitelist regexes; when non-empty, pages are filtered by the whitelist

      // site
-     private volatile boolean ifPost = false;            // request method: true=POST, false=GET
-     private volatile String userAgent = XxlCrawlerConf.USER_AGENT_SAMPLE;   // UserAgent
-     private volatile Map<String, String> paramMap;      // request parameters
-     private volatile Map<String, String> cookieMap;     // request cookies
-     private volatile int timeoutMillis = XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT;  // timeout, in milliseconds
-     private volatile int pauseMillis = 0;               // pause time: crawler threads deliberately pause after processing a page, to avoid being blocked for overly frequent requests;
-     private volatile ProxyMaker proxyMaker;             // proxy maker
+     private volatile boolean ifPost = false;                                     // request method: true=POST, false=GET
+     private volatile String userAgent = XxlCrawlerConf.USER_AGENT_SAMPLE;        // UserAgent
+     private volatile Map<String, String> paramMap;                               // request parameters
+     private volatile Map<String, String> cookieMap;                              // request cookies
+     private volatile int timeoutMillis = XxlCrawlerConf.TIMEOUT_MILLIS_DEFAULT;  // timeout, in milliseconds
+     private volatile int pauseMillis = 0;                                        // pause time: crawler threads deliberately pause after processing a page, to avoid being blocked for overly frequent requests;
+     private volatile ProxyMaker proxyMaker;                                      // proxy maker

      // thread
      private int threadCount = 1;    // number of crawler threads
@@ -51,12 +51,12 @@ public static class Builder {
          /**
           * URLs to crawl
           *
-          * @param urlSet
+          * @param urls
           * @return
           */
-         public Builder setUrls(Set<String> urlSet) {
-             if (urlSet!=null && urlSet.size()>0) {
-                 for (String url: urlSet) {
+         public Builder setUrls(String... urls) {
+             if (urls!=null && urls.length>0) {
+                 for (String url: urls) {
                      crawler.addUrl(url);
                  }
              }
@@ -80,8 +80,12 @@ public Builder setAllowSpread(boolean allowSpread) {
           * @param whiteUrlRegexs
           * @return
           */
-         public Builder setWhiteUrlRegexs(Set<String> whiteUrlRegexs) {
-             crawler.whiteUrlRegexs = whiteUrlRegexs;
+         public Builder setWhiteUrlRegexs(String... whiteUrlRegexs) {
+             if (whiteUrlRegexs!=null && whiteUrlRegexs.length>0) {
+                 for (String whiteUrlRegex: whiteUrlRegexs) {
+                     crawler.whiteUrlRegexs.add(whiteUrlRegex);
+                 }
+             }
              return this;
          }

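The net effect of this file's diff: setUrls and setWhiteUrlRegexs now take varargs and append into sets the crawler pre-initializes, instead of accepting a caller-built Set. A minimal before/after sketch of a call site (URLs taken from the test classes below; the setPageParser wiring is elided, and the terminal build() call is an assumption since the test diffs truncate before the end of the chain):

    // Before this commit: every argument had to be wrapped in a Set by hand
    XxlCrawler crawlerOld = new XxlCrawler.Builder()
            .setUrls(new HashSet<String>(Arrays.asList("https://my.oschina.net/xuxueli/blog")))
            .setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("https://my\\.oschina\\.net/xuxueli/blog/\\d+")))
            .setThreadCount(3)
            // ... .setPageParser(...) as in the test classes below
            .build();   // assumed terminal call; not shown in the truncated diffs

    // After: one or more strings are passed directly
    XxlCrawler crawlerNew = new XxlCrawler.Builder()
            .setUrls("https://my.oschina.net/xuxueli/blog")
            .setWhiteUrlRegexs("https://my\\.oschina\\.net/xuxueli/blog/\\d+")
            .setThreadCount(3)
            // ... .setPageParser(...) as in the test classes below
            .build();   // assumed terminal call

Pre-initializing whiteUrlRegexs to a Collections.synchronizedSet is what lets the new setter append incrementally; it also means the field is never null, which presumably simplifies the whitelist check in the crawl loop.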
5 changes: 3 additions & 2 deletions src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest.java
@@ -6,6 +6,7 @@
  import com.xuxueli.crawler.parser.PageParser;
  import org.jsoup.nodes.Document;

+ import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.HashSet;
  import java.util.List;
@@ -66,8 +67,8 @@ public String toString() {
      public static void main(String[] args) {

          XxlCrawler crawler = new XxlCrawler.Builder()
-                 .setUrls(new HashSet<String>(Arrays.asList("https://my.oschina.net/xuxueli/blog")))
-                 .setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("https://my\\.oschina\\.net/xuxueli/blog/\\d+")))
+                 .setUrls("https://my.oschina.net/xuxueli/blog")
+                 .setWhiteUrlRegexs("https://my\\.oschina\\.net/xuxueli/blog/\\d+")
                  .setThreadCount(3)
                  .setPageParser(new PageParser<PageVo>() {
                      @Override
4 changes: 2 additions & 2 deletions src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest02.java
@@ -19,8 +19,8 @@ public class XxlCrawlerTest02 {
      public static void main(String[] args) {

          XxlCrawler crawler = new XxlCrawler.Builder()
-                 .setUrls(new HashSet<String>(Arrays.asList("https://my.oschina.net/xuxueli/blog")))
-                 .setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("https://my\\.oschina\\.net/xuxueli/blog/\\d+")))
+                 .setUrls("https://my.oschina.net/xuxueli/blog")
+                 .setWhiteUrlRegexs("https://my\\.oschina\\.net/xuxueli/blog/\\d+")
                  .setThreadCount(3)
                  .setPageParser(new PageParser<Object>() {
                      @Override
4 changes: 2 additions & 2 deletions src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest03.java
@@ -45,8 +45,8 @@ public String toString() {
      public static void main(String[] args) {

          XxlCrawler crawler = new XxlCrawler.Builder()
-                 .setUrls(new HashSet<String>(Arrays.asList("https://my.oschina.net/xuxueli/blog")))
-                 .setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("https://my\\.oschina\\.net/xuxueli/blog/\\d+")))
+                 .setUrls("https://my.oschina.net/xuxueli/blog")
+                 .setWhiteUrlRegexs("https://my\\.oschina\\.net/xuxueli/blog/\\d+")
                  .setThreadCount(3)
                  .setPageParser(new PageParser<PageVo>() {
                      @Override
2 changes: 1 addition & 1 deletion src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest04.java
@@ -28,7 +28,7 @@ public static void main(String[] args) {

          // Build the crawler (requests the IP-lookup site IP138 through a proxy; the page response shows whether the proxy took effect)
          XxlCrawler crawler = new XxlCrawler.Builder()
-                 .setUrls(new HashSet<String>(Arrays.asList("http://2017.ip138.com/ic.asp")))
+                 .setUrls("http://2017.ip138.com/ic.asp")
                  .setAllowSpread(false)
                  .setProxyMaker(proxyMaker)
                  .setPageParser(new PageParser<Object>() {
4 changes: 2 additions & 2 deletions src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest05.java
@@ -65,8 +65,8 @@ public static void main(String[] args) {

          // Build the crawler
          XxlCrawler crawler = new XxlCrawler.Builder()
-                 .setUrls(new HashSet<String>(Arrays.asList("http://www.ip181.com/daili/1.html")))
-                 .setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("http://www.ip181.com/daili/\\b[1-2].html")))   // first 2 pages of data
+                 .setUrls("http://www.ip181.com/daili/1.html")
+                 .setWhiteUrlRegexs("http://www.ip181.com/daili/\\b[1-2].html")   // first 2 pages of data
                  //.setWhiteUrlRegexs(new HashSet<String>(Arrays.asList("http://www.ip181.com/daili/\\\\d+.html")))   // all data
                  .setThreadCount(10)
                  .setPageParser(new PageParser<PageVo>() {
