Support setting multiple UserAgents with rotation
xuxueli committed Nov 9, 2017
1 parent 6cbc9d6 commit 1c68819
Showing 5 changed files with 29 additions and 12 deletions.
4 changes: 3 additions & 1 deletion doc/XXL-CRAWLER官方文档.md
@@ -202,7 +202,9 @@ ProxyMaker (proxy maker): the component that implements proxy support. Supports setting …

### Version V1.1.1, new features [in progress]
- 1. Crawler Builder low-level API optimization;
-- 1. Support setting request Headers;
+- 2. Support setting request Headers;
+- 3. Support setting page encoding;
+- 4. Support setting multiple UserAgents with rotation;

### TODO LIST
- 1. Extend SelectType;
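With the Builder setter and the UserAgent constants introduced in this commit (shown in the file diffs below), the feature listed as item 4 above can be exercised roughly as follows. This is an illustrative sketch only, not part of the commit: the class name MultiUserAgentDemo, the empty PageVo stub, and the parse body are placeholders, and build()/start(true) are assumed from the project's existing test class rather than shown in this diff.

```java
import com.xuxueli.crawler.XxlCrawler;
import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.parser.PageParser;
import org.jsoup.nodes.Document;

public class MultiUserAgentDemo {

    // Placeholder page object; the real test uses an annotated PageVo class.
    public static class PageVo { }

    public static void main(String[] args) {
        XxlCrawler crawler = new XxlCrawler.Builder()
                .setUrls("https://my.oschina.net/xuxueli/blog")
                .setWhiteUrlRegexs("https://my\\.oschina\\.net/xuxueli/blog/\\d+")
                .setThreadCount(3)
                // varargs setter added in this commit; each request picks one of these
                .setUserAgent(XxlCrawlerConf.USER_AGENT_CHROME,
                        XxlCrawlerConf.USER_AGENT_FIREFOX_45,
                        XxlCrawlerConf.USER_AGENT_IE)
                .setPageParser(new PageParser<PageVo>() {
                    @Override
                    public void parse(Document html, PageVo pageVo) {
                        System.out.println(html.title());   // placeholder parse logic
                    }
                })
                .build();
        crawler.start(true);    // assumed blocking start, as in the project's test
    }
}
```

Passing the same UserAgent twice is harmless: the new setter skips values already present in the internal list.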
26 changes: 16 additions & 10 deletions src/main/java/com/xuxueli/crawler/XxlCrawler.java
@@ -28,7 +28,7 @@ public class XxlCrawler {

// site
private volatile boolean ifPost = false; // request method: true = POST, false = GET
-private volatile String userAgent = XxlCrawlerConf.USER_AGENT_CHROME; // UserAgent
+private volatile List<String> userAgentList = Collections.synchronizedList(new ArrayList<String>(Arrays.asList(XxlCrawlerConf.USER_AGENT_CHROME))); // request UserAgent list
private volatile Map<String, String> paramMap; // request parameters
private volatile Map<String, String> cookieMap; // request cookies
private volatile Map<String, String> headerMap; // request headers
@@ -97,19 +97,25 @@ public Builder setWhiteUrlRegexs(String... whiteUrlRegexs) {
* @param ifPost
* @return
*/
-private Builder setIfPost(boolean ifPost){
+public Builder setIfPost(boolean ifPost){
crawler.ifPost = ifPost;
return this;
}

/**
* UserAgent
*
- * @param userAgent
+ * @param userAgents
* @return
*/
-private Builder setUserAgent(String userAgent){
-    crawler.userAgent = userAgent;
+public Builder setUserAgent(String... userAgents){
+    if (userAgents!=null && userAgents.length>0) {
+        for (String userAgent: userAgents) {
+            if (!crawler.userAgentList.contains(userAgent)) {
+                crawler.userAgentList.add(userAgent);
+            }
+        }
+    }
return this;
}

@@ -119,7 +125,7 @@ private Builder setUserAgent(String userAgent){
* @param paramMap
* @return
*/
-private Builder setParamMap(Map<String, String> paramMap){
+public Builder setParamMap(Map<String, String> paramMap){
crawler.paramMap = paramMap;
return this;
}
@@ -130,7 +136,7 @@ private Builder setParamMap(Map<String, String> paramMap){
* @param cookieMap
* @return
*/
-private Builder setCookieMap(Map<String, String> cookieMap){
+public Builder setCookieMap(Map<String, String> cookieMap){
crawler.cookieMap = cookieMap;
return this;
}
@@ -141,7 +147,7 @@ private Builder setCookieMap(Map<String, String> cookieMap){
* @param headerMap
* @return
*/
-private Builder setHeaderMap(Map<String, String> headerMap){
+public Builder setHeaderMap(Map<String, String> headerMap){
crawler.headerMap = headerMap;
return this;
}
@@ -217,8 +223,8 @@ public boolean getAllowSpread() {
return allowSpread;
}

-public String getUserAgent() {
-    return userAgent;
+public List<String> getUserAgentList() {
+    return userAgentList;
}

public Map<String, String> getParamMap() {
3 changes: 3 additions & 0 deletions src/main/java/com/xuxueli/crawler/conf/XxlCrawlerConf.java
@@ -9,6 +9,9 @@ public class XxlCrawlerConf {

// userAgent
public static final String USER_AGENT_CHROME = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
+public static final String USER_AGENT_FIREFOX_45 = "Mozilla/5.0 (Windows NT 6.1; rv:45.0) Gecko/20100101 Firefox/45.0";
+public static final String USER_AGENT_IE = "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko";
+public static final String USER_AGENT_EDGE = "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586";

// timeout default, ms
public static final int TIMEOUT_MILLIS_DEFAULT = 5*1000;
6 changes: 5 additions & 1 deletion src/main/java/com/xuxueli/crawler/thread/CrawlerThread.java
@@ -20,6 +20,7 @@
import java.net.Proxy;
import java.util.ArrayList;
import java.util.List;
+import java.util.Random;
import java.util.Set;
import java.util.concurrent.TimeUnit;

@@ -63,13 +64,16 @@ public void run() {
}

// ------- html ----------
+String userAgent = crawler.getUserAgentList().size()>1
+        ?crawler.getUserAgentList().get(new Random().nextInt(crawler.getUserAgentList().size()))
+        :crawler.getUserAgentList().size()==1?crawler.getUserAgentList().get(0):null;
Proxy proxy = null;
if (crawler.getProxyMaker() != null) {
proxy = crawler.getProxyMaker().make();
}

Document html = JsoupUtil.load(link, crawler.getParamMap(), crawler.getCookieMap(), crawler.getHeaderMap(),
-        crawler.getIfPost(), crawler.getUserAgent(), crawler.getTimeoutMillis(), proxy);
+        crawler.getIfPost(), userAgent, crawler.getTimeoutMillis(), proxy);
if (html == null) {
continue;
}
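The ternary added above is dense; as an illustration only (not part of the commit), the same per-request selection can be read as the following equivalent helper, where the class and method names are hypothetical:

```java
import java.util.List;
import java.util.Random;

class UserAgentPick {
    // Expanded equivalent of the selection logic added in CrawlerThread above.
    static String pick(List<String> agents) {
        if (agents.size() > 1) {
            return agents.get(new Random().nextInt(agents.size())); // random choice per request
        }
        return agents.size() == 1 ? agents.get(0) : null;           // single agent, or none configured
    }
}
```

In other words, when several UserAgents are configured each request draws one at random, so the rotation is random sampling rather than a strict in-order round-robin over the list.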
2 changes: 2 additions & 0 deletions src/test/java/com/xuxueli/crawler/test/XxlCrawlerTest.java
@@ -3,6 +3,7 @@
import com.xuxueli.crawler.XxlCrawler;
import com.xuxueli.crawler.annotation.PageFieldSelect;
import com.xuxueli.crawler.annotation.PageSelect;
+import com.xuxueli.crawler.conf.XxlCrawlerConf;
import com.xuxueli.crawler.parser.PageParser;
import org.jsoup.nodes.Document;

@@ -70,6 +71,7 @@ public static void main(String[] args) {
.setUrls("https://my.oschina.net/xuxueli/blog")
.setWhiteUrlRegexs("https://my\\.oschina\\.net/xuxueli/blog/\\d+")
.setThreadCount(3)
+.setUserAgent(XxlCrawlerConf.USER_AGENT_CHROME, XxlCrawlerConf.USER_AGENT_FIREFOX_45)
.setPageParser(new PageParser<PageVo>() {
@Override
public void parse(Document html, PageVo pageVo) {
