Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
y.wang committed Jan 21, 2017
1 parent 80f436f commit 884a7db
Show file tree
Hide file tree
Showing 13 changed files with 115 additions and 40 deletions.
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
## Quick Start
Run with [Main.java](https://github.com/wycm/zhihu-crawler/blob/2.0/src/main/java/com/crawl/Main.java) <br>

##注意
因为登录方式抓取,可能会导致封号。现在采用游客模式抓取。
## Features
* 大量使用http代理,突破同一个客户端访问量限制。
* 支持持久化(mysql),相关配置见[config.properties](https://github.com/wycm/zhihu-crawler/blob/2.0/src/main/resources/config.properties)
* 多线程、快速,10小时可以抓取70w用户。

## 更新
#### 2016.12.26
Expand All @@ -44,5 +46,5 @@ DetailPageThreadPool负责下载用户详情页面,解析出用户基本信息
* 优化爬取速度

## 最后
欢迎交流,欢迎提交代码。需要数据的可以联系我。
有疑问,欢迎交流。需要数据的可以联系我。

2 changes: 1 addition & 1 deletion src/main/java/com/crawl/core/util/HttpClientUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ public static CloseableHttpResponse getResponse(String url) throws IOException {
public static void serializeObject(Object object,String filePath){
OutputStream fos = null;
try {
fos = new FileOutputStream(filePath);
fos = new FileOutputStream(filePath, false);
ObjectOutputStream oos = new ObjectOutputStream(fos);
oos.writeObject(object);
logger.info("序列化成功");
Expand Down
46 changes: 46 additions & 0 deletions src/main/java/com/crawl/core/util/SimpleThreadPoolExecutor.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package com.crawl.core.util;


import java.util.concurrent.*;

/**
 * A {@link ThreadPoolExecutor} that renames its worker threads so they carry a
 * readable pool name instead of the JDK default {@code pool-N} prefix.
 *
 * <p>The rename happens in {@link #beforeExecute(Thread, Runnable)} and only touches
 * threads whose current name starts with {@code "pool-"}; threads named by a custom
 * {@link ThreadFactory} are left alone.
 */
public class SimpleThreadPoolExecutor extends ThreadPoolExecutor {
    /**
     * Matches the leading {@code pool-N} part of a default thread name. {@code \\d+} is
     * required so that multi-digit pool numbers (pool-10, pool-11, ...) are consumed whole.
     */
    private static final java.util.regex.Pattern DEFAULT_POOL_PREFIX =
            java.util.regex.Pattern.compile("^pool-\\d+");

    /** Name that replaces the {@code pool-N} prefix; may be null, in which case no rename is done. */
    private final String threadPoolName;

    /**
     * Creates an executor with the default thread factory and rejection handler.
     *
     * @param threadPoolName prefix substituted for {@code pool-N} in worker thread names
     */
    public SimpleThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime,
                                    TimeUnit unit, BlockingQueue<Runnable> workQueue, String threadPoolName) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
        this.threadPoolName = threadPoolName;
    }

    /**
     * Creates an executor with a custom thread factory and the default rejection handler.
     *
     * @param threadPoolName prefix substituted for {@code pool-N} in worker thread names
     */
    public SimpleThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime,
                                    TimeUnit unit, BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory,
                                    String threadPoolName) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory);
        this.threadPoolName = threadPoolName;
    }

    /**
     * Creates an executor with the default thread factory and a custom rejection handler.
     *
     * @param threadPoolName prefix substituted for {@code pool-N} in worker thread names
     */
    public SimpleThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
                                    BlockingQueue<Runnable> workQueue, RejectedExecutionHandler handler,
                                    String threadPoolName) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler);
        this.threadPoolName = threadPoolName;
    }

    /**
     * Creates an executor with a custom thread factory and a custom rejection handler.
     *
     * @param threadPoolName prefix substituted for {@code pool-N} in worker thread names
     */
    public SimpleThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
                                    BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory,
                                    RejectedExecutionHandler handler, String threadPoolName) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler);
        this.threadPoolName = threadPoolName;
    }

    /**
     * Renames the worker thread before it runs a task.
     *
     * <p>If the thread still has a name starting with {@code "pool-"}, the leading
     * {@code pool-N} is replaced by the configured pool name (for example
     * {@code pool-12-thread-3} becomes {@code <threadPoolName>-thread-3}). Once renamed,
     * the name no longer matches, so later tasks on the same thread skip the rename.
     *
     * @param t the thread that will run the task
     * @param r the task about to be executed
     */
    @Override
    protected void beforeExecute(Thread t, Runnable r) {
        String currentName = t.getName();
        if (threadPoolName != null && currentName.startsWith("pool-")) {
            // quoteReplacement: the pool name is literal text, not a replacement
            // template, so '$' and '\' in it must not be interpreted.
            String renamed = DEFAULT_POOL_PREFIX.matcher(currentName)
                    .replaceFirst(java.util.regex.Matcher.quoteReplacement(threadPoolName));
            t.setName(renamed);
        }
        // Per the ThreadPoolExecutor contract, chain to super at the end of the hook.
        super.beforeExecute(t, r);
    }
}
28 changes: 17 additions & 11 deletions src/main/java/com/crawl/proxy/ProxyHttpClient.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package com.crawl.proxy;

import com.crawl.core.util.Config;
import com.crawl.core.util.Constants;
import com.crawl.core.util.HttpClientUtil;
import com.crawl.core.util.ThreadPoolMonitor;
import com.crawl.core.util.*;
import com.crawl.proxy.entity.Proxy;
import com.crawl.proxy.task.ProxyPageTask;
import com.crawl.core.httpclient.AbstractHttpClient;
Expand Down Expand Up @@ -48,12 +45,15 @@ public ProxyHttpClient(){
* 初始化线程池
*/
private void initThreadPool(){
proxyTestThreadExecutor = new ThreadPoolExecutor(100, 100,
proxyTestThreadExecutor = new SimpleThreadPoolExecutor(100, 100,
0L, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<Runnable>(10000), new ThreadPoolExecutor.DiscardPolicy());
proxyDownloadThreadExecutor = new ThreadPoolExecutor(10, 10,
new LinkedBlockingQueue<Runnable>(10000),
new ThreadPoolExecutor.DiscardPolicy(),
"proxyTestThreadExecutor");
proxyDownloadThreadExecutor = new SimpleThreadPoolExecutor(10, 10,
0L, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<Runnable>());
new LinkedBlockingQueue<Runnable>(), "" +
"proxyDownloadThreadExecutor");
new Thread(new ThreadPoolMonitor(proxyTestThreadExecutor, "ProxyTestThreadPool")).start();
new Thread(new ThreadPoolMonitor(proxyDownloadThreadExecutor, "ProxyDownloadThreadExecutor")).start();
}
Expand All @@ -66,17 +66,23 @@ private void initProxy(){
Proxy[] proxyArray = null;
try {
proxyArray = (Proxy[]) HttpClientUtil.deserializeObject(Config.proxyPath);
int usableProxyCount = 0;
for (Proxy p : proxyArray){
if (p == null){
continue;
}
p.setTimeInterval(Constants.TIME_INTERVAL);
p.setFailureTimes(0);
p.setSuccessfulTimes(0);
ProxyPool.proxyQueue.add(p);
ProxyPool.proxySet.add(p);
long nowTime = System.currentTimeMillis();
if (nowTime - p.getLastSuccessfulTime() < 1000 * 60 *60){
//上次成功离现在少于一小时
ProxyPool.proxyQueue.add(p);
ProxyPool.proxySet.add(p);
usableProxyCount++;
}
}
logger.info("反序列化proxy成功," + proxyArray.length + "个代理");
logger.info("反序列化proxy成功," + proxyArray.length + "个代理,可用代理" + usableProxyCount + "个");
} catch (Exception e) {
logger.warn("反序列化proxy失败");
}
Expand Down
17 changes: 11 additions & 6 deletions src/main/java/com/crawl/proxy/entity/Proxy.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import java.util.concurrent.TimeUnit;

public class Proxy implements Delayed, Serializable{
private static final long serialVersionUID = -7583883432417635332L;
private long timeInterval ;//任务间隔时间,单位ms
private String ip;
private int port;
private boolean availableFlag;
private boolean anonymousFlag;
private long lastUseTime;
private long lastSuccessfulTime;//最近一次请求成功时间
private int delay;
private int failureTimes;//请求失败次数
private int successfulTimes;//请求成功次数
Expand Down Expand Up @@ -54,12 +55,16 @@ public void setAnonymousFlag(boolean anonymousFlag) {
this.anonymousFlag = anonymousFlag;
}

public long getLastUseTime() {
return lastUseTime;
public long getTimeInterval() {
return timeInterval;
}

public void setLastUseTime(long lastUseTime) {
this.lastUseTime = lastUseTime;
public long getLastSuccessfulTime() {
return lastSuccessfulTime;
}

public void setLastSuccessfulTime(long lastSuccessfulTime) {
this.lastSuccessfulTime = lastSuccessfulTime;
}

public int getDelay() {
Expand Down Expand Up @@ -108,7 +113,7 @@ public String toString() {
", port=" + port +
", availableFlag=" + availableFlag +
", anonymousFlag=" + anonymousFlag +
", lastUseTime=" + lastUseTime +
", lastSuccessfulTime=" + lastSuccessfulTime +
", delay=" + delay +
", failureTimes=" + failureTimes +
", successfulTimes=" + successfulTimes +
Expand Down
9 changes: 7 additions & 2 deletions src/main/java/com/crawl/proxy/task/ProxyPageTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public ProxyPageTask(String url, boolean proxyFlag){
this.proxyFlag = proxyFlag;
}
public void run(){
long requestStartTime = System.currentTimeMillis();
HttpGet tempRequest = null;
try {
Page page = null;
Expand All @@ -57,11 +58,15 @@ public void run(){
}
page.setProxy(currentProxy);
int status = page.getStatusCode();
long requestEndTime = System.currentTimeMillis();
String logStr = Thread.currentThread().getName() + " " + getProxyStr(currentProxy) +
" executing request " + page.getUrl() + " response statusCode:" + status +
" request cost time:" + (requestEndTime - requestStartTime) + "ms";
if(status == HttpStatus.SC_OK){
logger.debug(Thread.currentThread().getName() + " " + getProxyStr(currentProxy) + " statusCode:" + status + " executing request " + page.getUrl());
logger.debug(logStr);
handle(page);
} else {
logger.error(Thread.currentThread().getName() + " " + getProxyStr(currentProxy) + " statusCode:" + status + " executing request " + page.getUrl());
logger.error(logStr);
Thread.sleep(100);
retry();
}
Expand Down
10 changes: 7 additions & 3 deletions src/main/java/com/crawl/proxy/task/ProxyTestTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ public ProxyTestTask(Proxy proxy){
@Override
public void run() {
long startTime = System.currentTimeMillis();
proxy.setLastUseTime(startTime);
HttpGet request = new HttpGet(Constants.INDEX_URL);
try {
RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(Constants.TIMEOUT).
Expand All @@ -38,12 +37,17 @@ public void run() {
build();
request.setConfig(requestConfig);
Page page = ZhiHuHttpClient.getInstance().getWebPage(request);
logger.debug(proxy.toString() + "---------" + page.toString());
long endTime = System.currentTimeMillis();
String logStr = Thread.currentThread().getName() + " " + proxy.getProxyStr() +
" executing request " + page.getUrl() + " response statusCode:" + page.getStatusCode() +
" request cost time:" + (endTime - startTime) + "ms";
if (page == null || page.getStatusCode() != 200){
logger.warn(logStr);
return;
}
request.releaseConnection();
long endTime = System.currentTimeMillis();

logger.debug(proxy.toString() + "---------" + page.toString());
if(!ProxyPool.proxySet.contains(proxy)){
logger.debug(proxy.toString() + "----------代理可用--------请求耗时:" + (endTime - startTime) + "ms");
ProxyPool.lock.writeLock().lock();
Expand Down
15 changes: 7 additions & 8 deletions src/main/java/com/crawl/zhihu/ZhiHuHttpClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,10 @@

import com.crawl.core.httpclient.AbstractHttpClient;
import com.crawl.core.httpclient.IHttpClient;
import com.crawl.core.util.Config;
import com.crawl.core.util.*;
import com.crawl.core.dao.ConnectionManager;
import com.crawl.proxy.ProxyHttpClient;
import com.crawl.zhihu.dao.ZhiHuDAO;
import com.crawl.core.util.HttpClientUtil;
import com.crawl.core.util.SimpleLogger;
import com.crawl.core.util.ThreadPoolMonitor;
import com.crawl.zhihu.task.DetailPageTask;
import org.apache.log4j.Logger;

Expand Down Expand Up @@ -70,13 +67,15 @@ public void initHttpClient() {
* 初始化线程池
*/
private void intiThreadPool(){
detailPageThreadPool = new ThreadPoolExecutor(Config.downloadThreadSize,
detailPageThreadPool = new SimpleThreadPoolExecutor(Config.downloadThreadSize,
Config.downloadThreadSize,
0L, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<Runnable>());
listPageThreadPool = new ThreadPoolExecutor(50, 80,
new LinkedBlockingQueue<Runnable>(),
"detailPageThreadPool");
listPageThreadPool = new SimpleThreadPoolExecutor(50, 80,
0L, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<Runnable>(1000), new ThreadPoolExecutor.DiscardPolicy());
new LinkedBlockingQueue<Runnable>(5000),
new ThreadPoolExecutor.DiscardPolicy(), "listPageThreadPool");
new Thread(new ThreadPoolMonitor(detailPageThreadPool, "DetailPageDownloadThreadPool")).start();
new Thread(new ThreadPoolMonitor(listPageThreadPool, "ListPageDownloadThreadPool")).start();
}
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/com/crawl/zhihu/task/AbstractPageTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public AbstractPageTask(HttpRequestBase request, boolean proxyFlag){
this.proxyFlag = proxyFlag;
}
public void run(){
long requestStartTime = System.currentTimeMillis();
HttpGet tempRequest = null;
try {
Page page = null;
Expand Down Expand Up @@ -82,10 +83,15 @@ public void run(){
}
page.setProxy(currentProxy);
int status = page.getStatusCode();
long requestEndTime = System.currentTimeMillis();
String logStr = Thread.currentThread().getName() + " " + getProxyStr(currentProxy) +
" executing request " + page.getUrl() + " response statusCode:" + status +
" request cost time:" + (requestEndTime - requestStartTime) + "ms";
if(status == HttpStatus.SC_OK){
if (page.getHtml().contains("zhihu")){
logger.debug(Thread.currentThread().getName() + " " + getProxyStr(currentProxy) + " statusCode:" + status + " executing request " + page.getUrl());
logger.debug(logStr);
currentProxy.setSuccessfulTimes(currentProxy.getSuccessfulTimes() + 1);
currentProxy.setLastSuccessfulTime(System.currentTimeMillis());
handle(page);
}else {
/**
Expand All @@ -100,10 +106,10 @@ public void run(){
*/
else if(status == 404 || status == 401 ||
status == 410){
logger.warn(Thread.currentThread().getName() + " " + getProxyStr(currentProxy) + " statusCode:" + status + " executing request " + page.getUrl());
logger.warn(logStr);
}
else {
logger.error(Thread.currentThread().getName() + " " + getProxyStr(currentProxy) + " statusCode:" + status + " executing request " + page.getUrl());
logger.error(logStr);
Thread.sleep(100);
retry();
}
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/log4j.properties
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ log4j.rootLogger=INFO,stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
#log4j.appender.stdout.Target=System.err
log4j.appender.stdout.layout=org.apache.log4j.SimpleLayout
log4j.appender.logfile.Threshold = ERROR
#log4j.appender.logfile.Threshold = ERROR
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=src/main/resources/zhiHuCrawler.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
Expand Down
Binary file modified src/main/resources/proxies
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ public class Ip181ProxyListPageParserTest {
@Test
public void testParse() throws IOException {
System.out.println(Charset.defaultCharset().toString());
Page page = ProxyHttpClient.getInstance().getWebPage("http://www.ip181.com/daili/1.html", "gb2312");
Page page = ProxyHttpClient.getInstance().getWebPage("http://www.ip181.com/daili/1.html");
// Page page = ProxyHttpClient.getInstance().getWebPage("http://www.ip181.com/daili/1.html", "gb2312");
List<Proxy> urlList = new Ip181ProxyListPageParser().parse(page.getHtml());
System.out.println(urlList.size());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
public class Ip66ProxyListPageParserTest {
@Test
public void testParse() throws IOException {
Page page = ProxyHttpClient.getInstance().getWebPage("http://www.66ip.cn/index.html", "gb2312");
Page page = ProxyHttpClient.getInstance().getWebPage("http://www.66ip.cn/index.html");
page.setHtml(new String(page.getHtml().getBytes("GB2312"), "GB2312"));
List<Proxy> urlList = new Ip66ProxyListPageParser().parse(page.getHtml());
System.out.println(urlList.size());
}
Expand Down

0 comments on commit 884a7db

Please sign in to comment.