add multithread support
FreedomZZQ committed Apr 30, 2016
1 parent 66ecce4 commit b3694c7
Showing 3 changed files with 91 additions and 19 deletions.
58 changes: 41 additions & 17 deletions src/crawler/Crawler.java
@@ -1,6 +1,7 @@
package crawler;

import utils.FileUtils;
import utils.LogHelper;

import java.io.IOException;
import java.io.InputStream;
@@ -11,12 +12,13 @@
import java.net.URLConnection;
import java.net.URLDecoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import javax.security.auth.login.LoginException;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
@@ -25,17 +27,22 @@

public class Crawler {

private List<String> urlWaiting = new ArrayList<>(); //A list of URLs that are waiting to be processed
private List<String> urlProcessed = new ArrayList<>(); //A list of URLs that were processed
private List<String> urlError = new ArrayList<>(); //A list of URLs that resulted in an error
private List<String> urlWaiting = new CopyOnWriteArrayList<>(); //A list of URLs that are waiting to be processed
private List<String> urlProcessed = new CopyOnWriteArrayList<>(); //A list of URLs that were processed
private List<String> urlError = new CopyOnWriteArrayList<>(); //A list of URLs that resulted in an error

private Map<String, Integer> urlDepth = new ConcurrentHashMap<>(); //record the depth of each link

private ExecutorService exec; //use an Executor to manage the worker threads

private static LogHelper logHelper = LogHelper.getInstance();

private int numFindUrl = 0; //number of URLs found

private final SimpleDateFormat sFormat = new SimpleDateFormat(DATE_FORMAT);

public Crawler() {
exec = Executors.newFixedThreadPool(THREAD_MAXNUM);
urlDepth.put(SEED_URL, 0);
}

@@ -44,10 +51,25 @@ public Crawler() {
* start crawling
*/
public void begin() {

while (!urlWaiting.isEmpty()) {
processURL(urlWaiting.remove(0));
}

//double-checked lock: guarantees thread safety while keeping the loop efficient
while(true){
if(!urlWaiting.isEmpty()){
synchronized (this){
if((!urlWaiting.isEmpty())){
String urlString = urlWaiting.remove(0);
exec.execute(new Runnable() {
@Override
public void run() {
processURL(urlString);
}
});
}
}
}

}


// log("finish crawling");
// log("the number of urls that were found:" + numFindUrl);
@@ -61,12 +83,12 @@ public void begin() {
* @param strUrl
* The URL to be processed.
*/
public void processURL(String strUrl) {
public synchronized void processURL(String strUrl) {
URL url = null;
try {
url = new URL(strUrl);
//log("Processing: " + url);

// get the URL's contents
URLConnection connection = url.openConnection();
connection.setRequestProperty("User-Agent", USER_AGENT);
@@ -100,7 +122,7 @@ public void processURL(String strUrl) {
InputStream is = connection.getInputStream();
Reader r = new InputStreamReader(is);
log(TYPE_FETCHING, url.toString(), TAG_SUCCESS);

// parse the URL
HTMLEditorKit.Parser parse = new HTMLParse().getParser();
parse.parse(r, new Parser(url), true);
@@ -121,7 +143,7 @@
*
* @param url
*/
public void addURL(String url) {
public synchronized void addURL(String url) {
if (urlWaiting.contains(url))
return;
if (urlError.contains(url))
@@ -165,7 +187,8 @@ public void log(String type, String url, String tag){
}

//write the record to the log file
FileUtils.writeFileAppend(LOG_FILE_NAME, sb.toString());
//FileUtils.writeFileAppend(LOG_FILE_NAME, sb.toString());
logHelper.addLog(sb.toString());


}
@@ -212,7 +235,7 @@ public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {

}

protected void handleLink(URL base, String str) {
protected synchronized void handleLink(URL base, String str) {
try {
URL url = new URL(base, str);
addURL(url.toString());
@@ -230,6 +253,7 @@ protected void handleLink(URL base, String str) {
* @param args
*/
public static void main(String[] args) {

Crawler crawler = new Crawler();
crawler.addURL(SEED_URL);
crawler.begin();
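The rewritten begin() above turns the old single-threaded drain loop into a dispatcher: it checks urlWaiting, takes the lock, re-checks, and only then removes the head URL and hands it to the fixed thread pool. For reference, the stand-alone sketch below (not part of the commit; the Dispatcher class, sample URLs, and pool size of 3 are assumptions for illustration) shows the same check-lock-recheck hand-off in isolation. Unlike the commit's while(true) loop, it exits once the list drains, because nothing re-fills it here.

import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class Dispatcher {
    // Same collection choice as the commit: safe for concurrent readers.
    private final List<String> waiting = new CopyOnWriteArrayList<>();
    private final ExecutorService exec = Executors.newFixedThreadPool(3);

    public void add(String url) {
        waiting.add(url);
    }

    public void run() throws InterruptedException {
        while (!waiting.isEmpty()) {
            String url;
            // Check-lock-recheck: only synchronize when the list looks
            // non-empty, and confirm again under the lock before removing.
            synchronized (this) {
                if (waiting.isEmpty()) {
                    continue;
                }
                url = waiting.remove(0);
            }
            // Hand the URL to a worker thread, as begin() does with processURL().
            exec.execute(() -> System.out.println("processing " + url));
        }
        exec.shutdown();
        exec.awaitTermination(5, TimeUnit.SECONDS);
    }

    public static void main(String[] args) throws InterruptedException {
        Dispatcher d = new Dispatcher();
        d.add("http://example.com/a");
        d.add("http://example.com/b");
        d.run();
    }
}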
8 changes: 6 additions & 2 deletions src/utils/Constants.java
@@ -50,9 +50,13 @@ public abstract class Constants {
* Set the lower bound on the size of files to crawl
* Unit: bytes
*/
public static final int CONTENT_LENGTH = 8000;
public static final int CONTENT_LENGTH = 2000;
/**
* Set the upper bound on crawl depth
*/
public static final int SEARCH_DEPTH = 10;
public static final int SEARCH_DEPTH = 15;
/**
* Set the maximum number of threads
*/
public static final int THREAD_MAXNUM = 3;
}
44 changes: 44 additions & 0 deletions src/utils/LogHelper.java
@@ -0,0 +1,44 @@
package utils;

import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import static utils.Constants.LOG_FILE_NAME;

/**
* Writes log records to the file in a thread-safe way
* This class is a singleton
* Created by ZQ on 2016/4/30.
*/
public class LogHelper {
private List<String> logList = new CopyOnWriteArrayList<>();
private ExecutorService exec;

private LogHelper(){
exec = Executors.newSingleThreadExecutor();
}

private static class Holder{
private static LogHelper instance = new LogHelper();
private static LogHelper getInstance(){
return instance;
}
}

public static LogHelper getInstance(){
return Holder.getInstance();
}

public void addLog(String log){
exec.execute(new Runnable() {
@Override
public void run() {
FileUtils.writeFileAppend(LOG_FILE_NAME, log);
}
});
}


}
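
The new LogHelper funnels every log line through a single-thread executor, so writes from concurrent crawler threads reach the file one at a time instead of each caller appending on its own. A minimal usage sketch (hypothetical, not part of the commit; it assumes FileUtils.writeFileAppend and Constants.LOG_FILE_NAME behave as elsewhere in this repository):

import utils.LogHelper;

public class LogHelperDemo {
    public static void main(String[] args) {
        // All threads share the one instance returned by the holder singleton.
        final LogHelper logHelper = LogHelper.getInstance();
        for (int i = 0; i < 4; i++) {
            final int id = i;
            // addLog() returns immediately; the single-thread executor inside
            // LogHelper appends the lines to LOG_FILE_NAME in submission order.
            new Thread(() -> logHelper.addLog("worker " + id + " finished\n")).start();
        }
    }
}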
