Skip to content

Commit

Permalink
add depth filter; add comments
Browse files Browse the repository at this point in the history
  • Loading branch information
FreedomZZQ committed Apr 28, 2016
1 parent d9359ba commit 339b676
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 6 deletions.
35 changes: 29 additions & 6 deletions src/crawler/Crawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
Expand All @@ -22,15 +25,19 @@

public class Crawler {

private List<String> urlWaiting = new ArrayList<String>(); //A list of URLs that are waiting to be processed
private List<String> urlProcessed = new ArrayList<String>(); //A list of URLs that were processed
private List<String> urlError = new ArrayList<String>(); //A list of URLs that resulted in an error
private List<String> urlWaiting = new ArrayList<>(); //A list of URLs that are waiting to be processed
private List<String> urlProcessed = new ArrayList<>(); //A list of URLs that were processed
private List<String> urlError = new ArrayList<>(); //A list of URLs that resulted in an error

private Map<String, Integer> urlDepth = new ConcurrentHashMap<>(); //记录链接深度

private int numFindUrl = 0; //find the number of url

private final SimpleDateFormat sFormat = new SimpleDateFormat(DATE_FORMAT);

public Crawler() {}
public Crawler() {
urlDepth.put(SEED_URL, 0);
}


/**
Expand Down Expand Up @@ -65,6 +72,7 @@ public void processURL(String strUrl) {
connection.setRequestProperty("User-Agent", USER_AGENT);

//judge url type
//过滤不符合文件类型的链接
if ((connection.getContentType() != null)
&& !connection.getContentType().toLowerCase()
.startsWith(CONTENT_TYPE)) {
Expand All @@ -74,12 +82,18 @@ public void processURL(String strUrl) {
return;
}

//过滤小于 CONTENT_LENGTH 的文件链接
//过滤文件大小小于 CONTENT_LENGTH 的链接
if((connection.getContentLength() < CONTENT_LENGTH)){
log(TYPE_CONNECTING, url.toString(), TAG_ERROR);
return;
}

//过滤大于 SEARCH_DEPTH 的链接
if(urlDepth.get(strUrl) > SEARCH_DEPTH){
log(TYPE_CONNECTING, url.toString(), TAG_ERROR);
return;
}

log(TYPE_CONNECTING, url.toString(), TAG_SUCCESS);

// read the URL
Expand Down Expand Up @@ -143,8 +157,14 @@ public void log(String type, String url, String tag){
sb.append(" ");
sb.append(tag);
sb.append("\n\r");
System.out.println(sb.toString());

//打印log方便调试
System.out.println(sb.toString());
if(type.equals(TYPE_CONNECTING) && tag.equals(TAG_SUCCESS)) {
System.out.println(urlDepth.get(url));
}

//将记录写入log文件
FileUtils.writeFileAppend(LOG_FILE_NAME, sb.toString());


Expand Down Expand Up @@ -196,6 +216,9 @@ protected void handleLink(URL base, String str) {
try {
URL url = new URL(base, str);
addURL(url.toString());
if(urlDepth.containsKey(base.toString())){
urlDepth.put(url.toString(), urlDepth.get(base.toString()) + 1);
}
} catch (MalformedURLException e) {
//log("Found malformed URL: " + str);

Expand Down
40 changes: 40 additions & 0 deletions src/utils/Constants.java
Original file line number Diff line number Diff line change
@@ -1,18 +1,58 @@
package utils;

/**
 * Central configuration constants for the crawler.
 * Tune crawler behavior (seed URL, depth limit, content filters, logging) here.
 * Created by ZQ on 2016/4/22.
 */
public abstract class Constants {
    /**
     * USER_AGENT header value sent with every HTTP request.
     */
    public static final String USER_AGENT = "2016IR201330551365";
    /**
     * Seed URL used as the starting point of the crawl.
     */
    public static final String SEED_URL = "http://www.scut.edu.cn";
    /**
     * Timestamp pattern used in log records.
     * Fixed: month must be "MM" (not "mm" = minutes) and hour-of-day "HH"
     * (24-hour) instead of "hh" (12-hour without AM/PM marker).
     */
    public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss.SSS";
    /**
     * Operation type: connecting to a URL.
     */
    public static final String TYPE_CONNECTING = "Connecting";
    /**
     * Operation type: fetching data.
     */
    public static final String TYPE_FETCHING = "Fetching";
    /**
     * Operation type: parsing data.
     */
    public static final String TYPE_PARSING = "Parsing";
    /**
     * Operation result: success.
     */
    public static final String TAG_SUCCESS = "Successful";
    /**
     * Operation result: failure.
     */
    public static final String TAG_ERROR = "Error";
    /**
     * File name of the log file.
     */
    public static final String LOG_FILE_NAME = "IR201330551365LOG.txt";
    /**
     * Content-type prefix of documents the crawler accepts.
     */
    public static final String CONTENT_TYPE = "text/";
    /**
     * Lower bound on the size of documents to crawl, in bytes.
     */
    public static final int CONTENT_LENGTH = 8000;
    /**
     * Maximum crawl depth; links deeper than this are skipped.
     */
    public static final int SEARCH_DEPTH = 10;
}
11 changes: 11 additions & 0 deletions src/utils/FileUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,24 @@
public class FileUtils {


/**
* 向文件追加内容
* @param fileName 文件名
* @param content 要追加的内容
*/
public static void writeFileAppend(String fileName, String content) {
try {

// 打开一个随机访问文件流,按读写方式

RandomAccessFile randomFile = new RandomAccessFile(fileName, "rw");

// 文件长度,字节数

long fileLength = randomFile.length();

//将写文件指针移到文件尾

randomFile.seek(fileLength);
randomFile.writeBytes(content);
randomFile.close();
Expand Down

0 comments on commit 339b676

Please sign in to comment.