-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 263fb27
Showing
2 changed files
with
227 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
.metadata | ||
bin/ | ||
tmp/ | ||
*.tmp | ||
*.bak | ||
*.swp | ||
*~.nib | ||
local.properties | ||
.settings/ | ||
.loadpath | ||
.recommenders | ||
|
||
# Eclipse Core | ||
.project | ||
|
||
# External tool builders | ||
.externalToolBuilders/ | ||
|
||
# Locally stored "Eclipse launch configurations" | ||
*.launch | ||
|
||
# PyDev specific (Python IDE for Eclipse) | ||
*.pydevproject | ||
|
||
# CDT-specific (C/C++ Development Tooling) | ||
.cproject | ||
|
||
# JDT-specific (Eclipse Java Development Tools) | ||
.classpath | ||
|
||
# Java annotation processor (APT) | ||
.factorypath | ||
|
||
# PDT-specific (PHP Development Tools) | ||
.buildpath | ||
|
||
# sbteclipse plugin | ||
.target | ||
|
||
# Tern plugin | ||
.tern-project | ||
|
||
# TeXlipse plugin | ||
.texlipse | ||
|
||
# STS (Spring Tool Suite) | ||
.springBeans | ||
|
||
# Code Recommenders | ||
.recommenders/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
package crawler; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.io.InputStreamReader; | ||
import java.io.Reader; | ||
import java.net.MalformedURLException; | ||
import java.net.URL; | ||
import java.net.URLConnection; | ||
import java.text.SimpleDateFormat; | ||
import java.util.ArrayList; | ||
import java.util.Date; | ||
import java.util.List; | ||
|
||
import javax.swing.text.MutableAttributeSet; | ||
import javax.swing.text.html.HTML; | ||
import javax.swing.text.html.HTMLEditorKit; | ||
|
||
/**
 * A minimal single-threaded web crawler.
 *
 * <p>Seed URLs are queued with {@link #addURL(String)}; {@link #begin()} then
 * drains the queue, fetching each URL, parsing it with the Swing HTML parser
 * and queueing every link it finds. Progress is written to stdout via
 * {@link #log(String)}.
 *
 * <p>This class is not designed for concurrent use: its collections and the
 * shared {@link SimpleDateFormat} are accessed without synchronization.
 */
public class Crawler {

    /** URLs that are waiting to be processed, in discovery order. */
    private List<String> urlWaiting = new ArrayList<String>();
    /** URLs that were fetched and parsed successfully. */
    private List<String> urlProcessed = new ArrayList<String>();
    /** URLs that resulted in an error. */
    private List<String> urlError = new ArrayList<String>();
    /**
     * Every URL that has ever been queued or handed to {@link #processURL(String)}.
     * This also covers the URL currently being parsed and URLs skipped because of
     * their content type, neither of which appears in the three lists above.
     */
    private List<String> urlSeen = new ArrayList<String>();

    /** Number of distinct URLs that were added to the workload. */
    private int numFindUrl = 0;

    // MM = month and HH = 24-hour clock; lower-case mm/hh would print the minute
    // in the month position and an ambiguous 12-hour value.
    private final SimpleDateFormat sFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
    private static final String USER_AGENT = "2016IR201330551365";

    public Crawler() {}

    /**
     * Starts crawling: processes queued URLs until the waiting list is empty,
     * then logs summary counters.
     */
    public void begin() {
        while (!urlWaiting.isEmpty()) {
            processURL(urlWaiting.remove(0));
        }

        log("finish crawling");
        log("the number of urls that were found:" + numFindUrl);
        log("the number of urls that were processed:" + urlProcessed.size());
        log("the number of urls that resulted in an error:" + urlError.size());
    }

    /**
     * Fetches one URL and queues every link found in it.
     *
     * <p>A malformed URL string or an I/O failure is recorded in the error list
     * and logged; it never propagates to the caller. Responses whose content type
     * is present and does not start with {@code text/} are logged and skipped.
     *
     * @param strUrl
     *            The URL to be processed.
     */
    public void processURL(String strUrl) {
        URL url;
        try {
            url = new URL(strUrl);
        } catch (MalformedURLException e) {
            // Handled separately: there is no URL object to report here, so the
            // raw input string is recorded instead.
            markSeen(strUrl);
            urlError.add(strUrl);
            log("Error: " + strUrl);
            return;
        }

        String key = url.toString();
        // Register the page before parsing so that links pointing back at it
        // (including fragment-only links such as "#top") are not queued again.
        markSeen(key);
        log("Processing: " + url);

        InputStream is = null;
        try {
            // get the URL's contents
            URLConnection connection = url.openConnection();
            connection.setRequestProperty("User-Agent", USER_AGENT);

            String contentType = connection.getContentType();
            // regionMatches(ignoreCase=true, ...) is a locale-independent prefix test
            if (contentType != null
                    && !contentType.regionMatches(true, 0, "text/", 0, 5)) {
                log("Not processing because content type is: " + contentType);
                return;
            }

            // read and parse the URL
            is = connection.getInputStream();
            Reader r = new InputStreamReader(is);
            HTMLEditorKit.Parser parse = new HTMLParse().getParser();
            parse.parse(r, new Parser(url), true);
        } catch (IOException e) {
            urlError.add(key);
            log("Error: " + url);
            return;
        } finally {
            closeQuietly(is);
        }

        // mark URL as complete
        urlProcessed.add(key);
        log("Complete: " + url);
    }

    /**
     * Adds a URL to the waiting list unless it is {@code null} or has been seen
     * before (queued, in progress, processed, skipped or failed).
     *
     * @param url
     *            The URL to queue.
     */
    public void addURL(String url) {
        if (url == null || urlSeen.contains(url)) {
            return;
        }
        log("Adding to workload: " + url);
        urlSeen.add(url);
        urlWaiting.add(url);
        numFindUrl++;
    }

    /**
     * Writes a log line to stdout, prefixed with the user agent and a timestamp.
     *
     * @param entry
     *            The information to be written to the log.
     */
    public void log(String entry) {
        String date = sFormat.format(new Date());
        System.out.println(USER_AGENT + " "
                + date + " "
                + entry);
    }

    /** Records {@code key} in the seen list if it is not already there. */
    private void markSeen(String key) {
        if (!urlSeen.contains(key)) {
            urlSeen.add(key);
        }
    }

    /** Closes {@code in} if non-null, ignoring any failure of the close itself. */
    private void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ignored) {
            // The page has already been parsed (or has already failed); a failing
            // close must not change how the URL is classified.
        }
    }

    /** Widens {@link HTMLEditorKit#getParser()} to public so the crawler can call it. */
    protected class HTMLParse extends HTMLEditorKit {
        @Override
        public HTMLEditorKit.Parser getParser() {
            return super.getParser();
        }
    }

    /**
     * A HTML parser callback used by this class to detect links.
     */
    protected class Parser extends HTMLEditorKit.ParserCallback {
        /** URL of the page being parsed; relative links are resolved against it. */
        protected URL base;

        public Parser(URL base) {
            this.base = base;
        }

        /**
         * Extracts a link target from the tag's {@code href} attribute (or
         * {@code src} for {@code <frame>}), strips any fragment, ignores
         * {@code mailto:} links and passes the rest to
         * {@link #handleLink(URL, String)}.
         */
        @Override
        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String href = stringAttribute(a, HTML.Attribute.HREF);

            if ((href == null) && (t == HTML.Tag.FRAME)) {
                href = stringAttribute(a, HTML.Attribute.SRC);
            }

            if (href == null) {
                return;
            }

            int i = href.indexOf('#');
            if (i != -1) {
                href = href.substring(0, i);
            }

            // locale-independent, case-insensitive "mailto:" prefix test
            if (href.regionMatches(true, 0, "mailto:", 0, 7)) {
                return;
            }

            handleLink(base, href);
        }

        /** Start tags are handled the same way as simple tags. */
        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos);
        }

        /**
         * Resolves {@code str} against {@code base} and queues the result;
         * logs the link if it cannot be turned into a URL.
         */
        protected void handleLink(URL base, String str) {
            try {
                URL url = new URL(base, str);
                addURL(url.toString());
            } catch (MalformedURLException e) {
                log("Found malformed URL: " + str);
            }
        }

        /** Returns the attribute value if it is a String, otherwise {@code null}. */
        private String stringAttribute(MutableAttributeSet a, Object name) {
            Object value = a.getAttribute(name);
            return (value instanceof String) ? (String) value : null;
        }
    }

    /**
     * Crawls starting from a fixed seed URL.
     *
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        Crawler crawler = new Crawler();
        crawler.addURL("http://www.scut.edu.cn");
        crawler.begin();
    }
}