Commit

first commit
FreedomZZQ committed Apr 21, 2016
0 parents commit 263fb27
Showing 2 changed files with 227 additions and 0 deletions.
50 changes: 50 additions & 0 deletions .gitignore
@@ -0,0 +1,50 @@
.metadata
bin/
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath
.recommenders

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# PyDev specific (Python IDE for Eclipse)
*.pydevproject

# CDT-specific (C/C++ Development Tooling)
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific (PHP Development Tools)
.buildpath

# sbteclipse plugin
.target

# Tern plugin
.tern-project

# TeXlipse plugin
.texlipse

# STS (Spring Tool Suite)
.springBeans

# Code Recommenders
.recommenders/
177 changes: 177 additions & 0 deletions src/crawler/Crawler.java
@@ -0,0 +1,177 @@
package crawler;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;

public class Crawler {

private List<String> urlWaiting = new ArrayList<String>(); //A list of URLs that are waiting to be processed
private List<String> urlProcessed = new ArrayList<String>(); //A list of URLs that were processed
private List<String> urlError = new ArrayList<String>(); //A list of URLs that resulted in an error

private int numFindUrl = 0; // number of URLs found so far

private final SimpleDateFormat sFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS"); // MM = month, HH = 24-hour clock (lowercase mm/hh would give minutes and 12-hour time)
private static final String USER_AGENT = "2016IR201330551365";
public Crawler() {}


/**
* Start crawling: process URLs until the waiting list is empty.
*/
public void begin() {

while (!urlWaiting.isEmpty()) {
processURL(urlWaiting.remove(0));
}

log("finish crawling");
log("the number of urls that were found:" + numFindUrl);
log("the number of urls that were processed:" + urlProcessed.size());
log("the number of urls that resulted in an error:" + urlError.size());
}

/**
* Called internally to process a URL
*
* @param strUrl
* The URL to be processed.
*/
public void processURL(String strUrl) {
URL url = null;
try {
url = new URL(strUrl);
log("Processing: " + url);
// get the URL's contents
URLConnection connection = url.openConnection();
connection.setRequestProperty("User-Agent", USER_AGENT);

if ((connection.getContentType() != null)
&& !connection.getContentType().toLowerCase()
.startsWith("text/")) {
log("Not processing because content type is: "
+ connection.getContentType());
return;
}

// read and parse the URL; the stream and reader are closed automatically
try (InputStream is = connection.getInputStream();
Reader r = new InputStreamReader(is)) {
HTMLEditorKit.Parser parse = new HTMLParse().getParser();
parse.parse(r, new Parser(url), true);
}
} catch (IOException e) {
// url can still be null here if the URL string was malformed, so report the raw string
urlError.add(strUrl);
log("Error: " + strUrl);
return;
}
// mark URL as complete
urlProcessed.add(url.toString());
log("Complete: " + url);
}

/**
* Add a URL to the waiting list, skipping URLs that have already been seen.
*
* @param url
* The URL to add.
*/
public void addURL(String url) {
if (urlWaiting.contains(url))
return;
if (urlError.contains(url))
return;
if (urlProcessed.contains(url))
return;
log("Adding to workload: " + url);
urlWaiting.add(url);
numFindUrl++;
}

/**
* Called internally to log information. This basic method just writes the
* log entry to stdout.
*
* @param entry
* The information to be written to the log.
*/
public void log(String entry) {
String date = sFormat.format(new Date());
System.out.println(USER_AGENT + " "
+ date + " "
+ entry);
}


/**
* HTMLEditorKit declares getParser() as protected; this subclass exists only
* to expose the default parser to the crawler.
*/
protected class HTMLParse extends HTMLEditorKit {
public HTMLEditorKit.Parser getParser() {
return super.getParser();
}
}

/**
* An HTML parser callback used by this class to detect links.
*/
protected class Parser extends HTMLEditorKit.ParserCallback {
protected URL base;

public Parser(URL base) {
this.base = base;
}

public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
String href = (String) a.getAttribute(HTML.Attribute.HREF);

if ((href == null) && (t == HTML.Tag.FRAME))
href = (String) a.getAttribute(HTML.Attribute.SRC);

if (href == null)
return;

int i = href.indexOf('#');
if (i != -1)
href = href.substring(0, i);

if (href.toLowerCase().startsWith("mailto:"))
return;

handleLink(base, href);
}

public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
handleSimpleTag(t, a, pos); // handle the same way
}

protected void handleLink(URL base, String str) {
try {
URL url = new URL(base, str);
addURL(url.toString());
} catch (MalformedURLException e) {
log("Found malformed URL: " + str);
}
}
}

/**
* @param args
*/
public static void main(String[] args) {
Crawler crawler = new Crawler();
crawler.addURL("http://www.scut.edu.cn");
crawler.begin();
}
}
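
Note (not part of the commit): the Parser callback above relies on javax.swing's HTMLEditorKit parsing events. The sketch below shows the same mechanism in isolation; the class name and the sample HTML string are made up for illustration, and it simply prints every href the parser reports.

import java.io.StringReader;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

// Hypothetical demo class, not part of the commit.
public class LinkCallbackDemo {

    public static void main(String[] args) throws Exception {
        // Made-up HTML used only to exercise the callback.
        String html = "<html><body>"
                + "<a href=\"http://www.scut.edu.cn/index.html\">home</a>"
                + "<a href=\"news/page.html#top\">relative link</a>"
                + "</body></html>";

        // One callback event is fired per tag, which is how Crawler.Parser sees links.
        HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
            @Override
            public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                Object href = a.getAttribute(HTML.Attribute.HREF);
                if (href != null) {
                    System.out.println("found link: " + href);
                }
            }
        };

        // ParserDelegator is the parser implementation that HTMLEditorKit.getParser() returns.
        new ParserDelegator().parse(new StringReader(html), callback, true);
    }
}

In Crawler.Parser the reported href is additionally stripped of its #fragment and resolved against the page's base URL with new URL(base, href) before being added to the waiting list.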
