first commit

FreedomZZQ · Apr 21, 2016 · 263fb27 · 263fb27
commit 263fb27
Show file tree

Hide file tree

Showing 2 changed files with 227 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,50 @@
+.metadata
+bin/
+tmp/
+*.tmp
+*.bak
+*.swp
+*~.nib
+local.properties
+.settings/
+.loadpath
+.recommenders
+
+# Eclipse Core
+.project
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# PyDev specific (Python IDE for Eclipse)
+*.pydevproject
+
+# CDT-specific (C/C++ Development Tooling)
+.cproject
+
+# JDT-specific (Eclipse Java Development Tools)
+.classpath
+
+# Java annotation processor (APT)
+.factorypath
+
+# PDT-specific (PHP Development Tools)
+.buildpath
+
+# sbteclipse plugin
+.target
+
+# Tern plugin
+.tern-project
+
+# TeXlipse plugin
+.texlipse
+
+# STS (Spring Tool Suite)
+.springBeans
+
+# Code Recommenders
+.recommenders/
diff --git a/src/crawler/Crawler.java b/src/crawler/Crawler.java
@@ -0,0 +1,177 @@
+package crawler;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.net.HttpURLConnection;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLConnection;
+import java.text.SimpleDateFormat;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Queue;
+import java.util.Set;
+
+import javax.swing.text.MutableAttributeSet;
+import javax.swing.text.html.HTML;
+import javax.swing.text.html.HTMLEditorKit;
+
+/**
+ * A minimal breadth-first web crawler.
+ * <p>
+ * URLs are seeded with {@link #addURL(String)} and crawled by {@link #begin()},
+ * which fetches each queued URL, extracts the links of text documents with the
+ * Swing HTML parser and queues every link that has not been seen before.
+ * Progress is written to stdout through {@link #log(String)}.
+ * <p>
+ * Instances are not thread-safe: the bookkeeping collections are
+ * unsynchronized and the shared {@link SimpleDateFormat} must not be used
+ * concurrently.
+ */
+public class Crawler {
+
+	/** Sent as the User-Agent request header and prefixed to every log line. */
+	private static final String USER_AGENT = "2016IR201330551365";
+
+	/** Connect and read timeout in milliseconds, so one unresponsive host cannot stall the crawl. */
+	private static final int TIMEOUT_MILLIS = 10000;
+
+	// URLs waiting to be processed, in the order they were discovered
+	private final Queue<String> urlWaiting = new ArrayDeque<String>();
+	// URLs that were fetched and parsed without an I/O error
+	private final List<String> urlProcessed = new ArrayList<String>();
+	// URLs that resulted in an error
+	private final List<String> urlError = new ArrayList<String>();
+	// Every URL that was ever queued or processed. Unlike the three collections
+	// above it also covers the URL currently being processed and URLs skipped
+	// because of their content type, so neither of those is fetched twice.
+	private final Set<String> urlSeen = new HashSet<String>();
+
+	private int numFindUrl = 0; // number of distinct URLs added to the workload
+
+	// MM is the month and HH the 24-hour clock; lower-case mm/hh would print
+	// minutes in the month position and an ambiguous 12-hour value.
+	private final SimpleDateFormat sFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS");
+
+	public Crawler() {}
+
+	/**
+	 * Starts crawling: processes queued URLs in FIFO order until the queue is
+	 * empty, then logs a summary of the found, processed and failed URL counts.
+	 */
+	public void begin() {
+		String next;
+		while ((next = urlWaiting.poll()) != null) {
+			processURL(next);
+		}
+
+		log("finish crawling");
+		log("the number of urls that were found:" + numFindUrl);
+		log("the number of urls that were processed:" + urlProcessed.size());
+		log("the number of urls that resulted in an error:" + urlError.size());
+	}
+
+	/**
+	 * Fetches one URL and queues the links found in it.
+	 * <p>
+	 * Responses whose content type is known and does not start with
+	 * {@code text/} are skipped. A malformed URL or any I/O failure is recorded
+	 * in the error list and logged; it is never propagated to the caller.
+	 * 
+	 * @param strUrl
+	 *            The URL to be processed.
+	 */
+	public void processURL(String strUrl) {
+		// Until the string has been parsed, the raw text is the only label we
+		// have for logging and bookkeeping (the parsed URL may not exist at all).
+		String label = strUrl;
+		urlSeen.add(strUrl);
+		InputStream is = null;
+		try {
+			URL url = new URL(strUrl);
+			label = url.toString();
+			urlSeen.add(label);
+			log("Processing: " + label);
+
+			// get the URL's contents
+			URLConnection connection = url.openConnection();
+			connection.setRequestProperty("User-Agent", USER_AGENT);
+			connection.setConnectTimeout(TIMEOUT_MILLIS);
+			connection.setReadTimeout(TIMEOUT_MILLIS);
+
+			String contentType = connection.getContentType();
+			if (contentType != null && !startsWithIgnoreCase(contentType, "text/")) {
+				log("Not processing because content type is: " + contentType);
+				// The body is never read on this path, so release the connection explicitly.
+				if (connection instanceof HttpURLConnection) {
+					((HttpURLConnection) connection).disconnect();
+				}
+				return;
+			}
+
+			// read and parse the document; links are reported to the callback
+			is = connection.getInputStream();
+			Reader reader = new InputStreamReader(is);
+			HTMLEditorKit.Parser htmlParser = new HTMLParse().getParser();
+			htmlParser.parse(reader, new Parser(url), true);
+		} catch (IOException e) {
+			// MalformedURLException is an IOException too, so this also covers
+			// the case where no URL object could be created.
+			urlError.add(label);
+			log("Error: " + label + " (" + e + ")");
+			return;
+		} finally {
+			if (is != null) {
+				try {
+					is.close();
+				} catch (IOException ignored) {
+					// nothing useful can be done if closing fails
+				}
+			}
+		}
+		// mark URL as complete
+		urlProcessed.add(label);
+		log("Complete: " + label);
+	}
+
+	/**
+	 * Adds a URL to the waiting list unless it was already queued or processed.
+	 * A {@code null} argument is ignored.
+	 * 
+	 * @param url
+	 *            The URL to queue.
+	 */
+	public void addURL(String url) {
+		if (url == null) {
+			return;
+		}
+		// Set.add returns false when the URL is already known.
+		if (!urlSeen.add(url)) {
+			return;
+		}
+		log("Adding to workload: " + url);
+		urlWaiting.add(url);
+		numFindUrl++;
+	}
+
+	/**
+	 * Writes one log line to stdout, prefixed with the user agent string and
+	 * the current timestamp.
+	 * 
+	 * @param entry
+	 *            The information to be written to the log.
+	 */
+	public void log(String entry) {
+		String date = sFormat.format(new Date());
+		System.out.println(USER_AGENT + " " + date + " " + entry);
+	}
+
+	/**
+	 * Locale-independent, case-insensitive prefix test.
+	 * {@code toLowerCase()} depends on the default locale and would, for
+	 * example, fail to recognise "MAILTO:" under a Turkish locale.
+	 */
+	private static boolean startsWithIgnoreCase(String text, String prefix) {
+		return text.regionMatches(true, 0, prefix, 0, prefix.length());
+	}
+
+	/**
+	 * Exposes the HTML parser of {@link HTMLEditorKit} through a public method.
+	 */
+	protected class HTMLParse extends HTMLEditorKit {
+		@Override
+		public HTMLEditorKit.Parser getParser() {
+			return super.getParser();
+		}
+	}
+
+	/**
+	 * A HTML parser callback used by this class to detect links
+	 * 
+	 */
+	protected class Parser extends HTMLEditorKit.ParserCallback {
+		/** URL of the document being parsed; relative links are resolved against it. */
+		protected URL base;
+
+		public Parser(URL base) {
+			this.base = base;
+		}
+
+		/**
+		 * Extracts a link from the tag's HREF attribute (or SRC for FRAME
+		 * tags), strips the fragment, ignores mailto links and hands the rest
+		 * to {@link #handleLink(URL, String)}.
+		 */
+		@Override
+		public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
+			String href = (String) a.getAttribute(HTML.Attribute.HREF);
+
+			if ((href == null) && (t == HTML.Tag.FRAME)) {
+				href = (String) a.getAttribute(HTML.Attribute.SRC);
+			}
+
+			if (href == null) {
+				return;
+			}
+
+			// the fragment only addresses a position inside the same document
+			int hash = href.indexOf('#');
+			if (hash != -1) {
+				href = href.substring(0, hash);
+			}
+
+			if (startsWithIgnoreCase(href, "mailto:")) {
+				return;
+			}
+
+			handleLink(base, href);
+		}
+
+		/** Start tags are handled the same way as simple tags. */
+		@Override
+		public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
+			handleSimpleTag(t, a, pos);
+		}
+
+		/**
+		 * Resolves a link against the base URL and queues it.
+		 * <p>
+		 * Only http/https targets, or targets that keep the scheme of the base
+		 * document, are followed. Page content is untrusted and must not be
+		 * able to redirect the crawler to another scheme such as file:.
+		 * Links that cannot be resolved are logged and skipped.
+		 */
+		protected void handleLink(URL base, String str) {
+			try {
+				URL url = new URL(base, str);
+				String protocol = url.getProtocol();
+				boolean web = "http".equalsIgnoreCase(protocol)
+						|| "https".equalsIgnoreCase(protocol);
+				boolean sameAsBase = (base != null)
+						&& protocol.equalsIgnoreCase(base.getProtocol());
+				if (!web && !sameAsBase) {
+					return;
+				}
+				addURL(url.toString());
+			} catch (MalformedURLException e) {
+				log("Found malformed URL: " + str);
+			}
+		}
+	}
+
+	/**
+	 * Crawls starting from a fixed seed URL.
+	 * 
+	 * @param args
+	 *            ignored
+	 */
+	public static void main(String[] args) {
+		Crawler crawler = new Crawler();
+		crawler.addURL("http://www.scut.edu.cn");
+		crawler.begin();
+	}
+}