Skip to content

Commit

Permalink
add FileUtils and some Constants
Browse files Browse the repository at this point in the history
  • Loading branch information
FreedomZZQ committed Apr 22, 2016
1 parent e9a7147 commit 024e3d4
Show file tree
Hide file tree
Showing 3 changed files with 198 additions and 19 deletions.
57 changes: 38 additions & 19 deletions src/crawler/Crawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;

import static utils.Constants.*;

public class Crawler {

private List<String> urlWaiting = new ArrayList<String>(); //A list of URLs that are waiting to be processed
Expand All @@ -24,8 +26,8 @@ public class Crawler {

private int numFindUrl = 0; //find the number of url

private final SimpleDateFormat sFormat = new SimpleDateFormat("yyyy-mm-dd hh:mm:ss.SSS");
private static final String USER_AGENT = "2016IR201330551365";
private final SimpleDateFormat sFormat = new SimpleDateFormat(DATE_FORMAT);

public Crawler() {}


Expand All @@ -38,10 +40,10 @@ public void begin() {
processURL(urlWaiting.remove(0));
}

log("finish crawling");
log("the number of urls that were found:" + numFindUrl);
log("the number of urls that were processed:" + urlProcessed.size());
log("the number of urls that resulted in an error:" + urlError.size());
// log("finish crawling");
// log("the number of urls that were found:" + numFindUrl);
// log("the number of urls that were processed:" + urlProcessed.size());
// log("the number of urls that resulted in an error:" + urlError.size());
}

/**
Expand All @@ -54,33 +56,40 @@ public void processURL(String strUrl) {
URL url = null;
try {
url = new URL(strUrl);
log("Processing: " + url);
//log("Processing: " + url);

// get the URL's contents
URLConnection connection = url.openConnection();
connection.setRequestProperty("User-Agent", USER_AGENT);

if ((connection.getContentType() != null)
&& !connection.getContentType().toLowerCase()
.startsWith("text/")) {
log("Not processing because content type is: "
+ connection.getContentType());
// log("Not processing because content type is: "
// + connection.getContentType());
log(TYPE_CONNECTING, url.toString(), TAG_ERROR);
return;
}
log(TYPE_CONNECTING, url.toString(), TAG_SUCCESS);

// read the URL
InputStream is = connection.getInputStream();
Reader r = new InputStreamReader(is);
log(TYPE_FETCHING, url.toString(), TAG_SUCCESS);

// parse the URL
HTMLEditorKit.Parser parse = new HTMLParse().getParser();
parse.parse(r, new Parser(url), true);
log(TYPE_PARSING, url.toString(), TAG_SUCCESS);
} catch (IOException e) {
urlError.add(url.toString());
log("Error: " + url);
return;
//log("Error: " + url);
log(TYPE_FETCHING, url.toString(), TAG_ERROR);
return;
}
// mark URL as complete
urlProcessed.add(url.toString());
log("Complete: " + url);
//log("Complete: " + url);
}

/**
Expand All @@ -95,7 +104,7 @@ public void addURL(String url) {
return;
if (urlProcessed.contains(url))
return;
log("Adding to workload: " + url);
//log("Adding to workload: " + url);
urlWaiting.add(url);
numFindUrl++;
}
Expand All @@ -107,11 +116,20 @@ public void addURL(String url) {
* @param entry
* The information to be written to the log.
*/
public void log(String entry) {
// public void log(String entry) {
// String date = sFormat.format(new Date());
// System.out.println(USER_AGENT + " "
// + date + " "
// + entry);
// }

public void log(String type, String url, String tag){
String date = sFormat.format(new Date());
System.out.println(USER_AGENT + " "
+ date + " "
+ entry);
System.out.println(USER_AGENT + " "
+ date + " "
+ type + " "
+ url + " "
+ tag);
}


Expand Down Expand Up @@ -161,7 +179,8 @@ protected void handleLink(URL base, String str) {
URL url = new URL(base, str);
addURL(url.toString());
} catch (MalformedURLException e) {
log("Found malformed URL: " + str);
//log("Found malformed URL: " + str);

}
}
}
Expand All @@ -171,7 +190,7 @@ protected void handleLink(URL base, String str) {
*/
public static void main(String[] args) {
Crawler crawler = new Crawler();
crawler.addURL("http://www.scut.edu.cn");
crawler.addURL(SEED_URL);
crawler.begin();
}
}
16 changes: 16 additions & 0 deletions src/utils/Constants.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package utils;

/**
 * Shared string constants for the crawler.
 *
 * <p>Intended to be used through a static import
 * ({@code import static utils.Constants.*;}); the class is abstract so it is
 * never instantiated.
 *
 * <p>Created by ZQ on 2016/4/22.
 */
public abstract class Constants {
    /** Value sent as the HTTP {@code User-Agent} header and used as the prefix of every log line. */
    public static final String USER_AGENT = "2016IR201330551365";

    /** First URL handed to the crawler. */
    public static final String SEED_URL = "http://www.scut.edu.cn";

    /**
     * {@link java.text.SimpleDateFormat} pattern for log timestamps.
     *
     * <p>{@code MM} is month-of-year and {@code HH} is the 24-hour clock. The
     * lowercase forms mean minute-of-hour and 12-hour clock respectively, which
     * would print the minutes in the month position and make afternoon and
     * morning timestamps indistinguishable.
     */
    public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss.SSS";

    /** Log "type" field: opening the connection and checking the content type. */
    public static final String TYPE_CONNECTING = "Connecting";

    /** Log "type" field: reading the response body. */
    public static final String TYPE_FETCHING = "Fetching";

    /** Log "type" field: parsing the fetched document. */
    public static final String TYPE_PARSING = "Parsing";

    /** Log "tag" field: the step completed. */
    public static final String TAG_SUCCESS = "Successful";

    /** Log "tag" field: the step failed or was rejected. */
    public static final String TAG_ERROR = "Error";

    /** Name of the log file. NOTE(review): not referenced by the code visible here - confirm usage. */
    public static final String LOG_FILE_NAME = "IR201330551365LOG";
}
144 changes: 144 additions & 0 deletions src/utils/FileUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
package utils;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * Static helpers for whole-file operations: recursive delete, copy, and
 * reading/writing a file's entire contents as bytes or as text in a named
 * charset.
 *
 * <p>Every stream opened here is closed in a {@code finally} block, so a
 * failed read or write does not leak a file handle.
 */
public class FileUtils {

    /** Size in bytes of the buffer used when streaming file contents. */
    private static final int BUFFER_SIZE = 2048;

    /**
     * Deletes {@code dir} and everything beneath it.
     *
     * <p>If {@code dir} is a regular file, or its contents cannot be listed,
     * only {@code dir} itself is deleted. Individual delete failures are not
     * reported (best effort, as before).
     *
     * <p>NOTE(review): entries that are not regular files are recursed into,
     * so a symbolic link to a directory is presumably followed - confirm this
     * is acceptable for the callers.
     *
     * @param dir the directory (or file) to remove
     */
    public static void deleteDir(File dir) {
        File[] children = dir.listFiles();
        // listFiles() returns null when dir is not a directory or an I/O error occurs.
        if (children != null) {
            for (File child : children) {
                if (child.isFile()) {
                    child.delete();
                } else {
                    deleteDir(child);
                }
            }
        }
        dir.delete();
    }

    /**
     * Copies the contents of {@code origin} to {@code newfile}, creating the
     * destination's parent directories if they do not exist.
     *
     * @param origin  the file to read
     * @param newfile the file to create or overwrite
     * @throws FileNotFoundException if either file cannot be opened
     * @throws IOException           if reading or writing fails
     */
    public static void copy(File origin, File newfile) throws FileNotFoundException, IOException {
        ensureParentExists(newfile);
        FileInputStream in = new FileInputStream(origin);
        try {
            FileOutputStream out = new FileOutputStream(newfile);
            try {
                byte[] buf = new byte[BUFFER_SIZE];
                int read;
                while ((read = in.read(buf)) != -1) {
                    out.write(buf, 0, read);
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }

    /**
     * Writes {@code contentStr}, encoded with {@code charset}, to the named file.
     *
     * @param fileName   path of the file to create or overwrite
     * @param contentStr text to write
     * @param charset    name of the charset used to encode the text
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the charset is unsupported or the write fails
     */
    public static void writeFile(String fileName, String contentStr, String charset) throws FileNotFoundException, IOException {
        // Encode first so an unsupported charset fails before the file is truncated.
        writeFile(new File(fileName), contentStr.getBytes(charset));
    }

    /**
     * Writes {@code contentStr}, encoded with {@code charset}, to {@code file}.
     *
     * @param file       the file to create or overwrite
     * @param contentStr text to write
     * @param charset    name of the charset used to encode the text
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the charset is unsupported or the write fails
     */
    public static void writeFile(File file, String contentStr, String charset) throws FileNotFoundException, IOException {
        writeFile(file, contentStr.getBytes(charset));
    }

    /**
     * Like {@link #writeFile(String, String, String)}, but first creates any
     * missing parent directories of the target.
     *
     * @param fileName   path of the file to create or overwrite
     * @param contentStr text to write
     * @param charset    name of the charset used to encode the text
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the charset is unsupported or the write fails
     */
    public static void writeFileWithParent(String fileName, String contentStr, String charset) throws FileNotFoundException, IOException {
        writeFileWithParent(new File(fileName), contentStr.getBytes(charset));
    }

    /**
     * Like {@link #writeFile(File, String, String)}, but first creates any
     * missing parent directories of the target.
     *
     * @param file       the file to create or overwrite
     * @param contentStr text to write
     * @param charset    name of the charset used to encode the text
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the charset is unsupported or the write fails
     */
    public static void writeFileWithParent(File file, String contentStr, String charset) throws FileNotFoundException, IOException {
        writeFileWithParent(file, contentStr.getBytes(charset));
    }

    /**
     * Writes {@code content} to the named file, replacing any existing contents.
     *
     * @param fileName path of the file to create or overwrite
     * @param content  bytes to write
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the write fails
     */
    public static void writeFile(String fileName, byte[] content) throws FileNotFoundException, IOException {
        writeFile(new File(fileName), content);
    }

    /**
     * Writes {@code content} to {@code file}, replacing any existing contents.
     * All other write methods in this class delegate here.
     *
     * @param file    the file to create or overwrite
     * @param content bytes to write
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the write fails
     */
    public static void writeFile(File file, byte[] content) throws FileNotFoundException, IOException {
        FileOutputStream out = new FileOutputStream(file);
        try {
            out.write(content);
        } finally {
            out.close();
        }
    }

    /**
     * Like {@link #writeFile(String, byte[])}, but first creates any missing
     * parent directories of the target.
     *
     * @param fileName path of the file to create or overwrite
     * @param content  bytes to write
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the write fails
     */
    public static void writeFileWithParent(String fileName, byte[] content) throws FileNotFoundException, IOException {
        writeFileWithParent(new File(fileName), content);
    }

    /**
     * Like {@link #writeFile(File, byte[])}, but first creates any missing
     * parent directories of the target.
     *
     * @param file    the file to create or overwrite
     * @param content bytes to write
     * @throws FileNotFoundException if the file cannot be opened for writing
     * @throws IOException           if the write fails
     */
    public static void writeFileWithParent(File file, byte[] content) throws FileNotFoundException, IOException {
        ensureParentExists(file);
        writeFile(file, content);
    }

    /**
     * Reads the entire contents of {@code file} into memory.
     *
     * @param file the file to read
     * @return the file's bytes
     * @throws IOException if the file cannot be opened or read
     */
    public static byte[] readFile(File file) throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buf = new byte[BUFFER_SIZE];
            int read;
            while ((read = in.read(buf)) != -1) {
                collected.write(buf, 0, read);
            }
            return collected.toByteArray();
        } finally {
            in.close();
        }
    }

    /**
     * Reads the entire contents of the named file into memory.
     *
     * @param fileName path of the file to read
     * @return the file's bytes
     * @throws IOException if the file cannot be opened or read
     */
    public static byte[] readFile(String fileName) throws IOException {
        return readFile(new File(fileName));
    }

    /**
     * Reads the entire contents of {@code file} and decodes it with {@code charset}.
     *
     * @param file    the file to read
     * @param charset name of the charset used to decode the bytes
     * @return the decoded text
     * @throws Exception if the file cannot be read or the charset is unsupported
     *                   (declared broadly to stay compatible with existing callers)
     */
    public static String readFile(File file, String charset) throws Exception {
        return new String(readFile(file), charset);
    }

    /**
     * Reads the entire contents of the named file and decodes it with {@code charset}.
     *
     * @param fileName path of the file to read
     * @param charset  name of the charset used to decode the bytes
     * @return the decoded text
     * @throws Exception if the file cannot be read or the charset is unsupported
     */
    public static String readFile(String fileName, String charset) throws Exception {
        return readFile(new File(fileName), charset);
    }

    /**
     * Creates the parent directories of {@code file} when they are missing.
     * A path with no parent component (a bare file name) needs nothing created.
     */
    private static void ensureParentExists(File file) {
        File parent = file.getParentFile();
        if (parent != null && !parent.exists()) {
            // Result intentionally unchecked: if creation failed, opening the
            // output stream right after reports the problem as FileNotFoundException.
            parent.mkdirs();
        }
    }
}

0 comments on commit 024e3d4

Please sign in to comment.