Skip to content

Commit

Permalink
add type check
Browse files (browse the repository at this point in the history)
  • Loading branch information
FreedomZZQ committed Apr 23, 2016
1 parent d69c3ed commit c794f79
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 35 deletions.
71 changes: 36 additions & 35 deletions src/crawler/Crawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,10 @@ public void processURL(String strUrl) {
URLConnection connection = url.openConnection();
connection.setRequestProperty("User-Agent", USER_AGENT);

// Check the content type reported for the URL
if ((connection.getContentType() != null)
&& !connection.getContentType().toLowerCase()
.startsWith("text/")) {
.startsWith(CONTENT_TYPE)) {
// log("Not processing because content type is: "
// + connection.getContentType());
log(TYPE_CONNECTING, url.toString(), TAG_ERROR);
Expand Down Expand Up @@ -153,51 +154,51 @@ public HTMLEditorKit.Parser getParser() {
}
}

/**
* An HTML parser callback used by this class to detect links
*
*/
protected class Parser extends HTMLEditorKit.ParserCallback {
protected URL base;
/**
* An HTML parser callback used by this class to detect links
*
*/
protected class Parser extends HTMLEditorKit.ParserCallback {
protected URL base;

public Parser(URL base) {
this.base = base;
}
public Parser(URL base) {
this.base = base;
}

public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
String href = (String) a.getAttribute(HTML.Attribute.HREF);
public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
String href = (String) a.getAttribute(HTML.Attribute.HREF);

if ((href == null) && (t == HTML.Tag.FRAME))
href = (String) a.getAttribute(HTML.Attribute.SRC);
if ((href == null) && (t == HTML.Tag.FRAME))
href = (String) a.getAttribute(HTML.Attribute.SRC);

if (href == null)
return;
if (href == null)
return;

int i = href.indexOf('#');
if (i != -1)
href = href.substring(0, i);
int i = href.indexOf('#');
if (i != -1)
href = href.substring(0, i);

if (href.toLowerCase().startsWith("mailto:"))
return;
if (href.toLowerCase().startsWith("mailto:"))
return;

handleLink(base, href);
}
handleLink(base, href);
}

public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
handleSimpleTag(t, a, pos); // handle the same way
public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
handleSimpleTag(t, a, pos); // handle the same way

}
}

protected void handleLink(URL base, String str) {
try {
URL url = new URL(base, str);
addURL(url.toString());
} catch (MalformedURLException e) {
//log("Found malformed URL: " + str);
protected void handleLink(URL base, String str) {
try {
URL url = new URL(base, str);
addURL(url.toString());
} catch (MalformedURLException e) {
//log("Found malformed URL: " + str);

}
}
}
}
}
}

/**
* @param args
Expand Down
1 change: 1 addition & 0 deletions src/utils/Constants.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ public abstract class Constants {
public static final String TAG_SUCCESS = "Successful";
public static final String TAG_ERROR = "Error";
public static final String LOG_FILE_NAME = "IR201330551365LOG.txt";
public static final String CONTENT_TYPE = "text/";
}

0 comments on commit c794f79

Please sign in to comment.