Skip to content

Commit

Permalink
Added the ability to completely override the object responsible for retrieving HTML content for goose to parse.
Browse files Browse the repository at this point in the history
  • Loading branch information
erraggy committed Oct 13, 2012
1 parent 346cfc8 commit 3286abb
Show file tree
Hide file tree
Showing 10 changed files with 383 additions and 347 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

<groupId>com.gravity</groupId>
<artifactId>goose</artifactId>
<version>2.1.19</version>
<version>2.1.20</version>
<packaging>jar</packaging>
<name>goose</name>
<url>http://maven.apache.org</url>
Expand Down
9 changes: 9 additions & 0 deletions src/main/scala/com/gravity/goose/Configuration.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package com.gravity.goose

import network.{DefaultHtmlFetcher, HtmlFetcher}
import org.jsoup.nodes.Element
import java.util.Date
import reflect.BeanProperty
Expand Down Expand Up @@ -114,5 +115,13 @@ class Configuration {
this.additionalDataExtractor = extractor
}

var htmlFetcher: HtmlFetcher = DefaultHtmlFetcher

/**
 * Replaces the [[HtmlFetcher]] used to retrieve raw HTML for parsing.
 *
 * @param fetcher the fetcher implementation to install; must not be null
 * @throws IllegalArgumentException if `fetcher` is null (via `require`)
 */
def setHtmlFetcher(fetcher: HtmlFetcher) {
  require(fetcher != null, "fetcher MUST NOT be null!")
  htmlFetcher = fetcher
}

def getHtmlFetcher: HtmlFetcher = htmlFetcher

}
14 changes: 6 additions & 8 deletions src/main/scala/com/gravity/goose/Crawler.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ package com.gravity.goose
import cleaners.{StandardDocumentCleaner, DocumentCleaner}
import extractors.ContentExtractor
import images.{UpgradedImageIExtractor, ImageExtractor}
import network.HtmlFetcher
import org.apache.http.client.HttpClient
import org.jsoup.nodes.{Document, Element}
import org.jsoup.Jsoup
Expand All @@ -48,7 +47,7 @@ class Crawler(config: Configuration) {
rawHtml <- getHTML(crawlCandidate, parseCandidate)
doc <- getDocument(parseCandidate.url.toString, rawHtml)
} {
trace("Crawling url: %s".format(parseCandidate.url))
trace("Crawling url: " + parseCandidate.url)

val extractor = getExtractor
val docCleaner = getDocCleaner
Expand Down Expand Up @@ -96,10 +95,9 @@ class Crawler(config: Configuration) {

article.cleanedArticleText = outputFormatter.getFormattedText(article.topNode)
}
case _ => trace("NO ARTICLE FOUND");
case _ => trace("NO ARTICLE FOUND")
}
releaseResources(article)
// self.reply(article)
article
}

Expand All @@ -110,7 +108,7 @@ class Crawler(config: Configuration) {
if (crawlCandidate.rawHTML != null) {
Some(crawlCandidate.rawHTML)
} else {
HtmlFetcher.getHtml(config, parsingCandidate.url.toString) match {
config.getHtmlFetcher.getHtml(config, parsingCandidate.url.toString) match {
case Some(html) => {
Some(html)
}
Expand All @@ -121,7 +119,7 @@ class Crawler(config: Configuration) {


/**
 * Builds the image extractor for the given article.
 *
 * Uses the HTTP client supplied by the configured [[HtmlFetcher]] so that a
 * custom fetcher installed via `Configuration.setHtmlFetcher` is honored for
 * image retrieval as well as page retrieval.
 *
 * NOTE(review): the scraped diff showed both the pre- and post-change `val`
 * lines; this body keeps only the post-change version, which resolves the
 * duplicate conflicting declaration.
 *
 * @param article the article whose images will be extracted
 * @return an [[ImageExtractor]] bound to the article and configuration
 */
def getImageExtractor(article: Article): ImageExtractor = {
  val httpClient: HttpClient = config.getHtmlFetcher.getHttpClient
  new UpgradedImageIExtractor(httpClient, article, config)
}

Expand All @@ -139,7 +137,7 @@ class Crawler(config: Configuration) {
Some(Jsoup.parse(rawlHtml))
} catch {
case e: Exception => {
trace("Unable to parse %s properly into JSoup Doc".format(url))
trace("Unable to parse " + url + " properly into JSoup Doc")
None
}
}
Expand All @@ -153,7 +151,7 @@ class Crawler(config: Configuration) {
* cleans up any temp files we have laying around like temp images
* removes any image in the temp dir that starts with the linkhash of the url we just parsed
*/
def releaseResources(article: Article) = {
def releaseResources(article: Article) {
trace(logPrefix + "STARTING TO RELEASE ALL RESOURCES")

val dir: File = new File(config.localStoragePath)
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/com/gravity/goose/Goose.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

package com.gravity.goose

import network.HtmlFetcher
import network.DefaultHtmlFetcher
import java.io.File

/**
Expand All @@ -45,7 +45,7 @@ class Goose(config: Configuration = new Configuration) {
}

/**
 * Shuts down the shared HTTP connection manager held by the default fetcher.
 *
 * Call once when finished with Goose to release pooled network resources.
 * NOTE(review): the scraped diff showed both the pre-change (`HtmlFetcher`)
 * and post-change (`DefaultHtmlFetcher`) statements; this body keeps only the
 * post-change call, removing the duplicated shutdown line. Only the default
 * fetcher's client is closed here — a custom fetcher installed on a
 * Configuration must manage its own client lifecycle (TODO confirm intent).
 */
def shutdownNetwork() {
  DefaultHtmlFetcher.getHttpClient.getConnectionManager.shutdown()
}

def sendToActor(crawlCandidate: CrawlCandidate) = {
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/com/gravity/goose/images/ImageSaver.scala
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import java.io._
import java.util.Random
import com.gravity.goose.utils.Logging
import com.gravity.goose.Configuration
import com.gravity.goose.network.HtmlFetcher
import com.gravity.goose.network.DefaultHtmlFetcher

/**
* This class will be responsible for storing images to disk
Expand Down Expand Up @@ -87,7 +87,7 @@ object ImageSaver extends Logging {
def fetchEntity(httpClient: HttpClient, imageSrc: String): Option[HttpEntity] = {

val localContext: HttpContext = new BasicHttpContext
localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
localContext.setAttribute(ClientContext.COOKIE_STORE, DefaultHtmlFetcher.emptyCookieStore)
val httpget = new HttpGet(imageSrc)
val response = httpClient.execute(httpget, localContext)
val respStatus: String = response.getStatusLine.toString
Expand Down
10 changes: 5 additions & 5 deletions src/main/scala/com/gravity/goose/images/ImageUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ import com.gravity.goose.Configuration
import com.gravity.goose.text.HashUtils
import org.apache.http.util.EntityUtils
import org.apache.commons.io.IOUtils
import com.gravity.goose.network.{ImageFetchException, HtmlFetcher}
import com.gravity.goose.network.{ImageFetchException, DefaultHtmlFetcher}

object ImageUtils extends Logging {
/**
Expand Down Expand Up @@ -169,7 +169,7 @@ object ImageUtils extends Logging {
}

trace("Not found locally...starting to download image: " + imageSrc)
fetchEntity(httpClient, imageSrc) match {
fetchEntity(httpClient, imageSrc, config) match {
case Some(entity) => {
trace("Got entity for %s".format(imageSrc))
writeEntityContentsToDisk(entity, linkhash, imageSrc, config) match {
Expand Down Expand Up @@ -259,13 +259,13 @@ object ImageUtils extends Logging {
imgSrc.replace(" ", "%20")
}

def fetchEntity(httpClient: HttpClient, imageSrc: String): Option[HttpEntity] = {
def fetchEntity(httpClient: HttpClient, imageSrc: String, config: Configuration): Option[HttpEntity] = {

val localContext: HttpContext = new BasicHttpContext
localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
localContext.setAttribute(ClientContext.COOKIE_STORE, DefaultHtmlFetcher.emptyCookieStore)
val httpget = new HttpGet(imageSrc)
val response = try {
HtmlFetcher.getHttpClient.execute(httpget, localContext)
config.getHtmlFetcher.getHttpClient.execute(httpget, localContext)
}
catch {
case ex: Exception => throw new ImageFetchException(imageSrc, ex)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ import collection.mutable.HashMap
import scala.collection.JavaConversions._
import com.gravity.goose.text.string
import java.net.{MalformedURLException, URL}
import com.gravity.goose.network.HtmlFetcher
import com.gravity.goose.network.DefaultHtmlFetcher
import java.io.{IOException, File}
import java.util.regex.{Pattern, Matcher}
import org.apache.http.client.methods.HttpGet
Expand Down Expand Up @@ -473,7 +473,7 @@ class StandardImageExtractor(httpClient: HttpClient, article: Article, config: C
var link: String = this.buildImagePath(src)
link = link.replace(" ", "%20")
val localContext: HttpContext = new BasicHttpContext
localContext.setAttribute(ClientContext.COOKIE_STORE, HtmlFetcher.emptyCookieStore)
localContext.setAttribute(ClientContext.COOKIE_STORE, DefaultHtmlFetcher.emptyCookieStore)
httpget = new HttpGet(link)
var response: HttpResponse = null
response = httpClient.execute(httpget, localContext)
Expand Down
Loading

0 comments on commit 3286abb

Please sign in to comment.