Skip to content

Commit

Permalink
MEDIUM - removing Akka Actor as primary crawl item, will come back to…
Browse files Browse the repository at this point in the history
… refactoring this to a load balanced actor pool for higher throughput
  • Loading branch information
jiminoc committed Sep 24, 2011
1 parent e15ae4c commit b546c4d
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,31 +38,18 @@ import utils.{ParsingCandidate, URLHelper, Logging}

case class CrawlCandidate(config: Configuration, url: String, rawHTML: String = null)

class CrawlingActor extends Actor with Logging {
class Crawler(config: Configuration) extends Logging {

val logPrefix = "crawler: "

var config: Configuration = null

def receive = {
case cc: CrawlCandidate => {
config = cc.config
crawl(cc)
}
case _ => throw new Exception("unknown message sent to actor")
}


def crawl(crawlCandidate: CrawlCandidate) = {
def crawl(crawlCandidate: CrawlCandidate): Article = {
val article = new Article()
for {
parseCandidate <- URLHelper.getCleanedUrl(crawlCandidate.url)
rawHtml <- getHTML(crawlCandidate, parseCandidate)
doc <- getDocument(parseCandidate.url.toString, rawHtml)
} {

trace("Crawling url: %s".format(parseCandidate.url))

val extractor = getExtractor
val docCleaner = getDocCleaner
val outputFormatter = getOutputFormatter
Expand All @@ -81,10 +68,11 @@ class CrawlingActor extends Actor with Logging {
article.canonicalLink = extractor.getCanonicalLink(article)
article.domain = extractor.getDomain(article.finalUrl)
article.tags = extractor.extractTags(article)

// before we do any calcs on the body itself let's clean up the document
article.doc = docCleaner.clean(article)



extractor.calculateBestNodeBasedOnClustering(article) match {
case Some(node: Element) => {
article.topNode = node
Expand All @@ -103,13 +91,19 @@ class CrawlingActor extends Actor with Logging {
}
article.topNode = extractor.postExtractionCleanup(article.topNode)




article.cleanedArticleText = outputFormatter.getFormattedText(article.topNode)
}
case _ => trace("NO ARTICLE FOUND");
}
releaseResources(article)
self.reply(article)
// self.reply(article)
article
}

article
}

def getHTML(crawlCandidate: CrawlCandidate, parsingCandidate: ParsingCandidate): Option[String] = {
Expand Down
18 changes: 5 additions & 13 deletions src/main/scala/com/gravity/goose/Goose.scala
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,9 @@ class Goose(config: Configuration = new Configuration) extends Logging {
}

def sendToActor(crawlCandidate: CrawlCandidate) = {
val result = crawlingActor !! crawlCandidate
result match {
case Some(article) => {
debug("Got a result of type: {0} from URL: {1}", article.asInstanceOf[AnyRef].getClass.getCanonicalName, crawlCandidate.url)
article.asInstanceOf[Article]
}
case _ => {
debug("DID NOT get a result back from URL: {0}", crawlCandidate.url)
null
}
}
val crawler = new Crawler(config)
val article = crawler.crawl(crawlCandidate)
article
}

def initializeEnvironment() {
Expand Down Expand Up @@ -94,7 +86,7 @@ object Goose {
val logPrefix = "goose: "

// create the crawling actor that will accept bulk crawls
val crawlingActor = Actor.actorOf[CrawlingActor]
crawlingActor.start()
// val crawlingActor = Actor.actorOf[CrawlingActor]
// crawlingActor.start()

}
27 changes: 23 additions & 4 deletions src/main/scala/com/gravity/goose/images/ImageUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ import java.util.{Random, ArrayList, HashMap}
import java.io._
import com.gravity.goose.Configuration
import com.gravity.goose.text.HashUtils
import org.apache.http.util.EntityUtils
import org.apache.commons.io.IOUtils

object ImageUtils extends Logging {
/**
Expand Down Expand Up @@ -169,10 +171,10 @@ object ImageUtils extends Logging {
trace("Not found locally...starting to download image: " + imageSrc)
fetchEntity(httpClient, imageSrc) match {
case Some(entity) => {
trace("Got entity for")
trace("Got entity for %s".format(imageSrc))
writeEntityContentsToDisk(entity, linkhash, imageSrc, config) match {
case Some(locallyStoredImage) => Some(locallyStoredImage)
case None => None
case Some(locallyStoredImage) => trace("Img Write successfull to disk"); Some(locallyStoredImage)
case None => trace("Unable to write contents to disk: %s".format(imageSrc)); None
}
}
case None => trace("Unable to fetch entity for: " + imageSrc); None
Expand Down Expand Up @@ -224,7 +226,24 @@ object ImageUtils extends Logging {

val localSrcPath = getLocalFileName(linkhash, imageSrc, config)
val outstream: OutputStream = new FileOutputStream(localSrcPath)
entity.writeTo(outstream)
val instream: InputStream = entity.getContent
trace("Content Length: " + entity.getContentLength)
try {
val fileCopyBytes = IOUtils.copy(instream, outstream)
trace("%d bytes copied to disk".format(fileCopyBytes))
} catch {
case e: Exception => info(e, e.toString)
} finally {
try {
outstream.flush()
outstream.close()
instream.close()
} catch {
case e: Exception => info(e, e.toString)
}
}
// entity.writeTo(outstream)
EntityUtils.consume(entity)
trace("Content Length: " + entity.getContentLength)
readExistingFileInfo(linkhash, imageSrc, config)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class UpgradedImageIExtractor(httpClient: HttpClient, article: Article, config:
/**
* The minimum size, in bytes, that an image must have for us to accept it
*/
private val minBytesForImages: Int = 0
private val minBytesForImages: Int = 4000
/**
* location to store temporary image files if need be
*/
Expand Down
57 changes: 47 additions & 10 deletions src/test/scala/com/gravity/goose/GoldSitesTestIT.scala
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,18 @@ class GoldSitesTestIT {

}

@Test
def businessWeek3() {
  // Gold-site check against a Business Insider article (note: the URL is
  // businessinsider.com even though the test is named "businessWeek"):
  // asserts the start of the extracted text and the chosen image.
  implicit val config = TestUtils.DEFAULT_CONFIG

  val expectedLead = "Not everyone's a fan of Ben & Jerry's new \"Schweddy Balls\" -- the Saturday Night Live-inspired flavor it rolled out a few weeks ago"
  val expectedLeadImage = "http://static7.businessinsider.com/image/4e68c8c36bb3f7d80a000016/conservative-moms-are-now-calling-for-a-boycott-of-ben-and-jerrys-schweddy-balls-flavor.jpg"

  val extracted: Article = TestUtils.getArticle(
    "http://www.businessinsider.com/ben-and-jerrys-schweddy-balls-one-million-moms-american-family-association-boycott-2011-9")
  // Debugging aid, kept disabled:
  // if (extracted == null) println("NULL ARTICLE!") else println("TEXT: \n" + extracted.cleanedArticleText)

  TestUtils.runArticleAssertions(
    article = extracted,
    expectedStart = expectedLead,
    expectedImage = expectedLeadImage)
}

@Test
def desertNews() {
implicit val config = TestUtils.DEFAULT_CONFIG
Expand Down Expand Up @@ -228,28 +240,53 @@ class GoldSitesTestIT {
}

@Test
def cnbc() {
implicit val config = TestUtils.NO_IMAGE_CONFIG

val url: String = "http://www.cnbc.com/id/44613978"
def huffpoBusiness() {
implicit val config = TestUtils.DEFAULT_CONFIG
val url: String = "http://www.huffingtonpost.com/david-macaray/labor-union-membership_b_973038.html"
val article = TestUtils.getArticle(url)
TestUtils.runArticleAssertions(article = article,
expectedStart = "For men and women who plan on entering the job",
expectedImage = null)
TestUtils.printReport()
}

@Test
def huffpoBusiness2() {
implicit val config = TestUtils.DEFAULT_CONFIG
val url: String = "http://www.huffingtonpost.com/2011/09/21/us-sees-challenges-in-s_n_974724.html"
val article = TestUtils.getArticle(url)
TestUtils.runArticleAssertions(article = article,
expectedStart = "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.")
expectedStart = "WASHINGTON (Reuters) - The government is continuing an aggressive drive to hold accountable",
expectedImage = null)
TestUtils.printReport()
}


@Test
def cnbc2() {
implicit val config = TestUtils.DEFAULT_CONFIG
val url: String = "http://www.cnbc.com/id/44614459"
def cnbc() {
implicit val config = TestUtils.NO_IMAGE_CONFIG

val url: String = "http://www.cnbc.com/id/44613978"
val article = TestUtils.getArticle(url)

TestUtils.runArticleAssertions(article = article,
expectedStart = "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.",
expectedImage = "http://media.cnbc.com/i/CNBC/Sections/News_And_Analysis/__Story_Inserts/graphics/__FEDERAL_RESERVE/FED_RESERVE3.jpg")
expectedStart = "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.")
TestUtils.printReport()
}

// @Test
// def cnbc2() {
// // commented out until this issue is resolved: https://github.com/jhy/jsoup/issues/130
// implicit val config = TestUtils.DEFAULT_CONFIG
// val url: String = "http://www.cnbc.com/id/44614459"
// val article = TestUtils.getArticle(url)
// println(article.cleanedArticleText)
// TestUtils.runArticleAssertions(article = article,
// expectedStart = "Some traders found Wednesday's Fed statement to be a bit gloomier than expected.",
// expectedImage = "http://media.cnbc.com/i/CNBC/Sections/News_And_Analysis/__Story_Inserts/graphics/__FEDERAL_RESERVE/FED_RESERVE3.jpg")
// TestUtils.printReport()
// }

@Test
def yahooFinance() {
val url = "http://finance.yahoo.com/news/Mulling-Meg-Whitman-HP-apf-4116866737.html?x=0"
Expand Down

0 comments on commit b546c4d

Please sign in to comment.