Merge branch 'dirbot'
rmax committed Oct 24, 2013
2 parents b65c4bb + 0d875f1 commit 997bfe9
Showing 9 changed files with 125 additions and 0 deletions.
49 changes: 49 additions & 0 deletions README.rst
@@ -0,0 +1,49 @@
======
dirbot
======

This is a Scrapy project to scrape websites from public web directories.

This project is only meant for educational purposes.

Items
=====

The items scraped by this project are websites. The item is defined in the
following class::

    dirbot.items.Website

See the source code for more details.
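
As a minimal sketch (not part of the project; the values are placeholders for
illustration), a ``Website`` item behaves like a dict, so fields are assigned
by key::

    from dirbot.items import Website

    item = Website()
    item['name'] = 'Example site'
    item['url'] = 'http://example.com/'
    item['description'] = 'A short description'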

Spiders
=======

This project contains one spider called ``dmoz`` that you can see by running::

    scrapy list

Spider: dmoz
------------

The ``dmoz`` spider scrapes the Open Directory Project (dmoz.org), and it's
based on the dmoz spider described in the `Scrapy tutorial`_.

This spider doesn't crawl the entire dmoz.org site but only a few pages by
default (defined in the ``start_urls`` attribute). These pages are:

* http://www.dmoz.org/Computers/Programming/Languages/Python/Books/
* http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/

So, if you run the spider with ``scrapy crawl dmoz``, it will scrape only
those two pages.

.. _Scrapy tutorial: http://doc.scrapy.org/intro/tutorial.html
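
If you want to persist the scraped items, one option (shown as a sketch, not
part of this commit; the output file name is arbitrary) is to use a feed
export when running the crawl::

    scrapy crawl dmoz -o scraped_items.json -t json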

Pipelines
=========

This project uses a pipeline to filter out websites containing certain
forbidden words in their description. This pipeline is defined in the class::

    dirbot.pipelines.FilterWordsPipeline
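
As a rough sketch of how the pipeline behaves (not part of the project;
``spider=None`` is only acceptable here because the pipeline never uses the
spider argument), ``process_item`` either returns the item or raises
``DropItem``::

    from scrapy.exceptions import DropItem

    from dirbot.items import Website
    from dirbot.pipelines import FilterWordsPipeline

    pipeline = FilterWordsPipeline()
    item = Website(name=['Example'], url=['http://example.com/'],
                   description=['All about politics'])
    try:
        pipeline.process_item(item, spider=None)
    except DropItem as e:
        print e  # prints: Contains forbidden word: politics
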
Empty file added dirbot/__init__.py
Empty file.
8 changes: 8 additions & 0 deletions dirbot/items.py
@@ -0,0 +1,8 @@
from scrapy.item import Item, Field


class Website(Item):
    """A single website entry scraped from a directory page."""

    name = Field()
    description = Field()
    url = Field()
16 changes: 16 additions & 0 deletions dirbot/pipelines.py
@@ -0,0 +1,16 @@
from scrapy.exceptions import DropItem


class FilterWordsPipeline(object):
    """A pipeline for filtering out items which contain certain words in
    their description."""

    # put all words in lowercase
    words_to_filter = ['politics', 'religion']

    def process_item(self, item, spider):
        # Drop the item as soon as a forbidden word appears in its
        # description; otherwise pass it through unchanged.
        description = unicode(item['description']).lower()
        for word in self.words_to_filter:
            if word in description:
                raise DropItem("Contains forbidden word: %s" % word)
        return item
7 changes: 7 additions & 0 deletions dirbot/settings.py
@@ -0,0 +1,7 @@
# Scrapy settings for dirbot project

SPIDER_MODULES = ['dirbot.spiders']
NEWSPIDER_MODULE = 'dirbot.spiders'
DEFAULT_ITEM_CLASS = 'dirbot.items.Website'

ITEM_PIPELINES = ['dirbot.pipelines.FilterWordsPipeline']
1 change: 1 addition & 0 deletions dirbot/spiders/__init__.py
@@ -0,0 +1 @@
# Place all your Scrapy spiders here
34 changes: 34 additions & 0 deletions dirbot/spiders/dmoz.py
@@ -0,0 +1,34 @@
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

from dirbot.items import Website


class DmozSpider(BaseSpider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """
        The lines below define a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[@class="directory-url"]/li')
        items = []

        # Each <li> holds one listing: an <a> with the name and URL, plus a
        # trailing text node containing the description.
        for site in sites:
            item = Website()
            item['name'] = site.select('a/text()').extract()
            item['url'] = site.select('a/@href').extract()
            item['description'] = site.select('text()').re(r'-\s([^\n]*?)\n')
            items.append(item)

        return items
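
Since the docstring above declares a spider contract, the spider can be
verified against it from the command line (a usage note, not part of the
commit)::

    scrapy check dmoz
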
2 changes: 2 additions & 0 deletions scrapy.cfg
@@ -0,0 +1,2 @@
[settings]
default = dirbot.settings
8 changes: 8 additions & 0 deletions setup.py
@@ -0,0 +1,8 @@
from setuptools import setup, find_packages

setup(
    name='dirbot',
    version='1.0',
    packages=find_packages(),
    entry_points={'scrapy': ['settings = dirbot.settings']},
)
