
Commit

Crawling only unique urls
karlos-perez committed Mar 27, 2017
1 parent 5294576 commit 6d0fb25
Showing 1 changed file with 13 additions and 9 deletions.
22 changes: 13 additions & 9 deletions async_crawler.py
@@ -12,7 +12,8 @@
 import asyncio
 from urllib.parse import urljoin, urldefrag
 
-root_url = "http://python.org"
+
+root_url = "http://python.org/"
 crawled_urls, url_hub = [], [root_url, "%s/sitemap.xml" % (root_url), "%s/robots.txt" % (root_url)]

async def get_body(url):
@@ -22,13 +23,12 @@ async def get_body(url):
 async def handle_task(task_id, work_queue):
     while not work_queue.empty():
         queue_url = await work_queue.get()
-        crawled_urls.append(queue_url)
-        body = await get_body(queue_url)
-        for new_url in get_urls(body):
-            if root_url in new_url and not new_url in crawled_urls:
-                q.put_nowait(new_url)
-        print(queue_url)
-        #await asyncio.sleep(5)
+        if not queue_url in crawled_urls:
+            crawled_urls.append(queue_url)
+            body = await get_body(queue_url)
+            for new_url in get_urls(body):
+                if root_url in new_url and not new_url in crawled_urls:
+                    work_queue.put_nowait(new_url)
 
 def remove_fragment(url):
     pure_url, frag = urldefrag(url)
@@ -44,4 +44,8 @@ def get_urls(html):
 loop = asyncio.get_event_loop()
 tasks = [handle_task(task_id, q) for task_id in range(3)]
 loop.run_until_complete(asyncio.wait(tasks))
-loop.close()
+loop.close()
+for u in crawled_urls:
+    print(u)
+print('-'*30)
+print(len(crawled_urls))
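
For context, here is a minimal, self-contained sketch of what the crawler looks like after this commit, with the duplicate check moved into handle_task. The bodies of get_body() and get_urls() and the queue seeding are collapsed in the diff above, so the aiohttp fetch, the regex-based link extractor, and the seed URLs below are assumptions; the visited collection is also swapped from the commit's list to a set purely for O(1) membership checks. The control flow of handle_task mirrors the new version in the diff.

import asyncio
import re
from urllib.parse import urljoin, urldefrag

import aiohttp  # assumed HTTP client; the real get_body() body is not shown in the diff

root_url = "http://python.org/"
crawled_urls = set()  # set instead of the commit's list, for faster membership tests

HREF_RE = re.compile(r'href="([^"]+)"')

def remove_fragment(url):
    pure_url, _frag = urldefrag(url)
    return pure_url

def get_urls(html, base):
    # Hypothetical extractor: resolve relative hrefs against the page URL, drop fragments.
    return [remove_fragment(urljoin(base, href)) for href in HREF_RE.findall(html)]

async def get_body(url):
    # Assumed implementation: fetch the page body with aiohttp.
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()

async def handle_task(task_id, work_queue):
    while not work_queue.empty():
        queue_url = await work_queue.get()
        if queue_url not in crawled_urls:  # the guard added by this commit
            crawled_urls.add(queue_url)
            body = await get_body(queue_url)
            for new_url in get_urls(body, queue_url):
                if root_url in new_url and new_url not in crawled_urls:
                    work_queue.put_nowait(new_url)

if __name__ == "__main__":
    q = asyncio.Queue()
    for seed in (root_url, root_url + "sitemap.xml", root_url + "robots.txt"):
        q.put_nowait(seed)
    loop = asyncio.get_event_loop()
    tasks = [loop.create_task(handle_task(task_id, q)) for task_id in range(3)]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    for u in crawled_urls:
        print(u)
    print('-' * 30)
    print(len(crawled_urls))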
