Skip to content

Commit

Permalink
add queue
Browse files Browse the repository at this point in the history
  • Loading branch information
mehmet kose committed Mar 22, 2016
1 parent fcdb3de commit 2774a15
Showing 1 changed file with 20 additions and 15 deletions.
35 changes: 20 additions & 15 deletions async_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,35 @@
from urllib.parse import urljoin, urldefrag

# Site the crawl is confined to; extracted links are resolved against it.
root_url = "http://python.org"

# URLs that have already been picked up for crawling.
crawled_urls = []

# Seed URLs the crawl starts from.
url_hub = [
    root_url,
    "%s/sitemap.xml" % root_url,
    "%s/robots.txt" % root_url,
]

async def get_body(url):
    """Issue an HTTP GET for *url* and return what ``response.read()`` yields."""
    resp = await aiohttp.request('GET', url)
    payload = await resp.read()
    return payload

async def handle_task(task_id, work_queue):
    """Worker coroutine: drain *work_queue*, crawling each URL it yields.

    For every URL taken from the queue the page body is fetched with
    ``get_body``, its links are extracted with ``get_urls``, and every link
    that contains ``root_url`` and is not yet in ``crawled_urls`` is put
    back on the queue. Each crawled URL is printed.

    Args:
        task_id: Identifier of this worker; currently unused.
        work_queue: Queue of URLs to crawl (an ``asyncio.Queue`` in the
            script entry point), shared between the workers.

    Note:
        The worker returns as soon as it observes an empty queue, even if
        another worker is still fetching a page that may add more URLs.
    """
    while not work_queue.empty():
        queue_url = await work_queue.get()
        # A link can be queued several times before its first crawl, because
        # only ``crawled_urls`` is consulted at enqueue time. Skip repeats
        # here so each page is fetched once.
        if queue_url in crawled_urls:
            continue
        crawled_urls.append(queue_url)
        body = await get_body(queue_url)
        for new_url in get_urls(body):
            if root_url in new_url and new_url not in crawled_urls:
                # Enqueue on the queue we were given rather than relying on
                # a module-level global of the same purpose.
                work_queue.put_nowait(new_url)
        print(queue_url)

def remove_fragment(url):
    """Return *url* with any trailing ``#fragment`` part stripped."""
    return urldefrag(url)[0]

def get_links(html):
new_urls = [link.split('"')[0] for link in str(html).replace("'",'"').split('href="')[1:]]
def get_urls(html):
    """Extract the ``href`` targets found in *html* as absolute URLs.

    The markup is scanned textually: everything between ``href="`` (or
    ``href='``) and the next quote is taken as a link. Fragments are
    stripped with ``remove_fragment`` and the result is resolved against
    ``root_url`` with ``urljoin``.

    Args:
        html: Page content as ``bytes``/``bytearray`` or ``str``. Any other
            object is converted with ``str()``.

    Returns:
        A list of absolute URLs in document order (duplicates are kept).
    """
    if isinstance(html, (bytes, bytearray)):
        # Decode instead of calling str() on bytes: str(b"...") produces the
        # repr, whose escaped quotes and \x.. sequences corrupt the links.
        text = html.decode("utf-8", errors="replace")
    else:
        text = str(html)
    # Normalise single quotes so one split pattern covers both quoting styles.
    chunks = text.replace("'", '"').split('href="')[1:]
    targets = [chunk.split('"')[0] for chunk in chunks]
    return [urljoin(root_url, remove_fragment(target)) for target in targets]

async def main():
    """Crawl sequentially, consuming ``url_hub`` until it is empty.

    Each URL is taken from the front of ``url_hub``, fetched with
    ``get_body``, and recorded in ``crawled_urls``. Links extracted with
    ``get_links`` that contain ``root_url`` and are neither crawled nor
    already pending are appended to ``url_hub``. A progress line is printed
    after every page.
    """
    # Pop from the list instead of removing entries while iterating over it;
    # removing during iteration makes the loop skip pending URLs.
    while url_hub:
        to_crawl = url_hub.pop(0)
        if to_crawl in crawled_urls:
            continue
        raw_html = await get_body(to_crawl)
        # Record the page before scanning it so a self-link is not re-queued.
        crawled_urls.append(to_crawl)
        for link in get_links(raw_html):
            if root_url in link and link not in crawled_urls and link not in url_hub:
                url_hub.append(link)
        print("url hub: %s | crawled: %s |url : %s" % (len(url_hub), len(crawled_urls), to_crawl))

if __name__ == "__main__":
    # Seed the shared work queue with the starting URLs.
    q = asyncio.Queue()
    for url in url_hub:
        q.put_nowait(url)

    loop = asyncio.get_event_loop()
    try:
        # Wrap each worker coroutine in a Task: asyncio.wait() rejects bare
        # coroutines on Python 3.11+, while Task objects work everywhere.
        tasks = [loop.create_task(handle_task(task_id, q)) for task_id in range(3)]
        loop.run_until_complete(asyncio.wait(tasks))
    finally:
        # Close the loop even if running the workers raised.
        loop.close()

0 comments on commit 2774a15

Please sign in to comment.