Naver added, Duckduckgo improved (#8)
* Naver added, Duckduckgo improved
* Added logo
soxoj authored Dec 9, 2021
1 parent 6374cfa commit 983173a
Showing 4 changed files with 59 additions and 7 deletions.
18 changes: 15 additions & 3 deletions README.md
@@ -1,5 +1,12 @@
# Marple

<p align="center">
<img src="https://raw.githubusercontent.com/soxoj/marple/main/example.png" height="200"/>
</p>


## Summary

Collect links to profiles by username through 10+ search engines ([see the full list below](#supported-sources)).
@@ -52,6 +59,11 @@ All you need is Python3. And pip. And requirements, of course.
pip3 install -r requirements.txt
```

Some search engines require API keys (see the requirements in [Supported sources](#supported-sources)). Export each key as an environment variable:
```
export YANDEX_KEY=key
```
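A parser can then pick the key up at runtime, as the new `NaverParser` in this commit does with `os.getenv('SERPAPI_KEY')`. A minimal sketch with an early failure check (the `get_api_key` helper and its error message are illustrative, not part of the project):

```python
import os

def get_api_key(name):
    # Read a search-engine API key that was exported to the environment.
    # Failing fast here gives a clearer error than a rejected API request later.
    key = os.getenv(name)
    if not key:
        raise RuntimeError(f"Missing API key: run `export {name}=<key>` first")
    return key

os.environ["YANDEX_KEY"] = "key"   # simulates `export YANDEX_KEY=key`
print(get_api_key("YANDEX_KEY"))   # → key
```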

## Options

You can set a 'junk threshold' with the option `-t` or `--threshold` (default 300) to control how strictly unreliable results are filtered out.
@@ -83,16 +95,16 @@ Other options:
| ------------------- | --------------------------------------| ----------------- |
| [Google](http://google.com/) | scraping | None, works out of the box; frequent captcha |
| [DuckDuckGo](https://duckduckgo.com/) | scraping | None, works out of the box |
| [Yandex](https://yandex.ru/) | XML API | [Register and get YANDEX_USER/YANDEX_KEY tokens](https://github.com/fluquid/yandex-search) |
| [Naver](https://www.naver.com/) | SerpApi | [Register and get SERPAPI_KEY token](https://serpapi.com/) |
| [Aol](https://search.aol.com/) | scraping | None, scrapes with pagination |
| [Ask](https://www.ask.com/) | scraping | None, scrapes with pagination |
| [Bing](https://www.bing.com/) | scraping | None, scrapes with pagination |
| [Startpage](https://www.startpage.com/) | scraping | None, scrapes with pagination |
| [Yahoo](https://yahoo.com/) | scraping | None, scrapes with pagination |
| [Mojeek](https://www.mojeek.com) | scraping | None, scrapes with pagination |
| [Dogpile](https://www.dogpile.com/) | scraping | None, scrapes with pagination |
| [Torch](http://torchdeedp3i2jigzjdmfpn5ttjhthh5wbmda2rr3jvqjg5p77c54dqd.onion) | scraping | Tor proxies (socks5://localhost:9050 by default), scrapes with pagination |
| [Qwant](https://www.qwant.com/) | Qwant API | Check [if search is available](https://www.qwant.com/) in your exit IP country, scrapes with pagination |


## Development & testing
Binary file added example.png
45 changes: 42 additions & 3 deletions marple.py
@@ -18,7 +18,8 @@

import yandex_search
from PyPDF2 import PdfFileReader
from search_engines import Aol, Ask, Qwant, Bing, Yahoo, Startpage, Dogpile, Mojeek, Torch, Duckduckgo
from serpapi import GoogleSearch as SerpGoogle

username_marks_symbols = '/.~=?& -'

@@ -49,7 +50,10 @@ def __init__(self, url, title, username, source=''):
        self.normalize()

    def __eq__(self, other):
        def normalize(url):
            return url.replace('https://', 'http://')

        return normalize(self.url) == normalize(other.url)

    def __hash__(self):
        return hash(self.url)
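The improved `__eq__` makes two links that differ only in URL scheme compare equal. A standalone illustration of that equality rule (a trimmed `Link`, not the project's full class; note that `__hash__` still uses the raw URL, so only `==`-based deduplication merges the scheme variants):

```python
class Link:
    # Trimmed sketch of the commit's equality logic.
    def __init__(self, url):
        self.url = url

    def __eq__(self, other):
        def normalize(url):
            return url.replace('https://', 'http://')
        return normalize(self.url) == normalize(other.url)

    def __hash__(self):
        return hash(self.url)

a = Link('https://example.com/user')
b = Link('http://example.com/user')
print(a == b)  # → True: the URLs differ only in scheme

# __hash__ ignores the normalization, so a set would usually keep both;
# deduplication that relies on == (e.g. `in` over a list) treats them as one.
unique = []
for link in (a, b):
    if link not in unique:
        unique.append(link)
print(len(unique))  # → 1
```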
@@ -205,6 +209,7 @@ async def parse(self, html, username):
        return results


# old unused parser
class DuckParser(Parser):
    name = 'DuckDuckGo scraping'

@@ -317,6 +322,39 @@ class TorchParser(PaginatedParser):
    base_class = Torch


class DuckduckgoParser(PaginatedParser):
    name = 'Duckduckgo scraping with pagination'
    base_class = Duckduckgo


class NaverParser:
    name = 'Naver parser (SerpApi)'

    """
    You should have an env variable with the key, e.g.
    export SERPAPI_KEY=key
    """
    async def run(self, storage, username, count=100, lang='en', proxy=None):
        params = {
            "engine": "naver",
            "query": username,
            "where": "web",
            "api_key": os.getenv('SERPAPI_KEY')
        }

        try:
            search = SerpGoogle(params)
            results = search.get_dict()
            organic_results = results['organic_results']
        except Exception as e:
            return (self.name, str(e))

        tuples_list = [Link(r["link"], r["title"], username, source='Naver') for r in organic_results]

        storage += tuples_list
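The SerpApi client returns a plain dict, and `NaverParser` only consumes the `organic_results` list. The shape can be sketched offline with a stubbed response (the sample payload below is hypothetical; real responses carry many more fields):

```python
# Hypothetical SerpApi-style payload, reduced to the fields NaverParser reads.
sample_response = {
    "organic_results": [
        {"link": "https://blog.naver.com/soxoj", "title": "soxoj's blog"},
        {"link": "https://cafe.naver.com/soxoj", "title": "soxoj on Naver Cafe"},
    ]
}

def extract_links(results):
    # Mirrors the list comprehension in NaverParser.run:
    # keep (url, title) pairs from organic_results, tolerating an empty payload.
    return [(r["link"], r["title"]) for r in results.get("organic_results", [])]

links = extract_links(sample_response)
print(len(links))  # → 2
```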


class MarpleResult:
    def __init__(self, results, links, errors, warnings):
        self.all_links = results
@@ -328,7 +366,6 @@ def __init__(self, results, links, errors, warnings):
async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=None):
    parsers = [
        GoogleParser(),
        YandexParser(),
        AolParser(),
        QwantParser(),
@@ -338,6 +375,8 @@ async def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=
        BingParser(),
        DogpileParser(),
        TorchParser(),
        DuckduckgoParser(),
        NaverParser(),
    ]

    results = []
3 changes: 2 additions & 1 deletion requirements.txt
@@ -8,4 +8,5 @@ socid_extractor
maigret
aiohttp_socks
search_engines @ https://github.com/soxoj/Search-Engines-Scraper/archive/refs/heads/master.zip
tqdm
google-search-results
