Skip to content

Commit

Permalink
CSV export, Yahoo
Browse files Browse the repository at this point in the history
  • Loading branch information
soxoj committed Dec 7, 2021
1 parent 23e7551 commit 60df5b7
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Collect links to profiles by username through search engines (see the full list
Features:
- multiple engines
- proxy support
- CSV file export
- plugins
- pdf metadata extraction
- social media info [extraction](socid_extractor)
Expand Down Expand Up @@ -73,6 +74,7 @@ Other options:
-d, --debug Display all the results from sources and debug messages
-l, --list Display only list of all the URLs
--proxy PROXY Proxy string (e.g. https://user:[email protected]:8080)
--csv CSV Save results to the CSV file
```

## Supported sources
Expand All @@ -82,6 +84,7 @@ Other options:
| Google | scraping | None, works out of the box; frequent captcha |
| DuckDuckGo | scraping | None, works out of the box |
| Yandex | XML API | [Register and get USER/API tokens](https://github.com/fluquid/yandex-search) |
| Yahoo | scraping | only one page for now |

## Development & testing

Expand Down
64 changes: 61 additions & 3 deletions marple.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#!/usr/bin/env python3
import asyncio
import csv
import json
import re
import os
from typing import List
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import urllib.parse

import aiohttp
import requests
Expand Down Expand Up @@ -218,6 +220,35 @@ async def parse(self, html, username):
return results


class YahooParser(Parser):
    """Collect result links from a Yahoo search results page (HTML scraping)."""

    name = 'Yahoo scraping'

    def make_url(self, username, count, lang):
        """Return the Yahoo search URL for ``username``.

        ``count`` and ``lang`` are accepted but not used by this parser:
        the URL always requests a single, JavaScript-free results page.
        """
        return 'https://search.yahoo.com/search?p={}&ei=UTF-8&nojs=1'.format(username)

    async def parse(self, html, username):
        """Extract ``Link`` objects from the HTML of a Yahoo results page.

        Each result lives in a ``div.compTitle`` holding an ``h3.title`` with
        an anchor. Results that lack a header, an anchor, an href or a title
        are skipped instead of aborting the whole page.

        :param html: raw HTML of the results page.
        :param username: username the search was made for; stored on each Link.
        :return: list of ``Link`` objects, in page order.
        """
        results = []

        soup = bs(html, 'html.parser')

        for block in soup.find_all('div', class_='compTitle'):
            header = block.find('h3', class_='title')
            anchor = header.find('a') if header is not None else None
            if anchor is None:
                continue

            href = anchor.get('href')
            if not href:
                continue

            # The header's <span> text (a display form of the URL) is normally
            # the leading part of the anchor text; drop it to keep the title.
            # Strip only when it really is a prefix, and tolerate a missing
            # span, so an unusual layout cannot truncate or crash.
            anchor_text = anchor.text
            span = header.find('span')
            prefix = span.text if span is not None else ''
            if prefix and anchor_text.startswith(prefix):
                title = anchor_text[len(prefix):]
            else:
                title = anchor_text

            # Yahoo wraps targets in a redirect URL of the form
            # ".../RU=<percent-encoded target>/RK=<n>/RS=...". Take the RU
            # part up to any "/RK=" segment (not only "/RK=2") and decode it.
            # An href without the wrapper is treated as a direct link.
            _, marker, tail = href.partition('/RU=')
            if marker:
                link = urllib.parse.unquote(tail.partition('/RK=')[0])
            else:
                link = href

            if link and title:
                results.append(Link(link, title, username))

        return results


class MarpleResult:
def __init__(self, results, links, errors, warnings):
self.all_links = results
Expand All @@ -231,6 +262,7 @@ def marple(username, max_count, url_filter_enabled, is_debug=False, proxy=None):
GoogleParser(),
DuckParser(),
YandexParser(),
YahooParser(),
]

results = []
Expand Down Expand Up @@ -335,6 +367,12 @@ def main():
default="",
help="Proxy string (e.g. https://user:[email protected]:8080)",
)
parser.add_argument(
'--csv',
type=str,
default="",
help="Save results to the CSV file",
)
args = parser.parse_args()

username = args.username
Expand Down Expand Up @@ -378,9 +416,12 @@ def main():

displayed_count = 0

def is_likely_profile(r):
    """Tell whether link ``r`` qualifies for the reliable-links section.

    A link qualifies when it looks like a profile of the username, its junk
    score does not exceed ``args.threshold``, and it is not marked filtered.
    """
    # Checks run in the same order, with the same short-circuiting, as a
    # single chained ``and`` expression would.
    looks_like_profile = r.is_it_likely_username_profile()
    if not looks_like_profile:
        return looks_like_profile

    within_threshold = r.junk_score <= args.threshold
    if not within_threshold:
        return within_threshold

    return not r.filtered

# reliable links section
for r in result.unique_links:
if r.is_it_likely_username_profile() and r.junk_score <= args.threshold and not r.filtered:
if is_likely_profile(r):
displayed_count += 1

message = r.url
Expand All @@ -404,9 +445,12 @@ def main():

pdf_count = 0

def is_pdf_file(url):
    """Return True when ``url`` ends with 'pdf' or contains '-pdf.'."""
    if url.endswith('pdf'):
        return True
    return '-pdf.' in url

# pdf links section
for r in result.unique_links:
if r.url.endswith('pdf') or '-pdf.' in r.url:
if is_pdf_file(r.url):
if pdf_count == 0:
print(colored('PDF files', 'cyan'))

Expand Down Expand Up @@ -439,7 +483,6 @@ def main():

print()


# show status
status_msg = f'Links: total collected {total_collected_count} / unique with username in URL {uniq_count} / reliable {displayed_count} / documents {pdf_count}'

Expand All @@ -452,6 +495,21 @@ def main():

print(f"{colored(status_msg, 'cyan')}\n{colored(error_msg, 'yellow')}")

if args.csv:
with open(args.csv, 'w', newline='') as csvfile:
writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
writer.writerow(['URL', 'Title', 'Score', 'Is profile page', 'Is PDF'])

def write_links(condition):
    """Write one CSV row for every unique link accepted by ``condition``.

    Each row holds the URL, title, junk score, and the profile/PDF flags,
    in the order links appear in ``result.unique_links``.
    """
    writer.writerows(
        [link.url, link.title, link.junk_score, is_likely_profile(link), is_pdf_file(link.url)]
        for link in result.unique_links
        if condition(link)
    )

write_links(lambda x: is_likely_profile(x))
write_links(lambda x: not is_likely_profile(x))

print(colored(f'Results was saved to CSV file {args.csv}', 'red'))

if __name__ == '__main__':
main()

0 comments on commit 60df5b7

Please sign in to comment.