forked from mikf/gallery-dl
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ao3] add initial support (mikf#6013)
- Loading branch information
Showing 6 changed files with 418 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
"8muses", | ||
"adultempire", | ||
"agnph", | ||
"ao3", | ||
"architizer", | ||
"artstation", | ||
"aryion", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
# Copyright 2024 Mike Fährmann | ||
# | ||
# This program is free software; you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License version 2 as | ||
# published by the Free Software Foundation. | ||
|
||
"""Extractors for https://archiveofourown.org/""" | ||
|
||
from .common import Extractor, Message | ||
from .. import text, util | ||
|
||
# URL prefix shared by the extractor patterns in this module: an optional
# scheme and an optional "www." followed by the site's domain.
# The dot before "org" is escaped so that it only matches a literal "."
# instead of any character.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?archiveofourown\.org"
|
||
|
||
class Ao3Extractor(Extractor):
    """Common base for the ao3 extractors

    items() queues every ID produced by works() for Ao3WorkExtractor,
    and works() paginates over the path stored in the first match group.
    """
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield one Queue message per work ID returned by works()"""
        work_root = self.root + "/works/"
        kwdict = {"_extractor": Ao3WorkExtractor}

        for work_id in self.works():
            yield Message.Queue, work_root + work_id, kwdict

    def works(self):
        """Return an iterator over the work IDs of this listing"""
        listing_path = self.groups[0]
        return self._pagination(listing_path)

    def _pagination(self, path, needle='<li id="work_'):
        """Yield the text between 'needle' and the next '"' on each page

        Starts at 'path' (relative to self.root) and keeps following the
        href of the '<a rel="next" ...>' link until a page has none.
        """
        url = self.root + path

        while True:
            page = self.request(url).text
            yield from text.extract_iter(page, needle, '"')

            next_path = text.extr(page, '<a rel="next" href="', '"')
            if not next_path:
                break
            # the href is taken from HTML, so unescape it before reuse
            url = self.root + text.unescape(next_path)
|
||
|
||
class Ao3WorkExtractor(Ao3Extractor):
    """Extractor for a single AO3 work and its download formats"""
    subcategory = "work"
    directory_fmt = ("{category}", "{author}")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}.{extension}"
    pattern = BASE_PATTERN + r"/works/(\d+)"
    example = "https://archiveofourown.org/works/12345"

    def _init(self):
        """Resolve the 'formats' option and set the 'view_adult' cookie"""
        formats = self.config("formats")

        if formats is None:
            # option not set: fall back to PDF only
            formats = ("pdf",)
        elif not formats:
            # any other false value: no formats at all
            formats = ()
        elif isinstance(formats, str):
            # comma-separated string, e.g. "PDF, epub" -> ["pdf", "epub"]
            formats = formats.lower().replace(" ", "").split(",")
        self.formats = formats

        self.cookies.set("view_adult", "true", domain="archiveofourown.org")

    def items(self):
        """Yield the work's metadata and one Url message per format"""
        work_id = self.groups[0]
        page = self.request(self.root + "/works/" + work_id).text
        extr = text.extract_from(page)

        def tag_list(begin):
            return text.split_html(extr(begin, "</dd>"))

        def number(begin):
            return text.parse_int(extr(begin, "<").replace(",", ""))

        # map the lowercased label of each download link to its href
        downloads = {}
        menu = extr(' class="download"', "</ul>")
        for link in text.extract_iter(menu, ' href="', "</"):
            path, _, label = link.rpartition('">')
            downloads[label.lower()] = path

        # NOTE(review): extract_from presumably scans forward through the
        # page, so these lookups are kept in their original order - confirm
        data = {
            "id"           : text.parse_int(work_id),
            "rating"       : tag_list('<dd class="rating tags">'),
            "warnings"     : tag_list('<dd class="warning tags">'),
            "categories"   : tag_list('<dd class="category tags">'),
            "fandom"       : tag_list('<dd class="fandom tags">'),
            "relationships": tag_list('<dd class="relationship tags">'),
            "characters"   : tag_list('<dd class="character tags">'),
            "tags"         : tag_list('<dd class="freeform tags">'),
            "lang"         : extr('<dd class="language" lang="', '"'),
            "series"       : extr('<dd class="series">', "</dd>"),
            "date"         : text.parse_datetime(
                extr('<dd class="published">', "<"), "%Y-%m-%d"),
            "words"        : number('<dd class="words">'),
            "chapters"     : text.parse_int(
                extr('<dd class="chapters">', "/")),
            "comments"     : number('<dd class="comments">'),
            "likes"        : number('<dd class="kudos">'),
            "bookmarks"    : text.parse_int(text.remove_html(
                extr('<dd class="bookmarks">', "</dd>")).replace(",", "")),
            "views"        : number('<dd class="hits">'),
            "title"        : text.unescape(
                extr(' class="title heading">', "<").strip()),
            "author"       : text.unescape(text.remove_html(
                extr(' class="byline heading">', "</h3>"))),
            "summary"      : text.split_html(
                extr(' class="heading">Summary:</h3>', "</div>")),
        }
        data["language"] = util.code_to_language(data["lang"])

        yield Message.Directory, data

        for fmt in self.formats:
            if fmt not in downloads:
                self.log.warning(
                    "%s: Format '%s' not available", work_id, fmt)
                continue
            url = text.urljoin(self.root, downloads[fmt])
            yield Message.Url, url, text.nameext_from_url(url, data)
|
||
|
||
class Ao3SeriesExtractor(Ao3Extractor):
    """Extractor for the works that make up an AO3 series"""
    subcategory = "series"
    # group 1: the "/series/<id>" path, group 2: the numeric series ID
    pattern = BASE_PATTERN + r"(/series/(\d+))"
    example = "https://archiveofourown.org/series/12345"
|
||
|
||
class Ao3TagExtractor(Ao3Extractor):
    """Extractor for the AO3 works listed under a tag"""
    subcategory = "tag"
    # group 1: the whole listing path including an optional query string,
    # group 2: the tag name
    pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"
    example = "https://archiveofourown.org/tags/TAG/works"
|
||
|
||
class Ao3SearchExtractor(Ao3Extractor):
    """Extractor for the works returned by an AO3 search"""
    subcategory = "search"
    # group 1: the search path together with its (required) query string
    pattern = BASE_PATTERN + r"(/works/search/?\?.+)"
    example = "https://archiveofourown.org/works/search?work_search[query]=air"
|
||
|
||
class Ao3UserExtractor(Ao3Extractor):
    """Extractor for an AO3 user profile

    Hands the profile over to the user-works, user-series, and
    user-bookmark extractors via _dispatch_extractors().
    """
    subcategory = "user"
    # group 1: the user name, optionally followed by "/pseuds/<pseud>"
    pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
               r"(?:/profile)?/?(?:$|\?|#)")
    example = "https://archiveofourown.org/users/USER"

    def initialize(self):
        """No-op override"""
        pass

    def items(self):
        """Dispatch to the extractors for this user's sub-pages"""
        user_url = "{}/users/{}/".format(self.root, self.groups[0])
        targets = (
            (Ao3UserWorksExtractor   , user_url + "works"),
            (Ao3UserSeriesExtractor  , user_url + "series"),
            (Ao3UserBookmarkExtractor, user_url + "bookmarks"),
        )
        defaults = ("user-works", "user-series")
        return self._dispatch_extractors(targets, defaults)
|
||
|
||
class Ao3UserWorksExtractor(Ao3Extractor):
    """Extractor for the works posted by an AO3 user"""
    subcategory = "user-works"
    # group 1: the whole listing path including an optional query string,
    # group 2: the user name, group 3: the pseud (if present)
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"works(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/works"
|
||
|
||
class Ao3UserSeriesExtractor(Ao3Extractor):
    """Extractor for series of an AO3 user"""
    subcategory = "user-series"
    # group 1: the whole listing path including an optional query string,
    # group 2: the user name, group 3: the pseud (if present)
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"series(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/series"

    def items(self):
        """Yield one Queue message per series ID returned by series()"""
        base = self.root + "/series/"
        data = {"_extractor": Ao3SeriesExtractor}

        for series_id in self.series():
            yield Message.Queue, base + series_id, data

    def series(self):
        """Return an iterator over the series IDs of this listing"""
        # Only the listing path (first group) is needed. The pattern has
        # three capture groups, so self.groups (presumably the match's
        # groups) must not be unpacked into four names - that raises
        # ValueError before any request is made.
        return self._pagination(self.groups[0], '<li id="series_')
|
||
|
||
class Ao3UserBookmarkExtractor(Ao3Extractor):
    """Extractor for the works bookmarked by an AO3 user"""
    subcategory = "user-bookmark"
    # group 1: the whole listing path including an optional query string,
    # group 2: the user name, group 3: the pseud (if present)
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"bookmarks(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/bookmarks"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.