diff --git a/docs/configuration.rst b/docs/configuration.rst
index 12232dd8dc..2a9029edbb 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -2745,6 +2745,16 @@ Description
     Also search Plurk comments for URLs.
 
 
+extractor.[postmill].save-link-post-body
+----------------------------------------
+Type
+    ``bool``
+Default
+    ``false``
+Description
+    Whether or not to save the body for link/image posts.
+
+
 extractor.reactor.gif
 ---------------------
 Type
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index df61f1223a..9ee1ef14ed 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -1316,6 +1316,16 @@ Consider all listed sites to potentially be NSFW.
+<tr>
+    <td colspan="4"><strong>Postmill Instances</strong></td>
+</tr>
+<tr>
+    <td>Raddle</td>
+    <td>https://raddle.me/</td>
+    <td>Forums, Home Feed, Individual Posts, Search Results, Tag Searches, User Profiles</td>
+    <td></td>
+</tr>
+
 <tr>
     <td colspan="4"><strong>Reactor Instances</strong></td>
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index d074de22eb..695b8b2a26 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -124,6 +124,7 @@
     "poipiku",
     "pornhub",
     "pornpics",
+    "postmill",
     "pururin",
     "reactor",
     "readcomiconline",
diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py
new file mode 100644
index 0000000000..29b351ba66
--- /dev/null
+++ b/gallery_dl/extractor/postmill.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Postmill instances"""
+
+import re
+from .common import BaseExtractor, Message
+from .. import text, exception
+
+
+class PostmillExtractor(BaseExtractor):
+    """Base class for Postmill extractors"""
+    basecategory = "postmill"
+    directory_fmt = ("{category}", "{instance}", "{forum}")
+    filename_fmt = "{id}_{title[:220]}.{extension}"
+    archive_fmt = "{filename}"
+
+    def _init(self):
+        self.instance = self.root.partition("://")[2]
+        self.save_link_post_body = self.config("save-link-post-body", False)
+        self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
+        self._search_image_tag = re.compile(
+            r'').search
+
+    def items(self):
+        for post_url in self.post_urls():
+            page = self.request(post_url).text
+            extr = text.extract_from(page)
+
+            title = text.unescape(extr(
+                '', ''))
+            date = text.parse_datetime(extr(
+                '', ''))
+            username = extr(
+                '', '')
+            post_canonical_url = text.unescape(extr(
+                '', ''))
+
+            url = text.unescape(extr(
+                '', ''))
+            body = extr(
+                '', '')
+
+            match = self._search_canonical_url(post_canonical_url)
+            forum = match.group(1)
+            id = int(match.group(2))
+
+            is_text_post = url.startswith("/")
+            is_image_post = self._search_image_tag(page) is not None
+            data = {
+                "title": title,
+                "date": date,
+                "username": username,
+                "forum": forum,
+                "id": id,
+                "flair": [text.unescape(i) for i in text.extract_iter(
+                    page, '', '')],
+                "instance": self.instance,
+            }
+
+            urls = []
+            if is_text_post or self.save_link_post_body:
+                urls.append((Message.Url, "text:" + body))
+
+            if is_image_post:
+                urls.append((Message.Url, url))
+            elif not is_text_post:
+                urls.append((Message.Queue, url))
+
+            data["count"] = len(urls)
+            yield Message.Directory, data
+            for data["num"], (msg, url) in enumerate(urls, 1):
+                if url.startswith("text:"):
+                    data["filename"], data["extension"] = "", "htm"
+                else:
+                    data = text.nameext_from_url(url, data)
+
+                yield msg, url, data
+
+
+class PostmillSubmissionsExtractor(PostmillExtractor):
+    """Base class for Postmill submissions extractors"""
+    whitelisted_parameters = ()
+
+    def __init__(self, match):
+        PostmillExtractor.__init__(self, match)
+        groups = match.groups()
+        self.base = groups[-3]
+        self.sorting_path = groups[-2] or ""
+        self.query = {key: value for key, value in text.parse_query(
+            groups[-1]).items() if self.acceptable_query(key)}
+
+    def items(self):
+        url = self.root + self.base + self.sorting_path
+
+        while url:
+            response = self.request(url, params=self.query)
+            if response.history:
+                redirect_url = response.url
+                if redirect_url == self.root + "/login":
+                    raise exception.StopExtraction(
+                        "HTTP redirect to login page (%s)", redirect_url)
+            page = response.text
+
+            for nav in text.extract_iter(page,
+                    '', ''):
+                post_url = text.unescape(text.extr(nav, '
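
Usage note: once this patch is applied, the option documented in the
configuration.rst hunk above can be enabled through gallery-dl's JSON
configuration file. A minimal sketch, under assumptions: only the
"save-link-post-body" key comes from this change; the surrounding
"extractor" layout is the standard gallery-dl config structure, and the
default config path ~/.config/gallery-dl/config.json is assumed.

    {
        "extractor": {
            "postmill": {
                "save-link-post-body": true
            }
        }
    }

With the option enabled, link and image posts additionally produce a
"text:" entry carrying the post body, alongside the linked file or
queued URL.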