From b78c35fd15f80b6d6d1a6a91811307b6406064e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 22 Nov 2024 21:06:32 +0100
Subject: [PATCH] [motherless] add 'media' and 'gallery' extractors (#2074, #4413, #6221)

---
NOTE(review): this patch was recovered from a copy whose line breaks and
HTML-like tokens had been stripped. The table markup in supportedsites.md and
the tag literals in motherless.py ("</div>", "<h1>", '<h1 class="content-title">',
"</h1>", '<link rel="next" href="', "<title>", "</", '<span class="active">'),
together with the lines between the last four, were re-inserted so that each
hunk matches its declared line count. Confirm against commit b78c35fd before
applying.

 docs/supportedsites.md             |   6 ++
 gallery_dl/extractor/motherless.py | 167 +++++++++++++++++++++++++++++
 test/results/motherless.py         | 127 ++++++++++++++++++++++
 3 files changed, 300 insertions(+)
 create mode 100644 gallery_dl/extractor/motherless.py
 create mode 100644 test/results/motherless.py

diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 14f62e0828..1c59a0ff0d 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -607,6 +607,12 @@ Consider all listed sites to potentially be NSFW.
     <td>Albums, Channels</td>
     <td>Supported</td>
 </tr>
+<tr>
+    <td>Motherless</td>
+    <td>https://motherless.com/</td>
+    <td>Galleries, Media Files</td>
+    <td></td>
+</tr>
 <tr>
     <td>My Hentai Gallery</td>
     <td>https://myhentaigallery.com/</td>
diff --git a/gallery_dl/extractor/motherless.py b/gallery_dl/extractor/motherless.py
new file mode 100644
index 0000000000..c5b9322c0a
--- /dev/null
+++ b/gallery_dl/extractor/motherless.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://motherless.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+from ..cache import memcache
+from datetime import timedelta
+
+BASE_PATTERN = r"(?:https?://)?motherless\.com"
+
+
+class MotherlessExtractor(Extractor):
+    """Base class for motherless extractors"""
+    category = "motherless"
+    root = "https://motherless.com"
+    filename_fmt = "{id} {title}.{extension}"
+    archive_fmt = "{id}"
+
+
+class MotherlessMediaExtractor(MotherlessExtractor):
+    """Extractor for a single image/video from motherless.com"""
+    subcategory = "media"
+    pattern = (BASE_PATTERN +
+               r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?"
+               r"(?!G)[A-Z0-9]+)")
+    example = "https://motherless.com/ABC123"
+
+    def items(self):
+        file = self._extract_media(self.groups[0])
+        url = file["url"]
+        yield Message.Directory, file
+        yield Message.Url, url, text.nameext_from_url(url, file)
+
+    def _extract_media(self, path):
+        url = self.root + "/" + path
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        path, _, media_id = path.rpartition("/")
+        data = {
+            "id"   : media_id,
+            "type" : extr("__mediatype = '", "'"),
+            "group": extr("__group = '", "'"),
+            "url"  : extr("__fileurl = '", "'"),
+            "tags" : [
+                text.unescape(tag)
+                for tag in text.extract_iter(
+                    extr('class="media-meta-tags">', "</div>"), ">#", "<")
+            ],
+            "title": text.unescape(extr("<h1>", "<")),
+            "views": text.parse_int(extr(
+                'class="count">', " ").replace(",", "")),
+            "favorites": text.parse_int(extr(
+                'class="count">', " ").replace(",", "")),
+            "date" : self._parse_datetime(extr('class="count">', "<")),
+            "uploader": text.unescape(extr('class="username">', "<").strip()),
+        }
+
+        if path and path[0] == "G":
+            data["gallery_id"] = path[1:]
+            data["gallery_title"] = self._extract_gallery_title(
+                page, data["gallery_id"])
+
+        return data
+
+    def _parse_datetime(self, dt):
+        if " ago" not in dt:
+            return text.parse_datetime(dt, "%d %b %Y")
+
+        value = text.parse_int(dt[:-5])
+        delta = timedelta(0, value*3600) if dt[-5] == "h" else timedelta(value)
+        return (util.datetime_utcnow() - delta).replace(
+            hour=0, minute=0, second=0)
+
+    @memcache(keyarg=2)
+    def _extract_gallery_title(self, page, gallery_id):
+        title = text.extr(
+            text.extr(page, '<h1 class="content-title">', "</h1>"),
+            "From the gallery:", "<")
+        if title:
+            return text.unescape(title.strip())
+
+        pos = page.find(' href="/G' + gallery_id + '"')
+        if pos >= 0:
+            return text.unescape(text.extract(
+                page, ' title="', '"', pos)[0])
+
+        return ""
+
+
+class MotherlessGalleryExtractor(MotherlessExtractor):
+    """Extractor for a motherless.com gallery"""
+    subcategory = "gallery"
+    directory_fmt = ("{category}", "{uploader}",
+                     "{gallery_id} {gallery_title}")
+    archive_fmt = "{gallery_id}_{id}"
+    pattern = BASE_PATTERN + "/G([IVG])?([A-Z0-9]+)/?$"
+    example = "https://motherless.com/GABC123"
+
+    def items(self):
+        type, gid = self.groups
+
+        if not type:
+            data = {"_extractor": MotherlessGalleryExtractor}
+            yield Message.Queue, self.root + "/GI" + gid, data
+            yield Message.Queue, self.root + "/GV" + gid, data
+            return
+
+        url = "{}/G{}{}".format(self.root, type, gid)
+        page = self.request(url).text
+        data = self._extract_gallery_data(page)
+
+        for num, thumb in enumerate(self._pagination(page), 1):
+            file = self._parse_thumb_data(thumb)
+            file.update(data)
+            file["num"] = num
+            url = file["url"]
+            yield Message.Directory, file
+            yield Message.Url, url, text.nameext_from_url(url, file)
+
+    def _pagination(self, page):
+        while True:
+            for thumb in text.extract_iter(
+                    page, 'class="thumb-container', "</div>"):
+                yield thumb
+
+            url = text.extr(page, '<link rel="next" href="', '"')
+            if not url:
+                return
+            page = self.request(text.unescape(url)).text
+
+    def _extract_gallery_data(self, page):
+        extr = text.extract_from(page)
+        return {
+            "gallery_id": self.groups[-1],
+            "gallery_title": text.unescape(extr(
+                "<title>", "<").rpartition(" | ")[0]),
+            "uploader": text.remove_html(extr(
+                'class="gallery-member-username">', "</")),
+            "count": text.parse_int(
+                extr('<span class="active">', ")")
+                .rpartition("(")[2].replace(",", "")),
+        }
+
+    def _parse_thumb_data(self, thumb):
+        extr = text.extract_from(thumb)
+        data = {
+            "id"       : extr('data-codename="', '"'),
+            "type"     : extr('data-mediatype="', '"'),
+            "thumbnail": extr('class="static" src="', '"'),
+            "title"    : extr(' alt="', '"'),
+        }
+
+        type = data["type"]
+        url = data["thumbnail"].replace("thumb", type)
+        if type == "video":
+            url = "{}/{}.mp4".format(url.rpartition("/")[0], data["id"])
+        data["url"] = url
+
+        return data
diff --git a/test/results/motherless.py b/test/results/motherless.py
new file mode 100644
index 0000000000..4e372b2bac
--- /dev/null
+++ b/test/results/motherless.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import motherless
+
+
+__tests__ = (
+{
+    "#url"  : "https://motherless.com/B0168DB",
+    "#class": motherless.MotherlessMediaExtractor,
+    "#urls" : "https://cdn5-images.motherlessmedia.com/images/B0168DB.jpg",
+    "#sha1_content": "10629fc5dd7a9623af7dd57f1a322d0f24ac9acc",
+
+    "date"     : "dt:2013-03-29 00:00:00",
+    "extension": "jpg",
+    "favorites": range(0, 10),
+    "filename" : "B0168DB",
+    "group"    : "",
+    "id"       : "B0168DB",
+    "tags"     : [
+        "Lady J",
+        "outdoor",
+        "closeup. face"
+    ],
+    "title"    : "388652199_d6fc8a9515_o.jpg",
+    "type"     : "image",
+    "uploader" : "anonymous",
+    "url"      : "https://cdn5-images.motherlessmedia.com/images/B0168DB.jpg",
+    "views"    : range(90, 200),
+
+},
+
+{
+    "#url"  : "https://motherless.com/G43D8704/F0C07D3",
+    "#class": motherless.MotherlessMediaExtractor,
+    "#urls" : "https://cdn5-images.motherlessmedia.com/images/F0C07D3.jpg",
+
+    "date"      : "dt:2014-08-13 00:00:00",
+    "extension" : "jpg",
+    "favorites" : range(100, 200),
+    "filename"  : "F0C07D3",
+    "gallery_id": "43D8704",
+    "gallery_title": "SpeechLess",
+    "group"     : "",
+    "id"        : "F0C07D3",
+    "tags"      : [],
+    "title"     : "Spunky Angels Amy Black Dress",
+    "type"      : "image",
+    "uploader"  : "jonesyjonesy",
+    "url"       : "https://cdn5-images.motherlessmedia.com/images/F0C07D3.jpg",
+    "views"     : range(14000, 20000),
+},
+
+{
+    "#url"  : "https://motherless.com/g/classic_porn/19D6C80",
+    "#class": motherless.MotherlessMediaExtractor,
+    "#urls" : "https://cdn5-images.motherlessmedia.com/images/19D6C80.gif",
+
+    "date"     : "dt:2021-05-11 00:00:00",
+    "extension": "gif",
+    "favorites": range(10, 50),
+    "filename" : "19D6C80",
+    "group"    : "classic_porn",
+    "id"       : "19D6C80",
+    "tags"     : [],
+    "title"    : "Kaffee 1",
+    "type"     : "image",
+    "uploader" : "KurtRitter",
+    "url"      : "https://cdn5-images.motherlessmedia.com/images/19D6C80.gif",
+    "views"    : range(150000, 300000),
+},
+
+{
+    "#url"  : "https://motherless.com/G43D8704",
+    "#class": motherless.MotherlessGalleryExtractor,
+    "#urls": (
+        "https://motherless.com/GI43D8704",
+        "https://motherless.com/GV43D8704",
+    ),
+},
+
+{
+    "#url"  : "https://motherless.com/GI43D8704",
+    "#class": motherless.MotherlessGalleryExtractor,
+    "#pattern": r"https://cdn5-images\.motherlessmedia\.com/images/\w+\.(jpg|png|gif)",
+    "#range"  : "1-100",
+    "#count"  : 100,
+
+    "count"        : 6503,
+    "extension"    : {"jpg", "png", "gif"},
+    "filename"     : str,
+    "gallery_id"   : "43D8704",
+    "gallery_title": "SpeechLess",
+    "id"           : str,
+    "num"          : int,
+    "thumbnail"    : r"re:https://cdn5-thumbs\.motherlessmedia\.com/thumbs/\w+\.\w+",
+    "title"        : str,
+    "type"         : "image",
+    "uploader"     : "gaylobe",
+    "url"          : r"re:https://cdn5-images\.motherlessmedia\.com/images/\w+\.(jpg|png|gif)",
+},
+
+{
+    "#url"  : "https://motherless.com/GV43D8704",
+    "#class": motherless.MotherlessGalleryExtractor,
+    "#pattern": r"https://cdn5-videos.motherlessmedia.com/videos/\w+\.mp4",
+    "#range"  : "1-100",
+    "#count"  : 100,
+
+    "count"        : 869,
+    "extension"    : "mp4",
+    "filename"     : str,
+    "gallery_id"   : "43D8704",
+    "gallery_title": "SpeechLess",
+    "id"           : str,
+    "num"          : int,
+    "thumbnail"    : r"re:https://cdn5-thumbs\.motherlessmedia\.com/thumbs/[\w-]+\.\w+",
+    "title"        : str,
+    "type"         : "video",
+    "uploader"     : "gaylobe",
+    "url"          : r"re:https://cdn5-videos.motherlessmedia.com/videos/\w+\.mp4",
+},
+
+)