From 4eb3590103bc3a19db7f7cf5b79593f6d2645bb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 5 Dec 2023 17:48:50 +0100 Subject: [PATCH] [nijie] fix image URLs of multi-image posts (#4876) --- gallery_dl/extractor/nijie.py | 34 ++++++++++---------- test/results/horne.py | 36 +++++++++++++++++++++- test/results/nijie.py | 58 ++++++++++++++++++++++++++++++++++- 3 files changed, 108 insertions(+), 20 deletions(-) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 76c5404e31..54f294293b 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -57,7 +57,11 @@ def items(self): data["user_name"] = data["artist_name"] yield Message.Directory, data - for image in self._extract_images(page): + for num, url in enumerate(self._extract_images(image_id, page)): + image = text.nameext_from_url(url, { + "num": num, + "url": "https:" + url, + }) image.update(data) if not image["extension"]: image["extension"] = "jpg" @@ -72,7 +76,7 @@ def _extract_data(page): extr = text.extract_from(page) keywords = text.unescape(extr( 'name="keywords" content="', '" />')).split(",") - data = { + return { "title" : keywords[0].strip(), "description": text.unescape(extr( '"description": "', '"').replace("&amp;", "&")), @@ -82,7 +86,6 @@ def _extract_data(page): "artist_name": keywords[1], "tags" : keywords[2:-1], } - return data @staticmethod def _extract_data_horne(page): @@ -90,7 +93,7 @@ def _extract_data_horne(page): extr = text.extract_from(page) keywords = text.unescape(extr( 'name="keywords" content="', '" />')).split(",") - data = { + return { "title" : keywords[0].strip(), "description": text.unescape(extr( 'property="og:description" content="', '"')), @@ -101,21 +104,16 @@ def _extract_data_horne(page): "itemprop='datePublished' content=", "<").rpartition(">")[2], "%Y-%m-%d %H:%M:%S", 9), } - return data - @staticmethod - def _extract_images(page): - """Extract image URLs from 'page'""" - images = 
text.extract_iter(page, "/view_popup.php", "</a>") - for num, image in enumerate(images): - src = text.extr(image, 'src="', '"') - if not src: - continue - url = ("https:" + src).replace("/__rs_l120x120/", "/") - yield text.nameext_from_url(url, { - "num": num, - "url": url, - }) + def _extract_images(self, image_id, page): + if '&#diff_1" ' in page: + # multiple images + url = "{}/view_popup.php?id={}".format(self.root, image_id) + page = self.request(url).text + yield from text.extract_iter( + page, 'href="javascript:void(0);">