forked from KurtBestor/Hitomi-Downloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgelbooru_downloader.py
141 lines (116 loc) · 3.66 KB
/
gelbooru_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#coding: utf-8
import downloader
import re
import os
from utils import Downloader, urljoin, query_url, Soup, get_max_range
from fucking_encoding import clean_title
from translator import tr_
import urllib
import sys
from time import sleep
from constants import clean_url
# Number of posts requested per API page; get_imgs sends it as the
# ``limit`` query parameter of the dapi URL it builds.
LIMIT = 100
def get_tags(url):
    """Build the identifier string for a Gelbooru URL.

    For a favorites page (``page=favorites`` in the URL) the result is
    ``fav_<id>``, where ``<id>`` is the first ``id`` query value, or
    ``N/A`` when the parameter is absent.  For any other URL the ``tags``
    query values are sorted and joined with single spaces; an empty result
    is reported as ``N/A``.
    """
    url = clean_url(url)
    params = query_url(url)

    if 'page=favorites' in url:
        return u'fav_{}'.format(params.get('id', ['N/A'])[0])

    joined = u' '.join(sorted(params.get('tags', [])))
    return joined or u'N/A'
@Downloader.register
class Downloader_gelbooru(Downloader):
    """Downloader registered for gelbooru.com URLs and bare tag queries."""

    type = 'gelbooru'
    URLS = ['gelbooru.com']
    _id = None  # lazily filled cache behind the ``id`` property

    def init(self):
        """Normalise ``self.url`` into an HTTPS gelbooru.com URL.

        The ``gelbooru_`` marker is stripped first.  A value that already
        mentions ``gelbooru.com`` only has ``http://`` upgraded to
        ``https://``.  Anything else is treated as a raw tag query: spaces
        become ``+``, runs of ``+`` are collapsed to one, the text is
        percent-quoted (with ``+`` kept literal) and inserted into a
        post-list search URL.
        """
        self.url = self.url.replace('gelbooru_', '')

        if 'gelbooru.com' in self.url.lower():
            self.url = self.url.replace('http://', 'https://')
            return

        # Collapse every run of '+' (including those produced from spaces).
        query = re.sub(r'\+{2,}', '+', self.url.replace(' ', '+'))
        # quote() escapes '+' as '%2B'; restore it so it stays a separator.
        query = urllib.quote(query).replace('%2B', '+')
        self.url = u'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(query)

    @property
    def id(self):
        """Identifier from ``get_tags(self.url)``, computed once and cached."""
        if self._id is None:
            self._id = get_tags(self.url)
        return self._id

    @property
    def name(self):
        """``self.id`` passed through ``clean_title``."""
        return clean_title(self.id)

    def read(self):
        """Fill ``self.urls`` and ``self.filenames`` from ``get_imgs``.

        The title is set to ``self.name`` both before and after the
        listing is read.
        """
        self.title = self.name

        found = get_imgs(self.url, self.name, customWidget=self.customWidget)
        for image in found:
            self.urls.append(image.url)
            self.filenames[image.url] = image.filename

        # NOTE(review): the reason for this half-second pause is not
        # evident from this file.
        sleep(.5)
        self.title = self.name
class Image(object):
    """One downloadable post: its id, its file URL and a local filename."""

    def __init__(self, id, url):
        """Store *id* and *url* and derive ``filename``.

        ``filename`` is *id* followed by the extension that
        ``os.path.splitext`` reports for *url* (possibly empty).
        """
        _, extension = os.path.splitext(url)
        self.id = id
        self.url = url
        self.filename = u'{}{}'.format(id, extension)
def setPage(url, page):
    """Return *url* forced to HTTPS with its ``pid`` parameter set to *page*.

    Every existing ``pid=<digits>`` occurrence is rewritten; when the URL
    has no ``pid=`` at all, ``&pid=<page>`` is appended.
    """
    secure_url = url.replace('http://', 'https://')
    pid_param = 'pid={}'.format(page)

    if 'pid=' not in secure_url:
        return secure_url + '&' + pid_param
    return re.sub('pid=[0-9]*', pid_param, secure_url)
def get_imgs(url, title=None, customWidget=None):
    """Collect the images listed by a Gelbooru search or favorites URL.

    Unless *url* already targets the ``page=dapi`` endpoint, it is replaced
    by a dapi post-index URL built from the tags ``get_tags`` extracts from
    it.  Pages are then requested in order (``pid`` 0, 1, 2, ...) until one
    of the following happens: a page has no ``<post>`` elements, the
    collected count reaches the limit from ``get_max_range``, the widget is
    no longer alive, or 500 pages have been read.

    Args:
        url: Gelbooru URL to read.
        title: Label included in the widget title while reading.
        customWidget: Optional UI object.  When given, its ``print_``,
            ``alive`` and ``setTitle`` members are used for logging,
            cancellation and progress display; otherwise messages go to
            ``sys.stdout``.

    Returns:
        A list of ``Image`` objects, one per distinct ``file_url``.  The
        limit is checked after each whole page, so the list can be longer
        than the limit.

    Raises:
        NotImplementedError: If *url* contains ``s=view`` and is not a
            favorites page.
    """
    url = clean_url(url)
    if 's=view' in url and 'page=favorites' not in url:
        raise NotImplementedError('Not Implemented')

    if 'page=dapi' not in url.lower():
        tags = get_tags(url)
        tags = urllib.quote(tags, safe='/')
        # get_tags joins tags with spaces; send them as '+' rather than '%20'.
        tags = tags.replace('%20', '+')
        url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)

    if customWidget is not None:
        print_ = customWidget.print_
    else:
        def print_(*values):
            sys.stdout.writelines(values + ('\n',))

    # Range
    # NOTE(review): presumably the user-configured maximum with 2000 as the
    # default -- confirm against utils.get_max_range.
    max_pid = get_max_range(customWidget, 2000)

    imgs = []
    url_imgs = set()  # file URLs already collected, for de-duplication
    for p in range(500):  # 1017
        url = setPage(url, p)
        print_(url)
        html = downloader.read_html(url)

        soup = Soup(html)
        posts = soup.findAll('post')
        if not posts:
            break

        for post in posts:
            url_img = post.attrs['file_url']
            if url_img in url_imgs:
                # Report through the selected logger instead of a bare
                # ``print`` statement, so the notice reaches the widget log
                # and the function contains no Python-2-only syntax.
                print_(u'already exists {}'.format(url_img))
                continue
            url_imgs.add(url_img)
            post_id = post.attrs['id']
            imgs.append(Image(post_id, url_img))

        if len(imgs) >= max_pid:
            break

        if customWidget is not None:
            if not customWidget.alive:
                break
            customWidget.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
    return imgs