Skip to content

Commit

Permalink
implement '-e/--error-file' command-line option (#4732)
Browse files Browse the repository at this point in the history
copying per-URL options from regular, read-only input files
does currently not work
  • Loading branch information
mikf committed Dec 5, 2023
1 parent 4eb3590 commit 99b7662
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 16 deletions.
4 changes: 3 additions & 1 deletion docs/options.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
-E, --extractor-info Print extractor defaults and settings
-K, --list-keywords Print a list of available keywords and example
values for the given URLs
-e, --error-file FILE Add input URLs which returned an error to FILE
--list-modules Print a list of available extractor modules
--list-extractors Print a list of extractor classes with
description, (sub)category and example URL
Expand All @@ -51,7 +52,8 @@
## Downloader Options:
-r, --limit-rate RATE Maximum download rate (e.g. 500k or 2.5M)
-R, --retries N Maximum number of retries for failed HTTP
requests or -1 for infinite retries (default: 4)
requests or -1 for infinite retries (default:
4)
--http-timeout SECONDS Timeout for HTTP connections (default: 30.0)
--sleep SECONDS Number of seconds to wait before each download.
This can be either a constant value or a range
Expand Down
68 changes: 53 additions & 15 deletions gallery_dl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,9 @@ def main():
input_log.error(exc)
return getattr(exc, "code", 128)

if args.error_file:
input_manager.error_file(args.error_file)

pformat = config.get(("output",), "progress", True)
if pformat and len(input_manager.urls) > 1 and \
args.loglevel < logging.ERROR:
Expand All @@ -270,6 +273,7 @@ def main():

if status:
retval |= status
input_manager.error()
else:
input_manager.success()

Expand All @@ -281,6 +285,7 @@ def main():
except exception.NoExtractorError:
log.error("Unsupported URL '%s'", url)
retval |= 64
input_manager.error()

input_manager.next()
return retval
Expand All @@ -301,9 +306,12 @@ class InputManager():
def __init__(self):
self.urls = []
self.files = ()

self._url = ""
self._item = None
self._index = 0
self._current = None
self._pformat = None
self._error_fp = None

def add_url(self, url):
self.urls.append(url)
Expand Down Expand Up @@ -428,6 +436,15 @@ def add_file(self, path, action=None):
else:
append(url)

def error_file(self, path):
try:
path = util.expand_path(path)
self._error_fp = open(path, "a", encoding="utf-8")
except Exception as exc:
self.log.warning(
"Unable to open error file (%s: %s)",
exc.__class__.__name__, exc)

def progress(self, pformat=True):
if pformat is True:
pformat = "[{current}/{total}] {url}\n"
Expand All @@ -439,17 +456,37 @@ def next(self):
self._index += 1

def success(self):
if self._current:
url, path, action, indicies = self._current
lines = self.files[path]
action(lines, indicies)
if self._item:
self._rewrite()

def error(self):
if self._error_fp:
if self._item:
url, path, action, indicies = self._item
lines = self.files[path]
out = "".join(lines[i] for i in indicies)
self._rewrite()
else:
out = str(self._url) + "\n"

try:
with open(path, "w", encoding="utf-8") as fp:
fp.writelines(lines)
self._error_fp.write(out)
except Exception as exc:
self.log.warning(
"Unable to update '%s' (%s: %s)",
path, exc.__class__.__name__, exc)
self._error_fp.name, exc.__class__.__name__, exc)

def _rewrite(self):
url, path, action, indicies = self._item
lines = self.files[path]
action(lines, indicies)
try:
with open(path, "w", encoding="utf-8") as fp:
fp.writelines(lines)
except Exception as exc:
self.log.warning(
"Unable to update '%s' (%s: %s)",
path, exc.__class__.__name__, exc)

@staticmethod
def _action_comment(lines, indicies):
Expand All @@ -467,23 +504,24 @@ def __iter__(self):

def __next__(self):
try:
item = self.urls[self._index]
url = self.urls[self._index]
except IndexError:
raise StopIteration

if isinstance(item, tuple):
self._current = item
item = item[0]
if isinstance(url, tuple):
self._item = url
url = url[0]
else:
self._current = None
self._item = None
self._url = url

if self._pformat:
output.stderr_write(self._pformat({
"total" : len(self.urls),
"current": self._index + 1,
"url" : item,
"url" : url,
}))
return item
return url


class ExtendedUrl():
Expand Down
5 changes: 5 additions & 0 deletions gallery_dl/option.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,11 @@ def build_parser():
help=("Print a list of available keywords and example values "
"for the given URLs"),
)
output.add_argument(
"-e", "--error-file",
dest="error_file", metavar="FILE",
help="Add input URLs which returned an error to FILE",
)
output.add_argument(
"--list-modules",
dest="list_modules", action="store_true",
Expand Down

5 comments on commit 99b7662

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 99b7662 Dec 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this --error-file option already supported for usage in the config file? Or will be?

@mikf
Copy link
Owner Author

@mikf mikf commented on 99b7662 Dec 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not and it was never intended to, as I thought of it as similar to --input-file.
I guess you'd like me to add it as a global config file option?

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 99b7662 Dec 6, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I see what you mean.

--error-file is designed to be used together with --input-file, for users who do not set up logging output in their config, right?
Or, alternatively, if you supply multiple URLs (dozens, maybe) on the command-line, you can "log" failing URLs of this input via --error-file, correct?

Now that I think about it, --error-file does apparently not do anything differently (or anything more) than a proper logging configuration - which I already use.

Okay, simply forget my comments here and keep it as it is..

@mikf
Copy link
Owner Author

@mikf mikf commented on 99b7662 Dec 8, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having a designated file for input URL errors is better than having to comb through the general logging output I think, so I added this option anyway. 042a9da

Maybe I should make this part of the same mechanism as --write-unsupported ...

@Hrxn
Copy link
Contributor

@Hrxn Hrxn commented on 99b7662 Dec 9, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having a designated file for input URL errors is better than having to comb through the general logging output I think, so I added this option anyway. 042a9da

Good point. I agree..

Maybe I should make this part of the same mechanism as --write-unsupported ...

Maybe, I can't really make any judgement here from the implementation's perspective.
On the other hand, right now we have output.unsupportedfile, which can be either of type Path or a logging config object, with path as the property here to point to the actual unsupported output file, so we cannot use path again here, using a different identifier name (like path-errorfile or whatevs) would be kinda inconsistent, so, at least in the config, it would have been it's own output.errofile, which is now a matter of fact anyway with 042a9da

I don't really know where I'm going here actually.. I realize the structure of the config can be pretty much completely independent from the details of the actual implementation.

Please sign in to comment.