-
Notifications
You must be signed in to change notification settings - Fork 70
/
Copy pathlib.sh
executable file
·721 lines (618 loc) · 24.7 KB
/
lib.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
#!/usr/bin/env bash
# Strict mode for everything that sources this library:
#   -e  exit when a command fails
#   -E  let functions, command substitutions and subshells inherit the ERR trap
#   -u  treat expansion of unset variables as an error
#   -o pipefail  a pipeline fails when any of its stages fails
set -eEuo pipefail
# Make command substitutions inherit errexit as well.
shopt -s inherit_errexit
# A simple trap handler for exiting on errors in subshells: it reports the
# output of the `caller` builtin (line number and source file of the failing
# call) on stderr and terminates the current shell with exit status 13.
err_trap_handler() {
echo "ERROR on line $(caller)!" >&2
exit 13
}
# Run the handler whenever a command fails (subject to the usual ERR rules).
trap err_trap_handler ERR
# Library version. It is not referenced anywhere in this file, hence the
# SC2034 ("appears unused") suppression; presumably it is read by the scripts
# that source this library.
#shellcheck disable=SC2034
VERSION="0.5.1"
# ANSI escape sequences for coloured/bold terminal output. NC resets the
# formatting. They are not used in this file either (see SC2034 above).
#shellcheck disable=SC2034
GREEN='\033[0;32m'
#shellcheck disable=SC2034
RED='\033[0;31m'
#shellcheck disable=SC2034
BOLD='\033[1m'
#shellcheck disable=SC2034
NC='\033[0m'
# Boolean flags ("true"/"false"). `:=` only assigns the default when the
# variable is unset or empty, so values coming from the environment win.
# VERBOSE controls decho(); DRY_RUN, SYMLINK_ONLY and KEEP_METADATA are
# consulted by the move_or_link_* functions.
: "${VERBOSE:=false}"
: "${DRY_RUN:=false}"
: "${SYMLINK_ONLY:=false}"
: "${KEEP_METADATA:=false}"
# Regex matched against the file extension in check_file_for_corruption();
# files whose extension matches are additionally tested with `7z t`.
: "${TESTED_ARCHIVE_EXTENSIONS:=^(7z|bz2|chm|arj|cab|gz|tgz|gzip|zip|rar|xz|tar|epub|docx|odt|ods|cbr|cbz|maff|iso)\$}"
# Horizontal whitespace and dash-like ASCII and Unicode characters that are
# used for better matching of ISBNs in (badly) OCR-ed books. Gathered from:
# - https://en.wikipedia.org/wiki/Whitespace_character
# - https://en.wikipedia.org/wiki/Dash#Similar_Unicode_characters
# - https://en.wikipedia.org/wiki/Dash#Common_dashes
# The value is a single character class followed by `?`, i.e. it matches at
# most one such character. The \x{...} escapes are PCRE syntax; the pattern
# is consumed by `grep -P` in find_isbns().
: "${WSD:="[\\x{0009}\\x{0020}\\x{00A0}\\x{1680}\\x{2000}\
\\x{2001}\\x{2002}\\x{2003}\\x{2004}\\x{2005}\\x{2006}\\x{2007}\\x{2008}\
\\x{2009}\\x{200A}\\x{202F}\\x{205F}\\x{3000}\\x{180E}\\x{200B}\\x{200C}\
\\x{200D}\\x{2060}\\x{FEFF}\\x{002D}\\x{005F}\\x{007E}\\x{00AD}\\x{00AF}\
\\x{02C9}\\x{02CD}\\x{02D7}\\x{02DC}\\x{2010}\\x{2011}\\x{2012}\\x{203E}\
\\x{2043}\\x{207B}\\x{208B}\\x{2212}\\x{223C}\\x{23AF}\\x{23E4}\\x{2500}\
\\x{2796}\\x{2E3A}\\x{2E3B}\\x{10191}\\x{2012}\\x{2013}\\x{2014}\\x{2015}\
\\x{2053}\\x{058A}\\x{05BE}\\x{1428}\\x{1B78}\\x{3161}\\x{30FC}\\x{FE63}\
\\x{FF0D}\\x{10110}\\x{1104B}\\x{11052}\\x{110BE}\\x{1D360}]?"}"
# This regular expression should match most ISBN10/13-like sequences in
# texts. To minimize false-positives, matches should be passed through
# is_isbn_valid() or another ISBN validator
: "${ISBN_REGEX:="(?<![0-9])(${WSD}9${WSD}7${WSD}[789]${WSD})?+((${WSD}[0-9]${WSD}){9}[0-9xX])(?![0-9])"}"
# MIME types (regex) of files whose contents search_file_for_isbns() greps
# directly for ISBNs, without any conversion.
: "${ISBN_DIRECT_GREP_FILES:="^(text/(plain|xml|html)|application/xml)\$"}"
# MIME types (regex) of files that search_file_for_isbns() gives up on right
# after the file name check.
: "${ISBN_IGNORED_FILES:="^(image/(gif|svg.+)|application/(x-shockwave-flash|CDFV2|vnd.ms-opentype|x-font-ttf|x-dosexec|vnd.ms-excel|x-java-applet)|audio/.+|video/.+)\$"}"
# Separator placed between the ISBNs returned by find_isbns().
: "${ISBN_RET_SEPARATOR:=,}"
# This is matched against normalized valid-looking ISBNs and any numbers that
# match it are discarded.
# The default value should match 0123456789 and any ISBN-10 that uses only one
# digit (e.g. 1111111111 or 3333333333)
# Note the plain `=` (not `:=`): an explicitly empty value is kept, and
# find_isbns() skips the blacklist filtering when the value is empty.
: "${ISBN_BLACKLIST_REGEX="^(0123456789|([0-9xX])\\2{9})\$"}"
# These options specify if and how we should reorder ISBN_DIRECT_GREP files
# before passing them to find_isbns(). If true, the first
# ISBN_GREP_RF_SCAN_FIRST lines of the files are passed as is, then we pass
# the last ISBN_GREP_RF_REVERSE_LAST in reverse order and finally we pass the
# remainder in the middle. There is no issue if files have fewer lines, there
# will be no duplicate lines passed to grep.
# The reordering itself is implemented in cat_file_for_isbn_grep().
: "${ISBN_GREP_REORDER_FILES:=true}"
: "${ISBN_GREP_RF_SCAN_FIRST:=400}"
: "${ISBN_GREP_RF_REVERSE_LAST:=50}"
# Whether to use OCR on image files, pdfs and djvu files for ISBN searching
# and conversion to txt
# search_file_for_isbns() distinguishes three kinds of values: "false" (never
# run OCR), "always" (also run OCR when the converted text has no ISBNs) and
# anything else (run OCR only when conversion fails or yields no text).
: "${OCR_ENABLED:=false}"
# "<first>,<last>": how many pages from the start and from the end of a
# multi-page document ocr_file() processes; "false" means every page.
: "${OCR_ONLY_FIRST_LAST_PAGES:=7,3}"
# Command or function that ocr_file() invokes as: <cmd> <input image> <output txt>
: "${OCR_COMMAND:=tesseract_wrapper}"
# Set to empty string if using Calibre versions before 2.84, previous versions did not support the
# --allowed-plugin option that we use to search for metadata in the order specified below, so they
# can search in all enabled sources in the GUI.
# Plain `=` is used so that an explicitly empty value is preserved.
: "${ISBN_METADATA_FETCH_ORDER="Goodreads,Amazon.com,Google,ISBNDB,WorldCat xISBN,OZON.ru"}"
# Not referenced in this file; presumably the metadata sources used by callers
# when organizing books without an ISBN — TODO confirm against the callers.
: "${ORGANIZE_WITHOUT_ISBN_SOURCES="Goodreads,Amazon.com,Google"}"
# Regex fragment for a four-digit year: 1900-1999, or 2000 up to the end of
# the current decade (the third digit of the current year, taken from `date`,
# is the upper bound of the decade digit).
: "${RE_YEAR:="(19[0-9]|20[0-$(date '+%Y' | cut -b 3)])[0-9]"}"
# Should be matched against a lowercase filename.ext, lines that start with #
# and newlines are removed. The default value should filter out most periodicals
# (the `grep -v '^#' | tr -d '\n'` at the end does that removal, which is why
# the multi-line string below can carry its own explanatory "#" lines).
: "${WITHOUT_ISBN_IGNORE:=$(echo "
# Perdiodicals with filenames that contain something like 2010-11, 199010, 2015_7, 20110203:
(^|[^0-9])${RE_YEAR}[ _\\.-]*(0?[1-9]|10|11|12)([0-9][0-9])?(\$|[^0-9])
# Periodicals with month numbers before the year
|(^|[^0-9])([0-9][0-9])?(0?[1-9]|10|11|12)[ _\\.-]*${RE_YEAR}(\$|[^0-9])
# Periodicals with months or issues
|((^|[^a-z])(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|june?|july?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?|mag(azine)?|issue|#[ _\\.-]*[0-9]+)+(\$|[^a-z]))
# Periodicals with seasons and years
|((spr(ing)?|sum(mer)?|aut(umn)?|win(ter)?|fall)[ _\\.-]*${RE_YEAR})
|(${RE_YEAR}[ _\\.-]*(spr(ing)?|sum(mer)?|aut(umn)?|win(ter)?|fall))
" | grep -v '^#' | tr -d '\n')}"
# Defaults for tokenize(): the minimum token length and a regex alternation of
# tokens that are dropped from its output.
: "${TOKEN_MIN_LENGTH:=3}"
: "${TOKENS_TO_IGNORE:="ebook|book|novel|series|ed(ition)?|vol(ume)?|${RE_YEAR}"}"
# Extra flags appended to `sort -z` in get_all_isbns_from_archive(); an array,
# so it cannot be defaulted with the `: "${VAR:=...}"` idiom.
[ -z "${FILE_SORT_FLAGS:+x}" ] && FILE_SORT_FLAGS=()
# Template for new file names. It is expanded with `eval` in
# move_or_link_ebook_file_and_metadata(), where `d` is the associative array
# of metadata fields plus the file extension (EXT). Single quotes keep the
# ${...} expressions unexpanded until then (hence SC2016).
#shellcheck disable=SC2016
[ -z "${OUTPUT_FILENAME_TEMPLATE:+x}" ] && OUTPUT_FILENAME_TEMPLATE='"${d[AUTHORS]// & /, } - ${d[SERIES]:+[${d[SERIES]//:/ -}] - }${d[TITLE]//:/ -}${d[PUBLISHED]:+ (${d[PUBLISHED]%%-*})}${d[ISBN]:+ [${d[ISBN]}]}.${d[EXT]}"'
# Extension appended to the new ebook path when the metadata file is kept.
: "${OUTPUT_METADATA_EXTENSION:=meta}"
# Width that get_all_isbns_from_archive() passes to debug_prefixer() for its
# per-file debug prefixes.
: "${DEBUG_PREFIX_LENGTH:=40}"
# Parses one command-line argument and stores its value in the matching
# global configuration variable.
#
# Arguments:
#   $1: a single argument, either a bare flag (e.g. -v) or --name=value
# Side effects:
#   - sets the corresponding global; --lib-hook=<path> sources <path>
#   - prints a message and exits with status 4 for an unknown option that
#     starts with "-"
# Arguments that do not start with "-" are silently ignored.
#shellcheck disable=SC2034
handle_script_arg() {
  local arg="$1"
  # Everything after the first "="; only meaningful for --name=value options.
  local value="${arg#*=}"
  case "$arg" in
    -v|--verbose) VERBOSE=true ;;
    -d|--dry-run) DRY_RUN=true ;;
    -sl|--symlink-only) SYMLINK_ONLY=true ;;
    -km|--keep-metadata) KEEP_METADATA=true ;;
    --tested-archive-extensions=*) TESTED_ARCHIVE_EXTENSIONS="$value" ;;
    -i=*|--isbn-regex=*) ISBN_REGEX="$value" ;;
    --isbn-blacklist-regex=*) ISBN_BLACKLIST_REGEX="$value" ;;
    --isbn-direct-grep-files=*) ISBN_DIRECT_GREP_FILES="$value" ;;
    --isbn-ignored-files=*) ISBN_IGNORED_FILES="$value" ;;
    --reorder-files-for-grep=*)
      # Accepts either "false" or "<scan_first>,<reverse_last>". The value
      # (not the whole argument) must be inspected and split here.
      if [[ "$value" == "false" ]]; then
        ISBN_GREP_REORDER_FILES=false
      else
        ISBN_GREP_REORDER_FILES=true
        ISBN_GREP_RF_SCAN_FIRST="${value%,*}"
        ISBN_GREP_RF_REVERSE_LAST="${value##*,}"
      fi
      ;;
    -ocr=*|--ocr-enabled=*) OCR_ENABLED="$value" ;;
    -ocrop=*|--ocr-only-first-last-pages=*) OCR_ONLY_FIRST_LAST_PAGES="$value" ;;
    -ocrc=*|--ocr-command=*) OCR_COMMAND="$value" ;;
    --token-min-length=*) TOKEN_MIN_LENGTH="$value" ;;
    --tokens-to-ignore=*) TOKENS_TO_IGNORE="$value" ;;
    -mfo=*|--metadata-fetch-order=*) ISBN_METADATA_FETCH_ORDER="$value" ;;
    -owis=*|--organize-without-isbn-sources=*) ORGANIZE_WITHOUT_ISBN_SOURCES="$value" ;;
    -wii=*|--without-isbn-ignore=*) WITHOUT_ISBN_IGNORE="$value" ;;
    # The flags are split on spaces into the FILE_SORT_FLAGS array.
    -fsf=*|--file-sort-flags=*) IFS=" " read -r -a FILE_SORT_FLAGS <<< "$value" ;;
    -oft=*|--output-filename-template=*) OUTPUT_FILENAME_TEMPLATE="$value" ;;
    -ome=*|--output-metadata-extension=*) OUTPUT_METADATA_EXTENSION="$value" ;;
    --debug-prefix-length=*) DEBUG_PREFIX_LENGTH="$value" ;;
    --lib-hook=*)
      # shellcheck source=/dev/null
      source "$value"
      ;;
    -*) echo "Invalid option '$arg'"; exit 4; ;;
  esac
}
# Debug echo: prints its arguments to stderr, but only when VERBOSE is
# exactly "true" (an unset VERBOSE counts as "false"). Otherwise it does
# nothing and succeeds.
decho() {
  [[ "${VERBOSE:-false}" == true ]] || return 0
  echo "$@" >&2
}
# Prefixes every line read from stdin with the supplied prefix and passes the
# result to decho(), so the output appears on stderr only when VERBOSE is on.
#
# Arguments:
#   prefix: the string with which we will prefix the lines
#   [should_fit_in]: number of characters to which we want to shorten or pad
#                    the prefix so it fits; 0 is disabled
#   [...]: everything else is passed to the fmt command; when there are no
#          further arguments the input is passed through unformatted
debug_prefixer() {
  local prefix="$1" line
  if (( $# > 1 )); then
    local should_fit_in="$2"
    if (( should_fit_in > 0 )); then
      if (( ${#prefix} <= should_fit_in )); then
        # Short enough: right-pad with spaces up to the requested width.
        prefix="$(printf "%-${should_fit_in}s" "$prefix")"
      elif (( should_fit_in > 12 )); then
        # Too long: keep the first 10 characters, "..", and as much of the
        # tail as still fits into the requested width.
        prefix="${prefix:0:10}..${prefix:(-$((should_fit_in - 12)))}"
      else
        # Too narrow for the head + ".." + tail form (the tail offset would
        # be zero or positive and the result longer than requested), so
        # simply truncate.
        prefix="${prefix:0:should_fit_in}"
      fi
    fi
    shift
  fi
  shift
  ( if [[ "$#" != "0" ]]; then fmt "$@"; else cat; fi ) |
    while IFS= read -r line || [[ -n "$line" ]]; do
      decho "${prefix}${line}"
    done
}
# Filter: lowercases stdin. Every run of characters that the current locale
# classifies as upper case is rewritten with sed's \L conversion (a GNU sed
# extension), so non-ASCII letters are handled when the locale supports them.
to_lower() {
  sed --regexp-extended --expression='s/[[:upper:]]+/\L&/g'
}
# Filter: drops repeated lines while keeping the original order, i.e. only
# the first occurrence of each distinct line is printed.
uniq_no_sort() {
  awk '{ if (!(seen[$0]++)) print }'
}
# Joins the arguments that follow the first one, using the first argument as
# the separator (which may be several characters long). Nothing is printed
# when there is nothing to join, and no trailing newline is emitted.
#
# Arguments:
#   $1: the separator
#   [...]: the values to join
str_concat() {
  local od="$1" item
  shift
  (( $# > 0 )) || return 0
  # printf instead of `echo -n`, so that values such as "-n" or "-e" are
  # printed literally rather than being taken for echo options.
  printf '%s' "$1"
  shift
  for item in "$@"; do
    printf '%s%s' "$od" "$item"
  done
}
# Reads values from stdin and prints them joined with $1, without a trailing
# newline.
#
# Arguments:
#   $1: output separator; may be several characters long
#   [$2]: input delimiter, newline by default. It is handed to `read -d`,
#         which only honours its first character.
# A final value that is not terminated by the input delimiter is still
# included, also when it is the only value on stdin.
stream_concat() {
  local od="$1" id="${2:-$'\n'}" val=""
  # `read` fails at EOF but still fills val with any unterminated remainder,
  # so give up only when nothing at all was read.
  read -d "$id" -r val || [[ -n "$val" ]] || return 0
  # printf instead of `echo -n`, so that values such as "-n" are kept.
  printf '%s' "$val"
  while read -d "$id" -r val || [[ -n "$val" ]]; do
    printf '%s%s' "$od" "$val"
  done
}
# Validates ISBN-10 and ISBN-13 numbers.
#
# Arguments:
#   $1: the candidate; spaces and dashes are ignored, a lowercase "x" is
#       treated like "X"
# Returns:
#   0 when the candidate is a well-formed ISBN-10 (nine digits plus a digit or
#   X check character, weighted sum divisible by 11) or a well-formed ISBN-13
#   (prefix 978/979, thirteen digits, weighted sum divisible by 10);
#   1 otherwise.
is_isbn_valid() {
  local isbn i number sum=0
  isbn="$(printf '%s' "$1" | tr -d ' -' | tr '[:lower:]' '[:upper:]')"
  # The shape is checked up front: anything that is not a digit (or the final
  # X of an ISBN-10) must never reach the arithmetic below, where it would be
  # interpreted as a variable name.
  if [[ "$isbn" =~ ^[0-9]{9}[0-9X]$ ]]; then
    for i in {0..9}; do
      number="${isbn:$i:1}"
      if [[ "$number" == "X" ]]; then
        number=10
      fi
      sum=$(( sum + (number * (10 - i)) ))
    done
    if (( sum % 11 == 0 )); then
      return 0
    fi
  elif [[ "$isbn" =~ ^97[89][0-9]{10}$ ]]; then
    # Digits at even offsets have weight 1, digits at odd offsets weight 3.
    for i in {0..12..2}; do
      sum=$(( sum + ${isbn:$i:1} ))
    done
    for i in {1..11..2}; do
      sum=$(( sum + (${isbn:$i:1} * 3) ))
    done
    if (( sum % 10 == 0 )); then
      return 0
    fi
  fi
  return 1
}
# Copies at most $1 lines from stdin to stdout using the `read` builtin, so
# that whatever follows those lines is left on stdin for the next reader.
# A final line without a terminating newline is printed without one.
cat_n() {
  local buf="" remaining
  for (( remaining = $1; remaining > 0; remaining-- )); do
    if read -r buf; then
      echo "$buf"
    elif [[ -n "$buf" ]]; then
      # EOF reached in the middle of a line: emit the partial line as is.
      echo -n "$buf"
    fi
  done
}
# Prints the file $1 for ISBN searching. When ISBN_GREP_REORDER_FILES is
# "true" the lines are emitted in a different order: the first
# ISBN_GREP_RF_SCAN_FIRST lines as they are, then the last
# ISBN_GREP_RF_REVERSE_LAST lines in reverse, then whatever remains in the
# middle. With any other setting the file is printed unchanged.
cat_file_for_isbn_grep() {
  if [[ "$ISBN_GREP_REORDER_FILES" != true ]]; then
    cat "$1"
    return
  fi
  decho "Reordering input file (if possible, read first $ISBN_GREP_RF_SCAN_FIRST lines normally, then read last $ISBN_GREP_RF_REVERSE_LAST lines in reverse and then read the rest"
  {
    # Head of the file, in order.
    cat_n "$ISBN_GREP_RF_SCAN_FIRST"
    # Reverse the remainder, take its first lines (the tail of the file,
    # backwards) and reverse what is left back into its original order.
    tac | {
      cat_n "$ISBN_GREP_RF_REVERSE_LAST"
      tac
    }
  } < "$1"
}
# Extracts ISBNs from stdin. Pipeline stages, in order:
#   1. grep every sequence matching ISBN_REGEX (one match per line)
#   2. strip everything except digits, x/X and newlines
#   3. drop duplicates while preserving the order of first appearance
#   4. keep only the candidates accepted by is_isbn_valid()
#   5. drop the ones matching ISBN_BLACKLIST_REGEX (skipped when it is empty)
#   6. join the survivors with ISBN_RET_SEPARATOR
# The `|| true` guards keep a grep that selects nothing from failing the
# whole pipeline.
find_isbns() {
  local candidate
  { grep -oP "$ISBN_REGEX" || true; } \
    | tr -c -d '0-9xX\n' \
    | uniq_no_sort \
    | while IFS='' read -r candidate || [[ -n "$candidate" ]]; do
        if is_isbn_valid "$candidate"; then
          echo "$candidate"
        fi
      done \
    | {
        if [[ -z "$ISBN_BLACKLIST_REGEX" ]]; then
          cat
        else
          grep -vP "$ISBN_BLACKLIST_REGEX" || true
        fi
      } \
    | stream_concat "$ISBN_RET_SEPARATOR"
}
# Succeeds when $1 names something the shell can run, as determined by
# `command -v`; fails otherwise. Produces no output.
command_exists() {
  command -v "$1" &> /dev/null
}
# Prints "$1/$2" if nothing exists at that path. Otherwise inserts " ($n)"
# (n = 1, 2, ...) before the extension of $2 and prints the first candidate
# for which nothing exists. When $2 has no extension the counter is simply
# appended to the name.
#
# Arguments:
#   $1: destination folder (without a trailing slash)
#   $2: desired file name
unique_filename() {
  local new_path="$1/$2" counter=0
  local stem="$2" ext=""
  # Split off the extension only when there is one; otherwise "${2##*.}"
  # would yield the whole name and duplicate it after the counter.
  if [[ "$2" == *.* ]]; then
    stem="${2%.*}"
    ext=".${2##*.}"
  fi
  while [[ -e "$new_path" ]]; do
    counter="$((counter + 1))"
    decho "File '$new_path' already exists in destination '$1', trying with counter $counter!"
    new_path="${1}/${stem} ($counter)${ext}"
  done
  printf '%s\n' "$new_path"
}
# Reads calibre-style text metadata ("Key   : value" lines) from stdin and
# prints the value of the first line that starts with the key $1.
# The value is everything after the first " : " on that line, so values that
# contain " : " themselves are returned in full. An empty line is printed
# when the matching line has no " : ", and nothing when no line matches.
# Note that $1 is interpolated into a grep pattern.
grep_meta_val() {
  { grep --max-count=1 "^$1" || true; } |
    awk '{ pos = index($0, " : "); print (pos ? substr($0, pos + 3) : "") }'
}
# Turns stdin into a list of lowercase tokens and prints them joined by a
# separator.
#
# Arguments (all optional):
#   $1: separator for the output; a single space by default
#   $2: "true" (default) to drop repeated tokens, anything else to keep them
#   $3: minimum token length; TOKEN_MIN_LENGTH by default
#   $4: regex alternation of whole tokens to discard (case-insensitive);
#       TOKENS_TO_IGNORE by default
# A token is a run of letters or a run of digits of at least the minimum
# length. The `|| true` guards keep greps that select nothing from failing
# the pipeline.
tokenize() {
  local joiner="${1:- }" unique="${2:-true}"
  local min_len="${3:-$TOKEN_MIN_LENGTH}" ignored="${4:-$TOKENS_TO_IGNORE}"
  { grep -oE "[[:alpha:]]{${min_len},}|[[:digit:]]{${min_len},}" || true; } \
    | to_lower \
    | {
        if [[ "$unique" != true ]]; then
          cat
        else
          uniq_no_sort
        fi
      } \
    | {
        if [[ -z "$ignored" ]]; then
          cat
        else
          grep -ivE "^($ignored)\$" || true
        fi
      } \
    | stream_concat "$joiner"
}
# Checks the supplied file for different kinds of corruption:
# - If it's zero-sized or contains only \0
# - If it has a pdf/djv/djvu extension but the generic
#   'application/octet-stream' MIME type
# - If it's a pdf and pdfinfo returns an error or reports a 0 x 0 page size
# - If it has an archive extension but `7z t` returns an error
#
# Arguments: the path of the file to check
# Output: a human-readable description of every detected problem on stdout;
# nothing is printed when no check fails. Debug details go through decho().
# The findings are reported via stdout only: the function uses a plain
# `return` after reporting a problem.
check_file_for_corruption() {
local file_path="$1"
decho "Testing '$file_path' for corruption..."
# Strip all NUL bytes and look at the first remaining byte; if there is none,
# the file is either empty or consists solely of zeros.
if [[ "$(tr -d '\0' < "$file_path" | head -c 1)" == "" ]]; then
echo "The file is empty or contains only zeros!"
return
fi
# The extension is whatever follows the last dot of the path.
local ext="${1##*.}" mimetype
mimetype="$(file --brief --mime-type "$file_path")"
if [[ "$mimetype" == "application/octet-stream" && "$ext" =~ ^(pdf|djv|djvu)$ ]]; then
echo "The file has a .$ext extension but '$mimetype' MIME type!"
elif [[ "$mimetype" == "application/pdf" ]]; then
decho "Checking pdf file for integrity..."
# The pdf check is best-effort: without pdfinfo it is skipped.
if ! command_exists pdfinfo; then
decho "pdfinfo does not exist, could not check if pdf is OK"
else
local pdfinfo_output
# stdout of pdfinfo is captured; the tail of its stderr is only forwarded to
# the debug output.
if ! pdfinfo_output="$(pdfinfo "$file_path" 2> >(tail | debug_prefixer "[pdfinfo-err|tail] " 0 --width=80 -s))"; then
decho "pdfinfo returned an error!"
echo "$pdfinfo_output" | debug_prefixer "[pdfinfo] " 0 --width=80 -t
echo "Has pdf MIME type or extension, but pdfinfo returned an error!"
return
else
decho "pdfinfo returned successfully"
echo "$pdfinfo_output" | debug_prefixer "[pdfinfo] " 0 --width=80 -t
# A pdf that parses but reports an empty page size is treated as corrupt too.
if echo "$pdfinfo_output" | grep --quiet -E '^Page size:\s*0 x 0 pts$'; then
decho "pdf is corrupt anyway, page size property is empty!"
echo "pdf can be parsed, but page size is 0 x 0 pts!"
fi
fi
fi
fi
# Independently of the checks above, files with a known archive extension are
# tested with 7z.
if [[ "$ext" =~ $TESTED_ARCHIVE_EXTENSIONS ]]; then
decho "The file has a '.$ext' extension, testing with 7z..."
local log
if ! log="$(7z t "$file_path" 2>&1)"; then
decho "Test failed!"
echo "$log" | debug_prefixer "[7z-test-log] " 0 --width=80 -s
echo "Looks like an archive, but testing it with 7z failed!"
fi
fi
}
# Converts an ebook file to plain text, choosing the tool by MIME type:
#   - application/pdf    -> pdftotext, when installed
#   - application/msword -> catdoc, when installed
#   - image/vnd.djvu*    -> djvutxt, when installed
#   - any other image/*  -> nothing is converted and 1 is returned
#   - everything else, including the types above when their dedicated tool is
#     missing -> calibre's ebook-convert
# Arguments: input path, output path (should have a .txt extension), mimetype
# Returns: the status of the tool that was run (1 for plain images)
convert_to_txt() {
  local src="$1" dst="$2" mime="$3"

  if [[ "$mime" == "application/pdf" ]] && command_exists pdftotext; then
    decho "The file looks like a pdf, using pdftotext to extract the text"
    pdftotext "$src" "$dst"
    return
  fi

  if [[ "$mime" == "application/msword" ]] && command_exists catdoc; then
    decho "The file looks like a doc, using catdoc to extract the text"
    catdoc "$src" > "$dst"
    return
  fi

  if [[ "$mime" == "image/vnd.djvu"* ]] && command_exists djvutxt; then
    decho "The file looks like a djvu, using djvutxt to extract the text"
    djvutxt "$src" "$dst"
    return
  fi

  if [[ "$mime" != "image/vnd.djvu"* && "$mime" == "image/"* ]]; then
    decho "The file looks like a normal image ($mime), skipping ebook-convert usage!"
    return 1
  fi

  decho "Trying to use calibre's ebook-convert to convert the '$mime' file to .txt"
  ebook-convert "$src" "$dst"
}
# Default OCR_COMMAND: runs tesseract on the image $1 (with --psm 12) and
# writes the recognized text to the file $2.
# NOTE: on failure this calls `exit 1`, which terminates the shell that is
# executing the function, not just the function itself.
tesseract_wrapper() {
  if ! tesseract "$1" stdout --psm 12 > "$2"; then
    exit 1
  fi
}
# Runs OCR on a file and writes the recognized text to the output path.
#
# Arguments: input path, output path, mimetype
#   - application/pdf and image/vnd.djvu*: each selected page is rendered to a
#     temporary image (gs / ddjvu), passed to OCR_COMMAND, and the per-page
#     texts are concatenated into the output file. Unless
#     OCR_ONLY_FIRST_LAST_PAGES is "false", only the first and last pages
#     given by its "<first>,<last>" value are processed.
#   - any other image/*: OCR_COMMAND is run directly on the file
#   - anything else: unsupported, returns 4
# Note that the two page-rendering helpers are defined as regular (global)
# functions the first time ocr_file runs.
ocr_file() {
  local if="$1" of="$2" mimetype="$3"
  local ocr_first_pages="${OCR_ONLY_FIRST_LAST_PAGES%,*}" ocr_last_pages="${OCR_ONLY_FIRST_LAST_PAGES##*,}"
  local num_pages page_convert_cmd

  # Both helpers take: input document, output image path, page number.
  convert_pdf_page() {
    gs -dSAFER -q -r300 -dFirstPage="$3" -dLastPage="$3" -dNOPAUSE -dINTERPOLATE -sDEVICE=png16m -sOutputFile="$2" "$1" -c quit
  }
  convert_djvu_page() {
    ddjvu -page="$3" -format=tif "$1" "$2"
  }

  case "$mimetype" in
    application/pdf)
      num_pages=$(pdfinfo "$if" | sed -n -E 's/^Pages:\s+([0-9]+)/\1/p')
      page_convert_cmd=convert_pdf_page
      ;;
    image/vnd.djvu*)
      num_pages=$( djvused -e "n" "$if")
      page_convert_cmd=convert_djvu_page
      ;;
    image/*)
      # A plain image is OCR-ed in one go. Return right away: falling through
      # to the page loop below would truncate the output that was just
      # written (the loop redirects to "$of") and would read the unset
      # num_pages.
      "$OCR_COMMAND" "$if" "$of"
      return
      ;;
    *) decho "Unsupported mimetype '$mimetype'!"; return 4 ;;
  esac

  decho "Running OCR on file '$if' $num_pages pages and with mimetype '$mimetype'..."
  local page=1 tmp_file tmp_file_txt
  while (( page <= num_pages )); do
    # The page-range arithmetic is only evaluated when page limiting is on.
    if [[ "$OCR_ONLY_FIRST_LAST_PAGES" == false ]] ||
      (( page <= ${ocr_first_pages:-0} )) ||
      (( page > num_pages - ${ocr_last_pages:-0} ));
    then
      tmp_file=$(mktemp)
      tmp_file_txt=$(mktemp --suffix='.txt')
      decho "Running OCR of page $page, using tmp files '$tmp_file' and '$tmp_file_txt' ..."
      "$page_convert_cmd" "$if" "$tmp_file" "$page"
      "$OCR_COMMAND" "$tmp_file" "$tmp_file_txt"
      cat "$tmp_file_txt"
      decho "Cleaning up tmp files '$tmp_file' and '$tmp_file_txt'"
      rm "$tmp_file" "$tmp_file_txt"
    fi
    page=$(( page + 1))
  done > "$of"
}
# Extracts the given file as an archive into a temporary folder and searches
# every extracted file for ISBNs with search_file_for_isbns().
#
# Arguments: the path to the archive file
# Output: the found ISBNs on stdout, one per line (duplicates possible)
# Returns: 1 when 7z could not extract the file, i.e. it is probably not an
# archive; the temporary folder is removed in that case.
# The extracted files are processed in `sort -z` order (plus FILE_SORT_FLAGS)
# and each one is deleted right after it has been searched.
get_all_isbns_from_archive() {
  local file_path="$1" isbns tmpdir file_to_check
  tmpdir="$(mktemp -d)"

  decho "Trying to decompress '$file_path' into tmp folder '$tmpdir' and recursively scan the contents"
  if ! 7z x -o"$tmpdir" "$file_path" 2>&1 | debug_prefixer "[7zx] " 0 --width=80 -s; then
    decho "Error extracting the file (probably not an archive)! Removing tmp dir..."
    rm -rf "$tmpdir"
    return 1
  fi

  decho "Archive extracted successfully in '$tmpdir', scanning contents recursively..."
  while IFS= read -r -d '' file_to_check; do
    #decho "Searching '$file_to_check' for ISBNs..."
    # The debug prefix is the path relative to the tmp folder; "$tmpdir" is
    # quoted inside the expansion so that it is stripped literally and never
    # interpreted as a glob pattern.
    isbns="$(search_file_for_isbns "$file_to_check" 2> >(debug_prefixer "[${file_to_check#"$tmpdir"}] " "${DEBUG_PREFIX_LENGTH:-40}") )"
    if [[ "$isbns" != "" ]]; then
      decho "Found ISBNs $isbns!"
      echo "$isbns" | tr "$ISBN_RET_SEPARATOR" '\n'
    fi
    decho "Removing '$file_to_check'..."
    rm "$file_to_check"
  done < <(find "$tmpdir" -type f -print0 | sort -z ${FILE_SORT_FLAGS[@]:+"${FILE_SORT_FLAGS[@]}"})

  decho "Removing temporary folder '$tmpdir' (should be empty)..."
  find "$tmpdir" -type d -empty -delete
}
# Tries to find ISBN numbers in the given ebook file by using progressively
# more "expensive" tactics. If at some point ISBN numbers are found, they
# are echoed to stdout and the function returns.
# These are the steps:
# - Check the supplied file name for ISBNs (the path is ignored)
# - If the MIME type of the file matches ISBN_DIRECT_GREP_FILES, search
# the file contents directly for ISBNs
# - If the MIME type matches ISBN_IGNORED_FILES, the function returns
# early with no results
# - Check the file metadata from calibre's `ebook-meta` for ISBNs
# - Try to extract the file as an archive with `7z`; if successful,
# recursively call search_file_for_isbns for all the extracted files
# - If the file is not an archive, try to convert it to a .txt file
# via convert_to_txt()
# - If OCR is enabled and convert_to_txt() fails or its result is empty,
# try OCR-ing the file. If the result is non-empty but does not contain
# ISBNs and OCR_ENABLED is set to "always", run OCR as well.
#
# Arguments: the path of the file to search
# Output: the found ISBNs on stdout, joined with ISBN_RET_SEPARATOR and
# without a trailing newline; nothing when no ISBN was found. All progress
# information goes through decho()/debug_prefixer().
search_file_for_isbns() {
local file_path="$1" isbns
decho "Searching file '$file_path' for ISBN numbers..."
# Step 1: the file name itself (cheapest check).
isbns="$(basename "$file_path" | find_isbns)"
if [[ "$isbns" != "" ]]; then
decho "Extracted ISBNs '$isbns' from the file name!"
echo -n "$isbns"
return
fi
local mimetype
mimetype="$(file --brief --mime-type "$file_path")"
decho "Ebook MIME type: $mimetype"
# Step 2: plain-text-like files are searched directly; whatever the result,
# no further (more expensive) steps are tried for them.
if [[ "$mimetype" =~ $ISBN_DIRECT_GREP_FILES ]]; then
# TODO: maybe decode entities in HTML/XML files since ISBNs can
# be split up by character entities or other escaped characters;
# recode/perl/php can be used for this
decho "Ebook is in text format, trying to find ISBN directly"
isbns="$(cat_file_for_isbn_grep "$file_path" | find_isbns)"
if [[ "$isbns" != "" ]]; then
decho "Extracted ISBNs '$isbns' from the text file contents!"
echo -n "$isbns"
else
decho "Did not find any ISBNs"
fi
return
elif [[ "$mimetype" =~ $ISBN_IGNORED_FILES ]]; then
decho "The file type in the blacklist, ignoring..."
return
fi
# Step 3: metadata as reported by calibre's ebook-meta.
local ebookmeta
ebookmeta="$(ebook-meta "$file_path")"
decho "Ebook metadata:"
echo "$ebookmeta" | debug_prefixer " " 0 --width=80 -t
isbns="$(echo "$ebookmeta" | find_isbns)"
if [[ "$isbns" != "" ]]; then
decho "Extracted ISBNs '$isbns' from calibre ebook metadata!"
echo -n "$isbns"
return
fi
# Step 4: treat the file as an archive. When the pipeline succeeds, the
# (possibly empty) result is final and the conversion steps below are skipped;
# a failed extraction makes get_all_isbns_from_archive return 1, which fails
# the pipeline under `pipefail`.
if isbns="$(get_all_isbns_from_archive "$file_path" | uniq_no_sort | stream_concat "$ISBN_RET_SEPARATOR")"; then
decho "Extracted ISBNs '$isbns' from the archive file"
echo -n "$isbns"
return
fi
# Step 5: convert to text and search the result. try_ocr records whether the
# OCR step below should be attempted.
local tmptxtfile try_ocr=false
tmptxtfile="$(mktemp --suffix='.txt')"
decho "Converting ebook to text format in file '$tmptxtfile'..."
if convert_to_txt "$file_path" "$tmptxtfile" "$mimetype" 2>&1 | debug_prefixer "[ebook2txt] " 0 --width=80 -s; then
decho "Conversion to text was successful, checking the result..."
# A conversion result without a single alphanumeric character counts as empty.
if ! grep -qiE "[[:alnum:]]+" "$tmptxtfile"; then
decho "The converted txt with size $(stat -c '%s' "$tmptxtfile") bytes does not seem to contain text:"
#xxd -a $tmptxtfile | head | debug_prefixer "[cat tmp-txt] "
try_ocr=true
else
isbns="$(cat_file_for_isbn_grep "$tmptxtfile" | find_isbns)"
if [[ "$isbns" != "" ]]; then
decho "Text output contains ISBNs '$isbns'!"
elif [[ "$OCR_ENABLED" == "always" ]]; then
decho "We will try OCR because the successfully converted text did not have any ISBNs"
try_ocr=true
else
decho "Did not find any ISBNs and will NOT try OCR"
fi
fi
else
decho "There was an error converting the book to txt format"
try_ocr=true
fi
# Step 6: OCR, only when nothing was found so far, OCR is not disabled and one
# of the branches above asked for it. The OCR text overwrites the tmp file.
if [[ "$isbns" == "" && "$OCR_ENABLED" != false && "$try_ocr" == true ]]; then
decho "Trying to run OCR on the file..."
if ocr_file "$file_path" "$tmptxtfile" "$mimetype" 2>&1 | debug_prefixer "[ocr] " 0 --width=80 -t; then
decho "OCR was successful, checking the result..."
isbns="$(cat_file_for_isbn_grep "$tmptxtfile" | find_isbns)"
if [[ "$isbns" != "" ]]; then
decho "OCR output contains ISBNs '$isbns'!"
else
decho "Did not find any ISBNs in the OCR output"
fi
else
decho "There was an error while running OCR!"
fi
fi
decho "Removing '$tmptxtfile'..."
rm "$tmptxtfile"
if [[ "$isbns" != "" ]]; then
decho "Returning the found ISBNs '$isbns'!"
echo -n "$isbns"
else
decho "Could not find any ISBNs in '$file_path' :("
fi
}
# Moves a file to a new path, or symlinks it there when SYMLINK_ONLY is
# "true", creating the destination folder first if it does not exist.
#
# Arguments: current path, new path
# Unless DRY_RUN is exactly "false", nothing is created, linked or moved and
# only the debug messages are emitted. The move never overwrites an existing
# destination (mv --no-clobber).
move_or_link_file() {
  local current_path="$1" new_path="$2" new_folder="${2%/*}"

  # DRY_RUN is compared as a string and never executed as a command; this is
  # the same "anything but false" test that guards the operations below.
  if [[ "$DRY_RUN" != "false" ]]; then
    decho "(DRY RUN! All operations except metadata deletion are skipped!)"
  fi

  if [[ ! -d "$new_folder" ]]; then
    decho "Creating folder '$new_folder'"
    if [[ "$DRY_RUN" == "false" ]]; then
      mkdir -p "$new_folder"
    fi
  fi

  if [[ "$SYMLINK_ONLY" == true ]]; then
    decho "Symlinking file '$current_path' to '$new_path'..."
    if [[ "$DRY_RUN" == "false" ]]; then
      # Link to the resolved absolute path of the source file.
      ln -s "$(realpath "$current_path")" "$new_path"
    fi
  else
    decho "Moving file '$current_path' to '$new_path'..."
    if [[ "$DRY_RUN" == "false" ]]; then
      mv --no-clobber "$current_path" "$new_path"
    fi
  fi
}
# Builds a new file name for an ebook from its metadata file and
# OUTPUT_FILENAME_TEMPLATE, moves or links the ebook there and then either
# removes the metadata file or moves it next to the ebook.
#
# Arguments: new_folder, current_ebook_path, current_metadata_path
# Output: the new path of the ebook on stdout, without a trailing newline
move_or_link_ebook_file_and_metadata() {
local new_folder="$1" current_ebook_path="$2" current_metadata_path="$3" line
# `d` holds the values available to the file name template.
declare -A d=( ["EXT"]="${current_ebook_path##*.}" ) # metadata and the file extension
while IFS='' read -r line || [[ -n "$line" ]]; do
#TODO: fix this properly
# Key: the text before the first ':' with trailing blanks removed, spaces
# turned into underscores, every other non-alphanumeric character dropped and
# the result upper-cased (e.g. a line starting with "Author(s)" gives AUTHORS).
# Value: the text after the first ": ", with backslash, slash, * ? < > |,
# control characters, double quote, dollar sign and backtick replaced by '_',
# cut to at most 100 characters. The replaced characters include the ones that
# are special inside the double-quoted template evaluated by `eval` below.
d["$(echo "${line%%:*}" | sed -e 's/[ \t]*$//' -e 's/ /_/g' -e 's/[^a-zA-Z0-9_]//g' -e 's/\(.*\)/\U\1/')"]="$(echo "${line#*: }" | sed -e 's/[\\/\*\?<>\|\x01-\x1F\x7F\x22\x24\x60]/_/g' | cut -c 1-100 )"
done < "$current_metadata_path"
decho "Variables that will be used for the new filename construction:"
local key
for key in "${!d[@]}"; do
echo "${d[${key}]}" | debug_prefixer " ${key}" 25
done
local new_name
# The template is a double-quoted string stored in single quotes; eval expands
# its ${d[...]} references against the array filled above.
new_name="$(eval echo "$OUTPUT_FILENAME_TEMPLATE")"
decho "The new file name of the book file/link '$current_ebook_path' will be: '$new_name'"
local new_path
# Strip one trailing slash from the folder and avoid clashing with existing files.
new_path="$(unique_filename "${new_folder%/}" "$new_name")"
echo -n "$new_path"
move_or_link_file "$current_ebook_path" "$new_path"
if [[ "$KEEP_METADATA" != true ]]; then
decho "Removing metadata file '$current_metadata_path'..."
rm "$current_metadata_path"
else
decho "Moving metadata file '$current_metadata_path' to '${new_path}.${OUTPUT_METADATA_EXTENSION}'..."
# The metadata file is deleted in a dry run, even though the ebook itself is
# left untouched.
if [[ "$DRY_RUN" != true ]]; then
mv --no-clobber "$current_metadata_path" "${new_path}.${OUTPUT_METADATA_EXTENSION}"
else
rm "$current_metadata_path"
fi
fi
}
# Uses Calibre's fetch-ebook-metadata CLI tool to download metadata from
# online sources.
#
# Arguments:
#   $1: debug prefix used for the stderr output of fetch-ebook-metadata
#   $2: comma-separated list of allowed plugins; each non-empty entry becomes
#       one --allowed-plugin option, an empty list adds no such options
#   [...]: passed directly to fetch-ebook-metadata
# Output: only the "Key : value" lines of the tool's stdout.
# Returns: non-zero when the tool fails or prints no such line (the final
# grep selects nothing).
fetch_metadata() {
  local isbn_sources=()
  IFS=, read -ra isbn_sources <<< "$2"

  local isbn_source="" args=()
  for isbn_source in "${isbn_sources[@]:-}"; do
    # Skip empty entries (empty list, "A,,B", trailing comma) so that no
    # empty argument is ever handed to fetch-ebook-metadata.
    [[ -n "$isbn_source" ]] || continue
    args+=("--allowed-plugin=$isbn_source")
  done

  decho "Calling fetch-ebook-metadata --verbose" "${args[*]:-}" "${@:3}"
  fetch-ebook-metadata --verbose ${args[@]:+"${args[@]}"} "${@:3}" 2> >(debug_prefixer "[$1] " 0 --width=100 -s) | grep -E '[a-zA-Z()]+ +: .*'
}