-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdftocsv.rb
38 lines (31 loc) · 1006 Bytes
/
pdftocsv.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
require 'tabula'
require 'fileutils'
# Root directory that is scanned for PDF files. The trailing slash is included
# because file paths are built from it by plain string concatenation.
pdf_file_path = 'raw_data/pdf/'
# Recursively walks +path+ and hands every matching PDF to convertPDF.
#
# A file is converted when it is a regular file whose name contains ".pdf"
# and does not contain "hb_eagle". Sub-directories are announced on stdout
# and then traversed the same way.
#
# @param path [String] directory to scan; a trailing "/" is optional
# @return [Array<String>] the entry names examined in +path+ (without "." and "..")
def traversePath(path)
  # Paths are built by concatenation (here and in convertPDF), so make sure
  # the directory always ends with a separator.
  dir = path.end_with?('/') ? path : path + '/'

  # Dir.entries gives no ordering guarantee, so "." and ".." have to be
  # dropped by name. Slicing off the first two items could skip real files
  # or recurse into "." forever.
  entries = Dir.entries(dir).reject { |entry| entry == '.' || entry == '..' }

  entries.each do |entry|
    full_path = dir + entry
    if File.directory?(full_path)
      puts "directory: " + entry
      traversePath(full_path + '/')
    elsif File.file?(full_path) && entry.include?(".pdf") && !entry.include?("hb_eagle")
      convertPDF(dir, entry)
    end
  end
end
# Extracts the tables of one PDF with Tabula and writes them to a CSV file.
#
# The output location mirrors the input: every "pdf" in +path+ is replaced
# with "csv" to get the target directory (created if missing), and every
# ".pdf" in +file+ is replaced with ".csv" to get the target file name.
# The CSV of each page's table is appended to that file in page order.
#
# @param path [String] directory containing the PDF, ending with "/"
# @param file [String] file name of the PDF inside +path+
# @return [nil]
# @raise whatever Tabula or the file system raises; both the extractor and
#   the output file are closed before the error propagates
def convertPDF(path, file)
  csv_path = path.gsub("pdf", "csv")
  out_file_name = file.gsub(".pdf", ".csv")
  FileUtils.mkdir_p(csv_path)
  puts "convertPDF input: " + path + file
  puts "convertPDF output: " + csv_path + out_file_name

  # Build the extractor before touching the output file so a PDF that cannot
  # be opened does not leave a truncated CSV behind.
  extractor = Tabula::Extraction::ObjectExtractor.new(path + file, :all)
  begin
    # Block form closes the file even when extraction raises midway.
    File.open(csv_path + out_file_name, 'w') do |out|
      extractor.extract.each do |pdf_page|
        puts "extracting file: " + file
        out << pdf_page.get_table.to_csv
      end
    end
  ensure
    extractor.close!
  end

  puts "writing file: " + out_file_name
end
# Script entry point: runs as soon as the file is loaded and starts the
# recursive walk at the directory held in pdf_file_path.
traversePath(pdf_file_path)