-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ADDED DOWNLOAD SCRIPT,SQLITE VERSION_FOR_LINUX & XLSX_MERGER
- Loading branch information
1 parent
88d0d13
commit f8e6110
Showing
65 changed files
with
3,099 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
#using tag for fields in pdf for which information is to be extracted and value for information | ||
from lxml import html | ||
import requests | ||
import os | ||
from urllib import url2pathname | ||
import need | ||
import universal | ||
import convert | ||
import logwriter | ||
import extractor | ||
def reopen(filename):
    """Load an HTML file from the tag folder and parse it into universal.tree.

    The file is fetched through a requests session with
    need.LocalFileAdapter mounted on the ``file://`` scheme, using a URL
    built from the current working directory, ``universal.tag_folder`` and
    ``filename``. The parsed lxml tree is stored in ``universal.tree``;
    nothing is returned.
    """
    session = need.requests.session()
    session.mount('file://', need.LocalFileAdapter())
    # Make the working directory URL-friendly: escape spaces and use
    # forward slashes even when the path contains backslashes.
    cwd_part = os.getcwd().replace(" ", "%20").replace("\\", "/")
    target = "/".join(["file:///" + cwd_part, universal.tag_folder, filename])
    response = session.get(target)
    universal.tree = html.fromstring(response.content)
#def extractor(index,tag) : | ||
# data from html file--> abcdaaa | ||
# tag ---------> axabcydaa | ||
# Approach A1: | ||
# We know that the tag is complete and that the data from the HTML file will be a subsequence of the tag. However, s.get_matching_blocks() returns the longest common subsequence, which can be the wrong match. Consider: | ||
# Approach A2: | ||
# data from html file--> international_total_publication | ||
# tag ---------> international publication | ||
# These would match even though they are two different tags, so we cannot use this approach. | ||
# My approach: | ||
# Remove all whitespace from universal.datastring, and then use approach A1 to extract the tags from universal.datastring. | ||
def begin():
    """Parse the current page's tag-HTML and run the field extractor on it.

    Resets ``universal.datastring``, loads
    ``<universal.filename><universal.filename>.html`` (the tag file
    converted from the PDF) via ``reopen``, concatenates every text node
    of ``universal.tree`` into ``universal.datastring`` and passes that
    string to ``extractor.getdetails``.

    Returns:
        Whatever ``extractor.getdetails`` returns, or -1 if it raises. In
        the failure case the error is written through ``logwriter`` and
        ``universal.logflag`` is set to 1.
    """
    universal.datastring = ""
    reopen(universal.filename + universal.filename + ".html")  # html-tag filename converted from pdf
    # Reference list of the tags expected in the text (kept from the original):
    # universal.test=["(21) Application No","Date of filing of Application","Publication Date","Title of the invention","International classification","Priority Document","Priority Date","Name of priority country","International Application","Fil","International Publication","Patent of Addition to Application","Fil","Divisional to Application","Fil","Name of Applicant","(72)Name of Inventor","Abstract"]
    # Join once instead of growing the string piece by piece.
    universal.datastring = "".join(universal.tree.itertext())
    try:
        return extractor.getdetails(universal.datastring)
    except Exception as e:
        # Best-effort boundary: record the failure and signal it to the caller.
        logwriter.logwrite("Extracter: " + str(e) + " on page " + str(int(universal.filename) + 1))
        universal.logflag = 1
        return -1
# write code for case when tayal returns -1 and you have to run your extraction function | ||
# implement ur extraction function and then call it | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
#extractor.getdetails(universal.datastring) | ||
# for tag in universal.test: | ||
# tempi=i | ||
# # i=extractor(i,tag) | ||
# if i==-1: | ||
# if(extractor.mycheck(universal.datastring)==0): | ||
# fappend=open("log.txt",'a') | ||
# fappend.write("-->"+str(universal.filename)+"->"+tag+"--->"+universal.datastring[tempi:tempi+len(tag)]+'\n') | ||
# fappend.close() | ||
# return -1 | ||
# i+=1 |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
import Tkinter,tkFileDialog | ||
def browse():
    """Show a multi-select "open file" dialog and return the chosen paths.

    A Tk root window is created only to host the dialog and is withdrawn so
    that no empty main window appears. The dialog offers a "pdf files"
    filter and an "all files" filter.

    Returns:
        The selected paths as split by ``root.tk.splitlist`` (an empty
        sequence when the dialog is cancelled).
    """
    root = Tkinter.Tk()
    root.withdraw()
    try:
        filez = tkFileDialog.askopenfilenames(
            parent=root,
            title='Choose a file',
            filetypes=(("pdf files", "*.pdf"), ("all files", "*.*")),
        )
        # splitlist needs the live interpreter, so call it before destroy().
        return root.tk.splitlist(filez)
    finally:
        # Release the hidden root window instead of leaking it.
        root.destroy()
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import main | ||
import universal | ||
import browser | ||
import sqlitewriter | ||
from shutil import copyfile | ||
import os | ||
#import test | ||
# Batch driver: let the user pick PDF files, then process each one through
# main.initial() with a SQLite connection opened around the run.
universal.init()
sqlitewriter.init()
files = browser.browse()
for _file in files:
    #main.run_command("cp "+str(_file)+" "+universal.current_dir)
    src = str(_file)
    # Base name = everything after the last "/". rsplit also copes with a
    # path that has no "/" at all, where the old character-by-character
    # scan ran off the start of the string and raised IndexError.
    universal.filename = _file.rsplit("/", 1)[-1]
    tempfile = "copy" + universal.filename
    dst = str(universal.current_dir + '/' + tempfile)
    copyfile(src, dst)
    universal.logfile = universal.filename.replace('.pdf', '')  # as universal.filename changes in main
    sqlitewriter.createconnection()
    main.initial()
    sqlitewriter.closeconnection()

    # No error was flagged, so the per-file log is not needed.
    if universal.logflag == 0:
        os.remove(universal.logfile + ".txt")
    # Clean up the working copy (relative to the current directory) ...
    os.remove(tempfile)
    # ... and NOTE: this also deletes the file the user originally selected.
    os.remove(_file)
#main.run_command("rm "+universal.logfile) | ||
#year=input("year\n") | ||
#s=main.run_command("ls "+str(year),1).split("\n") | ||
#fappend.close() | ||
#for x in s: | ||
# universal.filename=x | ||
# main.run_command("mv "+str(year)+"/"+str(x)+" "+universal.current_dir) | ||
# main.initial() | ||
# main.run_command("mv "+universal.current_dir+"/"+str(x)+" "+str(year)) | ||
|
||
#fappend=open("log.txt",'a') | ||
#fappend.write("\n********"+"\n"+str(year)+"\n*************\n\n\n") | ||
#fappend.close() | ||
#i=input("Filename\n") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import commands | ||
import universal | ||
import logwriter | ||
def run_command(string):
    """Run ``string`` via commands.getstatusoutput; log if the status is non-zero.

    The command output is discarded and nothing is returned.
    """
    status = commands.getstatusoutput(string)[0]
    if status == 0:
        return
    logwriter.logwrite("ERROR IN Commands.getstatusoutput " + string)
def convert():
    """Convert the current PDF page to a tag file with pdf2txt.py.

    Reads ``<current_dir>/<pdf_folder>/<filename>.pdf`` and writes
    ``<current_dir>/<tag_folder>/<filename><filename>.html``, where all
    components come from the ``universal`` module. The command is executed
    through ``run_command``, which logs a non-zero exit status.
    """
    # Local import keeps the module's import block untouched and works on
    # both Python 3 (shlex.quote) and Python 2 (pipes.quote).
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    # Earlier invocation kept for reference:
    # run_command("pdf2txt.py -t html -Y exact "+"-o "+universal.filename+".html "+universal.current_dir+"/"+universal.year+"/"+universal.filename+".pdf")
    output_path = (universal.current_dir + "/" + universal.tag_folder + "/"
                   + universal.filename + universal.filename + ".html")
    input_path = (universal.current_dir + "/" + universal.pdf_folder + "/"
                  + universal.filename + ".pdf")
    # Quote both paths so directories or names containing spaces or shell
    # metacharacters reach pdf2txt.py as single arguments; paths made only
    # of safe characters are passed through unchanged.
    run_command("pdf2txt.py -t tag -Y exact -o " + quote(output_path) + " " + quote(input_path))
#universal.init(); | ||
#for i in range(8,622): | ||
# #print(i) | ||
# universal.filename=str(i); | ||
# convert() #for initializing conversion of files | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import xlsxwriter | ||
import universal | ||
import logwriter | ||
def init():
    """Create the output workbook and write the formatted header row.

    The workbook is named after ``universal.filename`` with ".pdf" removed
    and ".xlsx" appended. The workbook, its single worksheet, the next row
    index (1, directly below the header) and the date cell format are all
    published on the ``universal`` module for later use.
    """
    book = xlsxwriter.Workbook(universal.filename.replace(".pdf", "") + ".xlsx")
    universal.workbook = book
    sheet = book.add_worksheet()
    universal.worksheet = sheet

    # Header cells are bold and wrap their text.
    headformat = book.add_format()
    headformat.set_bold()
    headformat.set_text_wrap()

    sheet.set_row(0, 60)

    # (first column, last column, width)
    column_widths = (
        (0, 3, 11),
        (4, 4, 30),
        (5, 5, 20),
        (6, 6, 15),
        (7, 8, 7),
        (9, 9, 12),
        (10, 10, 13),
        (11, 12, 9),
        (13, 13, 15),
        (14, 14, 12),
        (15, 17, 14),
        (18, 18, 11),
        (19, 19, 14),
    )
    for first, last, width in column_widths:
        sheet.set_column(first, last, width)

    # Header titles for cells A1 .. T1, in column order.
    titles = (
        "Application No.",
        "Date of filling of Application",
        "Publication Date",
        "Name of Applicant",
        "Title of Invention",
        "Name of Inventor(s)",
        "Abstract",
        "No. of pages",
        "No. of claims",
        "International classification",
        "Priority Document No.",
        "Priority Date",
        "Name of priority country",
        "International Application No.",
        "International Application Filling Date",
        "International Publication No.",
        "Patent of addition to Application No.",
        "Patent of addition to Application No. Filling Date",
        "Divisional to Application No.",
        "Divisional to Application No. Filling Date",
    )
    for letter, title in zip("ABCDEFGHIJKLMNOPQRST", titles):
        sheet.write(letter + "1", title, headformat)

    universal.row = 1
    universal.date_format = book.add_format({'num_format': 'dd mm yyyy'})
#universal.workbook.close() | ||
|
||
|
||
#inside for loop | ||
def loop():
    """Write the record in ``universal.data`` as one worksheet row.

    Columns 0-19 are filled in order at row ``universal.row``, which is
    then advanced by one. Any exception sets ``universal.logflag`` to 1 and
    is logged through ``logwriter``; in that case the row index is not
    advanced.
    """
    # One entry per column, in column order: (key in universal.data, kind).
    #   "text"       - written as-is
    #   "date"       - always written with universal.date_format
    #   "date_or_na" - written as-is when equal to "NA", else as a date
    #   "count"      - written as int unless it is "NA" (case-insensitive),
    #                  in which case the upper-cased text is written
    layout = (
        ("Application No.", "text"),
        ("Date of filing of Application", "date"),
        ("Publication Date", "date"),
        ("Name of Applicant", "text"),
        ("Title of the invention", "text"),
        ("Name of Inventor", "text"),
        ("Abstract", "text"),
        ("No. of Pages", "count"),
        ("No. of Claims", "count"),
        ("International classification", "text"),
        ("Priority Document No", "text"),
        ("Priority Date", "date_or_na"),
        ("Name of priority country", "text"),
        ("International Application No", "text"),
        ("IAFiling Date", "date_or_na"),
        ("International Publication No", "text"),
        ("Patent of Addition to Application Number", "text"),
        ("IBFiling Date", "date_or_na"),
        ("Divisional to Application Number", "text"),
        ("ICFiling Date", "date_or_na"),
    )
    try:
        for column, (key, kind) in enumerate(layout):
            if kind == "count":
                if universal.data[key].upper() != "NA":
                    universal.worksheet.write(universal.row, column, int(universal.data[key]))
                else:
                    universal.worksheet.write(universal.row, column, universal.data[key].upper())
            elif kind == "date" or (kind == "date_or_na" and universal.data[key] != "NA"):
                universal.worksheet.write(universal.row, column, universal.data[key], universal.date_format)
            else:
                universal.worksheet.write(universal.row, column, universal.data[key])

        universal.row = universal.row + 1
    except Exception as e:
        universal.logflag = 1
        logwriter.logwrite("Excelfile : " + str(e) + " on page " + str(int(universal.filename) + 1))
|
||
|
Binary file not shown.
Oops, something went wrong.