Skip to content

Commit

Permalink
ADDED DOWNLOAD SCRIPT,SQLITE VERSION_FOR_LINUX & XLSX_MERGER
Browse files Browse the repository at this point in the history
  • Loading branch information
tapish13031997 committed Apr 30, 2017
1 parent 88d0d13 commit f8e6110
Show file tree
Hide file tree
Showing 65 changed files with 3,099 additions and 0 deletions.
65 changes: 65 additions & 0 deletions Linux_sqlite_version/Parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#using tag for fields in pdf for which information is to be extracted and value for information
from lxml import html
import requests
import os
from urllib import url2pathname
import need
import universal
import convert
import logwriter
import extractor
def reopen(filename):
    """Parse an HTML file from the tag folder into ``universal.tree``.

    The file is fetched through a ``requests`` session on which
    ``need.LocalFileAdapter`` is mounted for the ``file://`` scheme. The URL
    is built from the current working directory, ``universal.tag_folder``
    and *filename*.

    Args:
        filename: Name of the file inside ``universal.tag_folder``.
    """
    # Put the working directory into URL form: escape spaces and use
    # forward slashes even if the OS reports backslashes.
    cwd_as_url = os.getcwd().replace(" ", "%20").replace("\\", "/")
    target = "file:///" + cwd_as_url + "/" + universal.tag_folder + "/" + filename

    # NOTE(review): LocalFileAdapter is presumably a transport adapter that
    # serves local files -- confirm in need.py.
    session = need.requests.session()
    session.mount('file://', need.LocalFileAdapter())
    response = session.get(target)
    universal.tree = html.fromstring(response.content)
#def extractor(index,tag) :
# Example:
#   data from html file --> abcdaaa
#   tag                 --> axabcydaa
# Approach A1:
#   We know the tag is complete, and the data from the html file will be a subsequence of the tag.
#   But s.get_matching_blocks() returns the longest common subsequence, which can be wrong. Consider:
# Approach A2:
#   data from html file --> international_total_publication
#   tag                 --> international publication
#   These would match even though they are two different tags, so we can't use this approach.
# My approach:
#   Remove all the whitespace from universal.datastring, then use approach A1 to extract the tags from universal.datastring.
def begin():
    """Gather the page text and run the field extractor on it.

    Re-parses the tag file for ``universal.filename`` through ``reopen``,
    concatenates every text node of ``universal.tree`` into
    ``universal.datastring`` and hands that string to
    ``extractor.getdetails``.

    Returns:
        Whatever ``extractor.getdetails`` returns, or ``-1`` if it raises.
        In the failure case the error is logged through
        ``logwriter.logwrite`` and ``universal.logflag`` is set to ``1``.
    """
    universal.datastring = ""
    # The tag file is named <filename><filename>.html (the name is doubled).
    reopen(universal.filename + universal.filename + ".html")
    # Legacy tag list, kept for reference:
    # universal.test=["(21) Application No","Date of filing of Application","Publication Date","Title of the invention","International classification","Priority Document","Priority Date","Name of priority country","International Application","Fil","International Publication","Patent of Addition to Application","Fil","Divisional to Application","Fil","Name of Applicant","(72)Name of Inventor","Abstract"]
    # Build the text in one pass; repeated += on a string is quadratic.
    universal.datastring = "".join(universal.tree.itertext())
    try:
        return extractor.getdetails(universal.datastring)
    except Exception as e:
        # NOTE(review): int(universal.filename) assumes the filename is a
        # numeric page index at this point -- confirm against main.
        logwriter.logwrite("Extracter: " + str(e) + " on page " + str(int(universal.filename) + 1))
        universal.logflag = 1
        return -1
# TODO: write code for the case when tayal returns -1 and you have to run your own extraction function
# TODO: implement your extraction function and then call it









#extractor.getdetails(universal.datastring)
# for tag in universal.test:
# tempi=i
# # i=extractor(i,tag)
# if i==-1:
# if(extractor.mycheck(universal.datastring)==0):
# fappend=open("log.txt",'a')
# fappend.write("-->"+str(universal.filename)+"->"+tag+"--->"+universal.datastring[tempi:tempi+len(tag)]+'\n')
# fappend.close()
# return -1
# i+=1
Binary file added Linux_sqlite_version/Parser.pyc
Binary file not shown.
6 changes: 6 additions & 0 deletions Linux_sqlite_version/browser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import Tkinter,tkFileDialog
def browse():
    """Show a multi-select file-open dialog and return the chosen paths.

    A Tk root window is created and withdrawn before the dialog opens. The
    dialog offers a "pdf files" (``*.pdf``) filter and an "all files"
    filter.

    Returns:
        The dialog result split into individual entries by
        ``root.tk.splitlist``.
    """
    window = Tkinter.Tk()
    window.withdraw()
    accepted_types = (("pdf files", "*.pdf"), ("all files", "*.*"))
    selection = tkFileDialog.askopenfilenames(
        parent=window,
        title='Choose a file',
        filetypes=accepted_types,
    )
    return window.tk.splitlist(selection)
Binary file added Linux_sqlite_version/browser.pyc
Binary file not shown.
46 changes: 46 additions & 0 deletions Linux_sqlite_version/controller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import main
import universal
import browser
import sqlitewriter
from shutil import copyfile
import os
#import test
# Driver script: let the user pick PDF files, then for each one copy it into
# universal.current_dir, run main.initial() inside an sqlite connection, and
# clean up the copy, the log (when nothing was flagged) and the source file.
universal.init()
sqlitewriter.init()
files = browser.browse()
for _file in files:
    #main.run_command("cp "+str(_file)+" "+universal.current_dir)
    src = str(_file)
    # os.path.basename replaces a hand-rolled backwards scan for "/", which
    # raised IndexError when the path contained no "/" at all.
    universal.filename = os.path.basename(_file)
    tempfile = "copy" + universal.filename
    dst = str(universal.current_dir + '/' + tempfile)
    copyfile(src, dst)
    universal.logfile = universal.filename.replace('.pdf', '')  # as universal.filename changes in main
    sqlitewriter.createconnection()
    main.initial()
    sqlitewriter.closeconnection()

    # logflag == 0 means no error was flagged, so the log file is removed.
    if universal.logflag == 0:
        os.remove(universal.logfile + ".txt")
    # else:
    #     test.init(tempfile)
    # The working copy is removed by its relative name, i.e. relative to the
    # current working directory.
    os.remove(tempfile)
    # WARNING: this deletes the PDF the user originally selected.
    os.remove(_file)
#main.run_command("rm "+universal.logfile)
#year=input("year\n")
#s=main.run_command("ls "+str(year),1).split("\n")
#fappend.close()
#for x in s:
# universal.filename=x
# main.run_command("mv "+str(year)+"/"+str(x)+" "+universal.current_dir)
# main.initial()
# main.run_command("mv "+universal.current_dir+"/"+str(x)+" "+str(year))

#fappend=open("log.txt",'a')
#fappend.write("\n********"+"\n"+str(year)+"\n*************\n\n\n")
#fappend.close()
#i=input("Filename\n")
15 changes: 15 additions & 0 deletions Linux_sqlite_version/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import commands
import universal
import logwriter
def run_command(string):
    """Run a shell command and log a message if its exit status is non-zero.

    The command output is discarded and nothing is returned.

    Args:
        string: The command line handed to ``commands.getstatusoutput``.
    """
    exit_status = commands.getstatusoutput(string)[0]
    if exit_status == 0:
        return
    logwriter.logwrite("ERROR IN Commands.getstatusoutput " + string)
def convert():
    """Run ``pdf2txt.py`` on the current PDF to produce its tag file.

    Input:  ``<current_dir>/<pdf_folder>/<filename>.pdf``
    Output: ``<current_dir>/<tag_folder>/<filename><filename>.html``

    All path parts come from the ``universal`` module. The command is run
    through ``run_command``.
    """
    # Earlier variant that produced plain HTML output:
    # run_command("pdf2txt.py -t html -Y exact "+"-o "+universal.filename+".html "+universal.current_dir+"/"+universal.year+"/"+universal.filename+".pdf")
    base_dir = universal.current_dir
    name = universal.filename
    # The output name intentionally repeats the filename twice.
    tag_path = base_dir + "/" + universal.tag_folder + "/" + name + name + ".html"
    pdf_path = base_dir + "/" + universal.pdf_folder + "/" + name + ".pdf"
    run_command("pdf2txt.py -t tag -Y exact -o " + tag_path + " " + pdf_path)
#universal.init();
#for i in range(8,622):
# #print(i)
# universal.filename=str(i);
# convert() #for initializing conversion of files

Binary file added Linux_sqlite_version/convert.pyc
Binary file not shown.
97 changes: 97 additions & 0 deletions Linux_sqlite_version/excelwriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import xlsxwriter
import universal
import logwriter
def init():
    """Create the .xlsx workbook for the current file and write its header row.

    The workbook is named after ``universal.filename`` with ``.pdf`` removed
    and ``.xlsx`` appended. The workbook, worksheet, next data row (``1``)
    and date cell format are stored on the ``universal`` module. The
    workbook is not closed here.
    """
    book = xlsxwriter.Workbook(universal.filename.replace(".pdf", "") + ".xlsx")
    universal.workbook = book
    sheet = book.add_worksheet()
    universal.worksheet = sheet

    # Header cells are bold with wrapped text.
    header_style = book.add_format()
    header_style.set_bold()
    header_style.set_text_wrap()

    # Row 0 is the header row and gets a height of 60.
    sheet.set_row(0, 60)

    # (first column, last column, width)
    column_widths = (
        (0, 3, 11),
        (4, 4, 30),
        (5, 5, 20),
        (6, 6, 15),
        (7, 8, 7),
        (9, 9, 12),
        (10, 10, 13),
        (11, 12, 9),
        (13, 13, 15),
        (14, 14, 12),
        (15, 17, 14),
        (18, 18, 11),
        (19, 19, 14),
    )
    for first_col, last_col, width in column_widths:
        sheet.set_column(first_col, last_col, width)

    # (cell, header text)
    header_cells = (
        ('A1', "Application No."),
        ('B1', "Date of filling of Application"),
        ('C1', "Publication Date"),
        ('D1', "Name of Applicant"),
        ('E1', "Title of Invention"),
        ('F1', "Name of Inventor(s)"),
        ('G1', "Abstract"),
        ('H1', "No. of pages"),
        ('I1', "No. of claims"),
        ('J1', "International classification"),
        ('K1', "Priority Document No."),
        ('L1', "Priority Date"),
        ('M1', "Name of priority country"),
        ('N1', "International Application No."),
        ('O1', "International Application Filling Date"),
        ('P1', "International Publication No."),
        ('Q1', "Patent of addition to Application No."),
        ('R1', "Patent of addition to Application No. Filling Date"),
        ('S1', "Divisional to Application No."),
        ('T1', "Divisional to Application No. Filling Date"),
    )
    for cell, title in header_cells:
        sheet.write(cell, title, header_style)

    # Data rows start right below the header.
    universal.row = 1
    universal.date_format = book.add_format({'num_format': 'dd mm yyyy'})
    #universal.workbook.close()


#inside for loop
def loop():
    """Write the record in ``universal.data`` as one worksheet row.

    The row index is ``universal.row``, which is incremented only after
    every column has been written. If any exception is raised,
    ``universal.logflag`` is set to ``1``, the error is logged through
    ``logwriter.logwrite`` and the row index is left unchanged.
    """
    # How a column's value is written:
    #   TEXT       - as-is
    #   DATE       - always with universal.date_format
    #   DATE_OR_NA - as-is when the value is "NA", otherwise with the date format
    #   COUNT      - as int, unless the value upper-cases to "NA"
    TEXT, DATE, DATE_OR_NA, COUNT = range(4)

    # Position in this tuple is the worksheet column index.
    columns = (
        ("Application No.", TEXT),
        ("Date of filing of Application", DATE),
        ("Publication Date", DATE),
        ("Name of Applicant", TEXT),
        ("Title of the invention", TEXT),
        ("Name of Inventor", TEXT),
        ("Abstract", TEXT),
        ("No. of Pages", COUNT),
        ("No. of Claims", COUNT),
        ("International classification", TEXT),
        ("Priority Document No", TEXT),
        ("Priority Date", DATE_OR_NA),
        ("Name of priority country", TEXT),
        ("International Application No", TEXT),
        ("IAFiling Date", DATE_OR_NA),
        ("International Publication No", TEXT),
        ("Patent of Addition to Application Number", TEXT),
        ("IBFiling Date", DATE_OR_NA),
        ("Divisional to Application Number", TEXT),
        ("ICFiling Date", DATE_OR_NA),
    )

    try:
        for col, (key, kind) in enumerate(columns):
            value = universal.data[key]
            if kind == DATE or (kind == DATE_OR_NA and value != "NA"):
                universal.worksheet.write(universal.row, col, value, universal.date_format)
            elif kind == COUNT:
                upper = value.upper()
                if upper != "NA":
                    universal.worksheet.write(universal.row, col, int(value))
                else:
                    universal.worksheet.write(universal.row, col, upper)
            else:
                universal.worksheet.write(universal.row, col, value)

        universal.row = universal.row + 1
    except Exception as e:
        universal.logflag = 1
        logwriter.logwrite("Excelfile : " + str(e) + " on page " + str(int(universal.filename) + 1))


Binary file added Linux_sqlite_version/excelwriter.pyc
Binary file not shown.
Loading

0 comments on commit f8e6110

Please sign in to comment.