ADDED DOWNLOAD SCRIPT,SQLITE VERSION_FOR_LINUX & XLSX_MERGER

tapish13031997 · Apr 30, 2017 · f8e6110 · f8e6110
1 parent 88d0d13
commit f8e6110
Show file tree

Hide file tree

Showing 65 changed files with 3,099 additions and 0 deletions.
diff --git a/Linux_sqlite_version/Parser.py b/Linux_sqlite_version/Parser.py
@@ -0,0 +1,65 @@
+#using tag for fields in pdf for which information is to be extracted and value for information 
+from lxml import html
+import requests
+import os
+from urllib import url2pathname
+import need
+import universal
+import convert
+import logwriter
+import extractor
+def reopen(filename):                           #open the html file for parasing
+  requests_session = need.requests.session()
+  requests_session.mount('file://', need.LocalFileAdapter())
+  url = "file:///"+os.getcwd().replace(" ","%20").replace("\\","/")
+  page = requests_session.get(url+"/"+universal.tag_folder+"/"+filename)   #file name
+  universal.tree = html.fromstring(page.content)
+#def extractor(index,tag) :
+# data from html file--> abcdaaa
+# tag         ---------> axabcydaa
+#Approach A1
+#What we know is that the tag is complete and that a subsequence of the tag will be the data from the html file. But s.get_matching_blocks() returns the longest common subsequence, which will be wrong. Consider:
+#Approach A2
+# data from html file--> international_total_publication
+# tag         ---------> international publication 
+#These will match even though they are two different tags, so we can't use this approach.
+#My approach:
+#Remove all the whitespace from universal.datastring, then use approach A1 to extract the tags from universal.datastring.
+def begin():      #return 1 if string is not present
+  universal.datastring=""
+  reopen(universal.filename+universal.filename+".html") #html-tag filename converted from pdf
+  #page = requests_session.get('file:///home/killerbee/Desktop/test2/'+filename)   #file name
+  #universal.tree = html.fromstring(page.content)
+  s = universal.tree.itertext()
+#  universal.test=["(21) Application No","Date of filing of Application","Publication Date","Title of the invention","International classification","Priority Document","Priority Date","Name of priority country","International Application","Fil","International Publication","Patent of Addition to Application","Fil","Divisional to Application","Fil","Name of Applicant","(72)Name of Inventor","Abstract"]
+  for a in s:
+    universal.datastring += a
+  try:
+    return(extractor.getdetails(universal.datastring))
+  except Exception as e:
+    logwriter.logwrite("Extracter: "+str(e)+" on page "+str(int(universal.filename)+1))
+    universal.logflag = 1
+    return -1
+  return 0
+#  write code for case when tayal returns -1 and you have to run your extraction function 
+#  implement ur extraction function and then call it 
+
+
+
+
+
+
+
+
+
+  #extractor.getdetails(universal.datastring)
+#  for tag in universal.test:
+#    tempi=i
+#   # i=extractor(i,tag)
+#    if i==-1:
+#      if(extractor.mycheck(universal.datastring)==0):
+#        fappend=open("log.txt",'a')
+#        fappend.write("-->"+str(universal.filename)+"->"+tag+"--->"+universal.datastring[tempi:tempi+len(tag)]+'\n')
+#        fappend.close()
+#        return -1
+#    i+=1 
diff --git a/Linux_sqlite_version/Parser.pyc b/Linux_sqlite_version/Parser.pyc
diff --git a/Linux_sqlite_version/browser.py b/Linux_sqlite_version/browser.py
@@ -0,0 +1,6 @@
+import Tkinter,tkFileDialog
+def browse():
+  root = Tkinter.Tk()
+  root.withdraw()
+  filez = tkFileDialog.askopenfilenames(parent=root,title='Choose a file',filetypes = (("pdf files","*.pdf"),("all files","*.*")))
+  return root.tk.splitlist(filez)
diff --git a/Linux_sqlite_version/browser.pyc b/Linux_sqlite_version/browser.pyc
diff --git a/Linux_sqlite_version/controller.py b/Linux_sqlite_version/controller.py
@@ -0,0 +1,46 @@
+import main
+import universal
+import browser
+import sqlitewriter
+from shutil import copyfile
+import os
+#import test
+universal.init()
+sqlitewriter.init()
+files=browser.browse()
+for _file in files :
+  #main.run_command("cp "+str(_file)+" "+universal.current_dir)
+  src=str(_file)
+  universal.filename=""
+  temp=len(_file)-1
+  while _file[temp]!="/":
+    universal.filename=_file[temp]+universal.filename
+    temp-=1
+  tempfile = "copy"+universal.filename
+  dst=str(universal.current_dir+'/'+str("copy"+universal.filename))
+  copyfile(src,dst)
+  universal.logfile = universal.filename.replace('.pdf','') #as univeral.filename changes in main   
+  sqlitewriter.createconnection()
+  main.initial()
+  sqlitewriter.closeconnection()
+
+  if(universal.logflag==0):
+    os.remove(universal.logfile+".txt")
+#  else:
+#   test.init(tempfile)
+  os.remove(tempfile)
+  os.remove(_file)
+  #main.run_command("rm "+universal.logfile)
+#year=input("year\n")
+#s=main.run_command("ls "+str(year),1).split("\n")
+#fappend.close() 
+#for x in s:
+#  universal.filename=x
+#  main.run_command("mv "+str(year)+"/"+str(x)+" "+universal.current_dir)
+#  main.initial()
+#  main.run_command("mv "+universal.current_dir+"/"+str(x)+" "+str(year))
+
+#fappend=open("log.txt",'a')   
+#fappend.write("\n********"+"\n"+str(year)+"\n*************\n\n\n")
+#fappend.close()   
+#i=input("Filename\n")
diff --git a/Linux_sqlite_version/convert.py b/Linux_sqlite_version/convert.py
@@ -0,0 +1,15 @@
+import commands
+import universal
+import logwriter
+def run_command(string):
+  if commands.getstatusoutput(string)[0]!=0:
+     logwriter.logwrite("ERROR IN Commands.getstatusoutput "+string)
+def convert():
+    #  run_command("pdf2txt.py -t html -Y exact "+"-o "+universal.filename+".html "+universal.current_dir+"/"+universal.year+"/"+universal.filename+".pdf")
+  run_command("pdf2txt.py -t tag -Y exact "+"-o "+universal.current_dir+"/"+universal.tag_folder+"/"+universal.filename+universal.filename+".html "+universal.current_dir +"/"+ universal.pdf_folder+"/"+universal.filename+".pdf")
+#universal.init();
+#for i in range(8,622):
+#  #print(i)
+#  universal.filename=str(i);
+#  convert() #for initializing conversion of files
+
diff --git a/Linux_sqlite_version/convert.pyc b/Linux_sqlite_version/convert.pyc
diff --git a/Linux_sqlite_version/excelwriter.py b/Linux_sqlite_version/excelwriter.py
@@ -0,0 +1,97 @@
+import xlsxwriter
+import universal
+import logwriter
+def init():       #for initializing the xlsx file
+  universal.workbook = xlsxwriter.Workbook(universal.filename.replace(".pdf","")+".xlsx")
+  universal.worksheet = universal.workbook.add_worksheet()
+  headformat = universal.workbook.add_format()
+  headformat.set_bold()
+  headformat.set_text_wrap()
+  universal.worksheet.set_row(0, 60)
+  universal.worksheet.set_column(0,3,11)
+  universal.worksheet.set_column(4,4,30)
+  universal.worksheet.set_column(5,5,20)
+  universal.worksheet.set_column(6,6,15)
+  universal.worksheet.set_column(7,8,7)
+  universal.worksheet.set_column(9,9,12)
+  universal.worksheet.set_column(10,10,13)
+  universal.worksheet.set_column(11,12,9)
+  universal.worksheet.set_column(13,13,15)
+  universal.worksheet.set_column(14,14,12)
+  universal.worksheet.set_column(15,17,14)
+  universal.worksheet.set_column(18,18,11)
+  universal.worksheet.set_column(19,19,14)
+
+  universal.worksheet.write('A1',"Application No.",headformat)
+  universal.worksheet.write('B1',"Date of filling of Application",headformat)
+  universal.worksheet.write('C1',"Publication Date",headformat)
+  universal.worksheet.write('D1',"Name of Applicant",headformat)
+  universal.worksheet.write('E1',"Title of Invention",headformat)
+  universal.worksheet.write('F1',"Name of Inventor(s)",headformat)
+  universal.worksheet.write('G1',"Abstract",headformat)
+  universal.worksheet.write('H1',"No. of pages",headformat)
+  universal.worksheet.write('I1',"No. of claims",headformat)
+  universal.worksheet.write('J1',"International classification",headformat)
+  universal.worksheet.write('K1',"Priority Document No.",headformat)
+  universal.worksheet.write('L1',"Priority Date",headformat)
+  universal.worksheet.write('M1',"Name of priority country",headformat)
+  universal.worksheet.write('N1',"International Application No.",headformat)
+  universal.worksheet.write('O1',"International Application Filling Date",headformat)
+  universal.worksheet.write('P1',"International Publication No.",headformat)
+  universal.worksheet.write('Q1',"Patent of addition to Application No.",headformat)
+  universal.worksheet.write('R1',"Patent of addition to Application No. Filling Date",headformat)
+  universal.worksheet.write('S1',"Divisional to Application No.",headformat)
+  universal.worksheet.write('T1',"Divisional to Application No. Filling Date",headformat)
+  universal.row = 1
+  universal.date_format = universal.workbook.add_format({'num_format':'dd mm yyyy'})
+  #universal.workbook.close()  
+
+
+#inside for loop
+def loop() :
+ try:
+  universal.worksheet.write(universal.row, 0, universal.data["Application No."])
+  universal.worksheet.write(universal.row, 1, universal.data["Date of filing of Application"], universal.date_format)
+  universal.worksheet.write(universal.row, 2, universal.data["Publication Date"], universal.date_format)
+  universal.worksheet.write(universal.row, 3, universal.data["Name of Applicant"])
+  universal.worksheet.write(universal.row, 4, universal.data["Title of the invention"])
+  universal.worksheet.write(universal.row, 5, universal.data["Name of Inventor"])
+  universal.worksheet.write(universal.row, 6, universal.data["Abstract"])
+  if(universal.data["No. of Pages"].upper()!="NA"):
+   universal.worksheet.write(universal.row, 7, int(universal.data["No. of Pages"]))
+  else:
+   universal.worksheet.write(universal.row, 7, universal.data["No. of Pages"].upper()) 
+  if(universal.data["No. of Claims"].upper()!="NA"):
+   universal.worksheet.write(universal.row, 8, int(universal.data["No. of Claims"]))
+  else:
+   universal.worksheet.write(universal.row, 8, universal.data["No. of Claims"].upper()) 
+  universal.worksheet.write(universal.row, 9, universal.data["International classification"])
+  universal.worksheet.write(universal.row, 10, universal.data["Priority Document No"])
+  if(universal.data["Priority Date"] == "NA"):
+      universal.worksheet.write(universal.row, 11, universal.data["Priority Date"])
+  else:
+      universal.worksheet.write(universal.row, 11, universal.data["Priority Date"],universal.date_format)
+  universal.worksheet.write(universal.row, 12, universal.data["Name of priority country"])
+  universal.worksheet.write(universal.row, 13, universal.data["International Application No"])
+  if(universal.data["IAFiling Date"] == "NA"):
+      universal.worksheet.write(universal.row, 14, universal.data["IAFiling Date"])
+  else:
+      universal.worksheet.write(universal.row, 14, universal.data["IAFiling Date"],universal.date_format)
+  universal.worksheet.write(universal.row, 15, universal.data["International Publication No"])
+  universal.worksheet.write(universal.row, 16, universal.data["Patent of Addition to Application Number"])
+  if(universal.data["IBFiling Date"] == "NA"):
+      universal.worksheet.write(universal.row, 17, universal.data["IBFiling Date"])
+  else:
+      universal.worksheet.write(universal.row, 17, universal.data["IBFiling Date"],universal.date_format)
+  universal.worksheet.write(universal.row, 18, universal.data["Divisional to Application Number"])
+  if(universal.data["ICFiling Date"] == "NA"):
+      universal.worksheet.write(universal.row, 19, universal.data["ICFiling Date"])
+  else:
+      universal.worksheet.write(universal.row, 19, universal.data["ICFiling Date"],universal.date_format)
+
+  universal.row = universal.row + 1
+ except Exception as e:
+   universal.logflag=1
+   logwriter.logwrite("Excelfile : "+str(e)+" on page "+str(int(universal.filename)+1))
+
+
diff --git a/Linux_sqlite_version/excelwriter.pyc b/Linux_sqlite_version/excelwriter.pyc