ADDED MODULE FOR EXTRACTION

tapish13031997 · Jan 20, 2017 · 00bc774 · 00bc774
1 parent dca3cf5
commit 00bc774
Show file tree

Hide file tree

Showing 21 changed files with 391 additions and 0 deletions.
diff --git a/SRS.docx b/SRS.docx
diff --git a/SRS4.0.doc b/SRS4.0.doc
diff --git a/SRSExample-webapp.doc b/SRSExample-webapp.doc
diff --git a/SRTTemplate (copy).docx b/SRTTemplate (copy).docx
diff --git a/SRTTemplate.docx b/SRTTemplate.docx
diff --git a/module/convert.py b/module/convert.py
@@ -0,0 +1,17 @@
+import commands
+import universal
+
+def run_command(string):
+  """Run `string` through the shell and fail loudly when it does not succeed.
+
+  string: the complete shell command line to execute.
+  Raises NameError (the exception type this module already used) when the
+  command reports a non-zero status.
+  """
+  # getstatusoutput() reports a wait()-style status, so a command exiting with
+  # code 1 shows up as 256; comparing against 1 let every failure slip through.
+  if commands.getstatusoutput(string)[0]!=0:
+     raise NameError("ERROR IN Commands.getstatusoutput "+string)
+def convert():
+  """Convert the current page's PDF into pdf2txt "tag" output.
+
+  Reads  <current_dir>/<pdf_folder>/<filename>.pdf and writes
+  <current_dir>/<tag_folder>/<filename><filename>.html; the doubled file name
+  is the one file1.begin() opens afterwards.
+  """
+  base=universal.current_dir+"/"
+  source=base+universal.pdf_folder+"/"+universal.filename+".pdf"
+  target=base+universal.tag_folder+"/"+universal.filename+universal.filename+".html"
+  run_command("pdf2txt.py -t tag -Y exact -o "+target+" "+source)
+def remove():
+  """Delete the tag-HTML file that convert() writes for the current page."""
+  folder=universal.current_dir+"/"+universal.tag_folder
+  generated=universal.filename+universal.filename+".html"
+  run_command("rm "+folder+"/"+generated)
+#universal.init();
+#for i in range(8,622):
+#  #print(i)
+#  universal.filename=str(i);
+#  convert() #for initializing conversion of files
+
diff --git a/module/convert.pyc b/module/convert.pyc
diff --git a/module/excelwriter.py b/module/excelwriter.py
@@ -0,0 +1,88 @@
+import xlsxwriter
+import universal
+
+def init():       #for initializing the xlsx file
+  """Create <universal.filename>.xlsx and write its header row.
+
+  Stores the workbook, its single worksheet, the date cell format and the next
+  free row index (1, directly below the header) on the `universal` module.
+  The workbook is left open; nothing is closed here.
+  """
+  book = xlsxwriter.Workbook(universal.filename+".xlsx")
+  universal.workbook = book
+  sheet = book.add_worksheet()
+  universal.worksheet = sheet
+
+  headformat = book.add_format()
+  headformat.set_bold()
+  headformat.set_text_wrap()
+  sheet.set_row(0, 60)
+
+  # (first column, last column, width) for every group of columns
+  widths = (
+    (0,3,11), (4,4,30), (5,5,20), (6,6,15), (7,8,7), (9,9,12), (10,10,13),
+    (11,12,9), (13,13,15), (14,14,12), (15,17,14), (18,18,11), (19,19,14),
+  )
+  for first, last, width in widths:
+    sheet.set_column(first, last, width)
+
+  # Header texts for columns A..T, in column order
+  headings = (
+    "Application No.",
+    "Date of filling of Application",
+    "Publication Date",
+    "Name of Applicant",
+    "Title of Invention",
+    "Name of Inventor(s)",
+    "Abstract",
+    "No. of pages",
+    "No. of claims",
+    "International classification",
+    "Priority Document No.",
+    "Priority Date",
+    "Name of priority country",
+    "International Application No.",
+    "International Application Filling Date",
+    "International Publication No.",
+    "Patent of addition to Application No.",
+    "Patent of addition to Application No. Filling Date",
+    "Divisional to Application No.",
+    "Divisional to Application No. Filling Date",
+  )
+  for column, heading in enumerate(headings):
+    sheet.write(0, column, heading, headformat)   # row 0 is the header row
+
+  universal.row = 1
+  universal.date_format = book.add_format({'num_format':'dd mm yyyy'})
+
+
+#inside for loop
+def loop() :
+  """Write the fields of universal.data as one worksheet row.
+
+  The row index is universal.row, which is advanced by one afterwards.
+  A missing key in universal.data raises KeyError.
+  """
+  sheet=universal.worksheet
+  record=universal.data
+  line=universal.row
+  # These two are always written with the date format.
+  always_dated=("Date of filing of Application","Publication Date")
+  # These get the date format unless the value is exactly "NA".
+  dated_unless_na=("Priority Date","IAFiling Date","IBFiling Date","ICFiling Date")
+  # Keys of universal.data in worksheet column order (column 0 first).
+  columns=(
+    "Application No.",
+    "Date of filing of Application",
+    "Publication Date",
+    "Name of Applicant",
+    "Title of the invention",
+    "Name of Inventor",
+    "Abstract",
+    "No. of Pages",
+    "No. of Claims",
+    "International classification",
+    "Priority Document No",
+    "Priority Date",
+    "Name of priority country",
+    "International Application No",
+    "IAFiling Date",
+    "International Publication No",
+    "Patent of Addition to Application Number",
+    "IBFiling Date",
+    "Divisional to Application Number",
+    "ICFiling Date",
+  )
+  for column, key in enumerate(columns):
+    value=record[key]
+    if key in always_dated or (key in dated_unless_na and value != "NA"):
+      sheet.write(line, column, value, universal.date_format)
+    else:
+      sheet.write(line, column, value)
+
+  universal.row = line + 1
+
+
+
diff --git a/module/excelwriter.pyc b/module/excelwriter.pyc
diff --git a/module/file1.py b/module/file1.py
@@ -0,0 +1,122 @@
+#using tag for fields in pdf for which information is to be extracted and value for information 
+
+from lxml import html
+import requests
+import os
+from urllib import url2pathname
+import need
+import universal
+import file2
+import convert
+def reopen(filename):                           #open the html file for parsing
+  """Parse a tag-HTML file and store the resulting tree in universal.tree.
+
+  filename: name of the file inside <current_dir>/<tag_folder>.
+  """
+  requests_session = need.requests.session()
+  requests_session.mount('file://', need.LocalFileAdapter())
+  # Read from the directory convert() writes to. The URL used to be hard-coded
+  # to one developer's home directory, so it only worked on that machine.
+  # current_dir is an absolute path, so this yields a file:///... URL.
+  page = requests_session.get('file://'+universal.current_dir+"/"+universal.tag_folder+"/"+filename)   #file name
+  universal.tree = html.fromstring(page.content)
+
+
+def transform(tvalue,tremove): #remove tremove from tvalue and return string after tremove
+  """Return the part of tvalue that follows the first occurrence of tremove.
+
+  Example: tvalue is the text of the "Application No." field and tremove is
+  "Application No."; the result is whatever comes after that label.
+  When tremove does not occur in tvalue, tvalue is returned unchanged.
+  """
+  x=tvalue.find(tremove)
+  if x==-1:
+    # find() signals "absent" with -1; adding len(tremove) to it used to cut
+    # an arbitrary prefix off the string.
+    return tvalue
+  return tvalue[x+len(tremove):]
+
+
+def extract_multi_lines(tag,path):  #for tags with multiple lines
+    """Store under universal.data[tag] the text that follows "<tag> :".
+
+    All results of the XPath expression `path` are concatenated first, so a
+    value spread over several text nodes ends up as one string.
+    """
+    joined="".join(universal.tree.xpath(path))
+    universal.data[tag]=transform(joined,tag+" :")
+def extract(path,tag):          #return the text that follows tag in the nodes selected by path
+    """Return the text after `tag` in the first XPath result that contains it.
+
+    path: XPath expression evaluated against universal.tree.
+    tag:  label to look for inside each result string.
+    Returns "" when the expression selects nothing or no result contains tag.
+    """
+    for value in universal.tree.xpath(path) :
+      if value.find(tag) != -1 :
+        return transform(value,tag)
+    # An empty result used to leave `value` unbound, and a result without the
+    # tag used to return a slice of an unrelated string.
+    return ""
+
+#def extract_claims_pages(path):         #extract the value for No of Claims tag and No of pages tag
+#    temp=extract(path,"No. of Pages : ")
+#    pos=temp.find("No. of Pages : ")
+#    y=len("No. of Pages : ");
+#    temp[pos+y:]
+#    y=0
+#    tans=0
+#    while temp[y].isdigit() :
+#      tans*=10
+#      tans+=int(temp[y])  
+#      y=y+1
+#    universal.data["No. of Pages"]=tans
+#    tans=0 
+#    temp=temp[y:]
+#    pos=temp.find("No. of Claims : ")
+#    y=len("No. of Claims : ");
+#    temp=temp[pos+y:]
+#    y=0
+#    while temp[y].isdigit() and y<len(temp) :
+#      tans*=10
+#      tans+=int(temp[y])  
+#      y=y+1
+#    universal.data["No. of Claims"]=tans 
+
+
+
+#def extract_names(path):     #for extracting information from name column
+#    test=universal.tree.xpath(path)
+#    x=0
+#    while test[x].find("Name of Applicant : ")==-1:
+#      #print(test[x])
+#      x+=1
+#    x+=1
+#    tlist=[]
+#    while test[x].find("Name of Inventor")==-1:
+#      tlist.append(test[x])
+#      x+=1
+#    tlist=tlist[0:-1]  #for removing (*number*) Before Name of Inventor
+#    universal.data["Name of Applicant"]=tlist
+#    tlist=[]
+#    while x<len(test):
+#      tlist.append(test[x])
+#      x+=1
+#    universal.data["Name of Inventor"]=",".join(tlist)   
+
+def locate(string, x="/html/body/page/p[",y="]/text()"):     #for locating xpath of column containing string
+  """Return the XPath x+<index>+y of the first index whose results contain string.
+
+  string: text to search for.
+  x, y:   the parts of the XPath expression before and after the index.
+  When nothing matches, "<filename>-><string>" is appended to log.txt and the
+  expression for index 10 is returned as a fallback.
+  """
+  #x="/html/body/div["
+  #y="]/span/text()"
+
+  last=100000
+  if x.endswith("["):
+    # No index can exceed the number of nodes x selects without its predicate,
+    # so a miss no longer costs 100000 XPath evaluations.
+    # NOTE(review): assumes x minus its trailing "[" is itself a valid XPath.
+    last=min(last,int(universal.tree.xpath("count("+x[:-1]+")")))
+  for i in range(1,last+1):    # XPath positions start at 1; index 0 selects nothing
+    candidate=x+str(i)+y
+    for a in universal.tree.xpath(candidate):
+      if a.find(string)!=-1 :
+        return candidate
+  with open("log.txt",'a') as fappend:   # closed even if the write fails
+    fappend.write(universal.filename+"->"+string+'\n')
+  return  x+str(10)+y
+
+def begin():
+  """Parse the current page's tag-HTML file and fill universal.data.
+
+  Single-line fields are read with extract(), multi-line fields with
+  extract_multi_lines(); the remaining fields are filled in by
+  file2.extract_final_coloum().
+  """
+  reopen(universal.filename+universal.filename+".html") #html-tag filename converted from pdf
+  # (key in universal.data, label searched for in the page)
+  single_line=(
+    ("Application No.","Application No."),
+    ("Date of filing of Application","Date of filing of Application :"),
+    ("Publication Date","Publication Date : "),
+    ("No. of Pages","No. of Pages :"),
+    ("No. of Claims","No. of Claims :"),
+  )
+  for key, marker in single_line:
+    universal.data[key]=extract(locate(marker),marker)
+  for tag in ("Title of the invention","Name of Applicant","Name of Inventor","Abstract"):
+    extract_multi_lines(tag,locate(tag))
+  file2.extract_final_coloum()
+##  for z in universal.data :
+##    print(z+":"+str(universal.data[z])+"\n")
+#  #convert.remove()  
+#    #print("\n")
+
diff --git a/module/file1.pyc b/module/file1.pyc
diff --git a/module/file2.py b/module/file2.py
@@ -0,0 +1,55 @@
+#This file converts the last column (the one on the left with all the patent details).
+#This is done in 3 steps: first I find the start of the column, then I build a string (temp) from the text in that column,
+#and then I extract from temp the information corresponding to each tag (the tags are stored in a list -> target).
+#To extract the information for a tag I locate ":" and take the text after it, up to the next ":" or the end of the string.
+import universal #using universal.data and universal.tree from universal.py
+def limit(s):     #function for finding the start of the column
+  """Return 0 when a string in s contains "(51) International classif", else 1.
+
+  The result is used as a loop condition: 1 means "keep looking".
+  """
+  marker="(51) International classif"
+  return 0 if any(marker in x for x in s) else 1
+def cal(s):       #for counting how many ":" there are
+  """Return the total number of ":" characters in the strings of s."""
+  return sum(y.count(":") for y in s)
+def extract_final_coloum():
+  """Fill universal.data with the fields listed in `target`.
+
+  Steps:
+    1. walk the paragraphs /html/body/page/p[i] until limit() reports the one
+       holding the "(51) International classif" marker;
+    2. from there on, concatenate the text of every paragraph that contains a
+       ":" until at least 10 colons have been collected;
+    3. the text after the 1st, 2nd, ... colon becomes the value of the 1st,
+       2nd, ... entry of `target`.
+  A target for which no text was found is set to "NA", the placeholder
+  excelwriter.loop() already checks for, instead of keeping whatever the
+  previous page stored there.
+  """
+  x="/html/body/page/p["
+  y="]/text()"
+  target=["International classification","Priority Document No","Priority Date","Name of priority country","International Application No","IAFiling Date","International Publication No","Patent of Addition to Application Number","IBFiling Date","Divisional to Application Number","ICFiling Date"]
+  # Highest index any paragraph can have. Both scans stop there; without this
+  # bound a page lacking the marker or enough colons looped forever.
+  last=int(universal.tree.xpath("count(/html/body/page/p)"))
+  i=1
+  s=universal.tree.xpath(x+str(i)+y)
+  while limit(s) and i<=last:
+    i+=1
+    s=universal.tree.xpath(x+str(i)+y)
+  cnt=0
+  temp=""
+  while cnt<10 and i<=last:
+    found=cal(s)
+    if found>0:
+      cnt+=found
+      temp+="".join(s)
+    i+=1
+    s=universal.tree.xpath(x+str(i)+y)
+  # Everything before the first ":" is a label, not a value. Extra segments
+  # beyond len(target) are ignored; they used to raise IndexError because the
+  # old guard tested i>len(target) rather than i>=len(target).
+  values=temp.split(":")[1:]
+  for k,name in enumerate(target):
+    if k<len(values):
+      universal.data[name]=values[k]
+    else:
+      universal.data[name]="NA"
+
diff --git a/module/file2.pyc b/module/file2.pyc
diff --git a/module/main.py b/module/main.py
@@ -0,0 +1,24 @@
+import file1
+import convert
+import universal
+import commands
+import excelwriter
+def run_command(string):
+  """Run `string` through the shell and fail loudly when it does not succeed.
+
+  string: the complete shell command line to execute.
+  Raises NameError (the exception type this script already used) when the
+  command reports a non-zero status.
+  """
+  # getstatusoutput() reports a wait()-style status, so a command exiting with
+  # code 1 shows up as 256; comparing against 1 let every failure slip through.
+  if commands.getstatusoutput(string)[0]!=0:
+     raise NameError("ERROR IN Commands.getstatusoutput "+string)
+# Script body: burst the PDF into single pages, convert and parse every page
+# from page 8 on, write one spreadsheet row per page, then clean up.
+import os  # used below to check that a burst page file exists
+universal.init()  #for initializing global variables
+try:
+  read_line=raw_input   # Python 2: plain input() would eval() whatever is typed
+except NameError:
+  read_line=input       # Python 3: input() already returns the raw text
+universal.filename=str(read_line("Enter filename(without extension)"))
+excelwriter.init()
+# -p: a folder left over from an earlier, interrupted run is not an error
+run_command("mkdir -p "+universal.pdf_folder)
+run_command("mkdir -p "+universal.tag_folder)
+run_command("pdftk "+universal.filename+".pdf burst output "+universal.current_dir+"/"+universal.pdf_folder+"/%d.pdf")
+try:
+  for i in range(8,500):
+    # Stop at the first page the burst did not produce, so documents with
+    # fewer than 499 pages end normally instead of crashing.
+    if not os.path.isfile(os.path.join(universal.current_dir,universal.pdf_folder,str(i)+".pdf")):
+      break
+    print(i)
+    universal.filename=str(i);
+    convert.convert() #for initializing conversion of files
+    file1.begin()
+    excelwriter.loop()
+finally:
+  # Close the workbook and remove the work folders even when a page fails,
+  # so the rows gathered so far are not lost.
+  universal.workbook.close()
+  run_command("rm -r "+universal.pdf_folder)
+  run_command("rm -r "+universal.tag_folder)
+
diff --git a/module/need.py b/module/need.py
@@ -0,0 +1,56 @@
+import requests
+import os
+from urllib import url2pathname
+
+class LocalFileAdapter(requests.adapters.BaseAdapter):
+    """Protocol Adapter to allow Requests to GET file:// URLs
+
+    @todo: Properly handle non-empty hostname portions.
+    """
+
+    @staticmethod
+    def _chkpath(method, path):
+        """Return an HTTP status for the given filesystem path.
+
+        The result is a (status code, reason) tuple: 501 for PUT/DELETE, 405
+        for any other method than GET/HEAD, 400 for a directory, 404 for a
+        missing file, 403 for an unreadable one, 200 otherwise.
+        """
+        if method.lower() in ('put', 'delete'):
+            return 501, "Not Implemented"  # TODO
+        elif method.lower() not in ('get', 'head'):
+            return 405, "Method Not Allowed"
+        elif os.path.isdir(path):
+            return 400, "Path Not A File"
+        elif not os.path.isfile(path):
+            return 404, "File Not Found"
+        elif not os.access(path, os.R_OK):
+            return 403, "Access Denied"
+        else:
+            return 200, "OK"
+
+    def send(self, req, **kwargs):  # pylint: disable=unused-argument
+        """Return the file specified by the given request
+
+        @type req: C{PreparedRequest}
+        @todo: Should I bother filling `response.headers` and processing
+               If-Modified-Since and friends using `os.stat`?
+        """
+        path = os.path.normcase(os.path.normpath(url2pathname(req.path_url)))
+        response = requests.Response()
+
+        response.status_code, response.reason = self._chkpath(req.method, path)
+        if response.status_code == 200 and req.method.lower() != 'head':
+            try:
+                response.raw = open(path, 'rb')
+            # "as" works on Python 2.6+ and 3; the comma form is a syntax
+            # error on Python 3.
+            except (OSError, IOError) as err:
+                response.status_code = 500
+                response.reason = str(err)
+
+        if isinstance(req.url, bytes):
+            response.url = req.url.decode('utf-8')
+        else:
+            response.url = req.url
+
+        response.request = req
+        response.connection = self
+
+        return response
+
+    def close(self):
+        """Nothing to release; the adapter holds no resources of its own."""
+        pass
diff --git a/module/need.pyc b/module/need.pyc
diff --git a/module/test.py b/module/test.py
@@ -0,0 +1,7 @@
+import commands
+import universal
+def convert():
+  """Convert <current_dir>/<filename>/<filename>.pdf with pdf2txt.py, twice.
+
+  The first run writes plain HTML to <filename>.html, the second writes "tag"
+  output to <filename><filename>.html. Raises NameError when either command
+  reports a non-zero status.
+  """
+  # getstatusoutput() reports a wait()-style status (exit code 1 shows up as
+  # 256), so failures are detected with != 0 rather than == 1.
+  if commands.getstatusoutput("pdf2txt.py -t html "+"-o "+universal.filename+".html "+universal.current_dir+"/"+universal.filename+"/"+universal.filename+".pdf")[0]!=0:
+    raise NameError("ERROR IN Commands.getstatusoutput for file "+universal.filename)
+  if commands.getstatusoutput("pdf2txt.py -t tag "+"-o "+universal.filename+universal.filename+".html "+universal.current_dir+"/"+universal.filename+"/"+universal.filename+".pdf")[0]!=0:
+    raise NameError("ERROR IN Commands.getstatusoutput for file "+universal.filename+universal.filename)
diff --git a/module/universal.py b/module/universal.py
@@ -0,0 +1,19 @@
+#file containing global variables
+import commands
+def init():
+  """Create the module-level variables shared by the other modules.
+
+  Assigns data, filename, current_dir, pdf_folder and tag_folder. The names
+  tree, workbook, worksheet, date_format and row are only declared here; they
+  receive their values when other modules assign them on this module.
+  """
+  import os  # local import: the working directory no longer needs a shell
+  global data
+  data={}
+  global tree
+  global filename #filename of pdf file containing patents
+  filename="15"
+  global current_dir
+  # Platform-independent replacement for running "pwd" in a shell.
+  current_dir=os.getcwd()
+  global pdf_folder  #name of folder containing pdf burst files
+  pdf_folder="3"
+  global tag_folder #name of folder containing tag-html file
+  tag_folder="tag_folder"
+  global workbook
+  global worksheet
+  global date_format
+  global row #row counter
+
diff --git a/module/universal.pyc b/module/universal.pyc
diff --git a/module/xlsxtest.py b/module/xlsxtest.py
@@ -0,0 +1,3 @@
+import xlsxwriter
+# Scratch script: create 15.xlsx with a single empty worksheet.
+workbook = xlsxwriter.Workbook("15.xlsx")
+# The statement used to stop at "workbook.", which is a syntax error; it is
+# completed with the same call excelwriter.init() makes.
+worksheet = workbook.add_worksheet()
+workbook.close()  # XlsxWriter writes the file when the workbook is closed
diff --git a/srs_template2.doc b/srs_template2.doc