Skip to content

Commit

Permalink
ADDED MODULE FOR EXTRACTION
Browse files Browse the repository at this point in the history
  • Loading branch information
tapish13031997 committed Jan 20, 2017
1 parent dca3cf5 commit 00bc774
Show file tree
Hide file tree
Showing 21 changed files with 391 additions and 0 deletions.
Binary file added SRS.docx
Binary file not shown.
Binary file removed SRS4.0.doc
Binary file not shown.
Binary file removed SRSExample-webapp.doc
Binary file not shown.
Binary file removed SRTTemplate (copy).docx
Binary file not shown.
Binary file removed SRTTemplate.docx
Binary file not shown.
17 changes: 17 additions & 0 deletions module/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import commands
import universal

def run_command(string):
    """Run ``string`` through the shell and raise if the command fails.

    ``commands.getstatusoutput`` reports the status in ``wait()`` encoding, so
    a process exiting with code 1 is reported as 256, not 1. Any non-zero
    status is therefore treated as a failure.

    Raises:
        NameError: if the command's status is non-zero. The exception type is
            kept as-is so existing callers that catch it keep working.
    """
    if commands.getstatusoutput(string)[0] != 0:
        raise NameError("ERROR IN Commands.getstatusoutput "+string)
def convert():
    """Convert the current page PDF to tag-HTML with pdf2txt.py.

    Reads ``<current_dir>/<pdf_folder>/<filename>.pdf`` and writes
    ``<current_dir>/<tag_folder>/<filename><filename>.html``.
    """
    base_dir = universal.current_dir
    page = universal.filename
    output_html = "%s/%s/%s%s.html" % (base_dir, universal.tag_folder, page, page)
    input_pdf = "%s/%s/%s.pdf" % (base_dir, universal.pdf_folder, page)
    run_command("pdf2txt.py -t tag -Y exact -o %s %s" % (output_html, input_pdf))
def remove():
    """Delete the tag-HTML file generated for the current page."""
    html_name = universal.filename + universal.filename + ".html"
    html_path = "/".join([universal.current_dir, universal.tag_folder, html_name])
    run_command("rm " + html_path)
#universal.init();
#for i in range(8,622):
# #print(i)
# universal.filename=str(i);
# convert() #for initializing conversion of files

Binary file added module/convert.pyc
Binary file not shown.
88 changes: 88 additions & 0 deletions module/excelwriter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import xlsxwriter
import universal

def init():  # for initializing the xlsx file
    """Create ``<filename>.xlsx`` and write the bold, wrapped header row.

    Stores the workbook, worksheet, date format and the next free row index
    (1, directly below the header) on the ``universal`` module.
    """
    book = xlsxwriter.Workbook(universal.filename+".xlsx")
    universal.workbook = book
    sheet = book.add_worksheet()
    universal.worksheet = sheet

    headformat = book.add_format()
    headformat.set_bold()
    headformat.set_text_wrap()
    sheet.set_row(0, 60)

    # (first column, last column, width)
    column_widths = (
        (0, 3, 11),
        (4, 4, 30),
        (5, 5, 20),
        (6, 6, 15),
        (7, 8, 7),
        (9, 9, 12),
        (10, 10, 13),
        (11, 12, 9),
        (13, 13, 15),
        (14, 14, 12),
        (15, 17, 14),
        (18, 18, 11),
        (19, 19, 14),
    )
    for first_col, last_col, width in column_widths:
        sheet.set_column(first_col, last_col, width)

    # Header captions for columns A..T, in order.
    captions = (
        "Application No.",
        "Date of filling of Application",
        "Publication Date",
        "Name of Applicant",
        "Title of Invention",
        "Name of Inventor(s)",
        "Abstract",
        "No. of pages",
        "No. of claims",
        "International classification",
        "Priority Document No.",
        "Priority Date",
        "Name of priority country",
        "International Application No.",
        "International Application Filling Date",
        "International Publication No.",
        "Patent of addition to Application No.",
        "Patent of addition to Application No. Filling Date",
        "Divisional to Application No.",
        "Divisional to Application No. Filling Date",
    )
    for col, caption in enumerate(captions):
        # Row 0 / column ``col`` is the same cell as 'A1', 'B1', ...
        sheet.write(0, col, caption, headformat)

    universal.row = 1
    universal.date_format = book.add_format({'num_format':'dd mm yyyy'})


#inside for loop
def loop():
    """Write the record in ``universal.data`` to the current row, then advance the row.

    Columns 1 and 2 are always written with the date format; the four
    optional filing/priority dates use it only when the value is not "NA".
    """
    plain, dated, dated_unless_na = 0, 1, 2
    # (key in universal.data, formatting mode), in worksheet column order.
    layout = (
        ("Application No.", plain),
        ("Date of filing of Application", dated),
        ("Publication Date", dated),
        ("Name of Applicant", plain),
        ("Title of the invention", plain),
        ("Name of Inventor", plain),
        ("Abstract", plain),
        ("No. of Pages", plain),
        ("No. of Claims", plain),
        ("International classification", plain),
        ("Priority Document No", plain),
        ("Priority Date", dated_unless_na),
        ("Name of priority country", plain),
        ("International Application No", plain),
        ("IAFiling Date", dated_unless_na),
        ("International Publication No", plain),
        ("Patent of Addition to Application Number", plain),
        ("IBFiling Date", dated_unless_na),
        ("Divisional to Application Number", plain),
        ("ICFiling Date", dated_unless_na),
    )
    for column, (key, mode) in enumerate(layout):
        value = universal.data[key]
        use_date_format = mode == dated or (mode == dated_unless_na and value != "NA")
        if use_date_format:
            universal.worksheet.write(universal.row, column, value, universal.date_format)
        else:
            universal.worksheet.write(universal.row, column, value)

    universal.row += 1



Binary file added module/excelwriter.pyc
Binary file not shown.
122 changes: 122 additions & 0 deletions module/file1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#using tag for fields in pdf for which information is to be extracted and value for information

from lxml import html
import requests
import os
from urllib import url2pathname
import need
import universal
import file2
import convert
def reopen(filename):  # open the html file for parsing
    """Parse ``filename`` from the tag folder into ``universal.tree``.

    The file is read from ``<current_dir>/<tag_folder>/<filename>``, the same
    location ``convert.convert`` writes to, instead of a path that only
    exists on one machine.

    Args:
        filename: name of the tag-HTML file inside ``universal.tag_folder``.
    """
    # current_dir is an absolute path, so this yields "file:///abs/path/...".
    url = 'file://' + universal.current_dir + "/" + universal.tag_folder + "/" + filename
    requests_session = need.requests.session()
    try:
        requests_session.mount('file://', need.LocalFileAdapter())
        page = requests_session.get(url)
        universal.tree = html.fromstring(page.content)
    finally:
        # Release the session even when fetching or parsing fails.
        requests_session.close()


def transform(tvalue, tremove):
    """Return the part of ``tvalue`` that follows the first ``tremove``.

    Example: ``transform("Application No. 123", "Application No.")`` returns
    ``" 123"``.

    Args:
        tvalue: text containing a label followed by its value.
        tremove: the label to strip, together with everything before it.

    Returns:
        The text after the first occurrence of ``tremove``. When ``tremove``
        does not occur, ``tvalue`` is returned unchanged rather than an
        arbitrary tail of the string.
    """
    x = tvalue.find(tremove)
    if x == -1:
        # find() signals "absent" with -1; slicing from -1 + len(tremove)
        # would return a meaningless suffix.
        return tvalue
    return tvalue[x + len(tremove):]


def extract_multi_lines(tag, path):  # for tags with multiple lines
    """Join every text node at ``path`` and store the value under ``tag``.

    The joined text is passed through ``transform`` with the marker
    ``tag + " :"`` and the result is saved in ``universal.data[tag]``.
    """
    joined = "".join(universal.tree.xpath(path))
    universal.data[tag] = transform(joined, tag + " :")
def extract(path, tag):
    """Return the value following ``tag`` in the first text node that contains it.

    Args:
        path: XPath expression evaluated against ``universal.tree``.
        tag: label to look for inside the selected text nodes.

    Returns:
        The text after ``tag`` (via ``transform``) from the first matching
        node, or an empty string when ``path`` selects nothing or no selected
        node contains ``tag``.
    """
    for value in universal.tree.xpath(path):
        if value.find(tag) != -1:
            return transform(value, tag)
    # No node matched: report "no value" instead of failing on an unbound
    # loop variable or slicing an unrelated text node.
    return ""

#def extract_claims_pages(path): #extract the value for No of Claims tag and No of pages tag
# temp=extract(path,"No. of Pages : ")
# pos=temp.find("No. of Pages : ")
# y=len("No. of Pages : ");
# temp[pos+y:]
# y=0
# tans=0
# while temp[y].isdigit() :
# tans*=10
# tans+=int(temp[y])
# y=y+1
# universal.data["No. of Pages"]=tans
# tans=0
# temp=temp[y:]
# pos=temp.find("No. of Claims : ")
# y=len("No. of Claims : ");
# temp=temp[pos+y:]
# y=0
# while temp[y].isdigit() and y<len(temp) :
# tans*=10
# tans+=int(temp[y])
# y=y+1
# universal.data["No. of Claims"]=tans



#def extract_names(path): #for extracting information from name column
# test=universal.tree.xpath(path)
# x=0
# while test[x].find("Name of Applicant : ")==-1:
# #print(test[x])
# x+=1
# x+=1
# tlist=[]
# while test[x].find("Name of Inventor")==-1:
# tlist.append(test[x])
# x+=1
# tlist=tlist[0:-1] #for removing (*number*) Before Name of Inventor
# universal.data["Name of Applicant"]=tlist
# tlist=[]
# while x<len(test):
# tlist.append(test[x])
# x+=1
# universal.data["Name of Inventor"]=",".join(tlist)

def locate(string, x="/html/body/page/p[", y="]/text()"):
    """Find the XPath of the first indexed element whose text contains ``string``.

    Candidate paths are built as ``x + str(i) + y`` for i = 1, 2, ...

    Args:
        string: text to search for.
        x: XPath prefix ending just before the positional index.
        y: XPath suffix placed after the positional index.

    Returns:
        The first candidate path with a text node containing ``string``.
        When nothing matches, the miss is appended to ``log.txt`` and the
        path for index 10 is returned as a fallback.
    """
    # XPath positions start at 1, so index 0 can never match.
    limit = 100000
    if x.endswith("["):
        # The number of elements selected by the prefix is an upper bound on
        # any index that can match; stop there instead of probing 100000
        # paths on every miss.
        element_count = int(universal.tree.xpath("count(" + x[:-1] + ")"))
        limit = min(limit, element_count + 1)

    for i in range(1, limit):
        path = x + str(i) + y
        for a in universal.tree.xpath(path):
            if a.find(string) != -1:
                return path

    with open("log.txt", 'a') as fappend:
        fappend.write(universal.filename + "->" + string + '\n')
    return x + str(10) + y

def begin():
    """Parse the current page's tag-HTML and fill ``universal.data``.

    Single-line fields are read with ``extract``, multi-line fields with
    ``extract_multi_lines``, and the remaining column is handled by
    ``file2.extract_final_coloum``.
    """
    # The html-tag file converted from the pdf is named <filename><filename>.html
    reopen(universal.filename * 2 + ".html")

    # (key stored in universal.data, marker text searched for in the page)
    single_line_fields = (
        ("Application No.", "Application No."),
        ("Date of filing of Application", "Date of filing of Application :"),
        ("Publication Date", "Publication Date : "),
        ("No. of Pages", "No. of Pages :"),
        ("No. of Claims", "No. of Claims :"),
    )
    for key, marker in single_line_fields:
        universal.data[key] = extract(locate(marker), marker)

    multi_line_tags = (
        "Title of the invention",
        "Name of Applicant",
        "Name of Inventor",
        "Abstract",
    )
    for tag in multi_line_tags:
        extract_multi_lines(tag, locate(tag))

    file2.extract_final_coloum()
## for z in universal.data :
## print(z+":"+str(universal.data[z])+"\n")
# #convert.remove()
# #print("\n")

Binary file added module/file1.pyc
Binary file not shown.
55 changes: 55 additions & 0 deletions module/file2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#This file converts the last column (the one on the left with all the patent details).
#This is done in 3 steps: first I find the start point of the column, then I build a string (temp) from the text in the required column,
#and then I extract from temp the information corresponding to each tag (the tags are stored in a list -> target).
#To extract the information for a tag I locate ":" and then take the text after that ":" up to the next ":" or the end of the string.
import universal #using universal.data and universal.tree from universal.py
def limit(s):  # function for finding the start of the column
    """Return 0 if any string in ``s`` contains the column marker, else 1.

    The marker is "(51) International classif"; callers keep scanning while
    the result is truthy.
    """
    marker = "(51) International classif"
    return 0 if any(marker in text for text in s) else 1
def cal(s):  # for counting how many ":" there are
    """Return the total number of ":" characters across all strings in ``s``."""
    return sum(text.count(":") for text in s)
def extract_final_coloum():
    """Extract the classification/priority column into ``universal.data``.

    Steps:
      1. Scan ``/html/body/page/p[i]`` for the first paragraph containing
         "(51) International classif".
      2. From there, concatenate the text of paragraphs that contain ":"
         until at least 10 colons have been collected (or the paragraphs
         run out).
      3. Split that text on ":" and assign the segment after each colon to
         the tags in ``target``, in order. Segments beyond the last tag are
         ignored.

    Raises:
        ValueError: if no paragraph contains the column marker.
    """
    x = "/html/body/page/p["
    y = "]/text()"
    target = ["International classification", "Priority Document No", "Priority Date", "Name of priority country", "International Application No", "IAFiling Date", "International Publication No", "Patent of Addition to Application Number", "IBFiling Date", "Divisional to Application Number", "ICFiling Date"]

    # The total number of <p> elements bounds every valid p[i] index, so both
    # scans below are guaranteed to terminate.
    total = int(universal.tree.xpath("count(/html/body/page/p)"))

    # Step 1: find the paragraph where the column starts.
    i = 1
    s = universal.tree.xpath(x + str(i) + y)
    while limit(s):
        i += 1
        if i > total:
            raise ValueError("'(51) International classif' not found in file " + str(universal.filename))
        s = universal.tree.xpath(x + str(i) + y)

    # Step 2: gather the column text, skipping paragraphs without ":".
    cnt = 0
    parts = []
    while cnt < 10 and i <= total:
        colons = cal(s)
        if colons > 0:
            cnt += colons
            parts.append("".join(s))
        i += 1
        s = universal.tree.xpath(x + str(i) + y)
    temp = "".join(parts)

    # Step 3: everything after the first ":" is split on ":"; the k-th piece
    # is the value of the k-th tag. When there is no ":" at all, find()
    # returns -1 and the whole text becomes the first tag's value.
    segments = temp[temp.find(":") + 1:].split(":")
    # zip() stops at the shorter sequence, so extra segments cannot index
    # past the end of ``target``.
    for tag, segment in zip(target, segments):
        universal.data[tag] = segment

Binary file added module/file2.pyc
Binary file not shown.
24 changes: 24 additions & 0 deletions module/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import file1
import convert
import universal
import commands
import excelwriter
import os  # used below to detect the last burst page


def run_command(string):
    """Run ``string`` through the shell and raise if the command fails.

    ``commands.getstatusoutput`` reports the status in ``wait()`` encoding
    (exit code 1 is reported as 256), so any non-zero status is a failure.

    Raises:
        NameError: if the command's status is non-zero (type kept for
            compatibility with existing callers).
    """
    if commands.getstatusoutput(string)[0] != 0:
        raise NameError("ERROR IN Commands.getstatusoutput "+string)


universal.init()  # for initializing global variables
# raw_input returns the typed text as-is; Python 2's input() would eval() it.
universal.filename = str(raw_input("Enter filename(without extension)"))
excelwriter.init()
# -p: do not fail when a folder is left over from an interrupted run.
run_command("mkdir -p "+universal.pdf_folder)
run_command("mkdir -p "+universal.tag_folder)
try:
    # Split the input PDF into one file per page: <pdf_folder>/<page number>.pdf
    run_command("pdftk "+universal.filename+".pdf burst output "+universal.current_dir+"/"+universal.pdf_folder+"/%d.pdf")
    for i in range(8, 500):
        page_pdf = universal.current_dir+"/"+universal.pdf_folder+"/"+str(i)+".pdf"
        if not os.path.isfile(page_pdf):
            # The document has fewer pages than the upper bound; stop cleanly.
            break
        print(i)
        universal.filename = str(i)
        convert.convert()  # for initializing conversion of files
        file1.begin()
        excelwriter.loop()
finally:
    # Always flush the rows written so far and remove the temporary folders,
    # even when a page fails to convert or parse.
    try:
        universal.workbook.close()
    finally:
        run_command("rm -r "+universal.pdf_folder)
        run_command("rm -r "+universal.tag_folder)

56 changes: 56 additions & 0 deletions module/need.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import requests
import os
from urllib import url2pathname

class LocalFileAdapter(requests.adapters.BaseAdapter):
    """Transport adapter that lets Requests GET file:// URLs.

    @todo: Properly handle non-empty hostname portions.
    """

    @staticmethod
    def _chkpath(method, path):
        """Map a request method and filesystem path to (status code, reason)."""
        verb = method.lower()
        if verb in ('put', 'delete'):
            return 501, "Not Implemented"  # TODO
        if verb not in ('get', 'head'):
            return 405, "Method Not Allowed"
        if os.path.isdir(path):
            return 400, "Path Not A File"
        if not os.path.isfile(path):
            return 404, "File Not Found"
        if not os.access(path, os.R_OK):
            return 403, "Access Denied"
        return 200, "OK"

    def send(self, req, **kwargs):  # pylint: disable=unused-argument
        """Build a Response for the local file named by the request.

        @type req: C{PreparedRequest}
        @todo: Should I bother filling `response.headers` and processing
               If-Modified-Since and friends using `os.stat`?
        """
        local_path = url2pathname(req.path_url)
        local_path = os.path.normcase(os.path.normpath(local_path))

        response = requests.Response()
        status, reason = self._chkpath(req.method, local_path)
        response.status_code = status
        response.reason = reason

        # Only attach a body for successful non-HEAD requests.
        if status == 200 and req.method.lower() != 'head':
            try:
                response.raw = open(local_path, 'rb')
            except (OSError, IOError) as err:
                response.status_code = 500
                response.reason = str(err)

        url = req.url
        response.url = url.decode('utf-8') if isinstance(url, bytes) else url

        response.request = req
        response.connection = self
        return response

    def close(self):
        """Nothing to release; present to satisfy the adapter interface."""
        pass
Binary file added module/need.pyc
Binary file not shown.
7 changes: 7 additions & 0 deletions module/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import commands
import universal
def convert():
    """Convert ``<current_dir>/<filename>/<filename>.pdf`` with pdf2txt.py, twice.

    The "html" output goes to ``<filename>.html`` and the "tag" output to
    ``<filename><filename>.html``.

    Raises:
        NameError: if either pdf2txt.py run returns a non-zero status.
            ``commands.getstatusoutput`` uses ``wait()`` encoding, so exit
            code 1 is reported as 256; comparing against 1 would miss it.
    """
    source_pdf = universal.current_dir+"/"+universal.filename+"/"+universal.filename+".pdf"
    # (pdf2txt output type, output file name without extension)
    jobs = (
        ("html", universal.filename),
        ("tag", universal.filename+universal.filename),
    )
    for output_type, output_name in jobs:
        command = "pdf2txt.py -t "+output_type+" -o "+output_name+".html "+source_pdf
        if commands.getstatusoutput(command)[0] != 0:
            raise NameError("ERROR IN Commands.getstatusoutput for file "+output_name)
19 changes: 19 additions & 0 deletions module/universal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#file containing global variables
import commands
def init():
    """Create the module-level globals shared by the extraction modules.

    Assigns ``data``, ``filename``, ``current_dir``, ``pdf_folder`` and
    ``tag_folder``. ``tree``, ``workbook``, ``worksheet``, ``date_format``
    and ``row`` are only declared here; other modules assign them later.
    """
    # Local import keeps this function self-contained; os.getcwd() replaces
    # spawning a shell for "pwd" and is platform independent.
    import os

    global data
    data = {}
    global tree
    global filename  # filename of pdf file containing patents
    filename = "15"
    global current_dir
    current_dir = os.getcwd()
    global pdf_folder  # name of folder containing pdf burst files
    pdf_folder = "3"
    global tag_folder  # name of folder containing tag-html file
    tag_folder = "tag_folder"
    global workbook
    global worksheet
    global date_format
    global row  # row counter

Binary file added module/universal.pyc
Binary file not shown.
3 changes: 3 additions & 0 deletions module/xlsxtest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import xlsxwriter
# Scratch script: create a workbook with a single worksheet.
workbook = xlsxwriter.Workbook("15.xlsx")
# The original line ended in a dangling "workbook." (a syntax error);
# add_worksheet() is the call used elsewhere in this project to obtain a sheet.
worksheet = workbook.add_worksheet()
Binary file removed srs_template2.doc
Binary file not shown.

0 comments on commit 00bc774

Please sign in to comment.