-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
dca3cf5
commit 00bc774
Showing
21 changed files
with
391 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import commands | ||
import universal | ||
|
||
def run_command(string):
    """Run ``string`` through the shell and fail loudly if it does not succeed.

    The command's stdout and stderr are captured and discarded.

    Args:
        string: complete shell command line to execute.

    Raises:
        NameError: if the command finishes with a non-zero return code.
            (NameError is kept because it is the exception type this
            function has always raised.)
    """
    # Imported locally so the function does not rely on the Python 2-only
    # ``commands`` module.
    import subprocess

    process = subprocess.Popen(string, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    process.communicate()
    # The previous check compared the raw wait status to 1; an ordinary
    # failing exit code is encoded as 256, 512, ... there, so failures were
    # never reported. Any non-zero return code is treated as an error now.
    if process.returncode != 0:
        raise NameError("ERROR IN Commands.getstatusoutput "+string)
def convert():
    """Convert the current burst PDF page to pdf2txt.py "tag" output.

    Reads <current_dir>/<pdf_folder>/<filename>.pdf and writes
    <current_dir>/<tag_folder>/<filename><filename>.html, with all path
    parts taken from the ``universal`` module.
    """
    # Earlier variant, kept for reference:
    # run_command("pdf2txt.py -t html -Y exact "+"-o "+universal.filename+".html "+universal.current_dir+"/"+universal.year+"/"+universal.filename+".pdf")
    base = universal.current_dir
    page = universal.filename
    target = base+"/"+universal.tag_folder+"/"+page+page+".html"
    source = base+"/"+universal.pdf_folder+"/"+page+".pdf"
    run_command("pdf2txt.py -t tag -Y exact -o "+target+" "+source)
def remove():
    """Delete the tag output file that convert() wrote for the current page."""
    tag_file = "/".join([
        universal.current_dir,
        universal.tag_folder,
        universal.filename+universal.filename+".html",
    ])
    run_command("rm "+tag_file)
#universal.init(); | ||
#for i in range(8,622): | ||
# #print(i) | ||
# universal.filename=str(i); | ||
# convert() #for initializing conversion of files | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
import xlsxwriter | ||
import universal | ||
|
||
def init():
    """Create the output workbook and write its header row.

    Stores the workbook, worksheet, next free row (1) and the date cell
    format on the ``universal`` module. The workbook is not closed here.
    """
    universal.workbook = xlsxwriter.Workbook(universal.filename+".xlsx")
    universal.worksheet = universal.workbook.add_worksheet()
    sheet = universal.worksheet

    headformat = universal.workbook.add_format()
    headformat.set_bold()
    headformat.set_text_wrap()

    # Header row height.
    sheet.set_row(0, 60)

    # (first column, last column, width)
    column_widths = (
        (0, 3, 11),
        (4, 4, 30),
        (5, 5, 20),
        (6, 6, 15),
        (7, 8, 7),
        (9, 9, 12),
        (10, 10, 13),
        (11, 12, 9),
        (13, 13, 15),
        (14, 14, 12),
        (15, 17, 14),
        (18, 18, 11),
        (19, 19, 14),
    )
    for first, last, width in column_widths:
        sheet.set_column(first, last, width)

    # Header captions for cells A1 .. T1, in column order.
    captions = (
        "Application No.",
        "Date of filling of Application",
        "Publication Date",
        "Name of Applicant",
        "Title of Invention",
        "Name of Inventor(s)",
        "Abstract",
        "No. of pages",
        "No. of claims",
        "International classification",
        "Priority Document No.",
        "Priority Date",
        "Name of priority country",
        "International Application No.",
        "International Application Filling Date",
        "International Publication No.",
        "Patent of addition to Application No.",
        "Patent of addition to Application No. Filling Date",
        "Divisional to Application No.",
        "Divisional to Application No. Filling Date",
    )
    for letter, caption in zip("ABCDEFGHIJKLMNOPQRST", captions):
        sheet.write(letter+"1", caption, headformat)

    universal.row = 1
    universal.date_format = universal.workbook.add_format({'num_format':'dd mm yyyy'})
|
||
|
||
#inside for loop | ||
def loop() :
    """Write the fields in universal.data as one worksheet row.

    The row index is universal.row, which is incremented afterwards.
    Columns 1 and 2 always use universal.date_format; the four "...Date"
    columns from the final column use it unless their value is "NA".
    """
    PLAIN, DATE, DATE_UNLESS_NA = "plain", "date", "date_unless_na"
    # One entry per worksheet column, left to right.
    layout = (
        ("Application No.", PLAIN),
        ("Date of filing of Application", DATE),
        ("Publication Date", DATE),
        ("Name of Applicant", PLAIN),
        ("Title of the invention", PLAIN),
        ("Name of Inventor", PLAIN),
        ("Abstract", PLAIN),
        ("No. of Pages", PLAIN),
        ("No. of Claims", PLAIN),
        ("International classification", PLAIN),
        ("Priority Document No", PLAIN),
        ("Priority Date", DATE_UNLESS_NA),
        ("Name of priority country", PLAIN),
        ("International Application No", PLAIN),
        ("IAFiling Date", DATE_UNLESS_NA),
        ("International Publication No", PLAIN),
        ("Patent of Addition to Application Number", PLAIN),
        ("IBFiling Date", DATE_UNLESS_NA),
        ("Divisional to Application Number", PLAIN),
        ("ICFiling Date", DATE_UNLESS_NA),
    )

    sheet = universal.worksheet
    row = universal.row
    for column, (key, mode) in enumerate(layout):
        value = universal.data[key]
        use_date = mode == DATE or (mode == DATE_UNLESS_NA and not value == "NA")
        if use_date:
            sheet.write(row, column, value, universal.date_format)
        else:
            sheet.write(row, column, value)

    universal.row = row + 1
|
||
|
||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#using tag for fields in pdf for which information is to be extracted and value for information | ||
|
||
from lxml import html | ||
import requests | ||
import os | ||
from urllib import url2pathname | ||
import need | ||
import universal | ||
import file2 | ||
import convert | ||
def reopen(filename):
    """Parse a tag output file from the tag folder into universal.tree.

    Args:
        filename: name of the file inside
            <universal.current_dir>/<universal.tag_folder>/.

    The file is fetched through a requests session with
    need.LocalFileAdapter mounted for the file:// scheme, and the parsed
    lxml tree is stored in universal.tree.
    """
    requests_session = need.requests.session()
    requests_session.mount('file://', need.LocalFileAdapter())
    # Build the URL from the working directory instead of a hard-coded
    # developer path, so the file is read from where convert.convert()
    # wrote it.
    location = 'file://'+universal.current_dir+"/"+universal.tag_folder+"/"+filename
    page = requests_session.get(location)
    universal.tree = html.fromstring(page.content)
|
||
|
||
def transform(tvalue,tremove):
    """Return the part of ``tvalue`` that follows the first ``tremove``.

    Example: tvalue is the text of the "Application No." line and tremove
    is "Application No."; the result is whatever comes after that label.

    Args:
        tvalue: text to search.
        tremove: label to strip, together with everything before it.

    Returns:
        The text after the first occurrence of ``tremove``. If ``tremove``
        does not occur, ``tvalue`` is returned unchanged.
    """
    position = tvalue.find(tremove)
    if position == -1:
        # Previously the -1 from find() was added to len(tremove) and used
        # as a slice start, which returned an arbitrary suffix of tvalue.
        return tvalue
    return tvalue[position+len(tremove):]
|
||
|
||
def extract_multi_lines(tag,path):
    """Store a multi-line field in universal.data[tag].

    All strings selected by ``path`` are concatenated, and the text after
    the label ``tag + " :"`` is kept.
    """
    pieces = universal.tree.xpath(path)
    joined = "".join(pieces)
    universal.data[tag] = transform(joined, tag+" :")
def extract(path,tag):
    """Return the text that follows ``tag`` in the strings selected by ``path``.

    Args:
        path: XPath evaluated against universal.tree.
        tag: label to look for.

    Returns:
        transform() applied to the first selected string containing
        ``tag``. If none contains it, the last selected string is used, as
        before. If ``path`` selects nothing, an empty string is returned.
    """
    candidates = universal.tree.xpath(path)
    if not candidates:
        # Previously this left the loop variable unbound and the function
        # died with an unrelated NameError.
        # NOTE(review): "" is a neutral placeholder; confirm it suits the
        # spreadsheet output.
        return ""
    for value in candidates:
        if value.find(tag) != -1:
            break
    return transform(value,tag)
|
||
#def extract_claims_pages(path): #extract the value for No of Claims tag and No of pages tag | ||
# temp=extract(path,"No. of Pages : ") | ||
# pos=temp.find("No. of Pages : ") | ||
# y=len("No. of Pages : "); | ||
# temp[pos+y:] | ||
# y=0 | ||
# tans=0 | ||
# while temp[y].isdigit() : | ||
# tans*=10 | ||
# tans+=int(temp[y]) | ||
# y=y+1 | ||
# universal.data["No. of Pages"]=tans | ||
# tans=0 | ||
# temp=temp[y:] | ||
# pos=temp.find("No. of Claims : ") | ||
# y=len("No. of Claims : "); | ||
# temp=temp[pos+y:] | ||
# y=0 | ||
# while temp[y].isdigit() and y<len(temp) : | ||
# tans*=10 | ||
# tans+=int(temp[y]) | ||
# y=y+1 | ||
# universal.data["No. of Claims"]=tans | ||
|
||
|
||
|
||
#def extract_names(path): #for extracting information from name column | ||
# test=universal.tree.xpath(path) | ||
# x=0 | ||
# while test[x].find("Name of Applicant : ")==-1: | ||
# #print(test[x]) | ||
# x+=1 | ||
# x+=1 | ||
# tlist=[] | ||
# while test[x].find("Name of Inventor")==-1: | ||
# tlist.append(test[x]) | ||
# x+=1 | ||
# tlist=tlist[0:-1] #for removing (*number*) Before Name of Inventor | ||
# universal.data["Name of Applicant"]=tlist | ||
# tlist=[] | ||
# while x<len(test): | ||
# tlist.append(test[x]) | ||
# x+=1 | ||
# universal.data["Name of Inventor"]=",".join(tlist) | ||
|
||
def locate(string, x="/html/body/page/p[",y="]/text()", max_index=100000):
    """Find the XPath of the first indexed node whose text contains ``string``.

    Candidate paths are built as ``x + str(i) + y`` for i = 0 .. max_index-1
    and evaluated against universal.tree.

    Args:
        string: text to search for.
        x: XPath prefix up to and including the opening "[".
        y: XPath suffix starting with the closing "]".
        max_index: number of indices to try before giving up.

    Returns:
        The first candidate path with a result string containing
        ``string``. If none matches, "<universal.filename>-><string>" is
        appended to log.txt and the path for index 10 is returned.
    """
    #x="/html/body/div["
    #y="]/span/text()"
    for i in range(max_index):
        candidate = x+str(i)+y
        for a in universal.tree.xpath(candidate):
            if a.find(string)!=-1:
                return candidate
    # ``with`` closes the log file even if the write fails.
    with open("log.txt",'a') as fappend:
        fappend.write(universal.filename+"->"+string+'\n')
    return x+str(10)+y
|
||
def begin():
    """Parse the current page's tag file and fill universal.data.

    Loads <filename><filename>.html through reopen(), extracts the
    single-line and multi-line fields, then delegates the remaining fields
    to file2.extract_final_coloum().
    """
    reopen(universal.filename+universal.filename+".html")  # tag file converted from the pdf

    # (key in universal.data, label searched for in the document)
    single_line_fields = (
        ("Application No.", "Application No."),
        ("Date of filing of Application", "Date of filing of Application :"),
        ("Publication Date", "Publication Date : "),
        ("No. of Pages", "No. of Pages :"),
        ("No. of Claims", "No. of Claims :"),
    )
    for key, label in single_line_fields:
        universal.data[key] = extract(locate(label), label)

    # The extract_names(locate("Name of Applicant")) path is disabled;
    # names are read like the other multi-line fields.
    multi_line_fields = (
        "Title of the invention",
        "Name of Applicant",
        "Name of Inventor",
        "Abstract",
    )
    for tag in multi_line_fields:
        extract_multi_lines(tag, locate(tag))

    file2.extract_final_coloum()
## for z in universal.data : | ||
## print(z+":"+str(universal.data[z])+"\n") | ||
# #convert.remove() | ||
# #print("\n") | ||
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#This file converts the last column (the one on the left with all the patent details). | ||
#This is done in 3 steps: first I find the start point of the column, then I build a string (temp) from the text in the required column, | ||
#and then I extract from temp the information corresponding to each tag (the tags are stored in a list -> target). | ||
#To extract the information for a tag I locate ":" and then take the text after that ":" until the end of the string or the next ":". | ||
import universal #using universal.data and universal.tree from universal.py | ||
def limit(s):
    """Tell whether the start of the final column has NOT been reached.

    Returns 0 if any string in ``s`` contains "(51) International classif",
    otherwise 1, so the result can drive a ``while limit(s)`` scan.
    """
    marker = "(51) International classif"
    found = any(text.find(marker) != -1 for text in s)
    return 0 if found else 1
def cal(s):
    """Count the ":" characters across all strings in ``s``."""
    return sum(text.count(":") for text in s)
def extract_final_coloum():
    """Fill universal.data with the fields of the final (classification) column.

    Steps:
      1. Scan /html/body/page/p[i]/text() for the paragraph containing
         "(51) International classif" (see limit()).
      2. From there, concatenate the text of paragraphs that contain ":"
         until at least 10 colons have been collected or the paragraphs
         run out.
      3. Split that text at the colons and assign the pieces, in order, to
         the keys listed in ``target``.

    Raises:
        ValueError: if no paragraph contains the start marker. The scan
            used to loop forever in that case.
    """
    x="/html/body/page/p["
    y="]/text()"
    target=["International classification","Priority Document No","Priority Date","Name of priority country","International Application No","IAFiling Date","International Publication No","Patent of Addition to Application Number","IBFiling Date","Divisional to Application Number","ICFiling Date"]

    # Upper bound for the paragraph index, so both scans below terminate
    # even on pages that lack the expected layout.
    total = int(universal.tree.xpath("count(/html/body/page/p)"))

    # Step 1: find the start of the column.
    i = 1
    s = universal.tree.xpath(x+str(i)+y)
    while limit(s):
        i += 1
        if i > total:
            raise ValueError("start of final column not found in file %s" % (universal.filename,))
        s = universal.tree.xpath(x+str(i)+y)

    # Step 2: gather the column text; only paragraphs with a colon count.
    cnt = 0
    pieces = []
    while cnt < 10 and i <= total:
        colons = cal(s)
        if colons > 0:
            cnt += colons
            pieces.append("".join(s))
        i += 1
        s = universal.tree.xpath(x+str(i)+y)
    temp = "".join(pieces)

    # Step 3: each value is the text between one colon and the next.
    pj = temp.find(":")
    i = 0
    # Stop once every target has a value; the old ``i>len(target)`` test let
    # i reach len(target) and raised IndexError when extra colons existed.
    while i < len(target):
        j = temp.find(":", pj+1)
        if j == -1:
            # Last colon: the value runs to the end of the text.
            universal.data[target[i]] = temp[pj+1:]
            break
        universal.data[target[i]] = temp[pj+1:j]
        i += 1
        pj = j
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import file1 | ||
import convert | ||
import universal | ||
import commands | ||
import excelwriter | ||
def run_command(string):
    """Run ``string`` through the shell and fail loudly if it does not succeed.

    The command's stdout and stderr are captured and discarded.

    Args:
        string: complete shell command line to execute.

    Raises:
        NameError: if the command finishes with a non-zero return code.
            (NameError is kept because it is the exception type this
            function has always raised.)
    """
    # Imported locally so the function does not rely on the Python 2-only
    # ``commands`` module.
    import subprocess

    process = subprocess.Popen(string, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
    process.communicate()
    # The previous check compared the raw wait status to 1; an ordinary
    # failing exit code is encoded as 256, 512, ... there, so failures were
    # never reported. Any non-zero return code is treated as an error now.
    if process.returncode != 0:
        raise NameError("ERROR IN Commands.getstatusoutput "+string)
# Driver: burst the input PDF into pages, convert each page to tag markup,
# extract its fields and write one spreadsheet row per page.
universal.init()                    #for initializing global variables

# Python 2's input() evaluates the typed text as an expression, so a plain
# file name raised NameError (and arbitrary code could be run). Read the raw
# line instead; fall back to input() where raw_input does not exist.
try:
    read_line = raw_input
except NameError:
    read_line = input
universal.filename=str(read_line("Enter filename(without extension)"))

excelwriter.init()
run_command("mkdir "+universal.pdf_folder)
run_command("mkdir "+universal.tag_folder)
# pdftk writes one file per page: <pdf_folder>/<page number>.pdf
run_command("pdftk "+universal.filename+".pdf burst output "+universal.current_dir+"/"+universal.pdf_folder+"/%d.pdf")
for i in range(8,500):
    print(i)
    universal.filename=str(i)       # from here on, filename is the page number
    convert.convert()               #for initializing conversion of files
    file1.begin()
    excelwriter.loop()
universal.workbook.close()
# Remove the temporary per-page folders.
run_command("rm -r "+universal.pdf_folder)
run_command("rm -r "+universal.tag_folder)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import requests | ||
import os | ||
from urllib import url2pathname | ||
|
||
class LocalFileAdapter(requests.adapters.BaseAdapter):
    """Protocol Adapter to allow Requests to GET file:// URLs

    @todo: Properly handle non-empty hostname portions.
    """

    @staticmethod
    def _chkpath(method, path):
        """Return an HTTP status for the given filesystem path.

        Returns a ``(status_code, reason)`` tuple based on the request
        method and on whether ``path`` is an existing, readable file.
        """
        if method.lower() in ('put', 'delete'):
            return 501, "Not Implemented"  # TODO
        elif method.lower() not in ('get', 'head'):
            return 405, "Method Not Allowed"
        elif os.path.isdir(path):
            return 400, "Path Not A File"
        elif not os.path.isfile(path):
            return 404, "File Not Found"
        elif not os.access(path, os.R_OK):
            return 403, "Access Denied"
        else:
            return 200, "OK"

    def send(self, req, **kwargs):  # pylint: disable=unused-argument
        """Return the file specified by the given request

        @type req: C{PreparedRequest}
        @todo: Should I bother filling `response.headers` and processing
               If-Modified-Since and friends using `os.stat`?
        """
        path = os.path.normcase(os.path.normpath(url2pathname(req.path_url)))
        response = requests.Response()

        response.status_code, response.reason = self._chkpath(req.method, path)
        if response.status_code == 200 and req.method.lower() != 'head':
            try:
                response.raw = open(path, 'rb')
            # ``except X as err`` works on Python 2.6+ and Python 3; the
            # former ``except X, err`` form is a SyntaxError on Python 3.
            except (OSError, IOError) as err:
                response.status_code = 500
                response.reason = str(err)

        if isinstance(req.url, bytes):
            response.url = req.url.decode('utf-8')
        else:
            response.url = req.url

        response.request = req
        response.connection = self

        return response

    def close(self):
        """Nothing to release; present to satisfy the adapter interface."""
        pass
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
import commands | ||
import universal | ||
def convert():
    """Convert the current PDF to both html and tag output with pdf2txt.py.

    Reads <current_dir>/<filename>/<filename>.pdf and writes
    <filename>.html (html output) and <filename><filename>.html (tag
    output) relative to the working directory.

    Raises:
        NameError: if either pdf2txt.py run reports a non-zero status.
    """
    source = universal.current_dir+"/"+universal.filename+"/"+universal.filename+".pdf"
    # getstatusoutput reports a wait()-style status, so a failing command is
    # "non-zero", not necessarily 1; the old ``== 1`` test missed ordinary
    # failures.
    if commands.getstatusoutput("pdf2txt.py -t html "+"-o "+universal.filename+".html "+source)[0] != 0:
        raise NameError("ERROR IN Commands.getstatusoutput for file "+universal.filename)
    if commands.getstatusoutput("pdf2txt.py -t tag "+"-o "+universal.filename+universal.filename+".html "+source)[0] != 0:
        raise NameError("ERROR IN Commands.getstatusoutput for file "+universal.filename+universal.filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#file containing global variables | ||
import commands | ||
def init():
    """Create the module-level globals shared by the other modules.

    Assigns ``data``, ``filename``, ``current_dir``, ``pdf_folder`` and
    ``tag_folder``. The remaining names are only declared here; they do not
    exist until another module assigns them.
    """
    # Local import: the working directory is read with the platform
    # independent os.getcwd() instead of shelling out to ``pwd``.
    import os

    global data
    data={}
    global tree
    global filename                 #filename of pdf file containing patents
    filename="15"
    global current_dir
    current_dir=os.getcwd()
    global pdf_folder               #name of folder containing pdf burst files
    pdf_folder="3"
    global tag_folder               #name of folder containing tag-html file
    tag_folder="tag_folder"
    global workbook
    global worksheet
    global date_format
    global row                      #row counter
|
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
import xlsxwriter | ||
workbook = xlsxwriter.Workbook("15.xlsx") | ||
worksheet = workbook. |
Binary file not shown.