-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgs.py
executable file
·176 lines (144 loc) · 5.77 KB
/
gs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
#!/usr/bin/env python
import sys
import urllib
import urllib2
import hashlib
import random
import time
import glob
import bib
from HTMLParser import HTMLParser
# Desktop browser User-Agent so Google Scholar serves its normal HTML pages.
useragent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36"
# First CLI argument: Google Scholar profile id.  None switches the script
# into offline mode (analyse already-downloaded *.bib files, see bottom).
if len(sys.argv) > 1:
    user = sys.argv[1]
else:
    user = None
# Export URL of the profile page; POSTing bib_params to it returns BibTeX.
bib_url = "http://scholar.google.com/citations?view_op=export_citations&hl=en&user=%s" % user
# URL template returning the BibTeX record for one scholar result id.
bib_id_url = "http://scholar.google.com/scholar.bib?q=info:%s:scholar.google.com/&output=citation&hl=en"
# POST body for the export request; %s is the selected citation id.
bib_params = "cit_fmt=0&export_selected_btn=Export+the+article+below&s=%s"
# Second CLI argument: a ready-made Google cookie.  Otherwise fabricate a
# GSP cookie with a random 16-hex-char id (scholar.bib requires a cookie).
if len(sys.argv) > 2:
    cookie = sys.argv[2]
else:
    google_id = hashlib.md5(str(random.random())).hexdigest()[:16]
    cookie = 'GSP=ID=%s:CF=4' % google_id
def get_bibtex(bib_id):
    """Fetch the BibTeX export of one profile publication.

    POSTs the export form (bib_params with the quoted citation id) to the
    module-level profile export URL and returns the raw response body.
    """
    quoted_id = urllib.quote(bib_id)
    request = urllib2.Request(bib_url, headers={'User-Agent': useragent})
    response = urllib2.urlopen(request, bib_params % quoted_id)
    return response.read()
def get_bibtex_by_id(bib_id):
# sleep a little bit so Google will allow me to crawl hopefully
print "Get Bibtex ", bib_id
time.sleep(3)
bib_req = urllib2.Request(bib_id_url % bib_id, headers={'User-Agent' : useragent, 'Cookie' : cookie})
bib_data = urllib2.urlopen(bib_req).read()
return bib_data
def compare_authors(paper, cite):
    """Classify the author overlap between a paper and a citing entry.

    Returns 1 when the paper's first author appears in *cite*, 2 when any
    later author appears, 0 when there is no overlap.  Authors earlier in
    *paper* take precedence when several match.
    """
    for position, author in enumerate(paper):
        if author in cite:
            return 1 if position == 0 else 2
    return 0
def analyse(filename):
print "============"
bib_file = open(filename, 'r')
data = bib.clear_comments(bib_file.read())
bib_data = bib.Bibparser(data)
bib_data.parse()
data = bib_data.records.values()
independent = 0
self_cite = 0
if len(data) > 0:
print data[0]['title'], ":\n",
for idx, entry in enumerate(data):
if idx == 0:
continue
ret = compare_authors(data[0]['author'], entry['author'])
if ret == 0:
independent = independent + 1
else:
self_cite = self_cite + 1
print entry['title']
bib_file.close()
print "Independent citation: ", independent, " Self citation: ", self_cite
class CitationSubPageParser(HTMLParser):
    """Parse one paginated results page of a citation listing.

    All work happens in __init__: the given path is resolved against
    scholar.google.com, downloaded, and fed to the HTML parser, which
    collects the 12-character result ids found in gs_ocit onclick handlers.
    """
    bib_list = []
    def __init__(self, subpage):
        HTMLParser.__init__(self)
        self.bib_list = []
        headers = {'User-Agent': useragent, "Cookie": cookie}
        request = urllib2.Request("http://scholar.google.com%s" % subpage,
                                  headers=headers)
        page = urllib2.urlopen(request).read()
        self.feed(page)
    def handle_starttag(self, tag, attrs):
        # Ids live in onclick="return gs_ocit(event,'<12-char id>',...)";
        # characters 22..33 of that attribute value are the id itself.
        for name_value in attrs:
            if len(name_value) < 2 or name_value[0] != "onclick":
                continue
            if name_value[1].startswith("return gs_ocit"):
                self.bib_list.append(name_value[1][22:34])
    def get_bib_list(self):
        return self.bib_list
# create a subclass and override the handler methods
class CitationParser(HTMLParser):
    """Collect the BibTeX ids of every paper citing one publication.

    __init__ downloads the first "cited by" results page and feeds it to
    the parser.  Result ids come from gs_ocit onclick handlers; pagination
    anchors (class "gs_nma" whose href contains /scholar?start=) are
    followed via CitationSubPageParser and their ids merged in.
    """
    cite_url = "http://scholar.google.com/scholar?oi=bibs&hl=en&cites=%s&num=20"
    bib_list = []
    def __init__(self, cite_id):
        HTMLParser.__init__(self)
        self.bib_list = []
        headers = {'User-Agent': useragent, "Cookie": cookie}
        request = urllib2.Request(self.cite_url % cite_id, headers=headers)
        page = urllib2.urlopen(request).read()
        self.feed(page)
    def handle_starttag(self, tag, attrs):
        # Ids are embedded as onclick="return gs_ocit(event,'<id>',...)";
        # the 12-character id occupies positions 22..33 of the value.
        for name_value in attrs:
            if len(name_value) < 2:
                continue
            if name_value[0] == "onclick" and \
                    name_value[1].startswith("return gs_ocit"):
                self.bib_list.append(name_value[1][22:34])
        # A pagination anchor is exactly <a class="gs_nma" href="/scholar?start=...">.
        if len(attrs) == 2 and len(attrs[0]) == 2 and len(attrs[1]) == 2:
            first, second = attrs
            if first[0] == "class" and first[1] == "gs_nma" and \
                    second[0] == "href" and \
                    second[1].encode('ascii', 'ignore').find("/scholar?start=") >= 0:
                subpage = CitationSubPageParser(second[1])
                self.bib_list.extend(subpage.get_bib_list())
    def get_all_citations(self):
        return self.bib_list
class ProfileParser(HTMLParser):
    """Walk a Google Scholar profile page and build one .bib file per paper.

    For each publication link the exported BibTeX is written to
    "<citation id>.bib", then the BibTeX of every citing paper is appended
    (via CitationParser / get_bibtex_by_id).  When the next publication
    starts -- or parsing ends -- the finished file is closed and handed to
    analyse().
    """
    profile_url = "http://scholar.google.com/citations?user=%s&hl=en"
    citation_for_view = "citation_for_view="
    bib_file = None  # .bib file currently being written, or None
    def __init__(self, profile_id):
        HTMLParser.__init__(self)
        self.bib_file = None
        req = urllib2.Request(self.profile_url % profile_id,
                              headers={'User-Agent': useragent, "Cookie": cookie})
        url_data = urllib2.urlopen(req).read()
        self.feed(url_data)
        # Close and analyse the last publication's file after parsing.
        self.calculate_citation()
    def handle_starttag(self, tag, attrs):
        for attr in attrs:
            if len(attr) >= 2 and attr[0] == "href":
                if attr[1].startswith("/citations?view_op=view_citation"):
                    # New publication: finish the previous one, then start a
                    # fresh .bib file seeded with this paper's own BibTeX.
                    idx = attr[1].find(self.citation_for_view) + \
                        len(self.citation_for_view)
                    bib_id = attr[1][idx:]
                    self.calculate_citation()
                    self.bib_file = open("%s.bib" % bib_id.replace(':', '_'), 'w')
                    self.bib_file.write(get_bibtex(bib_id))
                elif attr[1].find("/scholar?oi=bibs&hl=en&") > 0:
                    # "Cited by" link: append the BibTeX of every citing
                    # paper to the current publication's file.
                    # BUGFIX: skip when no publication file is open yet --
                    # previously this dereferenced None and raised
                    # AttributeError if a cites link preceded any
                    # view_citation link.
                    if self.bib_file is None:
                        continue
                    idx = attr[1].find("cites=") + len("cites=")
                    cite_id = attr[1][idx:]
                    c_parser = CitationParser(cite_id)
                    for citation in c_parser.get_all_citations():
                        self.bib_file.write(get_bibtex_by_id(citation))
    def calculate_citation(self):
        # Close the in-progress .bib file (if any) and run the analysis.
        if self.bib_file is not None:
            filename = self.bib_file.name
            self.bib_file.close()
            analyse(filename)
# instantiate the parser and feed it some HTML
# Entry point: with a profile id argument, crawl Google Scholar and build
# .bib files; with no arguments, (re)analyse local .bib files instead.
if user is None:
    # Offline mode: analyse every previously downloaded .bib file.
    for bib_name in glob.glob("*.bib"):
        analyse(bib_name)
else:
    # Online mode: crawling happens as a side effect of __init__.
    ProfileParser(user)