-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmeasure.py
126 lines (91 loc) · 3.16 KB
/
measure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
from minhash import MinHash
from rabinkarp_serial import RabinKarpSerial
from rabinkarp_parallel import RabinKarpParallel
from preprocess import Preprocess
from multiprocessing import Process, Queue
from doc_handle import DocumentHandle
from minhash import MinHash
import time
# Window length of the Rabin-Karp pattern; the parallel split overlaps
# chunks by (PAT_LEN - 1) characters so no match is lost at a boundary.
PAT_LEN = 5


def _load_pattern(doc, pp, filename):
    """Read one pattern file and return (cleaned_name, preprocessed_text).

    Strips the trailing newline from the stored filename, flattens the file
    content to a single line, and runs the project's text preprocessing.
    """
    patname = filename.replace('\n', '')
    with open(patname, 'r') as f:
        # Consistency fix: the original parallel branch stripped '\t' here
        # while the serial and MinHash branches stripped '\r' -- almost
        # certainly a typo; all three now normalise identically.
        raw = f.read().replace('\n', ' ').replace('\r', ' ')
    return patname, pp.prep_text(raw)


def _measure_serial(rk, doc, pp, textname, txt, filenames):
    """Time the serial Rabin-Karp similarity against every pattern file."""
    print('RABIN KARP SERIAL')
    # NOTE(review): index 0 is skipped as in the original -- presumably
    # filenames[0] is the reference document itself; confirm.
    for name in filenames[1:]:
        start = time.time()
        patname, pattern = _load_pattern(doc, pp, name)
        pattern = doc.wordshingling(pattern)
        similarity = rk.sim(txt, pattern)
        elapsed = time.time() - start
        print('sim(%s, %s)= %.4f on %.4f second'
              % (textname, patname, similarity, elapsed))


def _measure_parallel(prk, doc, pp, textname, txt, filenames, k=3):
    """Time the parallel Rabin-Karp similarity, splitting each pattern
    across `k` worker processes that push partial scores into a Queue."""
    print('RABIN KARP PARALLEL')
    for name in filenames[1:]:
        start = time.time()
        patname, pattern = _load_pattern(doc, pp, name)
        patlen = len(pattern)
        # Chunk size. Kept byte-equivalent to the original expression
        # int((patlen - 5 + 1) / k+1), i.e. ((patlen - 4) // k) + 1.
        # NOTE(review): the author may have intended / (k + 1) -- confirm.
        d = (patlen - PAT_LEN + 1) // k + 1
        pattern = doc.wordshingling(pattern)
        # Fresh Queue and process list per pattern file. BUG FIX: the
        # original kept one `processes` list for the whole loop, so every
        # iteration re-joined the (already finished) processes of all
        # previous files.
        results = Queue()
        processes = []
        for j in range(k - 1):
            # Each worker scans [d*j, (j+1)*d + PAT_LEN - 1): overlapping
            # by PAT_LEN - 1 so windows crossing a chunk edge are covered.
            p = Process(target=prk.sim,
                        args=(d * j, (j + 1) * d + PAT_LEN - 1,
                              pattern, txt, results))
            processes.append(p)
            p.start()
        # Last worker takes the remainder up to the end of the pattern.
        p = Process(target=prk.sim,
                    args=(d * (k - 1), patlen, pattern, txt, results))
        processes.append(p)
        p.start()
        for proc in processes:
            proc.join()
        # Total similarity is the sum of the workers' partial scores.
        similarity = 0
        while not results.empty():
            similarity += results.get()
        elapsed = time.time() - start
        print('sim(%s, %s) = %.4f on %.4f'
              % (textname, patname, similarity, elapsed))


def _measure_minhash(mh, doc, pp, textname, txt_shingles, filenames):
    """Time MinHash similarity; expects the source text already shingled."""
    print('MINHASH')
    for name in filenames[1:]:
        start = time.time()
        patname, pattern = _load_pattern(doc, pp, name)
        pattern = doc.wordshingling(pattern)
        similarity = mh.get_score(txt_shingles, pattern)
        elapsed = time.time() - start
        print('sim(%s, %s)= %.4f on %.4f second'
              % (textname, patname, similarity, elapsed))


def main():
    """Benchmark three document-similarity measures (serial Rabin-Karp,
    parallel Rabin-Karp, MinHash) over the same set of pattern files and
    print each score with its wall-clock time."""
    rk = RabinKarpSerial()
    prk = RabinKarpParallel()
    doc = DocumentHandle()
    mh = MinHash()
    pp = Preprocess()
    # Name and content of the original (reference) document.
    textname, txt = doc.get_txt()
    # Pattern file names; fetched once (the original called get_pat() twice).
    filenames = doc.get_pat()
    _measure_serial(rk, doc, pp, textname, txt, filenames)
    _measure_parallel(prk, doc, pp, textname, txt, filenames)
    # MinHash compares shingle sets, so shingle the source text first.
    _measure_minhash(mh, doc, pp, textname, doc.wordshingling(txt), filenames)
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()