-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patheditdistance.py
65 lines (57 loc) · 2.02 KB
/
editdistance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#import numpy as np
from munkres import Munkres, print_matrix, make_cost_matrix
from ngram import *
import sys
import numpy
def _ngram_matrix(set1, set2):
    """Build the pairwise score table for two collections of n-grams.

    Entry ``[i][j]`` scores the i-th item of ``set1`` against the j-th item
    of ``set2``. The score is ``lcs(a, b) / len(a)`` multiplied by 100, or 0
    when that ratio is below 0.7.

    NOTE(review): ``lcs`` is provided by the star-import of ``ngram``;
    presumably it returns a longest-common-subsequence length -- confirm.

    Returns:
        A list with one row per item of ``set1``; each row is a list with
        one score per item of ``set2``.
    """
    def _score(left, right):
        # The ratio is normalised by the length of the row item only.
        ratio = float(lcs(left, right)) / len(left)
        # Pairs under the 0.7 cut-off are treated as not matching at all.
        return 100 * (0 if ratio < 0.7 else ratio)

    return [[_score(left, right) for right in set2] for left in set1]
def ngramset_edit_distance(set1, set2):
    """Pair up the n-grams of two sets and summarise how well they match.

    Every pair is scored with ``_ngram_matrix``; the Munkres solver then
    picks the one-to-one assignment with the highest total score, and the
    statistics below are computed over the chosen pairs only.

    Args:
        set1: n-grams indexing the rows of the score matrix.
        set2: n-grams indexing the columns of the score matrix.

    Returns:
        A 5-tuple ``(edit_distance, variance, None, None, offsets)``:

        * ``edit_distance`` -- mean score of the chosen pairs divided by
          100. Despite the name it is derived from similarity scores, so
          larger values mean a closer match.
        * ``variance`` -- ``numpy.var`` of the chosen (unscaled) scores.
        * two ``None`` placeholders -- slots reserved for the ``_similarity``
          scores (window sizes 2 and 3), which are currently not computed.
        * ``offsets`` -- for each chosen pair, ``row - column`` as a float,
          or the integer 0 when the pair lies on the diagonal.
    """
    matrix = _ngram_matrix(set1, set2)

    # Munkres minimises cost, so each score is flipped against the largest
    # score present. The previous ``sys.maxint - score`` inversion does not
    # exist on Python 3 and, on 64-bit builds, rounds every float score to
    # the same value, which makes the assignment arbitrary.
    all_scores = [score for row in matrix for score in row]
    ceiling = max(all_scores) if all_scores else 0
    cost_matrix = make_cost_matrix(matrix, lambda score: ceiling - score)

    indexes = Munkres().compute(cost_matrix)

    chosen_scores = []
    xygraph_distance_list = []
    for row, column in indexes:
        chosen_scores.append(matrix[row][column])
        # Signed distance from the diagonal: positive below it, negative
        # above it. Diagonal pairs keep the integer 0 the function has
        # always reported.
        offset = row - column
        xygraph_distance_list.append(float(offset) if offset else 0)

    edit_distance = numpy.mean(chosen_scores) / 100
    variance = numpy.var(chosen_scores)
    return edit_distance, variance, None, None, xygraph_distance_list
def _similarity(xygraph_distance_list, n):
    """Measure how often adjacent offsets repeat inside sliding windows.

    Every window of ``n`` consecutive entries is inspected; the number of
    adjacent pairs in it that compare equal is divided by ``n - 1`` and the
    per-window results are averaged over all windows.

    NOTE: the per-window division uses ``/``. On Python 2 both operands are
    ints, so a window contributes 1 only when all of its pairs match; on
    Python 3 it contributes the matching fraction. The two agree for
    ``n == 2``.

    Args:
        xygraph_distance_list: sequence of offsets, compared with ``==``.
        n: window size; expected to be at least 2 (``n == 1`` divides by
            zero for a non-empty list).

    Returns:
        The average window score as a float, or ``0.0`` when the list is
        too short to contain a single window.
    """
    window_count = len(xygraph_distance_list) - n + 1
    if window_count <= 0:
        # No complete window exists. Without this guard the final division
        # is 0/0 (list length == n - 1) or yields -0.0 (shorter lists).
        return 0.0

    count = 0
    for start in range(window_count):
        window = xygraph_distance_list[start:start + n]
        equal_pairs = sum(
            1 for j in range(n - 1) if window[j] == window[j + 1]
        )
        count += equal_pairs / (n - 1)
    return float(count) / window_count