featurizer.py
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import warnings
warnings.filterwarnings('ignore')
import os, gc, re, distance
from sklearn.feature_extraction.text import TfidfVectorizer
def text_preprocess(x):
    porter = PorterStemmer()
    x = str(x).lower()
    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
        .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
        .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
        .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
        .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
        .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
        .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)  # e.g. replace 12000000 with 12m
    x = re.sub(r"([0-9]+)000", r"\1k", x)     # e.g. replace 52000 with 52k
    x = re.sub(r'<.*?>', '', x)  # remove HTML tags: https://stackoverflow.com/a/12982689
    # apply the Porter stemmer to the whole string (the string is treated as a single
    # token, so only its trailing suffix is affected)
    x = porter.stem(x)
    return x
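# Illustrative check (the example string is mine, not from the original file): after the
# replacements above, text_preprocess("What's 50% of ₹2,000,000?") yields roughly
# "what is 50 percent of rupee 2m?" before the final stemming step.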
# Basic feature extraction
def basic_feature_extraction(row):
    x = [0] * 9
    q1_len = len(row['question1'])
    q2_len = len(row['question2'])
    q1_n_word = len(row['question1'].split(" "))
    q2_n_word = len(row['question2'].split(" "))
    w1 = set(map(lambda word: word.lower().strip(), row['question1'].split(" ")))
    w2 = set(map(lambda word: word.lower().strip(), row['question2'].split(" ")))
    q1 = row['question1'].lower().replace('?', '').replace('.', '').replace('!', '')
    q2 = row['question2'].lower().replace('?', '').replace('.', '').replace('!', '')
    row_words = list(set(q1.split(' ') + q2.split(' ')))  # unique words across both questions
    # bag-of-words count vectors over the shared vocabulary
    q1_vec, q2_vec = np.zeros((1, len(row_words))), np.zeros((1, len(row_words)))
    for word in q1.split(' '):
        q1_vec[0][row_words.index(word)] += 1
    for word in q2.split(' '):
        q2_vec[0][row_words.index(word)] += 1
    x[0], x[1] = q1_len, q2_len
    x[2], x[3] = q1_n_word, q2_n_word
    x[4] = 1.0 * len(w1 & w2)
    x[5] = 1.0 * (len(w1) + len(w2))
    x[6] = 1.0 * len(w1 & w2) / (len(w1) + len(w2))
    x[7] = np.linalg.norm(q1_vec - q2_vec)  # Euclidean distance: https://stackoverflow.com/a/1401828
    x[8] = (1 - np.matmul(q1_vec, q2_vec.T) / (np.linalg.norm(q1_vec) * np.linalg.norm(q2_vec)))[0][0]  # cosine distance
    return x
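# Note: the nine values above map, in order, onto the columns created later in
# extract_features: q1len, q2len, q1_n_word, q2_n_word, word_Common, word_total,
# word_share, dist_eucl, dist_cosine.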
# SAFE_DIV is added to denominators below to avoid division by zero
SAFE_DIV = 0.0001
STOP_WORDS = stopwords.words("english")
def get_token_features(q1, q2):
    token_features = [0.0] * 10
    # Convert the sentences into tokens
    q1_tokens = q1.split()
    q2_tokens = q2.split()
    if len(q1_tokens) == 0 or len(q2_tokens) == 0:
        return token_features
    # Get the non-stopwords in each question
    q1_words = set([word for word in q1_tokens if word not in STOP_WORDS])
    q2_words = set([word for word in q2_tokens if word not in STOP_WORDS])
    # Get the stopwords in each question
    q1_stops = set([word for word in q1_tokens if word in STOP_WORDS])
    q2_stops = set([word for word in q2_tokens if word in STOP_WORDS])
    # Common non-stopwords in the question pair
    common_word_count = len(q1_words.intersection(q2_words))
    # Common stopwords in the question pair
    common_stop_count = len(q1_stops.intersection(q2_stops))
    # Common tokens in the question pair
    common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    # Whether the last words of the two questions match
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    # Whether the first words of the two questions match
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])
    token_features[8] = abs(len(q1_tokens) - len(q2_tokens))
    # Average token length of the two questions
    token_features[9] = (len(q1_tokens) + len(q2_tokens)) / 2
    return token_features
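# Note: these ten values map, in order, onto the columns created later in
# extract_features: cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
# last_word_eq, first_word_eq, abs_len_diff, mean_len.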
# Longest common substring ratio
def get_longest_substr_ratio(a, b):
    strs = list(distance.lcsubstrings(a, b))
    if len(strs) == 0:
        return 0
    else:
        return len(strs[0]) / (min(len(a), len(b)) + 1)
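# Illustrative example (inputs are mine, not from the original file): the longest common
# substring of "apples" and "applesauce" is "apples" (length 6), so
# get_longest_substr_ratio("apples", "applesauce") == 6 / (6 + 1) ≈ 0.857.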
def extract_features(df):
    # print('1. Extracting Basic Features...')
    basic_features = df.apply(basic_feature_extraction, axis=1)
    basic_features_names = ['q1len', 'q2len', 'q1_n_word', 'q2_n_word', 'word_Common', 'word_total', 'word_share', 'dist_eucl', 'dist_cosine']
    for i, name in enumerate(basic_features_names):
        df[name] = list(map(lambda x: x[i], basic_features))
    # Preprocess each question
    # print('2. Extracting Advanced Features\n', '2.1 Text preprocessing')
    df["question1"] = df["question1"].apply(text_preprocess)
    df["question2"] = df["question2"].apply(text_preprocess)
    # Compute token features and merge them into the dataset
    # print("2.2 Extracting token features...")
    token_features = df.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    new_columns = ["cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min", "ctc_max", "last_word_eq", "first_word_eq", "abs_len_diff", "mean_len"]
    for i, name in enumerate(new_columns):
        df[name] = list(map(lambda x: x[i], token_features))
    # Compute fuzzy features and merge them into the dataset
    # See: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    # https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
    # https://github.com/seatgeek/fuzzywuzzy
    # print("2.3 Extracting Fuzzy Features...")
    df["token_set_ratio"] = df.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    # The token sort approach tokenizes each string, sorts the tokens alphabetically,
    # joins them back into a string, and then compares the transformed strings with a simple ratio().
    df["token_sort_ratio"] = df.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    df["fuzz_ratio"] = df.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    df["fuzz_partial_ratio"] = df.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    df["longest_substr_ratio"] = df.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return df
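
# Minimal usage sketch (the sample questions below are illustrative, not from the original
# file): build a small DataFrame with the 'question1'/'question2' columns the functions
# above expect and run the full featurizer. Requires the NLTK stopwords corpus to be
# downloaded beforehand (nltk.download('stopwords')).
if __name__ == "__main__":
    sample = pd.DataFrame({
        "question1": ["How can I learn Python quickly?",
                      "What is the best way to invest in the share market?"],
        "question2": ["What is the best way to learn Python fast?",
                      "How do I invest in the share market?"],
    })
    featurized = extract_features(sample)
    print(featurized.columns.tolist())
    print(featurized.head())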