'''
Data Mining Course - Final Project
Dima Goldenberg
Ori Barkan
'''
from scipy import stats                              # statistical tests and per-row mode
import numpy as np                                   # array math
from sklearn import tree                             # decision trees
from sklearn.neighbors import KNeighborsClassifier   # K-nearest neighbors
from sklearn.covariance import empirical_covariance  # empirical covariance matrix
import pandas as pd                                  # data frames

fileDir = '//'  # directory holding the input CSV files

def read_data(filename, fileDir):
    '''read a CSV file into a pandas data frame'''
    filename = fileDir + filename
    data = pd.read_csv(filename)  # reading with pandas
    return data

def splitframe(data, name):
    '''return the subset of rows carrying the relevant label'''
    df = data[data.Label == name]
    return df

def fisher_ratio(a, b):
    '''calculate the Fisher discriminant ratio of two samples'''
    return (np.mean(a) - np.mean(b))**2 / (np.var(a) + np.var(b))

def accuracy(predict, classification):
    '''calculate the fraction of correct predictions'''
    GoodPrediction = (predict == classification)
    # np.mean of the boolean mask avoids a KeyError when no prediction is correct
    return np.mean(GoodPrediction)
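
# Hedged toy checks for the two helpers above (made-up arrays, not assignment data):
# fisher_ratio(np.array([0., 2.]), np.array([4., 6.]))   # (1-5)**2 / (1+1) = 8.0
# accuracy(np.array(['s', 'b']), pd.Series(['s', 's']))  # 1 of 2 correct -> 0.5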

print('Data Mining Final Project - Ori and Dima, Autumn 2015')
print(' ')

# 1.a - splitting the data
traindata = read_data('Higgs_Train.csv', fileDir)
Signal = splitframe(traindata, 's')
Back = splitframe(traindata, 'b')

# 1.b - T-test on the features
print('1.b - T-Test on the features')
TTests = []
for Col in Signal.columns[:-1]:
    T = stats.ttest_ind(Signal[Col], Back[Col])  # two-sample t-test; T[1] is the p-value
    TTests.append((T[1], Col))
    if T[1] > 0.01:
        print('1.b p-val for feature named:', Col, 'is:', T[1])
print('')
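
# Hedged toy check (made-up samples, not assignment data): two nearly identical
# samples should yield a large p-value, i.e. no significant difference:
# stats.ttest_ind([1, 2, 3], [1.1, 2.1, 3.1])  # -> (statistic, pvalue), pvalue near 1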

# 1.c - Fisher ratio on the features
print('1.c - Fisher on the features')
Fishers = []
for Col in Signal.columns[:-1]:
    F = fisher_ratio(Signal[Col], Back[Col])
    Fishers.append((F, Col))
Fishers.sort(reverse=True)  # highest Fisher ratio first
fishprint = []
for col in Fishers[:4]:
    fishprint.append([col[0], col[1]])
print('1.c Array of FDR per feature:', fishprint)

# 1.d - T-test vs Fisher
print('')
print('1.d - TTest vs Fisher')
Fishers.sort()             # ascending Fisher ratio: weakest features first
TTests.sort(reverse=True)  # descending p-value: weakest features first
for i in range(len(Fishers)):
    print('Worst #', i + 1, 'T-Test is', TTests[i][1], 'and Fishers is', Fishers[i][1])

# 2 - K-nearest neighbors
traindata = read_data('Higgs_subTrain.csv', fileDir)
testdata = read_data('Higgs_subTest.csv', fileDir)
Ytrain = traindata.iloc[:, -1]
Ytest = testdata.iloc[:, -1]

# Data min-max normalization: rescale every column to [0, 1]
trainNorm = traindata.iloc[:, :-1]
testNorm = testdata.iloc[:, :-1]
trainNorm[:] = trainNorm.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
testNorm[:] = testNorm.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
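
# Note: the test set above is rescaled by its own min/max. A common alternative
# (a sketch, not what this script does) reuses the training statistics so that
# both sets share one scale:
# tmin = traindata.iloc[:, :-1].min(); tmax = traindata.iloc[:, :-1].max()
# testNorm[:] = (testdata.iloc[:, :-1] - tmin) / (tmax - tmin)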

# Regular Euclidean distance metric (Minkowski with p=2)
print('')
print('2 - KNN')
neigh = KNeighborsClassifier(n_neighbors=1, metric='minkowski', p=2)
neigh.fit(trainNorm, Ytrain)
Accuracy1 = accuracy(neigh.predict(testNorm), Ytest)
print('2.b K=1 result :', Accuracy1)
print('    K=1 train result :', accuracy(neigh.predict(trainNorm), Ytrain))
print('')

neigh = KNeighborsClassifier(n_neighbors=11, metric='minkowski', p=2)
neigh.fit(trainNorm, Ytrain)
Accuracy11 = accuracy(neigh.predict(testNorm), Ytest)
print('2.c K=11 result :', Accuracy11)
print('    K=11 train result :', accuracy(neigh.predict(trainNorm), Ytrain))
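
# A minimal sketch (assumed K values, not part of the assignment) for scanning
# a wider range of K and comparing test accuracy:
# for k in (1, 3, 5, 7, 9, 11):
#     knn = KNeighborsClassifier(n_neighbors=k).fit(trainNorm, Ytrain)
#     print(k, accuracy(knn.predict(testNorm), Ytest))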

# 3 - Decision tree
print('')
print('3 - Decision trees + feature selection')
traindata = read_data('Higgs_Train.csv', fileDir)
testdata = read_data('Higgs_Test.csv', fileDir)
X = traindata.iloc[:, :-1]
ytrain = traindata.iloc[:, -1]
candidates = Fishers[-3:]  # the three features with the highest Fisher ratio
candidates.sort(reverse=True)
emp_cov = empirical_covariance(X)
features = list(X.columns[:])
cols = []
for i in range(len(candidates)):  # printing pairwise Pearson correlations
    cols.append(candidates[i][1])
    for j in range(i + 1, len(candidates)):
        a = features.index(candidates[i][1])
        b = features.index(candidates[j][1])
        ro = emp_cov[a, b] / np.sqrt(emp_cov[a, a] * emp_cov[b, b])
        print(candidates[i][1], ',', candidates[j][1], 'ro is:', ro)
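
# Cross-check (equivalent, not in the original script): np.corrcoef returns the
# same Pearson correlation directly; 'colA' and 'colB' stand for any two of the
# candidate feature names:
# ro_check = np.corrcoef(X[colA], X[colB])[0, 1]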

Xtrain = X[cols]  # selecting the relevant features
Xtest = testdata[cols]
ytest = testdata.iloc[:, -1]
clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=9)
clf.fit(Xtrain, ytrain)
print('')
# Checking the tree on the train set
AccuracyTreetrain = accuracy(clf.predict(Xtrain), ytrain)
print('3.b Tree res_train:', AccuracyTreetrain)
# Checking the tree on the test set
AccuracyTreetest = accuracy(clf.predict(Xtest), ytest)
print('3.b Tree res_test:', AccuracyTreetest)
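
# A minimal sketch (assumed depths, not part of the assignment) for seeing how
# max_depth drives the gap between train and test accuracy:
# for d in (3, 6, 9, 12):
#     t = tree.DecisionTreeClassifier(criterion='entropy', max_depth=d).fit(Xtrain, ytrain)
#     print(d, accuracy(t.predict(Xtrain), ytrain), accuracy(t.predict(Xtest), ytest))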

# 4 - Forest
print('')
print('4 - Forest')
# Training three trees, each on a different pair of the selected features
clf1 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=9)
clf1.fit(Xtrain.iloc[:, [0, 1]], ytrain)
clf2 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=9)
clf2.fit(Xtrain.iloc[:, [0, 2]], ytrain)
clf3 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=9)
clf3.fit(Xtrain.iloc[:, [1, 2]], ytrain)

# Majority vote
# Checking the forest on the train set
res = np.column_stack((clf1.predict(Xtrain.iloc[:, [0, 1]]),
                       clf2.predict(Xtrain.iloc[:, [0, 2]]),
                       clf3.predict(Xtrain.iloc[:, [1, 2]])))
ressum = stats.mode(res, axis=1)[0]  # mode of each row = the majority label
ressum = ressum.transpose()[0]
AccuracyForesttrain = accuracy(ressum, ytrain)
print('4.b Forest res_train:', AccuracyForesttrain)
# Checking the forest on the test set
res = np.column_stack((clf1.predict(Xtest.iloc[:, [0, 1]]),
                       clf2.predict(Xtest.iloc[:, [0, 2]]),
                       clf3.predict(Xtest.iloc[:, [1, 2]])))
ressum = stats.mode(res, axis=1)[0]
ressum = ressum.transpose()[0]
rs = stats.mode(res, axis=1)[1]  # per-row vote counts for the winning label
AccuracyForest = accuracy(ressum, ytest)
print('4.b Forest res_test:', AccuracyForest)
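
# For comparison only (an alternative, not this project's hand-built vote):
# sklearn's RandomForestClassifier bags trees and takes the majority vote itself.
# from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=3, criterion='entropy', max_depth=9)
# rf.fit(Xtrain, ytrain)
# print('RandomForest test accuracy:', accuracy(rf.predict(Xtest), ytest))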