# -*- coding: utf-8 -*-
"""
Created on Sat Feb 20 18:43:43 2021
@author: karan
"""
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,mean_squared_error
import scipy.stats
import pickle
import os
#Module implementing the preprocessing pipeline
###Use the trainPipeline() and predictPipeline() functions to automate the full workflow
# Handle missing values in the dataset
# Args: na_method - how missing values are handled
#       possible values: 'drop' (default), 'mode', 'mean'
def missingVals(df,na_method ='drop'):
null=pd.DataFrame(df.isnull().sum())
if null.sum()[0]==0:
return df
else:
if na_method=='drop':
df=df.dropna()
elif na_method=='mode':
for col in df.columns:
df[col]=df[col].fillna(value=df[col].mode()[0])
elif na_method=='mean':
for col in df.columns:
if df[col].dtypes=='O' or df[col].dtypes=='object':
df[col]=df[col].fillna(value=df[col].mode()[0])
else:
df[col]=df[col].fillna(value=df[col].mean())
else:
raise Exception('Invalid value for argument na_method')
return df
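# Example (a minimal sketch with a hypothetical dataframe; 'age' and 'city'
# are made-up column names, not part of this module):
#   raw = pd.DataFrame({'age': [25, np.nan, 40], 'city': ['NY', 'LA', None]})
#   missingVals(raw)                     # drops every row containing a missing value
#   missingVals(raw, na_method='mean')   # fills 'age' with its mean, 'city' with its mode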
# Convert categorical columns to numerical
# Args: ohe - True to one-hot encode categorical columns, False for label encoding
#       dropFirst - if ohe is True, whether to drop the first dummy column of each encoding
def catEncoding(df,ohe=True,dropFirst=False):
cat_col=[]
for col in df.columns:
if df[col].dtypes=='object':
cat_col.append(col)
if (len(cat_col)==0):
return df
    if ohe==True:
        df=pd.get_dummies(df,columns=cat_col,drop_first=dropFirst)
else:
le=LabelEncoder()
for col in cat_col:
df[col]=le.fit_transform(df[col])
return df
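# Example (sketch with a hypothetical 'city' column):
#   enc = pd.DataFrame({'city': ['NY', 'LA', 'NY']})
#   catEncoding(enc)                            # -> one-hot columns city_LA, city_NY
#   catEncoding(enc, ohe=True, dropFirst=True)  # -> keeps only city_NY
#   catEncoding(enc, ohe=False)                 # -> single label-encoded 'city' column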
# Remove outliers from the dataset
# Args: n_std - number of standard deviations up to which values are kept
#       default = 3
def remOutliers(df,n_std=3):
z_scores = scipy.stats.zscore(df)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < n_std).all(axis=1)
new_df = df[filtered_entries]
return new_df
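# Example (sketch; scipy.stats.zscore needs an all-numeric dataframe, so call
# this after catEncoding() if object columns are present):
#   num = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6, 7, 8, 9, 100]})
#   remOutliers(num, n_std=2)   # keeps only rows whose |z-score| is below 2 in every column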
# Scale the values in the dataset
# Args: scale_list - list of all columns to be scaled (default: all columns)
#       scale_type - type of scaling: 'std' for standard scaling, 'minmax' for min-max scaling
def scaleVals(df,scale_list=None,scale_type='std'):
if scale_list==None:
scale_list=list(df.columns)
if scale_type=='minmax':
mm=MinMaxScaler()
df[scale_list]=mm.fit_transform(df[scale_list])
elif scale_type=='std':
sc=StandardScaler()
df[scale_list]=sc.fit_transform(df[scale_list])
else:
        raise Exception('Invalid value for argument scale_type')
return df
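# Example (sketch; note that scaleVals() modifies the dataframe it is given,
# so pass a copy if the original values are still needed):
#   nums = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
#   scaleVals(nums.copy())                                         # standard-scales every column
#   scaleVals(nums.copy(), scale_list=['a'], scale_type='minmax')  # min-max scales only 'a'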
#Function to split the dataset into train and test sets
def testSplit(X,y,test_split=0.2):
if type(X)==pd.core.frame.DataFrame:
X=X.values
if type(y)==pd.core.frame.DataFrame:
y=y.values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_split)
return X_train,X_test,y_train,y_test
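# Example (sketch; X and y as returned by processDf() further below):
#   X_train, X_test, y_train, y_test = testSplit(X, y, test_split=0.25)
#   # the split is random (no fixed random_state), so results vary between runs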
# Train a model on the preprocessed data and save it as a pickle file
# (random forest for classification, linear regression for regression)
# Returns: the mean k-fold cross-validation score and the trained model
# Args: X- set of all features to be used for prediction
# y - Target variable
# test_split - the ratio of the test set
# folds - number of folds for k-fold cross val
# model_name- name of the model pkl file to be saved
# task_type- 'c' for classification and 'r' for regression
def splitAndTrain(X,y,test_split=0.2,folds=5,task_type='c',model_name='model'):
if type(X)==pd.core.frame.DataFrame:
X=X.values
if type(y)==pd.core.frame.DataFrame:
y=y.values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=test_split)
if task_type=='c':
rf=RandomForestClassifier(n_estimators=10)
rf.fit(X_train,y_train)
with open(model_name+'.pkl', 'wb') as file:
pickle.dump(rf, file)
score=cross_val_score(rf,X,y,cv=folds)
return np.mean(score),rf
elif task_type=='r':
lr=LinearRegression()
lr.fit(X_train,y_train)
with open(model_name+'.pkl', 'wb') as file:
pickle.dump(lr, file)
score=cross_val_score(lr,X,y,cv=folds,scoring='r2')
return np.mean(score),lr
else:
raise Exception('Invalid value for argument task_type')
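# Example (sketch; X and y are an already preprocessed feature matrix and target,
# e.g. from processDf() below; 'rf_model' is a made-up file name):
#   score, model = splitAndTrain(X, y, folds=5, task_type='c', model_name='rf_model')
#   # writes rf_model.pkl to the working directory and returns the mean CV score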
# Main function to run the complete data preprocessing and model training pipeline
# Args: dataframe - input dataframe
# features - list of columns in the dataframe to be used as features
# target - name of target column
# rem_outliers- whether outliers need to be removed (default: True)
# scale_vals- whether values need to be scaled (default: True)
# All the remaining arguments are the same as in the functions above
#
#Returns: model - the model trained on the given data
# X - preprocessed dataframe
# y - corresponding target variable after preprocessing
def trainPipeline(dataframe,features,target,na_method='drop',ohe=True,
dropFirst=False,rem_outliers=True,n_std=3,scale_vals=True,
scale_list=None,scale_type='std',test_split=0.2,folds=5,task_type='c',model_name='model'):
s=locals()
with open('saved_args.pkl', 'wb') as file:
pickle.dump(s, file)
features_temp=features.copy()
features_temp.append(target)
df=dataframe[features_temp]
df=missingVals(df,na_method)
df=catEncoding(df,ohe,dropFirst)
if rem_outliers==True:
df=remOutliers(df,n_std)
if scale_vals==True:
df=scaleVals(df,scale_list,scale_type)
y=df[target]
X=df.drop(target,axis=1)
acc,model=splitAndTrain(X.values,y.values,test_split,folds,task_type,model_name)
    print(f'Mean cross-validation score with {folds} folds = {acc*100}%')
return model, X, y
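# Example (a hedged sketch; 'titanic.csv' and the columns 'Age', 'Sex', 'Fare',
# 'Survived' are hypothetical and not shipped with this module):
#   data = pd.read_csv('titanic.csv')
#   model, X, y = trainPipeline(data, features=['Age', 'Sex', 'Fare'],
#                               target='Survived', task_type='c',
#                               model_name='survival_model')
#   # side effects: survival_model.pkl and saved_args.pkl are written to disk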
# Main function to run the complete prediction pipeline
# Args: dataframe - input dataframe
# features - list of columns in the dataframe to be used as features
# model_name - name of the model saved in the trainPipeline()
# Remaining arguments are the same as trainPipeline()
# Returns: pred - array of predictions
def predictPipeline(dataframe,features,na_method='drop',ohe=True,
dropFirst=False,rem_outliers=True,n_std=3,scale_vals=True,
scale_list=None,scale_type='std',model_name='model'):
args=locals()
args.pop('dataframe')
    #Check that trainPipeline() has been run before predictPipeline()
    if not os.path.exists('saved_args.pkl'):
raise Exception('Must run trainPipeline() before predictPipeline()')
#Checking for identical predict and train parameters
with open('saved_args.pkl','rb') as file:
saved_args=pickle.load(file)
rem_list=['dataframe','folds','target','test_split','task_type']
    for key in rem_list:
        saved_args.pop(key)
if args!=saved_args:
        raise Exception('Predict arguments must be the same as the train arguments')
df=dataframe[features]
df=missingVals(df,na_method)
df=catEncoding(df,ohe,dropFirst)
if rem_outliers==True:
df=remOutliers(df,n_std)
if scale_vals==True:
df=scaleVals(df,scale_list,scale_type)
with open(model_name+'.pkl','rb') as file:
model=pickle.load(file)
pred=model.predict(df.values)
return pred
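# Example (sketch continuing the hypothetical trainPipeline() call above; the
# keyword arguments must match the training call or an exception is raised):
#   new_data = pd.read_csv('titanic_new.csv')
#   preds = predictPipeline(new_data, features=['Age', 'Sex', 'Fare'],
#                           model_name='survival_model')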
# Main function to process the dataframe
# Args: dataframe - input dataframe
#       features - list of columns in the dataframe to be used as features
#       target - name of the target column
#       Remaining arguments are the same as trainPipeline()
# Returns: X - the processed features dataframe
# y - pandas series containing the target variable
def processDf(dataframe,features,target,na_method='drop',ohe=True,
dropFirst=False,rem_outliers=True,n_std=3,scale_vals=True,
scale_list=None,scale_type='std'):
features_temp=features.copy()
features_temp.append(target)
df=dataframe[features_temp]
df=missingVals(df,na_method)
df=catEncoding(df,ohe,dropFirst)
if rem_outliers==True:
df=remOutliers(df,n_std)
if scale_vals==True:
df=scaleVals(df,scale_list,scale_type)
y=df[target]
X=df.drop(target,axis=1)
return X,y
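# Example (sketch, reusing the hypothetical 'titanic.csv' dataframe from above):
#   X, y = processDf(data, features=['Age', 'Sex', 'Fare'], target='Survived')
#   # X is the encoded/scaled feature dataframe, y the matching target series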
# Function to process the dataframe and split it into train and test sets
def processAndSplit(dataframe,features,target,na_method='drop',ohe=True,
dropFirst=False,rem_outliers=True,n_std=3,scale_vals=True,
scale_list=None,scale_type='std',test_split=0.2):
features_temp=features.copy()
features_temp.append(target)
df=dataframe[features_temp]
df=missingVals(df,na_method)
df=catEncoding(df,ohe,dropFirst)
if rem_outliers==True:
df=remOutliers(df,n_std)
if scale_vals==True:
df=scaleVals(df,scale_list,scale_type)
y=df[target]
X=df.drop(target,axis=1)
    return testSplit(X,y,test_split)
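# Example (sketch, same hypothetical dataframe as above):
#   X_train, X_test, y_train, y_test = processAndSplit(
#       data, features=['Age', 'Sex', 'Fare'], target='Survived', test_split=0.25)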
# Score the model's predictions
# Args: y_true - the actual labels
#       y_pred - predictions made by the model
#       task_type - the type of task: 'c' for classification (default), 'r' for regression
def predScore(y_true,y_pred,task_type='c'):
if task_type=='c':
print('Accuracy=',accuracy_score(y_true,y_pred))
elif task_type=='r':
print('MSE score=',mean_squared_error(y_true,y_pred))
else:
raise Exception("Invalid 'type' in predScore()")