tsdi.py
import numpy as np
import pandas as pd

def get_data(df, category, time_range):
    """
    Set up the data structure as a DataFrame with one row per
    (category, time) combination and merge in the existing data.
    """
    # build the full grid of (category, time) pairs
    rows = []
    for i in df[category].unique():
        for n in range(np.min(df[time_range]), np.max(df[time_range]) + 1):
            rows.append([i, n])
    new_df = pd.DataFrame(rows, columns=[category, time_range])
    new_df = new_df.sort_values([category, time_range])
    # merge the old data onto the full grid; periods with no data become NaN
    result = new_df.merge(df, on=[category, time_range], how='left')
    return result
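
# Usage sketch (hypothetical column names and values): a store/year panel
# with a missing year gains an explicit NaN row for it.
#
#   df = pd.DataFrame({'store': ['A', 'A'], 'year': [2018, 2020], 'sales': [5.0, 7.0]})
#   get_data(df, 'store', 'year')   # adds a row for ('A', 2019) with sales = NaN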

def ADF(df, category, time_range):
    """
    Use the Augmented Dickey-Fuller (ADF) test to test stationarity.
    Null hypothesis: the series is non-stationary.
    Alternative hypothesis: the series is stationary.
    """
    from statsmodels.tsa.stattools import adfuller
    rows = []
    for i in df[category].unique():
        for column in df.columns:
            if column != category and column != time_range:
                # test each category's own series rather than the pooled column
                column_value = df[df[category] == i][column].dropna().values
                if len(column_value) < 4:
                    # too few observations for the ADF test; treat as not stationary
                    rows.append([i, column, 'not stationary'])
                    continue
                column_result = adfuller(column_value)
                p_value = column_result[1]
                if p_value < 0.1:
                    rows.append([i, column, 'stationary'])
                else:
                    rows.append([i, column, 'not stationary'])
    df_stationarity = pd.DataFrame(rows, columns=[category, 'Column Name', 'Stationarity'])
    return df_stationarity
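
# Usage sketch (hypothetical data): a series with a deterministic trend
# should typically fail to reject the ADF null, so it is labelled
# 'not stationary' and later handled by smoothing instead of ARIMA.
#
#   trend = pd.DataFrame({'store': 'A', 'year': range(2000, 2020),
#                         'sales': np.arange(20.0) + np.random.rand(20)})
#   ADF(trend, 'store', 'year')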

def Order_Permutation(df, category, time_range):
    """
    Produce every candidate ARIMA order (p, d, q) whose total
    p + d + q does not exceed the degrees of freedom of the series.
    """
    from itertools import product
    rows = []
    df_s = ADF(get_data(df, category, time_range), category, time_range)
    DoF = np.max(df[time_range]) + 1 - np.min(df[time_range])
    for index, row in df_s.iterrows():
        if row['Stationarity'] == 'stationary':
            # all (p, d, q) triples with p + d + q <= DoF
            states = [s for s in product(range(DoF + 1), repeat=3) if sum(s) <= DoF]
            for s in states:
                rows.append([row[category], row['Column Name'], row['Stationarity'], s])
        else:
            rows.append([row[category], row['Column Name'], row['Stationarity'], 'no order'])
    df_order = pd.DataFrame(rows, columns=[category, 'Column Name', 'Stationarity', 'Order'])
    return df_order
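
# Note on the search space: the candidate set is every (p, d, q) with
# p + d + q <= DoF, which grows roughly cubically in the series length.
# For DoF = 5, for example, there are 56 candidates per stationary series:
#
#   from itertools import product
#   len([s for s in product(range(6), repeat=3) if sum(s) <= 5])   # -> 56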

def SES(df, category, time_range, column):
    """
    Fill missing values with Simple Exponential Smoothing: a forward
    pass fills gaps after observed values, then a backward pass over
    the reversed series fills any leading gaps.
    """
    from statsmodels.tsa.holtwinters import SimpleExpSmoothing
    frames = []
    for u in df[category].unique():
        df2 = df[df[category] == u].reset_index(drop=True)
        DoF = np.max(df[time_range]) + 1 - np.min(df[time_range])
        if df2[column].isna().sum() == DoF:
            # the whole series is missing: fall back to the overall column mean
            df2[column] = df[column].dropna().mean()
        while df2[column].isna().sum() > 0:
            # forward pass: fill each NaN that directly follows an observed value
            data = df2[column].tolist()
            for idx in range(len(data) - 1):
                if not np.isnan(data[idx]) and np.isnan(data[idx + 1]):
                    history = [m for m in data[:idx + 1] if not np.isnan(m)]
                    if len(history) < 2:
                        # too little history to fit a model: carry the value forward
                        df2.loc[idx + 1, column] = data[idx]
                    else:
                        model = SimpleExpSmoothing(history)
                        model_fit = model.fit()
                        # obtain the one-step-ahead predicted value
                        yhat = model_fit.forecast()
                        df2.loc[idx + 1, column] = yhat[0]
            # backward pass: the same procedure on the reversed series
            data = df2[column].tolist()[::-1]
            for idx in range(len(data) - 1):
                if not np.isnan(data[idx]) and np.isnan(data[idx + 1]):
                    history = [m for m in data[:idx + 1] if not np.isnan(m)]
                    if len(history) < 2:
                        df2.loc[len(data) - 2 - idx, column] = data[idx]
                    else:
                        model = SimpleExpSmoothing(history)
                        model_fit = model.fit()
                        yhat = model_fit.forecast()
                        df2.loc[len(data) - 2 - idx, column] = yhat[0]
        frames.append(df2)
    return pd.concat(frames, ignore_index=True)
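
# Usage sketch (hypothetical values): interior and trailing NaNs are filled
# on the forward pass, the leading NaN on the backward pass.
#
#   df = pd.DataFrame({'store': 'A', 'year': range(2015, 2021),
#                      'sales': [np.nan, 3.0, 4.0, np.nan, 6.0, np.nan]})
#   SES(df, 'store', 'year', 'sales')   # returns a copy with 'sales' fully filled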

def Estimate(df, category, time_range, column):
    """
    Impute one column per category. Stationary series are filled with the
    ARIMA order that minimises the squared error on the last observed
    value; non-stationary series fall back to exponential smoothing.
    """
    from statsmodels.tsa.arima_model import ARIMA  # legacy API, requires statsmodels < 0.13
    frames = []
    df_MSE = pd.DataFrame(columns=[category, 'Column Name', 'Order', 'MSE'])
    df_cm = ADF(df, category, time_range)
    df_op = Order_Permutation(df, category, time_range)
    for y in df[category].unique():
        df_cd = df_cm[(df_cm[category] == y) & (df_cm['Column Name'] == column)]
        stationarity_c = df_cd['Stationarity'].iloc[0]
        if stationarity_c == 'stationary':
            print(y, column, 'stationary')
            df2 = df[df[category] == y].reset_index(drop=True)
            while df2[column].isna().sum() > 0:
                # score each candidate order by its one-step error on the last observed value
                mse_rows = []
                for index, row in df_op.iterrows():
                    if row[category] == y and row['Column Name'] == column:
                        data = df2[column].tolist()
                        for idx in range(len(data) - 1):
                            if not np.isnan(data[idx]) and np.isnan(data[idx + 1]):
                                data_list = [m for m in data[:idx + 1] if not np.isnan(m)]
                                train_data = data_list[:-1]
                                test_data = data_list[-1:]
                                try:
                                    model = ARIMA(train_data, order=row['Order'])
                                    model_fit = model.fit(disp=False)
                                except Exception:
                                    # skip candidate orders that fail to fit
                                    continue
                                yhat = model_fit.predict(len(train_data), len(train_data))
                                squared_error = np.square(test_data[0] - yhat[0])
                                mse_rows.append([y, column, row['Order'], squared_error])
                df_MSE = pd.concat([df_MSE,
                                    pd.DataFrame(mse_rows, columns=[category, 'Column Name', 'Order', 'MSE'])],
                                   ignore_index=True)
                # pick the order with the smallest recorded error
                df_each = df_MSE[(df_MSE[category] == y) & (df_MSE['Column Name'] == column)]
                mse_values = pd.to_numeric(df_each['MSE'], errors='coerce')
                order = df_each['Order'][mse_values == mse_values.min()].iloc[0]
                # forward pass: fill each NaN after an observed value with the best order
                data = df2[column].tolist()
                for idx in range(len(data) - 1):
                    if not np.isnan(data[idx]) and np.isnan(data[idx + 1]):
                        data_list = [m for m in data[:idx + 1] if not np.isnan(m)]
                        print('NaN location %s %s %s' % (y, column, idx + 1))
                        print('Best Model: ARIMA %s' % (order,))
                        model = ARIMA(data_list, order=order)
                        model_fit = model.fit(disp=False, transparams=False)
                        # obtain predicted value
                        yhat = model_fit.predict(len(data_list), len(data_list))
                        df2.loc[idx + 1, column] = yhat[0]
                # backward pass: the same procedure on the reversed series
                data = df2[column].tolist()[::-1]
                for idx in range(len(data) - 1):
                    if not np.isnan(data[idx]) and np.isnan(data[idx + 1]):
                        data_list = [m for m in data[:idx + 1] if not np.isnan(m)]
                        print('NaN location %s %s %s' % (y, column, len(data) - 2 - idx))
                        print('Best Model: ARIMA %s' % (order,))
                        model = ARIMA(data_list, order=order)
                        model_fit = model.fit(disp=False, transparams=False)
                        # obtain predicted value
                        yhat = model_fit.predict(len(data_list), len(data_list))
                        df2.loc[len(data) - 2 - idx, column] = yhat[0]
            frames.append(df2)
        elif stationarity_c == 'not stationary':
            print(y, column, 'not stationary')
            print('Best model: Exponential Smoothing')
            df2 = df[df[category] == y]
            frames.append(SES(df2, category, time_range, column))
    return pd.concat(frames, ignore_index=True)
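
# The model-selection step above boils down to this pattern (legacy
# statsmodels < 0.13 API; values hypothetical): hold out the last observed
# point, forecast it one step ahead, and score the candidate order.
#
#   from statsmodels.tsa.arima_model import ARIMA
#   history = [3.0, 3.4, 3.1, 3.6, 3.8]
#   fit = ARIMA(history[:-1], order=(1, 0, 0)).fit(disp=False)
#   yhat = fit.predict(len(history) - 1, len(history) - 1)
#   squared_error = (history[-1] - yhat[0]) ** 2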

def Run(df, category, time_range):
    """
    Impute every data column in turn, leaving the category and time
    columns untouched.
    """
    for col in df.columns:
        if col != category and col != time_range:
            df = Estimate(df, category, time_range, col)
    return df
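
# End-to-end usage (a sketch; file and column names are hypothetical):
# expand the raw panel to the full (category, time) grid first, then
# impute every data column.
#
#   raw = pd.read_csv('panel.csv')            # columns: store, year, sales, ...
#   full = get_data(raw, 'store', 'year')
#   imputed = Run(full, 'store', 'year')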