data_generator.py
import os
import gzip
import shutil

import numpy as np
import pandas as pd


class DataGenerator:
    """Extracts raw patient recordings and assembles a single step/context dataset."""

    def __init__(
        self,
        folder_path: str,
        info_path: str,
        verbose: bool = False,
    ) -> None:
        self._folder_path = folder_path
        self._info_path = info_path
        self._verbose = verbose
        self._info_df = pd.DataFrame([])
        self._full_df = pd.DataFrame([])
        # The subject info sheet must be loaded before the per-subject arrays
        # are built; otherwise every subject is skipped for lacking a cohort.
        self._read_info_missing()
        self._context_arr, self._steps_arr = self._create_context_data()
    def _extract_files(self):
        """Decompress every .gz recording found under each patient sub-folder."""
        for p in os.listdir(self._folder_path):
            i = 0
            subfold = os.path.join(self._folder_path, p)
            for f in os.listdir(subfold):
                if f.endswith('.gz'):
                    i += 1
                    filename = os.path.join(subfold, f)
                    extr_filename = filename.split('.gz')[0]
                    with gzip.open(filename, 'rb') as f_in:
                        with open(extr_filename, 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
            if self._verbose:
                print(f'Extracted {i} files for patient {p}')
    def _read_info_missing(self):
        """Load the subject info sheet and normalise missing values to NaN."""
        self._info_df = pd.read_csv(self._info_path)
        self._info_df.drop(columns='Unique ID ', inplace=True)  # trailing space matches the source CSV header
        self._info_df.replace('-', np.nan, inplace=True)
        return self
    def _create_context_data(self):
        """Collect per-subject step counts and context values into two arrays."""
        ctx_l, s_l = [], []
        for p in os.listdir(self._folder_path):
            try:
                cohort = self._info_df[self._info_df['ID'] == int(p)]['Cohort'].values[0]
                print('Processing subject: ', p)
                subfold = os.path.join(self._folder_path, p)
                for f in os.listdir(subfold):
                    if 'Day' in f:
                        if f.endswith('.json') and 'step' in f:
                            steps_file = pd.read_json(os.path.join(subfold, f))
                            s_l.append([
                                [p, cohort, f.split('-')[3], float(el)]
                                for el in steps_file['data'][0]['steps']])
                        elif f.endswith('.json') and 'Context' in f:
                            json_ctx_file = pd.read_json(os.path.join(subfold, f))
                            ctx_l.append([
                                [k, json_ctx_file['data'][0]['contextValues'][k][0]]
                                for k in json_ctx_file['data'][0]['contextValues']])
            except (KeyError, IndexError, ValueError):
                # Skip folders that are not listed in the info sheet or are malformed.
                continue
        return np.array(ctx_l), np.array(s_l)
    def _reshape_data(self, arr, last_shape):
        # Flatten the (subjects, records, fields) array into a 2-D table so it
        # can be loaded into a DataFrame.
        return np.reshape(arr, (arr.shape[0] * arr.shape[1], last_shape))
    def fit(self):
        """Build the combined step/context table."""
        ctx_df = pd.DataFrame(
            self._reshape_data(self._context_arr, 2),
            columns=['Timestamp', 'IndoorProb'])
        step_df = pd.DataFrame(
            self._reshape_data(self._steps_arr, 4),
            columns=['Patient', 'Cohort', 'Day', 'StepPerSec'])
        self._full_df = pd.concat([step_df, ctx_df], axis=1)
        self._full_df.dropna(inplace=True)
        return self
    def save_results(self, output_path):
        """Cast step counts to float32 and write the filtered dataset to CSV."""
        # Drop rows with an indoor probability of exactly 50 before saving.
        self._full_df = self._full_df[self._full_df['IndoorProb'] != 50]
        self._full_df['StepPerSec'] = self._full_df['StepPerSec'].astype('float32')
        self._full_df.to_csv(output_path)
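

# Minimal usage sketch. The paths below are assumptions for illustration only:
# a folder of per-patient sub-folders containing the day-level JSON recordings
# (already decompressed, e.g. via _extract_files) and a subject info CSV with
# 'ID' and 'Cohort' columns. Adapt them to the actual dataset before running.
if __name__ == '__main__':
    generator = DataGenerator(
        folder_path='data/patients',        # hypothetical data folder
        info_path='data/subject_info.csv',  # hypothetical info sheet
        verbose=True,
    )
    generator.fit().save_results('data/steps_context.csv')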