-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactored notebooks; added a data generator script.
- Loading branch information
Showing
7 changed files
with
149 additions
and
172 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
from data_generator import DataGenerator | ||
import argparse | ||
|
||
def main(args):
    """Build the weather-analysis dataset from a raw data folder.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed CLI arguments with attributes ``folder_path``,
        ``info_path`` and ``output``.
    """
    folder_path = args.folder_path  # e.g., 'mobilised-contextual-factors-v1'
    info_path = args.info_path      # e.g., 'CF_RWS_missingfiles-Sheet1.csv'
    output_path = args.output       # e.g., 'full_df.csv'

    cleaner = DataGenerator(folder_path, info_path)
    cleaner.fit()
    # BUG FIX: the class defines ``save_results``; the original call to
    # ``save_resuts`` raised AttributeError at runtime.
    cleaner.save_results(output_path)
|
||
|
||
if __name__ == "__main__":
    # Command-line interface: all three paths are mandatory.
    cli = argparse.ArgumentParser(
        description='Run the data generation from raw data folder.'
    )
    cli.add_argument(
        '--folder_path',
        required=True,
        action='store',
        help="root of the subjects' contextualized factors data.",
    )
    cli.add_argument(
        '--info_path',
        required=True,
        action='store',
        help="root of the statistics Excel file about missing data/files. Must be in .csv",
    )
    cli.add_argument(
        '--output',
        required=True,
        action='store',
        help='root where to save the data to be used for weather analysis',
    )
    main(cli.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import os | ||
import pandas as pd | ||
import numpy as np | ||
import gzip | ||
import shutil | ||
|
||
|
||
class DataGenerator():
    """Assemble a per-record step/context dataset from raw subject folders.

    The generator walks ``folder_path`` (one sub-folder per subject),
    reads each subject's step-count and context JSON files, and joins
    them with the cohort information from the ``info_path`` CSV.
    Call :meth:`fit` to build the combined frame, then
    :meth:`save_results` to write it to disk.
    """

    def __init__(
            self,
            folder_path: str,
            info_path: str,
            verbose: bool = False,
    ) -> None:
        # folder_path: root directory holding one sub-folder per subject.
        # info_path:   CSV with per-subject info; must contain the 'ID' and
        #              'Cohort' columns consumed by _create_context_data().
        self._folder_path = folder_path
        self._info_path = info_path
        self._verbose = verbose

        self._info_df = pd.DataFrame([])
        self._full_df = pd.DataFrame([])

        # BUG FIX: the info CSV must be loaded *before* the context data is
        # built — _create_context_data() looks up each subject's cohort in
        # self._info_df, which the original left empty, so the 'ID' lookup
        # raised KeyError and the except clause silently skipped every
        # subject, yielding empty arrays.
        self._read_info_missing()
        self._context_arr, self._steps_arr = self._create_context_data()

    def _extract_files(self):
        """Decompress every ``.gz`` file found in each subject sub-folder.

        The extracted file is written next to the archive, with the
        ``.gz`` suffix stripped from its name.
        """
        for p in os.listdir(self._folder_path):
            i = 0
            subfold = os.path.join(self._folder_path, p)
            for f in os.listdir(subfold):
                i += 1
                if f.endswith('.gz'):
                    filename = os.path.join(subfold, f)
                    extr_filename = filename.split('.gz')[0]
                    with gzip.open(filename, 'rb') as f_in:
                        with open(extr_filename, 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
            if self._verbose:
                print(f'Extracted {i} files for patient {p}')

    def _read_info_missing(self):
        """Load the missing-data info CSV into ``self._info_df``.

        Drops the (unused) 'Unique ID ' column — note the trailing space
        is part of the column name in the source file — and normalizes
        '-' placeholders to NaN.
        """
        self._info_df = pd.read_csv(self._info_path)
        self._info_df.drop(columns='Unique ID ', inplace=True)
        self._info_df.replace('-', np.nan, inplace=True)

        return self

    def _create_context_data(self):
        """Collect step and context records for every subject folder.

        Returns
        -------
        tuple[np.ndarray, np.ndarray]
            ``(context_arr, steps_arr)`` where each context record is
            ``[timestamp_key, indoor_prob]`` and each step record is
            ``[patient, cohort, day, steps_per_sec]``.
        """
        ctx_l, s_l = [], []

        for p in os.listdir(self._folder_path):
            # Best-effort per subject: a malformed folder/file must not
            # abort the whole run.  The original used a bare ``except``;
            # we narrow it and surface the problem instead of hiding it.
            try:
                cohort = self._info_df[self._info_df['ID'] == int(p)]['Cohort'].values[0]
                print('Processing subject: ', p)
                subfold = os.path.join(self._folder_path, p)
                for f in os.listdir(subfold):
                    if 'Day' in f:
                        if f.endswith('.json') and 'step' in f:
                            steps_file = pd.read_json(os.path.join(subfold, f))
                            # f.split('-')[3] is assumed to be the day token
                            # of the file name — TODO confirm naming scheme.
                            s_l.append([[p, cohort, f.split('-')[3], float(el)]
                                        for el in steps_file['data'][0]['steps']])
                        elif f.endswith('.json') and 'Context' in f:
                            json_ctx_file = pd.read_json(os.path.join(subfold, f))
                            ctx_l.append([
                                [k, json_ctx_file['data'][0]['contextValues'][k][0]]
                                for k in json_ctx_file['data'][0]['contextValues']])
            except Exception as exc:
                if self._verbose:
                    print(f'Skipping entry {p}: {exc!r}')
                continue

        return np.array(ctx_l), np.array(s_l)

    def _reshape_data(self, arr, last_shape):
        """Flatten a 3-D record array into 2-D rows of ``last_shape`` columns.

        BUG FIX: the original reshaped to
        ``(arr.shape[0], arr.shape[1], last_shape)`` — still 3-D — which
        ``pd.DataFrame`` rejects ("Must pass 2-D input").  Collapsing the
        first two axes yields one row per record as intended.
        """
        return np.reshape(arr, (-1, last_shape))

    def fit(self):
        """Build ``self._full_df`` by joining step and context records.

        Returns ``self`` so calls can be chained.
        """
        ctx_df = pd.DataFrame(
            self._reshape_data(self._context_arr, 2),
            columns=['Timestamp', 'IndoorProb'])
        # BUG FIX: the step frame must come from the *steps* array; the
        # original reused self._context_arr here (copy-paste error), which
        # has the wrong content and the wrong element count for 4 columns.
        step_df = pd.DataFrame(
            self._reshape_data(self._steps_arr, 4),
            columns=['Patient', 'Cohort', 'Day', 'StepPerSec'])

        self._full_df = pd.concat([step_df, ctx_df], axis=1)
        self._full_df.dropna(inplace=True)

        return self

    def save_results(self, output_path):
        """Filter ambiguous indoor/outdoor rows and write the CSV.

        Rows with ``IndoorProb == 50`` (undecidable location) are dropped
        before saving.
        """
        self._full_df = self._full_df[self._full_df['IndoorProb'] != 50]
        self._full_df['StepPerSec'] = self._full_df['StepPerSec'].astype('float32')
        self._full_df.to_csv(output_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters