diff --git a/data_generation.py b/data_generation.py new file mode 100644 index 0000000..a674501 --- /dev/null +++ b/data_generation.py @@ -0,0 +1,34 @@ +from data_generator import DataGenerator +import argparse + +def main(args): + folder_path = args.folder_path # e.g., 'mobilised-contextual-factors-v1' + info_path = args.info_path # e.g., 'CF_RWS_missingfiles-Sheet1.csv' + output_path = args.output # e.g., 'full_df.csv' + + cleaner = DataGenerator(folder_path, info_path) + cleaner.fit() + cleaner.save_results(output_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Run the data generation from raw data folder.' + ) + parser.add_argument('--folder_path', + required=True, + action='store', + help="root of the subjects' contextualized factors data.") + + parser.add_argument('--info_path', + required=True, + action='store', + help="root of the statistics Excel file about missing data/files. Must be in .csv") + + parser.add_argument('--output', + required=True, + action='store', + help='root where to save the data to be used for weather analysis') + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/data_generator.py b/data_generator.py new file mode 100644 index 0000000..04d0d1f --- /dev/null +++ b/data_generator.py @@ -0,0 +1,100 @@ +import os +import pandas as pd +import numpy as np +import gzip +import shutil + + +class DataGenerator(): + + def __init__( + self, + folder_path:str, + info_path:str, + verbose:bool = False, + ) -> None: + + self._folder_path = folder_path + self._info_path = info_path + self._verbose = verbose + + self._info_df = pd.DataFrame([]) + self._full_df = pd.DataFrame([]) + + self._context_arr, self._steps_arr = self._read_info_missing()._create_context_data() + + + def _extract_files(self): + for p in os.listdir(self._folder_path): + i=0 + subfold = os.path.join(self._folder_path, p) + for f in os.listdir(subfold): + i+=1 + if f.endswith('.gz'): + filename = os.path.join(subfold, f) + 
extr_filename = filename.split('.gz')[0] + with gzip.open(filename, 'rb') as f_in: + with open(extr_filename, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + if self._verbose: + print(f'Extracted {i} files for patient {p}') + + + def _read_info_missing(self): + self._info_df = pd.read_csv(self._info_path) + self._info_df.drop(columns='Unique ID ', inplace=True) + self._info_df.replace('-', np.nan, inplace=True) + + return self + + + def _create_context_data(self): + ctx_l, s_l = [], [] + + for p in os.listdir(self._folder_path): + try: + cohort = self._info_df[self._info_df['ID']==int(p)]['Cohort'].values[0] + print('Processing subject: ', p) + subfold = os.path.join(self._folder_path, p) + for f in os.listdir(subfold): + if 'Day' in f: + if f.endswith('.json') and 'step' in f: + steps_file = pd.read_json(os.path.join(subfold, f)) + s_l.append([[p, cohort, f.split('-')[3], float(el)] for el in steps_file['data'][0]['steps']]) + elif f.endswith('.json') and 'Context' in f: + json_ctx_file = pd.read_json(os.path.join(subfold, f)) + ctx_l.append([ + [k, json_ctx_file['data'][0]['contextValues'][k][0]] + for k in json_ctx_file['data'][0]['contextValues']]) + except Exception: + continue + + return np.array(ctx_l), np.array(s_l) + + + def _reshape_data(self, arr, last_shape): + return ( + np.reshape( + arr, + (arr.shape[0]*arr.shape[1], last_shape)) + ) + + + def fit(self): + ctx_df = pd.DataFrame( + self._reshape_data(self._context_arr, 2), + columns=['Timestamp', 'IndoorProb']) + step_df = pd.DataFrame( + self._reshape_data(self._steps_arr, 4), + columns=['Patient', 'Cohort', 'Day', 'StepPerSec']) + + self._full_df = pd.concat([step_df, ctx_df], axis=1) + self._full_df.dropna(inplace=True) + + return self + + + def save_results(self, output_path): + self._full_df = self._full_df[self._full_df['IndoorProb']!=50] + self._full_df['StepPerSec'] = self._full_df['StepPerSec'].astype('float32') + self._full_df.to_csv(output_path) diff --git a/main.py b/main.py index 
3fb4718..9b067ab 100644 --- a/main.py +++ b/main.py @@ -2,9 +2,9 @@ import argparse def main(args): - path = args.path - threshold = args.threshold - output_path = args.output + path = args.path # e.g., 'full_df.csv' + threshold = args.threshold # e.g., 0.5 + output_path = args.output # e.g., 'weather_analysis_05_thresh.csv' cleaner = CleanerExtractor(path, threshold) cleaner.fit() diff --git a/notebooks/10s_windowing.ipynb b/notebooks/10s_windowing.ipynb index 05a6cbc..44d0434 100644 --- a/notebooks/10s_windowing.ipynb +++ b/notebooks/10s_windowing.ipynb @@ -7,11 +7,8 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import matplotlib\n", "import seaborn as sns\n", "from pylab import rcParams\n", - "import os\n", - "import gzip\n", "from tqdm import tqdm\n", "import pandas as pd\n", "import numpy as np" @@ -271,22 +268,9 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'openpyxl'", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn [10], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfile_for_missing_steps.xlsx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m writer:\n\u001b[0;32m 2\u001b[0m missing_df\u001b[38;5;241m.\u001b[39mto_excel(writer, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10s_steps\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "File 
\u001b[1;32mc:\\Users\\marci\\miniconda3\\lib\\site-packages\\pandas\\io\\excel\\_openpyxl.py:49\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[1;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\n\u001b[0;32m 37\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[0;32m 38\u001b[0m path,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 47\u001b[0m ):\n\u001b[0;32m 48\u001b[0m \u001b[39m# Use the openpyxl module as the Excel writer.\u001b[39;00m\n\u001b[1;32m---> 49\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mopenpyxl\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mworkbook\u001b[39;00m \u001b[39mimport\u001b[39;00m Workbook\n\u001b[0;32m 51\u001b[0m engine_kwargs \u001b[39m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[0;32m 53\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__init__\u001b[39m(\n\u001b[0;32m 54\u001b[0m path,\n\u001b[0;32m 55\u001b[0m mode\u001b[39m=\u001b[39mmode,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 58\u001b[0m engine_kwargs\u001b[39m=\u001b[39mengine_kwargs,\n\u001b[0;32m 59\u001b[0m )\n", - "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'openpyxl'" - ] - } - ], + "outputs": [], "source": [ "with pd.ExcelWriter('file_for_missing_steps.xlsx') as writer:\n", " missing_df.to_excel(writer, sheet_name='10s_steps')" diff --git a/notebooks/first_test.ipynb b/notebooks/first_test.ipynb index acbd4ae..3cdddcb 100644 --- a/notebooks/first_test.ipynb +++ b/notebooks/first_test.ipynb @@ -640,47 +640,6 @@ "info_df.loc[info_df['ID']==3011]" ] }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "#nans_l = info_df.isnull().sum(axis=1)\n", - "#type(nans_l)\n", - "#idx_val = nans_l.where(nans_l<2).dropna().index.to_list()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - 
"outputs": [], - "source": [ - "#val_df = info_df[info_df.index.isin(idx_val)]\n", - "#val_df.head(10)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "#print([el for el in info_df.ID.values if el not in val_df.ID.values])" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "#val_df.dropna(axis=1, inplace=True)\n", - "#val_df = val_df[(val_df.T !='0').all()].reset_index()\n", - "#val_df.head(len(val_df))" - ] - }, { "cell_type": "code", "execution_count": 13, @@ -867,84 +826,7 @@ "outputs": [], "source": [ "ctx_array = np.reshape(ctx_array, (ctx_array.shape[0]*ctx_array.shape[1], 2))\n", - "s_array = np.reshape(s_array, (s_array.shape[0]*s_array.shape[1], 4))" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - " | Timestamp | \n", - "IndoorProb | \n", - "
---|---|---|
0 | \n", - "1597273200 | \n", - "100 | \n", - "
1 | \n", - "1597273201 | \n", - "100 | \n", - "
2 | \n", - "1597273202 | \n", - "100 | \n", - "
3 | \n", - "1597273203 | \n", - "100 | \n", - "
4 | \n", - "1597273204 | \n", - "100 | \n", - "