From 90ac881f0096ed3234d34caa92ee47fb619468c4 Mon Sep 17 00:00:00 2001
From: SanBast
Date: Wed, 19 Apr 2023 12:31:51 +0200
Subject: [PATCH] Refactor notebooks and add a data generator script

---
 data_generation.py            |  34 ++++++++
 data_generator.py             | 100 +++++++++++++++++++++++
 main.py                       |   6 +-
 notebooks/10s_windowing.ipynb |  20 +----
 notebooks/first_test.ipynb    | 150 +---------------------------------
 notebooks/main                |   1 -
 weather_analysis.py           |  10 ++-
 7 files changed, 149 insertions(+), 172 deletions(-)
 create mode 100644 data_generation.py
 create mode 100644 data_generator.py
 delete mode 100644 notebooks/main

diff --git a/data_generation.py b/data_generation.py
new file mode 100644
index 0000000..a674501
--- /dev/null
+++ b/data_generation.py
@@ -0,0 +1,34 @@
+from data_generator import DataGenerator
+import argparse
+
+def main(args):
+    folder_path = args.folder_path  # e.g., 'mobilised-contextual-factors-v1'
+    info_path = args.info_path  # e.g., 'CF_RWS_missingfiles-Sheet1.csv'
+    output_path = args.output  # e.g., 'full_df.csv'
+
+    generator = DataGenerator(folder_path, info_path)
+    generator.fit()
+    generator.save_results(output_path)
+
+
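+# Example invocation, using the example paths from the comments in main():
+#   python data_generation.py \
+#       --folder_path mobilised-contextual-factors-v1 \
+#       --info_path CF_RWS_missingfiles-Sheet1.csv \
+#       --output full_df.csv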
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Run the data generation step on the raw data folder.'
+    )
+    parser.add_argument('--folder_path',
+                        required=True,
+                        action='store',
+                        help="path to the root folder with the subjects' contextual-factors data")
+
+    parser.add_argument('--info_path',
+                        required=True,
+                        action='store',
+                        help="path to the statistics sheet about missing data/files, exported as CSV")
+
+    parser.add_argument('--output',
+                        required=True,
+                        action='store',
+                        help='path where the CSV consumed by the weather analysis is saved')
+
+    args = parser.parse_args()
+    main(args)
\ No newline at end of file
diff --git a/data_generator.py b/data_generator.py
new file mode 100644
index 0000000..04d0d1f
--- /dev/null
+++ b/data_generator.py
@@ -0,0 +1,100 @@
+import os
+import gzip
+import shutil
+
+import numpy as np
+import pandas as pd
+
+
+class DataGenerator:
+
+    def __init__(
+        self,
+        folder_path: str,
+        info_path: str,
+        verbose: bool = False,
+    ) -> None:
+
+        self._folder_path = folder_path
+        self._info_path = info_path
+        self._verbose = verbose
+
+        self._info_df = pd.DataFrame([])
+        self._full_df = pd.DataFrame([])
+
+        # the info file must be loaded first: _create_context_data()
+        # looks up each subject's cohort in it
+        self._read_info_missing()
+        self._context_arr, self._steps_arr = self._create_context_data()
+
+    def _extract_files(self):
+        for p in os.listdir(self._folder_path):
+            i = 0
+            subfold = os.path.join(self._folder_path, p)
+            for f in os.listdir(subfold):
+                if f.endswith('.gz'):
+                    i += 1
+                    filename = os.path.join(subfold, f)
+                    extr_filename = filename.split('.gz')[0]
+                    with gzip.open(filename, 'rb') as f_in:
+                        with open(extr_filename, 'wb') as f_out:
+                            shutil.copyfileobj(f_in, f_out)
+            if self._verbose:
+                print(f'Extracted {i} files for patient {p}')
+
+    def _read_info_missing(self):
+        self._info_df = pd.read_csv(self._info_path)
+        # the trailing space in 'Unique ID ' matches the source sheet
+        self._info_df.drop(columns='Unique ID ', inplace=True)
+        self._info_df.replace('-', np.nan, inplace=True)
+
+        return self
+
+    def _create_context_data(self):
+        ctx_l, s_l = [], []
+
+        for p in os.listdir(self._folder_path):
+            try:
+                cohort = self._info_df[self._info_df['ID'] == int(p)]['Cohort'].values[0]
+                if self._verbose:
+                    print('Processing subject: ', p)
+                subfold = os.path.join(self._folder_path, p)
+                for f in os.listdir(subfold):
+                    if 'Day' in f:
+                        if f.endswith('.json') and 'step' in f:
+                            steps_file = pd.read_json(os.path.join(subfold, f))
+                            s_l.append([[p, cohort, f.split('-')[3], float(el)]
+                                        for el in steps_file['data'][0]['steps']])
+                        elif f.endswith('.json') and 'Context' in f:
+                            json_ctx_file = pd.read_json(os.path.join(subfold, f))
+                            ctx_l.append([
+                                [k, json_ctx_file['data'][0]['contextValues'][k][0]]
+                                for k in json_ctx_file['data'][0]['contextValues']])
+            except (KeyError, IndexError, ValueError, OSError):
+                # skip entries that are not subject folders, have no cohort
+                # in the info sheet, or contain unreadable files
+                continue
+
+        return np.array(ctx_l), np.array(s_l)
+
+    def _reshape_data(self, arr, last_shape):
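+        # flatten the per-file grouping into one row per entry:
+        # (n_files, n_entries, last_shape) -> (n_files * n_entries, last_shape)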
+        return np.reshape(arr, (arr.shape[0] * arr.shape[1], last_shape))
+
+    def fit(self):
+        ctx_df = pd.DataFrame(
+            self._reshape_data(self._context_arr, 2),
+            columns=['Timestamp', 'IndoorProb'])
+        step_df = pd.DataFrame(
+            self._reshape_data(self._steps_arr, 4),
+            columns=['Patient', 'Cohort', 'Day', 'StepPerSec'])
+
+        self._full_df = pd.concat([step_df, ctx_df], axis=1)
+        self._full_df.dropna(inplace=True)
+
+        return self
+
+    def save_results(self, output_path):
+        self._full_df = self._full_df[self._full_df['IndoorProb'] != 50]
+        self._full_df['StepPerSec'] = self._full_df['StepPerSec'].astype('float32')
+        self._full_df.to_csv(output_path)
diff --git a/main.py b/main.py
index 3fb4718..9b067ab 100644
--- a/main.py
+++ b/main.py
@@ -2,9 +2,9 @@ import argparse
 
 def main(args):
-    path = args.path
-    threshold = args.threshold
-    output_path = args.output
+    path = args.path  # e.g., 'full_df.csv'
+    threshold = args.threshold  # e.g., 0.5
+    output_path = args.output  # e.g., 'weather_analysis_05_thresh.csv'
 
     cleaner = CleanerExtractor(path, threshold)
     cleaner.fit()
diff --git a/notebooks/10s_windowing.ipynb b/notebooks/10s_windowing.ipynb
index 05a6cbc..44d0434 100644
--- a/notebooks/10s_windowing.ipynb
+++ b/notebooks/10s_windowing.ipynb
@@ -7,11 +7,8 @@
    "outputs": [],
    "source": [
     "import matplotlib.pyplot as plt\n",
-    "import matplotlib\n",
     "import seaborn as sns\n",
     "from pylab import rcParams\n",
-    "import os\n",
-    "import gzip\n",
     "from tqdm import tqdm\n",
     "import pandas as pd\n",
     "import numpy as np"
@@ -271,22 +268,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "ModuleNotFoundError",
-     "evalue": "No module named 'openpyxl'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
-      "Cell \u001b[1;32mIn [10], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfile_for_missing_steps.xlsx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m writer:\n\u001b[0;32m      2\u001b[0m missing_df\u001b[38;5;241m.\u001b[39mto_excel(writer, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10s_steps\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
-      "File \u001b[1;32mc:\\Users\\marci\\miniconda3\\lib\\site-packages\\pandas\\io\\excel\\_openpyxl.py:49\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[1;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[0;32m     36\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\n\u001b[0;32m     37\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[0;32m     38\u001b[0m path,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     47\u001b[0m ):\n\u001b[0;32m     48\u001b[0m \u001b[39m# Use the openpyxl module as the Excel writer.\u001b[39;00m\n\u001b[1;32m---> 49\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mopenpyxl\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mworkbook\u001b[39;00m \u001b[39mimport\u001b[39;00m Workbook\n\u001b[0;32m     51\u001b[0m engine_kwargs \u001b[39m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[0;32m     53\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__init__\u001b[39m(\n\u001b[0;32m     54\u001b[0m path,\n\u001b[0;32m     55\u001b[0m mode\u001b[39m=\u001b[39mmode,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     58\u001b[0m engine_kwargs\u001b[39m=\u001b[39mengine_kwargs,\n\u001b[0;32m     59\u001b[0m )\n",
-      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'openpyxl'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "with pd.ExcelWriter('file_for_missing_steps.xlsx') as writer:\n",
     "    missing_df.to_excel(writer, sheet_name='10s_steps')"
diff --git a/notebooks/first_test.ipynb b/notebooks/first_test.ipynb
index acbd4ae..3cdddcb 100644
--- a/notebooks/first_test.ipynb
+++ b/notebooks/first_test.ipynb
@@ -640,47 +640,6 @@
     "info_df.loc[info_df['ID']==3011]"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#nans_l = info_df.isnull().sum(axis=1)\n",
-    "#type(nans_l)\n",
-    "#idx_val = nans_l.where(nans_l<2).dropna().index.to_list()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#val_df = info_df[info_df.index.isin(idx_val)]\n",
-    "#val_df.head(10)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#print([el for el in info_df.ID.values if el not in val_df.ID.values])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#val_df.dropna(axis=1, inplace=True)\n",
-    "#val_df = val_df[(val_df.T !='0').all()].reset_index()\n",
-    "#val_df.head(len(val_df))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 13,
@@ -867,84 +826,7 @@
    "outputs": [],
    "source": [
     "ctx_array = np.reshape(ctx_array, (ctx_array.shape[0]*ctx_array.shape[1], 2))\n",
-    "s_array = np.reshape(s_array, (s_array.shape[0]*s_array.shape[1], 4))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TimestampIndoorProb
01597273200100
11597273201100
21597273202100
31597273203100
41597273204100
\n", - "
" - ], - "text/plain": [ - " Timestamp IndoorProb\n", - "0 1597273200 100\n", - "1 1597273201 100\n", - "2 1597273202 100\n", - "3 1597273203 100\n", - "4 1597273204 100" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ + "s_array = np.reshape(s_array, (s_array.shape[0]*s_array.shape[1], 4))\n", "ctx_df = pd.DataFrame(ctx_array, columns=['Timestamp', 'IndoorProb'])\n", "step_df = pd.DataFrame(s_array, columns=['Patient', 'Cohort', 'Day', 'StepPerSec'])\n", "\n", @@ -1052,33 +934,9 @@ "metadata": {}, "outputs": [], "source": [ - "full_df = full_df[full_df['IndoorProb']!=50]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#full_df.drop(columns='Patient', inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "full_df['StepPerSec'] = full_df['StepPerSec'].astype('float32')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "full_df = full_df[full_df['IndoorProb']!=50]\n", + "#full_df.drop(columns='Patient', inplace=True)\n", + "full_df['StepPerSec'] = full_df['StepPerSec'].astype('float32')\n", "full_df.to_csv('full_df.csv')" ] }, diff --git a/notebooks/main b/notebooks/main deleted file mode 100644 index 8b13789..0000000 --- a/notebooks/main +++ /dev/null @@ -1 +0,0 @@ - diff --git a/weather_analysis.py b/weather_analysis.py index 4e43e09..3a28b76 100644 --- a/weather_analysis.py +++ b/weather_analysis.py @@ -11,12 +11,14 @@ class CleanerExtractor(): def __init__( self, path: str, + folder_path: str, threshold: float ) -> None: # path for csv file that has: # timestamp, patient, day, cohort, step per sec, indoor prob - self.path = path + self.data_path = path + self.folder_path = folder_path self.threshold = threshold self.df = pd.DataFrame([]) @@ -29,7 +31,7 @@ def __init__( def _load_dataframe(self): - self.df = pd.read_csv(self.path) + self.df = pd.read_csv(self.data_path) #we are just interested in outdoor envs self.df.drop(self.df[self.df.IndoorProb != 100].index, inplace=True) @@ -60,9 +62,9 @@ def _extract_weather_statistics(self): non_valid_stats = [] num_days = list(range(1,8)) - for f in os.listdir(self.path): + for f in os.listdir(self.folder_path): if f.startswith('weather') and f.endswith('.json'): - w_file = pd.read_json(os.path.join(self.path, f)) + w_file = pd.read_json(os.path.join(self.folder_path, f)) # we remove every seen day day = f.split('-')[3]