Skip to content

Commit

Permalink
refactoring notebooks. added data generator script
Browse files Browse the repository at this point in the history
  • Loading branch information
SanBast committed Apr 19, 2023
1 parent b830b79 commit 90ac881
Show file tree
Hide file tree
Showing 7 changed files with 149 additions and 172 deletions.
34 changes: 34 additions & 0 deletions data_generation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from data_generator import DataGenerator
import argparse

def main(args):
    """Generate the full weather-analysis dataframe from the raw data folder.

    Args:
        args: parsed CLI namespace providing ``folder_path``, ``info_path``
            and ``output``.
    """
    folder_path = args.folder_path  # e.g., 'mobilised-contextual-factors-v1'
    info_path = args.info_path      # e.g., 'CF_RWS_missingfiles-Sheet1.csv'
    output_path = args.output       # e.g., 'full_df.csv'

    cleaner = DataGenerator(folder_path, info_path)
    cleaner.fit()
    # BUG FIX: was `save_resuts`, which does not exist on DataGenerator
    # (the class defines `save_results`) and raised AttributeError.
    cleaner.save_results(output_path)


if __name__ == "__main__":
    # Command-line entry point: collect the three required paths, then
    # hand the parsed namespace straight to main().
    cli = argparse.ArgumentParser(
        description='Run the data generation from raw data folder.'
    )
    cli.add_argument(
        '--folder_path',
        required=True,
        action='store',
        help="root of the subjects' contextualized factors data.")
    cli.add_argument(
        '--info_path',
        required=True,
        action='store',
        help="root of the statistics Excel file about missing data/files. Must be in .csv")
    cli.add_argument(
        '--output',
        required=True,
        action='store',
        help='root where to save the data to be used for weather analysis')
    main(cli.parse_args())
100 changes: 100 additions & 0 deletions data_generator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import os
import pandas as pd
import numpy as np
import gzip
import shutil


class DataGenerator():
    """Builds the dataframe used for the weather analysis.

    Walks the per-subject sub-folders of ``folder_path``, pairs each
    subject's per-second step counts with the indoor-probability context
    values found in the daily JSON files, and exposes the merged result
    through ``fit()`` / ``save_results()``.
    """

    def __init__(
        self,
        folder_path: str,
        info_path: str,
        verbose: bool = False,
    ) -> None:
        # folder_path: root with one sub-folder per subject (folder name = ID)
        # info_path: csv with missing-data statistics (one row per subject)
        # verbose: print per-subject extraction progress
        self._folder_path = folder_path
        self._info_path = info_path
        self._verbose = verbose

        self._info_df = pd.DataFrame([])
        self._full_df = pd.DataFrame([])

        # BUG FIX: the info csv must be loaded *before* the context data is
        # built — _create_context_data() looks up each subject's cohort in
        # self._info_df, and with the frame still empty every lookup failed
        # and was silently swallowed, so every subject was skipped.
        self._read_info_missing()
        self._context_arr, self._steps_arr = self._create_context_data()

    def _extract_files(self):
        """Decompress every ``.gz`` file found in the subject sub-folders."""
        for p in os.listdir(self._folder_path):
            i = 0
            subfold = os.path.join(self._folder_path, p)
            for f in os.listdir(subfold):
                i += 1
                if f.endswith('.gz'):
                    filename = os.path.join(subfold, f)
                    extr_filename = filename.split('.gz')[0]
                    with gzip.open(filename, 'rb') as f_in:
                        with open(extr_filename, 'wb') as f_out:
                            shutil.copyfileobj(f_in, f_out)
            if self._verbose:
                # NOTE(review): `i` counts every file in the sub-folder, not
                # only the .gz archives actually extracted — confirm intent.
                print(f'Extracted {i} files for patient {p}')

    def _read_info_missing(self):
        """Load the missing-data statistics csv into ``self._info_df``."""
        self._info_df = pd.read_csv(self._info_path)
        # the source csv carries a trailing space in this column name
        self._info_df.drop(columns='Unique ID ', inplace=True)
        self._info_df.replace('-', np.nan, inplace=True)
        return self

    def _create_context_data(self):
        """Collect step and context records for every subject folder.

        Returns:
            (context, steps): numpy arrays built from, respectively,
            ``[timestamp_key, indoor_prob]`` pairs and
            ``[patient, cohort, day, steps_per_sec]`` rows.
        """
        ctx_l, s_l = [], []
        for p in os.listdir(self._folder_path):
            try:
                cohort = self._info_df[self._info_df['ID'] == int(p)]['Cohort'].values[0]
                print('Processing subject: ', p)
                subfold = os.path.join(self._folder_path, p)
                for f in os.listdir(subfold):
                    if 'Day' in f:
                        if f.endswith('.json') and 'step' in f:
                            steps_file = pd.read_json(os.path.join(subfold, f))
                            s_l.append([[p, cohort, f.split('-')[3], float(el)]
                                        for el in steps_file['data'][0]['steps']])
                        elif f.endswith('.json') and 'Context' in f:
                            json_ctx_file = pd.read_json(os.path.join(subfold, f))
                            ctx_l.append([
                                [k, json_ctx_file['data'][0]['contextValues'][k][0]]
                                for k in json_ctx_file['data'][0]['contextValues']])
            except Exception:
                # Best-effort scan: folders without an info-csv entry or with
                # non-numeric names are skipped on purpose. Was a bare
                # `except`, which also swallowed KeyboardInterrupt/SystemExit.
                continue
        return np.array(ctx_l), np.array(s_l)

    def _reshape_data(self, arr, last_shape):
        """Flatten the per-subject axis: ``(n, m, k) -> (n*m, k)``.

        BUG FIX: previously reshaped to ``(n, m, last_shape)`` — a no-op that
        left the array 3-D, which ``pd.DataFrame`` cannot ingest. The
        notebooks flatten the first two axes before building the dataframes.
        """
        return np.reshape(arr, (arr.shape[0] * arr.shape[1], last_shape))

    def fit(self):
        """Merge step and context records into ``self._full_df``.

        Returns:
            self, for chaining with ``save_results``.
        """
        ctx_df = pd.DataFrame(
            self._reshape_data(self._context_arr, 2),
            columns=['Timestamp', 'IndoorProb'])
        # BUG FIX: step_df was built from self._context_arr (copy/paste
        # error); the step records live in self._steps_arr.
        step_df = pd.DataFrame(
            self._reshape_data(self._steps_arr, 4),
            columns=['Patient', 'Cohort', 'Day', 'StepPerSec'])

        self._full_df = pd.concat([step_df, ctx_df], axis=1)
        self._full_df.dropna(inplace=True)
        return self

    def save_results(self, output_path):
        """Drop undecided indoor records and write the csv to output_path."""
        # NOTE(review): assumes IndoorProb == 50 means "undecided"; if the
        # column was stringified when the arrays were built, `!= 50` never
        # filters anything — confirm the dtype against the raw JSON.
        self._full_df = self._full_df[self._full_df['IndoorProb'] != 50]
        self._full_df['StepPerSec'] = self._full_df['StepPerSec'].astype('float32')
        self._full_df.to_csv(output_path)
6 changes: 3 additions & 3 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import argparse

def main(args):
path = args.path
threshold = args.threshold
output_path = args.output
path = args.path # e.g., 'full_df.csv'
threshold = args.threshold # e.g., 0.5
output_path = args.output # e.g., 'weather_analysis_05_thresh.csv'

cleaner = CleanerExtractor(path, threshold)
cleaner.fit()
Expand Down
20 changes: 2 additions & 18 deletions notebooks/10s_windowing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,8 @@
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"import seaborn as sns\n",
"from pylab import rcParams\n",
"import os\n",
"import gzip\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"import numpy as np"
Expand Down Expand Up @@ -271,22 +268,9 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'openpyxl'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn [10], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfile_for_missing_steps.xlsx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m writer:\n\u001b[0;32m 2\u001b[0m missing_df\u001b[38;5;241m.\u001b[39mto_excel(writer, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10s_steps\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
"File \u001b[1;32mc:\\Users\\marci\\miniconda3\\lib\\site-packages\\pandas\\io\\excel\\_openpyxl.py:49\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[1;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[0;32m 36\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\n\u001b[0;32m 37\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[0;32m 38\u001b[0m path,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 47\u001b[0m ):\n\u001b[0;32m 48\u001b[0m \u001b[39m# Use the openpyxl module as the Excel writer.\u001b[39;00m\n\u001b[1;32m---> 49\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mopenpyxl\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mworkbook\u001b[39;00m \u001b[39mimport\u001b[39;00m Workbook\n\u001b[0;32m 51\u001b[0m engine_kwargs \u001b[39m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[0;32m 53\u001b[0m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__init__\u001b[39m(\n\u001b[0;32m 54\u001b[0m path,\n\u001b[0;32m 55\u001b[0m mode\u001b[39m=\u001b[39mmode,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 58\u001b[0m engine_kwargs\u001b[39m=\u001b[39mengine_kwargs,\n\u001b[0;32m 59\u001b[0m )\n",
"\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'openpyxl'"
]
}
],
"outputs": [],
"source": [
"with pd.ExcelWriter('file_for_missing_steps.xlsx') as writer:\n",
" missing_df.to_excel(writer, sheet_name='10s_steps')"
Expand Down
150 changes: 4 additions & 146 deletions notebooks/first_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -640,47 +640,6 @@
"info_df.loc[info_df['ID']==3011]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#nans_l = info_df.isnull().sum(axis=1)\n",
"#type(nans_l)\n",
"#idx_val = nans_l.where(nans_l<2).dropna().index.to_list()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#val_df = info_df[info_df.index.isin(idx_val)]\n",
"#val_df.head(10)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"#print([el for el in info_df.ID.values if el not in val_df.ID.values])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#val_df.dropna(axis=1, inplace=True)\n",
"#val_df = val_df[(val_df.T !='0').all()].reset_index()\n",
"#val_df.head(len(val_df))"
]
},
{
"cell_type": "code",
"execution_count": 13,
Expand Down Expand Up @@ -867,84 +826,7 @@
"outputs": [],
"source": [
"ctx_array = np.reshape(ctx_array, (ctx_array.shape[0]*ctx_array.shape[1], 2))\n",
"s_array = np.reshape(s_array, (s_array.shape[0]*s_array.shape[1], 4))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Timestamp</th>\n",
" <th>IndoorProb</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1597273200</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1597273201</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1597273202</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1597273203</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1597273204</td>\n",
" <td>100</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Timestamp IndoorProb\n",
"0 1597273200 100\n",
"1 1597273201 100\n",
"2 1597273202 100\n",
"3 1597273203 100\n",
"4 1597273204 100"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"s_array = np.reshape(s_array, (s_array.shape[0]*s_array.shape[1], 4))\n",
"ctx_df = pd.DataFrame(ctx_array, columns=['Timestamp', 'IndoorProb'])\n",
"step_df = pd.DataFrame(s_array, columns=['Patient', 'Cohort', 'Day', 'StepPerSec'])\n",
"\n",
Expand Down Expand Up @@ -1052,33 +934,9 @@
"metadata": {},
"outputs": [],
"source": [
"full_df = full_df[full_df['IndoorProb']!=50]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#full_df.drop(columns='Patient', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"full_df['StepPerSec'] = full_df['StepPerSec'].astype('float32')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"full_df = full_df[full_df['IndoorProb']!=50]\n",
"#full_df.drop(columns='Patient', inplace=True)\n",
"full_df['StepPerSec'] = full_df['StepPerSec'].astype('float32')\n",
"full_df.to_csv('full_df.csv')"
]
},
Expand Down
1 change: 0 additions & 1 deletion notebooks/main

This file was deleted.

10 changes: 6 additions & 4 deletions weather_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@ class CleanerExtractor():
def __init__(
self,
path: str,
folder_path: str,
threshold: float
) -> None:

# path for csv file that has:
# timestamp, patient, day, cohort, step per sec, indoor prob
self.path = path
self.data_path = path
self.folder_path = folder_path
self.threshold = threshold
self.df = pd.DataFrame([])

Expand All @@ -29,7 +31,7 @@ def __init__(


def _load_dataframe(self):
self.df = pd.read_csv(self.path)
self.df = pd.read_csv(self.data_path)

#we are just interested in outdoor envs
self.df.drop(self.df[self.df.IndoorProb != 100].index, inplace=True)
Expand Down Expand Up @@ -60,9 +62,9 @@ def _extract_weather_statistics(self):
non_valid_stats = []

num_days = list(range(1,8))
for f in os.listdir(self.path):
for f in os.listdir(self.folder_path):
if f.startswith('weather') and f.endswith('.json'):
w_file = pd.read_json(os.path.join(self.path, f))
w_file = pd.read_json(os.path.join(self.folder_path, f))

# we remove every seen day
day = f.split('-')[3]
Expand Down

0 comments on commit 90ac881

Please sign in to comment.