Add files via upload

SanBast · Sep 21, 2022 · 18a09b2 · 18a09b2
commit 18a09b2
Show file tree

Hide file tree

Showing 2 changed files with 2,035 additions and 0 deletions.
diff --git a/10s_windowing.ipynb b/10s_windowing.ipynb
@@ -0,0 +1,330 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import matplotlib\n",
+    "import seaborn as sns\n",
+    "from pylab import rcParams\n",
+    "import os\n",
+    "import gzip\n",
+    "from tqdm import tqdm\n",
+    "import pandas as pd\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Draw figures inline in the notebook, using the high-resolution 'retina' format.\n",
+    "%matplotlib inline\n",
+    "%config InlineBackend.figure_format='retina'\n",
+    "\n",
+    "# Base seaborn theme; its palette is overridden by the custom one set below.\n",
+    "sns.set(style='whitegrid', palette='muted', font_scale=1.2)\n",
+    "\n",
+    "# Custom colour cycle used for all subsequent plots.\n",
+    "HAPPY_COLORS_PALETTE = [\n",
+    "    \"#01BEFE\",\n",
+    "    \"#FFDD00\",\n",
+    "    \"#FF7D00\",\n",
+    "    \"#FF006D\",\n",
+    "    \"#ADFF02\",\n",
+    "    \"#8F00FF\",\n",
+    "]\n",
+    "sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))\n",
+    "\n",
+    "# Default figure size as (width, height).\n",
+    "rcParams['figure.figsize'] = (20, 10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the full table from a CSV file in the working directory.\n",
+    "full_df = pd.read_csv('full_df.csv')\n",
+    "\n",
+    "# Keep only the rows whose IndoorProb is not 100.\n",
+    "# The explicit .copy() makes outdoor_df an independent frame, so columns can be\n",
+    "# added or replaced on it later without pandas raising SettingWithCopyWarning.\n",
+    "outdoor_df = full_df[full_df['IndoorProb'] != 100].copy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\marci\\AppData\\Local\\Temp\\ipykernel_4064\\544610941.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  outdoor_df['series_id'] = np.arange(len(outdoor_df)) // 10 + 1\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Number of consecutive rows that share one series_id.\n",
+    "WINDOW_ROWS = 10\n",
+    "\n",
+    "# Windows are formed purely by row position (rows 0-9 -> 1, rows 10-19 -> 2, ...);\n",
+    "# the Timestamp column is not consulted here.\n",
+    "# .assign() returns a new frame instead of writing into the filtered slice,\n",
+    "# which is what triggered SettingWithCopyWarning.\n",
+    "outdoor_df = outdoor_df.assign(series_id=np.arange(len(outdoor_df)) // WINDOW_ROWS + 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\marci\\AppData\\Local\\Temp\\ipykernel_4064\\3215045333.py:1: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  outdoor_df['Timestamp'] = pd.to_datetime(outdoor_df['Timestamp'], unit='s')\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Convert Timestamp from a number of seconds (unit='s') to pandas datetimes.\n",
+    "# .assign() rebinds outdoor_df to a new frame rather than writing into the\n",
+    "# filtered slice, which avoids SettingWithCopyWarning.\n",
+    "outdoor_df = outdoor_df.assign(Timestamp=pd.to_datetime(outdoor_df['Timestamp'], unit='s'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "19040895"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of rows left after the IndoorProb filter.\n",
+    "outdoor_df.shape[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Unnamed: 0</th>\n",
+       "      <th>Patient</th>\n",
+       "      <th>Cohort</th>\n",
+       "      <th>Day</th>\n",
+       "      <th>StepPerSec</th>\n",
+       "      <th>Timestamp</th>\n",
+       "      <th>IndoorProb</th>\n",
+       "      <th>series_id</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>29017</th>\n",
+       "      <td>29017</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>HA</td>\n",
+       "      <td>Day1</td>\n",
+       "      <td>0.875</td>\n",
+       "      <td>2020-08-13 07:03:37</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29018</th>\n",
+       "      <td>29018</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>HA</td>\n",
+       "      <td>Day1</td>\n",
+       "      <td>0.875</td>\n",
+       "      <td>2020-08-13 07:03:38</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29019</th>\n",
+       "      <td>29019</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>HA</td>\n",
+       "      <td>Day1</td>\n",
+       "      <td>0.875</td>\n",
+       "      <td>2020-08-13 07:03:39</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29020</th>\n",
+       "      <td>29020</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>HA</td>\n",
+       "      <td>Day1</td>\n",
+       "      <td>0.875</td>\n",
+       "      <td>2020-08-13 07:03:40</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29021</th>\n",
+       "      <td>29021</td>\n",
+       "      <td>1000</td>\n",
+       "      <td>HA</td>\n",
+       "      <td>Day1</td>\n",
+       "      <td>0.875</td>\n",
+       "      <td>2020-08-13 07:03:41</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       Unnamed: 0  Patient Cohort   Day  StepPerSec           Timestamp  \\\n",
+       "29017       29017     1000     HA  Day1       0.875 2020-08-13 07:03:37   \n",
+       "29018       29018     1000     HA  Day1       0.875 2020-08-13 07:03:38   \n",
+       "29019       29019     1000     HA  Day1       0.875 2020-08-13 07:03:39   \n",
+       "29020       29020     1000     HA  Day1       0.875 2020-08-13 07:03:40   \n",
+       "29021       29021     1000     HA  Day1       0.875 2020-08-13 07:03:41   \n",
+       "\n",
+       "       IndoorProb  series_id  \n",
+       "29017           0          1  \n",
+       "29018           0          1  \n",
+       "29019           0          1  \n",
+       "29020           0          1  \n",
+       "29021           0          1  "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Preview the first five rows, including the derived series_id column.\n",
+    "outdoor_df.head(n=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 1843185/1843185 [14:16<00:00, 2151.82it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Summarise every (series_id, Patient) window over the rows with StepPerSec < 1:\n",
+    "# row count, first/last timestamp, and mean/std of StepPerSec.\n",
+    "# One vectorised groupby/agg replaces the per-group Python loop and keeps real\n",
+    "# dtypes (np.array(rows) on mixed values turned every column into object dtype);\n",
+    "# it also yields an empty frame instead of failing when no row matches.\n",
+    "# NOTE(review): 'first'/'last' skip nulls, whereas the loop took the first/last row\n",
+    "# as-is; results match as long as Cohort and Timestamp contain no nulls - confirm.\n",
+    "low_step_rows = outdoor_df[outdoor_df['StepPerSec'] < 1]\n",
+    "missing_df = (\n",
+    "    low_step_rows\n",
+    "    .groupby(['series_id', 'Patient'])\n",
+    "    .agg(\n",
+    "        Cohort=('Cohort', 'first'),\n",
+    "        Duration=('StepPerSec', 'count'),\n",
+    "        start_timestamp=('Timestamp', 'first'),\n",
+    "        end_timestamp=('Timestamp', 'last'),\n",
+    "        mean=('StepPerSec', 'mean'),\n",
+    "        std=('StepPerSec', 'std'),\n",
+    "    )\n",
+    "    # Bring Patient back as a column, drop the window id, and keep the original\n",
+    "    # column layout: Subject, Cohort, Duration, start/end timestamp, mean, std.\n",
+    "    .reset_index()\n",
+    "    .drop(columns='series_id')\n",
+    "    .rename(columns={'Patient': 'Subject'})\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist the per-window summary; the row index is written as the first column.\n",
+    "missing_df.to_csv('walking_missing.csv', index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ModuleNotFoundError",
+     "evalue": "No module named 'openpyxl'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn [10], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfile_for_missing_steps.xlsx\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m writer:\n\u001b[0;32m      2\u001b[0m     missing_df\u001b[38;5;241m.\u001b[39mto_excel(writer, sheet_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m10s_steps\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
+      "File \u001b[1;32mc:\\Users\\marci\\miniconda3\\lib\\site-packages\\pandas\\io\\excel\\_openpyxl.py:49\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[1;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[0;32m     36\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\n\u001b[0;32m     37\u001b[0m     \u001b[39mself\u001b[39m,\n\u001b[0;32m     38\u001b[0m     path,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     47\u001b[0m ):\n\u001b[0;32m     48\u001b[0m     \u001b[39m# Use the openpyxl module as the Excel writer.\u001b[39;00m\n\u001b[1;32m---> 49\u001b[0m     \u001b[39mfrom\u001b[39;00m \u001b[39mopenpyxl\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mworkbook\u001b[39;00m \u001b[39mimport\u001b[39;00m Workbook\n\u001b[0;32m     51\u001b[0m     engine_kwargs \u001b[39m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[0;32m     53\u001b[0m     \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39m\u001b[39m__init__\u001b[39m(\n\u001b[0;32m     54\u001b[0m         path,\n\u001b[0;32m     55\u001b[0m         mode\u001b[39m=\u001b[39mmode,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     58\u001b[0m         engine_kwargs\u001b[39m=\u001b[39mengine_kwargs,\n\u001b[0;32m     59\u001b[0m     )\n",
+      "\u001b[1;31mModuleNotFoundError\u001b[0m: No module named 'openpyxl'"
+     ]
+    }
+   ],
+   "source": [
+    "# Requires the optional openpyxl package; without it pd.ExcelWriter raises\n",
+    "# ModuleNotFoundError for .xlsx targets.\n",
+    "# An Excel worksheet holds at most 1,048,576 rows, so a larger frame is split\n",
+    "# across several sheets named '10s_steps', '10s_steps_2', '10s_steps_3', ...\n",
+    "EXCEL_ROWS_PER_SHEET = 1_000_000\n",
+    "\n",
+    "with pd.ExcelWriter('file_for_missing_steps.xlsx') as writer:\n",
+    "    # max(..., 1) still writes a single header-only sheet when missing_df is empty.\n",
+    "    chunk_starts = range(0, max(len(missing_df), 1), EXCEL_ROWS_PER_SHEET)\n",
+    "    for sheet_no, start in enumerate(chunk_starts, start=1):\n",
+    "        sheet_name = '10s_steps' if sheet_no == 1 else f'10s_steps_{sheet_no}'\n",
+    "        chunk = missing_df.iloc[start:start + EXCEL_ROWS_PER_SHEET]\n",
+    "        chunk.to_excel(writer, sheet_name=sheet_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.12 ('base')",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "9324f6f91069ef608944cf59327718832b88647e83e66beddcee769fe0e7a057"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}