added main script and argument parser
SanBast committed Apr 13, 2023
1 parent 7219c4a commit b830b79
Showing 5 changed files with 210 additions and 0 deletions.
35 changes: 35 additions & 0 deletions main.py
@@ -0,0 +1,35 @@
from weather_analysis import CleanerExtractor
import argparse

def main(args):
path = args.path
threshold = args.threshold
output_path = args.output

cleaner = CleanerExtractor(path, threshold)
cleaner.fit()
    cleaner.save_results(output_path)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
        description='Run the cleaning analysis for step-per-second data.'
)
parser.add_argument('--path',
required=True,
action='store',
help='root of the data')

parser.add_argument('--threshold',
type=float,
action='store',
default=0.0,
                        help='step-per-second threshold; samples at or below it are filtered out. Default: 0.0')

parser.add_argument('--output',
required=True,
action='store',
                        help='path where the cleaned data CSV is saved')

args = parser.parse_args()
main(args)
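
For reference, a minimal usage sketch of this entry point; the paths and threshold below are hypothetical examples, and the call sequence mirrors what main() does:

# Command line (hypothetical values):
#   python main.py --path ./data --threshold 0.5 --output ./results/clean_steps.csv
# Programmatic equivalent:
from weather_analysis import CleanerExtractor

cleaner = CleanerExtractor('./data', 0.5)             # hypothetical data root and threshold
cleaner.fit()                                         # load, filter, and aggregate per patient
cleaner.save_results('./results/clean_steps.csv')     # hypothetical output path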
File renamed without changes.
File renamed without changes.
File renamed without changes.
175 changes: 175 additions & 0 deletions weather_analysis.py
@@ -0,0 +1,175 @@
import os
import gzip
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy.stats import trim_mean, iqr


class CleanerExtractor():

def __init__(
self,
path: str,
threshold: float
) -> None:

        # path to the CSV file containing:
        # timestamp, patient, day, cohort, step per sec, indoor prob
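        # A hypothetical example row of such a file (the exact Timestamp/Cohort
        # column names are an assumption; Patient, Day, StepPerSec and IndoorProb
        # are the columns actually used below):
        #   Timestamp,Patient,Day,Cohort,StepPerSec,IndoorProb
        #   2023-01-01 10:00:00,P01,Day1,A,1.7,100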
self.path = path
self.threshold = threshold
self.df = pd.DataFrame([])

self._daily_step_count = []
self._daily_stats = []
self._hot_to_cold_daily_stats = []

self.extracted_df = pd.DataFrame([])
self.full_df = pd.DataFrame([])


def _load_dataframe(self):
self.df = pd.read_csv(self.path)

        # we are only interested in outdoor environments
self.df.drop(self.df[self.df.IndoorProb != 100].index, inplace=True)
return self


def _extract_daily_step_count(self, count_df):
num_days = list(range(1,8))
step_count = []
nan_step_count = []

for i,group in count_df.groupby('Day'):
step_count.append(
(i,sum(group['StepPerSec'].values))
)
num_days.remove(int(i[-1]))
if num_days:
            for d in num_days:
                # use np.nan for days missing from this subject's recordings
                nan_step_count.append((f'Day{d}', np.nan))

        # sort by day label so that index i always corresponds to Day{i+1} downstream
        self._daily_step_count = sorted(step_count + nan_step_count, key=lambda x: x[0])
return self
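
    # Hypothetical example of the structure built above: with recordings only on
    # Day1 and Day3, self._daily_step_count ends up as
    #   [('Day1', 812.0), ('Day2', nan), ('Day3', 640.0), ('Day4', nan), ...]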


def _extract_weather_statistics(self):
daily_stats = []
non_valid_stats = []

num_days = list(range(1,8))
for f in os.listdir(self.path):
if f.startswith('weather') and f.endswith('.json'):
w_file = pd.read_json(os.path.join(self.path, f))

                # mark this day as seen by removing it from the list of expected days
day = f.split('-')[3]
num_days.remove(int(day[-1]))
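                # Assumed (hypothetical) naming and layout, inferred from the parsing
                # above and the lookups below: a file such as
                # 'weather-P01-A-Day3-2023.json' whose content looks like
                #   {"data": [{"temp": ..., "wind_speed": ..., "wind_dir": ..., "precip": ...}]}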

daily_stats.append((day,
w_file['data'][0]['temp'],
w_file['data'][0]['wind_speed'],
w_file['data'][0]['wind_dir'],
w_file['data'][0]['precip']))
                # format: (day, temp, wind_speed, wind_dir, precip)

if num_days:
for d in num_days:
                # use np.nan for the weather features of each missing day
                non_valid_stats.append((f'Day{d}', np.nan, np.nan, np.nan, np.nan))

        # days ranked from hottest to coldest (only days that have a weather file)
        self._hot_to_cold_daily_stats = sorted(daily_stats, key=lambda x: x[1], reverse=True)
        # sort by day label so that index i always corresponds to Day{i+1} downstream
        self._daily_stats = sorted(daily_stats + non_valid_stats, key=lambda x: x[0])
return self


    def _extract_daily_stats(self, df, feat, day=None):
        # summary statistics of one feature, over the whole week or for a single day
        d_df = df[df['Day'] == day] if day is not None else df
        if d_df.empty:
            # no samples for this day: fall back to NaN for every statistic
            return np.full((7, 1), np.nan)
        return np.array([
            [np.mean(d_df[feat])],
            [np.median(d_df[feat])],
            [np.std(d_df[feat])],
            [np.max(d_df[feat])],
            [np.min(d_df[feat])],
            [iqr(d_df[feat])],
            [trim_mean(d_df[feat], 0.1)]
        ])


def extract(self, df, id, counts):
self._extract_weather_statistics()

step_level = ['StepPerSec', 'StepPerMin']
stats = [
'mean',
'median',
'std',
'max',
'min',
'IQR',
'trim_mean10'
]

        # overall stats: the same statistics on step per sec and step per min (cadence),
        # computed over the whole week without splitting by day
stats_overall = pd.concat([
pd.DataFrame(
np.swapaxes(self._extract_daily_stats(df, s), 0,1), columns=[f'{s}_{i}' for i in stats]
) for s in step_level], axis=1)
        # daily stats: the same statistics computed separately for each day
stats_daily = pd.concat([
pd.DataFrame(
np.swapaxes(self._extract_daily_stats(df, s, f'Day{j}'), 0,1), columns=[f'Day{j}_{s}_{i}' for i in stats]
) for j in range(1,8) for s in step_level], axis=1)
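        # For illustration of the naming only: stats_overall gets columns such as
        # 'StepPerSec_mean', ..., 'StepPerMin_trim_mean10', while stats_daily gets
        # 'Day1_StepPerSec_mean', ..., 'Day7_StepPerMin_trim_mean10'.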

        # we manually build a separate dataframe for the weather features
        weather_stats = pd.DataFrame(
            # data: hottest/coldest day and temperature, then wind speed, wind
            # direction and precipitation for every day, then the daily temperatures
            np.swapaxes(
                np.array(
                    [[self._hot_to_cold_daily_stats[0][0]],
                     [self._hot_to_cold_daily_stats[0][1]],
                     [self._hot_to_cold_daily_stats[-1][0]],
                     [self._hot_to_cold_daily_stats[-1][1]]]
                    + [[self._daily_stats[i][j]] for j in range(2, 5) for i in range(7)]
                    + [[self._daily_stats[i][1]] for i in range(7)]), 0, 1
            ),
            # column names in the same order as the data above
            columns=['hottest_day', 'hottest_temp', 'coldest_day', 'coldest_temp']
                    + [f'Day{i}_wind_speed' for i in range(1, 8)]
                    + [f'Day{i}_wind_dir' for i in range(1, 8)]
                    + [f'Day{i}_precip' for i in range(1, 8)]
                    + [f'Day{i}_temp' for i in range(1, 8)]
        )

        extracted_df = pd.concat([counts, pd.DataFrame([sum(df['StepPerSec'].values)], columns=['NumOfSteps']),
                                  pd.DataFrame([id], columns=['ID']), stats_overall, weather_stats, stats_daily], axis=1)

        # accumulate one row per patient (plain DataFrame.append would not update self.extracted_df in place)
        self.extracted_df = pd.concat([self.extracted_df, extracted_df], ignore_index=True)

return self


    def fit(self):
        self._load_dataframe()
        for id, group in tqdm(self.df.groupby('Patient')):
            # keep only samples above the step-per-second threshold
            walking = group[group['StepPerSec'] > self.threshold].copy()
            walking.reset_index(drop=True, inplace=True)
            walking['StepPerMin'] = walking['StepPerSec'] * 60

            self._extract_daily_step_count(walking)
            counts = self._daily_step_count
            counts_df = pd.DataFrame(
                np.swapaxes(np.array([[counts[i][1]] for i in range(7)]), 0, 1),
                columns=[f'Day{i}_stepcount' for i in range(1, 8)]
            )

            self.extract(walking, id, counts_df)

        self.full_df = self.extracted_df
        return self


    def save_results(self, output_path):
self.full_df.to_csv(output_path)
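
As a quick sanity check, the saved output could later be inspected along these lines (the file name below is a hypothetical example; the listed columns are among those produced by extract()):

import pandas as pd

clean = pd.read_csv('clean_steps.csv', index_col=0)  # hypothetical output path
print(clean[['ID', 'NumOfSteps', 'hottest_day', 'coldest_day']].head())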
