-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstreamlit_app.py
106 lines (100 loc) · 5.04 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import streamlit as st
import pandas as pd
#import pandas_profiling as pp
from pandas_profiling import ProfileReport
from streamlit_pandas_profiling import st_profile_report
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
def _rms(values):
    """Return the root mean square of a numeric Series."""
    # Cast to float64 first so squaring small integer dtypes cannot overflow.
    return np.sqrt((values.astype("float64") ** 2).mean())


def _first_mode(values):
    """Return the first value reported by ``Series.mode()``, or NaN if none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Labels offered in the bar-plot dropdown, mapped to something that
# ``SeriesGroupBy.agg`` accepts. "mode", "rms" and "stddev" are not pandas
# aggregation names, so passing the raw label to ``agg`` raised for them.
_BAR_AGGREGATIONS = {
    "count": "count",
    "sum": "sum",
    "mean": "mean",
    "median": "median",
    "mode": _first_mode,
    "rms": _rms,
    "stddev": "std",
    "min": "min",
    "max": "max",
    "first": "first",
    "last": "last",
}


def main():
    """Render the Streamlit page: CSV upload plus one selectable analysis view.

    Nothing beyond the uploader is drawn until a file has been provided. Once
    a file is loaded, its columns are split into numeric and non-numeric
    groups and the sidebar radio button decides which single view is shown
    (profiling report, boxplot, histogram, scatter plot, bar plot, scatter
    matrix, crosstab or correlation chart).
    """
    uploaded_file = st.file_uploader("Upload your csv file")
    if uploaded_file is None:
        return

    df = load_data(uploaded_file)
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numcols = df.select_dtypes(include=numerics).columns.to_list()
    categcols = df.select_dtypes(exclude=numerics).columns.to_list()

    # A selector for the analysis type.
    st.sidebar.info('Your Catalog is *ready!* :sunglasses:')
    graph_type = st.sidebar.radio(
        "Choose the analytics type",
        ["Profiling", "Boxplot", "Histogram", "Scatter Plot", "Bar Plot",
         "Scatter Matrix", "Crosstab", "Correlation"],
    )

    # The views are mutually exclusive, hence a single if/elif chain.
    if graph_type == "Profiling":
        pr = ProfileReport(df, explorative=True)
        st.title("Profiling report")
        st_profile_report(pr)

    elif graph_type == "Boxplot":
        st.subheader('Boxplots')
        boxplotcol = st.multiselect(label='What columns you want to display', options=numcols)
        if boxplotcol:
            boxplotfig = px.box(df, y=boxplotcol)
            st.plotly_chart(boxplotfig, use_container_width=True)
        else:
            # The multiselect starts empty; wait for a choice before plotting.
            st.info('Select at least one numeric column to display.')

    elif graph_type == "Histogram":
        st.subheader('Histograms')
        histcol = st.selectbox('What column you want to display', df.columns)
        # A histogram needs at least one bin; keep the widget from going lower.
        bins = st.number_input('Specify number of bins', min_value=1, value=20)
        histfig = px.histogram(df, x=histcol, nbins=bins)
        st.plotly_chart(histfig, use_container_width=True)

    elif graph_type == "Scatter Plot":
        st.subheader('Scatter Plot')
        x_scatter_plot = st.selectbox('select the x-axis', numcols, key='x_scatter_plot')
        y_scatter_plot = st.selectbox('select the y-axis', numcols, key='y_scatter_plot')
        scatterplotfig = px.scatter(df, x=x_scatter_plot, y=y_scatter_plot)
        st.plotly_chart(scatterplotfig, use_container_width=True)

    elif graph_type == "Bar Plot":
        st.subheader('Bar Plot')
        x_bar = st.selectbox('select the x-axis', categcols, key='x_bar')
        y_bar = st.selectbox('select the y-axis', numcols, key='y_bar')
        aggs_bar = st.selectbox(
            'Use dropdown to change aggregation', list(_BAR_AGGREGATIONS), key='aggs_bar'
        )
        if x_bar is None or y_bar is None:
            # selectbox yields None when its option list is empty.
            st.warning('A bar plot needs one categorical and one numeric column.')
        else:
            # Translate the dropdown label into a valid pandas aggregation and
            # keep the result as a one-column frame indexed by category.
            aggregation = _BAR_AGGREGATIONS[aggs_bar]
            df_bar_agg = df.groupby(x_bar)[y_bar].agg(aggregation).to_frame()
            bar_chart_fig = px.bar(df_bar_agg)
            st.plotly_chart(bar_chart_fig, use_container_width=True)

    elif graph_type == "Scatter Matrix":
        st.subheader('Scatter Matrix')
        dimensions = st.multiselect(label='What columns you want to display', options=numcols, key='dimensions')
        scatter_matrix_color = st.selectbox('Color by', categcols, key='scatter_matrix_color')
        if dimensions:
            scatter_matrix_fig = px.scatter_matrix(df, dimensions=dimensions, color=scatter_matrix_color)
            st.plotly_chart(scatter_matrix_fig, use_container_width=True)
        else:
            st.info('Select at least one numeric column to display.')

    elif graph_type == "Crosstab":
        st.subheader('crosstab')
        x_crosstab = st.selectbox('select the x-axis', categcols, key='x_crosstab')
        y_crosstab = st.selectbox('select the y-axis', categcols, key='y_crosstab')
        if x_crosstab is None or y_crosstab is None:
            st.warning('A crosstab needs at least one categorical column.')
        else:
            # Frequency table of the two selected columns with row/column
            # totals. pivot_table without `values` counted every remaining
            # column separately and failed when no other column existed.
            crosstab = pd.crosstab(df[x_crosstab], df[y_crosstab], margins=True)
            st.write(crosstab)

    elif graph_type == "Correlation":
        st.header("Correlation Dynamic Dropdown")
        if len(numcols) < 2:
            # With fewer than two numeric columns there is no y candidate.
            st.warning('A correlation plot needs at least two numeric columns.')
        else:
            x_corr = st.selectbox("x", numcols, key='x_corr')
            y_options, y_formats = get_y_vars(df, x_corr, numcols)
            y_corr = st.selectbox(
                f"y (sorted by correlation with {x_corr})",
                y_options,
                format_func=y_formats.get,
                key='y_corr',
            )
            plot = alt.Chart(df).mark_circle().encode(
                alt.X(x_corr, scale=alt.Scale(zero=False)),
                alt.Y(y_corr, scale=alt.Scale(zero=False)),
            )
            st.altair_chart(plot)
@st.cache(allow_output_mutation=True)
def load_data(file):
    """Read a CSV into a DataFrame and normalise its column names.

    Each column name is stripped of surrounding whitespace, lower-cased, and
    has its spaces replaced by underscores.

    Args:
        file: Anything ``pandas.read_csv`` accepts (path or file-like object).

    Returns:
        The loaded DataFrame with the normalised column names.
    """
    def _normalise(name):
        # "  Unit Price " -> "unit_price"
        return name.strip().lower().replace(" ", "_")

    frame = pd.read_csv(file)
    frame.columns = [_normalise(name) for name in frame.columns]
    return frame
@st.cache(allow_output_mutation=True)
def get_y_vars(dataset, x, variables):
    """Rank candidate y-variables by their correlation with ``x``.

    Args:
        dataset: DataFrame containing ``x`` and every name in ``variables``.
        x: Name of the column the other variables are correlated against.
        variables: Column names to consider; ``x`` itself is excluded from
            the result if present.

    Returns:
        A ``(names, labels)`` tuple. ``names`` lists the remaining variables
        ordered from highest to lowest correlation with ``x``, with variables
        whose correlation is NaN placed last. ``labels`` maps each name to a
        display string of the form ``"name (0.42)"``.
    """
    candidates = [v for v in variables if v != x]

    # Correlate only the columns that are actually needed. Calling corr() on
    # the whole frame fails on recent pandas versions when it also contains
    # non-numeric columns. dict.fromkeys de-duplicates while keeping order.
    needed = list(dict.fromkeys([x, *candidates]))
    corrs = dataset[needed].corr()[x]

    def _rank(name):
        value = corrs[name]
        # NaN does not compare consistently, so give it its own bucket that
        # sorts after every real correlation value.
        return (1, 0.0) if pd.isna(value) else (0, -value)

    ordered = sorted(candidates, key=_rank)
    format_dict = {v: f"{v} ({corrs[v]:.2f})" for v in ordered}
    return ordered, format_dict
# Build the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()