diff --git a/.gitignore b/.gitignore index fb16122..2eadf3d 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,9 @@ .jupyter_cache +# HTML files recursive: + + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/docs/_quarto.yml b/docs/_quarto.yml index e4020e4..24aa486 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -200,16 +200,39 @@ website: # # PANDAS PACKAGE OVERVIEW # - #- section: - # href: notes/pandas/overview.ipynb - # text: "Pandas Package Overview" - # contents: - # - section: - # href: notes/pandas/dataframes.qmd - # text: "Dataframes" - - + - section: + href: notes/pandas/overview.qmd + text: "Pandas Package Overview" + contents: + - section: + href: notes/pandas/dataframes.qmd + text: "Dataframes" + #- section: + # href: notes/pandas/dataframes.qmd + # text: "Dataframes" + - section: + href: notes/pandas/grouping-pivoting.qmd + text: "Grouping and Pivoting" + - section: + href: notes/pandas/shift-methods.qmd + text: "Shift based Methods" # "Growth and Cumulative Growth" + - section: + href: notes/pandas/moving-averages.qmd + text: "Moving Averages" + - section: + href: notes/pandas/joining-merging.ipynb + text: "Joining and Merging" + - section: + href: notes/applied-stats/overview.qmd + text: "Applied Statistics" + contents: + - section: + href: notes/applied-stats/basic-tests.ipynb + text: "Statistical Tests" + - section: + href: notes/applied-stats/correlation.ipynb + text: "Correlation Analysis" diff --git a/docs/images/joins-inner-outer.jpeg b/docs/images/joins-inner-outer.jpeg new file mode 100644 index 0000000..8faba91 Binary files /dev/null and b/docs/images/joins-inner-outer.jpeg differ diff --git a/docs/notes/applied-stats/basic-tests.ipynb b/docs/notes/applied-stats/basic-tests.ipynb new file mode 100644 index 0000000..a30cb1c --- /dev/null +++ b/docs/notes/applied-stats/basic-tests.ipynb @@ -0,0 +1,1462 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Basic Summary Statistics" + ], + "metadata": { + "id": "sDXJ6LdZSlY5" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "df = read_csv(\"https://raw.githubusercontent.com/prof-rossetti/python-for-finance/main/docs/data/monthly-indicators.csv\")\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "TRqYcMkMURas", + "outputId": "e1f9b690-2483-42cf-dee7-5f91fdbb6e28" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp cpi fed spy gld\n", + "0 2024-05-01 314.069 5.33 525.6718 215.30\n", + "1 2024-04-01 313.548 5.33 500.3636 211.87\n", + "2 2024-03-01 312.332 5.33 521.3857 205.72\n", + "3 2024-02-01 310.326 5.33 504.8645 189.31\n", + "4 2024-01-01 308.417 5.33 479.8240 188.45" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 234,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 234,\n \"samples\": [\n \"2018-08-01\",\n \"2007-03-01\",\n \"2009-05-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"cpi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30.29922027086648,\n \"min\": 190.3,\n \"max\": 314.069,\n \"num_unique_values\": 231,\n \"samples\": [\n 196.8,\n 252.038,\n 307.026\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8796570098480136,\n \"min\": 0.05,\n \"max\": 5.33,\n \"num_unique_values\": 106,\n \"samples\": [\n 3.04,\n 3.08,\n 4.83\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 122.94739811506473,\n \"min\": 55.1488,\n \"max\": 525.6718,\n \"num_unique_values\": 233,\n \"samples\": [\n 213.6153,\n 92.7153,\n 81.5622\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40.170234378613976,\n \"min\": 41.65,\n \"max\": 215.3,\n \"num_unique_values\": 229,\n \"samples\": [\n 56.7,\n 115.15,\n 180.02\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(len(df))\n", + "print(df[\"timestamp\"].min(), \"...\", df[\"timestamp\"].max())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "w8UKMi9WUasL", + "outputId": "709b7098-a42c-40ee-e12d-ad7a4789afed" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "234\n", + "2004-12-01 ... 2024-05-01\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html\n", + "\n", + "We can use the describe method to quickly see the basic summary statistics for each column:" + ], + "metadata": { + "id": "keiqL_aZFF9h" + } + }, + { + "cell_type": "code", + "source": [ + "df.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 300 + }, + "id": "RJJXZLICFECl", + "outputId": "0599dd65-7cf5-4419-d73d-93c40c44b237" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " cpi fed spy gld\n", + "count 234.000000 234.000000 234.000000 234.000000\n", + "mean 239.904709 1.588761 199.248881 124.362344\n", + "std 30.299220 1.879657 122.947398 40.170234\n", + "min 190.300000 0.050000 55.148800 41.650000\n", + "25% 216.963500 0.120000 98.790475 101.575000\n", + "50% 236.409000 0.390000 164.878300 123.360000\n", + "75% 256.327500 2.480000 270.226075 159.787500\n", + "max 314.069000 5.330000 525.671800 215.300000" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df\",\n \"rows\": 8,\n \"fields\": [\n {\n \"column\": \"cpi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 82.54018014358134,\n \"min\": 30.29922027086648,\n \"max\": 314.069,\n \"num_unique_values\": 8,\n \"samples\": [\n 239.90470940170943,\n 236.409,\n 234.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 82.151619077361,\n \"min\": 0.05,\n \"max\": 234.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 1.5887606837606836,\n 0.39,\n 234.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 146.34484875695313,\n \"min\": 55.1488,\n \"max\": 525.6718,\n \"num_unique_values\": 8,\n \"samples\": [\n 199.24888076923074,\n 164.8783,\n 234.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 71.4526242403622,\n \"min\": 40.170234378613976,\n \"max\": 234.0,\n \"num_unique_values\": 8,\n \"samples\": [\n 124.36234444444445,\n 123.36,\n 234.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 25 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "As you may be aware, we can calculate these individually, using `Series` aggregations:" + ], + "metadata": { + "id": "05YOTLO0FP9k" + } + }, + { + "cell_type": "code", + "source": [ + "# https://pandas.pydata.org/docs/reference/api/pandas.Series.html\n", + "# https://pandas.pydata.org/docs/reference/api/pandas.Series.quantile.html\n", + "\n", + "series = df[\"fed\"]\n", + "\n", + "print(\"COUNT:\", len(series))\n", + "print(\"MEAN:\", series.mean().round(6))\n", + "print(\"STD:\", series.std().round(6))\n", + "print(\"-------------\")\n", + "print(\"MIN:\", series.min())\n", + "print(\"25TH:\", series.quantile(.25))\n", + "print(\"MED:\", series.median())\n", + "print(\"75TH:\", series.quantile(.75))\n", + "print(\"MAX:\", series.max())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ycmWj5EIFPXw", + "outputId": "0c6ad693-7afa-4b86-fe07-b33eff0e9968" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "COUNT: 234\n", + "MEAN: 1.588761\n", + "STD: 1.879657\n", + "-------------\n", + "MIN: 0.05\n", + "25TH: 0.12\n", + "MED: 0.39\n", + "75TH: 2.48\n", + "MAX: 5.33\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "series.describe() # for comparison" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "AjhCBtTRGgwm", + "outputId": "eecf3c8e-6e1c-4045-f689-b1f3782631bc" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "count 234.000000\n", + "mean 1.588761\n", + "std 1.879657\n", + "min 0.050000\n", + "25% 0.120000\n", + "50% 0.390000\n", + "75% 2.480000\n", + "max 5.330000\n", + "Name: fed, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 27 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Distribution Plots" + ], + "metadata": { + "id": "xKGf_zb4Y96H" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's view some distribution plots of the federal funds rate, to tell a story about the summary statistics for this 
indicator." + ], + "metadata": { + "id": "7fZhuj2-IrLv" + } + }, + { + "cell_type": "code", + "source": [ + "import plotly.express as px\n", + "\n", + "px.box(df, x=\"fed\", orientation=\"h\", points=\"all\", title=\"Distribution of Federal Funds Rate (Monthly)\", hover_data=[\"timestamp\"],)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "id": "x6PXwr8FzaMm", + "outputId": "ac6c5bfc-fc88-4622-8730-465d7cfcb428" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# https://plotly.com/python-api-reference/generated/plotly.express.violin.html\n", + "import plotly.express as px\n", + "\n", + "#px.violin(df, y=\"fed\", points=\"all\", box=True, title=\"Distribution of Federal Funds Rate (Monthly)\", hover_data=[\"timestamp\"])\n", + "px.violin(df, x=\"fed\", orientation=\"h\", points=\"all\", box=True, title=\"Distribution of Federal Funds Rate (Monthly)\", hover_data=[\"timestamp\"])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "id": "NdNSM9bvG_e_", + "outputId": "7d195112-bf61-4f14-d9c7-ffebbf4c0f9f" + }, + "execution_count": 29, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "# https://plotly.com/python-api-reference/generated/plotly.express.histogram.html\n", + "px.histogram(df, x=\"fed\", #nbins=12,\n", + " title=\"Distribution of Federal Funds Rate (Monthly)\", height=350)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "fgWZ7qI9HkVT", + "outputId": "f11acf1b-f3ed-4f14-8296-6d34fa8c0079" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Looks like the recent higher funds rates are potential outliers. It is hard to say for sure if this data is normally distributed, or whether it is too skewed by the outliers." + ], + "metadata": { + "id": "AMAO4wgmMaOr" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Statistical Tests with `Scipy`" + ], + "metadata": { + "id": "hb2sTkQ7O0yn" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "We can use the Scipy package to perform basic statistical tests.\n", + "\n", + "https://pypi.org/project/scipy/\n" + ], + "metadata": { + "id": "JHvB8k7_O2Sj" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Normality Tests" + ], + "metadata": { + "id": "cPRKcdAokgFj" + } + }, + { + "cell_type": "markdown", + "source": [ + "We can conduct a normality test to see if a given distribution is normally distributed.\n", + "\n", + "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html\n", + "\n", + "> This function tests the null hypothesis that a sample comes from a normal distribution.\n", + ">\n", + "> If the p-value is \"small\" - that is, if there is a low probability of sampling data from a normally distributed population that produces such an extreme value of the statistic - this may be taken as evidence against the null hypothesis in favor of the alternative: the weights were not drawn from a normal distribution." + ], + "metadata": { + "id": "F23JqH6NkY3g" + } + }, + { + "cell_type": "code", + "source": [ + "from scipy.stats import normaltest\n", + "\n", + "x = df[\"fed\"]\n", + "\n", + "result = normaltest(x)\n", + "print(result)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GWYQyC_LlgZD", + "outputId": "7089492c-524d-4bfc-ee99-be727658aca3" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "NormaltestResult(statistic=34.68795952886342, pvalue=2.9349809995776456e-08)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Interpreting the results.\n", + "\n", + "https://support.minitab.com/en-us/minitab/21/help-and-how-to/statistics/basic-statistics/how-to/normality-test/interpret-the-results/key-results/\n", + "\n", + "> To determine whether the data do not follow a normal distribution, compare the p-value to the significance level. Usually, a significance level (denoted as α or alpha) of 0.05 works well. A significance level of 0.05 indicates a 5% risk of concluding that the data do not follow a normal distribution when the data do follow a normal distribution.\n", + ">\n", + "> P-value ≤ α: The data do not follow a normal distribution (Reject H0)\n", + "> If the p-value is less than or equal to the significance level, the decision is to reject the null hypothesis and conclude that your data do not follow a normal distribution.\n", + ">\n", + "> P-value > α: You cannot conclude that the data do not follow a normal distribution (Fail to reject H0). If the p-value is larger than the significance level, the decision is to fail to reject the null hypothesis. You do not have enough evidence to conclude that your data do not follow a normal distribution." 
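As a hedged aside (not used in the rest of this notebook), scipy provides other normality tests that follow the same pattern, for example the Shapiro-Wilk test. A minimal sketch, assuming `x` is the same `fed` series passed to `normaltest` above:

```python
# Minimal sketch: Shapiro-Wilk as an alternative normality check.
# Assumption: x is the same pandas Series tested with normaltest above.
from scipy.stats import shapiro

shapiro_result = shapiro(x)
print(shapiro_result)

if shapiro_result.pvalue <= 0.05:
    print("REJECT (NOT NORMAL)")
else:
    print("NOT ABLE TO REJECT (COULD BE NORMAL)")
```

The decision rule is the same one described in the quote above: compare the p-value to the chosen significance level.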
+ ], + "metadata": { + "id": "qapm_Mybn02j" + } + }, + { + "cell_type": "code", + "source": [ + "if result.pvalue <= 0.05:\n", + " print(\"REJECT (NOT NORMAL)\")\n", + "else:\n", + " print(\"NOT ABLE TO REJECT (COULD BE NORMAL)\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-01Mdq-mlgPX", + "outputId": "aac4fe80-cf5c-4ee1-c6ec-277f9559e71c" + }, + "execution_count": 32, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "REJECT (NOT NORMAL)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Looks like the federal fuds rate does not have a normal distribution (as this notebook was run on June 28th 2024).\n", + "\n", + "How about the market?" + ], + "metadata": { + "id": "r2Nq-ksbxNEY" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "x = df[\"spy\"]\n", + "\n", + "result = normaltest(x)\n", + "print(result)\n", + "\n", + "if result.pvalue <= 0.05:\n", + " print(\"REJECT (NOT NORMAL)\")\n", + "else:\n", + " print(\"NOT ABLE TO REJECT (COULD BE NORMAL)\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zq28-19x73Ze", + "outputId": "ad8d43fa-5d8b-4b7f-a43d-59627b824b5a" + }, + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "NormaltestResult(statistic=27.560328618235523, pvalue=1.0359783530157106e-06)\n", + "REJECT (NOT NORMAL)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## T-Tests" + ], + "metadata": { + "id": "f9JGmHe8kenN" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "https://www.investopedia.com/terms/t/t-test.asp\n", + "\n", + "> A t-test is an inferential statistic used to determine if there is a significant difference between the means of two groups and how they are related. T-tests are used when the data sets follow a normal distribution and have unknown variances, like the data set recorded from flipping a coin 100 times." + ], + "metadata": { + "id": "0FkY4F7Zk1mg" + } + }, + { + "cell_type": "markdown", + "source": [ + "### T-Test Considerations\n", + "\n", + "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6676026/#sec-2title\n", + "\n", + "In order to conduct a T-Test, the data needs to be normally distributed. So the examples below may not be the most methodologically sound. However they should provide code examples you can adapt for other use cases in the future." + ], + "metadata": { + "id": "LHCRfTzd0g65" + } + }, + { + "cell_type": "markdown", + "source": [ + "### 2 Sample T-Test" + ], + "metadata": { + "id": "VTfjJ2Gx9Jqk" + } + }, + { + "cell_type": "markdown", + "source": [ + "A two sample T-test is used to determine whether two independent samples come from the same distribution.\n", + "\n", + "\n", + "Let's split the most recent year's rates from the rest. And see if the most recent years are statistically different." 
+ ], + "metadata": { + "id": "xnYpF2C31Imu" + } + }, + { + "cell_type": "code", + "source": [ + "\n", + "#cutoff_date = \"2022-06-01\" # you can chose a different one if you'd like\n", + "cutoff_date = \"2022-10-01\"\n", + "\n", + "rates_recent = df[df[\"timestamp\"] >= cutoff_date][\"fed\"]\n", + "print(len(rates_recent))\n", + "print(rates_recent)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nw7TfPlAPE0i", + "outputId": "b810cab1-9e07-4f8c-bdb9-617be7e6dbe7" + }, + "execution_count": 53, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "20\n", + "0 5.33\n", + "1 5.33\n", + "2 5.33\n", + "3 5.33\n", + "4 5.33\n", + "5 5.33\n", + "6 5.33\n", + "7 5.33\n", + "8 5.33\n", + "9 5.33\n", + "10 5.12\n", + "11 5.08\n", + "12 5.06\n", + "13 4.83\n", + "14 4.65\n", + "15 4.57\n", + "16 4.33\n", + "17 4.10\n", + "18 3.78\n", + "19 3.08\n", + "Name: fed, dtype: float64\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "rates_historic = df[df[\"timestamp\"] < cutoff_date][\"fed\"]\n", + "print(len(rates_historic))\n", + "print(rates_historic)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pOUT50wEaDI-", + "outputId": "54864fe4-4708-495c-919e-60cae4bd52f7" + }, + "execution_count": 54, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "214\n", + "20 2.56\n", + "21 2.33\n", + "22 1.68\n", + "23 1.21\n", + "24 0.77\n", + " ... \n", + "229 2.79\n", + "230 2.63\n", + "231 2.50\n", + "232 2.28\n", + "233 2.16\n", + "Name: fed, Length: 214, dtype: float64\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html\n", + "\n", + "> Calculate the T-test for the means of two independent samples of scores.\n", + ">\n", + "> This is a test for the null hypothesis that 2 independent samples have identical average (expected) values. This test assumes that the populations have identical variances by default.\n", + ">\n", + "> The t-test quantifies the difference between the arithmetic means of the two samples. The p-value quantifies the probability of observing as or more extreme values assuming the null hypothesis, that the samples are drawn from populations with the same population means, is true. A p-value larger than a chosen threshold (e.g. 5% or 1%) indicates that our observation is not so unlikely to have occurred by chance. Therefore, we do not reject the null hypothesis of equal population means. If the p-value is smaller than our threshold, then we have evidence against the null hypothesis of equal population means." 
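As the quote notes, `ttest_ind` assumes identical population variances by default. The two samples here turn out to have quite different variances (see the variance check below), so a hedged alternative worth knowing is Welch's t-test, requested with `equal_var=False`. A minimal sketch, assuming `rates_recent` and `rates_historic` are the two samples split above; the notebook itself proceeds with the default settings:

```python
# Minimal sketch: Welch's t-test, which does not assume equal population variances.
# Assumption: rates_recent and rates_historic are the two pandas Series defined above.
from scipy.stats import ttest_ind

welch_result = ttest_ind(rates_recent, rates_historic, equal_var=False)
print(welch_result)

if welch_result.pvalue <= 0.05:
    print("REJECT (MEANS NOT THE SAME)")
else:
    print("NOT ABLE TO REJECT (MEANS COULD BE THE SAME)")
```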
+ ], + "metadata": { + "id": "R3malw701tHd" + } + }, + { + "cell_type": "code", + "source": [ + "print(rates_recent.var())\n", + "print(rates_historic.var())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UTr2JoM93cvq", + "outputId": "5428b926-32d4-4b18-dde3-0e7173c60d37" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.4033105263157895\n", + "2.7065506493791407\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from scipy.stats import ttest_ind\n", + "\n", + "result = ttest_ind(rates_recent, rates_historic)\n", + "print(result)\n", + "\n", + "if result.pvalue <= 0.05:\n", + " print(\"REJECT (MEANS NOT THE SAME)\")\n", + "else:\n", + " print(\"NOT ABLE TO REJECT (MEANS COULD BE THE SAME)\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "V2n3UDRt33uU", + "outputId": "760caf9b-c335-4d71-f8e4-b8b96f734059" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "TtestResult(statistic=9.743816217891522, pvalue=5.021356895595338e-19, df=232.0)\n", + "REJECT (MEANS NOT THE SAME)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### 1 Sample T-Test" + ], + "metadata": { + "id": "hIwURKVT9E5H" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_1samp.html\n", + "\n", + "\n", + "> Calculate the T-test for the mean of ONE group of scores.\n", + ">\n", + "> This is a test for the null hypothesis that the expected value (mean) of a sample of independent observations a is equal to the given population mean, popmean.\n", + ">\n", + "> Under certain assumptions about the population from which a sample is drawn, the confidence interval with confidence level 95% is expected to contain the true population mean in 95% of sample replications." 
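To make the mechanics concrete: the one-sample statistic is the distance between the sample mean and the hypothesized mean, measured in standard errors. A rough sketch of that calculation (not scipy's implementation), assuming `x` is a pandas Series of observations and `popmean` is the hypothesized population mean:

```python
# Rough sketch of the statistic behind a one-sample t-test (not scipy's own code).
# Assumption: x is a pandas Series of observations, popmean is the hypothesized mean.
from math import sqrt

def one_sample_t_statistic(x, popmean):
    n = len(x)
    sample_mean = x.mean()
    sample_std = x.std(ddof=1)             # sample standard deviation
    standard_error = sample_std / sqrt(n)  # standard error of the mean
    return (sample_mean - popmean) / standard_error
```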
+ ], + "metadata": { + "id": "zsyC_OHb21MR" + } + }, + { + "cell_type": "markdown", + "source": [ + "Suppose we wish to test the null hypothesis that the mean of the fed funds rates is equal to 2.5%.\n" + ], + "metadata": { + "id": "fo-RJclB6PXh" + } + }, + { + "cell_type": "code", + "source": [ + "from scipy.stats import ttest_1samp\n", + "\n", + "x = df[\"fed\"]\n", + "print(x.mean())\n", + "\n", + "popmean = 2.5 # for example\n", + "result = ttest_1samp(x, popmean=popmean)\n", + "print(result)\n", + "\n", + "if result.pvalue <= 0.05:\n", + " print(\"REJECT (MEAN NOT EQUAL TO POPMEAN)\")\n", + "else:\n", + " print(\"NOT ABLE TO REJECT (MEAN COULT BE EQUAL TO POPMEAN)\")\n", + "\n", + "ci = result.confidence_interval(confidence_level=0.95)\n", + "print(ci)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "J5HWDSOtPFX3", + "outputId": "5eaa649f-0654-4064-fa78-4f2d2b9cc83a" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1.5887606837606836\n", + "TtestResult(statistic=-7.415864219982758, pvalue=2.2306437030862214e-12, df=233)\n", + "REJECT (MEAN NOT EQUAL TO POPMEAN)\n", + "ConfidenceInterval(low=1.346668668631088, high=1.8308526988902791)\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/docs/notes/applied-stats/basic_tests.py b/docs/notes/applied-stats/basic_tests.py new file mode 100644 index 0000000..d65b32d --- /dev/null +++ b/docs/notes/applied-stats/basic_tests.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- +"""Basic Statistics Overview + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1A-RKDqX_l3C87eFt73m2jkmLiAt-mg9V + +# Basic Summary Statistics +""" + +from pandas import read_csv + +df = read_csv("https://raw.githubusercontent.com/prof-rossetti/python-for-finance/main/docs/data/monthly-indicators.csv") +df.head() + +print(len(df)) +print(df["timestamp"].min(), "...", df["timestamp"].max()) + +"""https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html + +We can use the describe method to quickly see the basic summary statistics for each column: +""" + +df.describe() + +"""As you may be aware, we can calculate these individually, using `Series` aggregations:""" + +# https://pandas.pydata.org/docs/reference/api/pandas.Series.html +# https://pandas.pydata.org/docs/reference/api/pandas.Series.quantile.html + +series = df["fed"] + +print("COUNT:", len(series)) +print("MEAN:", series.mean().round(6)) +print("STD:", series.std().round(6)) +print("-------------") +print("MIN:", series.min()) +print("25TH:", series.quantile(.25)) +print("MED:", series.median()) +print("75TH:", series.quantile(.75)) +print("MAX:", series.max()) + +series.describe() # for comparison + +"""## Distribution Plots + +Let's view some distribution plots of the federal funds rate, to tell a story about the summary statistics for this indicator. 
+""" + +import plotly.express as px + +px.box(df, x="fed", orientation="h", points="all", title="Distribution of Federal Funds Rate (Monthly)", hover_data=["timestamp"],) + +# https://plotly.com/python-api-reference/generated/plotly.express.violin.html +import plotly.express as px + +#px.violin(df, y="fed", points="all", box=True, title="Distribution of Federal Funds Rate (Monthly)", hover_data=["timestamp"]) +px.violin(df, x="fed", orientation="h", points="all", box=True, title="Distribution of Federal Funds Rate (Monthly)", hover_data=["timestamp"]) + +# https://plotly.com/python-api-reference/generated/plotly.express.histogram.html +px.histogram(df, x="fed", #nbins=12, + title="Distribution of Federal Funds Rate (Monthly)", height=350) + +"""Looks like the recent higher funds rates are potential outliers. It is hard to say for sure if this data is normally distributed, or whether it is too skewed by the outliers. + +# Statistical Tests with `Scipy` + +We can use the Scipy package to perform basic statistical tests. + +https://pypi.org/project/scipy/ + +## Normality Tests + +We can conduct a normality test to see if a given distribution is normally distributed. + +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html + +> This function tests the null hypothesis that a sample comes from a normal distribution. +> +> If the p-value is "small" - that is, if there is a low probability of sampling data from a normally distributed population that produces such an extreme value of the statistic - this may be taken as evidence against the null hypothesis in favor of the alternative: the weights were not drawn from a normal distribution. +""" + +from scipy.stats import normaltest + +x = df["fed"] + +result = normaltest(x) +print(result) + +"""Interpreting the results. + +https://support.minitab.com/en-us/minitab/21/help-and-how-to/statistics/basic-statistics/how-to/normality-test/interpret-the-results/key-results/ + +> To determine whether the data do not follow a normal distribution, compare the p-value to the significance level. Usually, a significance level (denoted as α or alpha) of 0.05 works well. A significance level of 0.05 indicates a 5% risk of concluding that the data do not follow a normal distribution when the data do follow a normal distribution. +> +> P-value ≤ α: The data do not follow a normal distribution (Reject H0) +> If the p-value is less than or equal to the significance level, the decision is to reject the null hypothesis and conclude that your data do not follow a normal distribution. +> +> P-value > α: You cannot conclude that the data do not follow a normal distribution (Fail to reject H0). If the p-value is larger than the significance level, the decision is to fail to reject the null hypothesis. You do not have enough evidence to conclude that your data do not follow a normal distribution. +""" + +if result.pvalue <= 0.05: + print("REJECT (NOT NORMAL)") +else: + print("NOT ABLE TO REJECT (COULD BE NORMAL)") + +"""Looks like the federal fuds rate does not have a normal distribution (as this notebook was run on June 28th 2024). + +How about the market? 
+""" + +x = df["spy"] + +result = normaltest(x) +print(result) + +if result.pvalue <= 0.05: + print("REJECT (NOT NORMAL)") +else: + print("NOT ABLE TO REJECT (COULD BE NORMAL)") + +"""## T-Tests + +https://www.investopedia.com/terms/t/t-test.asp + +> A t-test is an inferential statistic used to determine if there is a significant difference between the means of two groups and how they are related. T-tests are used when the data sets follow a normal distribution and have unknown variances, like the data set recorded from flipping a coin 100 times. + +### T-Test Considerations + +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6676026/#sec-2title + +In order to conduct a T-Test, the data needs to be normally distributed. So the examples below may not be the most methodologically sound. However they should provide code examples you can adapt for other use cases in the future. + +### 2 Sample T-Test + +A two sample T-test is used to determine whether two independent samples come from the same distribution. + + +Let's split the most recent year's rates from the rest. And see if the most recent years are statistically different. +""" + +#cutoff_date = "2022-06-01" # you can chose a different one if you'd like +cutoff_date = "2022-10-01" + +rates_recent = df[df["timestamp"] >= cutoff_date]["fed"] +print(len(rates_recent)) +print(rates_recent) + +rates_historic = df[df["timestamp"] < cutoff_date]["fed"] +print(len(rates_historic)) +print(rates_historic) + +"""https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html + +> Calculate the T-test for the means of two independent samples of scores. +> +> This is a test for the null hypothesis that 2 independent samples have identical average (expected) values. This test assumes that the populations have identical variances by default. +> +> The t-test quantifies the difference between the arithmetic means of the two samples. The p-value quantifies the probability of observing as or more extreme values assuming the null hypothesis, that the samples are drawn from populations with the same population means, is true. A p-value larger than a chosen threshold (e.g. 5% or 1%) indicates that our observation is not so unlikely to have occurred by chance. Therefore, we do not reject the null hypothesis of equal population means. If the p-value is smaller than our threshold, then we have evidence against the null hypothesis of equal population means. +""" + +print(rates_recent.var()) +print(rates_historic.var()) + +from scipy.stats import ttest_ind + +result = ttest_ind(rates_recent, rates_historic) +print(result) + +if result.pvalue <= 0.05: + print("REJECT (MEANS NOT THE SAME)") +else: + print("NOT ABLE TO REJECT (MEANS COULD BE THE SAME)") + +"""### 1 Sample T-Test + +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_1samp.html + + +> Calculate the T-test for the mean of ONE group of scores. +> +> This is a test for the null hypothesis that the expected value (mean) of a sample of independent observations a is equal to the given population mean, popmean. +> +> Under certain assumptions about the population from which a sample is drawn, the confidence interval with confidence level 95% is expected to contain the true population mean in 95% of sample replications. + +Suppose we wish to test the null hypothesis that the mean of the fed funds rates is equal to 2.5%. 
+""" + +from scipy.stats import ttest_1samp + +x = df["fed"] +print(x.mean()) + +popmean = 2.5 # for example +result = ttest_1samp(x, popmean=popmean) +print(result) + +if result.pvalue <= 0.05: + print("REJECT (MEAN NOT EQUAL TO POPMEAN)") +else: + print("NOT ABLE TO REJECT (MEAN COULT BE EQUAL TO POPMEAN)") + +ci = result.confidence_interval(confidence_level=0.95) +print(ci) \ No newline at end of file diff --git a/docs/notes/applied-stats/correlation.ipynb b/docs/notes/applied-stats/correlation.ipynb new file mode 100644 index 0000000..86d9fda --- /dev/null +++ b/docs/notes/applied-stats/correlation.ipynb @@ -0,0 +1,1424 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Correlation" + ], + "metadata": { + "id": "IXoPRAfSXiNb" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "Let's revisit our dataset of economic indicators.\n", + "\n", + "We will focus on correlation, and determining which of these indicators may be positively or negatively correlated with eachother. This will allow us to answer questions like, \"is gold a good hedge against inflation?\"." + ], + "metadata": { + "id": "WJDXvggrpqav" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "df = read_csv(\"https://raw.githubusercontent.com/prof-rossetti/python-for-finance/main/docs/data/monthly-indicators.csv\")\n", + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "TRqYcMkMURas", + "outputId": "e1f9b690-2483-42cf-dee7-5f91fdbb6e28" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp cpi fed spy gld\n", + "0 2024-05-01 314.069 5.33 525.6718 215.30\n", + "1 2024-04-01 313.548 5.33 500.3636 211.87\n", + "2 2024-03-01 312.332 5.33 521.3857 205.72\n", + "3 2024-02-01 310.326 5.33 504.8645 189.31\n", + "4 2024-01-01 308.417 5.33 479.8240 188.45" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "df", + "summary": "{\n \"name\": \"df\",\n \"rows\": 234,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 234,\n \"samples\": [\n \"2018-08-01\",\n \"2007-03-01\",\n \"2009-05-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"cpi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 30.29922027086648,\n \"min\": 190.3,\n \"max\": 314.069,\n \"num_unique_values\": 231,\n \"samples\": [\n 196.8,\n 252.038,\n 307.026\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1.8796570098480136,\n \"min\": 0.05,\n \"max\": 5.33,\n \"num_unique_values\": 106,\n \"samples\": [\n 3.04,\n 3.08,\n 4.83\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 122.94739811506473,\n \"min\": 55.1488,\n \"max\": 525.6718,\n \"num_unique_values\": 233,\n \"samples\": [\n 213.6153,\n 92.7153,\n 81.5622\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40.170234378613976,\n \"min\": 41.65,\n \"max\": 215.3,\n \"num_unique_values\": 229,\n \"samples\": [\n 56.7,\n 115.15,\n 180.02\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(len(df))\n", + "print(df[\"timestamp\"].min(), \"...\", df[\"timestamp\"].max())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "w8UKMi9WUasL", + "outputId": "709b7098-a42c-40ee-e12d-ad7a4789afed" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "234\n", + "2004-12-01 ... 2024-05-01\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "The primary reason why we fetched all these different datasets and merged them together, is so we can explore the correlation between them." + ], + "metadata": { + "id": "WvOcYANIOceH" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Correlation** is a measure of how two datasets are related to eachother.\n" + ], + "metadata": { + "id": "y7JKi-_ohF37" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://www.investopedia.com/terms/c/correlation.asp\n", + "\n", + "\n", + "\n", + "\n", + "> Investment managers, traders, and analysts find it very important to calculate correlation because the risk reduction benefits of diversification rely on this statistic." + ], + "metadata": { + "id": "W_K7rK_TVDHr" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's take a quick detour to make a scaled version of this data, to make it easier to plot all these different series on a graph, so we can perhaps start to get a sense of how their movements might correlate (in an unofficial way)." 
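The next cell scales each column by dividing it by that column's maximum. As a hedged aside, there are many alternative ways to put the series on a comparable scale; one common option is min-max scaling to the [0, 1] range, sketched here assuming `df` is the indicators DataFrame loaded above:

```python
# Hypothetical alternative: min-max scaling of each numeric column to the [0, 1] range.
# Assumption: df is the indicators DataFrame loaded above, with a "timestamp" column.
numeric_df = df.drop(columns=["timestamp"])
minmax_df = (numeric_df - numeric_df.min()) / (numeric_df.max() - numeric_df.min())
minmax_df.index = df["timestamp"]  # keep timestamps for charting
```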
+ ], + "metadata": { + "id": "4AxAx5WzSojE" + } + }, + { + "cell_type": "code", + "source": [ + "scaled_df = df.copy()\n", + "scaled_df.index = df[\"timestamp\"] # save the ts for charting, knowing we will remove it\n", + "scaled_df.drop(columns=[\"timestamp\"], inplace=True) # remove the ts column, in preparation to operate on all numeric columns\n", + "scaled_df = scaled_df / scaled_df.max() # dividing all numeric col values by their column's max. there are many alternative methods for scaling the data\n", + "scaled_df.head()\n", + "\n", + "import plotly.express as px\n", + "px.line(scaled_df, y=[\"cpi\", \"fed\", \"spy\", \"gld\",\n", + " #\"btc\"\n", + " ],\n", + " title=\"Scaled data over time\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 542 + }, + "id": "jYW5YzrbRDks", + "outputId": "0a055557-f6c0-4161-fd91-1951281df652" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Looks like the [...] has been moving [upward/downward] at a time when [...] has been moving [upward/downward]. We might start to suspect they are correlated in a [pos/neg] way.\n", + "\n", + "> NOTE: correlation does not imply causation!" + ], + "metadata": { + "id": "DfJbRoy6TO_T" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's now perform tests for correlation in more official / formal ways.\n", + "\n" + ], + "metadata": { + "id": "FbVefIAfUjKx" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Correlation Considerations" + ], + "metadata": { + "id": "02Y6JN77j0ti" + } + }, + { + "cell_type": "markdown", + "source": [ + "Certain methods for calculating correlation may depend on the normality of our data's distribution, or the sample size, so we should keep these in mind as we determine if we are able to calculate correlation, and which method to use.\n" + ], + "metadata": { + "id": "b3iuo-AIjDqZ" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "https://www.investopedia.com/terms/n/nonparametric-method.asp\n", + "\n", + "\n", + "> The nonparametric method refers to a type of statistic that does not make any assumptions about the characteristics of the sample (its parameters) or whether the observed data is quantitative or qualitative.\n", + ">\n", + "> Nonparametric statistics can include certain descriptive statistics, statistical models, inference, and statistical tests. The model structure of nonparametric methods is not specified a priori but is instead determined from data.\n", + ">\n", + "> Common nonparametric tests include Chi-Square, Wilcoxon rank-sum test, Kruskal-Wallis test, and Spearman's rank-order correlation.\n", + ">\n", + "> In contrast, well-known statistical methods such as ANOVA, Pearson's correlation, t-test, and others do make assumptions about the data being analyzed. One of the most common parametric assumptions is that population data have a \"normal distribution.\"\n" + ], + "metadata": { + "id": "OMFsf-9-iqS9" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Correlation with `scipy`" + ], + "metadata": { + "id": "Tmhw-IlgWWVY" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "We can always calculate correlation between two lists of numbers, using the `pearsonr` and `spearmanr` functions from the `scipy` package.\n", + "\n", + "One difference between these two correlation methods is that Spearman is more robust to (i.e. less affected by) outliers. Also being nonparametric, the Spearman method does not assume our data is normally distributed.\n" + ], + "metadata": { + "id": "Z6WfgjKSWjyX" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html\n", + "\n", + "> Pearson correlation coefficient and p-value for testing non-correlation.\n", + ">\n", + "> The Pearson correlation coefficient [1] measures the linear relationship between two datasets. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.\n", + ">\n", + "> This function also performs a test of the null hypothesis that the distributions underlying the samples are uncorrelated and normally distributed. 
(See Kowalski [3] for a discussion of the effects of non-normality of the input on the distribution of the correlation coefficient.) The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets." + ], + "metadata": { + "id": "Qm0kJFOiQeM1" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html\n", + "\n", + "> Calculate a Spearman correlation coefficient with associated p-value.\n", + ">\n", + "> The Spearman rank-order correlation coefficient is a nonparametric measure of the monotonicity of the relationship between two datasets. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact monotonic relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases.\n", + ">\n", + "> The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Spearman correlation at least as extreme as the one computed from these datasets. Although calculation of the p-value does not make strong assumptions about the distributions underlying the samples, it is only accurate for very large samples (>500 observations). For smaller sample sizes, consider a permutation test instead (see docs for examples)." + ], + "metadata": { + "id": "JB0RbCo1UqtD" + } + }, + { + "cell_type": "code", + "source": [ + "from scipy.stats import pearsonr\n", + "\n", + "x = df[\"fed\"]\n", + "y = df[\"spy\"]\n", + "\n", + "result = pearsonr(x, y)\n", + "print(result)" + ], + "metadata": { + "id": "Nz5EsO5QWY76", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "a1457efb-7f6e-4c57-8e3e-2bc5a5ffc46e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "PearsonRResult(statistic=0.17282057382978896, pvalue=0.008062179433931187)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from scipy.stats import spearmanr\n", + "\n", + "x = df[\"fed\"]\n", + "y = df[\"spy\"]\n", + "\n", + "result = spearmanr(x, y)\n", + "print(result)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fKnyEs4aQguI", + "outputId": "1b8af722-2acc-487b-ac5d-9abd0d21c914" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "SignificanceResult(statistic=0.005936198901328186, pvalue=0.9280322090398303)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Correlation Matrix with `pandas`" + ], + "metadata": { + "id": "eqvrD4BYWUMs" + } + }, + { + "cell_type": "markdown", + "source": [ + "OK sure we can calculate correlation between two sets of data. But what if we wanted to calculate correlation between many different data sets? 
We could perhaps set up a loop, but there is an easier way.\n", + "\n", + "If we have a pandas dataframe, we can use it's `corr()` method to produce a \"correlation matrix\", which shows us the \"pairwise correlation of columns\", in other words, the correlation of each column with respect to each other column.\n", + "\n", + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html\n", + "\n" + ], + "metadata": { + "id": "QeUeVEI4VBwB" + } + }, + { + "cell_type": "code", + "source": [ + "#df.corr(method=\"pearson\") # method is pearson by default\n", + "df.corr(method=\"pearson\", numeric_only=True) # numeric_only to suppress warning" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "weE3kA5NcjR5", + "outputId": "5e4ab037-5b93-4e72-eb1e-448d3325d6e7" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " cpi fed spy gld\n", + "cpi 1.000000 0.078102 0.949065 0.823717\n", + "fed 0.078102 1.000000 0.172821 -0.263213\n", + "spy 0.949065 0.172821 1.000000 0.719160\n", + "gld 0.823717 -0.263213 0.719160 1.000000" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"cpi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4295147697055192,\n \"min\": 0.07810235316943387,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.07810235316943387,\n 0.8237168740513675,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5358342284057406,\n \"min\": -0.263212680762633,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 1.0,\n -0.263212680762633,\n 0.07810235316943387\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.37854882497720416,\n \"min\": 0.1728205738297885,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.1728205738297885,\n 0.7191601809353221,\n 0.9490654041714747\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5673812225119731,\n \"min\": -0.263212680762633,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.263212680762633,\n 1.0,\n 0.8237168740513675\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 42 + } + ] + }, + { + "cell_type": "code", + "source": [ + "#df.corr(method=\"spearman\")\n", + "df.corr(method=\"spearman\", numeric_only=True) # numeric_only to suppress warning" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "0LxOOqZ1dknJ", + "outputId": "56265273-7931-4c60-fa48-ea2729b94188" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " cpi fed spy gld\n", + "cpi 1.000000 -0.102732 0.953588 0.790661\n", + "fed -0.102732 1.000000 0.005936 -0.308626\n", + "spy 0.953588 0.005936 1.000000 0.714306\n", + "gld 0.790661 -0.308626 0.714306 1.000000" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "summary": "{\n \"name\": \"df\",\n \"rows\": 4,\n \"fields\": [\n {\n \"column\": \"cpi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5165997176052513,\n \"min\": -0.10273218370450073,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.10273218370450073,\n 0.7906610391016119,\n 1.0\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5823683713456621,\n \"min\": -0.3086264001658012,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 1.0,\n -0.3086264001658012,\n -0.10273218370450073\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.4590773818179508,\n \"min\": 0.005936198901328188,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n 0.005936198901328188,\n 0.7143055161417022,\n 0.9535876162464759\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.5844227878347225,\n \"min\": -0.3086264001658012,\n \"max\": 1.0,\n \"num_unique_values\": 4,\n \"samples\": [\n -0.3086264001658012,\n 1.0,\n 0.7906610391016119\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 43 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We may begin to notice the diagonal of 1s values. This is because each dataset is perfectly positively correlated with itself.\n", + "\n", + "We may also start to notice the symmetry of values mirrored across the diagonal. In other words, the value in column 1, row 4 is the same as the value in column 4, row 1." + ], + "metadata": { + "id": "PXqd16CIzQpN" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Plotting Correlation Matrix" + ], + "metadata": { + "id": "_eCIjPrWVm11" + } + }, + { + "cell_type": "markdown", + "source": [ + "It may not be easy to quickly interpret the rest of the values in the correlation matrix, but if we plot it with colors as a \"heat map\" then we will be able to use color to more easily interpret the data and tell a story." 
+ ], + "metadata": { + "id": "CnbhJJ5gVdvq" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Correlation Heatmap with `plotly`" + ], + "metadata": { + "id": "asvEa2V9fEZm" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://plotly.com/python-api-reference/generated/plotly.express.imshow.html" + ], + "metadata": { + "id": "Mb8wY3D82iak" + } + }, + { + "cell_type": "code", + "source": [ + "# https://plotly.com/python/heatmaps/\n", + "# https://plotly.com/python-api-reference/generated/plotly.express.imshow.html\n", + "import plotly.express as px\n", + "\n", + "cor_mat = df.corr(method=\"spearman\", numeric_only=True) # using numeric_only to suppress warning\n", + "\n", + "title= \"Spearman Correlation between Economic Indicators\"\n", + "fig = px.imshow(cor_mat,\n", + " height=600, # title=title,\n", + " text_auto= \".2f\", # round to two decimal places\n", + " color_continuous_scale=\"Blues\",\n", + " color_continuous_midpoint=0, # set color midpoint at zero because correlation coeficient ranges from -1 to 1 (see correlation notes)\n", + " labels={\"x\": \"Indicator\", \"y\": \"Indicator\"}\n", + ")\n", + "fig.update_layout(title={'text': title, 'x':0.485, 'xanchor': 'center'}) # https://stackoverflow.com/questions/64571789/center-plotly-title-by-default\n", + "fig.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 617 + }, + "id": "z41ZxGpMe2S8", + "outputId": "ff96425e-f5cc-448f-b4d7-deccac060c4e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "What stories can we tell with the correlation heatmap? Which indicators are most positively correlated? Which are most negatively correlated?\n", + "\n", + "Is gold a hedge against inflation, or is there another indicator which may be a better hedge?\n" + ], + "metadata": { + "id": "_pL3AkGotQXk" + } + } + ] +} \ No newline at end of file diff --git a/docs/notes/applied-stats/correlation.py b/docs/notes/applied-stats/correlation.py new file mode 100644 index 0000000..b56bc04 --- /dev/null +++ b/docs/notes/applied-stats/correlation.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +"""Correlation + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1Vwvi_d6wEw9ti82ufoilSfe7QFAyPlvx + +# Correlation + +Let's revisit our dataset of economic indicators. + +We will focus on correlation, and determining which of these indicators may be positively or negatively correlated with eachother. This will allow us to answer questions like, "is gold a good hedge against inflation?". +""" + +from pandas import read_csv + +df = read_csv("https://raw.githubusercontent.com/prof-rossetti/python-for-finance/main/docs/data/monthly-indicators.csv") +df.head() + +print(len(df)) +print(df["timestamp"].min(), "...", df["timestamp"].max()) + +"""The primary reason why we fetched all these different datasets and merged them together, is so we can explore the correlation between them. + +**Correlation** is a measure of how two datasets are related to eachother. + +https://www.investopedia.com/terms/c/correlation.asp + + + + +> Investment managers, traders, and analysts find it very important to calculate correlation because the risk reduction benefits of diversification rely on this statistic. + +Let's take a quick detour to make a scaled version of this data, to make it easier to plot all these different series on a graph, so we can perhaps start to get a sense of how their movements might correlate (in an unofficial way). +""" + +scaled_df = df.copy() +scaled_df.index = df["timestamp"] # save the ts for charting, knowing we will remove it +scaled_df.drop(columns=["timestamp"], inplace=True) # remove the ts column, in preparation to operate on all numeric columns +scaled_df = scaled_df / scaled_df.max() # dividing all numeric col values by their column's max. there are many alternative methods for scaling the data +scaled_df.head() + +import plotly.express as px +px.line(scaled_df, y=["cpi", "fed", "spy", "gld", + #"btc" + ], + title="Scaled data over time") + +"""Looks like the [...] has been moving [upward/downward] at a time when [...] has been moving [upward/downward]. We might start to suspect they are correlated in a [pos/neg] way. + +> NOTE: correlation does not imply causation! + +Let's now perform tests for correlation in more official / formal ways. + +## Correlation Considerations + +Certain methods for calculating correlation may depend on the normality of our data's distribution, or the sample size, so we should keep these in mind as we determine if we are able to calculate correlation, and which method to use. + +https://www.investopedia.com/terms/n/nonparametric-method.asp + + +> The nonparametric method refers to a type of statistic that does not make any assumptions about the characteristics of the sample (its parameters) or whether the observed data is quantitative or qualitative. 
+> +> Nonparametric statistics can include certain descriptive statistics, statistical models, inference, and statistical tests. The model structure of nonparametric methods is not specified a priori but is instead determined from data. +> +> Common nonparametric tests include Chi-Square, Wilcoxon rank-sum test, Kruskal-Wallis test, and Spearman's rank-order correlation. +> +> In contrast, well-known statistical methods such as ANOVA, Pearson's correlation, t-test, and others do make assumptions about the data being analyzed. One of the most common parametric assumptions is that population data have a "normal distribution." + +## Correlation with `scipy` + +We can always calculate correlation between two lists of numbers, using the `pearsonr` and `spearmanr` functions from the `scipy` package. + +One difference between these two correlation methods is that Spearman is more robust to (i.e. less affected by) outliers. Also being nonparametric, the Spearman method does not assume our data is normally distributed. + +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html + +> Pearson correlation coefficient and p-value for testing non-correlation. +> +> The Pearson correlation coefficient [1] measures the linear relationship between two datasets. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact linear relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases. +> +> This function also performs a test of the null hypothesis that the distributions underlying the samples are uncorrelated and normally distributed. (See Kowalski [3] for a discussion of the effects of non-normality of the input on the distribution of the correlation coefficient.) The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Pearson correlation at least as extreme as the one computed from these datasets. + +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html + +> Calculate a Spearman correlation coefficient with associated p-value. +> +> The Spearman rank-order correlation coefficient is a nonparametric measure of the monotonicity of the relationship between two datasets. Like other correlation coefficients, this one varies between -1 and +1 with 0 implying no correlation. Correlations of -1 or +1 imply an exact monotonic relationship. Positive correlations imply that as x increases, so does y. Negative correlations imply that as x increases, y decreases. +> +> The p-value roughly indicates the probability of an uncorrelated system producing datasets that have a Spearman correlation at least as extreme as the one computed from these datasets. Although calculation of the p-value does not make strong assumptions about the distributions underlying the samples, it is only accurate for very large samples (>500 observations). For smaller sample sizes, consider a permutation test instead (see docs for examples). +""" + +from scipy.stats import pearsonr + +x = df["fed"] +y = df["spy"] + +result = pearsonr(x, y) +print(result) + +from scipy.stats import spearmanr + +x = df["fed"] +y = df["spy"] + +result = spearmanr(x, y) +print(result) + +"""## Correlation Matrix with `pandas` + +OK sure we can calculate correlation between two sets of data. But what if we wanted to calculate correlation between many different data sets? 
We could perhaps set up a loop, but there is an easier way. + +If we have a pandas dataframe, we can use it's `corr()` method to produce a "correlation matrix", which shows us the "pairwise correlation of columns", in other words, the correlation of each column with respect to each other column. + +https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html +""" + +#df.corr(method="pearson") # method is pearson by default +df.corr(method="pearson", numeric_only=True) # numeric_only to suppress warning + +#df.corr(method="spearman") +df.corr(method="spearman", numeric_only=True) # numeric_only to suppress warning + +"""We may begin to notice the diagonal of 1s values. This is because each dataset is perfectly positively correlated with itself. + +We may also start to notice the symmetry of values mirrored across the diagonal. In other words, the value in column 1, row 4 is the same as the value in column 4, row 1. + +## Plotting Correlation Matrix + +It may not be easy to quickly interpret the rest of the values in the correlation matrix, but if we plot it with colors as a "heat map" then we will be able to use color to more easily interpret the data and tell a story. + +### Correlation Heatmap with `plotly` + +https://plotly.com/python-api-reference/generated/plotly.express.imshow.html +""" + +# https://plotly.com/python/heatmaps/ +# https://plotly.com/python-api-reference/generated/plotly.express.imshow.html +import plotly.express as px + +cor_mat = df.corr(method="spearman", numeric_only=True) # using numeric_only to suppress warning + +title= "Spearman Correlation between Economic Indicators" +fig = px.imshow(cor_mat, + height=600, # title=title, + text_auto= ".2f", # round to two decimal places + color_continuous_scale="Blues", + color_continuous_midpoint=0, # set color midpoint at zero because correlation coeficient ranges from -1 to 1 (see correlation notes) + labels={"x": "Indicator", "y": "Indicator"} +) +fig.update_layout(title={'text': title, 'x':0.485, 'xanchor': 'center'}) # https://stackoverflow.com/questions/64571789/center-plotly-title-by-default +fig.show() + +"""What stories can we tell with the correlation heatmap? Which indicators are most positively correlated? Which are most negatively correlated? + +Is gold a hedge against inflation, or is there another indicator which may be a better hedge? 
+ +""" \ No newline at end of file diff --git a/docs/notes/applied-stats/overview.qmd b/docs/notes/applied-stats/overview.qmd new file mode 100644 index 0000000..545f655 --- /dev/null +++ b/docs/notes/applied-stats/overview.qmd @@ -0,0 +1,5 @@ +# Applied Statistics + + + + [Statistical Tests](./Basic_Statistics_Overview.ipynb) + + [Correlation](Correlation.ipynb) diff --git a/docs/notes/fetching-data/apis.qmd b/docs/notes/fetching-data/apis.qmd index 59b8f89..0d79d98 100644 --- a/docs/notes/fetching-data/apis.qmd +++ b/docs/notes/fetching-data/apis.qmd @@ -1 +1,11 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + + # APIs diff --git a/docs/notes/fetching-data/csv-data.qmd b/docs/notes/fetching-data/csv-data.qmd index 96e0c35..e02348a 100644 --- a/docs/notes/fetching-data/csv-data.qmd +++ b/docs/notes/fetching-data/csv-data.qmd @@ -1,3 +1,13 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + + # Fetching CSV Data diff --git a/docs/notes/fetching-data/html-web-scraping.qmd b/docs/notes/fetching-data/html-web-scraping.qmd index d911bbe..8dab422 100644 --- a/docs/notes/fetching-data/html-web-scraping.qmd +++ b/docs/notes/fetching-data/html-web-scraping.qmd @@ -1,3 +1,13 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + + # Fetching HTML Data If the data you want to fetch is in XML or HTML format, we can use the `requests` package to fetch it, and the `beautifulsoup4` package to process it. diff --git a/docs/notes/fetching-data/json-data.qmd b/docs/notes/fetching-data/json-data.qmd index e0208e5..fb7b19c 100644 --- a/docs/notes/fetching-data/json-data.qmd +++ b/docs/notes/fetching-data/json-data.qmd @@ -1,3 +1,13 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + + # Fetching JSON Data If the data you want to fetch is in JSON format, we can use the [`requests` package](https://requests.readthedocs.io/en/latest/) to fetch and process it. diff --git a/docs/notes/pandas/dataframes.qmd b/docs/notes/pandas/dataframes.qmd index 5e80d02..0d7d787 100644 --- a/docs/notes/pandas/dataframes.qmd +++ b/docs/notes/pandas/dataframes.qmd @@ -1 +1,11 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + + # Dataframes diff --git a/docs/notes/pandas/grouping-pivoting.qmd b/docs/notes/pandas/grouping-pivoting.qmd new file mode 100644 index 0000000..abe5308 --- /dev/null +++ b/docs/notes/pandas/grouping-pivoting.qmd @@ -0,0 +1,176 @@ +--- +format: + html: + code-fold: false #show +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + + + +# Grouping and Pivoting + +In this sales dataset, you will see we have a row per date per product sold on that date. 
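Before loading the real dataset in the next cell, here is a tiny self-contained preview of the kind of group-based aggregation this page builds toward; the miniature frame and its values are made up purely for illustration.

```{python}
# Made-up miniature of the sales data, purely for illustration:
# one row per date per product, which we then total up per product.
from pandas import DataFrame

mini_df = DataFrame({
    "date": ["2024-03-01", "2024-03-01", "2024-03-02", "2024-03-02"],
    "product": ["Product A", "Product B", "Product A", "Product B"],
    "sales price": [65.00, 8.00, 130.00, 16.00],
})

mini_df.groupby("product")["sales price"].sum()
```

The real sales data below has the same shape, just with more rows, products, and columns.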
+ +```{python} +from pandas import read_csv, to_datetime + +sales_df = read_csv(f"https://raw.githubusercontent.com/prof-rossetti/python-for-finance/main/docs/data/monthly-sales.csv") +sales_df["date"] = to_datetime(sales_df["date"]) +sales_df.head() +``` + +```{python} +len(sales_df) +``` + +```{python} +products = sales_df["product"].unique() +print(products) +``` + +```{python} +days = sales_df["date"].unique() +print(len(days)) +print(days.min().date(), "...", days.max().date()) +``` + +We know we can calculate total sales using a `Series` aggregation: + + +```{python} +sales_df["sales price"].sum().round(2) +``` + +But how can we calculate the total sales for each product? + +## Grouping + +Enter the [`groupby` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html) for group-based aggregations. + +Calculating total sales per product, to identify the top selling products: + +```{python} +sales_by_product = sales_df.groupby("product")["sales price"].sum() +sales_by_product.sort_values(ascending=False) +``` + +Calculating total sales per day: + +```{python} +sales_by_date = sales_df.groupby("date")["sales price"].sum() +sales_by_date.sort_values(ascending=False).head() +``` + +Calculating total sales per day of week: + + +```{python} +sales_df["weekday"] = sales_df["date"].dt.strftime("%A") + +sales_by_weekday = sales_df.groupby("weekday")["sales price"].sum() +sales_by_weekday.sort_values(ascending=False) +``` + +## Pivot Tables + +We can alternatively use the [`pivot_table` function](https://pandas.pydata.org/docs/reference/api/pandas.pivot_table.html) to perform more fine-grained grouping and aggregation. + +Parameters of the `pivot_table` function: + + + The `index` parameter specifies the rows we want to wind up with (i.e. "row per what?"). + + The `values` parameter specifies what columns we would like to aggregate. + + The `aggfunc` parameter specifies how to aggregate those columns. We can pass our own aggregation function(s) or get them from the `numpy` package. We can aggregate different columns differently. + +Pivoting by date: + +```{python} +from pandas import pivot_table +#import numpy as np + +dates_pivot = pivot_table(sales_df, + index=["date"], + values=["sales price", "units sold"], + aggfunc={ + "sales price": "sum", #np.sum, + "units sold": "sum", #np.sum, + } # designate the agg function to be used for each original column. 
can use our own custom functions here as well +) + +dates_pivot.rename(columns={"sales price": "sales_total", "units sold": "units_sold"}, inplace=True) +dates_pivot.sort_values(by=["sales_total"], ascending=False, inplace=True) +dates_pivot.head() +``` + +Pivoting by product: + +```{python} +products_pivot = pivot_table(sales_df, + index=["product"], + values=["sales price", "units sold"], + aggfunc={ + "sales price": "sum", #np.sum, + "units sold": "sum", #np.sum, + } +) + +products_pivot.rename(columns={"sales price": "sales_total", "units sold": "units_sold"}, inplace=True) +products_pivot.sort_values(by=["sales_total"], ascending=False, inplace=True) +products_pivot.head() +``` + +Pivot by weekday: + +```{python} +weekdays_pivot = pivot_table(sales_df, + index=["weekday"], + values=["sales price", "units sold"], + aggfunc={ + "sales price": ["sum", "mean"], #np.mean, + #"units sold": ["sum", "mean"] #np.mean + } +) + +weekdays_pivot.columns = ["sales_avg", "sales_total"] +weekdays_pivot.sort_values(by=["sales_avg"], ascending=False, inplace=True) +weekdays_pivot +``` + +These pivot tables are now suitable for charting as well: + + + +```{python} +import plotly.express as px + +chart_df = dates_pivot.copy() +#chart_df["date"] = chart_df.index +chart_df.sort_values(by=["sales_total"], ascending=True, inplace=True) + +px.bar(chart_df, #x="date", + y="sales_total", + title="Sales by Day (March 2024)", height=350 +) +``` + +```{python} +chart_df = products_pivot.copy() +chart_df["product"] = chart_df.index +chart_df.sort_values(by=["sales_total"], ascending=True, inplace=True) + +px.bar(chart_df, y="product", x="sales_total", orientation="h", + title="Top Selling Products (March 2024)", height=350 +) +``` + + +```{python} +chart_df = weekdays_pivot.copy() +chart_df["weekday"] = chart_df.index +chart_df.sort_values(by=["sales_avg"], ascending=True, inplace=True) + +px.bar(chart_df, y="weekday", x="sales_avg", orientation="h", + title="Average Sales per Weekday (March 2024)", height=350 +) +``` diff --git a/docs/notes/pandas/joining-merging-simple.qmd b/docs/notes/pandas/joining-merging-simple.qmd new file mode 100644 index 0000000..bb7efc0 --- /dev/null +++ b/docs/notes/pandas/joining-merging-simple.qmd @@ -0,0 +1,20 @@ +--- +format: + html: + code-fold: false #show +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + +# Joining and Merging + + +![Join strategies: inner vs outer](../../images/joins-inner-outer.jpeg) + + +```{python} +#from pandas import read_csv, to_datetime + + +``` diff --git a/docs/notes/pandas/joining-merging.ipynb b/docs/notes/pandas/joining-merging.ipynb new file mode 100644 index 0000000..00970ea --- /dev/null +++ b/docs/notes/pandas/joining-merging.ipynb @@ -0,0 +1,5427 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Joining and Merging" + ], + "metadata": { + "id": "UwoohLt-OOEZ" + } + }, + { + "cell_type": "markdown", + "source": [ + "To provide a practical example of merging data.\n", + "\n", + "Let's grab lots of different economic and market indicators from the AlphaVantage API, and prepare each source dataset in such a way that allows it to be merged with the others later." 
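The join-strategies image added above contrasts inner and outer joins. As a minimal, self-contained sketch (not part of the original notebook), here is how those two strategies behave on two small made-up frames sharing a `timestamp` column, which is exactly how the indicator datasets below get combined (this notebook relies on the inner strategy).

```python
# Minimal sketch with made-up values: inner vs. outer merge on a shared "timestamp" column.
from pandas import DataFrame

prices = DataFrame({"timestamp": ["2024-01-01", "2024-02-01", "2024-03-01"],
                    "spy": [479.82, 504.86, 521.39]})
rates = DataFrame({"timestamp": ["2024-02-01", "2024-03-01", "2024-04-01"],
                   "fed": [5.33, 5.33, 5.33]})

inner = prices.merge(rates, on="timestamp", how="inner")  # only timestamps present in BOTH frames (2 rows)
outer = prices.merge(rates, on="timestamp", how="outer")  # timestamps from EITHER frame, with NaN gaps (4 rows)

print(inner)
print(outer)
```

An inner join keeps only the overlap, which is why the merged dataset later in this notebook is limited by its shortest source series.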
+ ], + "metadata": { + "id": "nVSEPFOG5dBV" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Fetching Data" + ], + "metadata": { + "id": "-1qhOMoizxGl" + } + }, + { + "cell_type": "markdown", + "source": [ + "We have obtained an [AlphaVantage API Key](https://www.alphavantage.co/support/#api-key) and set it as a notebook secret." + ], + "metadata": { + "id": "kLQRHrmE5cln" + } + }, + { + "cell_type": "code", + "source": [ + "from google.colab import userdata\n", + "\n", + "API_KEY = userdata.get(\"ALPHAVANTAGE_API_KEY\")" + ], + "metadata": { + "id": "qGRE6QNZzSg3" + }, + "execution_count": 111, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### Inflation\n", + "\n", + "https://www.alphavantage.co/documentation/#inflation" + ], + "metadata": { + "id": "qH4Sth_0znlj" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "request_url = f\"https://www.alphavantage.co/query?function=INFLATION&apikey={API_KEY}&datatype=csv\"\n", + "inflation = read_csv(request_url)\n", + "inflation.head()" + ], + "metadata": { + "id": "m_DQ2NxJzP9J", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "06fd7d4f-8265-4696-ed3d-0414a5373660" + }, + "execution_count": 112, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp value\n", + "0 2022-01-01 8.002800\n", + "1 2021-01-01 4.697859\n", + "2 2020-01-01 1.233584\n", + "3 2019-01-01 1.812210\n", + "4 2018-01-01 2.442583" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampvalue
02022-01-018.002800
12021-01-014.697859
22020-01-011.233584
32019-01-011.812210
42018-01-012.442583
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "inflation", + "summary": "{\n \"name\": \"inflation\",\n \"rows\": 63,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 63,\n \"samples\": [\n \"1961-01-01\",\n \"1965-01-01\",\n \"2022-01-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"value\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2.784628179867319,\n \"min\": -0.355546266299747,\n \"max\": 13.5492019749684,\n \"num_unique_values\": 63,\n \"samples\": [\n 1.07072414764724,\n 1.58516926383662,\n 8.00279982052121\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 112 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Here we have annual data. Doesn't look like the endpoint provides more frequent intervals. So let's not use this." + ], + "metadata": { + "id": "hjCQGfGi0-3D" + } + }, + { + "cell_type": "markdown", + "source": [ + "### Consumer Price Index (CPI)\n", + "\n", + "https://www.alphavantage.co/documentation/#cpi\n", + "\n", + "> CPI is widely regarded as the barometer of inflation levels in the broader economy." + ], + "metadata": { + "id": "3gvLl4JA0vn6" + } + }, + { + "cell_type": "markdown", + "source": [ + "The CPI endpoint does provide access to monthly data. So let's use CPI as our desired measure of inflation." + ], + "metadata": { + "id": "NtDr8fJk1Obb" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "request_url = f\"https://www.alphavantage.co/query?function=CPI&interval=monthly&apikey={API_KEY}&datatype=csv\"\n", + "cpi_df = read_csv(request_url)\n", + "cpi_df.rename(columns={\"value\": \"cpi\"}, inplace=True)\n", + "cpi_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "V6BF3bZn00Ma", + "outputId": "91e6be9e-fd55-42c7-ac71-5dc58a8d2ff3" + }, + "execution_count": 113, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp cpi\n", + "0 2024-05-01 314.069\n", + "1 2024-04-01 313.548\n", + "2 2024-03-01 312.332\n", + "3 2024-02-01 310.326\n", + "4 2024-01-01 308.417" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampcpi
02024-05-01314.069
12024-04-01313.548
22024-03-01312.332
32024-02-01310.326
42024-01-01308.417
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "cpi_df", + "summary": "{\n \"name\": \"cpi_df\",\n \"rows\": 1337,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 1337,\n \"samples\": [\n \"1949-06-01\",\n \"1935-10-01\",\n \"1919-10-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"cpi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 86.26235350272239,\n \"min\": 9.7,\n \"max\": 314.069,\n \"num_unique_values\": 832,\n \"samples\": [\n 39.4,\n 10.9,\n 164.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 113 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"EARLIEST:\", cpi_df.iloc[-1][\"timestamp\"])" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JmMWCB3o9Hom", + "outputId": "70bbe536-ccf9-465f-ce13-3e008236bfcc" + }, + "execution_count": 114, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "EARLIEST: 1913-01-01\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import plotly.express as px\n", + "\n", + "fig = px.line(cpi_df, x=\"timestamp\", y=\"cpi\", title=\"Consumer Price Index (CPI) by Month\", height=350)\n", + "fig.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "hC8vI04YJYAr", + "outputId": "2c96b1ae-76ae-4f9b-94bd-6b8c429fb791" + }, + "execution_count": 115, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Federal Funds Rate" + ], + "metadata": { + "id": "VV5QqjEC2Is9" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://www.alphavantage.co/documentation/#interest-rate" + ], + "metadata": { + "id": "V-R48zMH2LkB" + } + }, + { + "cell_type": "code", + "source": [ + "request_url = f\"https://www.alphavantage.co/query?function=FEDERAL_FUNDS_RATE&interval=monthly&apikey={API_KEY}&datatype=csv\"\n", + "fed_funds_df = read_csv(request_url)\n", + "fed_funds_df.rename(columns={\"value\": \"fed\"}, inplace=True)\n", + "fed_funds_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "YdDGFL-C2K5m", + "outputId": "6e444699-db73-4da0-e469-9b86bc08cc37" + }, + "execution_count": 116, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp fed\n", + "0 2024-05-01 5.33\n", + "1 2024-04-01 5.33\n", + "2 2024-03-01 5.33\n", + "3 2024-02-01 5.33\n", + "4 2024-01-01 5.33" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampfed
02024-05-015.33
12024-04-015.33
22024-03-015.33
32024-02-015.33
42024-01-015.33
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "fed_funds_df", + "summary": "{\n \"name\": \"fed_funds_df\",\n \"rows\": 839,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 839,\n \"samples\": [\n \"2006-10-01\",\n \"1955-08-01\",\n \"2017-11-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.582144050031423,\n \"min\": 0.05,\n \"max\": 19.1,\n \"num_unique_values\": 503,\n \"samples\": [\n 7.53,\n 0.22,\n 8.98\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 116 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"EARLIEST:\", fed_funds_df[\"timestamp\"].min())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "frTl4l789Rz3", + "outputId": "6ee6e806-8e22-44cf-fea0-b6793f1ee054" + }, + "execution_count": 117, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "EARLIEST: 1954-07-01\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import plotly.express as px\n", + "\n", + "px.line(fed_funds_df, x=\"timestamp\", y=\"fed\", title=\"Federal Funds Rate by Month\", height=350)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "iRgtofWuJP7T", + "outputId": "111273a3-3917-4eac-b153-9dc2099d4881" + }, + "execution_count": 118, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "px.histogram(fed_funds_df, x=\"fed\", #nbins=12,\n", + " title=\"Distribution of Federal Funds Rate (Monthly)\", height=350)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "q-BeWM85rgS8", + "outputId": "a3467b24-9897-4ae5-c532-28a87d9166fe" + }, + "execution_count": 119, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### The Market (S&P 500)\n", + "\n" + ], + "metadata": { + "id": "57_CbH3W1z9U" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "https://www.investopedia.com/articles/investing/122215/spy-spdr-sp-500-trust-etf.asp\n", + "\n", + "> The SPDR S&P 500 ETF Trust is one of the most popular funds. It aims to track the Standard & Poor's (S&P) 500 Index, which comprises 500 large-cap U.S. stocks. These stocks are selected by a committee based on market size, liquidity, and industry. The S&P 500 serves as one of the main benchmarks of the U.S. equity market and indicates the financial health and stability of the economy\n" + ], + "metadata": { + "id": "GVAngW-K4XNt" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://www.alphavantage.co/documentation/#monthlyadj\n", + "\n", + "We can use the \"SPY\" ETF as a measure of the market. Looks like the data only covers the past 20 years (see endpoint docs).\n" + ], + "metadata": { + "id": "nXs5iaHk2Gcx" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "request_url = f\"https://www.alphavantage.co/query?function=TIME_SERIES_MONTHLY_ADJUSTED&symbol=SPY&apikey={API_KEY}&datatype=csv\"\n", + "spy_df = read_csv(request_url)\n", + "spy_df.drop(columns=[\"open\", \"high\", \"low\", \"close\", \"volume\", \"dividend amount\"], inplace=True)\n", + "spy_df.rename(columns={\"adjusted close\": \"spy\"}, inplace=True)\n", + "spy_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "F8YZIgz42G81", + "outputId": "2448efc9-5fc1-417f-a388-f235dcc91ade" + }, + "execution_count": 120, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp spy\n", + "0 2024-06-27 546.3700\n", + "1 2024-05-31 525.6718\n", + "2 2024-04-30 500.3636\n", + "3 2024-03-28 521.3857\n", + "4 2024-02-29 504.8645" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampspy
02024-06-27546.3700
12024-05-31525.6718
22024-04-30500.3636
32024-03-28521.3857
42024-02-29504.8645
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "spy_df", + "summary": "{\n \"name\": \"spy_df\",\n \"rows\": 295,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 295,\n \"samples\": [\n \"2001-09-28\",\n \"2011-07-29\",\n \"2017-06-30\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 122.30327833059096,\n \"min\": 54.2445,\n \"max\": 546.37,\n \"num_unique_values\": 294,\n \"samples\": [\n 411.808,\n 423.0565,\n 111.6245\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 120 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"ROWS:\", len(spy_df))\n", + "print(\"EARLIEST:\", spy_df[\"timestamp\"].min())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "od5B5qyY9jqB", + "outputId": "2d954b8c-5c03-452a-c8c4-c0478ae00a47" + }, + "execution_count": 121, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ROWS: 295\n", + "EARLIEST: 1999-12-31\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# standardizing the timestamp values so we can merge on them later\n", + "# we have to decide to treat \"2023-05-31\" as \"2023-05\" or \"2023-06\"\n", + "# since we see the latest value represents the current incompleted month,\n", + "# let's \"round down\" the monthly values\n", + "\n", + "from pandas import to_datetime\n", + "\n", + "spy_df[\"timestamp\"] = to_datetime(spy_df[\"timestamp\"]).dt.strftime(\"%Y-%m-01\")\n", + "spy_df.head()" + ], + "metadata": { + "id": "bXLfF147AwNX", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "outputId": "f188c418-6cad-4e53-8de0-ec2839fc2d74" + }, + "execution_count": 122, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp spy\n", + "0 2024-06-01 546.3700\n", + "1 2024-05-01 525.6718\n", + "2 2024-04-01 500.3636\n", + "3 2024-03-01 521.3857\n", + "4 2024-02-01 504.8645" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampspy
02024-06-01546.3700
12024-05-01525.6718
22024-04-01500.3636
32024-03-01521.3857
42024-02-01504.8645
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "spy_df", + "summary": "{\n \"name\": \"spy_df\",\n \"rows\": 295,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 295,\n \"samples\": [\n \"2001-09-01\",\n \"2011-07-01\",\n \"2017-06-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 122.30327833059096,\n \"min\": 54.2445,\n \"max\": 546.37,\n \"num_unique_values\": 294,\n \"samples\": [\n 411.808,\n 423.0565,\n 111.6245\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 122 + } + ] + }, + { + "cell_type": "code", + "source": [ + "from pandas import to_datetime\n", + "\n", + "# packaging up this code into a reusable function becuse we'll have to perform this same operation on multiple datasets (see cells below related to gold and bitcoin)\n", + "\n", + "def round_down_monthly_timestamp(original_df):\n", + " \"\"\" Param original_df: pandas DataFrame that has a \"timestamp\" column of values representing each month\"\"\"\n", + " # standardizing the timestamp values so we can merge on them later\n", + " # we have to decide to treat \"2023-05-31\" as \"2023-05-01\" or \"2023-06-01\"\n", + " # since we see the latest value represents the current incompleted month,\n", + " # let's \"round down\" the monthly values\n", + " # see: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior\n", + " original_df[\"timestamp\"] = to_datetime(original_df[\"timestamp\"]).dt.strftime(\"%Y-%m-01\")\n", + "\n", + "\n", + "round_down_monthly_timestamp(spy_df)\n", + "spy_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "bKRmUu_RkVvO", + "outputId": "118c5cc1-da25-4067-a58f-05a8b1ac2ea8" + }, + "execution_count": 123, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp spy\n", + "0 2024-06-01 546.3700\n", + "1 2024-05-01 525.6718\n", + "2 2024-04-01 500.3636\n", + "3 2024-03-01 521.3857\n", + "4 2024-02-01 504.8645" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampspy
02024-06-01546.3700
12024-05-01525.6718
22024-04-01500.3636
32024-03-01521.3857
42024-02-01504.8645
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "spy_df", + "summary": "{\n \"name\": \"spy_df\",\n \"rows\": 295,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 295,\n \"samples\": [\n \"2001-09-01\",\n \"2011-07-01\",\n \"2017-06-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 122.30327833059096,\n \"min\": 54.2445,\n \"max\": 546.37,\n \"num_unique_values\": 294,\n \"samples\": [\n 411.808,\n 423.0565,\n 111.6245\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 123 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import plotly.express as px\n", + "\n", + "px.line(spy_df, x=\"timestamp\", y=\"spy\", title=\"S&P 500 (SPY ETF) Prices by Month\", height=350)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "HVyqxmvDJpdK", + "outputId": "949a558a-848a-4ad7-95cf-0547c61ef266" + }, + "execution_count": 124, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Gold" + ], + "metadata": { + "id": "qO_H01SA0EHf" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://www.investopedia.com/articles/investing/122515/gld-ishares-gold-trust-etf.asp\n", + "\n", + "> The SPDR Gold Shares ETF (GLD) tracks the price of gold bullion in the over-the-counter (OTC) market." + ], + "metadata": { + "id": "_XvoSUtC7hvM" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://money.usnews.com/investing/funds/slideshows/best-gold-etfs-to-hedge-volatility\n", + "\n", + "> The largest gold exchange-traded fund, or ETF, by a wide margin is the SPDR Gold Trust... And as the fund is benchmarked to physical gold, you can get a direct play on gold bullion prices via this ETF." + ], + "metadata": { + "id": "AyxiLblf7oht" + } + }, + { + "cell_type": "markdown", + "source": [ + "OK we can perhaps use the \"GLD\" index fund as a measure of gold prices." + ], + "metadata": { + "id": "lyHbTHOx77Id" + } + }, + { + "cell_type": "code", + "source": [ + "from pandas import read_csv\n", + "\n", + "request_url = f\"https://www.alphavantage.co/query?function=TIME_SERIES_MONTHLY_ADJUSTED&symbol=GLD&apikey={API_KEY}&datatype=csv\"\n", + "gld_df = read_csv(request_url)\n", + "gld_df.drop(columns=[\"open\", \"high\", \"low\", \"close\", \"volume\", \"dividend amount\"], inplace=True)\n", + "gld_df.rename(columns={\"adjusted close\": \"gld\"}, inplace=True)\n", + "gld_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "DuOEaMPl0Pvl", + "outputId": "79cb6a4b-6471-4967-e807-37a3f41b1d40" + }, + "execution_count": 125, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp gld\n", + "0 2024-06-27 214.99\n", + "1 2024-05-31 215.30\n", + "2 2024-04-30 211.87\n", + "3 2024-03-28 205.72\n", + "4 2024-02-29 189.31" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampgld
02024-06-27214.99
12024-05-31215.30
22024-04-30211.87
32024-03-28205.72
42024-02-29189.31
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "gld_df", + "summary": "{\n \"name\": \"gld_df\",\n \"rows\": 235,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 235,\n \"samples\": [\n \"2018-09-28\",\n \"2007-03-30\",\n \"2009-06-30\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40.51792589095711,\n \"min\": 41.65,\n \"max\": 215.3,\n \"num_unique_values\": 230,\n \"samples\": [\n 56.7,\n 115.54,\n 171.45\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 125 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"ROWS:\", len(gld_df))\n", + "print(\"EARLIEST:\", gld_df[\"timestamp\"].min())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WWemFO55-MOc", + "outputId": "ef3b7c4d-8327-4498-8830-09bcd22b3370" + }, + "execution_count": 126, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ROWS: 235\n", + "EARLIEST: 2004-12-31\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "round_down_monthly_timestamp(gld_df)\n", + "gld_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "xbU4M_xPBisQ", + "outputId": "63de0693-6f15-42b9-f837-e9e2d2dc42c7" + }, + "execution_count": 127, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp gld\n", + "0 2024-06-01 214.99\n", + "1 2024-05-01 215.30\n", + "2 2024-04-01 211.87\n", + "3 2024-03-01 205.72\n", + "4 2024-02-01 189.31" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampgld
02024-06-01214.99
12024-05-01215.30
22024-04-01211.87
32024-03-01205.72
42024-02-01189.31
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "gld_df", + "summary": "{\n \"name\": \"gld_df\",\n \"rows\": 235,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 235,\n \"samples\": [\n \"2018-09-01\",\n \"2007-03-01\",\n \"2009-06-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40.51792589095711,\n \"min\": 41.65,\n \"max\": 215.3,\n \"num_unique_values\": 230,\n \"samples\": [\n 56.7,\n 115.54,\n 171.45\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 127 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import plotly.express as px\n", + "\n", + "px.line(gld_df, x=\"timestamp\", y=\"gld\", title=\"Gold (GLD ETF) Prices by Month\", height=350)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "Jxw9hbYhJ6pi", + "outputId": "c360dc1d-5300-482e-fbdd-3663928a4810" + }, + "execution_count": 128, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Bitcoin" + ], + "metadata": { + "id": "c2ucbZIK0Fdv" + } + }, + { + "cell_type": "markdown", + "source": [ + "https://www.alphavantage.co/documentation/#currency-monthly\n", + "\n" + ], + "metadata": { + "id": "67hFzuRY8dU0" + } + }, + { + "cell_type": "markdown", + "source": [ + "The earliest Bitcoin data we have is from 2020." + ], + "metadata": { + "id": "X3x_k-2A-XDj" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "fH_l-t-PLx0A" + }, + "execution_count": 128, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "request_url = f\"https://www.alphavantage.co/query?function=DIGITAL_CURRENCY_MONTHLY&symbol=BTC&market=USD&apikey={API_KEY}&datatype=csv\"\n", + "btc_df = read_csv(request_url)\n", + "btc_df = btc_df[[\"timestamp\", \"close\"]]\n", + "btc_df.rename(columns={\"close\": \"btc\"}, inplace=True)\n", + "print(len(btc_df))\n", + "btc_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 223 + }, + "id": "bbXjAcTPzP6C", + "outputId": "089be34e-a246-4464-a664-0c2294a6ab1e" + }, + "execution_count": 129, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "9\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp btc\n", + "0 2024-06-28 61499.77\n", + "1 2024-05-31 67472.41\n", + "2 2024-04-30 60622.10\n", + "3 2024-03-31 71291.28\n", + "4 2024-02-29 61179.03" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampbtc
02024-06-2861499.77
12024-05-3167472.41
22024-04-3060622.10
32024-03-3171291.28
42024-02-2961179.03
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "btc_df", + "summary": "{\n \"name\": \"btc_df\",\n \"rows\": 9,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"2023-11-30\",\n \"2024-05-31\",\n \"2024-01-31\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"btc\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13848.5457879553,\n \"min\": 34656.4,\n \"max\": 71291.28,\n \"num_unique_values\": 9,\n \"samples\": [\n 37732.27,\n 67472.41,\n 42548.08\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 129 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"ROWS:\", len(btc_df))\n", + "print(\"EARLIEST:\", btc_df[\"timestamp\"].min())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "30a5xM2hzP3E", + "outputId": "fba28d09-897d-40dd-fba7-6a265a554421" + }, + "execution_count": 130, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "ROWS: 9\n", + "EARLIEST: 2023-10-31\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "round_down_monthly_timestamp(btc_df)\n", + "btc_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "Zz2lWprDBmvs", + "outputId": "ea424641-bea5-4552-fee2-140465ca9d36" + }, + "execution_count": 131, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp btc\n", + "0 2024-06-01 61499.77\n", + "1 2024-05-01 67472.41\n", + "2 2024-04-01 60622.10\n", + "3 2024-03-01 71291.28\n", + "4 2024-02-01 61179.03" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampbtc
02024-06-0161499.77
12024-05-0167472.41
22024-04-0160622.10
32024-03-0171291.28
42024-02-0161179.03
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "btc_df", + "summary": "{\n \"name\": \"btc_df\",\n \"rows\": 9,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 9,\n \"samples\": [\n \"2023-11-01\",\n \"2024-05-01\",\n \"2024-01-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"btc\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 13848.5457879553,\n \"min\": 34656.4,\n \"max\": 71291.28,\n \"num_unique_values\": 9,\n \"samples\": [\n 37732.27,\n 67472.41,\n 42548.08\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 131 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import plotly.express as px\n", + "\n", + "px.line(btc_df, x=\"timestamp\", y=\"btc\", title=\"Bitcoin Prices by Month, in USD\", height=350)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 367 + }, + "id": "5Yc-4r5kKG3z", + "outputId": "44739170-c8d2-477c-c99f-a9359ce2d6b7" + }, + "execution_count": 132, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "
\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Merging Data" + ], + "metadata": { + "id": "xVRW7oGx2oQE" + } + }, + { + "cell_type": "markdown", + "source": [ + "Let's merge all datasets together, on the basis of their common date values (i.e. the \"timestamp\" column common across all datasets)." + ], + "metadata": { + "id": "fI1z7vu92qvW" + } + }, + { + "cell_type": "code", + "source": [ + "print(cpi_df.columns.tolist())\n", + "print(fed_funds_df.columns.tolist())\n", + "print(spy_df.columns.tolist())\n", + "print(gld_df.columns.tolist())\n", + "#print(btc_df.columns.tolist())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8yd2rccJ_fgm", + "outputId": "a0a007fd-4ac8-433b-8a04-84bba31f9773" + }, + "execution_count": 133, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['timestamp', 'cpi']\n", + "['timestamp', 'fed']\n", + "['timestamp', 'spy']\n", + "['timestamp', 'gld']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "cpi_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "iWeyVb85RtmI", + "outputId": "2783a278-c150-42ed-d7b2-c6fb8eb4eb5c" + }, + "execution_count": 134, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp cpi\n", + "0 2024-05-01 314.069\n", + "1 2024-04-01 313.548\n", + "2 2024-03-01 312.332\n", + "3 2024-02-01 310.326\n", + "4 2024-01-01 308.417" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampcpi
02024-05-01314.069
12024-04-01313.548
22024-03-01312.332
32024-02-01310.326
42024-01-01308.417
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "cpi_df", + "summary": "{\n \"name\": \"cpi_df\",\n \"rows\": 1337,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 1337,\n \"samples\": [\n \"1949-06-01\",\n \"1935-10-01\",\n \"1919-10-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"cpi\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 86.26235350272239,\n \"min\": 9.7,\n \"max\": 314.069,\n \"num_unique_values\": 832,\n \"samples\": [\n 39.4,\n 10.9,\n 164.5\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 134 + } + ] + }, + { + "cell_type": "code", + "source": [ + "fed_funds_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "1idhDhPeRwF3", + "outputId": "6fc476d3-1d4a-4fcf-f847-8c6b1cb04fa6" + }, + "execution_count": 135, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp fed\n", + "0 2024-05-01 5.33\n", + "1 2024-04-01 5.33\n", + "2 2024-03-01 5.33\n", + "3 2024-02-01 5.33\n", + "4 2024-01-01 5.33" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampfed
02024-05-015.33
12024-04-015.33
22024-03-015.33
32024-02-015.33
42024-01-015.33
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "fed_funds_df", + "summary": "{\n \"name\": \"fed_funds_df\",\n \"rows\": 839,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 839,\n \"samples\": [\n \"2006-10-01\",\n \"1955-08-01\",\n \"2017-11-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"fed\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 3.582144050031423,\n \"min\": 0.05,\n \"max\": 19.1,\n \"num_unique_values\": 503,\n \"samples\": [\n 7.53,\n 0.22,\n 8.98\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 135 + } + ] + }, + { + "cell_type": "code", + "source": [ + "spy_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "c0jxYtLuRxqU", + "outputId": "b82e8b48-0d93-4041-b7d5-20a48fc60613" + }, + "execution_count": 136, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp spy\n", + "0 2024-06-01 546.3700\n", + "1 2024-05-01 525.6718\n", + "2 2024-04-01 500.3636\n", + "3 2024-03-01 521.3857\n", + "4 2024-02-01 504.8645" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "spy_df", + "summary": "{\n \"name\": \"spy_df\",\n \"rows\": 295,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 295,\n \"samples\": [\n \"2001-09-01\",\n \"2011-07-01\",\n \"2017-06-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"spy\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 122.30327833059096,\n \"min\": 54.2445,\n \"max\": 546.37,\n \"num_unique_values\": 294,\n \"samples\": [\n 411.808,\n 423.0565,\n 111.6245\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 136 + } + ] + }, + { + "cell_type": "code", + "source": [ + "gld_df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "FCFnKeG5RyyS", + "outputId": "f91988e4-49df-4cd3-84dc-da1eb2f15b4c" + }, + "execution_count": 137, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp gld\n", + "0 2024-06-01 214.99\n", + "1 2024-05-01 215.30\n", + "2 2024-04-01 211.87\n", + "3 2024-03-01 205.72\n", + "4 2024-02-01 189.31" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "variable_name": "gld_df", + "summary": "{\n \"name\": \"gld_df\",\n \"rows\": 235,\n \"fields\": [\n {\n \"column\": \"timestamp\",\n \"properties\": {\n \"dtype\": \"object\",\n \"num_unique_values\": 235,\n \"samples\": [\n \"2018-09-01\",\n \"2007-03-01\",\n \"2009-06-01\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"gld\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 40.51792589095711,\n \"min\": 41.65,\n \"max\": 215.3,\n \"num_unique_values\": 230,\n \"samples\": [\n 56.7,\n 115.54,\n 171.45\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" + } + }, + "metadata": {}, + "execution_count": 137 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html\n", + "\n", + "\n", + "We use a precise merge on matching timestamp values, instead of making assumptions about the row frequency and order of source datasets. In this way, the merge operation is similar to a `VLOOKUP` operation in spreadsheet software.\n", + "\n", + "You'll notice we have been renaming the columns in the source datasets, and ensuring their \"timestamp\" values are represented in a standardized way (i.e. all at the beginning of month), to facilitate a clean merge on these common values.\n", + "\n", + "We do an \"inner\" join strategy, to only keep the rows that have matching timestamp values across all datasets.\n", + "\n", + "So the resulting merged dataset starts in 2020, because that is the earliest data available across ALL datasets (as constrained by the Bitcoin dataset). For analyses that only involve two of these indicators, it may be worth it to create a separate merged dataset from only those two (not including Bitcoin), to obtain as much historical context as possible." + ], + "metadata": { + "id": "hZTX0_LuAnTK" + } + }, + { + "cell_type": "code", + "source": [ + "df = cpi_df.merge(fed_funds_df, on=\"timestamp\", how=\"inner\")\n", + "df = df.merge(spy_df, on=\"timestamp\", how=\"inner\")\n", + "df = df.merge(gld_df, on=\"timestamp\", how=\"inner\")\n", + "#df = df.merge(btc_df, on=\"timestamp\", how=\"inner\")\n", + "df.index = df[\"timestamp\"]\n", + "df.tail()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 237 + }, + "id": "2qHCKXYozPvG", + "outputId": "e02a64ff-542d-41b8-8b60-53a9cffa334f" + }, + "execution_count": 138, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " timestamp cpi fed spy gld\n", + "timestamp \n", + "2005-04-01 2005-04-01 194.6 2.79 80.2135 43.35\n", + "2005-03-01 2005-03-01 193.3 2.63 81.7450 42.82\n", + "2005-02-01 2005-02-01 191.8 2.50 83.2672 43.52\n", + "2005-01-01 2005-01-01 190.7 2.28 81.5622 42.22\n", + "2004-12-01 2004-12-01 190.3 2.16 83.4328 43.80" + ], + "text/html": [ + "\n", + "
\n" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe", + "repr_error": "cannot insert timestamp, already exists" + } + }, + "metadata": {}, + "execution_count": 138 + } + ] + }, + { + "cell_type": "code", + "source": [ + "len(df)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cqXakEhfPVRJ", + "outputId": "7df3268a-a5bf-43c5-811d-9b1cb0104fdc" + }, + "execution_count": 139, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "234" + ] + }, + "metadata": {}, + "execution_count": 139 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Now that we have a single merged dataset, let's save it for later, so we can use it to illustrate some basic statistical concepts and techniques.\n" + ], + "metadata": { + "id": "1Bxkz3VpSuDM" + } + }, + { + "cell_type": "code", + "source": [ + "df.to_csv(\"monthly-indicators.csv\", index=False)" + ], + "metadata": { + "id": "ZPHGKI60OckC" + }, + "execution_count": 140, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/docs/notes/pandas/joining_and_merging.py b/docs/notes/pandas/joining_and_merging.py new file mode 100644 index 0000000..f7ced3c --- /dev/null +++ b/docs/notes/pandas/joining_and_merging.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +"""Joining and Merging + +Automatically generated by Colab. + +Original file is located at + https://colab.research.google.com/drive/1aomJTh-fdQYiEogLNtgmfFNuXsZR2D17 + +# Joining and Merging + +To provide a practical example of merging data. + +Let's grab lots of different economic and market indicators from the AlphaVantage API, and prepare each source dataset in such a way that allows it to be merged with the others later. + +## Fetching Data + +We have obtained an [AlphaVantage API Key](https://www.alphavantage.co/support/#api-key) and set it as a notebook secret. +""" + +from google.colab import userdata + +API_KEY = userdata.get("ALPHAVANTAGE_API_KEY") + +"""### Inflation + +https://www.alphavantage.co/documentation/#inflation +""" + +from pandas import read_csv + +request_url = f"https://www.alphavantage.co/query?function=INFLATION&apikey={API_KEY}&datatype=csv" +inflation = read_csv(request_url) +inflation.head() + +"""Here we have annual data. Doesn't look like the endpoint provides more frequent intervals. So let's not use this. + +### Consumer Price Index (CPI) + +https://www.alphavantage.co/documentation/#cpi + +> CPI is widely regarded as the barometer of inflation levels in the broader economy. + +The CPI endpoint does provide access to monthly data. So let's use CPI as our desired measure of inflation. 
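If we ever need an explicit inflation rate rather than the CPI level itself, one can be derived from the monthly CPI data. The sketch below is illustrative only: it assumes the `cpi_df` loaded in the next cell, sorts it oldest-first, and compares each month against the same month one year earlier.

```python
# illustrative sketch: derive a year-over-year inflation rate from monthly CPI levels
# (assumes the cpi_df loaded in the next cell, with "timestamp" and "cpi" columns)
yoy_df = cpi_df.sort_values(by=["timestamp"], ascending=True).copy()
yoy_df["inflation_yoy"] = yoy_df["cpi"].pct_change(periods=12) * 100  # % change vs. 12 months prior
yoy_df[["timestamp", "cpi", "inflation_yoy"]].tail()
```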
+""" + +from pandas import read_csv + +request_url = f"https://www.alphavantage.co/query?function=CPI&interval=monthly&apikey={API_KEY}&datatype=csv" +cpi_df = read_csv(request_url) +cpi_df.rename(columns={"value": "cpi"}, inplace=True) +cpi_df.head() + +print("EARLIEST:", cpi_df.iloc[-1]["timestamp"]) + +import plotly.express as px + +fig = px.line(cpi_df, x="timestamp", y="cpi", title="Consumer Price Index (CPI) by Month", height=350) +fig.show() + +"""### Federal Funds Rate + +https://www.alphavantage.co/documentation/#interest-rate +""" + +request_url = f"https://www.alphavantage.co/query?function=FEDERAL_FUNDS_RATE&interval=monthly&apikey={API_KEY}&datatype=csv" +fed_funds_df = read_csv(request_url) +fed_funds_df.rename(columns={"value": "fed"}, inplace=True) +fed_funds_df.head() + +print("EARLIEST:", fed_funds_df["timestamp"].min()) + +import plotly.express as px + +px.line(fed_funds_df, x="timestamp", y="fed", title="Federal Funds Rate by Month", height=350) + +px.histogram(fed_funds_df, x="fed", #nbins=12, + title="Distribution of Federal Funds Rate (Monthly)", height=350) + +"""### The Market (S&P 500) + +https://www.investopedia.com/articles/investing/122215/spy-spdr-sp-500-trust-etf.asp + +> The SPDR S&P 500 ETF Trust is one of the most popular funds. It aims to track the Standard & Poor's (S&P) 500 Index, which comprises 500 large-cap U.S. stocks. These stocks are selected by a committee based on market size, liquidity, and industry. The S&P 500 serves as one of the main benchmarks of the U.S. equity market and indicates the financial health and stability of the economy + +https://www.alphavantage.co/documentation/#monthlyadj + +We can use the "SPY" ETF as a measure of the market. Looks like the data only covers the past 20 years (see endpoint docs). 
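The SPY request below has the same shape as the gold (GLD) request further down, so if this grows to more ETFs, a small helper could remove the repetition. This is only a sketch: the function name `fetch_etf_monthly` is hypothetical, but the endpoint, parameters, and column handling mirror the cells below.

```python
# hypothetical helper (name is illustrative): fetch monthly adjusted closes for any ETF symbol,
# mirroring the SPY and GLD cells below
from pandas import read_csv

def fetch_etf_monthly(symbol, colname):
    request_url = (
        "https://www.alphavantage.co/query"
        f"?function=TIME_SERIES_MONTHLY_ADJUSTED&symbol={symbol}&apikey={API_KEY}&datatype=csv"
    )
    etf_df = read_csv(request_url)
    etf_df = etf_df[["timestamp", "adjusted close"]]
    etf_df.rename(columns={"adjusted close": colname}, inplace=True)
    return etf_df

# example usage: spy_df = fetch_etf_monthly("SPY", "spy")
```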
+""" + +from pandas import read_csv + +request_url = f"https://www.alphavantage.co/query?function=TIME_SERIES_MONTHLY_ADJUSTED&symbol=SPY&apikey={API_KEY}&datatype=csv" +spy_df = read_csv(request_url) +spy_df.drop(columns=["open", "high", "low", "close", "volume", "dividend amount"], inplace=True) +spy_df.rename(columns={"adjusted close": "spy"}, inplace=True) +spy_df.head() + +print("ROWS:", len(spy_df)) +print("EARLIEST:", spy_df["timestamp"].min()) + +# standardizing the timestamp values so we can merge on them later +# we have to decide to treat "2023-05-31" as "2023-05" or "2023-06" +# since we see the latest value represents the current incompleted month, +# let's "round down" the monthly values + +from pandas import to_datetime + +spy_df["timestamp"] = to_datetime(spy_df["timestamp"]).dt.strftime("%Y-%m-01") +spy_df.head() + +from pandas import to_datetime + +# packaging up this code into a reusable function becuse we'll have to perform this same operation on multiple datasets (see cells below related to gold and bitcoin) + +def round_down_monthly_timestamp(original_df): + """ Param original_df: pandas DataFrame that has a "timestamp" column of values representing each month""" + # standardizing the timestamp values so we can merge on them later + # we have to decide to treat "2023-05-31" as "2023-05-01" or "2023-06-01" + # since we see the latest value represents the current incompleted month, + # let's "round down" the monthly values + # see: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior + original_df["timestamp"] = to_datetime(original_df["timestamp"]).dt.strftime("%Y-%m-01") + + +round_down_monthly_timestamp(spy_df) +spy_df.head() + +import plotly.express as px + +px.line(spy_df, x="timestamp", y="spy", title="S&P 500 (SPY ETF) Prices by Month", height=350) + +"""### Gold + +https://www.investopedia.com/articles/investing/122515/gld-ishares-gold-trust-etf.asp + +> The SPDR Gold Shares ETF (GLD) tracks the price of gold bullion in the over-the-counter (OTC) market. + +https://money.usnews.com/investing/funds/slideshows/best-gold-etfs-to-hedge-volatility + +> The largest gold exchange-traded fund, or ETF, by a wide margin is the SPDR Gold Trust... And as the fund is benchmarked to physical gold, you can get a direct play on gold bullion prices via this ETF. + +OK we can perhaps use the "GLD" index fund as a measure of gold prices. +""" + +from pandas import read_csv + +request_url = f"https://www.alphavantage.co/query?function=TIME_SERIES_MONTHLY_ADJUSTED&symbol=GLD&apikey={API_KEY}&datatype=csv" +gld_df = read_csv(request_url) +gld_df.drop(columns=["open", "high", "low", "close", "volume", "dividend amount"], inplace=True) +gld_df.rename(columns={"adjusted close": "gld"}, inplace=True) +gld_df.head() + +print("ROWS:", len(gld_df)) +print("EARLIEST:", gld_df["timestamp"].min()) + +round_down_monthly_timestamp(gld_df) +gld_df.head() + +import plotly.express as px + +px.line(gld_df, x="timestamp", y="gld", title="Gold (GLD ETF) Prices by Month", height=350) + +"""### Bitcoin + +https://www.alphavantage.co/documentation/#currency-monthly + +The earliest Bitcoin data we have is from 2020. 
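Because Bitcoin's shorter history constrains an inner merge, it can help to compare how much history each dataset offers before deciding whether to include it. A quick sketch (assuming the dataframes loaded in this notebook):

```python
# sketch: compare available history across the source datasets before merging
# (assumes cpi_df, fed_funds_df, spy_df, gld_df, and btc_df have been loaded above)
datasets = {"cpi": cpi_df, "fed": fed_funds_df, "spy": spy_df, "gld": gld_df, "btc": btc_df}
for name, frame in datasets.items():
    print(name, "| earliest:", frame["timestamp"].min(), "| rows:", len(frame))
```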
+""" + + + +request_url = f"https://www.alphavantage.co/query?function=DIGITAL_CURRENCY_MONTHLY&symbol=BTC&market=USD&apikey={API_KEY}&datatype=csv" +btc_df = read_csv(request_url) +btc_df = btc_df[["timestamp", "close"]] +btc_df.rename(columns={"close": "btc"}, inplace=True) +print(len(btc_df)) +btc_df.head() + +print("ROWS:", len(btc_df)) +print("EARLIEST:", btc_df["timestamp"].min()) + +round_down_monthly_timestamp(btc_df) +btc_df.head() + +import plotly.express as px + +px.line(btc_df, x="timestamp", y="btc", title="Bitcoin Prices by Month, in USD", height=350) + +"""## Merging Data + +Let's merge all datasets together, on the basis of their common date values (i.e. the "timestamp" column common across all datasets). +""" + +print(cpi_df.columns.tolist()) +print(fed_funds_df.columns.tolist()) +print(spy_df.columns.tolist()) +print(gld_df.columns.tolist()) +#print(btc_df.columns.tolist()) + +cpi_df.head() + +fed_funds_df.head() + +spy_df.head() + +gld_df.head() + +"""https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html + + +We use a precise merge on matching timestamp values, instead of making assumptions about the row frequency and order of source datasets. In this way, the merge operation is similar to a `VLOOKUP` operation in spreadsheet software. + +You'll notice we have been renaming the columns in the source datasets, and ensuring their "timestamp" values are represented in a standardized way (i.e. all at the beginning of month), to facilitate a clean merge on these common values. + +We do an "inner" join strategy, to only keep the rows that have matching timestamp values across all datasets. + +So the resulting merged dataset starts in 2020, because that is the earliest data available across ALL datasets (as constrained by the Bitcoin dataset). For analyses that only involve two of these indicators, it may be worth it to create a separate merged dataset from only those two (not including Bitcoin), to obtain as much historical context as possible. +""" + +df = cpi_df.merge(fed_funds_df, on="timestamp", how="inner") +df = df.merge(spy_df, on="timestamp", how="inner") +df = df.merge(gld_df, on="timestamp", how="inner") +#df = df.merge(btc_df, on="timestamp", how="inner") +df.index = df["timestamp"] +df.tail() + +len(df) + +"""Now that we have a single merged dataset, let's save it for later, so we can use it to illustrate some basic statistical concepts and techniques. + +""" + +df.to_csv("monthly-indicators.csv", index=False) \ No newline at end of file diff --git a/docs/notes/pandas/moving-averages.qmd b/docs/notes/pandas/moving-averages.qmd new file mode 100644 index 0000000..5d5416a --- /dev/null +++ b/docs/notes/pandas/moving-averages.qmd @@ -0,0 +1,80 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + + +# Moving Averages + +We can leverage the concept of [shift-based methods](./shift-methods.qmd) to calculate our own trends and moving averages. 
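Before implementing them, it is worth recalling what each one computes. A rolling window average is the simple mean of the most recent $N$ observations. An exponentially weighted moving average instead gives more weight to recent observations; with `adjust=False` (as used below), pandas applies the recurrence

$$
\text{EMA}_t = \alpha \, x_t + (1 - \alpha)\, \text{EMA}_{t-1},
\qquad \alpha = \frac{2}{\text{span} + 1}
$$

where $x_t$ is the current price and `span` is the parameter we pass to the `ewm` method.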
+ +We will implement two different kinds of moving average: + + + Rolling Window Averages, using the [`rolling` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html) + + Exponential Weighted Moving Averages, using the [`ewm` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html) + +Let's consider the following time series dataset of stock prices: + +```{python} +from pandas import read_csv + +request_url = "https://raw.githubusercontent.com/prof-rossetti/python-for-finance/main/docs/data/daily_adjusted_NFLX.csv" +prices_df = read_csv(request_url) +prices_df.head() +``` + +Sort in preparation to perform shift-based methods: + +```{python} +prices_df.sort_values(by=["timestamp"], inplace=True) # +prices_df.head() +``` + + +```{python} + + +prices_df["ma_50"] = prices_df["adjusted_close"].rolling(window=50).mean() + +prices_df["ema_50"] = prices_df["adjusted_close"].ewm(span=50, min_periods=0, adjust=False, ignore_na=False).mean() + +prices_df[["timestamp", "adjusted_close", "ma_50", "ema_50"]] +``` + +```{python} +import plotly.express as px + +px.line(prices_df, x="timestamp", y=["close", "ma_50", "ema_50"], + title=f"Adjusted Closing Prices", + color_discrete_map={ + "close": "royalblue", + "ma_50": "orange", + "ema_50":"yellow" + } +) +``` + +You'll notice there are no values for the first N number of periods in our rolling window average (where N is the size of the window). This is because there aren't enough values to complete the average. It's OK! + +If you would like to change this behavior to be less methodologically strict, we can apply the `min_periods` parameter, setting minimum number of periods to zero, in which case as many values will be used until we get to fifty, at which point the true rolling average takes over: + + +```{python} +# SETTING MIN PERIODS = ZERO: +prices_df["ma_50_min_0"] = prices_df["adjusted_close"].rolling(window=50, min_periods=0).mean() + +px.line(prices_df, x="timestamp", + y=["close", "ma_50_min_0", "ma_50", "ema_50"], + title=f"Adjusted Closing Prices", + color_discrete_map={ + "close": "royalblue", + "ma_50_min_0": "pink", + "ma_50": "orange", + "ema_50":"yellow" + } +) +``` diff --git a/docs/notes/pandas/overview.qmd b/docs/notes/pandas/overview.qmd index 8c5fb58..58dbeeb 100644 --- a/docs/notes/pandas/overview.qmd +++ b/docs/notes/pandas/overview.qmd @@ -1 +1,9 @@ # Processing Tabular Data with `pandas` + +**Pandas Package Overview for Data Science (Mega Notebook)** + +The [`pandas` package](https://pypi.org/project/pandas/) makes it easy to work with CSV formatted (i.e. \"tabular\") data, by providing us with two new datatypes, called the `DataFrame` and the `Series`. + +The pandas `DataFrame` datatype represents tabular data (with rows and columns). Whereas the pandas `Series` datatype represents a single row or single column in the dataset. + +Let's explore some common and practical ways of working with these objects. 
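As a minimal sketch (using made-up prices) of how these two datatypes relate: constructing a `DataFrame` from a list of dictionaries gives us the whole table, and selecting one of its columns gives us back a `Series`.

```python
from pandas import DataFrame

# a DataFrame represents the whole table (rows and columns):
prices_df = DataFrame([
    {"timestamp": "2024-01-01", "close": 470.5},
    {"timestamp": "2024-02-01", "close": 497.1},
    {"timestamp": "2024-03-01", "close": 521.4},
])
print(prices_df)

# selecting a single column gives us a Series:
closes = prices_df["close"]
print(type(closes))  # <class 'pandas.core.series.Series'>
```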
diff --git a/docs/notes/pandas/series.qmd b/docs/notes/pandas/series.qmd new file mode 100644 index 0000000..e5ec2a9 --- /dev/null +++ b/docs/notes/pandas/series.qmd @@ -0,0 +1,8 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- diff --git a/docs/notes/pandas/shift-methods.qmd b/docs/notes/pandas/shift-methods.qmd new file mode 100644 index 0000000..c6acf6f --- /dev/null +++ b/docs/notes/pandas/shift-methods.qmd @@ -0,0 +1,131 @@ +--- +format: + html: + code-fold: false +jupyter: python3 +execute: + cache: true # re-render only when source changes +--- + +# Shift-based Methods + +So far we have been performing operations using values within the same row, or by operating on all values in a given column. + +But what if we want to compare a value in one row to corresponding values in the rows below or above? + +We can use \"shift-based\" methods for this purpose. + + +Let's consider this simple example time series dataset: + +```{python} +from pandas import DataFrame + +gdp_df = DataFrame([ + {"year": 1990, "gdp": 100}, + {"year": 1991, "gdp": 105}, + {"year": 1992, "gdp": 110}, + {"year": 1993, "gdp": 115}, + {"year": 1994, "gdp": 110} + +]) +gdp_df.head() +``` + +Before performing shift-based methods, because row order matters, it is important to ensure the rows are sorted in the proper order (usually in ascending order by date). + +```{python} +# sorting by year for good measure: +gdp_df.sort_values(by=["year"], ascending=True, inplace=True) +gdp_df.head() +``` + +We can use the dataframe's [`shift` method](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html) to reference a corresponding value in another row above or below by specifying the number of rows above or below via the `periods` parameter. + + + +We use positive numbers to reference rows above, and negative numbers to reference cells below: + + +```{python} +gdp_df["gdp"].shift(periods=1) # 1 or -1 depending on order +``` + + +```{python} +gdp_df["gdp"].shift(periods=-1) # 1 or -1 depending on order +``` + +## Growth and Percent Change + +By comparing current against previous values, this allows us to perform an ad-hoc percent growth calculation from one period to the next. + +Here we illustrate the methods one step at a time. + + +```{python} +# creating a new column using previous values: +gdp_df["gdp_prev"] = gdp_df["gdp"].shift(periods=1) +gdp_df +``` + +```{python} +# calculating change: +gdp_df["gdp_change"] = gdp_df["gdp"] - gdp_df["gdp_prev"] +gdp_df +``` + +```{python} +# calculating percent change: +#gdp_df["gdp_growth"] = (gdp_df["gdp"] - gdp_df["gdp_prev"]) / gdp_df["gdp_prev"] + +gdp_df["gdp_pct_change"] = gdp_df["gdp_change"] / gdp_df["gdp_prev"] +gdp_df +``` + + +Even though we are able to perform this growth calculation ourselves, we should know the dataframe has a dedicated [`pct_change` method](https://pandas.pydata.org/docs/reference/api/pandas.Series.pct_change.html) for this purpose, which allows us to skip the intermediate steps: + +```{python} +# equivalent, leveraging the pct_change method: +gdp_df["gdp_pct_change"] = gdp_df["gdp"].pct_change(periods=1) + +gdp_df[["year", "gdp", "gdp_pct_change"]] +``` + +## Cumulative Growth + +Alright, we have studied how to calculate growth from one period to another, but what about calculating cumulative growth over the entire time period? 
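The key idea is that period-over-period growth factors compound by multiplication. Using the toy GDP series above: growing from 100 to 105 is a factor of 1.05, and growing from 105 to 110 is a factor of roughly 1.0476, so the cumulative growth over those two periods is $1.05 \times 1.0476 \approx 1.10$, i.e. 10% in total (the same as 110 / 100).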
+ +To calculate cumulative growth for a particular period, we can use the([`cumprod` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.cumprod.html) to calculate the cumulative product (or the [`product` method](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.product.html), depending on the use case). When calculating cumulative product, each value gets multiplied by the values that follow, in succession. + + +Before we calculate a product, to make the multiplication work, we'll need the growth numbers to be relative to 1, rather than 0. We'll also need to fill in the initial null value with a 1, so the first period represents 100%. + +Let's break this down one step at a time, to illustrate each method, before putting them all together at the end. + + +Overwriting an initial null value that results from there being no previous row for the first row: + +```{python} +gdp_df.loc[0, "gdp_pct_change"] = 0 + +gdp_df +``` + +Expressing growth relative to one instead of zero (so we can calculate cumulative product later): + +```{python} +gdp_df["gdp_pct_change"] = gdp_df["gdp_pct_change"] + 1 +``` + +Calculating the cumulative product of the growth: + + +```{python} +#gdp_df["cumulative_growth"] = (gdp_df["gdp_growth"] + 1).cumprod() + +gdp_df["gdp_cumulative_growth"] = gdp_df["gdp_pct_change"].cumprod() + +gdp_df +``` diff --git a/docs/requirements.txt b/docs/requirements.txt index 27ae452..7987b9e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -20,4 +20,9 @@ requests pandas beautifulsoup4 lxml # bs4 needs this to parse XML + +scipy + + + #gspread==6.0.2
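The `scipy` dependency added to the requirements above presumably supports the new applied statistics notes. As a minimal, self-contained sketch of the kind of test it enables (toy data, not taken from the notes themselves):

```python
# minimal sketch with toy data: a two-sample t-test using scipy
from scipy.stats import ttest_ind

group_a = [1.2, 2.4, 1.8, 2.9, 2.2]
group_b = [2.8, 3.1, 2.6, 3.4, 3.0]

result = ttest_ind(group_a, group_b)
print("t-statistic:", result.statistic)
print("p-value:", result.pvalue)
```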