Change return text for Fastapi

MarkoBrie · Mar 4, 2024 · 0e4e34c · 0e4e34c
1 parent b5a708b
commit 0e4e34c
Show file tree

Hide file tree

Showing 10 changed files with 308,415 additions and 3,119 deletions.
diff --git a/.ipynb_checkpoints/2_Model_selection-checkpoint.ipynb b/.ipynb_checkpoints/2_Model_selection-checkpoint.ipynb
@@ -13,6 +13,9 @@
    "id": "d2c08eb9",
    "metadata": {},
    "source": [
+    "**Supervised:** The labels are included in the training data, and the goal is to train a model to predict the labels from the features.  \n",
+    "**Classification:** The label is a binary variable: 0 (will repay the loan on time) or 1 (will have difficulty repaying the loan).\n",
+    "\n",
     "- Evaluation des performances des modèles d’apprentissage supervisé selon différents critères (scores, temps d'entraînement, etc.) en adaptant les paramètres afin de choisir le modèle le plus performant pour la problématique métier.\n",
     "- Calcul d'un score avec fp + 10 * fn\n",
     "\n",
@@ -45,7 +48,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 258,
    "id": "1c8b0045",
    "metadata": {},
    "outputs": [],
@@ -58,6 +61,7 @@
     "\n",
     "from sklearn.model_selection import train_test_split, GridSearchCV\n",
     "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from sklearn.preprocessing import MinMaxScaler\n",
     "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.metrics import precision_recall_curve\n",
     "from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score\n",
@@ -128,7 +132,7 @@
   {
    "cell_type": "code",
    "execution_count": 255,
-   "id": "81ed0da2",
+   "id": "2d0bbc23",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2318,61 +2322,64 @@
    "id": "9e08b8cf",
    "metadata": {},
    "source": [
-    "<a name='4'></a>\n",
+    "<a name='5'></a>\n",
     "# 5 Evaluation and selection of best model"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 254,
+   "execution_count": 264,
    "id": "73845c9f",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "                       time_in_s  FP_10_FN       FP      TP  Accuracy  \\\n",
-      "Run Name                                                                \n",
-      "LGBM_Shap002          508.117909   35429.0  15139.0  2855.0  0.720859   \n",
-      "RFC_newFEATURE_001  13614.842443   36741.0  19881.0  3198.0  0.649334   \n",
-      "XGB_Shap002           401.376422   32718.0  12948.0  2907.0  0.757329   \n",
-      "RFC_newFEATURE_002   5635.813629   35370.0  17020.0  3049.0  0.693430   \n",
-      "RFC_newFEATURE      12256.283958   36534.0  19814.0  3212.0  0.650651   \n",
-      "XGB                 26824.582238   32675.0  12635.0  2880.0  0.761979   \n",
-      "RFC                 13921.119884   37986.0  18046.0  2890.0  0.674162   \n",
-      "RFC_smote            1510.164263   35067.0  10407.0  2418.0  0.790693   \n",
-      "LightGBM_smote        302.068259   35415.0  15435.0  2886.0  0.716550   \n",
-      "LightGBM              499.470917   35415.0  15435.0  2886.0  0.716550   \n",
-      "XGB_smote             194.733210   33112.0  18292.0  3402.0  0.678487   \n",
+      "                        time_in_s  FP_10_FN       FP      TP  Accuracy  \\\n",
+      "Run Name                                                                 \n",
+      "LGBM_Shap002_scaled    485.448492   34979.0  15079.0  2894.0  0.722469   \n",
+      "LGBM_Shap002           508.117909   35429.0  15139.0  2855.0  0.720859   \n",
+      "RFC_newFEATURE_001   13614.842443   36741.0  19881.0  3198.0  0.649334   \n",
+      "XGB_Shap002            401.376422   32718.0  12948.0  2907.0  0.757329   \n",
+      "RFC_newFEATURE_002    5635.813629   35370.0  17020.0  3049.0  0.693430   \n",
+      "RFC_newFEATURE       12256.283958   36534.0  19814.0  3212.0  0.650651   \n",
+      "XGB                  26824.582238   32675.0  12635.0  2880.0  0.761979   \n",
+      "RFC                  13921.119884   37986.0  18046.0  2890.0  0.674162   \n",
+      "RFC_smote             1510.164263   35067.0  10407.0  2418.0  0.790693   \n",
+      "LightGBM_smote         302.068259   35415.0  15435.0  2886.0  0.716550   \n",
+      "LightGBM               499.470917   35415.0  15435.0  2886.0  0.716550   \n",
+      "XGB_smote              194.733210   33112.0  18292.0  3402.0  0.678487   \n",
       "\n",
-      "                      Recall  threshold   ROC_AUC      FN  Precision  \\\n",
-      "Run Name                                                               \n",
-      "LGBM_Shap002        0.584562        0.2  0.658589  2029.0   0.158664   \n",
-      "RFC_newFEATURE_001  0.654791        0.1  0.651827  1686.0   0.138568   \n",
-      "XGB_Shap002         0.595209        0.1  0.683261  1977.0   0.183349   \n",
-      "RFC_newFEATURE_002  0.624283        0.1  0.661839  1835.0   0.151926   \n",
-      "RFC_newFEATURE      0.657658        0.1  0.653852  1672.0   0.139494   \n",
-      "XGB                 0.589681        0.1  0.683261  2004.0   0.185627   \n",
-      "RFC                 0.591728        0.1  0.636501  1994.0   0.138040   \n",
-      "RFC_smote           0.495086        0.4  0.655639  2466.0   0.188538   \n",
-      "LightGBM_smote      0.590909        0.2  0.659149  1998.0   0.157524   \n",
-      "LightGBM            0.590909        0.2  0.659149  1998.0   0.157524   \n",
-      "XGB_smote           0.696560        0.3  0.686744  1482.0   0.156818   \n",
+      "                       Recall  threshold   ROC_AUC      FN  Precision  \\\n",
+      "Run Name                                                                \n",
+      "LGBM_Shap002_scaled  0.592547        0.2  0.663112  1990.0   0.161019   \n",
+      "LGBM_Shap002         0.584562        0.2  0.658589  2029.0   0.158664   \n",
+      "RFC_newFEATURE_001   0.654791        0.1  0.651827  1686.0   0.138568   \n",
+      "XGB_Shap002          0.595209        0.1  0.683261  1977.0   0.183349   \n",
+      "RFC_newFEATURE_002   0.624283        0.1  0.661839  1835.0   0.151926   \n",
+      "RFC_newFEATURE       0.657658        0.1  0.653852  1672.0   0.139494   \n",
+      "XGB                  0.589681        0.1  0.683261  2004.0   0.185627   \n",
+      "RFC                  0.591728        0.1  0.636501  1994.0   0.138040   \n",
+      "RFC_smote            0.495086        0.4  0.655639  2466.0   0.188538   \n",
+      "LightGBM_smote       0.590909        0.2  0.659149  1998.0   0.157524   \n",
+      "LightGBM             0.590909        0.2  0.659149  1998.0   0.157524   \n",
+      "XGB_smote            0.696560        0.3  0.686744  1482.0   0.156818   \n",
       "\n",
-      "                          F1       TN  \n",
-      "Run Name                               \n",
-      "LGBM_Shap002        0.249585  41480.0  \n",
-      "RFC_newFEATURE_001  0.228731  36738.0  \n",
-      "XGB_Shap002         0.280341  43671.0  \n",
-      "RFC_newFEATURE_002  0.244379  39599.0  \n",
-      "RFC_newFEATURE      0.230168  36805.0  \n",
-      "XGB                 0.282367  43984.0  \n",
-      "RFC                 0.223857  38573.0  \n",
-      "RFC_smote           0.273081  46212.0  \n",
-      "LightGBM_smote      0.248739  41184.0  \n",
-      "LightGBM            0.248739  41184.0  \n",
-      "XGB_smote           0.256001  38327.0  \n"
+      "                           F1       TN  \n",
+      "Run Name                                \n",
+      "LGBM_Shap002_scaled  0.253227  41540.0  \n",
+      "LGBM_Shap002         0.249585  41480.0  \n",
+      "RFC_newFEATURE_001   0.228731  36738.0  \n",
+      "XGB_Shap002          0.280341  43671.0  \n",
+      "RFC_newFEATURE_002   0.244379  39599.0  \n",
+      "RFC_newFEATURE       0.230168  36805.0  \n",
+      "XGB                  0.282367  43984.0  \n",
+      "RFC                  0.223857  38573.0  \n",
+      "RFC_smote            0.273081  46212.0  \n",
+      "LightGBM_smote       0.248739  41184.0  \n",
+      "LightGBM             0.248739  41184.0  \n",
+      "XGB_smote            0.256001  38327.0  \n"
      ]
     }
    ],
@@ -2533,7 +2540,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "cebf98bc",
+   "id": "f5ffa864",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3271,7 +3278,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "0b3389b1",
+   "id": "d555a1ce",
    "metadata": {},
    "source": [
     "<a name='7'></a>\n",
@@ -3280,7 +3287,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "df6a7489",
+   "id": "a5654c2e",
    "metadata": {},
    "source": [
     "## Filter not useful features"
@@ -3379,7 +3386,7 @@
   {
    "cell_type": "code",
    "execution_count": 237,
-   "id": "b09617a1",
+   "id": "7faa22f0",
    "metadata": {},
    "outputs": [
     {
@@ -3603,7 +3610,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b7b2f0ca",
+   "id": "6f9d46be",
    "metadata": {},
    "source": [
     "### First attempt to improve feature selection and model training"
@@ -4648,7 +4655,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ce809a7d",
+   "id": "4e795d20",
    "metadata": {},
    "source": [
     "### Second attempt to improve feature selection and model improvement"
@@ -4657,7 +4664,7 @@
   {
    "cell_type": "code",
    "execution_count": 253,
-   "id": "c1d66850",
+   "id": "54a0b6a6",
    "metadata": {},
    "outputs": [
     {
@@ -4847,7 +4854,7 @@
   {
    "cell_type": "code",
    "execution_count": 228,
-   "id": "db117fe4",
+   "id": "c8e8fb39",
    "metadata": {},
    "outputs": [
     {
@@ -4963,7 +4970,7 @@
   {
    "cell_type": "code",
    "execution_count": 231,
-   "id": "339217b2",
+   "id": "3335d4b5",
    "metadata": {},
    "outputs": [
     {
@@ -5097,7 +5104,7 @@
   {
    "cell_type": "code",
    "execution_count": 251,
-   "id": "36e5f24d",
+   "id": "3be94358",
    "metadata": {},
    "outputs": [
     {
@@ -5327,7 +5334,7 @@
   {
    "cell_type": "code",
    "execution_count": 245,
-   "id": "0ab66442",
+   "id": "786f91bb",
    "metadata": {},
    "outputs": [
     {
@@ -5569,6 +5576,49 @@
     "new_X_train_002.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "884ca746",
+   "metadata": {},
+   "source": [
+    "## Run with SHAP-filtered and scaled data to assess the impact on metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 259,
+   "id": "ed32852c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_002_scale, X_test_002_scale = scale_data(new_X_train_002, new_X_test_002)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 263,
+   "id": "ea1a18d6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Artifact PATH LGBM_Shap002_scaled_artifactPATH\n",
+      "{'TN': 41540, 'FP': 15079, 'FN': 1990, 'TP': 2894, 'FP_10_FN': 34979, 'Accuracy': 0.7224688226590573, 'F1': 0.2532265826661417, 'Precision': 0.16101930673788462, 'Recall': 0.5925470925470926, 'ROC_AUC': 0.6631115335216432, 'threshold': 0.2, 'time_in_s': 485.4484920501709}\n",
+      "{'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'objective': 'binary', 'num_leaves': 31, 'n_estimators': 10000, 'metric': 'binary_logloss', 'learning_rate': 0.05, 'class_weight': 'balanced', 'boosting_type': 'gbdt'}\n",
+      "Active run_id: d9587cc928794dddac2b5407fd4412c6\n"
+     ]
+    }
+   ],
+   "source": [
+    "run_name = \"LGBM_Shap002_scaled\"\n",
+    "#LGBM_model_002_scale, LGBM_002_scale_params, time_LGBM_002 = train_LightGBM_model(X_train_002_scale, Y_train)\n",
+    "#LGBM_002_scale_metrics, best_metrics_LGBM_scale           = generate_model_report(LGBM_model_002_scale, run_name, X_test_002_scale, Y_test, time_LGBM_002)\n",
+    "run_MLflow(experiment_name, run_name, LGBM_002_scale_metrics, \n",
+    "           LGBM_002_scale_params, LGBM_model_002_scale, X_train_002_scale)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8233f52e",