model.py removed

MarkoBrie · Mar 4, 2024 · b5a708b · b5a708b
1 parent b8833a5
commit b5a708b
Show file tree

Hide file tree

Showing 5 changed files with 2,199 additions and 734 deletions.
diff --git a/.ipynb_checkpoints/2_Model_selection-checkpoint.ipynb b/.ipynb_checkpoints/2_Model_selection-checkpoint.ipynb
diff --git a/2_Model_selection.ipynb b/2_Model_selection.ipynb
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 258,
    "id": "1c8b0045",
    "metadata": {},
    "outputs": [],
@@ -58,6 +58,7 @@
     "\n",
     "from sklearn.model_selection import train_test_split, GridSearchCV\n",
     "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from sklearn.preprocessing import MinMaxScaler\n",
     "from sklearn.ensemble import RandomForestClassifier\n",
     "from sklearn.metrics import precision_recall_curve\n",
     "from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score\n",
@@ -127,18 +128,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 252,
-   "id": "19dc90c7",
+   "execution_count": 255,
+   "id": "e615899c",
    "metadata": {},
    "outputs": [],
    "source": [
     "def scale_data(df_train, df_test):\n",
-    "    # Scale the domainnomial features\n",
-    "    scaler = MinMaxScaler(feature_range = (0, 1))\n",
+    "    \"\"\"\n",
+    "    Scale the features in the training and testing datasets using Min-Max scaling.\n",
+    "\n",
+    "    Args:\n",
+    "    df_train (DataFrame): The training dataset to be scaled.\n",
+    "    df_test (DataFrame): The testing dataset to be scaled.\n",
+    "\n",
+    "    Returns:\n",
+    "    df_train_scaled (DataFrame): The scaled training dataset.\n",
+    "    df_test_scaled (DataFrame): The scaled testing dataset.\n",
+    "    \"\"\"\n",
+    "    # Initialize MinMaxScaler with feature range between 0 and 1\n",
+    "    scaler = MinMaxScaler(feature_range=(0, 1))\n",
+    "\n",
+    "    # Fit and transform the training dataset\n",
+    "    df_train_scaled = scaler.fit_transform(df_train)\n",
     "\n",
-    "    df_train = scaler.fit_transform(df_train)\n",
-    "    df_test = scaler.transform(df_test)\n",
-    "    return df_train, df_test"
+    "    # Transform the testing dataset using the same scaler fitted on the training data\n",
+    "    df_test_scaled = scaler.transform(df_test)\n",
+    "\n",
+    "    return df_train_scaled, df_test_scaled"
    ]
   },
   {
@@ -2518,7 +2534,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "44e60bc6",
+   "id": "4e456a3a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -3256,7 +3272,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ff16d6e2",
+   "id": "2d8e6c39",
    "metadata": {},
    "source": [
     "<a name='7'></a>\n",
@@ -3265,7 +3281,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "57009d8f",
+   "id": "1a2ff8da",
    "metadata": {},
    "source": [
     "## Filter not useful features"
@@ -3364,7 +3380,7 @@
   {
    "cell_type": "code",
    "execution_count": 237,
-   "id": "1e3c05c9",
+   "id": "f59f05e1",
    "metadata": {},
    "outputs": [
     {
@@ -3588,7 +3604,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "659c04a5",
+   "id": "090091d6",
    "metadata": {},
    "source": [
     "### First attempt to improve feature selection and model training"
@@ -4633,7 +4649,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "23408fe1",
+   "id": "aba1d118",
    "metadata": {},
    "source": [
     "### Second attempt to improve feature selection and model improvement"
@@ -4642,7 +4658,7 @@
   {
    "cell_type": "code",
    "execution_count": 253,
-   "id": "08d83fe3",
+   "id": "ebf04268",
    "metadata": {},
    "outputs": [
     {
@@ -4832,7 +4848,7 @@
   {
    "cell_type": "code",
    "execution_count": 228,
-   "id": "338f0ee3",
+   "id": "bf4fd419",
    "metadata": {},
    "outputs": [
     {
@@ -4948,7 +4964,7 @@
   {
    "cell_type": "code",
    "execution_count": 231,
-   "id": "482c7fea",
+   "id": "dc0d7d7a",
    "metadata": {},
    "outputs": [
     {
@@ -5082,7 +5098,7 @@
   {
    "cell_type": "code",
    "execution_count": 251,
-   "id": "d79df67a",
+   "id": "3af6dc9d",
    "metadata": {},
    "outputs": [
     {
@@ -5312,7 +5328,7 @@
   {
    "cell_type": "code",
    "execution_count": 245,
-   "id": "c59241fb",
+   "id": "069b126d",
    "metadata": {},
    "outputs": [
     {
@@ -5554,6 +5570,133 @@
     "new_X_train_002.head()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "d24f329c",
+   "metadata": {},
+   "source": [
+    "## Run with Shap filtered and scaled data to assess impact on metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 259,
+   "id": "5bc23e7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_002_scale, X_test_002_scale = scale_data(new_X_train_002, new_X_test_002)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 261,
+   "id": "19fc9fc7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "START time Mon Mar  4 10:37:28 2024\n",
+      "Fitting 5 folds for each of 1 candidates, totalling 5 fits\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "The total space of parameters 1 is smaller than n_iter=50. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
+      "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+      "A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[LightGBM] [Info] Number of positive: 19941, number of negative: 226067\n",
+      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034363 seconds.\n",
+      "You can set `force_row_wise=true` to remove the overhead.\n",
+      "And if memory is not enough, you can set `force_col_wise=true`.\n",
+      "[LightGBM] [Info] Total Bins 10311\n",
+      "[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 96\n",
+      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000\n",
+      "[LightGBM] [Info] Start training from score 0.000000\n",
+      "START time Mon Mar  4 10:37:28 2024\n",
+      "END time Mon Mar  4 10:45:34 2024  duration 8.090808200836182 min\n"
+     ]
+    },
+    {
+     "ename": "NameError",
+     "evalue": "name 'X_test_002__scale' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[261], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m run_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLGBM_Shap002_scaled\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m      2\u001b[0m LGBM_model_002_scale, LGBM_002_scale_params, time_LGBM_002 \u001b[38;5;241m=\u001b[39m train_LightGBM_model(X_train_002_scale, Y_train)\n\u001b[0;32m----> 3\u001b[0m LGBM_002_scale_metrics, best_metrics_LGBM_scale           \u001b[38;5;241m=\u001b[39m generate_model_report(LGBM_model_002_scale, run_name, \u001b[43mX_test_002__scale\u001b[49m, Y_test, time_LGBM_002)\n\u001b[1;32m      4\u001b[0m run_MLflow(experiment_name, run_name, LGBM_002__scale_metrics, \n\u001b[1;32m      5\u001b[0m            LGBM_002_scale_params, LGBM_model_002_scale, X_train_002_scale)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'X_test_002__scale' is not defined"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[LightGBM] [Info] Number of positive: 15953, number of negative: 180854\n",
+      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107711 seconds.\n",
+      "You can set `force_row_wise=true` to remove the overhead.\n",
+      "And if memory is not enough, you can set `force_col_wise=true`.\n",
+      "[LightGBM] [Info] Total Bins 10236\n",
+      "[LightGBM] [Info] Number of data points in the train set: 196807, number of used features: 96\n",
+      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000\n",
+      "[LightGBM] [Info] Start training from score 0.000000\n",
+      "[CV 5/5] END boosting_type=gbdt, class_weight=balanced, learning_rate=0.05, metric=binary_logloss, n_estimators=10000, num_leaves=31, objective=binary, reg_alpha=0.1, reg_lambda=0.1, subsample=0.8;, score=0.888 total time= 5.5min\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+      "  y = column_or_1d(y, warn=True)\n",
+      "/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:132: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+      "  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[LightGBM] [Info] Number of positive: 15953, number of negative: 180854\n",
+      "[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049505 seconds.\n",
+      "You can set `force_row_wise=true` to remove the overhead.\n",
+      "And if memory is not enough, you can set `force_col_wise=true`.\n",
+      "[LightGBM] [Info] Total Bins 10305\n",
+      "[LightGBM] [Info] Number of data points in the train set: 196807, number of used features: 96\n",
+      "[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000\n",
+      "[LightGBM] [Info] Start training from score 0.000000\n",
+      "[CV 4/5] END boosting_type=gbdt, class_weight=balanced, learning_rate=0.05, metric=binary_logloss, n_estimators=10000, num_leaves=31, objective=binary, reg_alpha=0.1, reg_lambda=0.1, subsample=0.8;, score=0.885 total time= 5.5min\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+      "  y = column_or_1d(y, warn=True)\n",
+      "/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:132: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
+      "  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "run_name = \"LGBM_Shap002_scaled\"\n",
+    "LGBM_model_002_scale, LGBM_002_scale_params, time_LGBM_002 = train_LightGBM_model(X_train_002_scale, Y_train)\n",
+    "LGBM_002_scale_metrics, best_metrics_LGBM_scale           = generate_model_report(LGBM_model_002_scale, run_name, X_test_002_scale, Y_test, time_LGBM_002)\n",
+    "run_MLflow(experiment_name, run_name, LGBM_002_scale_metrics, \n",
+    "           LGBM_002_scale_params, LGBM_model_002_scale, X_train_002_scale)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8233f52e",

diff --git a/Dashboard_test.ipynb b/Dashboard_test.ipynb
diff --git a/Model.py b/Model.py
diff --git a/main.py b/main.py
@@ -52,7 +52,6 @@ def predict_credit_score(data: DataPoint):
 
         sklearn_pyfunc = mlflow.lightgbm.load_model(model_uri="LightGBM")
 
-
         prediction = sklearn_pyfunc.predict_proba([data.data_point]).max()
 
         return {
Original file line number	Diff line number	Diff line change
Expand Up		@@ -52,7 +52,6 @@ def predict_credit_score(data: DataPoint):

		sklearn_pyfunc = mlflow.lightgbm.load_model(model_uri="LightGBM")


		prediction = sklearn_pyfunc.predict_proba([data.data_point]).max()

		return {
Expand Down