Skip to content

Commit

Permalink
model.py removed
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkoBrie committed Mar 4, 2024
1 parent b8833a5 commit b5a708b
Show file tree
Hide file tree
Showing 5 changed files with 2,199 additions and 734 deletions.
2,650 changes: 1,969 additions & 681 deletions .ipynb_checkpoints/2_Model_selection-checkpoint.ipynb

Large diffs are not rendered by default.

181 changes: 162 additions & 19 deletions 2_Model_selection.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
},
{
"cell_type": "code",
"execution_count": 139,
"execution_count": 258,
"id": "1c8b0045",
"metadata": {},
"outputs": [],
Expand All @@ -58,6 +58,7 @@
"\n",
"from sklearn.model_selection import train_test_split, GridSearchCV\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import precision_recall_curve\n",
"from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score\n",
Expand Down Expand Up @@ -127,18 +128,33 @@
},
{
"cell_type": "code",
"execution_count": 252,
"id": "19dc90c7",
"execution_count": 255,
"id": "e615899c",
"metadata": {},
"outputs": [],
"source": [
"def scale_data(df_train, df_test):\n",
" # Scale all features to the [0, 1] range\n",
" scaler = MinMaxScaler(feature_range = (0, 1))\n",
" \"\"\"\n",
" Scale the features in the training and testing datasets using Min-Max scaling.\n",
"\n",
" Args:\n",
" df_train (DataFrame): The training dataset to be scaled.\n",
" df_test (DataFrame): The testing dataset to be scaled.\n",
"\n",
" Returns:\n",
" df_train_scaled (ndarray): The scaled training dataset.\n",
" df_test_scaled (ndarray): The scaled testing dataset.\n",
" \"\"\"\n",
" # Initialize MinMaxScaler with feature range between 0 and 1\n",
" scaler = MinMaxScaler(feature_range=(0, 1))\n",
"\n",
" # Fit and transform the training dataset\n",
" df_train_scaled = scaler.fit_transform(df_train)\n",
"\n",
" df_train = scaler.fit_transform(df_train)\n",
" df_test = scaler.transform(df_test)\n",
" return df_train, df_test"
" # Transform the testing dataset using the same scaler fitted on the training data\n",
" df_test_scaled = scaler.transform(df_test)\n",
"\n",
" return df_train_scaled, df_test_scaled"
]
},
{
Expand Down Expand Up @@ -2518,7 +2534,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "44e60bc6",
"id": "4e456a3a",
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -3256,7 +3272,7 @@
},
{
"cell_type": "markdown",
"id": "ff16d6e2",
"id": "2d8e6c39",
"metadata": {},
"source": [
"<a name='7'></a>\n",
Expand All @@ -3265,7 +3281,7 @@
},
{
"cell_type": "markdown",
"id": "57009d8f",
"id": "1a2ff8da",
"metadata": {},
"source": [
"## Filter not useful features"
Expand Down Expand Up @@ -3364,7 +3380,7 @@
{
"cell_type": "code",
"execution_count": 237,
"id": "1e3c05c9",
"id": "f59f05e1",
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -3588,7 +3604,7 @@
},
{
"cell_type": "markdown",
"id": "659c04a5",
"id": "090091d6",
"metadata": {},
"source": [
"### First attempt to improve feature selection and model training"
Expand Down Expand Up @@ -4633,7 +4649,7 @@
},
{
"cell_type": "markdown",
"id": "23408fe1",
"id": "aba1d118",
"metadata": {},
"source": [
"### Second attempt to improve feature selection and model improvement"
Expand All @@ -4642,7 +4658,7 @@
{
"cell_type": "code",
"execution_count": 253,
"id": "08d83fe3",
"id": "ebf04268",
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -4832,7 +4848,7 @@
{
"cell_type": "code",
"execution_count": 228,
"id": "338f0ee3",
"id": "bf4fd419",
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -4948,7 +4964,7 @@
{
"cell_type": "code",
"execution_count": 231,
"id": "482c7fea",
"id": "dc0d7d7a",
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -5082,7 +5098,7 @@
{
"cell_type": "code",
"execution_count": 251,
"id": "d79df67a",
"id": "3af6dc9d",
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -5312,7 +5328,7 @@
{
"cell_type": "code",
"execution_count": 245,
"id": "c59241fb",
"id": "069b126d",
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -5554,6 +5570,133 @@
"new_X_train_002.head()"
]
},
{
"cell_type": "markdown",
"id": "d24f329c",
"metadata": {},
"source": [
"## Run with Shap filtered and scaled data to assess impact on metrics"
]
},
{
"cell_type": "code",
"execution_count": 259,
"id": "5bc23e7c",
"metadata": {},
"outputs": [],
"source": [
"X_train_002_scale, X_test_002_scale = scale_data(new_X_train_002, new_X_test_002)"
]
},
{
"cell_type": "code",
"execution_count": 261,
"id": "19fc9fc7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"START time Mon Mar 4 10:37:28 2024\n",
"Fitting 5 folds for each of 1 candidates, totalling 5 fits\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The total space of parameters 1 is smaller than n_iter=50. Running 1 iterations. For exhaustive searches, use GridSearchCV.\n",
"A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
"A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LightGBM] [Info] Number of positive: 19941, number of negative: 226067\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034363 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 10311\n",
"[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 96\n",
"[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000\n",
"[LightGBM] [Info] Start training from score 0.000000\n",
"START time Mon Mar 4 10:37:28 2024\n",
"END time Mon Mar 4 10:45:34 2024 duration 8.090808200836182 min\n"
]
},
{
"ename": "NameError",
"evalue": "name 'X_test_002__scale' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[261], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m run_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mLGBM_Shap002_scaled\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m LGBM_model_002_scale, LGBM_002_scale_params, time_LGBM_002 \u001b[38;5;241m=\u001b[39m train_LightGBM_model(X_train_002_scale, Y_train)\n\u001b[0;32m----> 3\u001b[0m LGBM_002_scale_metrics, best_metrics_LGBM_scale \u001b[38;5;241m=\u001b[39m generate_model_report(LGBM_model_002_scale, run_name, \u001b[43mX_test_002__scale\u001b[49m, Y_test, time_LGBM_002)\n\u001b[1;32m 4\u001b[0m run_MLflow(experiment_name, run_name, LGBM_002__scale_metrics, \n\u001b[1;32m 5\u001b[0m LGBM_002_scale_params, LGBM_model_002_scale, X_train_002_scale)\n",
"\u001b[0;31mNameError\u001b[0m: name 'X_test_002__scale' is not defined"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LightGBM] [Info] Number of positive: 15953, number of negative: 180854\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107711 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 10236\n",
"[LightGBM] [Info] Number of data points in the train set: 196807, number of used features: 96\n",
"[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000\n",
"[LightGBM] [Info] Start training from score 0.000000\n",
"[CV 5/5] END boosting_type=gbdt, class_weight=balanced, learning_rate=0.05, metric=binary_logloss, n_estimators=10000, num_leaves=31, objective=binary, reg_alpha=0.1, reg_lambda=0.1, subsample=0.8;, score=0.888 total time= 5.5min\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n",
"/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:132: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LightGBM] [Info] Number of positive: 15953, number of negative: 180854\n",
"[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049505 seconds.\n",
"You can set `force_row_wise=true` to remove the overhead.\n",
"And if memory is not enough, you can set `force_col_wise=true`.\n",
"[LightGBM] [Info] Total Bins 10305\n",
"[LightGBM] [Info] Number of data points in the train set: 196807, number of used features: 96\n",
"[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000\n",
"[LightGBM] [Info] Start training from score 0.000000\n",
"[CV 4/5] END boosting_type=gbdt, class_weight=balanced, learning_rate=0.05, metric=binary_logloss, n_estimators=10000, num_leaves=31, objective=binary, reg_alpha=0.1, reg_lambda=0.1, subsample=0.8;, score=0.885 total time= 5.5min\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:97: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, warn=True)\n",
"/Users/markobriesemann/opt/anaconda3/lib/python3.8/site-packages/sklearn/preprocessing/_label.py:132: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
" y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)\n"
]
}
],
"source": [
"run_name = \"LGBM_Shap002_scaled\"\n",
"LGBM_model_002_scale, LGBM_002_scale_params, time_LGBM_002 = train_LightGBM_model(X_train_002_scale, Y_train)\n",
"LGBM_002_scale_metrics, best_metrics_LGBM_scale = generate_model_report(LGBM_model_002_scale, run_name, X_test_002_scale, Y_test, time_LGBM_002)\n",
"run_MLflow(experiment_name, run_name, LGBM_002_scale_metrics, \n",
" LGBM_002_scale_params, LGBM_model_002_scale, X_train_002_scale)"
]
},
{
"cell_type": "markdown",
"id": "8233f52e",
Expand Down
77 changes: 68 additions & 9 deletions Dashboard_test.ipynb

Large diffs are not rendered by default.

24 changes: 0 additions & 24 deletions Model.py

This file was deleted.

1 change: 0 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def predict_credit_score(data: DataPoint):

sklearn_pyfunc = mlflow.lightgbm.load_model(model_uri="LightGBM")


prediction = sklearn_pyfunc.predict_proba([data.data_point]).max()

return {
Expand Down

0 comments on commit b5a708b

Please sign in to comment.