Add option to use different degrees of freedom for HL test. Add docum…

…entation for it.
DIDSR · Oct 25, 2024 · 93fe7de · 93fe7de
1 parent 61258e5
commit 93fe7de
Show file tree

Hide file tree

Showing 27 changed files with 474 additions and 121 deletions.
diff --git a/GUI_cal_metrics.py b/GUI_cal_metrics.py
@@ -63,6 +63,11 @@ def run_program():
         args.append("--verbose")
     if topclass_checkbox.value:
         args.append("--topclass")
+    if hl_test_validation_checkbox.value:
+        if not any(metric in selected_metrics for metric in ["HL-H", "HL-C"]):
+            ui.notify("Error: HL test validation requires either HL-H or HL-C metric to be selected.", type="error")
+            return
+        args.append("--hl_test_validation")
 
     command = ["python", "cal_metrics.py"] + args
     print("Running command:", " ".join(command))
@@ -110,7 +115,6 @@ def clear_cache():
         }
         clearCache();
     ''')
-    ui.notify('Browser cache cleared')
 
     plot_image.clear()
     plot_image.set_source(None)  # Set the image source to None
@@ -159,6 +163,7 @@ async def pick_file() -> None:
                                              value=1, step=1)
         num_bins_input = ui.number(label='Number of Bins for ECE/MCE/HL Test',
                                    value=10, min=2, step=1)
+        hl_test_validation_checkbox = ui.checkbox('HL Test Validation set', value=False)
 
     with ui.column().classes('w-1/3 p-4'):
         ui.label('Output Paths:')

diff --git a/cal_metrics.py b/cal_metrics.py
@@ -21,7 +21,10 @@ def perform_calculation(probs, labels, args, suffix=""):
     cal_metrics = CalibrationMetrics(
         class_to_calculate=args.class_to_calculate, num_bins=args.num_bins
     )
-
+    if args.hl_test_validation:
+        df = args.num_bins
+    else:
+        df = args.num_bins - 2
     metrics_to_calculate = args.metrics.split(",") if args.metrics else ["all"]
     if metrics_to_calculate == ["all"]:
         metrics_to_calculate = "all"
@@ -30,6 +33,7 @@ def perform_calculation(probs, labels, args, suffix=""):
         y_proba=probs,
         metrics=metrics_to_calculate,
         perform_pervalance_adjustment=args.prevalence_adjustment,
+        df = df
     )
 
     keys = list(result.keys())
@@ -42,6 +46,7 @@ def perform_calculation(probs, labels, args, suffix=""):
             n_samples=args.n_bootstrap,
             metrics=metrics_to_calculate,
             perform_pervalance_adjustment=args.prevalence_adjustment,
+            df = df
         )
         CI = get_CI(bootstrap_results)
         result = np.vstack((result, np.array(list(CI.values())).T))
@@ -214,6 +219,12 @@ def main():
         default=10,
         help="Number of bins for ECE/MCE/HL calculations (default: 10)",
     )
+    parser.add_argument(
+        "--hl_test_validation",
+        default=False,
+        action="store_true",
+        help="Using nbin instead of nbin-2 as HL test DOF. Use it if the dataset is validation set.",
+    )
     parser.add_argument(
         "--topclass",
         default=False,

diff --git a/calzone/metrics.py b/calzone/metrics.py
@@ -16,7 +16,7 @@
 import numpy.lib.recfunctions as rf
 import contextlib
 
-def hosmer_lemeshow_test(reliability, confidence, bin_count, df=None):
+def hosmer_lemeshow_test(reliability, confidence, bin_count, df=None, **kwargs):
     """
     Compute the Hosmer-Lemeshow test for goodness of fit.
 
@@ -546,7 +546,7 @@ def calculate_metrics(
                     results["MCE-H"] = mce_h_class
                 elif metric == "HL-H":
                     hl_h_score, hl_h, _ = hosmer_lemeshow_test(
-                        acc_H_class, confidence_H_class, bin_count_H_class
+                        acc_H_class, confidence_H_class, bin_count_H_class, **kwargs
                     )
                     results["HL-H score"] = hl_h_score
                     results["HL-H p-value"] = hl_h
@@ -591,7 +591,7 @@ def calculate_metrics(
                     results["MCE-C"] = mce_c_class
                 elif metric == "HL-C":
                     hl_c_score, hl_c, _ = hosmer_lemeshow_test(
-                        acc_C_class, confidence_C_class, bin_count_C_class
+                        acc_C_class, confidence_C_class, bin_count_C_class, **kwargs
                     )
                     results["HL-C score"] = hl_c_score
                     results["HL-C p-value"] = hl_c

diff --git a/docs/build/doctrees/calzone.doctree b/docs/build/doctrees/calzone.doctree
diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle
diff --git a/docs/build/doctrees/nbsphinx/notebooks/hl_test.ipynb b/docs/build/doctrees/nbsphinx/notebooks/hl_test.ipynb
@@ -20,7 +20,7 @@
     "\\text{HL} = \\sum_{m=1}^{M} \\left[\\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}} + \\frac{(O_{0,m}-E_{0,m})^2}{E_{0,m}}\\right]  = \\sum_{m=1}^{M} \\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}(1-\\frac{E_{1,m}}{N_m})} \\sim \\chi^2_{M-2}\n",
     "$$\n",
     "\n",
-    "where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic is distributed as a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. (Notice that the degree of freedom of HL test is $M-2$ by default but some literature suggests that the degree of freedom should be $M$ instead when the samples is not used for training. We could not find a proof to this statement and we provides the option to specify the degree of freedom in the `calzone`. The default value is still $M-2$.)"
+    "where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic follows a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. Note that the degrees of freedom of the HL test are $M-2$ by default, but some literature suggests that they should be $M$ instead when the samples were not used for training. `calzone` provides an option to specify the degrees of freedom; the default value is still $M-2$."
    ]
   },
   {
@@ -280,7 +280,54 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can see that both the equal-width and the equal-count method have the incorrect size."
+    "We can see that both the equal-width and the equal-count methods have an incorrect size. The simulation supports the claim that the degrees of freedom should be $M$ instead of $M-2$, which we can show with the simulation below. We do not prove the claim here since it is beyond the scope of this documentation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The size of HL-H with df=M is : 0.047\n",
+      "The size of HL-C with df=M  is : 0.055\n"
+     ]
+    }
+   ],
+   "source": [
+    "### The size of HL Test\n",
+    "from calzone.utils import fake_binary_data_generator\n",
+    "from importlib import reload\n",
+    "import calzone.metrics\n",
+    "reload(calzone.metrics)\n",
+    "from calzone.metrics import CalibrationMetrics\n",
+    "np.random.seed(123)\n",
+    "fakedata_generator = fake_binary_data_generator(alpha_val=0.5, beta_val=0.5)\n",
+    "cal_metrics = CalibrationMetrics()\n",
+    "sample_size = 1000\n",
+    "simulation_size = 10000\n",
+    "results = []\n",
+    "# generate data\n",
+    "for i in range(simulation_size):\n",
+    "    X, y = fakedata_generator.generate_data(sample_size)\n",
+    "    if i == 0:\n",
+    "        tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=False, df = 10)\n",
+    "        keys = list(tempresult.keys())\n",
+    "        results.append(np.array(list(tempresult.values())))\n",
+    "    else:\n",
+    "        tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=True, df = 10)\n",
+    "        results.append(tempresult)\n",
+    "results = np.array(results)\n",
+    "\n",
+    "hl_h_pvalue = results[:,1]\n",
+    "hl_c_pvalue = results[:,3]\n",
+    "size_h = np.mean(hl_h_pvalue < 0.05)\n",
+    "size_c = np.mean(hl_c_pvalue < 0.05)\n",
+    "print(\"The size of HL-H with df=M is :\", round(size_h,3))\n",
+    "print(\"The size of HL-C with df=M  is :\", round(size_c,3))"
    ]
   },
   {

diff --git a/docs/build/doctrees/nbsphinx/notebooks/quickstart.ipynb b/docs/build/doctrees/nbsphinx/notebooks/quickstart.ipynb
@@ -184,7 +184,8 @@
       "                      [--prevalence_adjustment] [--n_bootstrap N_BOOTSTRAP]\n",
       "                      [--bootstrap_ci BOOTSTRAP_CI]\n",
       "                      [--class_to_calculate CLASS_TO_CALCULATE]\n",
-      "                      [--num_bins NUM_BINS] [--topclass]\n",
+      "                      [--num_bins NUM_BINS]\n",
+      "                      [--hl_test_validation HL_TEST_VALIDATION] [--topclass]\n",
       "                      [--save_metrics SAVE_METRICS] [--plot]\n",
       "                      [--plot_bins PLOT_BINS] [--save_plot SAVE_PLOT]\n",
       "                      [--save_diagram_output SAVE_DIAGRAM_OUTPUT] [--verbose]\n",
@@ -211,14 +212,18 @@
       "                        Class to calculate metrics for (default: 1)\n",
       "  --num_bins NUM_BINS   Number of bins for ECE/MCE/HL calculations (default:\n",
       "                        10)\n",
+      "  --hl_test_validation HL_TEST_VALIDATION\n",
+      "                        Using nbin instead of nbin-2 as HL test DOF. Use it if\n",
+      "                        the dataset is validation set.\n",
       "  --topclass            Whether to transform the problem to top-class problem.\n",
       "  --save_metrics SAVE_METRICS\n",
       "                        Save the metrics to a csv file\n",
       "  --plot                Plot reliability diagram (default: False)\n",
       "  --plot_bins PLOT_BINS\n",
       "                        Number of bins for reliability diagram\n",
       "  --save_plot SAVE_PLOT\n",
-      "                        Save the plot to a file\n",
+      "                        Save the plot to a file. Must end with valid image\n",
+      "                        formats.\n",
       "  --save_diagram_output SAVE_DIAGRAM_OUTPUT\n",
       "                        Save the reliability diagram output to a file\n",
       "  --verbose             Print verbose output\n"
@@ -290,7 +295,8 @@
     "--verbose \\\n",
     "--save_diagram_output '../../../example_data/simulated_welldata_diagram_output.csv' \n",
     "### save_diagram_output only when you want to save the reliability diagram output\n",
-    "#--prevalence_adjustment # only when you want to apply prevalence adjustment"
+    "#--prevalence_adjustment # only when you want to apply prevalence adjustment\n",
+    "#--hl_test_validation #use it only when the data is from validation set"
    ]
   },
   {

diff --git a/docs/build/doctrees/notebooks/hl_test.doctree b/docs/build/doctrees/notebooks/hl_test.doctree
diff --git a/docs/build/doctrees/notebooks/quickstart.doctree b/docs/build/doctrees/notebooks/quickstart.doctree
diff --git a/docs/build/html/_modules/calzone/metrics.html b/docs/build/html/_modules/calzone/metrics.html
@@ -105,7 +105,7 @@ <h1>Source code for calzone.metrics</h1><div class="highlight"><pre>
 
 <div class="viewcode-block" id="hosmer_lemeshow_test">
 <a class="viewcode-back" href="../../calzone.html#calzone.metrics.hosmer_lemeshow_test">[docs]</a>
-<span class="k">def</span> <span class="nf">hosmer_lemeshow_test</span><span class="p">(</span><span class="n">reliability</span><span class="p">,</span> <span class="n">confidence</span><span class="p">,</span> <span class="n">bin_count</span><span class="p">,</span> <span class="n">df</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
+<span class="k">def</span> <span class="nf">hosmer_lemeshow_test</span><span class="p">(</span><span class="n">reliability</span><span class="p">,</span> <span class="n">confidence</span><span class="p">,</span> <span class="n">bin_count</span><span class="p">,</span> <span class="n">df</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
 <span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
 <span class="sd">    Compute the Hosmer-Lemeshow test for goodness of fit.</span>
 
@@ -667,7 +667,7 @@ <h1>Source code for calzone.metrics</h1><div class="highlight"><pre>
                     <span class="n">results</span><span class="p">[</span><span class="s2">&quot;MCE-H&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">mce_h_class</span>
                 <span class="k">elif</span> <span class="n">metric</span> <span class="o">==</span> <span class="s2">&quot;HL-H&quot;</span><span class="p">:</span>
                     <span class="n">hl_h_score</span><span class="p">,</span> <span class="n">hl_h</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">hosmer_lemeshow_test</span><span class="p">(</span>
-                        <span class="n">acc_H_class</span><span class="p">,</span> <span class="n">confidence_H_class</span><span class="p">,</span> <span class="n">bin_count_H_class</span>
+                        <span class="n">acc_H_class</span><span class="p">,</span> <span class="n">confidence_H_class</span><span class="p">,</span> <span class="n">bin_count_H_class</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
                     <span class="p">)</span>
                     <span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-H score&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_h_score</span>
                     <span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-H p-value&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_h</span>
@@ -712,7 +712,7 @@ <h1>Source code for calzone.metrics</h1><div class="highlight"><pre>
                     <span class="n">results</span><span class="p">[</span><span class="s2">&quot;MCE-C&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">mce_c_class</span>
                 <span class="k">elif</span> <span class="n">metric</span> <span class="o">==</span> <span class="s2">&quot;HL-C&quot;</span><span class="p">:</span>
                     <span class="n">hl_c_score</span><span class="p">,</span> <span class="n">hl_c</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">hosmer_lemeshow_test</span><span class="p">(</span>
-                        <span class="n">acc_C_class</span><span class="p">,</span> <span class="n">confidence_C_class</span><span class="p">,</span> <span class="n">bin_count_C_class</span>
+                        <span class="n">acc_C_class</span><span class="p">,</span> <span class="n">confidence_C_class</span><span class="p">,</span> <span class="n">bin_count_C_class</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
                     <span class="p">)</span>
                     <span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-C score&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_c_score</span>
                     <span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-C p-value&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_c</span>

diff --git a/docs/build/html/_sources/notebooks/hl_test.ipynb.txt b/docs/build/html/_sources/notebooks/hl_test.ipynb.txt
@@ -20,7 +20,7 @@
     "\\text{HL} = \\sum_{m=1}^{M} \\left[\\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}} + \\frac{(O_{0,m}-E_{0,m})^2}{E_{0,m}}\\right]  = \\sum_{m=1}^{M} \\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}(1-\\frac{E_{1,m}}{N_m})} \\sim \\chi^2_{M-2}\n",
     "$$\n",
     "\n",
-    "where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic is distributed as a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. (Notice that the degree of freedom of HL test is $M-2$ by default but some literature suggests that the degree of freedom should be $M$ instead when the samples is not used for training. We could not find a proof to this statement and we provides the option to specify the degree of freedom in the `calzone`. The default value is still $M-2$.)"
+    "where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic follows a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. Note that the degrees of freedom of the HL test are $M-2$ by default, but some literature suggests that they should be $M$ instead when the samples were not used for training. `calzone` provides an option to specify the degrees of freedom; the default value is still $M-2$."
    ]
   },
   {
@@ -280,7 +280,54 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can see that both the equal-width and the equal-count method have the incorrect size."
+    "We can see that both the equal-width and the equal-count methods have an incorrect size. The simulation supports the claim that the degrees of freedom should be $M$ instead of $M-2$, which we can show with the simulation below. We do not prove the claim here since it is beyond the scope of this documentation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The size of HL-H with df=M is : 0.047\n",
+      "The size of HL-C with df=M  is : 0.055\n"
+     ]
+    }
+   ],
+   "source": [
+    "### The size of HL Test\n",
+    "from calzone.utils import fake_binary_data_generator\n",
+    "from importlib import reload\n",
+    "import calzone.metrics\n",
+    "reload(calzone.metrics)\n",
+    "from calzone.metrics import CalibrationMetrics\n",
+    "np.random.seed(123)\n",
+    "fakedata_generator = fake_binary_data_generator(alpha_val=0.5, beta_val=0.5)\n",
+    "cal_metrics = CalibrationMetrics()\n",
+    "sample_size = 1000\n",
+    "simulation_size = 10000\n",
+    "results = []\n",
+    "# generate data\n",
+    "for i in range(simulation_size):\n",
+    "    X, y = fakedata_generator.generate_data(sample_size)\n",
+    "    if i == 0:\n",
+    "        tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=False, df = 10)\n",
+    "        keys = list(tempresult.keys())\n",
+    "        results.append(np.array(list(tempresult.values())))\n",
+    "    else:\n",
+    "        tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=True, df = 10)\n",
+    "        results.append(tempresult)\n",
+    "results = np.array(results)\n",
+    "\n",
+    "hl_h_pvalue = results[:,1]\n",
+    "hl_c_pvalue = results[:,3]\n",
+    "size_h = np.mean(hl_h_pvalue < 0.05)\n",
+    "size_c = np.mean(hl_c_pvalue < 0.05)\n",
+    "print(\"The size of HL-H with df=M is :\", round(size_h,3))\n",
+    "print(\"The size of HL-C with df=M  is :\", round(size_c,3))"
    ]
   },
   {