Skip to content

Commit

Permalink
Add option to use different degree of freedom for HL test. Add docum…
Browse files Browse the repository at this point in the history
…entation for it.
  • Loading branch information
jasonfan1997 committed Oct 25, 2024
1 parent 61258e5 commit 93fe7de
Show file tree
Hide file tree
Showing 27 changed files with 474 additions and 121 deletions.
7 changes: 6 additions & 1 deletion GUI_cal_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ def run_program():
args.append("--verbose")
if topclass_checkbox.value:
args.append("--topclass")
if hl_test_validation_checkbox.value:
if not any(metric in selected_metrics for metric in ["HL-H", "HL-C"]):
ui.notify("Error: HL test validation requires either HL-H or HL-C metric to be selected.", type="error")
return
args.append("--hl_test_validation")

command = ["python", "cal_metrics.py"] + args
print("Running command:", " ".join(command))
Expand Down Expand Up @@ -110,7 +115,6 @@ def clear_cache():
}
clearCache();
''')
ui.notify('Browser cache cleared')

plot_image.clear()
plot_image.set_source(None) # Set the image source to None
Expand Down Expand Up @@ -159,6 +163,7 @@ async def pick_file() -> None:
value=1, step=1)
num_bins_input = ui.number(label='Number of Bins for ECE/MCE/HL Test',
value=10, min=2, step=1)
hl_test_validation_checkbox = ui.checkbox('HL Test Validation set', value=False)

with ui.column().classes('w-1/3 p-4'):
ui.label('Output Paths:')
Expand Down
13 changes: 12 additions & 1 deletion cal_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ def perform_calculation(probs, labels, args, suffix=""):
cal_metrics = CalibrationMetrics(
class_to_calculate=args.class_to_calculate, num_bins=args.num_bins
)

if args.hl_test_validation:
df = args.num_bins
else:
df = args.num_bins - 2
metrics_to_calculate = args.metrics.split(",") if args.metrics else ["all"]
if metrics_to_calculate == ["all"]:
metrics_to_calculate = "all"
Expand All @@ -30,6 +33,7 @@ def perform_calculation(probs, labels, args, suffix=""):
y_proba=probs,
metrics=metrics_to_calculate,
perform_pervalance_adjustment=args.prevalence_adjustment,
df = df
)

keys = list(result.keys())
Expand All @@ -42,6 +46,7 @@ def perform_calculation(probs, labels, args, suffix=""):
n_samples=args.n_bootstrap,
metrics=metrics_to_calculate,
perform_pervalance_adjustment=args.prevalence_adjustment,
df = df
)
CI = get_CI(bootstrap_results)
result = np.vstack((result, np.array(list(CI.values())).T))
Expand Down Expand Up @@ -214,6 +219,12 @@ def main():
default=10,
help="Number of bins for ECE/MCE/HL calculations (default: 10)",
)
parser.add_argument(
"--hl_test_validation",
default=False,
action="store_true",
help="Using nbin instead of nbin-2 as HL test DOF. Use it if the dataset is validation set.",
)
parser.add_argument(
"--topclass",
default=False,
Expand Down
6 changes: 3 additions & 3 deletions calzone/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import numpy.lib.recfunctions as rf
import contextlib

def hosmer_lemeshow_test(reliability, confidence, bin_count, df=None):
def hosmer_lemeshow_test(reliability, confidence, bin_count, df=None, **kwargs):
"""
Compute the Hosmer-Lemeshow test for goodness of fit.
Expand Down Expand Up @@ -546,7 +546,7 @@ def calculate_metrics(
results["MCE-H"] = mce_h_class
elif metric == "HL-H":
hl_h_score, hl_h, _ = hosmer_lemeshow_test(
acc_H_class, confidence_H_class, bin_count_H_class
acc_H_class, confidence_H_class, bin_count_H_class, **kwargs
)
results["HL-H score"] = hl_h_score
results["HL-H p-value"] = hl_h
Expand Down Expand Up @@ -591,7 +591,7 @@ def calculate_metrics(
results["MCE-C"] = mce_c_class
elif metric == "HL-C":
hl_c_score, hl_c, _ = hosmer_lemeshow_test(
acc_C_class, confidence_C_class, bin_count_C_class
acc_C_class, confidence_C_class, bin_count_C_class, **kwargs
)
results["HL-C score"] = hl_c_score
results["HL-C p-value"] = hl_c
Expand Down
Binary file modified docs/build/doctrees/calzone.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/environment.pickle
Binary file not shown.
51 changes: 49 additions & 2 deletions docs/build/doctrees/nbsphinx/notebooks/hl_test.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"\\text{HL} = \\sum_{m=1}^{M} \\left[\\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}} + \\frac{(O_{0,m}-E_{0,m})^2}{E_{0,m}}\\right] = \\sum_{m=1}^{M} \\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}(1-\\frac{E_{1,m}}{N_m})} \\sim \\chi^2_{M-2}\n",
"$$\n",
"\n",
"where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic is distributed as a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. (Notice that the degree of freedom of HL test is $M-2$ by default but some literature suggests that the degree of freedom should be $M$ instead when the samples is not used for training. We could not find a proof to this statement and we provides the option to specify the degree of freedom in the `calzone`. The default value is still $M-2$.)"
"where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic is distributed as a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. Notice that the degree of freedom of HL test is $M-2$ by default but some literature suggests that the degree of freedom should be $M$ instead when the samples is not used for training. We provides the option to specify the degree of freedom in the `calzone`. The default value is still $M-2$."
]
},
{
Expand Down Expand Up @@ -280,7 +280,54 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that both the equal-width and the equal-count method have the incorrect size."
"We can see that both the equal-width and the equal-count method have the incorrect size. The simulation support the claim that the degree of freedom should be M instead of M-2. We can show it with simulation. We are not proving the claim here since it is beyond the scope of this documentation."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The size of HL-H with df=M is : 0.047\n",
"The size of HL-C with df=M is : 0.055\n"
]
}
],
"source": [
"### The size of HL Test\n",
"from calzone.utils import fake_binary_data_generator\n",
"from importlib import reload\n",
"import calzone.metrics\n",
"reload(calzone.metrics)\n",
"from calzone.metrics import CalibrationMetrics\n",
"np.random.seed(123)\n",
"fakedata_generator = fake_binary_data_generator(alpha_val=0.5, beta_val=0.5)\n",
"cal_metrics = CalibrationMetrics()\n",
"sample_size = 1000\n",
"simulation_size = 10000\n",
"results = []\n",
"# generate data\n",
"for i in range(simulation_size):\n",
" X, y = fakedata_generator.generate_data(sample_size)\n",
" if i == 0:\n",
" tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=False, df = 10)\n",
" keys = list(tempresult.keys())\n",
" results.append(np.array(list(tempresult.values())))\n",
" else:\n",
" tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=True, df = 10)\n",
" results.append(tempresult)\n",
"results = np.array(results)\n",
"\n",
"hl_h_pvalue = results[:,1]\n",
"hl_c_pvalue = results[:,3]\n",
"size_h = np.mean(hl_h_pvalue < 0.05)\n",
"size_c = np.mean(hl_c_pvalue < 0.05)\n",
"print(\"The size of HL-H with df=M is :\", round(size_h,3))\n",
"print(\"The size of HL-C with df=M is :\", round(size_c,3))"
]
},
{
Expand Down
12 changes: 9 additions & 3 deletions docs/build/doctrees/nbsphinx/notebooks/quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,8 @@
" [--prevalence_adjustment] [--n_bootstrap N_BOOTSTRAP]\n",
" [--bootstrap_ci BOOTSTRAP_CI]\n",
" [--class_to_calculate CLASS_TO_CALCULATE]\n",
" [--num_bins NUM_BINS] [--topclass]\n",
" [--num_bins NUM_BINS]\n",
" [--hl_test_validation HL_TEST_VALIDATION] [--topclass]\n",
" [--save_metrics SAVE_METRICS] [--plot]\n",
" [--plot_bins PLOT_BINS] [--save_plot SAVE_PLOT]\n",
" [--save_diagram_output SAVE_DIAGRAM_OUTPUT] [--verbose]\n",
Expand All @@ -211,14 +212,18 @@
" Class to calculate metrics for (default: 1)\n",
" --num_bins NUM_BINS Number of bins for ECE/MCE/HL calculations (default:\n",
" 10)\n",
" --hl_test_validation HL_TEST_VALIDATION\n",
" Using nbin instead of nbin-2 as HL test DOF. Use it if\n",
" the dataset is validation set.\n",
" --topclass Whether to transform the problem to top-class problem.\n",
" --save_metrics SAVE_METRICS\n",
" Save the metrics to a csv file\n",
" --plot Plot reliability diagram (default: False)\n",
" --plot_bins PLOT_BINS\n",
" Number of bins for reliability diagram\n",
" --save_plot SAVE_PLOT\n",
" Save the plot to a file\n",
" Save the plot to a file. Must end with valid image\n",
" formats.\n",
" --save_diagram_output SAVE_DIAGRAM_OUTPUT\n",
" Save the reliability diagram output to a file\n",
" --verbose Print verbose output\n"
Expand Down Expand Up @@ -290,7 +295,8 @@
"--verbose \\\n",
"--save_diagram_output '../../../example_data/simulated_welldata_diagram_output.csv' \n",
"### save_diagram_output only when you want to save the reliability diagram output\n",
"#--prevalence_adjustment # only when you want to apply prevalence adjustment"
"#--prevalence_adjustment # only when you want to apply prevalence adjustment\n",
"#--hl_test_validation #use it only when the data is from validation set"
]
},
{
Expand Down
Binary file modified docs/build/doctrees/notebooks/hl_test.doctree
Binary file not shown.
Binary file modified docs/build/doctrees/notebooks/quickstart.doctree
Binary file not shown.
6 changes: 3 additions & 3 deletions docs/build/html/_modules/calzone/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ <h1>Source code for calzone.metrics</h1><div class="highlight"><pre>

<div class="viewcode-block" id="hosmer_lemeshow_test">
<a class="viewcode-back" href="../../calzone.html#calzone.metrics.hosmer_lemeshow_test">[docs]</a>
<span class="k">def</span> <span class="nf">hosmer_lemeshow_test</span><span class="p">(</span><span class="n">reliability</span><span class="p">,</span> <span class="n">confidence</span><span class="p">,</span> <span class="n">bin_count</span><span class="p">,</span> <span class="n">df</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">hosmer_lemeshow_test</span><span class="p">(</span><span class="n">reliability</span><span class="p">,</span> <span class="n">confidence</span><span class="p">,</span> <span class="n">bin_count</span><span class="p">,</span> <span class="n">df</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute the Hosmer-Lemeshow test for goodness of fit.</span>

Expand Down Expand Up @@ -667,7 +667,7 @@ <h1>Source code for calzone.metrics</h1><div class="highlight"><pre>
<span class="n">results</span><span class="p">[</span><span class="s2">&quot;MCE-H&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">mce_h_class</span>
<span class="k">elif</span> <span class="n">metric</span> <span class="o">==</span> <span class="s2">&quot;HL-H&quot;</span><span class="p">:</span>
<span class="n">hl_h_score</span><span class="p">,</span> <span class="n">hl_h</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">hosmer_lemeshow_test</span><span class="p">(</span>
<span class="n">acc_H_class</span><span class="p">,</span> <span class="n">confidence_H_class</span><span class="p">,</span> <span class="n">bin_count_H_class</span>
<span class="n">acc_H_class</span><span class="p">,</span> <span class="n">confidence_H_class</span><span class="p">,</span> <span class="n">bin_count_H_class</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
<span class="p">)</span>
<span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-H score&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_h_score</span>
<span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-H p-value&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_h</span>
Expand Down Expand Up @@ -712,7 +712,7 @@ <h1>Source code for calzone.metrics</h1><div class="highlight"><pre>
<span class="n">results</span><span class="p">[</span><span class="s2">&quot;MCE-C&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">mce_c_class</span>
<span class="k">elif</span> <span class="n">metric</span> <span class="o">==</span> <span class="s2">&quot;HL-C&quot;</span><span class="p">:</span>
<span class="n">hl_c_score</span><span class="p">,</span> <span class="n">hl_c</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">hosmer_lemeshow_test</span><span class="p">(</span>
<span class="n">acc_C_class</span><span class="p">,</span> <span class="n">confidence_C_class</span><span class="p">,</span> <span class="n">bin_count_C_class</span>
<span class="n">acc_C_class</span><span class="p">,</span> <span class="n">confidence_C_class</span><span class="p">,</span> <span class="n">bin_count_C_class</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
<span class="p">)</span>
<span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-C score&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_c_score</span>
<span class="n">results</span><span class="p">[</span><span class="s2">&quot;HL-C p-value&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">hl_c</span>
Expand Down
51 changes: 49 additions & 2 deletions docs/build/html/_sources/notebooks/hl_test.ipynb.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"\\text{HL} = \\sum_{m=1}^{M} \\left[\\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}} + \\frac{(O_{0,m}-E_{0,m})^2}{E_{0,m}}\\right] = \\sum_{m=1}^{M} \\frac{(O_{1,m}-E_{1,m})^2}{E_{1,m}(1-\\frac{E_{1,m}}{N_m})} \\sim \\chi^2_{M-2}\n",
"$$\n",
"\n",
"where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic is distributed as a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. (Notice that the degree of freedom of HL test is $M-2$ by default but some literature suggests that the degree of freedom should be $M$ instead when the samples is not used for training. We could not find a proof to this statement and we provides the option to specify the degree of freedom in the `calzone`. The default value is still $M-2$.)"
"where $E_{1,m}$ is the expected number of class 1 events in the $\\text{m}^{th}$ bin, $O_{1,m}$ is the observed number of class 1 events in the $\\text{m}^{th}$ bin, $N_m$ is the total number of observations in the $\\text{m}^{th}$ bin, and $M$ is the number of bins. The HL test statistic is distributed as a chi-squared distribution with $M-2$ degrees of freedom. We can then use this test statistic to calculate the p-value for the test and determine whether we can reject the null hypothesis that the model is well-calibrated. Notice that the degree of freedom of HL test is $M-2$ by default but some literature suggests that the degree of freedom should be $M$ instead when the samples is not used for training. We provides the option to specify the degree of freedom in the `calzone`. The default value is still $M-2$."
]
},
{
Expand Down Expand Up @@ -280,7 +280,54 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that both the equal-width and the equal-count method have the incorrect size."
"We can see that both the equal-width and the equal-count method have the incorrect size. The simulation support the claim that the degree of freedom should be M instead of M-2. We can show it with simulation. We are not proving the claim here since it is beyond the scope of this documentation."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The size of HL-H with df=M is : 0.047\n",
"The size of HL-C with df=M is : 0.055\n"
]
}
],
"source": [
"### The size of HL Test\n",
"from calzone.utils import fake_binary_data_generator\n",
"from importlib import reload\n",
"import calzone.metrics\n",
"reload(calzone.metrics)\n",
"from calzone.metrics import CalibrationMetrics\n",
"np.random.seed(123)\n",
"fakedata_generator = fake_binary_data_generator(alpha_val=0.5, beta_val=0.5)\n",
"cal_metrics = CalibrationMetrics()\n",
"sample_size = 1000\n",
"simulation_size = 10000\n",
"results = []\n",
"# generate data\n",
"for i in range(simulation_size):\n",
" X, y = fakedata_generator.generate_data(sample_size)\n",
" if i == 0:\n",
" tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=False, df = 10)\n",
" keys = list(tempresult.keys())\n",
" results.append(np.array(list(tempresult.values())))\n",
" else:\n",
" tempresult = cal_metrics.calculate_metrics(y, X, ['HL-H', 'HL-C'],return_numpy=True, df = 10)\n",
" results.append(tempresult)\n",
"results = np.array(results)\n",
"\n",
"hl_h_pvalue = results[:,1]\n",
"hl_c_pvalue = results[:,3]\n",
"size_h = np.mean(hl_h_pvalue < 0.05)\n",
"size_c = np.mean(hl_c_pvalue < 0.05)\n",
"print(\"The size of HL-H with df=M is :\", round(size_h,3))\n",
"print(\"The size of HL-C with df=M is :\", round(size_c,3))"
]
},
{
Expand Down
Loading

0 comments on commit 93fe7de

Please sign in to comment.