diff --git a/paper/paper.bib b/paper/paper.bib
index 079a83b..3a7661c 100644
--- a/paper/paper.bib
+++ b/paper/paper.bib
@@ -10,415 +10,274 @@ @article{meta_cusum
 }
 @article{loess_graph_austin,
-author = {Austin, Peter C. and Steyerberg, Ewout W.},
-title = {Graphical assessment of internal and external calibration of logistic regression models by using loess smoothers},
-journal = {Statistics in Medicine},
-volume = {33},
-number = {3},
-pages = {517-535},
-keywords = {logistic regression, prediction, calibration, graphical methods, prediction models},
-doi = {https://doi.org/10.1002/sim.5941},
-url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.5941},
-eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/sim.5941},
-abstract = {Predicting the probability of the occurrence of a binary outcome or condition is important in biomedical research. While assessing discrimination is an essential issue in developing and validating binary prediction models, less attention has been paid to methods for assessing model calibration. Calibration refers to the degree of agreement between observed and predicted probabilities and is often assessed by testing for lack-of-fit. The objective of our study was to examine the ability of graphical methods to assess the calibration of logistic regression models. We examined lack of internal calibration, which was related to misspecification of the logistic regression model, and external calibration, which was related to an overfit model or to shrinkage of the linear predictor. We conducted an extensive set of Monte Carlo simulations with a locally weighted least squares regression smoother (i.e., the loess algorithm) to examine the ability of graphical methods to assess model calibration. We found that loess-based methods were able to provide evidence of moderate departures from linearity and indicate omission of a moderately strong interaction. Misspecification of the link function was harder to detect. Visual patterns were clearer with higher sample sizes, higher incidence of the outcome, or higher discrimination. Loess-based methods were also able to identify the lack of calibration in external validation samples when an overfit regression model had been used. In conclusion, loess-based smoothing methods are adequate tools to graphically assess calibration and merit wider application. © 2013 The Authors. Statistics in Medicine published by John Wiley \& Sons, Ltd},
-year = {2014}
+ author = {Austin, Peter C. and Steyerberg, Ewout W.},
+ title = {Graphical assessment of internal and external calibration of logistic regression models by using loess smoothers},
+ journal = {Statistics in Medicine},
+ volume = {33},
+ number = {3},
+ pages = {517--535},
+ keywords = {logistic regression, prediction, calibration, graphical methods, prediction models},
+ doi = {10.1002/sim.5941},
+ url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.5941},
+ eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/sim.5941},
+ year = {2014}
+}
+
+@article{Naeini_ece,
+ title = {Obtaining Well Calibrated Probabilities Using Bayesian Binning},
+ volume = {29},
+ url = {https://ojs.aaai.org/index.php/AAAI/article/view/9602},
+ doi = {10.1609/aaai.v29i1.9602},
+ number = {1},
+ journal = {Proceedings of the AAAI Conference on Artificial Intelligence},
+ author = {Pakdaman Naeini, Mahdi and Cooper, Gregory and Hauskrecht, Milos},
+ year = {2015},
+ month = {Feb}
 }
-@article{ Naeini_ece
-, title={Obtaining Well Calibrated Probabilities Using Bayesian Binning}, volume={29}, url={https://ojs.aaai.org/index.php/AAAI/article/view/9602}, DOI={10.1609/aaai.v29i1.9602}, abstractNote={ <p> Learning probabilistic predictive models that are well calibrated is critical for many prediction and decision-making tasks in artificial intelligence. In this paper we present a new non-parametric calibration method called Bayesian Binning into Quantiles (BBQ) which addresses key limitations of existing calibration methods. The method post processes the output of a binary classification algorithm; thus, it can be readily combined with many existing classification algorithms. The method is computationally tractable, and empirically accurate, as evidenced by the set of experiments reported here on both real and simulated datasets. </p> }, number={1}, journal={Proceedings of the AAAI Conference on Artificial Intelligence}, author={Pakdaman Naeini, Mahdi and Cooper, Gregory and Hauskrecht, Milos}, year={2015}, month={Feb.}
-}
 @article{wilson_interval,
- title={Probable inference, the law of succession, and statistical inference},
- author={Wilson, Edwin B},
- journal={Journal of the American Statistical Association},
- volume={22},
- number={158},
- pages={209--212},
- year={1927},
- publisher={Taylor \& Francis}
-}
-@article { Brocker_reldia,
- author = "Jochen Bröcker and Leonard A. Smith",
Smith", - title = "Increasing the Reliability of Reliability Diagrams", - journal = "Weather and Forecasting", - year = "2007", - publisher = "American Meteorological Society", - address = "Boston MA, USA", - volume = "22", - number = "3", - doi = "10.1175/WAF993.1", - pages= "651 - 661", - url = "https://journals.ametsoc.org/view/journals/wefo/22/3/waf993_1.xml" + title = {Probable inference, the law of succession, and statistical inference}, + author = {Wilson, Edwin B}, + journal = {Journal of the American Statistical Association}, + volume = {22}, + number = {158}, + pages = {209--212}, + year = {1927}, + publisher = {Taylor \& Francis} +} + +@article{Brocker_reldia, + author = {Bröcker, Jochen and Smith, Leonard A.}, + title = {Increasing the Reliability of Reliability Diagrams}, + journal = {Weather and Forecasting}, + year = {2007}, + publisher = {American Meteorological Society}, + address = {Boston MA, USA}, + volume = {22}, + number = {3}, + doi = {10.1175/WAF993.1}, + pages = {651--661}, + url = {https://journals.ametsoc.org/view/journals/wefo/22/3/waf993_1.xml} } + @article{ICI_austin, -author = {Austin, Peter C. and Steyerberg, Ewout W.}, -title = {The Integrated Calibration Index (ICI) and related metrics for quantifying the calibration of logistic regression models}, -journal = {Statistics in Medicine}, -volume = {38}, -number = {21}, -pages = {4051-4065}, -keywords = {calibration, logistic regression, model validation}, -doi = {https://doi.org/10.1002/sim.8281}, -url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.8281}, -eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/sim.8281}, -abstract = {Assessing the calibration of methods for estimating the probability of the occurrence of a binary outcome is an important aspect of validating the performance of risk-prediction algorithms. Calibration commonly refers to the agreement between predicted and observed probabilities of the outcome. Graphical methods are an attractive approach to assess calibration, in which observed and predicted probabilities are compared using loess-based smoothing functions. We describe the Integrated Calibration Index (ICI) that is motivated by Harrell's Emax index, which is the maximum absolute difference between a smooth calibration curve and the diagonal line of perfect calibration. The ICI can be interpreted as weighted difference between observed and predicted probabilities, in which observations are weighted by the empirical density function of the predicted probabilities. As such, the ICI is a measure of calibration that explicitly incorporates the distribution of predicted probabilities. We also discuss two related measures of calibration, E50 and E90, which represent the median and 90th percentile of the absolute difference between observed and predicted probabilities. We illustrate the utility of the ICI, E50, and E90 by using them to compare the calibration of logistic regression with that of random forests and boosted regression trees for predicting mortality in patients hospitalized with a heart attack. The use of these numeric metrics permitted for a greater differentiation in calibration than was permissible by visual inspection of graphical calibration curves.}, -year = {2019} + author = {Austin, Peter C. 
+ title = {The Integrated Calibration Index (ICI) and related metrics for quantifying the calibration of logistic regression models},
+ journal = {Statistics in Medicine},
+ volume = {38},
+ number = {21},
+ pages = {4051--4065},
+ keywords = {calibration, logistic regression, model validation},
+ doi = {10.1002/sim.8281},
+ url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.8281},
+ eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/sim.8281},
+ year = {2019}
 }
+
 @article{beta-binomial,
- ISSN = {0006341X, 15410420},
- URL = {http://www.jstor.org/stable/2529131},
- abstract = {In part I, maximum likelihood (ML) estimation for the beta-binomial distribution (BBD) is considered. The BBD can be used as a model for the incidence in households of noninfectious disease. Typically households in which there are no cases of disease will not be included in the data. It is then necessary to fit a truncated BBD. Alternative modes of infection are discussed in part II. These give rise to a variety of models for the household distribution of the number of cases of a disease. The BBD is fitted to some data on the common cold and influenza. Other models have been fitted by previous authors to the same data. Independent epidemiological evidence would be necessary for choosing among these models.},
- author = {D. A. Griffiths},
- journal = {Biometrics},
- number = {4},
- pages = {637--648},
- publisher = {International Biometric Society},
- title = {Maximum Likelihood Estimation for the Beta-Binomial Distribution and an Application to the Household Distribution of the Total Number of Cases of a Disease},
- urldate = {2024-10-07},
- volume = {29},
- year = {1973}
+ issn = {0006341X, 15410420},
+ url = {http://www.jstor.org/stable/2529131},
+ author = {Griffiths, D. A.},
+ journal = {Biometrics},
+ number = {4},
+ pages = {637--648},
+ publisher = {International Biometric Society},
+ title = {Maximum Likelihood Estimation for the Beta-Binomial Distribution and an Application to the Household Distribution of the Total Number of Cases of a Disease},
+ volume = {29},
+ year = {1973}
 }
+
 @article{Murphy_reliability,
- ISSN = {00359254, 14679876},
- URL = {http://www.jstor.org/stable/2346866},
- abstract = {This paper briefly describes some results of operational and experimental programmes in the United States involving subjective probability forecasts of precipitation occurrence and of maximum and minimum temperatures. These results indicate that weather forecasters can formulate such forecasts in a reliable manner.},
- author = {Allan H. Murphy and Robert L. Winkler},
- journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)},
- number = {1},
- pages = {41--47},
- publisher = {[Royal Statistical Society, Oxford University Press]},
- title = {Reliability of Subjective Probability Forecasts of Precipitation and Temperature},
- urldate = {2024-11-05},
- volume = {26},
- year = {1977}
+ issn = {00359254, 14679876},
+ url = {http://www.jstor.org/stable/2346866},
+ author = {Murphy, Allan H. and Winkler, Robert L.},
+ journal = {Journal of the Royal Statistical Society. Series C (Applied Statistics)},
+ number = {1},
+ pages = {41--47},
+ publisher = {Royal Statistical Society},
+ title = {Reliability of Subjective Probability Forecasts of Precipitation and Temperature},
+ volume = {26},
+ year = {1977}
 }
+
 @article{Prevalence_HORSCH,
-title = {Prevalence Scaling: Applications to an Intelligent Workstation for the Diagnosis of Breast Cancer},
-journal = {Academic Radiology},
-volume = {15},
-number = {11},
-pages = {1446-1457},
-year = {2008},
-issn = {1076-6332},
-doi = {https://doi.org/10.1016/j.acra.2008.04.022},
-url = {https://www.sciencedirect.com/science/article/pii/S1076633208002936},
-author = {Karla Horsch and Maryellen L. Giger and Charles E. Metz},
-keywords = {ROC analysis, computer-aided diagnosis, mammography, ultrasound},
-abstract = {Rationale and Objectives
-Our goal was to investigate the effects of changes that the prevalence of cancer in a population have on the probability of malignancy (PM) output and an optimal combination of a true-positive fraction (TPF) and a false-positive fraction (FPF) of a mammographic and sonographic automatic classifier for the diagnosis of breast cancer.
-Materials and Methods
-We investigate how a prevalence-scaling transformation that is used to change the prevalence inherent in the computer estimates of the PM affects the numerical and histographic output of a previously developed multimodality intelligent workstation. Using Bayes' rule and the binormal model, we study how changes in the prevalence of cancer in the diagnostic breast population affect our computer classifiers' optimal operating points, as defined by maximizing the expected utility.
-Results
-Prevalence scaling affects the threshold at which a particular TPF and FPF pair is achieved. Tables giving the thresholds on the scaled PM estimates that result in particular pairs of TPF and FPF are presented. Histograms of PMs scaled to reflect clinically relevant prevalence values differ greatly from histograms of laboratory-designed PMs. The optimal pair (TPF, FPF) of our lower performing mammographic classifier is more sensitive to changes in clinical prevalence than that of our higher performing sonographic classifier.
-Conclusions
-Prevalence scaling can be used to change computer PM output to reflect clinically more appropriate prevalence. Relatively small changes in clinical prevalence can have large effects on the computer classifier's optimal operating point.}
+ title = {Prevalence Scaling: Applications to an Intelligent Workstation for the Diagnosis of Breast Cancer},
+ journal = {Academic Radiology},
+ volume = {15},
+ number = {11},
+ pages = {1446--1457},
+ year = {2008},
+ issn = {1076-6332},
+ doi = {10.1016/j.acra.2008.04.022},
+ url = {https://www.sciencedirect.com/science/article/pii/S1076633208002936},
+ author = {Horsch, Karla and Giger, Maryellen L. and Metz, Charles E.},
+ keywords = {ROC analysis, computer-aided diagnosis, mammography, ultrasound}
 }
 @article{hl_test,
-author = {David W. Hosmer and Stanley Lemesbow},
-title = {Goodness of fit tests for the multiple logistic regression model},
-journal = {Communications in Statistics - Theory and Methods},
-volume = {9},
-number = {10},
-pages = {1043--1069},
-year = {1980},
-publisher = {Taylor \& Francis},
-doi = {10.1080/03610928008827941},
-
-
-URL = {
-
-
- https://www.tandfonline.com/doi/abs/10.1080/03610928008827941
-
-
-},
-eprint = {
-
-
- https://www.tandfonline.com/doi/pdf/10.1080/03610928008827941
-
-
-}
-
+ author = {Hosmer, David W. and Lemesbow, Stanley},
+ title = {Goodness of fit tests for the multiple logistic regression model},
+ journal = {Communications in Statistics - Theory and Methods},
+ volume = {9},
+ number = {10},
+ pages = {1043--1069},
+ year = {1980},
+ publisher = {Taylor \& Francis},
+ doi = {10.1080/03610928008827941},
+ url = {https://www.tandfonline.com/doi/abs/10.1080/03610928008827941},
+ eprint = {https://www.tandfonline.com/doi/pdf/10.1080/03610928008827941}
 }
 @book{hosmer2013applied,
- title={Applied logistic regression},
- author={Hosmer Jr, David W and Lemeshow, Stanley and Sturdivant, Rodney X},
- year={2013},
- publisher={John Wiley \& Sons}
+ title = {Applied logistic regression},
+ author = {Hosmer Jr, David W and Lemeshow, Stanley and Sturdivant, Rodney X},
+ year = {2013},
+ publisher = {John Wiley \& Sons}
 }
+
 @article{Brocker_decompose,
-author = {Bröcker, Jochen},
-title = {Reliability, sufficiency, and the decomposition of proper scores},
-journal = {Quarterly Journal of the Royal Meteorological Society},
-volume = {135},
-number = {643},
-pages = {1512-1519},
-keywords = {probabilistic forecasts, scoring rules, reliability, resolution},
-doi = {https://doi.org/10.1002/qj.456},
-url = {https://rmets.onlinelibrary.wiley.com/doi/abs/10.1002/qj.456},
-eprint = {https://rmets.onlinelibrary.wiley.com/doi/pdf/10.1002/qj.456},
-abstract = {Abstract Scoring rules are an important tool for evaluating the performance of probabilistic forecasting schemes. A scoring rule is called strictly proper if its expectation is optimal if and only if the forecast probability represents the true distribution of the target. In the binary case, strictly proper scoring rules allow for a decomposition into terms related to the resolution and the reliability of a forecast. This fact is particularly well known for the Brier Score. In this article, this result is extended to forecasts for finite-valued targets. Both resolution and reliability are shown to have a positive effect on the score. It is demonstrated that resolution and reliability are directly related to forecast attributes that are desirable on grounds independent of the notion of scores. This finding can be considered an epistemological justification of measuring forecast quality by proper scoring rules. A link is provided to the original work of DeGroot and Fienberg, extending their concepts of sufficiency and refinement. The relation to the conjectured sharpness principle of Gneiting, et al., is elucidated. Copyright © 2009 Royal Meteorological Society},
-year = {2009}
+ author = {Bröcker, Jochen},
+ title = {Reliability, sufficiency, and the decomposition of proper scores},
+ journal = {Quarterly Journal of the Royal Meteorological Society},
+ volume = {135},
+ number = {643},
+ pages = {1512--1519},
+ keywords = {probabilistic forecasts, scoring rules, reliability, resolution},
+ doi = {10.1002/qj.456},
+ url = {https://rmets.onlinelibrary.wiley.com/doi/abs/10.1002/qj.456},
+ eprint = {https://rmets.onlinelibrary.wiley.com/doi/pdf/10.1002/qj.456},
+ year = {2009}
 }
+
 @article{gneiting2007strictly,
- title={Strictly proper scoring rules, prediction, and estimation},
- author={Gneiting, Tilmann and Raftery, Adrian E},
- journal={Journal of the American statistical Association},
- volume={102},
- number={477},
- pages={359--378},
- year={2007},
- publisher={Taylor \& Francis}
+ title = {Strictly proper scoring rules, prediction, and estimation},
+ author = {Gneiting, Tilmann and Raftery, Adrian E},
+ journal = {Journal of the American Statistical Association},
+ volume = {102},
+ number = {477},
+ pages = {359--378},
+ year = {2007},
+ publisher = {Taylor \& Francis}
 }
 @book{McCullagh:1989,
- added-at = {2010-01-10T01:48:50.000+0100},
 address = {London},
 author = {McCullagh, P. and Nelder, J. A.},
- biburl = {https://www.bibsonomy.org/bibtex/21236b0d4dcf920ff44d2c578d82bd780/vivion},
- date = {(1989)},
- interhash = {57a5eea9902828a90b76e8e38a420073},
- intrahash = {1236b0d4dcf920ff44d2c578d82bd780},
- keywords = {generalized glm linear models statistics},
- location = {London, UK: Chapman & Hall / CRC},
- publisher = {Chapman & Hall / CRC},
- timestamp = {2010-01-10T01:48:50.000+0100},
+ publisher = {Chapman \& Hall / CRC},
 title = {Generalized Linear Models},
- year = 1989
+ year = {1989}
 }
+
 @inbook{Calster_weak_cal,
-author = {Van Calster, Ben and Steyerberg, Ewout W.},
-publisher = {John Wiley & Sons, Ltd},
-isbn = {9781118445112},
-title = {Calibration of Prognostic Risk Scores},
-booktitle = {Wiley StatsRef: Statistics Reference Online},
-chapter = {},
-pages = {1-10},
-doi = {https://doi.org/10.1002/9781118445112.stat08078},
-url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/9781118445112.stat08078},
-eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/9781118445112.stat08078},
-year = {2018},
-keywords = {calibration, validation, prediction, regression model, goodness-of-fit, logistic regression, risk prediction model},
-abstract = {Abstract Prognostic risk scores provide individualized risk estimates for an outcome (“event”) based on included covariates. Calibration indicates the extent to which risk estimates are reliable. We review four increasingly stringent definitions of calibration: mean, weak, moderate, and strong calibration. The latter requires that the risk model perfectly corresponds to the observed proportions for every single covariate pattern, and hence can be regarded as impossible to achieve in practice (“utopic”). We present an overview of methods to assess calibration, including goodness-of-fit tests, summary measures, and graphical procedures. We show that estimation and visualization of the degree of (mis)calibration is essential, whereas testing is problematic. We illustrate the methods with a case study on the prediction of the histology of retroperitoneal lymph nodes following chemotherapy for testicular cancer.}
+ author = {Van Calster, Ben and Steyerberg, Ewout W.},
+ publisher = {John Wiley \& Sons, Ltd},
+ isbn = {9781118445112},
+ title = {Calibration of Prognostic Risk Scores},
+ booktitle = {Wiley StatsRef: Statistics Reference Online},
+ pages = {1--10},
+ doi = {10.1002/9781118445112.stat08078},
+ url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/9781118445112.stat08078},
+ eprint = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/9781118445112.stat08078},
+ year = {2018},
+ keywords = {calibration, validation, prediction, regression model, goodness-of-fit, logistic regression, risk prediction model}
 }
-@article{weijie_prevalence_adjustment,
-author = {Weijie Chen and Berkman Sahiner and Frank Samuelson and Aria Pezeshk and Nicholas Petrick},
-title ={Calibration of medical diagnostic classifier scores to the probability of disease},
-
-journal = {Statistical Methods in Medical Research},
-volume = {27},
-number = {5},
-pages = {1394-1409},
-year = {2018},
-doi = {10.1177/0962280216661371},
- note ={PMID: 27507287},
-
-URL = {
-
- https://doi.org/10.1177/0962280216661371
-
-
-},
-eprint = {
-
- https://doi.org/10.1177/0962280216661371
-
-
-
-}
-,
- abstract = { Scores produced by statistical classifiers in many clinical decision support systems and other medical diagnostic devices are generally on an arbitrary scale, so the clinical meaning of these scores is unclear. Calibration of classifier scores to a meaningful scale such as the probability of disease is potentially useful when such scores are used by a physician. In this work, we investigated three methods (parametric, semi-parametric, and non-parametric) for calibrating classifier scores to the probability of disease scale and developed uncertainty estimation techniques for these methods. We showed that classifier scores on arbitrary scales can be calibrated to the probability of disease scale without affecting their discrimination performance. With a finite dataset to train the calibration function, it is important to accompany the probability estimate with its confidence interval. Our simulations indicate that, when a dataset used for finding the transformation for calibration is also used for estimating the performance of calibration, the resubstitution bias exists for a performance metric involving the truth states in evaluating the calibration performance. However, the bias is small for the parametric and semi-parametric methods when the sample size is moderate to large (>100 per class). }
+@article{weijie_prevalence_adjustment,
+ author = {Chen, Weijie and Sahiner, Berkman and Samuelson, Frank and Pezeshk, Aria and Petrick, Nicholas},
+ title = {Calibration of medical diagnostic classifier scores to the probability of disease},
+ journal = {Statistical Methods in Medical Research},
+ volume = {27},
+ number = {5},
+ pages = {1394--1409},
+ year = {2018},
+ doi = {10.1177/0962280216661371},
+ note = {PMID: 27507287},
+ url = {https://doi.org/10.1177/0962280216661371},
+ eprint = {https://doi.org/10.1177/0962280216661371}
 }
 @article{DIAMOND199285,
-title = {What price perfection? Calibration and discrimination of clinical prediction models},
-journal = {Journal of Clinical Epidemiology},
-volume = {45},
-number = {1},
-pages = {85-89},
-year = {1992},
-issn = {0895-4356},
-doi = {https://doi.org/10.1016/0895-4356(92)90192-P},
-url = {https://www.sciencedirect.com/science/article/pii/089543569290192P},
-author = {George A. Diamond}
+ title = {What price perfection? Calibration and discrimination of clinical prediction models},
+ journal = {Journal of Clinical Epidemiology},
+ volume = {45},
+ number = {1},
+ pages = {85--89},
+ year = {1992},
+ issn = {0895-4356},
+ doi = {10.1016/0895-4356(92)90192-P},
+ url = {https://www.sciencedirect.com/science/article/pii/089543569290192P},
+ author = {Diamond, George A.}
 }
+
 @article{taquet2022mapie,
- title={MAPIE: an open-source library for distribution-free uncertainty quantification},
- author={Taquet, Vianney and Blot, Vincent and Morzadec, Thomas and Lacombe, Louis and Brunel, Nicolas},
- journal={arXiv preprint arXiv:2207.12274},
- year={2022}
+ title = {MAPIE: an open-source library for distribution-free uncertainty quantification},
+ author = {Taquet, Vianney and Blot, Vincent and Morzadec, Thomas and Lacombe, Louis and Brunel, Nicolas},
+ journal = {arXiv preprint arXiv:2207.12274},
+ year = {2022}
 }
 @article{uncertaintyToolbox,
- title={Uncertainty Toolbox: an Open-Source Library for Assessing, Visualizing, and Improving Uncertainty Quantification},
- author={Chung, Youngseog and Char, Ian and Guo, Han and Schneider, Jeff and Neiswanger, Willie},
- journal={arXiv preprint arXiv:2109.10254},
- year={2021}
+ title = {Uncertainty Toolbox: an Open-Source Library for Assessing, Visualizing, and Improving Uncertainty Quantification},
+ author = {Chung, Youngseog and Char, Ian and Guo, Han and Schneider, Jeff and Neiswanger, Willie},
+ journal = {arXiv preprint arXiv:2109.10254},
+ year = {2021}
 }
 @Manual{ResourceSelection,
 title = {ResourceSelection: Resource Selection (Probability) Functions for Use-Availability Data},
- author = {Subhash R. Lele and Jonah L. Keim and Peter Solymos},
+ author = {Lele, Subhash R. and Keim, Jonah L. and Solymos, Peter},
 year = {2024},
 note = {R package version 0.3-6},
- url = {https://github.com/psolymos/ResourceSelection},
+ url = {https://github.com/psolymos/ResourceSelection}
 }
+
 @article{scikit,
- title={Scikit-learn: Machine learning in Python},
- author={Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others},
- journal={Journal of machine learning research},
- volume={12},
- number={Oct},
- pages={2825--2830},
- year={2011}
+ title = {Scikit-learn: Machine learning in Python},
+ author = {Pedregosa, Fabian and Varoquaux, Gaël and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others},
+ journal = {Journal of Machine Learning Research},
+ volume = {12},
+ number = {Oct},
+ pages = {2825--2830},
+ year = {2011}
 }
+
 @InProceedings{netcal,
- author = {Küppers, Fabian and Kronenberger, Jan and Shantia, Amirhossein and Haselhoff, Anselm},
- title = {Multivariate Confidence Calibration for Object Detection},
- booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
- month = {June},
- year = {2020}
+ author = {Küppers, Fabian and Kronenberger, Jan and Shantia, Amirhossein and Haselhoff, Anselm},
+ title = {Multivariate Confidence Calibration for Object Detection},
+ booktitle = {The IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops},
+ month = {June},
+ year = {2020}
 }
+
 @article{Cox,
- author = {COX, D. R.},
- title = "{Two further applications of a model for binary regression}",
- journal = {Biometrika},
- volume = {45},
- number = {3-4},
- pages = {562-565},
- year = {1958},
- month = {12},
- issn = {0006-3444},
- doi = {10.1093/biomet/45.3-4.562},
- url = {https://doi.org/10.1093/biomet/45.3-4.562},
- eprint = {https://academic.oup.com/biomet/article-pdf/45/3-4/562/639297/45-3-4-562.pdf},
+ author = {Cox, D. R.},
+ title = {Two further applications of a model for binary regression},
+ journal = {Biometrika},
+ volume = {45},
+ number = {3-4},
+ pages = {562--565},
+ year = {1958},
+ month = {12},
+ issn = {0006-3444},
+ doi = {10.1093/biomet/45.3-4.562},
+ url = {https://doi.org/10.1093/biomet/45.3-4.562},
+ eprint = {https://academic.oup.com/biomet/article-pdf/45/3-4/562/639297/45-3-4-562.pdf}
 }
 @article{gu_likelihod_ratio,
- author = {Gu, Wen and Pepe, Margaret Sullivan},
- title = "{Estimating the diagnostic likelihood ratio of a continuous marker}",
- journal = {Biostatistics},
- volume = {12},
- number = {1},
- pages = {87-101},
- year = {2010},
- month = {07},
- abstract = "{The diagnostic likelihood ratio function, DLR, is a statistical measure used to evaluate risk prediction markers. The goal of this paper is to develop new methods to estimate the DLR function. Furthermore, we show how risk prediction markers can be compared using rank-invariant DLR functions. Various estimators are proposed that accommodate cohort or case–control study designs. Performances of the estimators are compared using simulation studies. The methods are illustrated by comparing a lung function measure and a nutritional status measure for predicting subsequent onset of major pulmonary infection in children suffering from cystic fibrosis. For continuous markers, the DLR function is mathematically related to the slope of the receiver operating characteristic (ROC) curve, an entity used to evaluate diagnostic markers. We show that our methodology can be used to estimate the slope of the ROC curve and illustrate use of the estimated ROC derivative in variance and sample size calculations for a diagnostic biomarker study.}",
- issn = {1465-4644},
- doi = {10.1093/biostatistics/kxq045},
- url = {https://doi.org/10.1093/biostatistics/kxq045},
- eprint = {https://academic.oup.com/biostatistics/article-pdf/12/1/87/17739283/kxq045.pdf},
-}
-
-
-
-
-@InProceedings{guo_calibration,
- title = {On Calibration of Modern Neural Networks},
- author = {Chuan Guo and Geoff Pleiss and Yu Sun and Kilian Q. Weinberger},
- booktitle = {Proceedings of the 34th International Conference on Machine Learning},
- pages = {1321--1330},
- year = {2017},
- editor = {Precup, Doina and Teh, Yee Whye},
- volume = {70},
- series = {Proceedings of Machine Learning Research},
- month = {06--11 Aug},
- publisher = {PMLR},
- pdf = {http://proceedings.mlr.press/v70/guo17a/guo17a.pdf},
- url = {https://proceedings.mlr.press/v70/guo17a.html},
- abstract = {Confidence calibration – the problem of predicting probability estimates representative of the true correctness likelihood – is important for classification models in many applications. We discover that modern neural networks, unlike those from a decade ago, are poorly calibrated. Through extensive experiments, we observe that depth, width, weight decay, and Batch Normalization are important factors influencing calibration. We evaluate the performance of various post-processing calibration methods on state-of-the-art architectures with image and document classification datasets. Our analysis and experiments not only offer insights into neural network learning, but also provide a simple and straightforward recipe for practical settings: on most datasets, temperature scaling – a single-parameter variant of Platt Scaling – is surprisingly effective at calibrating predictions.}
-}
-@article{hl_test,
-author = {David W. Hosmer and Stanley Lemesbow},
-title = {Goodness of fit tests for the multiple logistic regression model},
-journal = {Communications in Statistics - Theory and Methods},
-volume = {9},
-number = {10},
-pages = {1043--1069},
-year = {1980},
-publisher = {Taylor \& Francis},
-doi = {10.1080/03610928008827941},
-
-
-URL = {
-
-
- https://www.tandfonline.com/doi/abs/10.1080/03610928008827941
-
-
-},
-eprint = {
-
-
- https://www.tandfonline.com/doi/pdf/10.1080/03610928008827941
-
-
-}
-
-}
-
-
-@article{huang_tutorial,
- author = {Huang, Yingxiang and Li, Wentao and Macheret, Fima and Gabriel, Rodney A and Ohno-Machado, Lucila},
- title = "{A tutorial on calibration measurements and calibration models for clinical prediction models}",
- journal = {Journal of the American Medical Informatics Association},
- volume = {27},
- number = {4},
- pages = {621-633},
- year = {2020},
- month = {02},
- abstract = "{Our primary objective is to provide the clinical informatics community with an introductory tutorial on calibration measurements and calibration models for predictive models using existing R packages and custom implemented code in R on real and simulated data. Clinical predictive model performance is commonly published based on discrimination measures, but use of models for individualized predictions requires adequate model calibration. This tutorial is intended for clinical researchers who want to evaluate predictive models in terms of their applicability to a particular population. It is also for informaticians and for software engineers who want to understand the role that calibration plays in the evaluation of a clinical predictive model, and to provide them with a solid starting point to consider incorporating calibration evaluation and calibration models in their work.Covered topics include (1) an introduction to the importance of calibration in the clinical setting, (2) an illustration of the distinct roles that discrimination and calibration play in the assessment of clinical predictive models, (3) a tutorial and demonstration of selected calibration measurements, (4) a tutorial and demonstration of selected calibration models, and (5) a brief discussion of limitations of these methods and practical suggestions on how to use them in practice.}",
- issn = {1527-974X},
- doi = {10.1093/jamia/ocz228},
- url = {https://doi.org/10.1093/jamia/ocz228},
- eprint = {https://academic.oup.com/jamia/article-pdf/27/4/621/34153143/ocz228.pdf},
-}
-
-
-@inproceedings{nixon_ace,
- title={Measuring Calibration in Deep Learning.},
- author={Nixon, Jeremy and Dusenberry, Michael W and Zhang, Linchuan and Jerfel, Ghassen and Tran, Dustin},
- booktitle={CVPR workshops},
- volume={2},
- number={7},
- year={2019}
-}
-
-@article{spiegelhalter_z,
- title={Probabilistic prediction in patient management and clinical trials},
- author={Spiegelhalter, David J},
- journal={Statistics in medicine},
- volume={5},
- number={5},
- pages={421--433},
- year={1986},
- publisher={Wiley Online Library}
-}
-
-@inproceedings{prevalence_shift,
- author = {Tian, Junjiao and Liu, Yen-Cheng and Glaser, Nathaniel and Hsu, Yen-Chang and Kira, Zsolt},
- booktitle = {Advances in Neural Information Processing Systems},
- editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
- pages = {8101--8113},
- publisher = {Curran Associates, Inc.},
- title = {Posterior Re-calibration for Imbalanced Datasets},
- url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/5ca359ab1e9e3b9c478459944a2d9ca5-Paper.pdf},
- volume = {33},
- year = {2020}
-}
-
-
-@article{Walsh_overview,
- title={Beyond discrimination: a comparison of calibration methods and clinical usefulness of predictive models of readmission risk},
- author={Walsh, Colin G and Sharman, Kavya and Hripcsak, George},
- journal={Journal of biomedical informatics},
- volume={76},
- pages={9--18},
- year={2017},
- publisher={Elsevier}
-}
\ No newline at end of file
+ author = {Gu, Wen and Pepe, Margaret Sullivan},
+ title = {Estimating the diagnostic likelihood ratio of a continuous marker},
+ journal = {Biostatistics},
+ volume = {12},
+ number = {1},
+ pages = {87--101},
+ year = {2010},
+ month = {07},
+ issn = {1465-4644},
+ doi = {10.1093/biostatistics/kxq045},
+ url = {https://doi.org/10.1093/biostatistics/kxq045},
+ eprint = {https://academic.oup.com/biostatistics/article-pdf/12/1/87/17739283/kxq045.pdf}
+}
+
+@InProceedings{guo_calibration,
+ title = {On Calibration of Modern Neural Networks},
+ author = {Guo, Chuan and Pleiss, Geoff and Sun, Yu and Weinberger, Kilian Q.},
+ booktitle = {Proceedings of the 34th International Conference on Machine Learning},
+ pages = {1321--1330},
+ year = {2017},
+ editor = {Precup, Doina and Teh, Yee Whye},
+ volume = {70},
+ series = {Proceedings of Machine Learning Research},
+ month = {06--11 Aug},
+ publisher = {PMLR},
+ pdf = {http://proceedings.mlr.press/v70/guo17a/guo17a.pdf},
+ url = {https://proceedings.mlr.press/v70/guo17a.html}
+}
\ No newline at end of file