From 1ecda76fa61b389585ba045661b0a3477450d92d Mon Sep 17 00:00:00 2001
From: beckyperriment <93582518+beckyperriment@users.noreply.github.com>
Date: Thu, 7 Dec 2023 14:25:17 +0000
Subject: [PATCH 01/13] Update paper.bib

---
 joss/paper.bib | 47 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/joss/paper.bib b/joss/paper.bib
index 6eaa4b7..8e955e7 100644
--- a/joss/paper.bib
+++ b/joss/paper.bib
@@ -224,3 +224,50 @@ @misc{UCRArchive2018
 month = {October},
 note = {\url{https://www.cs.ucr.edu/~eamonn/time_series_data_2018/}}
 }
+
+@article{Sakoe1978,
+  abstract = {This paper reports on an optimum dynamic programming (DP) based time-normalization algorithm for spoken word recognition. First, a general principle of time-normalization is given using time-warping function. Then, two time-normalized distance definitions, called symmetric and asymmetric forms, are derived from the principle. These two forms are compared with each other through theoretical discussions and experimental studies. The symmetric form algorithm superiority is established. A new technique, called slope constraint, is successfully introduced, in which the warping function slope is restricted so as to improve discrimination between words in different categories. The effective slope constraint characteristic is qualitatively analyzed, and the optimum slope constraint condition is determined through experiments. The optimized algorithm is then extensively subjected to experimental comparison with various DP-algorithms, previously applied to spoken word recognition by different research groups. The experiment shows that the present algorithm gives no more than about two-thirds errors, even compared to the best conventional algorithm. © 1978 IEEE},
+  author = {Hiroaki Sakoe and Seibi Chiba},
+  doi = {10.1109/TASSP.1978.1163055},
+  issn = {00963518},
+  issue = {1},
+  journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
+  pages = {43-49},
+  title = {Dynamic Programming Algorithm Optimization for Spoken Word Recognition},
+  volume = {26},
+  year = {1978},
+}
+
+@article{Rajabi2020,
+  abstract = {Smart meters have been widely deployed in power networks since the last decade. This trend has resulted in an enormous volume of data being collected from the electricity customers. To gain benefits for various stakeholders in power systems, proper data mining techniques, such as clustering, need to be employed to extract the underlying patterns from energy consumptions. In this paper, a comparative study of different techniques for load pattern clustering is carried out. Different parameters of the methods that affect the clustering results are evaluated and the clustering algorithms are compared for two data sets. In addition, the two suitable and commonly used data size reduction techniques and feature definition/extraction methods for load pattern clustering are analysed. Furthermore, the existing studies on clustering of electricity customers are reviewed and the main results are highlighted. Finally, the future trends and major applications of clustering consumption patterns are outlined to inform industry practitioners and academic researchers to optimize smart meter operational use and effectiveness.},
+  author = {Amin Rajabi and Mohsen Eskandari and Mojtaba Jabbari Ghadi and Li Li and Jiangfeng Zhang and Pierluigi Siano},
+  doi = {10.1016/j.rser.2019.109628},
+  issn = {18790690},
+  journal = {Renewable and Sustainable Energy Reviews},
+  keywords = {Clustering algorithms,Comparative study,Data mining,Load pattern,Smart grids,Smart meters},
+  month = {3},
+  publisher = {Elsevier Ltd},
+  title = {A comparative study of clustering techniques for electrical load pattern segmentation},
+  volume = {120},
+  year = {2020},
+}
+
+@article{Tavenard2020,
+  abstract = {tslearn is a general-purpose Python machine learning library for time series that offers tools for pre-processing and feature extraction as well as dedicated models for clustering, classification and regression. It follows scikit-learn's Application Programming Interface for transformers and estimators, allowing the use of standard pipelines and model selection tools on top of tslearn objects. It is distributed under the BSD-2-Clause license, and its source code is available at https://github.com/tslearn-team/tslearn.},
+  author = {Romain Tavenard and Johann Faouzi and Gilles Vandewiele and Felix Divo and Guillaume Androz and Chester Holtz and Marie Payne and Roman Yurchak and Marc Rußwurm},
+  journal = {Journal of Machine Learning Research},
+  keywords = {classification,clustering,data mining,pre-processing,time series},
+  pages = {1-6},
+  title = {Tslearn, A Machine Learning Toolkit for Time Series Data},
+  volume = {21},
+  url = {https://github.com/tslearn-team/tslearn},
+  year = {2020},
+}
+
+@misc{Dau2018,
+  author = {Hoang Anh Dau and Eamonn Keogh and Kaveh Kamgar and Chin-Chia Michael Yeh and Yan Zhu and Shaghayegh Gharghabi and Chotirat Ann Ratanamahatana and Yanping and Bing Hu and Nurjahan Begum and Anthony Bagnall and Abdullah Mueen and Gustavo Batista},
+  month = {10},
+  title = {The UCR Time Series Classification Archive},
+  url = {https://www.cs.ucr.edu/~eamonn/time_series_data_2018/},
+  year = {2018},
+}

From 2b9d6c9762e01a0afefd234736365172cf429dbc Mon Sep 17 00:00:00 2001
From: beckyperriment <93582518+beckyperriment@users.noreply.github.com>
Date: Thu, 7 Dec 2023 14:27:35 +0000
Subject: [PATCH 02/13] Update paper.md

---
 joss/paper.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/joss/paper.md b/joss/paper.md
index f459e88..8acf602 100644
--- a/joss/paper.md
+++ b/joss/paper.md
@@ -34,7 +34,7 @@ We present an approach for computationally efficient dynamic time warping (DTW)
 
 Clustering time series is becoming increasingly popular as data availability increases; however, as data availability increases, so does the complexity of the clustering problem. Most current time series clustering approaches depend on dimension reduction or feature extraction, which can introduce bias into the clustering [@Aghabozorgi2015]. Dynamic time warping [@Sakoe1978DynamicRecognition] is a well-known technique for manipulating time series to enable comparisons between datasets, using local warping (stretching or compressing along the time axis) of the elements within each time series to find an optimal alignment between series. This emphasises the similarity of the shapes of the respective time series, rather than the exact alignment of specific features. Unfortunately, DTW does not scale well in computational speed as the length and number of time series to be compared increases---the computational complexity grows quadratically with the total number of data points. This complexity is a barrier to DTW being widely implemented in time series clustering [@Rajabi2020ASegmentation]. ``DTW-C++`` is written to handle large time series datasets, working on the raw data rather than reduced-dimension data or selected features from the time series, across the various applications.
 
-While there are other packages available for time series clustering using DTW, namely [@Petitjean2011] and [@meert2020wannesm], ``DTW-C++`` offers signficant imporvements in both speed and memory use, allowing larger datasets to be clustered. This is done by task level parallelisation, allowing multiple pairwise comparsions between time series to be evaluated simulataneously, as well as more efficient memory management by solving the DTW distance using only the preceding vector rather than storing the entire warping matrix. This means that the warping path between each time series is not stored, but this is not required for the clustering process - only the final cost is needed. In addition, MIP is preferable to other DTW clustering packages which use k-based methods for clustering, as k-based methods are suseptible to sticking in local optima. MIP finds the global optimum in most cases, and in the rare event that the global optimum is not found, the gap between the best solution found and the global optimum is given.
+While there are other packages available for time series clustering using DTW, namely \texttt{DTAIDistance} [@meert2020wannesm] and \texttt{TSlearn} [@Tavenard2020TslearnData], ``DTW-C++`` offers significant improvements in both speed and memory use, allowing larger datasets to be clustered. This is achieved by task-level parallelisation, which allows multiple pairwise comparisons between time series to be evaluated simultaneously, and by more efficient memory management, computing each DTW distance using only the preceding vector rather than storing the entire warping matrix. This means the warping path between each pair of time series is not stored, but this is not required for clustering, where only the final cost is needed. In addition, MIP is preferable to the k-based clustering methods used by other DTW clustering packages, because k-based methods are susceptible to becoming stuck in local optima. MIP finds the global optimum in most cases, and in the rare event that it is not found, the gap between the best solution found and the global optimum is reported.
 
 Time series clustering applications range from finding consumption patterns in energy data, to detecting brain activity in medical applications, to discovering patterns in stock price trends in the finance industry. The target audience for this software therefore spans multiple disciplines; it is intended for any user with a requirement for time-series clustering.
 
@@ -124,7 +124,7 @@ Finding global optimality can increase the computation time, depending on the nu
 
 # Comparison
 
-We compared our approach with two other DTW clustering packages, \texttt{DTAIDistance} [@Meert2020Dtaidistance] and \texttt{TSlearn} [@Tavenard2020TslearnData]. The datasets used for the comparison are from the UCR Time Series Classification Archive [@Dau2018TheArchive], and consist of 128 time series datasets with up to 16,800 data series of lengths up to 2,844. The full results can be found in the Appendix. Benchmarking against \texttt{TSlearn} was stopped after the first 22 datasets because the results were consistently over 20 times slower than \texttt{DTW-C++}. \autoref{tab} shows the results for datasets downselected to have a number of time series ($N$) greater than 100 and a length of each time series greater than 500 points. This is because \texttt{DTW-C++} is aimed at larger datasets where the speed improvements are more relevant.
+We compared our approach with two other DTW clustering packages, \texttt{DTAIDistance} [@meert2020wannesm] and \texttt{TSlearn} [@Tavenard2020TslearnData]. The datasets used for the comparison are from the UCR Time Series Classification Archive [@Dau2018TheArchive], and consist of 128 time series datasets with up to 16,800 data series of lengths up to 2,844. The full results can be found in the Appendix. Benchmarking against \texttt{TSlearn} was stopped after the first 22 datasets because the results were consistently over 20 times slower than \texttt{DTW-C++}. \autoref{tab} shows the results for datasets downselected to those with more than 100 time series ($N > 100$) and individual series longer than 500 points, because \texttt{DTW-C++} is aimed at larger datasets, where the speed improvements are most relevant.
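
To make the memory-management argument in the first paper.md hunk concrete, the following is a minimal sketch of the two-vector DTW recursion together with a parallel pairwise driver. It is illustrative code only: the function names (`dtw_cost`, `pairwise_costs`) and the use of OpenMP are assumptions made for the sketch, not the actual \texttt{DTW-C++} implementation.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <utility>
#include <vector>

// DTW cost between two series, keeping only the preceding row ("vector")
// of the cost matrix: O(m) memory instead of O(n*m). The warping path is
// never stored -- only the final cost is returned, which is all the
// clustering step needs.
double dtw_cost(const std::vector<double>& x, const std::vector<double>& y) {
    const std::size_t n = x.size(), m = y.size();
    const double inf = std::numeric_limits<double>::infinity();
    std::vector<double> prev(m + 1, inf), curr(m + 1, inf);
    prev[0] = 0.0;  // aligning two empty prefixes costs nothing
    for (std::size_t i = 1; i <= n; ++i) {
        curr[0] = inf;
        for (std::size_t j = 1; j <= m; ++j) {
            const double d = std::abs(x[i - 1] - y[j - 1]);  // local distance
            // Standard DTW recursion: best of match, insertion, deletion.
            curr[j] = d + std::min({prev[j - 1], prev[j], curr[j - 1]});
        }
        std::swap(prev, curr);  // current row becomes the preceding vector
    }
    return prev[m];
}

// The N*(N-1)/2 pairwise costs are independent of one another, so they can
// be evaluated simultaneously; an OpenMP loop is one possible realisation
// of this task-level parallelism (compile with -fopenmp).
std::vector<std::vector<double>> pairwise_costs(
        const std::vector<std::vector<double>>& series) {
    const std::ptrdiff_t n = static_cast<std::ptrdiff_t>(series.size());
    std::vector<std::vector<double>> d(n, std::vector<double>(n, 0.0));
    #pragma omp parallel for schedule(dynamic)
    for (std::ptrdiff_t i = 0; i < n; ++i) {
        for (std::ptrdiff_t j = i + 1; j < n; ++j) {
            d[i][j] = d[j][i] = dtw_cost(series[i], series[j]);
        }
    }
    return d;
}
```

Because each call returns only the final cost, memory per comparison is $O(m)$ rather than $O(nm)$, and since every pairwise comparison is independent, the full cost matrix can be filled in parallel, as described in the patched paragraph.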
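Similarly, the MIP clustering step referred to in both hunks can be posed as a medoid-selection problem over the precomputed pairwise DTW costs $d_{ij}$. The following facility-location-style formulation is a sketch consistent with the description above, not necessarily the exact model solved by \texttt{DTW-C++}:

```latex
% Sketch: assign each series i to exactly one medoid j (binary x_ij),
% selecting exactly k series as medoids (binary y_j).
\begin{align*}
\min_{x,\,y} \quad & \sum_{i=1}^{N} \sum_{j=1}^{N} d_{ij}\, x_{ij} \\
\text{s.t.} \quad & \sum_{j=1}^{N} x_{ij} = 1, \quad i = 1,\dots,N, \\
& x_{ij} \le y_j, \quad i, j = 1,\dots,N, \\
& \sum_{j=1}^{N} y_j = k, \\
& x_{ij},\, y_j \in \{0, 1\}.
\end{align*}
```

Solved by branch and bound, a model of this form either certifies the global optimum or, if stopped early, reports the gap between the best solution found and the best bound, which is the gap referred to in the text.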