diff --git a/docs/about.rst b/docs/about.rst index cbf8ad42..1f939129 100644 --- a/docs/about.rst +++ b/docs/about.rst @@ -61,7 +61,7 @@ Adam Goodge (PhD Researcher @ National University of Singapore): - Joined in 2022 (implemented LUNAR) - `LinkedIn (Adam Goodge) `_ -Daniel Kulik (Machine Learning Developer; MSc Student @ University of the Free State): +Daniel Kulik (Machine Learning Developer; MSc Astrophysics @ University of the Free State): - Joined 2022 (implemented integration with PyThresh and more) - `LinkedIn (Daniel Kulik) `_ diff --git a/docs/requirements.txt b/docs/requirements.txt index 76dbe72f..3a135a98 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,13 +1,12 @@ combo furo -geomstats joblib>=1.5 matplotlib numpy>=1.19 numba>=0.51 pyclustering pytest -pythresh>=0.3.1 +pythresh>=1.0.0 ruptures scipy>=1.5.1 scikit-learn>=0.22.0 diff --git a/pyod/models/base.py b/pyod/models/base.py index 7519997d..d16a32f3 100644 --- a/pyod/models/base.py +++ b/pyod/models/base.py @@ -167,7 +167,7 @@ def predict(self, X, return_confidence=False): # if this is a PyThresh object else: - prediction = self.contamination.eval(pred_score) + prediction = self.contamination.predict(pred_score) if return_confidence: confidence = self.predict_confidence(X) @@ -291,7 +291,7 @@ def predict_confidence(self, X): prediction = (test_scores > self.threshold_).astype('int').ravel() # if this is a PyThresh object else: - prediction = self.contamination.eval(test_scores) + prediction = self.contamination.predict(test_scores) np.place(confidence, prediction == 0, 1 - confidence[prediction == 0]) return confidence @@ -575,7 +575,8 @@ def _process_decision_scores(self): # if this is a PyThresh object else: - self.labels_ = self.contamination.eval(self.decision_scores_) + self.contamination.fit(self.decision_scores_) + self.labels_ = self.contamination.labels_ self.threshold_ = self.contamination.thresh_ if not self.threshold_: self.threshold_ = np.sum(self.labels_) / 
len(self.labels_) diff --git a/pyod/models/thresholds.py b/pyod/models/thresholds.py index 73012009..61739ffe 100755 --- a/pyod/models/thresholds.py +++ b/pyod/models/thresholds.py @@ -5,6 +5,13 @@ def AUCP(**kwargs): to threshold scores generated by the decision_scores where outliers are set to any value beyond where the auc of the kde is less than the (mean + abs(mean-median)) percent of the total kde auc. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.aucp import AUCP as AUCP_thres @@ -47,6 +54,10 @@ def CHAU(**kwargs): - 'mean': Construct a scaler with the mean of the scores - 'median: Construct a scaler with the median of the scores - 'gmean': Construct a scaler with the geometric mean of the scores + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.chau import CHAU as CHAU_thres @@ -68,6 +79,10 @@ def CLF(**kwargs): - 'simple': Uses only the scores - 'complex': Uses the scores, log of the scores, and the scores' PDF + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.clf import CLF as CLF_thres @@ -134,6 +149,10 @@ def CPD(**kwargs): - 'cdf': Use the cumulative distribution function - 'kde': Use the kernel density estimation + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ @@ -215,6 +234,13 @@ def EB(**kwargs): to threshold scores generated by the decision_scores where outliers are set to any value beyond a pseudo-random elliptical boundary set between inliers and outliers. 
+ + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.eb import EB as EB_thres @@ -229,6 +255,13 @@ def FGD(**kwargs): are set to any value beyond where the first derivative of the kde with respect to the decision scores passes the mean of the first and second inflection points. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.fgd import FGD as FGD_thres @@ -269,6 +302,10 @@ def FILTER(**kwargs): - 'decimate': downsampling factor - 'detrend': number of break points - 'resample': resampling window size + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.filter import FILTER as FILTER_thres @@ -282,12 +319,71 @@ def FWFM(**kwargs): a non-parametric means to threshold scores generated by the decision_scores where outliers are set to any value beyond the base width. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.fwfm import FWFM as FWFM_thres return FWFM_thres(**kwargs) +def GAMGMM(**kwargs): + """GAMGMM class for gammaGMM thresholder. + + Use a Bayesian method for estimating the posterior distribution + of the contamination factor (i.e., the proportion of anomalies) + for a given unlabeled dataset. The threshold is set such + that the proportion of predicted anomalies equals the + contamination factor. 
+ + Parameters + ---------- + + n_contaminations : int, optional (default=1000) + number of samples to draw from the contamination posterior distribution + + n_draws : int, optional (default=50) + number of samples simultaneously drawn from each DPGMM component + + p0 : float, optional (default=0.01) + probability that no anomalies are in the data + + phigh : float, optional (default=0.01) + probability that there are more than high_gamma anomalies + + high_gamma : float, optional (default=0.15) + sensibly high number of anomalies that has a low probability of occurring + + gamma_lim : float, optional (default=0.5) + Upper gamma/proportion of anomalies limit + + K : int, optional (default=100) + number of components for DPGMM used to approximate the Dirichlet Process + + skip : bool, optional (default=False) + skip optimal hyperparameter test (this may return a sub-optimal solution) + + steps : int, optional (default=100) + number of iterations to test for optimal hyperparameters + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. + + verbose : bool, optional (default=False) + print the progress of the DPGMM process every 20 iterations + + """ + + from pythresh.thresholds.gamgmm import GAMGMM as GAMGMM_thres + return GAMGMM_thres(**kwargs) + + def GESD(**kwargs): """GESD class for Generalized Extreme Studentized Deviate thresholder. @@ -299,11 +395,16 @@ def GESD(**kwargs): ---------- max_outliers : int, optional (default='auto') - mamiximum number of outliers that the dataset may have. Default sets + maximum number of outliers that the dataset may have. Default sets max_outliers to be half the size of the dataset alpha : float, optional (default=0.05) significance level + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. 
+ """ from pythresh.thresholds.gesd import GESD as GESD_thres @@ -322,8 +423,8 @@ def HIST(**kwargs): ---------- nbins : int, optional (default='auto') - Number of bins to use in the hostogram, default set to int(len(scores)**0.7) - + Number of bins to use in the histogram, default set to int(len(scores)**0.7) + method : {'otsu', 'yen', 'isodata', 'li', 'minimum', 'triangle'}, optional (default='triangle') Histogram filtering based method @@ -333,6 +434,10 @@ def HIST(**kwargs): - 'li': Li's iterative Minimum Cross Entropy method for filtering - 'minimum': Minimum between two maxima via smoothing method for filtering - 'triangle': Triangle algorithm method for filtering + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.hist import HIST as HIST_thres @@ -346,6 +451,13 @@ def IQR(**kwargs): means to threshold scores generated by the decision_scores where outliers are set to any value beyond the third quartile plus 1.5 times the inter-quartile region. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.iqr import IQR as IQR_thres @@ -371,6 +483,10 @@ def KARCH(**kwargs): - 'simple': Compute the Karcher mean using the 1D array of scores - 'complex': Compute the Karcher mean between a 2D array dot product of the scores and the sorted scores arrays + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.karch import KARCH as KARCH_thres @@ -384,6 +500,17 @@ def MAD(**kwargs): means to threshold scores generated by the decision_scores where outliers are set to any value beyond the mean plus the median absolute deviation over the standard deviation. 
+ + Parameters + ---------- + + factor : int, optional (default=1) + The factor to multiply the MAD by to set the threshold. + The default is 1. + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.mad import MAD as MAD_thres @@ -430,11 +557,48 @@ def META(**kwargs): - 'GNBC': Gaussian Naive Bayes trained classifier meta-model on best contamination - 'GNBM': Gaussian Naive Bayes multivariate trained classifier meta-model + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. + """ from pythresh.thresholds.meta import META as META_thres return META_thres(**kwargs) +def MIXMOD(**kwargs): + """MIXMOD class for the Normal & Non-Normal Mixture Models thresholder. + + Use normal & non-normal mixture models to find a non-parametric means + to threshold scores generated by the decision_scores, where outliers + are set to any value beyond the posterior probability threshold + for equal posteriors of a two distribution mixture model. + + Parameters + ---------- + + method : str, optional (default='mean') + Method to evaluate selecting the best fit mixture model. Default + 'mean' sets this as the closest mixture models to the mean of the posterior + probability threshold for equal posteriors of a two distribution mixture model + for all fits. Setting 'ks' uses the two-sample Kolmogorov-Smirnov test for + goodness of fit. + + tol : float, optional (default=1e-5) + Tolerance for convergence of the EM fit + + max_iter : int, optional (default=250) + Max number of iterations to run EM during fit + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. 
+ + """ + + from pythresh.thresholds.mixmod import MIXMOD as MIXMOD_thres + return MIXMOD_thres(**kwargs) + def MOLL(**kwargs): """MOLL class for Friedrichs' mollifier thresholder. @@ -443,6 +607,12 @@ def MOLL(**kwargs): to threshold scores generated by the decision_scores where outliers are set to any value beyond one minus the maximum of the smoothed dataset via convolution. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the uniform distribution. Can also be set to None. """ from pythresh.thresholds.moll import MOLL as MOLL_thres @@ -459,8 +629,12 @@ def MTT(**kwargs): Parameters ---------- - strictness : [1,2,3,4,5], optional (default=4) - Level of strictness corresponding to the t-Student distribution map to sample + alpha : float, optional (default=0.01) + Confidence level corresponding to the t-Student distribution map to sample + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.mtt import MTT as MTT_thres return MTT_thres(**kwargs) @@ -539,9 +713,14 @@ def QMCD(**kwargs): lim : {'Q', 'P'}, optional (default='P') Filtering method to threshold scores using 1 - discrepancy - - - 'Q': Use quntile limiting + + - 'Q': Use quantile limiting - 'P': Use percentile limiting + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. + """ from pythresh.thresholds.qmcd import QMCD as QMCD_thres @@ -647,6 +826,13 @@ def YJ(**kwargs): a non-parametric means to threshold scores generated by the decision_scores where outliers are set to any value beyond the max value in the YJ transformed data. + + Parameters + ---------- + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. 
""" from pythresh.thresholds.yj import YJ as YJ_thres @@ -659,7 +845,18 @@ def ZSCORE(**kwargs): Use the zscore to evaluate a non-parametric means to threshold scores generated by the decision_scores where outliers are set to any value beyond a zscore of one. + + Parameters + ---------- + + factor : int, optional (default=1) + The factor to multiply the zscore by to set the threshold. + The default is 1. + + random_state : int, optional (default=1234) + Random seed for the random number generators of the thresholders. Can also + be set to None. """ from pythresh.thresholds.zscore import ZSCORE as ZSCORE_thres - return ZSCORE_thres(**kwargs) + return ZSCORE_thres(**kwargs) \ No newline at end of file diff --git a/pyod/test/test_thresholds.py b/pyod/test/test_thresholds.py index 35d638c9..162db952 100644 --- a/pyod/test/test_thresholds.py +++ b/pyod/test/test_thresholds.py @@ -88,9 +88,9 @@ def setUpClass(cls): def setUp(self): from pyod.models.thresholds import (AUCP, BOOT, CHAU, CLF, CLUST, CPD, DECOMP, DSN, EB, FGD, FILTER, - FWFM, GESD, HIST, IQR, KARCH, MAD, - MCST, META, MOLL, MTT, OCSVM, QMCD, - REGR, VAE, WIND, YJ, ZSCORE) + FWFM, GAMGMM, GESD, HIST, IQR, KARCH, + MAD, MCST, META, MIXMOD, MOLL, MTT, + OCSVM, QMCD, REGR, VAE, WIND, YJ, ZSCORE) self.n_train = 200 self.n_test = 100 @@ -103,11 +103,11 @@ def setUp(self): random_state=42, ) - self.contam = [AUCP(), BOOT(), CHAU(), CLF(), CLUST(), - CPD(), DECOMP(), DSN(), EB(), FGD(), FILTER(), - FWFM(), GESD(), HIST(), IQR(), KARCH(), MAD(), - MCST(), META(), MOLL(), MTT(), OCSVM(), QMCD(), - REGR(), VAE(), WIND(), YJ(), ZSCORE()] + self.contam = [AUCP(), BOOT(), CHAU(), CLF(), CLUST(), CPD(), + DECOMP(), DSN(), EB(), FGD(), FILTER(), FWFM(), + GAMGMM(skip=True), GESD(), HIST(), IQR(), KARCH(), + MAD(), MCST(), META(), MIXMOD(), MOLL(), MTT(), + OCSVM(), QMCD(), REGR(), VAE(), WIND(), YJ(), ZSCORE()] for contam in self.contam: self.clf = KDE(contamination=contam) @@ -233,4 +233,4 @@ def tearDown(self): if 
__name__ == "__main__": - unittest.main() + unittest.main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d9cc7d5a..58cfdd48 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ all = [ "suod", "xgboost", "combo", - "pythresh", + "pythresh>=1.0.0", "sentence-transformers>=5.0.0", "openai>=1.0", "transformers>=4.25.1",