diff --git a/docs/about.rst b/docs/about.rst
index cbf8ad42..1f939129 100644
--- a/docs/about.rst
+++ b/docs/about.rst
@@ -61,7 +61,7 @@ Adam Goodge (PhD Researcher @ National University of Singapore):
- Joined in 2022 (implemented LUNAR)
- `LinkedIn (Adam Goodge) `_
-Daniel Kulik (Machine Learning Developer; MSc Student @ University of the Free State):
+Daniel Kulik (Machine Learning Developer; MSc Astrophysics @ University of the Free State):
- Joined 2022 (implemented integration with PyThresh and more)
- `LinkedIn (Daniel Kulik) `_
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 76dbe72f..3a135a98 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,13 +1,12 @@
combo
furo
-geomstats
joblib>=1.5
matplotlib
numpy>=1.19
numba>=0.51
pyclustering
pytest
-pythresh>=0.3.1
+pythresh>=1.0.0
ruptures
scipy>=1.5.1
scikit-learn>=0.22.0
diff --git a/pyod/models/base.py b/pyod/models/base.py
index 7519997d..d16a32f3 100644
--- a/pyod/models/base.py
+++ b/pyod/models/base.py
@@ -167,7 +167,7 @@ def predict(self, X, return_confidence=False):
# if this is a PyThresh object
else:
- prediction = self.contamination.eval(pred_score)
+ prediction = self.contamination.predict(pred_score)
if return_confidence:
confidence = self.predict_confidence(X)
@@ -291,7 +291,7 @@ def predict_confidence(self, X):
prediction = (test_scores > self.threshold_).astype('int').ravel()
# if this is a PyThresh object
else:
- prediction = self.contamination.eval(test_scores)
+ prediction = self.contamination.predict(test_scores)
np.place(confidence, prediction == 0, 1 - confidence[prediction == 0])
return confidence
@@ -575,7 +575,8 @@ def _process_decision_scores(self):
# if this is a PyThresh object
else:
- self.labels_ = self.contamination.eval(self.decision_scores_)
+ self.contamination.fit(self.decision_scores_)
+ self.labels_ = self.contamination.labels_
self.threshold_ = self.contamination.thresh_
if not self.threshold_:
self.threshold_ = np.sum(self.labels_) / len(self.labels_)
diff --git a/pyod/models/thresholds.py b/pyod/models/thresholds.py
index 73012009..61739ffe 100755
--- a/pyod/models/thresholds.py
+++ b/pyod/models/thresholds.py
@@ -5,6 +5,13 @@ def AUCP(**kwargs):
to threshold scores generated by the decision_scores where outliers
are set to any value beyond where the auc of the kde is less
than the (mean + abs(mean-median)) percent of the total kde auc.
+
+ Parameters
+ ----------
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.aucp import AUCP as AUCP_thres
@@ -47,6 +54,10 @@ def CHAU(**kwargs):
- 'mean': Construct a scaler with the mean of the scores
- 'median: Construct a scaler with the median of the scores
- 'gmean': Construct a scaler with the geometric mean of the scores
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.chau import CHAU as CHAU_thres
@@ -68,6 +79,10 @@ def CLF(**kwargs):
- 'simple': Uses only the scores
- 'complex': Uses the scores, log of the scores, and the scores' PDF
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.clf import CLF as CLF_thres
@@ -134,6 +149,10 @@ def CPD(**kwargs):
- 'cdf': Use the cumulative distribution function
- 'kde': Use the kernel density estimation
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
@@ -215,6 +234,13 @@ def EB(**kwargs):
to threshold scores generated by the decision_scores where outliers
are set to any value beyond a pseudo-random elliptical boundary set
between inliers and outliers.
+
+ Parameters
+ ----------
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.eb import EB as EB_thres
@@ -229,6 +255,13 @@ def FGD(**kwargs):
are set to any value beyond where the first derivative of the kde
with respect to the decision scores passes the mean of the first
and second inflection points.
+
+ Parameters
+ ----------
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.fgd import FGD as FGD_thres
@@ -269,6 +302,10 @@ def FILTER(**kwargs):
- 'decimate': downsampling factor
- 'detrend': number of break points
- 'resample': resampling window size
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.filter import FILTER as FILTER_thres
@@ -282,12 +319,71 @@ def FWFM(**kwargs):
a non-parametric means to threshold scores generated by the
decision_scores where outliers are set to any value beyond the base
width.
+
+ Parameters
+ ----------
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.fwfm import FWFM as FWFM_thres
return FWFM_thres(**kwargs)
+def GAMGMM(**kwargs):
+ """GAMGMM class for gammaGMM thresholder.
+
+ Use a Bayesian method for estimating the posterior distribution
+ of the contamination factor (i.e., the proportion of anomalies)
+ for a given unlabeled dataset. The threshold is set such
+ that the proportion of predicted anomalies equals the
+ contamination factor.
+
+ Parameters
+ ----------
+
+ n_contaminations : int, optional (default=1000)
+ number of samples to draw from the contamination posterior distribution
+
+ n_draws : int, optional (default=50)
+ number of samples simultaneously drawn from each DPGMM component
+
+ p0 : float, optional (default=0.01)
+ probability that no anomalies are in the data
+
+ phigh : float, optional (default=0.01)
+ probability that there are more than high_gamma anomalies
+
+ high_gamma : float, optional (default=0.15)
+ sensibly high number of anomalies that has low probability to occur
+
+ gamma_lim : float, optional (default=0.5)
+ Upper gamma/proportion of anomalies limit
+
+ K : int, optional (default=100)
+ number of components for DPGMM used to approximate the Dirichlet Process
+
+ skip : bool, optional (default=False)
+ skip optimal hyperparameter test (this may return a sub-optimal solution)
+
+ steps : int, optional (default=100)
+ number of iterations to test for optimal hyperparameters
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
+
+ verbose : bool, optional (default=False)
+ print the progress of the DPGMM process every 20 iterations
+
+ """
+
+ from pythresh.thresholds.gamgmm import GAMGMM as GAMGMM_thres
+ return GAMGMM_thres(**kwargs)
+
+
def GESD(**kwargs):
"""GESD class for Generalized Extreme Studentized Deviate thresholder.
@@ -299,11 +395,16 @@ def GESD(**kwargs):
----------
max_outliers : int, optional (default='auto')
- mamiximum number of outliers that the dataset may have. Default sets
+ maximum number of outliers that the dataset may have. Default sets
max_outliers to be half the size of the dataset
alpha : float, optional (default=0.05)
significance level
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
+
"""
from pythresh.thresholds.gesd import GESD as GESD_thres
@@ -322,8 +423,8 @@ def HIST(**kwargs):
----------
nbins : int, optional (default='auto')
- Number of bins to use in the hostogram, default set to int(len(scores)**0.7)
-
+ Number of bins to use in the histogram, default set to int(len(scores)**0.7)
+
method : {'otsu', 'yen', 'isodata', 'li', 'minimum', 'triangle'}, optional (default='triangle')
Histogram filtering based method
@@ -333,6 +434,10 @@ def HIST(**kwargs):
- 'li': Li's iterative Minimum Cross Entropy method for filtering
- 'minimum': Minimum between two maxima via smoothing method for filtering
- 'triangle': Triangle algorithm method for filtering
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.hist import HIST as HIST_thres
@@ -346,6 +451,13 @@ def IQR(**kwargs):
means to threshold scores generated by the decision_scores
where outliers are set to any value beyond the third quartile
plus 1.5 times the inter-quartile region.
+
+ Parameters
+ ----------
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.iqr import IQR as IQR_thres
@@ -371,6 +483,10 @@ def KARCH(**kwargs):
- 'simple': Compute the Karcher mean using the 1D array of scores
- 'complex': Compute the Karcher mean between a 2D array dot product of the scores and the sorted scores arrays
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.karch import KARCH as KARCH_thres
@@ -384,6 +500,17 @@ def MAD(**kwargs):
means to threshold scores generated by the decision_scores
where outliers are set to any value beyond the mean plus the
median absolute deviation over the standard deviation.
+
+ Parameters
+ ----------
+
+ factor : int, optional (default=1)
+ The factor to multiply the MAD by to set the threshold.
+ The default is 1.
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.mad import MAD as MAD_thres
@@ -430,11 +557,48 @@ def META(**kwargs):
- 'GNBC': Gaussian Naive Bayes trained classifier meta-model on best contamination
- 'GNBM': Gaussian Naive Bayes multivariate trained classifier meta-model
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
+
"""
from pythresh.thresholds.meta import META as META_thres
return META_thres(**kwargs)
+def MIXMOD(**kwargs):
+ """MIXMOD class for the Normal & Non-Normal Mixture Models thresholder.
+
+ Use normal & non-normal mixture models to find a non-parametric means
+ to threshold scores generated by the decision_scores, where outliers
+ are set to any value beyond the posterior probability threshold
+ for equal posteriors of a two-distribution mixture model.
+
+ Parameters
+ ----------
+
+ method : str, optional (default='mean')
+ Method used to select the best-fit mixture model. The default
+ 'mean' picks the mixture models closest to the mean of the posterior
+ probability threshold for equal posteriors of a two-distribution mixture model
+ over all fits. Setting 'ks' uses the two-sample Kolmogorov-Smirnov test for
+ goodness of fit.
+
+ tol : float, optional (default=1e-5)
+ Tolerance for convergence of the EM fit
+
+ max_iter : int, optional (default=250)
+ Max number of iterations to run EM during fit
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
+
+ """
+
+ from pythresh.thresholds.mixmod import MIXMOD as MIXMOD_thres
+ return MIXMOD_thres(**kwargs)
+
def MOLL(**kwargs):
"""MOLL class for Friedrichs' mollifier thresholder.
@@ -443,6 +607,12 @@ def MOLL(**kwargs):
to threshold scores generated by the decision_scores where outliers
are set to any value beyond one minus the maximum of the smoothed
dataset via convolution.
+
+ Parameters
+ ----------
+
+ random_state : int, optional (default=1234)
+ Random seed for the uniform distribution. Can also be set to None.
"""
from pythresh.thresholds.moll import MOLL as MOLL_thres
@@ -459,8 +629,12 @@ def MTT(**kwargs):
Parameters
----------
- strictness : [1,2,3,4,5], optional (default=4)
- Level of strictness corresponding to the t-Student distribution map to sample
+ alpha : float, optional (default=0.01)
+ Confidence level corresponding to the t-Student distribution map to sample
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.mtt import MTT as MTT_thres
return MTT_thres(**kwargs)
@@ -539,9 +713,14 @@ def QMCD(**kwargs):
lim : {'Q', 'P'}, optional (default='P')
Filtering method to threshold scores using 1 - discrepancy
-
- - 'Q': Use quntile limiting
+
+ - 'Q': Use quantile limiting
- 'P': Use percentile limiting
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
+
"""
from pythresh.thresholds.qmcd import QMCD as QMCD_thres
@@ -647,6 +826,13 @@ def YJ(**kwargs):
a non-parametric means to threshold scores generated by the
decision_scores where outliers are set to any value beyond the
max value in the YJ transformed data.
+
+ Parameters
+ ----------
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.yj import YJ as YJ_thres
@@ -659,7 +845,18 @@ def ZSCORE(**kwargs):
Use the zscore to evaluate a non-parametric means to threshold
scores generated by the decision_scores where outliers are set
to any value beyond a zscore of one.
+
+ Parameters
+ ----------
+
+ factor : int, optional (default=1)
+ The factor to multiply the zscore by to set the threshold.
+ The default is 1.
+
+ random_state : int, optional (default=1234)
+ Random seed for the random number generators of the thresholders. Can also
+ be set to None.
"""
from pythresh.thresholds.zscore import ZSCORE as ZSCORE_thres
- return ZSCORE_thres(**kwargs)
+ return ZSCORE_thres(**kwargs)
\ No newline at end of file
diff --git a/pyod/test/test_thresholds.py b/pyod/test/test_thresholds.py
index 35d638c9..162db952 100644
--- a/pyod/test/test_thresholds.py
+++ b/pyod/test/test_thresholds.py
@@ -88,9 +88,9 @@ def setUpClass(cls):
def setUp(self):
from pyod.models.thresholds import (AUCP, BOOT, CHAU, CLF, CLUST,
CPD, DECOMP, DSN, EB, FGD, FILTER,
- FWFM, GESD, HIST, IQR, KARCH, MAD,
- MCST, META, MOLL, MTT, OCSVM, QMCD,
- REGR, VAE, WIND, YJ, ZSCORE)
+ FWFM, GAMGMM, GESD, HIST, IQR, KARCH,
+ MAD, MCST, META, MIXMOD, MOLL, MTT,
+ OCSVM, QMCD, REGR, VAE, WIND, YJ, ZSCORE)
self.n_train = 200
self.n_test = 100
@@ -103,11 +103,11 @@ def setUp(self):
random_state=42,
)
- self.contam = [AUCP(), BOOT(), CHAU(), CLF(), CLUST(),
- CPD(), DECOMP(), DSN(), EB(), FGD(), FILTER(),
- FWFM(), GESD(), HIST(), IQR(), KARCH(), MAD(),
- MCST(), META(), MOLL(), MTT(), OCSVM(), QMCD(),
- REGR(), VAE(), WIND(), YJ(), ZSCORE()]
+ self.contam = [AUCP(), BOOT(), CHAU(), CLF(), CLUST(), CPD(),
+ DECOMP(), DSN(), EB(), FGD(), FILTER(), FWFM(),
+ GAMGMM(skip=True), GESD(), HIST(), IQR(), KARCH(),
+ MAD(), MCST(), META(), MIXMOD(), MOLL(), MTT(),
+ OCSVM(), QMCD(), REGR(), VAE(), WIND(), YJ(), ZSCORE()]
for contam in self.contam:
self.clf = KDE(contamination=contam)
@@ -233,4 +233,4 @@ def tearDown(self):
if __name__ == "__main__":
- unittest.main()
+ unittest.main()
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index d9cc7d5a..58cfdd48 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,7 +62,7 @@ all = [
"suod",
"xgboost",
"combo",
- "pythresh",
+ "pythresh>=1.0.0",
"sentence-transformers>=5.0.0",
"openai>=1.0",
"transformers>=4.25.1",