forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[MRG+2] LOF algorithm (Anomaly Detection) (scikit-learn#5279)
* LOF algorithm add tests and example fix DepreciationWarning by reshape(1,-1) one-sample data LOF with inheritance lof and lof2 return same score fix bugs fix bugs optimized and cosmit rm lof2 cosmit rm MixinLOF + fit_predict fix travis - optimize pairwise_distance like in KNeighborsMixin.kneighbors add comparison example + doc LOF -> LocalOutlierFactor cosmit change LOF API: -fit(X).predict() and fit(X).decision_function() do prediction on X without considering samples as their own neighbors (ie without considering X as a new dataset as does fit(X).predict(X)) -rm fit_predict() method -add a contamination parameter st predict returns a binary value like other anomaly detection algos cosmit doc + debug example correction doc pass on doc + examples pep8 + fix warnings first attempt at fixing API issues minor changes takes into account tguillemot advice -remove pairwise_distance calculation as to heavy in memory -add benchmarks cosmit minor changes + deals with duplicates fix depreciation warnings * factorize the two for loops * take into account @albertthomas88 review and cosmit * fix doc * alex review + rebase * make predict private add outlier_factor_ attribute and update tests * make fit_predict take y argument * fix benchmarks file * update examples * make decision_function public (rm X=None default) * fix travis * take into account tguillemot review + remove useless k_distance function * fix broken links :meth:`kneighbors` * cosmit * whatsnew * amueller review + remove _local_outlier_factor method * add n_neighbors_ parameter the effective nb neighbors we use * make decision_function private and negative_outlier_factor attribute
- Loading branch information
Showing
12 changed files
with
710 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
""" | ||
============================ | ||
LocalOutlierFactor benchmark | ||
============================ | ||
A test of LocalOutlierFactor on classical anomaly detection datasets. | ||
""" | ||
|
||
from time import time | ||
import numpy as np | ||
import matplotlib.pyplot as plt | ||
from sklearn.neighbors import LocalOutlierFactor | ||
from sklearn.metrics import roc_curve, auc | ||
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata | ||
from sklearn.preprocessing import LabelBinarizer | ||
from sklearn.utils import shuffle as sh | ||
|
||
print(__doc__)

# Seed NumPy's global random generator so that repeated runs of the
# benchmark draw the same random numbers.
_SEED = 2
np.random.seed(_SEED)

# Names of the datasets to benchmark.  Supported values are:
# 'http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'.
datasets = ['shuttle']

# True: the model is fitted on normal samples only.
# False: the training set is left polluted by outliers.
novelty_detection = True
|
||
# Label carried by the non-attack records of the KDD Cup 99 subsets.  It is
# written as a bytes literal because the fetched labels are byte strings:
# on Python 3 a comparison against the text string 'normal.' never matches,
# which would flag every sample as an outlier.  On Python 2 both spellings
# are the same object type, so behaviour there is unchanged.
kdd_normal_label = b'normal.'

# For every selected dataset: load it, turn the target into a binary
# outlier indicator (0 = normal, 1 = outlier), encode categorical columns,
# fit LocalOutlierFactor on the first half and draw the ROC curve obtained
# on the second half.
for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ('http', 'smtp', 'SA', 'SF'):
        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
                                 percent10=False)
        X = dataset.data
        y = dataset.target

    elif dataset_name == 'shuttle':
        # NOTE(review): fetch_mldata may be unavailable in recent
        # scikit-learn releases -- confirm against the targeted version.
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        keep = (y != 4)
        X = X[keep, :]
        y = y[keep]
        y = (y != 1).astype(int)

    elif dataset_name == 'forestcover':
        dataset = fetch_covtype(shuffle=True)
        X = dataset.data
        y = dataset.target
        # normal data are those with label 2
        # abnormal those with label 4; every other label is dropped
        keep = (y == 2) | (y == 4)
        X = X[keep, :]
        y = y[keep]
        y = (y != 2).astype(int)

    else:
        # Fail loudly instead of hitting a NameError below or silently
        # reusing the X / y left over from the previous iteration.
        raise ValueError('unknown dataset name: %r' % (dataset_name,))

    print('vectorizing data')

    if dataset_name == 'SF':
        # Column 1 is passed through LabelBinarizer and the resulting
        # indicator columns replace it in place.
        x1 = LabelBinarizer().fit_transform(X[:, 1])
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != kdd_normal_label).astype(int)

    elif dataset_name == 'SA':
        # Columns 1, 2 and 3 are each binarized independently.
        x1 = LabelBinarizer().fit_transform(X[:, 1])
        x2 = LabelBinarizer().fit_transform(X[:, 2])
        x3 = LabelBinarizer().fit_transform(X[:, 3])
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != kdd_normal_label).astype(int)

    elif dataset_name in ('http', 'smtp'):
        y = (y != kdd_normal_label).astype(int)

    # First half of the samples is used for fitting, the rest for scoring.
    n_samples = np.shape(X)[0]
    n_samples_train = n_samples // 2

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    if novelty_detection:
        # Keep only the normal samples in the training set.
        normal = (y_train == 0)
        X_train = X_train[normal]
        y_train = y_train[normal]

    print('LocalOutlierFactor processing...')
    model = LocalOutlierFactor(n_neighbors=20)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    # NOTE(review): confirm that LocalOutlierFactor exposes a public
    # `decision_function` in the targeted scikit-learn version.
    scoring = -model.decision_function(X_test)  # the lower, the more normal
    predict_time = time() - tstart
    fpr, tpr, thresholds = roc_curve(y_test, scoring)
    AUC = auc(fpr, tpr)
    # The two literals are concatenated, so the first one must end with a
    # space to keep "train-time" and "test-time" separated in the legend.
    plt.plot(fpr, tpr, lw=1,
             label=('ROC for %s (area = %0.3f, train-time: %0.2fs, '
                    'test-time: %0.2fs)' % (dataset_name, AUC, fit_time,
                                            predict_time)))
|
||
# Finalise the ROC figure: both axes share the same range, padded slightly
# beyond [0, 1], then the figure is labelled, given a legend and displayed.
axis_limits = [-0.05, 1.05]
plt.xlim(axis_limits)
plt.ylim(axis_limits)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')

plt.legend(loc="lower right")
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.