From 3d772e9034b27f0339d9a3cb77e20de1610260cf Mon Sep 17 00:00:00 2001 From: Mathieu Blondel Date: Fri, 30 Aug 2013 22:31:09 +0900 Subject: [PATCH] Add pairwise_distances_argmin. --- examples/cluster/plot_color_quantization.py | 8 +- examples/cluster/plot_mini_batch_kmeans.py | 6 +- sklearn/cluster/affinity_propagation_.py | 4 +- sklearn/cluster/mean_shift_.py | 4 +- sklearn/metrics/__init__.py | 1 + sklearn/metrics/pairwise.py | 83 +++++++++++++++++++-- 6 files changed, 89 insertions(+), 17 deletions(-) diff --git a/examples/cluster/plot_color_quantization.py b/examples/cluster/plot_color_quantization.py index 9d1ada71b9d8d..935083289b9e5 100644 --- a/examples/cluster/plot_color_quantization.py +++ b/examples/cluster/plot_color_quantization.py @@ -28,7 +28,7 @@ import numpy as np import pylab as pl from sklearn.cluster import KMeans -from sklearn.metrics import pairwise_distances_argmin_min +from sklearn.metrics import pairwise_distances_argmin from sklearn.datasets import load_sample_image from sklearn.utils import shuffle from time import time @@ -64,9 +64,9 @@ codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1] print("Predicting color indices on the full image (random)") t0 = time() -labels_random = pairwise_distances_argmin_min(codebook_random, - image_array, - axis=0)[0] +labels_random = pairwise_distances_argmin(codebook_random, + image_array, + axis=0) print("done in %0.3fs." 
% (time() - t0)) diff --git a/examples/cluster/plot_mini_batch_kmeans.py b/examples/cluster/plot_mini_batch_kmeans.py index e90c6ebcb72cf..10ab07918a659 100644 --- a/examples/cluster/plot_mini_batch_kmeans.py +++ b/examples/cluster/plot_mini_batch_kmeans.py @@ -20,7 +20,7 @@ import pylab as pl from sklearn.cluster import MiniBatchKMeans, KMeans -from sklearn.metrics.pairwise import pairwise_distances_argmin_min +from sklearn.metrics.pairwise import pairwise_distances_argmin from sklearn.datasets.samples_generator import make_blobs ############################################################################## @@ -66,8 +66,8 @@ # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per # closest one. -order = pairwise_distances_argmin_min(k_means_cluster_centers, - mbk_means_cluster_centers)[0] +order = pairwise_distances_argmin(k_means_cluster_centers, + mbk_means_cluster_centers) # KMeans ax = fig.add_subplot(1, 3, 1) diff --git a/sklearn/cluster/affinity_propagation_.py b/sklearn/cluster/affinity_propagation_.py index 2c63daaded291..90c0ad8f54f1a 100644 --- a/sklearn/cluster/affinity_propagation_.py +++ b/sklearn/cluster/affinity_propagation_.py @@ -12,7 +12,7 @@ from ..base import BaseEstimator, ClusterMixin from ..utils import as_float_array from ..metrics import euclidean_distances -from ..metrics import pairwise_distances_argmin_min +from ..metrics import pairwise_distances_argmin def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200, @@ -302,4 +302,4 @@ def predict(self, X): raise ValueError("Predict method is not supported when " "affinity='precomputed'.") - return pairwise_distances_argmin_min(X, self.cluster_centers_)[0] + return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/sklearn/cluster/mean_shift_.py b/sklearn/cluster/mean_shift_.py index 0b56a18b751ba..efe483acd8e01 100644 --- a/sklearn/cluster/mean_shift_.py +++ b/sklearn/cluster/mean_shift_.py @@ -12,7 +12,7 @@ from ..base import 
BaseEstimator, ClusterMixin from ..neighbors import NearestNeighbors from ..metrics.pairwise import euclidean_distances -from ..metrics.pairwise import pairwise_distances_argmin_min +from ..metrics.pairwise import pairwise_distances_argmin def estimate_bandwidth(X, quantile=0.3, n_samples=None, random_state=0): @@ -295,4 +295,4 @@ def predict(self, X): labels : array, shape [n_samples,] Index of the cluster each sample belongs to. """ - return pairwise_distances_argmin_min(X, self.cluster_centers_)[0] + return pairwise_distances_argmin(X, self.cluster_centers_) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 227da6a21a741..2a8541a07de22 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -52,6 +52,7 @@ from .pairwise import (euclidean_distances, pairwise_distances, pairwise_distances_argmin_min, + pairwise_distances_argmin, pairwise_kernels) __all__ = ['accuracy_score', diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py index 2f864e738a7ed..5d31acb279415 100644 --- a/sklearn/metrics/pairwise.py +++ b/sklearn/metrics/pairwise.py @@ -200,10 +200,10 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", This is mostly equivalent to calling: - pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis) + (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), + pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) - but uses much less memory, and is faster for large arrays. It also returns - the minimum values at the same time. + but uses much less memory, and is faster for large arrays. This function works with dense 2D arrays only. 
@@ -259,9 +259,7 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", See also ======== sklearn.metrics.pairwise_distances - - Notes - ===== + sklearn.metrics.pairwise_distances_argmin """ dist_func = None if metric in PAIRWISE_DISTANCE_FUNCTIONS: @@ -317,6 +315,79 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", return indices, values +def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", + batch_size=500, metric_kwargs={}): + """Compute the index of the nearest point in Y for each point in X. + + This function computes for each row in X, the index of the row of Y which + is closest (according to the specified distance). + + This is mostly equivalent to calling: + + pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis) + + but uses much less memory, and is faster for large arrays. + + This function works with dense 2D arrays only. + + Parameters + ========== + X, Y : array-like + Arrays containing points. Respective shapes (n_samples1, n_features) + and (n_samples2, n_features) + + batch_size : integer + To reduce memory consumption over the naive solution, data are + processed in batches, comprising batch_size rows of X and + batch_size rows of Y. The default value is quite conservative, but + can be changed for fine-tuning. The larger the number, the larger the + memory usage. + + metric : string or callable + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. 
+ + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', + 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] + See the documentation for scipy.spatial.distance for details on these + metrics. + + metric_kwargs : dict + keyword arguments to pass to specified metric function. + + Returns + ======= + argmin : numpy.ndarray + Y[argmin[i], :] is the row in Y that is closest to X[i, :]. + + Unlike pairwise_distances_argmin_min, only the indices are returned; + the corresponding minimum distances are not. Use + pairwise_distances_argmin_min when the distances are needed as well. + + See also + ======== + sklearn.metrics.pairwise_distances + sklearn.metrics.pairwise_distances_argmin_min + """ + return pairwise_distances_argmin_min(X, Y, axis, metric, batch_size, + metric_kwargs)[0] + + def manhattan_distances(X, Y=None, sum_over_features=True, size_threshold=5e8): """ Compute the L1 distances between the vectors in X and Y.