forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 6
/
test_spectral_embedding.py
285 lines (240 loc) · 11.6 KB
/
test_spectral_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
import pytest
import numpy as np
from scipy import sparse
from scipy.sparse import csgraph
from scipy.linalg import eigh
from sklearn.manifold.spectral_embedding_ import SpectralEmbedding
from sklearn.manifold.spectral_embedding_ import _graph_is_connected
from sklearn.manifold.spectral_embedding_ import _graph_connected_component
from sklearn.manifold import spectral_embedding
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics import normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs
from sklearn.utils.extmath import _deterministic_vector_sign_flip
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_true, assert_equal, assert_raises
from sklearn.utils.testing import SkipTest
# non centered, sparse centers to check the
centers = np.array([
[0.0, 5.0, 0.0, 0.0, 0.0],
[0.0, 0.0, 4.0, 0.0, 0.0],
[1.0, 0.0, 0.0, 5.0, 1.0],
])
n_samples = 1000
n_clusters, n_features = centers.shape
S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
cluster_std=1., random_state=42)
def _check_with_col_sign_flipping(A, B, tol=0.0):
""" Check array A and B are equal with possible sign flipping on
each columns"""
sign = True
for column_idx in range(A.shape[1]):
sign = sign and ((((A[:, column_idx] -
B[:, column_idx]) ** 2).mean() <= tol ** 2) or
(((A[:, column_idx] +
B[:, column_idx]) ** 2).mean() <= tol ** 2))
if not sign:
return False
return True
def test_sparse_graph_connected_component():
rng = np.random.RandomState(42)
n_samples = 300
boundaries = [0, 42, 121, 200, n_samples]
p = rng.permutation(n_samples)
connections = []
for start, stop in zip(boundaries[:-1], boundaries[1:]):
group = p[start:stop]
# Connect all elements within the group at least once via an
# arbitrary path that spans the group.
for i in range(len(group) - 1):
connections.append((group[i], group[i + 1]))
# Add some more random connections within the group
min_idx, max_idx = 0, len(group) - 1
n_random_connections = 1000
source = rng.randint(min_idx, max_idx, size=n_random_connections)
target = rng.randint(min_idx, max_idx, size=n_random_connections)
connections.extend(zip(group[source], group[target]))
# Build a symmetric affinity matrix
row_idx, column_idx = tuple(np.array(connections).T)
data = rng.uniform(.1, 42, size=len(connections))
affinity = sparse.coo_matrix((data, (row_idx, column_idx)))
affinity = 0.5 * (affinity + affinity.T)
for start, stop in zip(boundaries[:-1], boundaries[1:]):
component_1 = _graph_connected_component(affinity, p[start])
component_size = stop - start
assert_equal(component_1.sum(), component_size)
# We should retrieve the same component mask by starting by both ends
# of the group
component_2 = _graph_connected_component(affinity, p[stop - 1])
assert_equal(component_2.sum(), component_size)
assert_array_equal(component_1, component_2)
def test_spectral_embedding_two_components(seed=36):
# Test spectral embedding with two components
random_state = np.random.RandomState(seed)
n_sample = 100
affinity = np.zeros(shape=[n_sample * 2, n_sample * 2])
# first component
affinity[0:n_sample,
0:n_sample] = np.abs(random_state.randn(n_sample, n_sample)) + 2
# second component
affinity[n_sample::,
n_sample::] = np.abs(random_state.randn(n_sample, n_sample)) + 2
# Test of internal _graph_connected_component before connection
component = _graph_connected_component(affinity, 0)
assert_true(component[:n_sample].all())
assert_true(not component[n_sample:].any())
component = _graph_connected_component(affinity, -1)
assert_true(not component[:n_sample].any())
assert_true(component[n_sample:].all())
# connection
affinity[0, n_sample + 1] = 1
affinity[n_sample + 1, 0] = 1
affinity.flat[::2 * n_sample + 1] = 0
affinity = 0.5 * (affinity + affinity.T)
true_label = np.zeros(shape=2 * n_sample)
true_label[0:n_sample] = 1
se_precomp = SpectralEmbedding(n_components=1, affinity="precomputed",
random_state=np.random.RandomState(seed))
embedded_coordinate = se_precomp.fit_transform(affinity)
# Some numpy versions are touchy with types
embedded_coordinate = \
se_precomp.fit_transform(affinity.astype(np.float32))
# thresholding on the first components using 0.
label_ = np.array(embedded_coordinate.ravel() < 0, dtype="float")
assert_equal(normalized_mutual_info_score(true_label, label_), 1.0)
def test_spectral_embedding_precomputed_affinity(seed=36):
# Test spectral embedding with precomputed kernel
gamma = 1.0
se_precomp = SpectralEmbedding(n_components=2, affinity="precomputed",
random_state=np.random.RandomState(seed))
se_rbf = SpectralEmbedding(n_components=2, affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed))
embed_precomp = se_precomp.fit_transform(rbf_kernel(S, gamma=gamma))
embed_rbf = se_rbf.fit_transform(S)
assert_array_almost_equal(
se_precomp.affinity_matrix_, se_rbf.affinity_matrix_)
assert_true(_check_with_col_sign_flipping(embed_precomp, embed_rbf, 0.05))
def test_spectral_embedding_callable_affinity(seed=36):
# Test spectral embedding with callable affinity
gamma = 0.9
kern = rbf_kernel(S, gamma=gamma)
se_callable = SpectralEmbedding(n_components=2,
affinity=(
lambda x: rbf_kernel(x, gamma=gamma)),
gamma=gamma,
random_state=np.random.RandomState(seed))
se_rbf = SpectralEmbedding(n_components=2, affinity="rbf",
gamma=gamma,
random_state=np.random.RandomState(seed))
embed_rbf = se_rbf.fit_transform(S)
embed_callable = se_callable.fit_transform(S)
assert_array_almost_equal(
se_callable.affinity_matrix_, se_rbf.affinity_matrix_)
assert_array_almost_equal(kern, se_rbf.affinity_matrix_)
assert_true(
_check_with_col_sign_flipping(embed_rbf, embed_callable, 0.05))
def test_spectral_embedding_amg_solver(seed=36):
# Test spectral embedding with amg solver
try:
from pyamg import smoothed_aggregation_solver # noqa
except ImportError:
raise SkipTest("pyamg not available.")
se_amg = SpectralEmbedding(n_components=2, affinity="nearest_neighbors",
eigen_solver="amg", n_neighbors=5,
random_state=np.random.RandomState(seed))
se_arpack = SpectralEmbedding(n_components=2, affinity="nearest_neighbors",
eigen_solver="arpack", n_neighbors=5,
random_state=np.random.RandomState(seed))
embed_amg = se_amg.fit_transform(S)
embed_arpack = se_arpack.fit_transform(S)
assert_true(_check_with_col_sign_flipping(embed_amg, embed_arpack, 0.05))
def test_pipeline_spectral_clustering(seed=36):
# Test using pipeline to do spectral clustering
random_state = np.random.RandomState(seed)
se_rbf = SpectralEmbedding(n_components=n_clusters,
affinity="rbf",
random_state=random_state)
se_knn = SpectralEmbedding(n_components=n_clusters,
affinity="nearest_neighbors",
n_neighbors=5,
random_state=random_state)
for se in [se_rbf, se_knn]:
km = KMeans(n_clusters=n_clusters, random_state=random_state)
km.fit(se.fit_transform(S))
assert_array_almost_equal(
normalized_mutual_info_score(
km.labels_,
true_labels), 1.0, 2)
def test_spectral_embedding_unknown_eigensolver(seed=36):
# Test that SpectralClustering fails with an unknown eigensolver
se = SpectralEmbedding(n_components=1, affinity="precomputed",
random_state=np.random.RandomState(seed),
eigen_solver="<unknown>")
assert_raises(ValueError, se.fit, S)
def test_spectral_embedding_unknown_affinity(seed=36):
# Test that SpectralClustering fails with an unknown affinity type
se = SpectralEmbedding(n_components=1, affinity="<unknown>",
random_state=np.random.RandomState(seed))
assert_raises(ValueError, se.fit, S)
def test_connectivity(seed=36):
# Test that graph connectivity test works as expected
graph = np.array([[1, 0, 0, 0, 0],
[0, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1]])
assert_equal(_graph_is_connected(graph), False)
assert_equal(_graph_is_connected(sparse.csr_matrix(graph)), False)
assert_equal(_graph_is_connected(sparse.csc_matrix(graph)), False)
graph = np.array([[1, 1, 0, 0, 0],
[1, 1, 1, 0, 0],
[0, 1, 1, 1, 0],
[0, 0, 1, 1, 1],
[0, 0, 0, 1, 1]])
assert_equal(_graph_is_connected(graph), True)
assert_equal(_graph_is_connected(sparse.csr_matrix(graph)), True)
assert_equal(_graph_is_connected(sparse.csc_matrix(graph)), True)
def test_spectral_embedding_deterministic():
# Test that Spectral Embedding is deterministic
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
embedding_1 = spectral_embedding(sims)
embedding_2 = spectral_embedding(sims)
assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_unnormalized():
# Test that spectral_embedding is also processing unnormalized laplacian
# correctly
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 8
embedding_1 = spectral_embedding(sims,
norm_laplacian=False,
n_components=n_components,
drop_first=False)
# Verify using manual computation with dense eigh
laplacian, dd = csgraph.laplacian(sims, normed=False,
return_diag=True)
_, diffusion_map = eigh(laplacian)
embedding_2 = diffusion_map.T[:n_components]
embedding_2 = _deterministic_vector_sign_flip(embedding_2).T
assert_array_almost_equal(embedding_1, embedding_2)
def test_spectral_embedding_first_eigen_vector():
# Test that the first eigenvector of spectral_embedding
# is constant and that the second is not (for a connected graph)
random_state = np.random.RandomState(36)
data = random_state.randn(10, 30)
sims = rbf_kernel(data)
n_components = 2
for seed in range(10):
embedding = spectral_embedding(sims,
norm_laplacian=False,
n_components=n_components,
drop_first=False,
random_state=seed)
assert np.std(embedding[:, 0]) == pytest.approx(0)
assert np.std(embedding[:, 1]) > 1e-3