-
Notifications
You must be signed in to change notification settings - Fork 45
Add PAM algorithm to K-Medoids #73
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
91891ae
40e42e7
4362f85
5a86b95
6e6c90d
2ed1803
8422c17
6eae84b
2a92978
ecce8c8
1cba61c
258c262
9078628
482bc37
50d0eb3
7637891
eeaa2a3
093f8b0
bd04827
e979579
b51d23b
e675bdb
4250681
8f2ada3
552294b
b024b8e
018c9c7
2f6368f
f1a33ad
498d9b6
daa9879
213bb2e
9c15afa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# cython: infer_types=True | ||
# Fast swap step and build step in PAM algorithm for k_medoid. | ||
# Author: Timothée Mathieu | ||
# License: 3-clause BSD | ||
|
||
cimport cython | ||
|
||
import numpy as np | ||
cimport numpy as np | ||
from cython cimport floating, integral | ||
|
||
@cython.boundscheck(False) # Deactivate bounds checking | ||
def _compute_optimal_swap( floating[:,:] D, | ||
int[:] medoid_idxs, | ||
int[:] not_medoid_idxs, | ||
floating[:] Djs, | ||
floating[:] Ejs, | ||
int n_clusters): | ||
"""Compute best cost change for all the possible swaps.""" | ||
|
||
# Initialize best cost change and the associated swap couple. | ||
cdef (int, int, floating) best_cost_change = (1, 1, 0.0) | ||
cdef int sample_size = len(D) | ||
cdef int i, j, h, id_i, id_h, id_j | ||
cdef floating cost_change | ||
cdef int not_medoid_shape = sample_size - n_clusters | ||
cdef bint cluster_i_bool, not_cluster_i_bool, second_best_medoid | ||
cdef bint not_second_best_medoid | ||
|
||
# Compute the change in cost for each swap. | ||
for h in range(not_medoid_shape): | ||
# id of the potential new medoid. | ||
id_h = not_medoid_idxs[h] | ||
for i in range(n_clusters): | ||
# id of the medoid we want to replace. | ||
id_i = medoid_idxs[i] | ||
cost_change = 0.0 | ||
# compute for all not-selected points the change in cost | ||
for j in range(not_medoid_shape): | ||
id_j = not_medoid_idxs[j] | ||
cluster_i_bool = D[id_i, id_j] == Djs[id_j] | ||
not_cluster_i_bool = D[id_i, id_j] != Djs[id_j] | ||
second_best_medoid = D[id_h, id_j] < Ejs[id_j] | ||
not_second_best_medoid = D[id_h, id_j] >= Ejs[id_j] | ||
|
||
if cluster_i_bool & second_best_medoid: | ||
cost_change += D[id_j, id_h] - Djs[id_j] | ||
elif cluster_i_bool & not_second_best_medoid: | ||
cost_change += Ejs[id_j] - Djs[id_j] | ||
elif not_cluster_i_bool & (D[id_j, id_h] < Djs[id_j]): | ||
cost_change += D[id_j, id_h] - Djs[id_j] | ||
|
||
# same for i | ||
second_best_medoid = D[id_h, id_i] < Ejs[id_i] | ||
if second_best_medoid: | ||
cost_change += D[id_i, id_h] | ||
else: | ||
cost_change += Ejs[id_i] | ||
|
||
if cost_change < best_cost_change[2]: | ||
best_cost_change = (id_i, id_h, cost_change) | ||
|
||
# If one of the swap decrease the objective, return that swap. | ||
if best_cost_change[2] < 0: | ||
return best_cost_change | ||
else: | ||
return None | ||
|
||
|
||
|
||
|
||
def _build( floating[:, :] D, int n_clusters): | ||
"""Compute BUILD initialization, a greedy medoid initialization.""" | ||
|
||
cdef int[:] medoid_idxs = np.zeros(n_clusters, dtype = np.intc) | ||
cdef int sample_size = len(D) | ||
cdef int[:] not_medoid_idxs = np.zeros(sample_size, dtype = np.intc) | ||
cdef int i, j, id_i, id_j | ||
|
||
medoid_idxs[0] = np.argmin(np.sum(D,axis=0)) | ||
not_medoid_idxs = np.delete(not_medoid_idxs, medoid_idxs[0]) | ||
|
||
cdef int n_medoids_current = 1 | ||
|
||
cdef floating[:] Dj = D[medoid_idxs[0]].copy() | ||
cdef floating cost_change | ||
cdef (int, int) new_medoid = (medoid_idxs[0], 0) | ||
cdef floating cost_change_max | ||
|
||
for _ in range(n_clusters -1): | ||
cost_change_max = 0 | ||
for i in range(sample_size - n_medoids_current): | ||
id_i = not_medoid_idxs[i] | ||
cost_change = 0 | ||
for j in range(sample_size - n_medoids_current): | ||
id_j = not_medoid_idxs[j] | ||
cost_change += max(0, Dj[id_j] - D[id_i, id_j]) | ||
if cost_change >= cost_change_max: | ||
cost_change_max = cost_change | ||
new_medoid = (id_i, i) | ||
|
||
|
||
medoid_idxs[n_medoids_current] = new_medoid[0] | ||
n_medoids_current += 1 | ||
not_medoid_idxs = np.delete(not_medoid_idxs, new_medoid[1]) | ||
|
||
|
||
for id_j in range(sample_size): | ||
Dj[id_j] = min(Dj[id_j], D[id_j, new_medoid[0]]) | ||
return np.array(medoid_idxs) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,10 +12,46 @@ | |
|
||
from sklearn_extra.cluster import KMedoids | ||
from sklearn.cluster import KMeans | ||
from sklearn.datasets import make_blobs | ||
|
||
|
||
seed = 0 | ||
X = np.random.RandomState(seed).rand(100, 5) | ||
|
||
# test kmedoid's results | ||
rng = np.random.RandomState(seed) | ||
X_cc, y_cc = make_blobs( | ||
n_samples=100, | ||
centers=np.array([[-1, -1], [1, 1]]), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we can reduce the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I commented to explain. I prefer to keep the clusters not so separable in order to test the stability of the method at the same time. |
||
random_state=rng, | ||
shuffle=False, | ||
) | ||
|
||
|
||
@pytest.mark.parametrize("method", ["alternate", "pam"]) | ||
@pytest.mark.parametrize( | ||
"init", ["random", "heuristic", "build", "k-medoids++"] | ||
) | ||
def test_kmedoid_results(method, init): | ||
expected = np.hstack([np.zeros(50), np.ones(50)]) | ||
km = KMedoids(n_clusters=2, init=init, method=method) | ||
km.fit(X_cc) | ||
# This test use data that are not perfectly separable so the | ||
# accuracy is not 1. Accuracy around 0.85 | ||
assert (np.mean(km.labels_ == expected) > 0.8) or ( | ||
1 - np.mean(km.labels_ == expected) > 0.8 | ||
) | ||
|
||
|
||
def test_medoids_invalid_method(): | ||
with pytest.raises(ValueError, match="invalid is not supported"): | ||
KMedoids(n_clusters=1, method="invalid").fit([[0, 1], [1, 1]]) | ||
|
||
|
||
def test_medoids_invalid_init(): | ||
with pytest.raises(ValueError, match="init needs to be one of"): | ||
KMedoids(n_clusters=1, init="invalid").fit([[0, 1], [1, 1]]) | ||
|
||
|
||
def test_kmedoids_input_validation_and_fit_check(): | ||
rng = np.random.RandomState(seed) | ||
|
@@ -28,9 +64,9 @@ def test_kmedoids_input_validation_and_fit_check(): | |
with pytest.raises(ValueError, match=msg): | ||
KMedoids(n_clusters=None).fit(X) | ||
|
||
msg = "max_iter should be a nonnegative integer. 0 was given" | ||
msg = "max_iter should be a nonnegative integer. -1 was given" | ||
with pytest.raises(ValueError, match=msg): | ||
KMedoids(n_clusters=1, max_iter=0).fit(X) | ||
KMedoids(n_clusters=1, max_iter=-1).fit(X) | ||
|
||
msg = "max_iter should be a nonnegative integer. None was given" | ||
with pytest.raises(ValueError, match=msg): | ||
|
Uh oh!
There was an error while loading. Please reload this page.