Skip to content
This repository has been archived by the owner on Feb 23, 2021. It is now read-only.

Commit

Permalink
The pieces seem to work, but getting NaNs for center coordinates after iterating...hmmm...
Browse files Browse the repository at this point in the history
  • Loading branch information
jackmaney committed Jan 12, 2014
1 parent d04f305 commit 9f036ef
Showing 1 changed file with 17 additions and 10 deletions.
27 changes: 17 additions & 10 deletions k_means_plus_plus.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@
from numbers import Integral

class KMeansPlusPlus:
def __init__(self,data_frame,k,columns=None,max_iterations=None):
def __init__(self,data_frame,k
,columns=None
,max_iterations=None
,tolerance = 1e-5):
if not isinstance(data_frame,DataFrame):
raise Exception("data_frame argument is not a pandas DataFrame")
elif data_frame.empty:
raise Exception("The given data frame is empty")

if max_iterations is not None and max_iterations <= 0:
raise Exception("")
raise Exception("max_iterations must be positive!")


self.data_frame = data_frame # m x n
Expand All @@ -21,6 +24,7 @@ def __init__(self,data_frame,k,columns=None,max_iterations=None):
# from point i to center j (where i and j start at 0)
self.clusters = None # Series of length m, consisting of integers 0,1,...,k-1
self.previous_clusters = None
self.max_iterations = max_iterations

if not isinstance(k,Integral) or k <= 0:
raise Exception("The value of k must be a positive integer")
Expand Down Expand Up @@ -74,13 +78,16 @@ def _get_clusters(self):

min_distances = self.distance_matrix.min(axis=1)

cluster_list = [
boolean_series.index[j][0]
for boolean_series in [
self.distance_matrix.iloc[i,:] == min_distances[i]
for i in list(range(len(self.columns)))
]
for j in list(range(self.k))
# We need to make sure the index of min_distances is 0, 1, ..., numRows - 1
min_distances.index = list(range(self.numRows))

cluster_list = [boolean_series.index[j]
for boolean_series in
[
self.distance_matrix.iloc[i,:] == min_distances.iloc[i]
for i in list(range(self.numRows))
]
for j in list(range(self.k))
if boolean_series[j]
]

Expand Down Expand Up @@ -113,7 +120,7 @@ def cluster(self):
self._compute_distances()
self._get_clusters()

if max_iterations is not None and counter >= max_iterations:
if self.max_iterations is not None and counter >= self.max_iterations:
break
elif all(self.clusters == self.previous_clusters):
break
Expand Down

0 comments on commit 9f036ef

Please sign in to comment.