Merge pull request #4 from AlpineBlack/master

AttributeError fixed in cluster.py
jackmaney · Nov 7, 2018 · dfd523c · dfd523c
2 parents da64a0e + 7bbee5c
commit dfd523c
Show file tree

Hide file tree

Showing 2 changed files with 182 additions and 1 deletion.
diff --git a/cluster.py b/cluster.py
@@ -0,0 +1,181 @@
+#import _version
+from pandas import DataFrame, Series
+import pandas as pd
+import numpy as np
+import warnings
+from numbers import Integral
+
+
+class KMeansPlusPlus:
+
+ def __init__(self, data_frame, k, columns=None, max_iterations=None,
+ appended_column_name=None):
+ if not isinstance(data_frame, DataFrame):
+ raise Exception("data_frame argument is not a pandas DataFrame")
+ elif data_frame.empty:
+ raise Exception("The given data frame is empty")
+
+ if max_iterations is not None and max_iterations <= 0:
+ raise Exception("max_iterations must be positive!")
+
+ if not isinstance(k, Integral) or k <= 0:
+ raise Exception("The value of k must be a positive integer")
+
+ self.data_frame = data_frame # m x n
+ self.numRows = data_frame.shape[0] # m
+
+ # k x n, the i,j entry being the jth coordinate of center i
+ self.centers = None
+
+ # m x k , the i,j entry represents the distance
+ # from point i to center j
+ # (where i and j start at 0)
+ self.distance_matrix = None
+
+ # Series of length m, consisting of integers 0,1,...,k-1
+ self.clusters = None
+
+ # To keep track of clusters in the previous iteration
+ self.previous_clusters = None
+
+ self.max_iterations = max_iterations
+ self.appended_column_name = appended_column_name
+ self.k = k
+
+ if columns is None:
+ self.columns = data_frame.columns
+ else:
+ for col in columns:
+ if col not in data_frame.columns:
+ raise Exception(
+ "Column '%s' not found in the given DataFrame" % col)
+ if not self._is_numeric(col):
+ raise Exception(
+ "The column '%s' is either not numeric or contains NaN values" % col)
+ self.columns = columns
+
+ def _populate_initial_centers(self):
+ rows = []
+ rows.append(self._grab_random_point())
+ distances = None
+
+ while len(rows) < self.k:
+ if distances is None:
+ distances = self._distances_from_point(rows[0])
+ else:
+ distances = self._distances_from_point_list(rows)
+
+ normalized_distances = distances / distances.sum()
+ normalized_distances.sort_values()
+ dice_roll = np.random.rand()
+ min_over_roll = normalized_distances[
+ normalized_distances.cumsum() >= dice_roll].min()
+ index = normalized_distances[
+ normalized_distances == min_over_roll].index[0]
+ rows.append(self.data_frame[self.columns].iloc[index, :])
+
+ self.centers = DataFrame(rows, columns=self.columns)
+
+ def _compute_distances(self):
+ if self.centers is None:
+ raise Exception(
+ "Must populate centers before distances can be calculated!")
+
+ column_dict = {}
+
+ for i in list(range(self.k)):
+ column_dict[i] = self._distances_from_point(
+ self.centers.iloc[i, :])
+
+ self.distance_matrix = DataFrame(
+ column_dict, columns=list(range(self.k)))
+
+ def _get_clusters(self):
+ if self.distance_matrix is None:
+ raise Exception(
+ "Must compute distances before closest centers can be calculated")
+
+ min_distances = self.distance_matrix.min(axis=1)
+
+ # We need to make sure the index
+ min_distances.index = list(range(self.numRows))
+
+ cluster_list = [boolean_series.index[j]
+ for boolean_series in
+ [
+ self.distance_matrix.iloc[i,
+ :] == min_distances.iloc[i]
+ for i in list(range(self.numRows))
+ ]
+ for j in list(range(self.k))
+ if boolean_series[j]
+ ]
+
+ self.clusters = Series(cluster_list, index=self.data_frame.index)
+
+ def _compute_new_centers(self):
+ if self.centers is None:
+ raise Exception("Centers not initialized!")
+
+ if self.clusters is None:
+ raise Exception("Clusters not computed!")
+
+ for i in list(range(self.k)):
+ self.centers.ix[i, :] = self.data_frame[
+ self.columns].ix[self.clusters == i].mean()
+
+ def cluster(self):
+
+ self._populate_initial_centers()
+ self._compute_distances()
+ self._get_clusters()
+
+ counter = 0
+
+ while True:
+ counter += 1
+
+ self.previous_clusters = self.clusters.copy()
+
+ self._compute_new_centers()
+ self._compute_distances()
+ self._get_clusters()
+
+ if self.max_iterations is not None and counter >= self.max_iterations:
+ break
+ elif all(self.clusters == self.previous_clusters):
+ break
+
+ if self.appended_column_name is not None:
+ try:
+ self.data_frame[self.appended_column_name] = self.clusters
+ except:
+ warnings.warn(
+ "Unable to append a column named %s to your data." %
+ self.appended_column_name)
+ warnings.warn(
+ "However, the clusters are available via the cluster attribute")
+
+ def _distances_from_point(self, point):
+ # pandas Series
+ return np.power(self.data_frame[self.columns] - point, 2).sum(axis=1)
+
+ def _distances_from_point_list(self, point_list):
+ result = None
+
+ for point in point_list:
+ if result is None:
+ result = self._distances_from_point(point)
+ else:
+ result = pd.concat(
+ [result, self._distances_from_point(point)], axis=1).min(axis=1)
+
+ return result
+
+ def _grab_random_point(self):
+ index = np.random.random_integers(0, self.numRows - 1)
+ # NumPy array
+ return self.data_frame[self.columns].iloc[index, :].values
+
+ def _is_numeric(self, col):
+ return all(np.isreal(self.data_frame[col])) and not any(np.isnan(self.data_frame[col]))
diff --git a/k_means_plus_plus/cluster.py b/k_means_plus_plus/cluster.py
@@ -66,7 +66,7 @@ def _populate_initial_centers(self):
  distances = self._distances_from_point_list(rows)
 
  normalized_distances = distances / distances.sum()
- normalized_distances.sort()
+ normalized_distances.sort_values()
  dice_roll = np.random.rand()
  min_over_roll = normalized_distances[
  normalized_distances.cumsum() >= dice_roll].min()