Cleaned up. Added to README

jackmaney · Jan 13, 2014 · b93b2cd · b93b2cd
1 parent 413ed6a
commit b93b2cd
Show file tree

Hide file tree

Showing 2 changed files with 48 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,35 @@
 k-means++-pandas
 ================
 
-An implementation of the [k-means++ clustering algorithm](http:https://en.wikipedia.org/wiki/K-means%2B%2B) using [Pandas](http:https://pandas.pydata.org/)
+An implementation of the [k-means++ clustering algorithm](https://en.wikipedia.org/wiki/K-means%2B%2B) using [Pandas](https://pandas.pydata.org/).
+
+Prerequisites
+-------------
+
+* Python 2.7 or lower; this is not Python 3 compatible (yet).
+* [Pandas](https://pandas.pydata.org/) (obviously).
+* [NumPy](https://numpy.org)
+
+Usage
+-----
+
+Here are the constructor arguments:
+
+* `data_frame`: A Pandas data frame representing the data that you wish to cluster. Rows represent observations, and columns represent variables.
+
+* `k`: The number of clusters that you want.
+
+* `columns=None`: A list of column names upon which you wish to cluster your data. If this argument isn't provided, then all of the columns are selected. **Note:** Columns upon which you want to cluster must be numeric and have no `numpy.nan` values.
+
+* `max_iterations=None`: The maximum number of times that you wish to iterate k-means. If no value is provided, then the iterations continue until stability is reached (i.e., the cluster assignments don't change between one iteration and the next).
+
+* `appended_column_name=None`: If this value is set to a string, then a column with the given name will be appended to your data, containing the cluster assignments (which are integers from 0 to `k - 1`).
+
+Once you've constructed a `KMeansPlusPlus` object, then just call the `cluster` method, and everything else should happen automagically. Take a look at the `examples` folder.
+
+TODO:
+----
+
+* Add features that take several runs of k-means++ clustering and compare them via, e.g., concordance matrices, Jaccard indices, etc.
+
+* Given a data frame, implement the so-called [Elbow Method](https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set#The_Elbow_Method) to take a stab at an optimal value for `k`.
diff --git a/k_means_plus_plus.py b/k_means_plus_plus.py
@@ -1,12 +1,14 @@
 from pandas import DataFrame,Series
 import pandas as pd
 import numpy as np
+import warnings
 from numbers import Integral
 
 class KMeansPlusPlus:
  def __init__(self,data_frame,k
  ,columns=None
- ,max_iterations=None):
+ ,max_iterations=None
+ ,appended_column_name=None):
  if not isinstance(data_frame,DataFrame):
  raise Exception("data_frame argument is not a pandas DataFrame")
  elif data_frame.empty:
@@ -15,19 +17,20 @@ def __init__(self,data_frame,k
  if max_iterations is not None and max_iterations <= 0:
  raise Exception("max_iterations must be positive!")
 
+ if not isinstance(k,Integral) or k <= 0:
+ raise Exception("The value of k must be a positive integer")
+
 
  self.data_frame = data_frame # m x n
  self.numRows = data_frame.shape[0] # m
  self.centers = None # k x n, the i,j entry being the jth coordinate of center i
  self.distance_matrix = None # m x k , the i,j entry represents the distance 
  # from point i to center j (where i and j start at 0)
  self.clusters = None # Series of length m, consisting of integers 0,1,...,k-1
- self.previous_clusters = None
+ self.previous_clusters = None # To keep track of clusters in the previous iteration
+
  self.max_iterations = max_iterations
-
- if not isinstance(k,Integral) or k <= 0:
- raise Exception("The value of k must be a positive integer")
-
+ self.appended_column_name = appended_column_name
  self.k = k
 
  if columns is None:
@@ -124,6 +127,13 @@ def cluster(self):
  elif all(self.clusters == self.previous_clusters):
  break
 
+ if self.appended_column_name is not None:
+ try:
+ self.data_frame[self.appended_column_name] = self.clusters
+ except:
+ warnings.warn("Unable to append a column named %s to your data." % self.appended_column_name)
+ warnings.warn("However, the clusters are available via the cluster attribute")
+
  def _distances_from_point(self,point):
  return np.power(self.data_frame[self.columns] - point,2).sum(axis=1) #pandas Series