Skip to content
This repository has been archived by the owner on Feb 23, 2021. It is now read-only.

Commit

Permalink
Initial bits
Browse files Browse the repository at this point in the history
  • Loading branch information
jackmaney committed Jan 7, 2014
1 parent 0428f65 commit 7a42270
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 2 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
k-means---pandas
k-means++-pandas
================

An implementation of the k-means++ clustering algorithm using Pandas
An implementation of the [k-means++ clustering algorithm](https://en.wikipedia.org/wiki/K-means%2B%2B) using [Pandas](https://pandas.pydata.org/)
37 changes: 37 additions & 0 deletions k-means++.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from pandas import DataFrame,Series
import pandas as pd
import numpy as np

class KMeansPlusPlus:
    """Scaffolding for k-means++ clustering over a pandas DataFrame.

    The instance keeps a reference to the data frame, the number of rows,
    the columns used as coordinates, and the iteration cap passed by the
    caller.
    """

    def __init__(self, dataFrame, columns=None, maxIterations=None):
        """Validate the inputs and record them on the instance.

        Args:
            dataFrame: The pandas DataFrame holding the observations.
            columns: Optional iterable of column labels to use as
                coordinates. When ``None``, every column of ``dataFrame``
                is used and no per-column validation is performed.
            maxIterations: Optional iteration cap; stored as given.

        Raises:
            TypeError: If ``dataFrame`` is not a pandas DataFrame.
            ValueError: If ``dataFrame`` is empty, or a requested column
                is missing, non-numeric, or contains NaN values.
        """
        if not isinstance(dataFrame, DataFrame):
            raise TypeError("dataFrame argument is not a pandas DataFrame")
        if dataFrame.empty:
            raise ValueError("The given data frame is empty")

        self.dataFrame = dataFrame
        self.numRows = dataFrame.shape[0]
        self.maxIterations = maxIterations

        if columns is None:
            self.columns = dataFrame.columns
        else:
            # Materialize once: a generator would be exhausted by the
            # validation loop, and a tuple is not a valid list-like key
            # for DataFrame column selection.
            columns = list(columns)
            for col in columns:
                if col not in dataFrame.columns:
                    raise ValueError(
                        "Column '%s' not found in the given DataFrame" % (col,)
                    )
                if not self.__is_numeric(col):
                    raise ValueError(
                        "The column '%s' is either not numeric or contains NaN values"
                        % (col,)
                    )
            self.columns = columns

    def distance_from_point(self, point):
        """Return the squared Euclidean distance of every row to ``point``.

        Args:
            point: One-dimensional NumPy array with one entry per selected
                column, in the same order as ``self.columns``.

        Returns:
            A pandas Series, indexed like the data frame, holding the sum
            of squared coordinate differences for each row.

        Raises:
            TypeError: If ``point`` is not a NumPy ndarray.
            ValueError: If ``point`` is not one-dimensional or its length
                differs from the number of selected columns.
        """
        # ``(point,)`` keeps the %-formatting safe when ``point`` is a tuple.
        if not isinstance(point, np.ndarray):
            raise TypeError("Argument '%s' is not a NumPy ndarray" % (point,))
        if point.ndim != 1:
            raise ValueError("One-dimensional points only, please.")
        if point.shape[0] != len(self.columns):
            raise ValueError(
                "The point '%s' is not of the same dimension as the given set of columns"
                % (point,)
            )

        # The 1-D array is broadcast across the selected columns by position.
        deltas = self.dataFrame[self.columns] - point
        return (deltas ** 2).sum(axis=1)  # pandas Series

    def __is_numeric(self, col):
        """Return True when column ``col`` has a numeric dtype and no NaNs."""
        series = self.dataFrame[col]
        # Check the dtype first so string/object columns yield False
        # instead of raising inside a NaN test.
        if not pd.api.types.is_numeric_dtype(series.dtype):
            return False
        return not series.isnull().any()

0 comments on commit 7a42270

Please sign in to comment.