Skip to content

Commit

Permalink
Improve python reference code to better interface with C
Browse files Browse the repository at this point in the history
  • Loading branch information
JiaweiZhuang committed Apr 16, 2017
1 parent 2d14e49 commit 7296bc0
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
*.out
*.nc
__pycache__
50 changes: 50 additions & 0 deletions Parallel_Algorithm/python_reference/Apply_Kmean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
import xarray as xr
from netCDF4 import Dataset
from timeit import default_timer as timer
from sklearn.cluster import KMeans

# Reference K-means run: read the test case from a NetCDF file, run
# scikit-learn's KMeans once per stored initial guess, keep the best run and
# write its labels and inertia back to the same file.

dirname = "../test_data/"
filename = "Blobs_smp20000_fea30_cls8.nc"

# read data from nc file
start1 = timer()
with xr.open_dataset(dirname+filename) as ds:
    n_clusters = ds.dims["N_clusters"]
    n_repeat = ds.dims["N_repeat"]
    # .values yields plain numpy arrays, so nothing below needs the
    # dataset once the with-block has closed it
    X = ds["X"].values
    GUESS = ds["GUESS"].values

# NOTE: elapse1 covers only the read above, not the write-back at the end
elapse1 = timer()-start1

# apply Kmeans
start2 = timer()
inert_best = np.inf
y_kmeans = None  # stays None if no run ever beats the initial +inf
for i_repeat in range(n_repeat):
    # manually guess initial clusters (to compare with C):
    # row i_repeat of GUESS holds the sample indices used as initial centers
    initial_idx = GUESS[i_repeat, :]
    initial_position = X[initial_idx, :]
    # n_init=1 because the starting centers are given explicitly.
    # NOTE(review): newer scikit-learn releases renamed algorithm 'full' to
    # 'lloyd'; switch the value when running against such a version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=1, init=initial_position,
                    algorithm='full', tol=1e-4)
    kmeans.fit(X)

    # keep the run with the smallest inertia
    if kmeans.inertia_ < inert_best:
        inert_best = kmeans.inertia_
        y_kmeans = kmeans.labels_

elapse2 = timer()-start2

# Fail with a clear message instead of a NameError at the write-back
if y_kmeans is None:
    raise RuntimeError(
        "no K-means run produced a finite inertia "
        "(N_repeat = {0}); nothing to write back".format(n_repeat))

# write results back
with Dataset(dirname+filename, mode='r+') as dset:
    dset["Y_Py"][:] = y_kmeans
    dset["INERT_Py"][:] = inert_best

# summary
print("final inertia:", inert_best)
print("Kmean time use (ms):", elapse2*1e3)
print("I/O time use (ms):", elapse1*1e3)
43 changes: 43 additions & 0 deletions Parallel_Algorithm/python_reference/IO_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import numpy as np
import xarray as xr

def Raw_to_NetCDF(X,ind,filename,y_true=None,feature_names=None):
    """Write a K-means test case to a NetCDF file.

    The file holds the data points, the initial-guess indices, the optional
    true labels and feature names, plus zero-filled placeholders
    (``Y_Py``, ``Y_C``, ``INERT_Py``, ``INERT_C``) for results to be
    written into later.

    Parameters
    ----------
    X : array-like, 2-D (N_samples, N_features)
        Data points; stored as float32.
    ind : array, 2-D (N_repeat, N_clusters)
        Indices of data points used as initial cluster centers, one row
        per repeat. Stored as given.
    filename : str
        Path of the NetCDF file to create.
    y_true : array-like of length N_samples, optional
        True label of each data point; stored as int32. Defaults to zeros.
    feature_names : array-like of length N_features, optional
        Meaning of each feature. Defaults to ``0 .. N_features-1`` (int32).
    """
    # Coerce first so that nested lists work as well as ndarrays.
    X = np.asarray(X, dtype=np.float32)
    N_samples, N_features = X.shape

    if feature_names is None:
        feature_names = np.arange(N_features, dtype=np.int32)
    if y_true is None:
        y_true = np.zeros(N_samples, dtype=np.int32)

    ds = xr.Dataset()
    try:
        ds['X'] = (['N_samples', 'N_features'], X)
        ds['X'].attrs["long_name"] = "data points"

        ds['GUESS'] = (['N_repeat', 'N_clusters'], ind)
        ds['GUESS'].attrs["long_name"] = "indices of data points as initial guess of cluster centers"
        ds['GUESS'].attrs["purpose"] = "make sure that C and python use the same initial starting points"

        ds['Y_TRUE'] = (['N_samples'], np.asarray(y_true, dtype=np.int32))
        ds['Y_TRUE'].attrs["long_name"] = "(optional) true label of each data point"

        # Each placeholder gets its own zero array so the variables never
        # share one underlying buffer.
        ds['Y_Py'] = (['N_samples'], np.zeros(N_samples, dtype=np.int32))
        ds['Y_Py'].attrs["long_name"] = "labels predicted by python Kmean function"

        ds['Y_C'] = (['N_samples'], np.zeros(N_samples, dtype=np.int32))
        ds['Y_C'].attrs["long_name"] = "labels predicted by C implementation"
        ds['Y_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"

        # Scalar placeholders for the inertia values.
        ds['INERT_Py'] = np.float32(0.0)
        ds['INERT_Py'].attrs["long_name"] = (
            "kmeans.inertia_ in python code, "
            "i.e. sum of distances between data points and cluster centers")

        ds['INERT_C'] = np.float32(0.0)
        ds['INERT_C'].attrs["long_name"] = "the C version of kmeans.inertia_"
        ds['INERT_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"

        ds['FEATURES'] = (['N_features'], feature_names)
        ds['FEATURES'].attrs["long_name"] = "(optional) the meaning of each feature"

        ds.to_netcdf(filename)
    finally:
        # Release the dataset even if building or writing it failed.
        ds.close()
26 changes: 26 additions & 0 deletions Parallel_Algorithm/python_reference/make_fake_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import numpy as np
# make_blobs is imported from the public sklearn.datasets namespace; the
# private sklearn.datasets.samples_generator path is not available in newer
# scikit-learn releases.
from sklearn.datasets import make_blobs

from IO_util import Raw_to_NetCDF

# Generate a synthetic clustering test case (Gaussian blobs) together with
# several sets of initial-center indices, and store everything in a NetCDF
# file named after the problem size.

N_clusters = 8
N_samples = 20000
N_features = 30
N_repeat = 20

# random_state=0 makes the blob data itself reproducible
X, y = make_blobs(n_samples=N_samples, centers=N_clusters,
                  n_features=N_features, random_state=0,
                  cluster_std=1.0)

# One row per repeat: N_clusters distinct sample indices to use as the
# initial cluster centers.
# NOTE: these come from numpy's global RNG, which is not seeded here, so the
# indices differ between runs even though X and y do not.
initial_ind = np.zeros([N_repeat, N_clusters], dtype=np.int32)

for i in range(N_repeat):
    # an integer population is sampled as if it were np.arange(N_samples)
    initial_ind[i, :] = np.random.choice(N_samples, N_clusters,
                                         replace=False)

dirname = "../test_data/"
filename = "Blobs_smp{0}_fea{1}_cls{2}.nc".format(N_samples, N_features, N_clusters)

Raw_to_NetCDF(X, initial_ind, dirname+filename, y_true=y)

0 comments on commit 7296bc0

Please sign in to comment.