Improve python reference code to better interface with C

JiaweiZhuang · Apr 16, 2017 · 7296bc0 · 7296bc0
1 parent 2d14e49
commit 7296bc0
Show file tree

Hide file tree

Showing 4 changed files with 120 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 *.out
 *.nc
+__pycache__
diff --git a/Parallel_Algorithm/python_reference/Apply_Kmean.py b/Parallel_Algorithm/python_reference/Apply_Kmean.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import numpy as np
+import xarray as xr
+from netCDF4 import Dataset
+from timeit import default_timer as timer
+from sklearn.cluster import KMeans
+
+# Location of the NetCDF test case that is shared with the C implementation.
+dirname = "../test_data/"
+filename = "Blobs_smp20000_fea30_cls8.nc"
+
+# Read the samples and the stored initial guesses from the nc file.
+# `.values` loads the arrays into memory, so they stay usable after the
+# dataset is closed at the end of the `with` block.
+start1 = timer()
+with xr.open_dataset(dirname+filename) as ds:
+    n_clusters = ds.dims["N_clusters"]
+    n_repeat = ds.dims["N_repeat"]
+    X = ds["X"].values
+    GUESS = ds["GUESS"].values
+
+elapse1 = timer()-start1
+
+# Apply K-means once per stored initial guess and keep the best run
+# (the one with the lowest inertia).
+start2 = timer()
+inert_best = np.inf
+y_kmeans = None
+for i_repeat in range(n_repeat):
+    # Manually pick the initial cluster centers from the stored sample
+    # indices, so that the C code can start from exactly the same points.
+    initial_idx = GUESS[i_repeat,:]
+    initial_position = X[initial_idx,:]
+    # NOTE(review): newer scikit-learn releases appear to name this
+    # algorithm 'lloyd' instead of 'full' -- confirm against the installed
+    # version before upgrading.
+    kmeans = KMeans(n_clusters=n_clusters,n_init=1,init=initial_position,
+                    algorithm='full',tol=1e-4)
+    kmeans.fit(X)
+
+    if kmeans.inertia_ < inert_best:
+        inert_best = kmeans.inertia_
+        y_kmeans = kmeans.labels_
+
+elapse2 = timer()-start2
+
+# With N_repeat == 0 (or no run with a finite inertia) there are no labels
+# to store; fail with a clear message instead of a NameError further down.
+if y_kmeans is None:
+    raise ValueError("no K-means run produced a result; "
+                     "check N_repeat and GUESS in "+dirname+filename)
+
+# Write the best labels and inertia back into the same file.
+with Dataset(dirname+filename,mode='r+') as dset:
+    dset["Y_Py"][:] = y_kmeans
+    dset["INERT_Py"][:] = inert_best
+
+# Summary. Note that the I/O time only covers reading the input.
+print("final inertia:",inert_best)
+print("Kmean time use (ms):",elapse2*1e3)
+print("I/O time use (ms):",elapse1*1e3)
diff --git a/Parallel_Algorithm/python_reference/IO_util.py b/Parallel_Algorithm/python_reference/IO_util.py
@@ -0,0 +1,43 @@
+import numpy as np
+import xarray as xr
+
+def Raw_to_NetCDF(X,ind,filename,y_true=None,feature_names=None):
+    """Write a K-means test case to a NetCDF file.
+
+    The file holds the input data, the initial guesses and zero-filled
+    placeholders for the Python and C results.
+
+    Parameters
+    ----------
+    X : 2-D array, shape (N_samples, N_features)
+        Data points; stored as float32.
+    ind : 2-D array-like, shape (N_repeat, N_clusters)
+        Indices of the data points used as initial cluster centers;
+        stored as int32.
+    filename : str
+        Path of the NetCDF file to create.
+    y_true : 1-D array-like, shape (N_samples,), optional
+        True label of each data point; all zeros when omitted.
+    feature_names : 1-D array-like, shape (N_features,), optional
+        Meaning of each feature; 0 .. N_features-1 when omitted.
+    """
+    N_samples,N_features = X.shape
+    if feature_names is None:
+        feature_names = np.arange(N_features,dtype=np.int32)
+    if y_true is None:
+        y_true = np.zeros(N_samples,dtype=np.int32)
+
+    ds = xr.Dataset()
+    try:
+        ds['X'] = (['N_samples', 'N_features'], np.asarray(X,dtype=np.float32))
+        ds['X'].attrs["long_name"] = "data points"
+
+        # Cast explicitly like every other variable, so the on-disk integer
+        # width does not depend on the caller's default integer dtype.
+        ds['GUESS'] = (['N_repeat', 'N_clusters'], np.asarray(ind,dtype=np.int32))
+        ds['GUESS'].attrs["long_name"] = "indices of data points as initial guess of cluster centers"
+        ds['GUESS'].attrs["purpose"] = "make sure that C and python use the same initial starting points"
+
+        ds['Y_TRUE'] = (['N_samples'], np.asarray(y_true,dtype=np.int32))
+        ds['Y_TRUE'].attrs["long_name"] = "(optional) true label of each data point"
+
+        # Each placeholder gets its own zero array rather than one shared
+        # buffer, so the variables can never alias each other.
+        ds['Y_Py'] = (['N_samples'], np.zeros(N_samples,dtype=np.int32))
+        ds['Y_Py'].attrs["long_name"] = "labels predicted by python Kmean function"
+
+        ds['Y_C'] = (['N_samples'], np.zeros(N_samples,dtype=np.int32))
+        ds['Y_C'].attrs["long_name"] = "labels predicted by C implementation"
+        ds['Y_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"
+
+        ds['INERT_Py'] = np.float32(0.0)
+        ds['INERT_Py'].attrs["long_name"] = (
+            "kmeans.inertia_ in python code, "
+            "i.e. sum of squared distances between data points and cluster centers")
+
+        ds['INERT_C'] = np.float32(0.0)
+        ds['INERT_C'].attrs["long_name"] = "the C version of kmeans.inertia_"
+        ds['INERT_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"
+
+        ds['FEATURES'] = (['N_features'], feature_names)
+        ds['FEATURES'].attrs["long_name"] = "(optional) the meaning of each feature"
+
+        ds.to_netcdf(filename)
+    finally:
+        # Close the dataset even when building or writing it fails.
+        ds.close()
diff --git a/Parallel_Algorithm/python_reference/make_fake_data.py b/Parallel_Algorithm/python_reference/make_fake_data.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+
+import numpy as np
+# `make_blobs` is imported from the public `sklearn.datasets` namespace; the
+# private `samples_generator` module is not available in newer releases.
+from sklearn.datasets import make_blobs
+
+from IO_util import Raw_to_NetCDF
+
+# Size of the synthetic test case.
+N_clusters = 8
+N_samples = 20000
+N_features = 30
+N_repeat = 20
+
+# Gaussian blobs with a fixed seed, so the data is the same on every run.
+X, y = make_blobs(n_samples=N_samples, centers=N_clusters,
+                  n_features=N_features,random_state=0,
+                  cluster_std=1.0)
+
+# For every repeat, pick N_clusters distinct samples as the initial centers.
+# A seeded generator keeps the guesses reproducible, like the blobs above.
+rng = np.random.RandomState(1)
+initial_ind = np.zeros([N_repeat,N_clusters],dtype=np.int32)
+
+for i in range(N_repeat):
+    initial_ind[i,:] = rng.choice(N_samples,N_clusters,replace=False)
+
+dirname = "../test_data/"
+filename = "Blobs_smp{0}_fea{1}_cls{2}.nc".format(N_samples,N_features,N_clusters)
+
+# The data directory only holds git-ignored *.nc files, so it may not exist
+# yet; create it before writing.
+os.makedirs(dirname,exist_ok=True)
+
+Raw_to_NetCDF(X,initial_ind,dirname+filename,y_true=y)