-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve python reference code to better interface with C
- Loading branch information
1 parent
2d14e49
commit 7296bc0
Showing
4 changed files
with
120 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
*.out | ||
*.nc | ||
__pycache__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
import numpy as np | ||
import xarray as xr | ||
from netCDF4 import Dataset | ||
from timeit import default_timer as timer | ||
from sklearn.cluster import KMeans | ||
|
||
# Location of the NetCDF test file shared with the C implementation.
dirname = "../test_data/"
filename = "Blobs_smp20000_fea30_cls8.nc"

# read data from nc file
start1 = timer()
with xr.open_dataset(dirname+filename) as ds:
    # Dataset.sizes is the dimension-name -> length mapping; using
    # Dataset.dims for this purpose is deprecated in xarray.
    n_clusters = ds.sizes["N_clusters"]
    n_features = ds.sizes["N_features"]
    n_repeat = ds.sizes["N_repeat"]
    # .values loads the arrays into memory before the file is closed.
    X = ds["X"].values
    GUESS = ds["GUESS"].values
del ds

elapse1 = timer()-start1

# apply Kmeans
start2 = timer()
inert_best = np.inf
y_kmeans = None
for i_repeat in range(n_repeat):
    # manually guess initial clusters (to compare with C)
    initial_idx = GUESS[i_repeat, :]
    initial_position = X[initial_idx, :]
    # 'lloyd' is the current name of the classic EM-style algorithm that
    # used to be selected with algorithm='full'.
    kmeans = KMeans(n_clusters=n_clusters, n_init=1, init=initial_position,
                    algorithm='lloyd', tol=1e-4)
    kmeans.fit(X)

    # keep the labelling of the run with the lowest inertia
    if kmeans.inertia_ < inert_best:
        inert_best = kmeans.inertia_
        y_kmeans = kmeans.labels_

elapse2 = timer()-start2

if y_kmeans is None:
    # Happens when N_repeat is 0 (or no run produced a finite inertia);
    # fail clearly instead of writing garbage or raising NameError below.
    raise RuntimeError("no K-means run produced a result; check N_repeat "
                       "in " + dirname + filename)

# write results back
with Dataset(dirname+filename, mode='r+') as dset:
    dset["Y_Py"][:] = y_kmeans
    dset["INERT_Py"][:] = inert_best

# summary
print("final inertia:", inert_best)
print("Kmean time use (ms):", elapse2*1e3)
print("I/O time use (ms):", elapse1*1e3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import numpy as np | ||
import xarray as xr | ||
|
||
def Raw_to_NetCDF(X,ind,filename,y_true=None,feature_names=None):
    """Write a clustering test case to a NetCDF file.

    The file holds the input data, the initial-guess indices, and
    zero-filled placeholder variables for the Python and C results.

    Parameters
    ----------
    X : 2-D array, (N_samples, N_features)
        Data points; stored as float32 in variable ``X``.
    ind : 2-D array, (N_repeat, N_clusters)
        Indices of data points used as initial cluster centers; stored
        as given in variable ``GUESS``.
    filename : str
        Path of the NetCDF file to create.
    y_true : 1-D array of length N_samples, optional
        True label of each data point, stored as int32 in ``Y_TRUE``.
        Defaults to all zeros.
    feature_names : 1-D array of length N_features, optional
        Stored in ``FEATURES``. Defaults to ``0 .. N_features-1``.

    Returns
    -------
    None
    """
    N_samples, N_features = X.shape
    label_zero = np.zeros(N_samples, dtype=np.int32)
    if feature_names is None:
        feature_names = np.arange(N_features, dtype=np.int32)
    if y_true is None:
        y_true = label_zero

    ds = xr.Dataset()
    ds['X'] = (['N_samples', 'N_features'], np.float32(X))
    ds['X'].attrs["long_name"] = "data points"

    ds['GUESS'] = (['N_repeat', 'N_clusters'], ind)
    ds['GUESS'].attrs["long_name"] = "indices of data points as initial guess of cluster centers"
    ds['GUESS'].attrs["purpose"] = "make sure that C and python use the same initial starting points"

    ds['Y_TRUE'] = (['N_samples'], np.int32(y_true))
    ds['Y_TRUE'].attrs["long_name"] = "(optional) true label of each data point"

    # Placeholder: filled in later by the Python reference script.
    ds['Y_Py'] = (['N_samples'], label_zero)
    ds['Y_Py'].attrs["long_name"] = "labels predicted by python Kmean function"

    # Placeholder for the C implementation's labels.
    ds['Y_C'] = (['N_samples'], label_zero)
    ds['Y_C'].attrs["long_name"] = "labels predicted by C implementation"
    ds['Y_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"

    # kmeans.inertia_ is the sum of *squared* distances to the closest
    # center, so the description says so explicitly.
    ds['INERT_Py'] = np.float32(0.0)
    ds['INERT_Py'].attrs["long_name"] = "kmeans.inertia_ in python code, "+\
        "i.e. sum of squared distances between data points and their closest cluster centers"

    ds['INERT_C'] = np.float32(0.0)
    ds['INERT_C'].attrs["long_name"] = "the C version of kmeans.inertia_"
    ds['INERT_C'].attrs["purpose"] = "make sure that C implementation gives the same result as python"

    ds['FEATURES'] = (['N_features'], feature_names)
    ds['FEATURES'].attrs["long_name"] = "(optional) the meaning of each feature"

    ds.to_netcdf(filename)
    ds.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/usr/bin/env python3 | ||
# -*- coding: utf-8 -*- | ||
|
||
import numpy as np
# make_blobs is part of the public sklearn.datasets namespace; the private
# sklearn.datasets.samples_generator module no longer exists in current
# scikit-learn releases.
from sklearn.datasets import make_blobs
from IO_util import Raw_to_NetCDF

# Size of the synthetic test case.
N_clusters = 8
N_samples = 20000
N_features = 30
N_repeat = 20

X, y = make_blobs(n_samples=N_samples, centers=N_clusters,
                  n_features=N_features, random_state=0,
                  cluster_std=1.0)

# One row of initial-center indices per repeat, so that the C and Python
# implementations can start from the same data points.
initial_ind = np.zeros([N_repeat, N_clusters], dtype=np.int32)

for i in range(N_repeat):
    # Passing an int draws from range(N_samples); replace=False keeps the
    # chosen centers distinct within a repeat.
    initial_ind[i, :] = np.random.choice(N_samples, N_clusters,
                                         replace=False)

dirname = "../test_data/"
filename = "Blobs_smp{0}_fea{1}_cls{2}.nc".format(N_samples, N_features, N_clusters)

Raw_to_NetCDF(X, initial_ind, dirname+filename, y_true=y)