Skip to content

Commit

Permalink
cluster SSW data by C
Browse files Browse the repository at this point in the history
  • Loading branch information
JiaweiZhuang committed Apr 29, 2017
1 parent 0e45d12 commit bf22487
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 3 deletions.
9 changes: 6 additions & 3 deletions Parallel_Algorithm/OpenMP/Kmean_omp.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
#include "../shared/math_util.h"

/* This is the name of the data file we will read. */
#define FILE_NAME "../test_data/Blobs_smp20000_fea30_cls8.nc"
//#define FILE_NAME "../test_data/Blobs_smp20000_fea30_cls8.nc"
#define FILE_NAME "../../Data_Analysis/data/SSWdata.nc"
#define TOL 0.0001
#define MAX_ITER 100

Expand Down Expand Up @@ -93,9 +94,11 @@ int main() {
#pragma omp for schedule(static)
for (i = 0; i < N_samples; i++) {
k_best = 0;//assume cluster no.0 is the nearest
dist_min = distance(N_features, X[i], old_cluster_centers[k_best]);
//dist_min = distance(N_features, X[i], old_cluster_centers[k_best]);
dist_min = correlation(N_features, X[i], old_cluster_centers[k_best]);
for (k = 1; k < N_clusters; k++){
dist = distance(N_features, X[i], old_cluster_centers[k]);
//dist = distance(N_features, X[i], old_cluster_centers[k]);
dist = correlation(N_features, X[i], old_cluster_centers[k]);
if (dist < dist_min){
dist_min = dist;
k_best = k;
Expand Down
18 changes: 18 additions & 0 deletions Parallel_Algorithm/python_reference/check_SSWdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import sys
import numpy as np
from IO_util import Raw_to_NetCDF
import xarray as xr

# Compare the cluster labels stored in SSWdata.nc: the reference labels
# ("Y_TRUE") against the labels stored under "Y_C".
dirname = '../../Data_Analysis/data/'
filename='SSWdata.nc'

# Open the dataset in a context manager so the underlying NetCDF file handle
# is released even if one of the lookups below raises (e.g. missing "Y_C").
with xr.open_dataset(dirname+filename) as ds:
    y_true = ds["Y_TRUE"]
    y_c = ds["Y_C"]

    print('total data size',y_true.size)
    # NOTE(review): summing the labels only equals the size of the 2nd
    # cluster if the labels are 0/1 -- confirm against the file's writer.
    print('size of 2nd cluster by MATLAB',y_true.sum())
    print('size of 2nd cluster by C',y_c.sum())

    # Element-wise comparison of the two label arrays.
    # NOTE(review): if the two labelings number their clusters in the
    # opposite order, every sample will be reported as a mismatch.
    mismatch = (y_true.values != y_c.values)
    print("inconsistent labels: ",mismatch.sum())
42 changes: 42 additions & 0 deletions Parallel_Algorithm/python_reference/convert_SSWdata.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import sys
import numpy as np
from IO_util import Raw_to_NetCDF

# Convert the raw SSW binary files (data points + labels) into a single
# NetCDF file via Raw_to_NetCDF.

# Dimensions used to reshape the flat data array read from SSWdata.bin.
ndata = 17878
nfeatures = 252

dirname = '../../Data_Analysis/data/'

# Read data points.
# np.fromfile with no dtype reads float64. Each file is opened in a `with`
# block so the handle is closed as soon as the array has been loaded.
with open(dirname+'SSWdata.bin','rb') as data_file:
    X = np.fromfile(data_file)
if sys.byteorder=='little':
    # Swap bytes in place on little-endian hosts.
    # NOTE(review): presumably the file was written big-endian -- confirm.
    X.byteswap(True)
X = X.reshape(ndata,nfeatures)

# Read python label
# NOTE(review): Y_py is loaded but not used further down in this script.
with open(dirname+'Label_py.bin','rb') as label_py_file:
    Y_py = np.fromfile(label_py_file,np.int32)
if sys.byteorder=='little':
    Y_py.byteswap(True)

# Read matlab label
# NOTE(review): unlike the two reads above, no byte swap is applied here;
# presumably this file is already in native byte order -- confirm.
with open(dirname+'Label_matlab.bin','rb') as label_matlab_file:
    Y_matlab = np.fromfile(label_matlab_file,np.int32)
Y_matlab -= 1 # 1~2 to 0~1

# ========================
# convert the NetCDF format
# ========================
N_clusters = 2
N_samples = ndata
N_features = nfeatures
N_repeat = 20

# For each of the N_repeat rows, draw N_clusters distinct sample indices.
# The generator is not seeded, so the indices differ from run to run.
initial_ind = np.zeros([N_repeat,N_clusters],dtype=np.int32)
for i in range(N_repeat):
    initial_ind[i,:] = np.random.choice(np.arange(N_samples),
                                        N_clusters,replace=False)

filename='SSWdata.nc'
Raw_to_NetCDF(X,initial_ind,dirname+filename,y_true=Y_matlab)

0 comments on commit bf22487

Please sign in to comment.