Skip to content

Commit

Permalink
serial K-mean now works correctly. Better I/O TBD
Browse files Browse the repository at this point in the history
  • Loading branch information
JiaweiZhuang committed Apr 6, 2017
1 parent c54e15e commit 2d14e49
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 6 deletions.
99 changes: 94 additions & 5 deletions Parallel_Algorithm/OpenMP/Kmean_seq.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,109 @@ int readX(float*** p_X,size_t* p_N_samples,size_t* p_N_features) {
return 0;
}

// Squared Euclidean distance between the vectors x1[N_features] and
// x2[N_features]: the sum over all features of the squared difference.
// No square root is taken. Returns 0 when N_features <= 0.
float distance(int N_features,float *x1,float *x2){
    float sum_sq = 0.0;
    int idx = 0;

    while (idx < N_features) {
        float delta = x1[idx] - x2[idx];
        sum_sq += delta * delta;
        idx++;
    }
    return sum_sq;
}


/*
 * Serial K-means driver.
 *
 * Loads the sample matrix through readX(), seeds the cluster centers with the
 * first N_clusters samples, then runs a fixed number of K-means steps and
 * prints the label of every sample after each step.
 *
 * Returns 0 on success, 1 when the input cannot be read, holds fewer samples
 * than clusters, or an allocation fails.
 */
int main() {
    enum { N_STEPS = 10 };  // fixed number of K-means iterations

    int N_clusters = 3;
    size_t N_samples = 0, N_features = 0;
    // i for samples; j for features; k for clusters (typically)
    size_t i, j;
    int k, k_best;
    float **X = NULL;
    float dist, dist_min;
    int status = 1;  // switched to 0 only after every step has completed

    // each data point belongs to which cluster
    // values range from 0 to N_clusters-1
    int *labels = NULL;

    // how many data points are in each cluster;
    // needed to turn the per-cluster sums into means
    int *cluster_sizes = NULL;

    // The position of each cluster center.
    // Two arrays are needed as we are calculating the distance to the
    // old centers and accumulating the new centers in the same iteration.
    float **old_cluster_centers = NULL;
    float **new_cluster_centers = NULL;

    // get input data and its size
    // NOTE(review): any non-zero return is treated as failure -- confirm
    // against readX's error convention.
    if (readX(&X, &N_samples, &N_features) != 0) {
        fprintf(stderr, "failed to read the input data\n");
        return 1;
    }

    // The initial centers are copied from X[0..N_clusters-1], so at least
    // that many samples must exist.
    if (N_samples < (size_t)N_clusters) {
        fprintf(stderr, "need at least %d samples, got %zu\n",
                N_clusters, N_samples);
        return 1;
    }

    // check the input data: print the first and the last sample.
    // The stride is forced to be at least 1 so that a single-sample input
    // cannot make the loop spin forever.
    size_t stride = (N_samples > 1) ? N_samples - 1 : 1;
    for (i = 0; i < N_samples; i += stride) {
        printf("no.%zu ", i + 1);
        for (j = 0; j < N_features; j++)
            printf("%f ", X[i][j]);
        printf("\n");
    }

    labels = malloc(N_samples * sizeof *labels);
    cluster_sizes = malloc((size_t)N_clusters * sizeof *cluster_sizes);
    old_cluster_centers = Make2DFloatArray(N_clusters, N_features);
    new_cluster_centers = Make2DFloatArray(N_clusters, N_features);
    if (!labels || !cluster_sizes ||
        !old_cluster_centers || !new_cluster_centers) {
        fprintf(stderr, "out of memory\n");
        goto cleanup;
    }

    // guess initial centers
    // use the top elements (random guess TBD)
    for (k = 0; k < N_clusters; k++) {
        cluster_sizes[k] = 0;  // for accumulating

        for (j = 0; j < N_features; j++) {
            old_cluster_centers[k][j] = X[k][j];
            // set the "new" array to 0 for accumulating
            new_cluster_centers[k][j] = 0.0;
        }
    }

    // K-means stepping begins here!!
    for (int step = 0; step < N_STEPS; step++) {

        // E-Step: assign points to the nearest cluster center
        for (i = 0; i < N_samples; i++) {

            k_best = 0;  // assume cluster no.0 is the nearest
            dist_min = distance((int)N_features, X[i],
                                old_cluster_centers[k_best]);
            for (k = 1; k < N_clusters; k++) {
                dist = distance((int)N_features, X[i],
                                old_cluster_centers[k]);
                if (dist < dist_min) {
                    dist_min = dist;
                    k_best = k;
                }
            }
            labels[i] = k_best;

            // M-Step: set the cluster centers to the mean
            cluster_sizes[k_best]++;  // add one more point to this cluster
            // As the total number of samples in each cluster is not known
            // yet, here we are just calculating the sum, not the mean.
            for (j = 0; j < N_features; j++)
                new_cluster_centers[k_best][j] += X[i][j];
        }

        // M-Step-continued: convert the sum to the mean
        for (k = 0; k < N_clusters; k++) {
            for (j = 0; j < N_features; j++) {
                // An empty cluster keeps its previous center; this also
                // avoids a divide-by-zero.
                if (cluster_sizes[k] > 0)
                    old_cluster_centers[k][j] =
                        new_cluster_centers[k][j] / cluster_sizes[k];

                new_cluster_centers[k][j] = 0.0;  // for the next iteration
            }
            cluster_sizes[k] = 0;  // for the next iteration
        }

        // check the classification results
        printf("\n step %d, labels: \n", step);
        for (i = 0; i < N_samples; i++) {
            printf("%d ", labels[i]);
        }

    }  // end of K-means stepping

    status = 0;

cleanup:
    // NOTE(review): X and the two center arrays are not released here because
    // the layout used by readX / Make2DFloatArray is not visible from this
    // function; they are reclaimed at process exit.
    free(cluster_sizes);
    free(labels);
    return status;
}
5 changes: 4 additions & 1 deletion Parallel_Algorithm/OpenMP/compile.sh
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
# Optimized build of the serial K-means program.
# -lnetcdf is placed after the source file: the linker processes its inputs
# in command-line order, so a library named before the code that uses it may
# not be searched for those symbols.
gcc -O2 -std=c99 Kmean_seq.c -o Kmean_seq.out -lnetcdf

#For debugging with gdb
#gcc -g -O0 -std=c99 Kmean_seq.c -o Kmean_seq.out -lnetcdf

0 comments on commit 2d14e49

Please sign in to comment.