Skip to content

Commit

Permalink
minor bugs to be fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason Song committed Apr 27, 2017
1 parent 2cf87b3 commit 7235bf8
Showing 1 changed file with 340 additions and 0 deletions.
340 changes: 340 additions & 0 deletions Parallel_Algorithm/Cuda/kmeans.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,340 @@
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cassert>
#include <fstream>
#include <string>
#include <vector>
#include <sstream>

using namespace std;

struct Info {
int numPoints;
int dim;
int numCentroids;
int numRepeats;
int *belongs;
float **points;
float **centroids;
int thresholdLoops;
float thresholdFraction;
};

inline int nextPowerOfTwo(int v) {
v--;
v = v >> 1 | v;
v = v >> 2 | v;
v = v >> 4 | v;
v = v >> 8 | v;
v = v >> 16 | v;
return ++v;
}

__host__ __device__ inline static float computeDist(int dim,
int numPoints,
int numCentroids,
float *objects,
float *clusters,
int objectId,
int clusterId) {
float res = 0;

for (int i = 0; i < dim; i++) {
res +=
(objects[numPoints * i + objectId] - clusters[numCentroids * i + clusterId]) *
(objects[numPoints * i + objectId] -
clusters[numCentroids * i + clusterId]);
}
return res;
}

// Use reduction to compute the sum of an array
// Refer to
// http:https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf
__global__ static void reduce(int *g_idata, int l1, int l2) {
extern __shared__ unsigned int sdata[];
unsigned int tid = threadIdx.x;

if (tid < l1) {
sdata[tid] = g_idata[tid];
} else {
sdata[tid] = 0;
}
__syncthreads();

// Parallel Reduction (l2 must be power of 2)
for (unsigned int s = l2 / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}

if (tid == 0) {
g_idata[0] = sdata[0];
}
}

__global__ static void find_nearest_cluster(int dim,
int numPoints,
int numCentroids,
float *objects,
float *deviceClusters,
int *belongs,
int *intermediates) {
extern __shared__ char sharedMemory[];
unsigned char *membershipChanged = (unsigned char *)sharedMemory;
float *clusters = deviceClusters;

membershipChanged[threadIdx.x] = 0;

int objectId = blockDim.x * blockIdx.x + threadIdx.x;

if (objectId < numPoints)
{
int index;
float dist, min_dist;

/*find the cluster id that has min distance to object*/
index = 0;
min_dist = computeDist(dim,
numPoints,
numCentroids,
objects,
clusters,
objectId,
0);

for (int i = 0; i < numCentroids; i++)
{
dist = computeDist(dim,
numPoints,
numCentroids,
objects,
clusters,
objectId,
i);

/* no need square root */
if (dist < min_dist)
{
min_dist = dist;
index = i;
}
}

if (belongs[objectId] != index) {
membershipChanged[threadIdx.x] = 1;
}
belongs[objectId] = index;

__syncthreads();

for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (threadIdx.x < s)
{
membershipChanged[threadIdx.x] += membershipChanged[threadIdx.x + s];
}
__syncthreads();
}

if (threadIdx.x == 0)
{
intermediates[blockIdx.x] = membershipChanged[0];
}
}
}

void processData(char *fileName, Info *info) {
float **X;
int **GUESS;

int N_samples, N_features, N_clusters, N_repeat;

// readX(FILE_NAME,&X,&GUESS,&N_samples,&N_features,&N_clusters,&N_repeat);

// Test purpose
N_samples = 100;
N_features = 9;
N_clusters = 4;
N_repeat = 10;

info->numPoints = N_samples;
info->dim = N_features;
info->numCentroids = N_clusters;
info->numRepeats = N_repeat;
info->thresholdFraction = 0.001;
info->thresholdLoops = 500;

// Process data point
X = new float*[N_samples];
for(int i = 0; i < N_samples; i++) {
X[i] = new float[N_features];
}

string str(fileName);
ifstream file(str);
string line1;
int i = 0;
while (getline(file, line1)) {
std::istringstream iss(line1);
int j = -1;
for(string s; iss >> s; ) {
if (j == -1) {
j++;
continue;
}
// cout << s << " ";
X[i][j] = stof(s);
j++;
}
cout << "\n";
i++;
}
info->points = X;


/* belongs: the cluster id for each data object */
int *belongs = new int[N_samples];

for (i = 0; i < N_samples; i++) belongs[i] = -1;
info->belongs = belongs;
}

float** make2DArray(int x, int y) {
float **res = (float **)malloc(x * sizeof(float *));

for (int i = 0; i < x; i++) {
res[i] = (float *)malloc(y * sizeof(float));
}

for (int i = 0; i < x; i++) {
for (int j = 0; j < y; j++) {
res[i][j] = 0;
}
}
return res;
}

void invert2DArray(float **A, float **B, int x, int y) {
for (int i = 0; i < x; i++) {
for (int j = 0; j < y; j++) {
A[i][j] = B[j][i];
}
}
}

void copy2DArray(float **A, float **B, int x, int y) {
for (int i = 0; i < x; i++) {
for (int j = 0; j < y; j++) {
A[i][j] = B[i][j];
}
}
}

void cudaKmeans(Info *info) {
// Initialization
int numPoints = info->numPoints;
int dim = info->dim;
int numCentroids = info->numCentroids;
int thresholdLoops = info->thresholdLoops;
int thresholdFraction = info->thresholdFraction;
int* belongs = info->belongs;
float **points = info->points;
float **centroids = info->centroids;


// invert
float **iPoints = make2DArray(dim, numPoints);
invert2DArray(iPoints, points, dim, numPoints);

// initial guess
float **iCentroids = make2DArray(numCentroids, numPoints);
copy2DArray(iCentroids, points, numCentroids, numPoints);

// centroid -> number of points
int *pointsCount = new int[numCentroids];
float **iNewCentroids = make2DArray(numCentroids, numPoints);

// Some cuda constants
const unsigned int bthreads = 32;
const unsigned int l1 = (numPoints + bthreads - 1) / bthreads;
const unsigned int l2 = nextPowerOfTwo(l1);
const unsigned int sdsize1 = bthreads * sizeof(unsigned char); // shared
// memory size
const unsigned int sdsize2 = l2 * sizeof(unsigned int); // shared
// memory size

// Cuda device Initialization
float *gPoints;
float *gCentroids;
int *gBelongs;
int *tmp;
cudaMalloc(&gPoints, numPoints * dim * sizeof(float));
cudaMalloc(&gCentroids, numCentroids * dim * sizeof(float));
cudaMalloc(&gBelongs, numPoints * sizeof(int));
cudaMalloc(&tmp, l2 * sizeof(unsigned int)); // For reduction
cudaMemcpy(gBelongs,
belongs,
numPoints * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(gPoints,
iPoints[0],
numPoints * dim * sizeof(float),
cudaMemcpyHostToDevice);

int count = 0;
float frac = 0.0;
do {
cudaMemcpy(gCentroids, iCentroids[0], numCentroids * dim * sizeof(float), cudaMemcpyHostToDevice);

find_nearest_cluster <<< l1, bthreads, sdsize1 >>>
(dim, numPoints, numCentroids, gPoints, gCentroids, gBelongs, tmp);
cudaDeviceSynchronize();

reduce <<< 1, l2, sdsize2 >>> (tmp, l1, l2);
cudaDeviceSynchronize();

int d;
cudaMemcpy(&d, tmp, sizeof(int), cudaMemcpyDeviceToHost);
frac = (float)d;

cudaMemcpy(belongs, gBelongs, numPoints * sizeof(int), cudaMemcpyDeviceToHost);

// Add up objects coordinates for each centroid
for (int i = 0; i < numPoints; i++) {
int idx = belongs[i];
pointsCount[idx] += 1;

for (int j = 0; j < dim; j++) {
iNewCentroids[j][idx] += points[i][j];
}
}

// Update centroids
for (int i = 0; i < numCentroids; i++) {
for (int j = 0; j < dim; j++) {
if (pointsCount[i] > 0) {
iCentroids[j][i] = iNewCentroids[j][i] / pointsCount[i];
}
iNewCentroids[j][i] = 0.0;
}
pointsCount[i] = 0;
}
frac /= numPoints;
count++;
} while (frac > thresholdFraction && count < 500);

invert2DArray(centroids, iCentroids, numCentroids, dim);

}

int main(int argc, char *argv[]) {
Info *info = new Info;
char *fileName = argv[1];

processData(fileName, info);
cudaKmeans(info);
}

0 comments on commit 7235bf8

Please sign in to comment.