Skip to content

Commit

Permalink
Add timing results
Browse files (browse the repository at this point in the history)
  • Loading branch information
Jason Song committed May 1, 2017
1 parent 3ac51ff commit 0be8075
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 2 deletions.
10 changes: 8 additions & 2 deletions Parallel_Algorithm/Cuda/kmeans_cdf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ using namespace std;
#define ERRCODE 2
#define ERR(e) {printf("Error: %s\n", nc_strerror(e)); exit(ERRCODE);}

-double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart5;
+double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart4d, iStart5;
double iElaps1=0, iElaps2=0, iElaps3a=0, iElaps3b=0, iElaps4=0, iElaps5=0;
// Hold configurations for Kmeans
struct Info {
Expand Down Expand Up @@ -299,6 +299,8 @@ void cudaKmeans(Info *info) {
float **guess = info->guess;
int threadPerBlock = info->threadPerBlock;

+iStart4d = cpuSecond();
+
// invert (transpose matrix)
float **iPoints = make2DArray(dim, numPoints);
invert2DArray(iPoints, points, dim, numPoints);
Expand All @@ -312,6 +314,8 @@ void cudaKmeans(Info *info) {
int *pointsCount = new int[numCentroids];
float **iNewCentroids = make2DArray(dim, numCentroids);

+iElaps4 += cpuSecond() - iStart4d;
+
// Some cuda constants
const unsigned int bthreads = threadPerBlock;
const unsigned int l1 = (numPoints + bthreads - 1) / bthreads;
Expand Down Expand Up @@ -415,9 +419,11 @@ void cudaKmeans(Info *info) {

}

+iStart4d = cpuSecond();
centroids = make2DArray(numCentroids, dim);
invert2DArray(centroids, iCentroids, numCentroids, dim);
info->centroids = centroids;
+iElaps4 += cpuSecond() - iStart4d;

// Free device memory
cudaFree(gPoints);
Expand Down Expand Up @@ -461,5 +467,5 @@ int main(int argc, char *argv[]) {
cout << "M-step-1st-half time use (ms): " << iElaps3a*1000 << "\n";
cout << "M-step-2nd-half time use (ms): " << iElaps3b*1000 << "\n";
cout << "Cuda Data IO (ms): " << iElaps4*1000 << "\n";
-cout << "Other (ms): " << iElaps5*1000 << "\n";
+cout << "Check Convergence (ms): " << iElaps5*1000 << "\n";
}
98 changes: 98 additions & 0 deletions Timing_Results/log/Blobs_Cuda.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@

=========================================
=========================================
testing with 1 threads per block on device
Number of samples: 20000
Number of features: 30
Number of clusters: 8
Number of repeated runs: 20
Total time: 783.192
E-step time use (ms): 356.685
M-step-1st-half time use (ms): 119.512
M-step-2nd-half time use (ms): 0.067234
Cuda Data IO (ms): 302.046
Check Convergence (ms): 0.521421

=========================================
=========================================
testing with 2 threads per block on device
Number of samples: 20000
Number of features: 30
Number of clusters: 8
Number of repeated runs: 20
Total time: 636.066
E-step time use (ms): 202.417
M-step-1st-half time use (ms): 132.744
M-step-2nd-half time use (ms): 0.0743866
Cuda Data IO (ms): 294.903
Check Convergence (ms): 0.592709

=========================================
=========================================
testing with 4 threads per block on device
Number of samples: 20000
Number of features: 30
Number of clusters: 8
Number of repeated runs: 20
Total time: 547.731
E-step time use (ms): 108.101
M-step-1st-half time use (ms): 138.242
M-step-2nd-half time use (ms): 0.0698566
Cuda Data IO (ms): 295.276
Check Convergence (ms): 0.645161

=========================================
=========================================
testing with 8 threads per block on device
Number of samples: 20000
Number of features: 30
Number of clusters: 8
Number of repeated runs: 20
Total time: 541.216
E-step time use (ms): 69.1309
M-step-1st-half time use (ms): 169.611
M-step-2nd-half time use (ms): 0.089407
Cuda Data IO (ms): 296.227
Check Convergence (ms): 0.853777

=========================================
=========================================
testing with 16 threads per block on device
Number of samples: 20000
Number of features: 30
Number of clusters: 8
Number of repeated runs: 20
Total time: 511.788
E-step time use (ms): 38.1258
M-step-1st-half time use (ms): 171.403
M-step-2nd-half time use (ms): 0.0870228
Cuda Data IO (ms): 295.944
Check Convergence (ms): 0.88644

=========================================
=========================================
testing with 32 threads per block on device
Number of samples: 20000
Number of features: 30
Number of clusters: 8
Number of repeated runs: 20
Total time: 2496.99
E-step time use (ms): 245.165
M-step-1st-half time use (ms): 1893.42
M-step-2nd-half time use (ms): 0.981092
Cuda Data IO (ms): 322.414
Check Convergence (ms): 27.8046

=========================================
=========================================
testing with 64 threads per block on device
Number of samples: 20000
Number of features: 30
Number of clusters: 8
Number of repeated runs: 20
Total time: 2437.49
E-step time use (ms): 162.151
M-step-1st-half time use (ms): 1918.99
M-step-2nd-half time use (ms): 0.959396
Cuda Data IO (ms): 321.023
Check Convergence (ms): 27.1459

0 comments on commit 0be8075

Please sign in to comment.