Add timing results

JiaweiZhuang · May 1, 2017 · 0be8075
1 parent 3ac51ff
commit 0be8075
Show file tree

Hide file tree

Showing 2 changed files with 106 additions and 2 deletions.
diff --git a/Parallel_Algorithm/Cuda/kmeans_cdf.cu b/Parallel_Algorithm/Cuda/kmeans_cdf.cu
@@ -17,7 +17,7 @@ using namespace std;
 #define ERRCODE 2
 #define ERR(e) {printf("Error: %s\n", nc_strerror(e)); exit(ERRCODE);}
 
-double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart5;
+double iStart1, iStart2, iStart3a, iStart3b, iStart4a, iStart4b, iStart4c, iStart4d, iStart5;
 double iElaps1=0, iElaps2=0, iElaps3a=0, iElaps3b=0, iElaps4=0, iElaps5=0;
 // Hold configurations for Kmeans
 struct Info {
@@ -299,6 +299,8 @@ void cudaKmeans(Info *info) {
  float **guess = info->guess;
  int threadPerBlock = info->threadPerBlock;
 
+ iStart4d = cpuSecond();
+
  // invert (transpose matrix)
  float **iPoints = make2DArray(dim, numPoints);
  invert2DArray(iPoints, points, dim, numPoints);
@@ -312,6 +314,8 @@ void cudaKmeans(Info *info) {
  int *pointsCount = new int[numCentroids];
  float **iNewCentroids = make2DArray(dim, numCentroids);
 
+ iElaps4 += cpuSecond() - iStart4d;
+
  // Some cuda constants
  const unsigned int bthreads = threadPerBlock;
  const unsigned int l1 = (numPoints + bthreads - 1) / bthreads;
@@ -415,9 +419,11 @@ void cudaKmeans(Info *info) {
 
  }
 
+ iStart4d = cpuSecond();
  centroids = make2DArray(numCentroids, dim);
  invert2DArray(centroids, iCentroids, numCentroids, dim);
  info->centroids = centroids;
+ iElaps4 += cpuSecond() - iStart4d;
 
  // Free device memory
  cudaFree(gPoints);
@@ -461,5 +467,5 @@ int main(int argc, char *argv[]) {
  cout << "M-step-1st-half time use (ms): " << iElaps3a*1000 << "\n";
  cout << "M-step-2nd-half time use (ms): " << iElaps3b*1000 << "\n";
  cout << "Cuda Data IO (ms): " << iElaps4*1000 << "\n";
- cout << "Other (ms): " << iElaps5*1000 << "\n";
+ cout << "Check Convergence (ms): " << iElaps5*1000 << "\n";
 }
diff --git a/Timing_Results/log/Blobs_Cuda.log b/Timing_Results/log/Blobs_Cuda.log
@@ -0,0 +1,98 @@
+
+=========================================
+=========================================
+testing with 1 threads per block on device
+Number of samples: 20000 
+Number of features: 30 
+Number of clusters: 8 
+Number of repeated runs: 20 
+Total time: 783.192
+E-step time use (ms): 356.685
+M-step-1st-half time use (ms): 119.512
+M-step-2nd-half time use (ms): 0.067234
+Cuda Data IO (ms): 302.046
+Check Convergence (ms): 0.521421
+
+=========================================
+=========================================
+testing with 2 threads per block on device
+Number of samples: 20000 
+Number of features: 30 
+Number of clusters: 8 
+Number of repeated runs: 20 
+Total time: 636.066
+E-step time use (ms): 202.417
+M-step-1st-half time use (ms): 132.744
+M-step-2nd-half time use (ms): 0.0743866
+Cuda Data IO (ms): 294.903
+Check Convergence (ms): 0.592709
+
+=========================================
+=========================================
+testing with 4 threads per block on device
+Number of samples: 20000 
+Number of features: 30 
+Number of clusters: 8 
+Number of repeated runs: 20 
+Total time: 547.731
+E-step time use (ms): 108.101
+M-step-1st-half time use (ms): 138.242
+M-step-2nd-half time use (ms): 0.0698566
+Cuda Data IO (ms): 295.276
+Check Convergence (ms): 0.645161
+
+=========================================
+=========================================
+testing with 8 threads per block on device
+Number of samples: 20000 
+Number of features: 30 
+Number of clusters: 8 
+Number of repeated runs: 20 
+Total time: 541.216
+E-step time use (ms): 69.1309
+M-step-1st-half time use (ms): 169.611
+M-step-2nd-half time use (ms): 0.089407
+Cuda Data IO (ms): 296.227
+Check Convergence (ms): 0.853777
+
+=========================================
+=========================================
+testing with 16 threads per block on device
+Number of samples: 20000 
+Number of features: 30 
+Number of clusters: 8 
+Number of repeated runs: 20 
+Total time: 511.788
+E-step time use (ms): 38.1258
+M-step-1st-half time use (ms): 171.403
+M-step-2nd-half time use (ms): 0.0870228
+Cuda Data IO (ms): 295.944
+Check Convergence (ms): 0.88644
+
+=========================================
+=========================================
+testing with 32 threads per block on device
+Number of samples: 20000 
+Number of features: 30 
+Number of clusters: 8 
+Number of repeated runs: 20 
+Total time: 2496.99
+E-step time use (ms): 245.165
+M-step-1st-half time use (ms): 1893.42
+M-step-2nd-half time use (ms): 0.981092
+Cuda Data IO (ms): 322.414
+Check Convergence (ms): 27.8046
+
+=========================================
+=========================================
+testing with 64 threads per block on device
+Number of samples: 20000 
+Number of features: 30 
+Number of clusters: 8 
+Number of repeated runs: 20 
+Total time: 2437.49
+E-step time use (ms): 162.151
+M-step-1st-half time use (ms): 1918.99
+M-step-2nd-half time use (ms): 0.959396
+Cuda Data IO (ms): 321.023
+Check Convergence (ms): 27.1459