CUDA timing for multi-GPU applications
This is the standard way timing is performed in CUDA:
cudaEvent_t start, stop;
float time;

cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

// --- Code to be timed

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Time: %f ms\n", time);
In the CUDA simpleP2P (peer-to-peer) example, timing is performed in the following way:
cudaEvent_t start, stop;
float time;

int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&start, eventflags);
cudaEventCreateWithFlags(&stop, eventflags);

cudaEventRecord(start, 0);

// --- Code to be timed

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
My questions are:

- Why, in the P2P example, has the timing been performed by cudaEventCreateWithFlags with the cudaEventBlockingSync flag?
- Is this needed in, generally speaking, multi-GPU applications (including peer-to-peer memcopy timings)?

Thanks.
After 3 years, I'm answering my own question.
To this end, I'll consider the examples in Concurrency in CUDA multi-GPU executions, where it has been underlined how using asynchronous copies enables achieving true multi-GPU concurrency. In particular, I will consider Test case #8 of that post.

The full code and the profiler timeline of Test case #8 are reported here for the sake of clarity. Note that the host buffer is allocated with cudaMallocHost (pinned memory), which is required for cudaMemcpyAsync to actually overlap with activity on the other GPUs.
#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU)
        for (int k = 0; k < 1000; k++)
            d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
    T *d_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs = 4;
    const int NperGPU = 500000;
    const int N       = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Breadth-first" approach - async
    double *inputMatrices;
    gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}
Timing the asynchronous copies - concurrency is destroyed
Now, let us begin by timing the asynchronous copies. A possible way to do so is by using the following snippet:
float time[numGPUs];
cudaEvent_t start[numGPUs], stop[numGPUs];

// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    // --- Copy the k-th chunk of the pinned host buffer
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
}

for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);
Unfortunately, this way of timing destroys concurrency, as it is possible to appreciate from the profiler timeline below. The reason is that cudaEventSynchronize(stop[k]) blocks the host thread inside the loop, so the copy to GPU k+1 is not even issued until the copy to GPU k has completed.
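As a side note, a workaround that avoids OpenMP altogether is to record the events inside the issuing loop, but defer all the cudaEventSynchronize calls to a second loop run after every copy has been issued. The following is a minimal sketch of this idea, not code from the referenced post; it reuses plan, inputMatrices, time, start and stop exactly as declared above:

// --- Issue all the copies first; the events are recorded but not yet waited on
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
}

// --- All the copies are now in flight; blocking here no longer serializes them
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Elapsed time:  %3.1f ms \n", time[k]);
}

Note that the reported per-GPU times then refer to copies that run concurrently and share the host's PCIe resources, which is also what the OpenMP version below measures.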
Timing the asynchronous copies - concurrency is preserved
To avoid this problem, a possibility is to launch the GPU tasks from different OpenMP threads, as follows:
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

// --- "Breadth-first" approach - async
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
    unsigned int k = omp_get_thread_num();
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);
}
As it can be seen from the profiler timeline, concurrency is preserved. Creating the events with cudaEventBlockingSync also sheds light on the original question: with this flag, cudaEventSynchronize puts the calling host thread to sleep instead of busy-waiting, so the waiting OpenMP threads do not burn CPU cores while the copies are in flight.
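For completeness: since the snippet relies on OpenMP (and assumes omp.h is included), the host compiler needs the corresponding flag. With gcc as the host compiler, something along the lines of

nvcc -Xcompiler -fopenmp -o timing timing.cu -lgomp

should work; the file and executable names are, of course, just placeholders.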
Timing the kernel launches - concurrency is destroyed
The same happens when timing the kernel launches. Using the following snippet, concurrency is destroyed for the same reason as above: the cudaEventSynchronize inside the loop prevents the next kernel from being launched before the previous GPU has finished.
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
}

for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);
Timing the kernel launches - concurrency is preserved
Opposite to the above, by using OpenMP, concurrency is preserved.
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

omp_set_num_threads(numGPUs);
#pragma omp parallel
{
    unsigned int k = omp_get_thread_num();
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);
}
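One detail the snippets above leave out is resource cleanup. A minimal sketch, reusing the names declared above, could be:

// --- Cleanup (not in the original snippets): destroy the per-GPU events,
//     free the device buffers and release the pinned host buffer.
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    gpuErrchk(cudaEventDestroy(start[k]));
    gpuErrchk(cudaEventDestroy(stop[k]));
    gpuErrchk(cudaFree(plan[k].d_data));
}
gpuErrchk(cudaFreeHost(inputMatrices));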