CUDA timing for multi-GPU applications
This is the standard way timing is performed in CUDA:
cudaEvent_t start, stop;
float time;

cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);

// --- Code to be timed

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf("Time: %f ms\n", time);
In the CUDA simpleP2P (peer-to-peer) example, timing is performed in the following way:
cudaEvent_t start, stop;
float time;

int eventflags = cudaEventBlockingSync;
cudaEventCreateWithFlags(&start, eventflags);
cudaEventCreateWithFlags(&stop, eventflags);

cudaEventRecord(start, 0);

// --- Code to be timed

cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
My questions are:

- Why, in the P2P example, has the timing been performed by cudaEventCreateWithFlags with the cudaEventBlockingSync flag?
- Is this needed in, generally speaking, multi-GPU applications (including peer-to-peer memcopy timings)?

Thanks.
After 3 years, I'm answering my own question.
To this end, I'll consider the examples in Concurrency in CUDA multi-GPU executions, where it has been underlined how using asynchronous copies enables achieving true multi-GPU concurrency. In particular, I will consider Test case #8 of that post.

The full code and the profiler timeline of Test case #8 are reported here for the sake of clarity. Note that the host buffer is allocated with cudaMallocHost (pinned memory), which is required for cudaMemcpyAsync to actually overlap with activity on the other GPUs.
#include "Utilities.cuh"
#include "InputOutput.cuh"

#define BLOCKSIZE 128

/*******************/
/* KERNEL FUNCTION */
/*******************/
template<class T>
__global__ void kernelFunction(T * __restrict__ d_data, const unsigned int NperGPU) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < NperGPU)
        for (int k = 0; k < 1000; k++)
            d_data[tid] = d_data[tid] * d_data[tid];

}

/******************/
/* PLAN STRUCTURE */
/******************/
// --- Async
template<class T>
struct plan {
    T *d_data;
};

/*********************/
/* SVD PLAN CREATION */
/*********************/
template<class T>
void createPlan(plan<T>& plan, unsigned int NperGPU, unsigned int gpuID) {

    // --- Device allocation
    gpuErrchk(cudaSetDevice(gpuID));
    gpuErrchk(cudaMalloc(&(plan.d_data), NperGPU * sizeof(T)));
}

/********/
/* MAIN */
/********/
int main() {

    const int numGPUs = 4;
    const int NperGPU = 500000;
    const int N       = NperGPU * numGPUs;

    plan<double> plan[numGPUs];
    for (int k = 0; k < numGPUs; k++) createPlan(plan[k], NperGPU, k);

    // --- "Breadth-first" approach - async
    double *inputMatrices;
    gpuErrchk(cudaMallocHost(&inputMatrices, N * sizeof(double)));
    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    }

    for (int k = 0; k < numGPUs; k++) {
        gpuErrchk(cudaSetDevice(k));
        gpuErrchk(cudaMemcpyAsync(inputMatrices + k * NperGPU, plan[k].d_data, NperGPU * sizeof(double), cudaMemcpyDeviceToHost));
    }

    gpuErrchk(cudaDeviceReset());
}
Timing the asynchronous copies - concurrency is destroyed
Now, let us begin by timing the asynchronous copies. A possible way to do so is by using the following snippet:
float time[numGPUs];
cudaEvent_t start[numGPUs], stop[numGPUs];

// --- "Breadth-first" approach - async
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    // --- Copy the k-th chunk of the pinned host buffer
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
}

for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);
Unfortunately, this way of timing destroys concurrency, as it is possible to appreciate from the profiler timeline below. The reason is that cudaEventSynchronize(stop[k]) blocks the host thread inside the loop, so the copy to GPU k+1 is not even issued until the copy to GPU k has completed.
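As a side note, a workaround that avoids OpenMP altogether is to record the events inside the issuing loop, but defer all the cudaEventSynchronize calls to a second loop run after every copy has been issued. The following is a minimal sketch of this idea, not code from the referenced post; it reuses plan, inputMatrices, time, start and stop exactly as declared above:

// --- Issue all the copies first; the events are recorded but not yet waited on
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
}

// --- All the copies are now in flight; blocking here no longer serializes them
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Elapsed time:  %3.1f ms \n", time[k]);
}

Note that the reported per-GPU times then refer to copies that run concurrently and share the host's PCIe resources, which is also what the OpenMP version below measures.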
Timing the asynchronous copies - concurrency is preserved
To avoid this problem, a possibility is to launch the GPU tasks from different OpenMP threads, as follows:
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

// --- "Breadth-first" approach - async
omp_set_num_threads(numGPUs);
#pragma omp parallel
{
    unsigned int k = omp_get_thread_num();
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    gpuErrchk(cudaMemcpyAsync(plan[k].d_data, inputMatrices + k * NperGPU, NperGPU * sizeof(double), cudaMemcpyHostToDevice));
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);
}
As it can be seen from the profiler timeline, concurrency is preserved. Creating the events with cudaEventBlockingSync also sheds light on the original question: with this flag, cudaEventSynchronize puts the calling host thread to sleep instead of busy-waiting, so the waiting OpenMP threads do not burn CPU cores while the copies are in flight.
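For completeness: since the snippet relies on OpenMP (and assumes omp.h is included), the host compiler needs the corresponding flag. With gcc as the host compiler, something along the lines of

nvcc -Xcompiler -fopenmp -o timing timing.cu -lgomp

should work; the file and executable names are, of course, just placeholders.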
Timing the kernel launches - concurrency is destroyed
The same happens when timing the kernel launches. Using the following snippet, concurrency is destroyed for the same reason as above: the cudaEventSynchronize inside the loop prevents the next kernel from being launched before the previous GPU has finished.
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
}

for (int k = 0; k < numGPUs; k++) printf("Elapsed time:  %3.1f ms \n", time[k]);
Timing the kernel launches - concurrency is preserved
Opposite to the above, by using OpenMP, concurrency is preserved.
int maxNumProcessors = omp_get_max_threads();
std::cout << "Maximum number of CPU threads = " << maxNumProcessors << std::endl;

omp_set_num_threads(numGPUs);
#pragma omp parallel
{
    unsigned int k = omp_get_thread_num();
    gpuErrchk(cudaSetDevice(k));
    cudaEventCreateWithFlags(&start[k], cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop[k], cudaEventBlockingSync);
    cudaEventRecord(start[k], 0);
    kernelFunction<<<iDivUp(NperGPU, BLOCKSIZE), BLOCKSIZE>>>(plan[k].d_data, NperGPU);
    cudaEventRecord(stop[k], 0);
    cudaEventSynchronize(stop[k]);
    cudaEventElapsedTime(&time[k], start[k], stop[k]);
    printf("Thread nr. %i; Elapsed time:  %3.1f ms \n", k, time[k]);
}
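One detail the snippets above leave out is resource cleanup. A minimal sketch, reusing the names declared above, could be:

// --- Cleanup (not in the original snippets): destroy the per-GPU events,
//     free the device buffers and release the pinned host buffer.
for (int k = 0; k < numGPUs; k++) {
    gpuErrchk(cudaSetDevice(k));
    gpuErrchk(cudaEventDestroy(start[k]));
    gpuErrchk(cudaEventDestroy(stop[k]));
    gpuErrchk(cudaFree(plan[k].d_data));
}
gpuErrchk(cudaFreeHost(inputMatrices));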