@@ -293,13 +293,13 @@ int main(int argc, char* argv[]) {
         NCCL_CALL(ncclGroupEnd());
         CUDA_RT_CALL(cudaStreamSynchronize(compute_stream));
 #else
-        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
+        MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
                               a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
                               MPI_STATUS_IGNORE));
         MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
-                              MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
+                              MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-        std::swap(a_new, a);
+        std::swap(a_new, a);
     }
     POP_RANGE

@@ -326,7 +326,7 @@ int main(int argc, char* argv[]) {
         CUDA_RT_CALL(cudaStreamWaitEvent(push_stream, reset_l2norm_done, 0));
         calculate_norm = (iter % nccheck) == 0 || (!csv && (iter % 100) == 0);

-        launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
+        launch_jacobi_kernel(a_new, a, l2_norm_d, (iy_start + 1), (iy_end - 1), nx, calculate_norm,
                              compute_stream);

         launch_jacobi_kernel(a_new, a, l2_norm_d, iy_start, (iy_start + 1), nx, calculate_norm,
@@ -346,7 +346,7 @@ int main(int argc, char* argv[]) {
         const int bottom = (rank + 1) % size;

         // Apply periodic boundary conditions
-        // TODO: Modify the label for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
+        // TODO: Modify the label for the RANGE, and replace MPI_Sendrecv with ncclSend and ncclRecv calls
         // using the nccl communicator and push_stream.
         // Remember to use ncclGroupStart() and ncclGroupEnd()
 #ifdef SOLUTION
@@ -358,14 +358,14 @@ int main(int argc, char* argv[]) {
         NCCL_CALL(ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream));
         NCCL_CALL(ncclGroupEnd());
 #else
-        PUSH_RANGE("MPI", 5)
+        PUSH_RANGE("MPI", 5)
         MPI_CALL(MPI_Sendrecv(a_new + iy_start * nx, nx, MPI_REAL_TYPE, top, 0,
                               a_new + (iy_end * nx), nx, MPI_REAL_TYPE, bottom, 0, MPI_COMM_WORLD,
                               MPI_STATUS_IGNORE));
         MPI_CALL(MPI_Sendrecv(a_new + (iy_end - 1) * nx, nx, MPI_REAL_TYPE, bottom, 0, a_new, nx,
                               MPI_REAL_TYPE, top, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE));
 #endif
-        CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
+        CUDA_RT_CALL(cudaEventRecord(push_done, push_stream));
         POP_RANGE

         if (calculate_norm) {
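For reference, here is a minimal, self-contained sketch of the grouped ncclSend/ncclRecv halo exchange that the TODO in this hunk asks for. It reuses the names from the diff (`a_new`, `nx`, `iy_start`, `iy_end`, `top`, `bottom`, `nccl_comm`, `push_stream`); the standalone wrapper `exchange_halos_nccl`, the `float` element type, and the omission of the exercise's `NCCL_CALL` error-checking macro are assumptions made for brevity, not part of the commit.

```cpp
#include <cuda_runtime.h>
#include <nccl.h>

typedef float real;               // assumption: the exercise's real type is float
#define NCCL_REAL_TYPE ncclFloat  // must match the element type above

// Hypothetical wrapper around the exchange performed in the SOLUTION branch.
// Error checking (the exercise's NCCL_CALL macro) is omitted for brevity.
void exchange_halos_nccl(real* a_new, int nx, int iy_start, int iy_end,
                         int top, int bottom, ncclComm_t nccl_comm,
                         cudaStream_t push_stream) {
    // Group all point-to-point calls so NCCL posts them as one operation;
    // ungrouped, the matching sends and receives could deadlock.
    ncclGroupStart();
    // First interior row (iy_start) goes to the top neighbour; the bottom
    // halo row (iy_end) is filled from the bottom neighbour.
    ncclSend(a_new + iy_start * nx, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream);
    ncclRecv(a_new + iy_end * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream);
    // Last interior row (iy_end - 1) goes to the bottom neighbour; the top
    // halo row (row 0) is filled from the top neighbour.
    ncclSend(a_new + (iy_end - 1) * nx, nx, NCCL_REAL_TYPE, bottom, nccl_comm, push_stream);
    ncclRecv(a_new, nx, NCCL_REAL_TYPE, top, nccl_comm, push_stream);
    ncclGroupEnd();
    // The exchange is only enqueued on push_stream; work that consumes the halo
    // rows must synchronize with this stream (cudaEventRecord/cudaStreamWaitEvent).
}
```

Enqueuing the exchange on push_stream, with push_done recorded right after it as in the hunk above, is what lets the boundary transfer overlap with the interior Jacobi update running on compute_stream.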
@@ -410,13 +410,13 @@ int main(int argc, char* argv[]) {

     if (rank == 0 && result_correct) {
         if (csv) {
-            // TODO: Don't forget to change your output label from mpi_overlap to nccl_overlap
+            // TODO: Don't forget to change your output label from mpi_overlap to nccl_overlap
 #ifdef SOLUTION
             printf("nccl_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #else
-            printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
+            printf("mpi_overlap, %d, %d, %d, %d, %d, 1, %f, %f\n", nx, ny, iter_max, nccheck, size,
 #endif
-                   (stop - start), runtime_serial);
+                   (stop - start), runtime_serial);
         } else {
             printf("Num GPUs: %d.\n", size);
             printf(