Research Article
Multi-GPU Support on Single Node Using Directive-Based Programming Model
Algorithm 10
Multi-GPU implementation with OpenACC extension of 2D heat equation.
for(d=0; d<num_devices; d++){ | #pragma acc enter data copyin(temp1_h[d*rows*LDA:(rows+2)*LDA]) device(d) | #pragma acc enter data copyin(temp2_h[d*rows*LDA:(rows+2)*LDA]) device(d) | } | for(istep=0; istep<nstep; istep++){ | for(d=0; d<num_devices; d++){ | step_kernel_(ni+2, rows+2, tfac, temp1_h+d*rows*LDA, temp2_h+d*rows*LDA); | } | /* wait to finish the kernel computation */ | for(d=0; d<num_devices; d++){ | #pragma acc wait device(d) | } | /* exchange halo data */ | for(d=0; d<num_devices; d++){ | if(d > 0){ | #pragma acc update peer to(temp2_h[d*rows*LDA:LDA]) to_device(d) | from(temp2_h[d*rows*LDA:LDA]) from_device(d-1) | } | if(d < num_devices - 1){ | #pragma acc update peer to(temp2_h[(d+1)*rows*LDA+LDA:LDA]) to_device(d) | from(temp2_h[(d+1)*rows*LDA+LDA:LDA]) from_device(d+1) | } | } | /* swap pointer of in and out data */ | temp_tmp = temp1_h; | temp1_h = temp2_h; | temp2_h = temp_tmp; | } | for(d=0; d<num_devices; d++){ | #pragma acc exit data copyout(temp1_h[(d*rows+1)*LDA:rows*LDA]) deviceid(d) | } | for(d=0; d<num_devices; d++){ | #pragma acc wait device(d) | } |
|