Research Article

Multi-GPU Support on Single Node Using Directive-Based Programming Model

Algorithm 10

Multi-GPU implementation with OpenACC extension of 2D heat equation.
/*
 * Multi-GPU time-stepping loop for the 2D heat equation, written with the
 * multi-device OpenACC extension clauses used in this listing
 * (device/deviceid, update peer ... to_device/from_device).
 *
 * Device d works on the slice of temp1_h/temp2_h that starts at element
 * d*rows*LDA and spans (rows+2)*LDA elements. Slices of neighbouring devices
 * therefore overlap by 2*LDA elements; the overlapping parts are refreshed by
 * the peer updates after every time step.
 * NOTE(review): LDA is presumably the row stride (leading dimension), so each
 * slice is "rows" interior rows plus one halo row above and below - confirm
 * against the declarations, which are not part of this listing.
 */

/* copy every device's slice (including its halo rows) of both buffers */
for(d=0; d<num_devices; d++){
    #pragma acc enter data copyin(temp1_h[d*rows*LDA:(rows+2)*LDA]) device(d)
    #pragma acc enter data copyin(temp2_h[d*rows*LDA:(rows+2)*LDA]) device(d)
}

for(istep=0; istep<nstep; istep++){
    /* run one step on every device's slice
     * NOTE(review): step_kernel_ is defined elsewhere; the wait loop below
     * suggests it launches its work asynchronously - confirm. */
    for(d=0; d<num_devices; d++){
        step_kernel_(ni+2, rows+2, tfac, temp1_h+d*rows*LDA, temp2_h+d*rows*LDA);
    }

    /* wait to finish the kernel computation */
    for(d=0; d<num_devices; d++){
        #pragma acc wait device(d)
    }

    /* exchange halo data */
    for(d=0; d<num_devices; d++){
        if(d > 0){
            /* first row of device d's slice <- same row as held by device d-1 */
            #pragma acc update peer to(temp2_h[d*rows*LDA:LDA]) to_device(d) \
                                    from(temp2_h[d*rows*LDA:LDA]) from_device(d-1)
        }
        if(d < num_devices - 1){
            /* last row of device d's slice <- same row as held by device d+1 */
            #pragma acc update peer to(temp2_h[(d+1)*rows*LDA+LDA:LDA]) to_device(d) \
                                    from(temp2_h[(d+1)*rows*LDA+LDA:LDA]) from_device(d+1)
        }
    }

    /* swap pointer of in and out data */
    temp_tmp = temp1_h;
    temp1_h = temp2_h;
    temp2_h = temp_tmp;
}

/* copy back only the rows*LDA elements that follow the first row of each
 * slice, i.e. the slice without its first and last row
 * NOTE(review): this directive uses deviceid(d) while every other directive
 * in the listing uses device(d); kept as in the original - confirm which
 * spelling the extension defines. */
for(d=0; d<num_devices; d++){
    #pragma acc exit data copyout(temp1_h[(d*rows+1)*LDA:rows*LDA]) deviceid(d)
}

/* wait on every device after the copy-out directives have been issued */
for(d=0; d<num_devices; d++){
    #pragma acc wait device(d)
}