PGI: Hints

1. Compile and run CPU_Sgemm as is for N = 10000
to see how many GFLOPS it achieves.
2. Add the following device interface to create the CUDA
Fortran code GPU_Sgemm.f90:
!use cudafor
interface
  !> Explicit interface to the legacy cuBLAS single-precision GEMM routine,
  !> reached through the C symbol 'cublasSgemm'. With this interface in scope,
  !> `call sgemm(...)` on device-resident arrays runs on the GPU.
  !>
  !> Per the cuBLAS documentation the routine computes
  !>   c = alpha * op(a) * op(b) + beta * c
  !> where op(x) is x or its transpose, selected by transa / transb
  !> ('N' or 'T').
  !>
  !> Scalars are passed by value to match the C prototype; the matrices must
  !> already live in device memory (hence the `device` attribute).
  !>
  !> m, n, k       : op(a) is m-by-k, op(b) is k-by-n, c is m-by-n
  !> lda, ldb, ldc : leading dimensions of a, b and c as allocated
  !>
  !> The trailing `&` matters: `bind(c, ...)` is a suffix of the subroutine
  !> statement, so it must be joined to it by a continuation line.
  subroutine sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc) &
      bind(c, name='cublasSgemm')
    use, intrinsic :: iso_c_binding, only: c_int, c_float, c_char
    implicit none
    character(kind=c_char), value :: transa, transb
    integer(c_int), value :: m, n, k, lda, ldb, ldc
    real(c_float), value :: alpha, beta
    ! Each matrix is declared with its own leading dimension and an
    ! assumed-size last extent, so the interface stays valid for non-square
    ! and transposed operands (a is not m-by-n in general).
    real(c_float), device, dimension(lda, *), intent(in) :: a
    real(c_float), device, dimension(ldb, *), intent(in) :: b
    real(c_float), device, dimension(ldc, *), intent(inout) :: c
  end subroutine sgemm
end interface

3. Compile and run GPU_Sgemm.