PGI: Sample Solution

program GPU_Sgemm
use cudafor
 
interface
subroutine sgemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ) bind(c,name='cublasSgemm')
use iso_c_binding
integer(c_int), value :: m, n, k, lda, ldb, ldc
real(c_float), device, dimension(m,n) :: a, b, c
real(c_float), value :: alpha, beta
character(kind=c_char), value :: transa, transb
end subroutine sgemm
end interface
 
real, device, allocatable, dimension(:,:) :: dA, dB, dC
real, allocatable, dimension(:,:) :: a, b, c
real :: alpha = 1.0e0
real :: beta = 0.0e0
real :: t1, t2, tt, gflops
integer :: i, j, k
 
print *, "Enter N: "
read(5,*) n
 
allocate(a(n,n), b(n,n), c(n,n))
allocate(dA(n,n), dB(n,n), dC(n,n))
 
a = 2.0e0
b = 1.5e0
c = -9.9e0
 
call cpu_time(t1)
 
dA = a
dB = b
if (beta .ne. 0.0) then
dC = c
endif
 
call sgemm('n','n', n, n, n, alpha, dA, n, dB, n, beta, dC, n)
c = dC
 
call cpu_time(t2)
 
print *, "Checking results...."
 
do j = 1, n
do i = 1, n
if (c(i,j) .ne. (3.0e0*real(n))) then
print *, "error: ",i,j,c(i,j)
endif
enddo
enddo
 
gflops = (real(n) * real(n) * real(n) * 2.0) / 1000000000.0
tt = t2 - t1
print *, "Total Time: ",tt
print *, "Total SGEMM gflops: ",gflops/tt
print *, "Done...."
end