// CUDA Programming 2: Sample Solution

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
// includes, project
#include <cutil_inline.h>
#include <cutil.h>
 
// Tolerance used by compare_results(): the two result vectors are reported
// as matching when their root-mean-square difference is below this value.
#define epsilon (1E-10)
 
// Number of elements in each vector. The vadd_gpu kernel reads this constant
// directly as its bounds limit, and main() uses it to size every buffer.
const int vlen = 2048; //length of vector
 
// GPU kernel: element-wise vector addition, v3 = v1 + v2.
// Each thread handles at most one element, selected by its flat global
// index; threads whose index is at or beyond the file-level vlen do nothing.
__global__ void vadd_gpu(float *v1,float *v2,float *v3)
{
    const int gid = blockDim.x * blockIdx.x + threadIdx.x;

    // Guard the tail: the grid may contain more threads than elements.
    if (gid >= vlen)
        return;

    v3[gid] = v1[gid] + v2[gid];
}
 
// CPU reference implementation of vector addition.
// Stores v1[k] + v2[k] into v3[k] for every k in [0, vlen), in ascending
// order. The vlen parameter shadows the file-level constant of the same name.
void vadd_cpu(float *v1,float *v2,float *v3,int vlen)
{
    int k = 0;
    while (k < vlen) {
        v3[k] = v1[k] + v2[k];
        ++k;
    }
}
 
// Compares two vectors and prints whether they agree.
// Computes the root-mean-square difference between v1 and v2 over vlen
// elements and prints "correct" when it is below epsilon; otherwise prints
// "incorrect" followed by the measured error.
//   v1, v2 : vectors to compare (each must hold at least vlen elements)
//   vlen   : number of elements to compare; shadows the file-level constant
void compare_results(float *v1,float *v2,int vlen)
{
    // Nothing to compare. Without this guard the mean below would be 0/0
    // (NaN), which fails the "< epsilon" test and reports a false mismatch.
    if (vlen <= 0) {
        printf("The results are correct ");
        return;
    }

    // Accumulate in double so small per-element differences are not lost to
    // float rounding as the running sum grows.
    double sum_sq = 0.0;
    for (int i = 0; i < vlen; i++) {
        const double diff = (double)v1[i] - (double)v2[i];
        sum_sq += diff * diff;
    }

    const double error_norm = sqrt(sum_sq / vlen);
    if (error_norm < epsilon)
        printf("The results are correct ");
    else {
        printf("The results are incorrect ");
        printf("error_norm is %.6f ",error_norm);
    }
}
 
// Aborts the program with a diagnostic if a CUDA runtime call failed.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates a host buffer of `bytes` bytes, aborting if the allocation fails.
static float *alloc_host_vector(size_t bytes)
{
    float *v = (float *)malloc(bytes);
    if (v == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }
    return v;
}

// Adds two vlen-element vectors on the GPU, repeats the addition on the CPU,
// and prints whether the two results agree. Every CUDA runtime call and host
// allocation is checked; any failure prints a message to stderr and exits
// with EXIT_FAILURE. Returns 0 on normal completion.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    // Size in bytes of one vector. All buffers hold floats, so every
    // allocation and copy uses the same float-based size.
    const size_t bytes = vlen * sizeof(float);

    float *device_v1 = NULL, *device_v2 = NULL, *device_v3 = NULL;

    // Run on the device returned by cutil's cutGetMaxGflopsDeviceId().
    const int dev = cutGetMaxGflopsDeviceId();
    check_cuda(cudaSetDevice(dev), "cudaSetDevice");

    //allocate device memory
    check_cuda(cudaMalloc((void**) &device_v1, bytes), "cudaMalloc(device_v1)");
    check_cuda(cudaMalloc((void**) &device_v2, bytes), "cudaMalloc(device_v2)");
    check_cuda(cudaMalloc((void**) &device_v3, bytes), "cudaMalloc(device_v3)");

    //allocate host memory (host_v3: GPU result, host_v4: CPU reference)
    float *host_v1 = alloc_host_vector(bytes);
    float *host_v2 = alloc_host_vector(bytes);
    float *host_v3 = alloc_host_vector(bytes);
    float *host_v4 = alloc_host_vector(bytes);

    //initialize vectors
    for (int i = 0; i < vlen; i++) {
        host_v1[i] = (float)i;
        host_v2[i] = (float)i;
    }

    //copy data from host to device
    check_cuda(cudaMemcpy(device_v1, host_v1, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(host_v1 -> device_v1)");
    check_cuda(cudaMemcpy(device_v2, host_v2, bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(host_v2 -> device_v2)");

    // One thread per element with at most 512 threads per block; the block
    // count is rounded up so a partial final block covers the tail.
    const int thx  = (vlen <= 512) ? vlen : 512;
    const int blks = (vlen <= 512) ? 1 : (vlen + 511) / 512;

    vadd_gpu<<<blks,thx>>>(device_v1,device_v2,device_v3);
    // A launch does not return a status: query it explicitly, then wait for
    // the kernel so that execution errors surface here rather than later.
    check_cuda(cudaGetLastError(), "vadd_gpu launch");
    check_cuda(cudaDeviceSynchronize(), "vadd_gpu execution");

    //copy result from device to host
    check_cuda(cudaMemcpy(host_v3, device_v3, bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(device_v3 -> host_v3)");

    // Compute the reference result on the CPU and compare it with the GPU's.
    vadd_cpu(host_v1,host_v2,host_v4,vlen);
    compare_results(host_v3,host_v4,vlen);

    //free device memory
    check_cuda(cudaFree(device_v1), "cudaFree(device_v1)");
    check_cuda(cudaFree(device_v2), "cudaFree(device_v2)");
    check_cuda(cudaFree(device_v3), "cudaFree(device_v3)");

    //free host memory
    free(host_v1);
    free(host_v2);
    free(host_v3);
    free(host_v4);

    return 0;
}