# Jacket: Sample Solution

% Benchmark configuration.
%
% Numeric class passed to ones/gones when the test matrices are built.
% Set to 'single' to benchmark single precision instead. Only one value is
% assigned, and the statement is terminated with ';' so nothing is echoed.
% NOTE: this variable shadows MATLAB's built-in `type` command for the rest
% of the session; the name is kept because the sections below read it.
type = 'double';

N = 128; % matrix size
M = 400; % number of matrices
trials = 30; % several timing trials, pick best

% CPU baseline: multiply each of the M slices of A by B in a plain for-loop.
% The full sweep over all slices is timed `trials` times and the fastest
% run is kept.
disp('Computing the CPU for-loop benchmarks...')
A = ones(N, N, M, type);  % stack of M matrices, each N-by-N
B = ones(N, N, type);     % the single matrix every slice is multiplied by
cpu_for_seconds = inf;    % best (smallest) elapsed time seen so far
for trial = 1:trials
    tic
    for k = 1:M
        C = A(:, :, k) * B;
    end
    elapsed = toc;
    cpu_for_seconds = min(elapsed, cpu_for_seconds);
end
% Flop count: 2*N^3 per N-by-N product, M products per sweep.
total_flops = 2 * M * (N^3);
cpu_for_gflops = total_flops / (1e9 * cpu_for_seconds);

% GPU variant of the for-loop benchmark: same loop as the CPU section, but
% A and B are created with Jacket's gones (presumably the GPU counterpart
% of ones -- confirm against the Jacket documentation).
disp('Computing the GPU for-loop benchmarks...')
A = gones(N, N, M, type);  % stack of M matrices, each N-by-N
B = gones(N, N, type);     % the single matrix every slice is multiplied by
gpu_for_seconds = inf;     % best (smallest) elapsed time seen so far
for trial = 1:trials
    % gsync brackets the timed region; presumably it waits for pending GPU
    % work so the timer starts and stops on an idle device -- confirm.
    gsync;
    tic
    for k = 1:M
        C = A(:, :, k) * B;
    end
    gsync
    elapsed = toc;
    gpu_for_seconds = min(elapsed, gpu_for_seconds);
end
% Flop count: 2*N^3 per N-by-N product, M products per sweep.
total_flops = 2 * M * (N^3);
gpu_for_gflops = total_flops / (1e9 * gpu_for_seconds);

% GPU benchmark using Jacket's gfor/gend construct in place of the plain
% for-loop. This section does not create A and B; it uses whatever A and B
% are already in the workspace when it runs.
disp('Computing the GPU gfor-loop benchmarks...')
gpu_gfor_seconds = inf;  % best (smallest) elapsed time seen so far
for trial = 1:trials
    % gsync brackets the timed region; presumably it waits for pending GPU
    % work so the timer starts and stops on an idle device -- confirm.
    gsync;
    tic
    gfor k = 1:M
        C = A(:, :, k) * B;
    gend
    gsync
    elapsed = toc;
    gpu_gfor_seconds = min(elapsed, gpu_gfor_seconds);
end
% Flop count: 2*N^3 per N-by-N product, M products per sweep.
total_flops = 2 * M * (N^3);
gpu_gfor_gflops = total_flops / (1e9 * gpu_gfor_seconds);

% Report the gfor speed-up as a ratio of achieved GFLOPS, first against the
% GPU for-loop and then against the CPU for-loop.
% Each format string ends in \n so the two results print on separate lines
% and the command prompt is not appended to the last one.
fprintf('Speed-up: gpu,gfor vs gpu,for = %g\n', gpu_gfor_gflops / gpu_for_gflops);
fprintf('Speed-up: gpu,gfor vs cpu,for = %g\n', gpu_gfor_gflops / cpu_for_gflops);