13 #include <cuda_runtime.h>
16 #include <helper_functions.h>
17 #include <helper_cuda.h>
// Device-side helper `axpy`: takes three doubles (a, x, y) by value and returns a double.
// NOTE(review): the function body is not present in this listing (the embedded
// listing numbers jump from 21 to 25). Presumably it returns a*x+y -- confirm
// against the original file before relying on that.
21 __device__
double axpy(
double a,
double x,
double y){
// Kernel: element-wise d_Z[i] = a * d_X[i] + d_Y[i] for every i in [0, N).
//
// Parameters:
//   a    - scalar multiplier applied to each element of d_X
//   d_X  - first input array (read-only), at least N elements
//   d_Y  - second input array (read-only), at least N elements
//   d_Z  - output array, at least N elements
//   N    - number of elements to process
//
// Launch layout: 1-D grid of 1-D blocks. The loop below is a grid-stride loop,
// so any grid/block size covers all N elements; threads whose starting index
// is >= N simply do no work.
// NOTE(review): the closing brace of this kernel is not visible in this listing.
25 __global__
void vector_axpy_kernel(
const double a,
const double *d_X,
const double *d_Y,
double *d_Z,
int N){
// Global linear index of this thread across the whole grid.
26 const int tid = blockDim.x * blockIdx.x + threadIdx.x;
// Stride = total number of threads launched along x.
27 const int Incl= blockDim.x * gridDim.x;
// Each thread handles elements tid, tid+Incl, tid+2*Incl, ... while below N.
// NOTE(review): the index arithmetic is done in int; for N close to INT_MAX
// the increment Pos+=Incl could overflow -- confirm the intended range of N.
29 for (
int Pos=tid;Pos<N ;Pos+=Incl)
30 d_Z[Pos]= a*d_X[Pos]+d_Y[Pos];
// Host function `vector_axpy`: takes a scalar a, two read-only double arrays
// X and Y, an output double array Z, and an element count N.
// NOTE(review): the function body is not present in this listing (listing
// numbers jump from 33 to 38). Presumably this is the CPU reference that
// computes Z[i] = a*X[i] + Y[i] -- confirm against the original file.
33 void vector_axpy(
const double a,
const double *X,
const double *Y,
double *Z,
int N){
// Compares two double arrays X and Y of length N element by element using
// the absolute difference |X[i] - Y[i]|.
// Precondition: N >= 1, because X[0] and Y[0] are read unconditionally.
// NOTE(review): the rest of the loop body and the return statement are not
// present in this listing (listing lines 42-46). Presumably `norm` is updated
// to the maximum of the differences and returned (infinity norm of X - Y) --
// confirm against the original file.
38 double errorinf(
const double *X,
const double *Y,
int N){
// Seed the running value with the difference of the first pair.
39 double norm=fabs(X[0]-Y[0]);
40 for (
int i=1;i<N;i++){
// Absolute difference of the i-th pair.
41 double s=fabs(X[i]-Y[i]);
// Prints part of the double array X (length N) to std::cout with 16 digits
// of precision: the entries with index in [0, np), a " ..." separator line,
// then the entries with index in [nd, N). Each entry is printed on its own
// line as " [index]: value".
// NOTE(review): `np` and `nd` are defined on lines that are not present in
// this listing (listing lines 48-51). Presumably they are derived from N and
// n so that the first n and last n entries are shown -- confirm against the
// original file. The closing brace is likewise not visible.
47 void printArray(
const double *X,
int N,
int n){
// Sets the stream precision on std::cout; this setting persists after the call.
52 std::cout.precision(16);
// Leading entries.
53 for (
int i=0;i<np;i++)
54 std::cout <<
" [" << i <<
"]: " << X[i] << std::endl;
// Separator between the leading and trailing entries.
56 std::cout <<
" ...\n";
// Trailing entries.
57 for (
int i=nd;i<N;i++)
58 std::cout <<
" [" << i <<
"]: " << X[i] << std::endl;
// Program entry point: allocates host and device arrays, fills the host
// inputs with pseudo-random uniform doubles, runs vector_axpy_kernel on the
// GPU with a = 2, copies the result back, prints it, prints the error
// reported by errorinf() between the GPU result and h_Z, then releases all
// memory.
// NOTE(review): many lines of this function are not present in this listing
// (declarations of `N` and `err`, the allocations of h_X/h_Y/h_Z, the bodies
// and closing braces of the error branches, the computation and printing of
// h_Z, the return statement). The comments below describe only what is visible.
62 int main(
int argc,
char** argv){
// Device-side buffers (allocated with cudaMalloc below).
63 double *d_X,*d_Y,*d_Z;
// Host-side buffers; h_Zgpu receives the result copied back from the device.
64 double *h_X,*h_Y,*h_Z,*h_Zgpu;
67 std::cout <<
"Start ...\n";
73 h_Zgpu =
new double[N];
// NOTE(review): a plain `new double[N]` reports failure by throwing
// std::bad_alloc rather than returning NULL, so this check only fires if the
// (not visible) allocations use the nothrow form -- confirm.
74 if ((h_X == NULL)||(h_Y == NULL)||(h_Z == NULL)||(h_Zgpu == NULL)){
75 fprintf(stderr,
"Allocation error on CPU\n");
// Device allocation of d_X: failure is handled by checkCudaErrors.
80 checkCudaErrors( cudaMalloc((
void**) &d_X, N *
sizeof(
double)) );
// Device allocation of d_Y: the return value is ignored and the error is
// fetched afterwards through cudaGetLastError() so d_X can be released first.
81 cudaMalloc((
void**) &d_Y, N *
sizeof(
double));
82 err = cudaGetLastError();
83 if( cudaSuccess != err) {
84 fprintf(stderr,
"%s(%i) : CUDA Malloc error : (%d) %s.\n",
85 __FILE__,__LINE__, (
int)err, cudaGetErrorString( err ) );
// Release the buffer that was already allocated.
86 checkCudaErrors(cudaFree(d_X));
// Device allocation of d_Z, same manual error-handling pattern as for d_Y.
89 cudaMalloc((
void**) &d_Z, N *
sizeof(
double));
90 err = cudaGetLastError();
91 if( cudaSuccess != err) {
92 fprintf(stderr,
"%s(%i) : CUDA Malloc error : (%d) %s.\n",
93 __FILE__,__LINE__, (
int)err, cudaGetErrorString( err ) );
// Release both buffers that were already allocated.
94 checkCudaErrors(cudaFree(d_X));
95 checkCudaErrors(cudaFree(d_Y));
// Host-side cuRAND generator (MTGP32) with fixed seed 777.
// NOTE(review): no cuRAND header include and no curandDestroyGenerator call
// are visible in this listing -- confirm both exist in the original file.
100 curandGenerator_t prngCPU;
101 checkCudaErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32));
102 checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, 777));
// Fill both host input arrays with N uniform doubles each.
106 checkCudaErrors(curandGenerateUniformDouble(prngCPU, h_X, N));
107 checkCudaErrors(curandGenerateUniformDouble(prngCPU, h_Y, N));
// NOTE(review): the code that computes and prints h_Z is not visible here;
// presumably vector_axpy() and printArray() are called around this line.
111 std::cout <<
"h_Z=\n";
// Copy the inputs from host to device.
115 checkCudaErrors( cudaMemcpy(d_X, h_X, N *
sizeof(
double), cudaMemcpyHostToDevice) );
116 checkCudaErrors( cudaMemcpy(d_Y, h_Y, N *
sizeof(
double), cudaMemcpyHostToDevice) );
// Launch with 512 blocks of 256 threads and a = 2.
// NOTE(review): no cudaGetLastError() check is visible right after the
// launch (listing lines 123-124 are not shown) -- confirm one exists.
122 vector_axpy_kernel<<< 512, 256>>>(2.,d_X,d_Y,d_Z,N);
// Copy the GPU result back into h_Zgpu.
125 checkCudaErrors( cudaMemcpy(h_Zgpu, d_Z, N *
sizeof(
double), cudaMemcpyDeviceToHost) );
127 std::cout <<
"\nh_Zgpu=\n";
128 printArray(h_Zgpu,N,3);
// Report the value returned by errorinf() for the GPU result versus h_Z.
131 std::cout.precision(16);
132 std::cout <<
"Error : " <<
errorinf(h_Zgpu,h_Z,N);
// Release device memory.
135 checkCudaErrors(cudaFree(d_X));
136 checkCudaErrors(cudaFree(d_Y));
137 checkCudaErrors(cudaFree(d_Z));
// Release host memory.
140 delete [] h_X;
delete [] h_Y;
delete [] h_Z;
delete [] h_Zgpu;
141 std::cout <<
"\n...Stop\n";