TP CUDA 5.0 / Exemple axpy
axpy (CUDA 5.0)
 Tout Classes Fichiers Fonctions Pages
vector_axpy.cu
Aller à la documentation de ce fichier.
1 
// includes, system
#include <cmath>
#include <cstdio>
#include <iostream>
#include <new>
#include <stdlib.h>

// Utilities and system includes
#include <cuda_runtime.h>
#include <curand.h>

#include <helper_functions.h>
#include <helper_cuda.h>

#include <curand.h>
20 
/**
 * Scalar axpy: returns a*x + y in double precision.
 *
 * Compiled for both host and device, so the same formula can be called from
 * CPU code as well as from kernels. Existing device-side callers are
 * unaffected by the added __host__ qualifier.
 *
 * @param a scale factor applied to x
 * @param x value to be scaled
 * @param y value added to the scaled x
 * @return a*x + y
 */
__host__ __device__ double axpy(double a,double x,double y){
  return a*x+y;
}
24 
/**
 * Kernel computing d_Z[i] = a*d_X[i] + d_Y[i] for every i in [0, N).
 *
 * Each thread starts at its global index (blockDim.x * blockIdx.x +
 * threadIdx.x) and advances by the total number of launched threads
 * (blockDim.x * gridDim.x), so any 1D launch configuration covers all N
 * elements. Only the x dimension of the grid and of the block is used.
 *
 * Indices are kept in 64 bits: with int arithmetic, "pos += stride" would
 * overflow (undefined behaviour, then out-of-bounds accesses) as soon as N
 * lies within one stride of INT_MAX, and the unsigned product
 * blockDim.x * gridDim.x could wrap for very large grids.
 *
 * @param a   scale factor
 * @param d_X first input array, at least N doubles, readable from the kernel
 * @param d_Y second input array, at least N doubles, readable from the kernel
 * @param d_Z output array, at least N doubles, writable from the kernel
 * @param N   number of elements; nothing is written when N <= 0
 */
__global__ void vector_axpy_kernel(const double a,const double *d_X, const double *d_Y, double *d_Z, int N){
  // Widen before multiplying so neither the start index nor the stride wraps.
  const long long stride = (long long)blockDim.x * gridDim.x;
  const long long first  = (long long)blockDim.x * blockIdx.x + threadIdx.x;

  for (long long pos = first; pos < N; pos += stride)
    d_Z[pos] = a*d_X[pos] + d_Y[pos];
}
32 
/**
 * CPU reference implementation of Z <- a*X + Y.
 *
 * Walks the arrays sequentially and stores a*X[k] + Y[k] into Z[k] for
 * every k in [0, N). Nothing is written when N <= 0.
 *
 * @param a scale factor
 * @param X first input array (at least N elements)
 * @param Y second input array (at least N elements)
 * @param Z output array (at least N elements)
 * @param N number of elements to process
 */
void vector_axpy(const double a,const double *X, const double *Y, double *Z, int N){
  int k = 0;
  while (k < N) {
    Z[k] = a*X[k] + Y[k];
    ++k;
  }
}
37 
/**
 * Infinity norm of the difference of two arrays: max over i of |X[i] - Y[i]|.
 *
 * @param X first array (at least N elements)
 * @param Y second array (at least N elements)
 * @param N number of elements to compare
 * @return the largest absolute difference; 0.0 when N <= 0 (no element is
 *         read in that case); NaN as soon as any difference is NaN
 */
double errorinf(const double *X,const double *Y,int N){
  double norm = 0.0;   // |.| is never negative, so 0 is a neutral start value
  for (int i=0;i<N;i++){
    const double s = fabs(X[i]-Y[i]);
    // "s > norm" is false for NaN, which would silently hide a corrupted
    // entry; s != s holds only for NaN, so report it right away.
    if (s != s) return s;
    if (s > norm) norm = s;
  }
  return norm;
}
46 
/**
 * Prints the first n and the last n elements of X on std::cout, one
 * " [index]: value" line per element, with " ..." between the two groups
 * when elements were skipped.
 *
 * When the two groups would touch or overlap (2*n >= N, which includes
 * n > N) every element is printed exactly once and no " ..." is emitted.
 * Nothing is printed when N <= 0 or n <= 0.
 *
 * Side effect: the precision of std::cout is set to 16 and left that way.
 *
 * @param X array to print (at least N elements)
 * @param N number of elements in X
 * @param n number of elements to show at each end
 */
void printArray(const double *X,int N,int n){
  std::cout.precision(16);
  if (N <= 0 || n <= 0)
    return;

  // Written as n >= N - n rather than 2*n >= N so that a large n cannot overflow.
  if (n >= N - n){
    for (int i=0;i<N;i++)
      std::cout << " [" << i << "]: " << X[i] << std::endl;
    return;
  }

  for (int i=0;i<n;i++)
    std::cout << " [" << i << "]: " << X[i] << std::endl;
  std::cout << " ...\n";
  for (int i=N-n;i<N;i++)
    std::cout << " [" << i << "]: " << X[i] << std::endl;
}
61 
/**
 * Demo driver: computes Z = 2*X + Y on the CPU and on the GPU for
 * N = 2^24 random doubles and prints the largest absolute difference
 * between the two results.
 *
 * Steps: allocate host and device arrays, fill X and Y with a cuRAND host
 * generator (fixed seed 777), run the CPU reference, copy the inputs to the
 * device, launch vector_axpy_kernel with 512 blocks of 256 threads, copy the
 * result back, print both results and the error, release all resources.
 *
 * Any allocation or CUDA/cuRAND failure terminates the process, either
 * through the explicit exit(EXIT_FAILURE) paths below or through the
 * checkCudaErrors helper.
 *
 * @param argc unused
 * @param argv unused
 * @return EXIT_SUCCESS when every step succeeded
 */
int main(int argc, char** argv){
  double *d_X = NULL,*d_Y = NULL,*d_Z = NULL; // device variable (on GPU memory)
  double *h_X,*h_Y,*h_Z,*h_Zgpu;              // host variable (on CPU memory)
  cudaError_t err;
  const int N=1<<24;
  const size_t bytes = (size_t)N * sizeof(double);
  std::cout << "Start ...\n";

  // 1) Arrays allocations on <host>
  // Plain new[] throws std::bad_alloc instead of returning NULL; the nothrow
  // form is what makes the NULL test below meaningful.
  h_X    = new (std::nothrow) double[N];
  h_Y    = new (std::nothrow) double[N];
  h_Z    = new (std::nothrow) double[N];
  h_Zgpu = new (std::nothrow) double[N];
  if ((h_X == NULL)||(h_Y == NULL)||(h_Z == NULL)||(h_Zgpu == NULL)){
    fprintf(stderr,"Allocation error on CPU\n");
    // delete[] on a NULL pointer is a no-op, so no per-pointer test is needed.
    delete [] h_X; delete [] h_Y; delete [] h_Z; delete [] h_Zgpu;
    exit(EXIT_FAILURE);
  }

  // 2) Arrays allocations on <device>
  checkCudaErrors( cudaMalloc((void**) &d_X, bytes) );
  err = cudaMalloc((void**) &d_Y, bytes);
  if( cudaSuccess != err) { // d_Y allocation failed
    fprintf(stderr, "%s(%i) : CUDA Malloc error : (%d) %s.\n",
            __FILE__,__LINE__, (int)err, cudaGetErrorString( err ) );
    checkCudaErrors(cudaFree(d_X)); // Free <device> array d_X
    exit(EXIT_FAILURE);
  }
  err = cudaMalloc((void**) &d_Z, bytes);
  if( cudaSuccess != err) { // d_Z allocation failed
    fprintf(stderr, "%s(%i) : CUDA Malloc error : (%d) %s.\n",
            __FILE__,__LINE__, (int)err, cudaGetErrorString( err ) );
    checkCudaErrors(cudaFree(d_X)); // Free <device> array d_X
    checkCudaErrors(cudaFree(d_Y)); // Free <device> array d_Y
    exit(EXIT_FAILURE);
  }

  // 3) Set pseudo random generator on <host> using CURAND library
  curandGenerator_t prngCPU;
  checkCudaErrors(curandCreateGeneratorHost(&prngCPU, CURAND_RNG_PSEUDO_MTGP32));
  checkCudaErrors(curandSetPseudoRandomGeneratorSeed(prngCPU, 777));

  // 4) Generate uniformly distributed random numbers in double precision. Values are between 0.0 and 1.0,
  //    where 0.0 is excluded and 1.0 is included.
  checkCudaErrors(curandGenerateUniformDouble(prngCPU, h_X, N));
  checkCudaErrors(curandGenerateUniformDouble(prngCPU, h_Y, N));

  // 5) Compute Z <- a*X+Y on <host>
  vector_axpy(2.,h_X,h_Y,h_Z,N);
  std::cout << "h_Z=\n";
  printArray(h_Z,N,3);

  // 6) Copy <host> arrays h_X and h_Y respectively in <device> arrays d_X and d_Y
  checkCudaErrors( cudaMemcpy(d_X, h_X, bytes, cudaMemcpyHostToDevice) );
  checkCudaErrors( cudaMemcpy(d_Y, h_Y, bytes, cudaMemcpyHostToDevice) );

  // 7) Compute Z <- a*X+Y on <device> using 512 blocks of 256 threads
  //    (equivalent to passing dim3 dimGrid(512, 1, 1) and dim3 dimBlock(256, 1, 1))
  vector_axpy_kernel<<< 512, 256>>>(2.,d_X,d_Y,d_Z,N);
  // A kernel launch returns no status; an invalid launch configuration is
  // only visible through cudaGetLastError().
  checkCudaErrors(cudaGetLastError());

  // 8) Copy <device> array d_Z on <host> array h_Zgpu
  checkCudaErrors( cudaMemcpy(h_Zgpu, d_Z, bytes, cudaMemcpyDeviceToHost) );

  std::cout << "\nh_Zgpu=\n";
  printArray(h_Zgpu,N,3);

  // 9) Print max|h_Z - h_Zgpu|
  std::cout.precision(16);
  std::cout << "Error : " << errorinf(h_Zgpu,h_Z,N);

  // 10) Free <device> arrays and the random generator
  checkCudaErrors(cudaFree(d_X));
  checkCudaErrors(cudaFree(d_Y));
  checkCudaErrors(cudaFree(d_Z));
  checkCudaErrors(curandDestroyGenerator(prngCPU));

  // 11) Free <host> arrays
  delete [] h_X; delete [] h_Y; delete [] h_Z; delete [] h_Zgpu;
  std::cout << "\n...Stop\n";
  return EXIT_SUCCESS;
}