Worked well:

(base) jk@jkDL:~/dev/ctst$ g++ jadd.cpp -o v1


An issue came up: nvcc was not on the PATH.

(base) jk@jkDL:~/dev/ctst$ nvcc jadd.cpp -o v1


After some effort to find the cause, the path to nvcc was provided (solution 1 below).


This then worked well:

(base) jk@jkDL:~/dev/ctst$ nvcc jadd.cpp -o v1


The vector add executed on the GPU and was also profiled; the results are on the following pages.

Note… cmake and make are not used in the above.
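If a build script were wanted, a minimal CMakeLists.txt might look like the sketch below. This is an assumption for illustration (CMake >= 3.8 with its built-in CUDA language support), not something used in the runs that follow:

cmake_minimum_required(VERSION 3.8)
project(ctst LANGUAGES CXX CUDA)

# Plain C++ build of the CPU version (jadd.cpp)
add_executable(v1 jadd.cpp)

# CUDA build of the unified-memory version (jadd1.cu)
add_executable(jadd1_cuda jadd1.cu)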

Using a multi-dimensional block means that we have to be careful about distributing the threads among the dimensions: the product of the block dimensions cannot exceed the per-block limit of 1024 threads. In a 1D block we can set at most 1024 threads on the x axis, but in a 2D block, if you set 2 as the size of y, you cannot exceed 512 for x! (A sketch for querying these limits at runtime follows the examples below.)

For example,

dim3 threadsPerBlock(1024, 1, 1) is allowed,

as well as dim3 threadsPerBlock(512, 2, 1),

but not dim3 threadsPerBlock(256, 3, 2).
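These limits can be queried at runtime with the CUDA runtime API's cudaGetDeviceProperties(); a minimal sketch (maxThreadsPerBlock, maxThreadsDim and warpSize are fields of cudaDeviceProp):

#include <iostream>

int main(void)
{
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0); // device 0

  // Total threads allowed per block, and the per-axis caps;
  // the product of the dim3 components must not exceed the first.
  std::cout << "maxThreadsPerBlock: " << prop.maxThreadsPerBlock << std::endl;
  std::cout << "maxThreadsDim: " << prop.maxThreadsDim[0] << " x "
            << prop.maxThreadsDim[1] << " x "
            << prop.maxThreadsDim[2] << std::endl;
  std::cout << "warpSize: " << prop.warpSize << std::endl;
  return 0;
}

Compile with nvcc; dim3 threadsPerBlock(256, 3, 2) fails because 256*3*2 = 1536 exceeds the 1024-thread limit per block.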

#include <iostream>
#include <math.h>

// function to add the elements of two arrays
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20; // 1M elements

  float *x = new float[N];
  float *y = new float[N];

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the CPU
  add(N, x, y);

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  delete [] x;
  delete [] y;

  return 0;
}

(base) jk@jkDL:~/dev/ctst$ g++ jadd.cpp -o v1
(base) jk@jkDL:~/dev/ctst$ ./v1
jk ...Max error: 0
(base) jk@jkDL:~/dev/ctst$


Solution 1: add the CUDA bin directory to the PATH:

export PATH=/usr/local/cuda-10.2/bin:$PATH


(base) jk@jkDL:~/dev/ctst$ nvcc jadd.cpp -o v1
(base) jk@jkDL:~/dev/ctst$ ./v1
jk ...Max error: 0
(base) jk@jkDL:~/dev/ctst$

# These lines can be put in the .bashrc of the home folder to make the change permanent:

export PATH=$PATH:/usr/local/cuda-10.2/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-10.2/lib64
export CPATH=$CPATH:/usr/local/cuda-10.2/include
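After sourcing the updated .bashrc (or opening a new terminal), the setup can be checked; these two commands are standard, and given the PATH above, which nvcc should print /usr/local/cuda-10.2/bin/nvcc:

which nvcc      # prints the full path of the nvcc that will be used
nvcc --version  # prints the CUDA compiler release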


#include <iostream>
#include <math.h>

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  add<<<1, 1>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}


(base) jk@jkDL:~/dev/ctst$ nvcc jadd1.cu -o jadd1_cuda
(base) jk@jkDL:~/dev/ctst$ ./jadd1_cuda
Max error: 0
(base) jk@jkDL:~/dev/ctst$


(base) jk@jkDL:~/dev/ctst$ nvprof ./jadd1_cuda
==3023== NVPROF is profiling process 3023, command: ./jadd1_cuda
Max error: 0
==3023== Profiling application: ./jadd1_cuda
==3023== Profiling result:
Type  Time(%)  Time  Calls  Avg  Min  Max  Name
GPU activities:  100.00%  68.815ms  1  68.815ms  68.815ms  68.815ms  add(int, float*, float*)
API calls:  72.48%  183.65ms  2  91.824ms  29.083us  183.62ms  cudaMallocManaged
            27.16%  68.816ms  1  68.816ms  68.816ms  68.816ms  cudaDeviceSynchronize
             0.17%  420.79us  1  420.79us  420.79us  420.79us  cuDeviceTotalMem
             0.09%  230.58us  97  2.3770us  443ns  84.907us  cuDeviceGetAttribute
             0.08%  200.15us  2  100.08us  96.366us  103.79us  cudaFree
             0.02%  42.503us  1  42.503us  42.503us  42.503us  cudaLaunchKernel
             0.01%  19.278us  1  19.278us  19.278us  19.278us  cuDeviceGetName
             0.00%  2.8740us  3  958ns  570ns  1.7030us  cuDeviceGetCount
             0.00%  1.3380us  2  669ns  473ns  865ns  cuDeviceGet
             0.00%  596ns  1  596ns  596ns  596ns  cuDeviceGetUuid

==3023== Unified Memory profiling result:
Device "GeForce RTX 2070 (0)"
Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
40  204.80KB  64.000KB  960.00KB  8.000000MB  5.503744ms  Host To Device
20  204.80KB  64.000KB  960.00KB  4.000000MB  2.532480ms  Device To Host
12  -  -  -  -  13.38730ms  Gpu page fault groups
Total CPU Page faults: 36


Now that you’ve run a kernel with one thread that does some computation, how do you make it parallel? The key is in CUDA’s <<<1, 1>>> syntax. This is called the execution configuration, and it tells the CUDA runtime how many parallel threads to use for the launch on the GPU. There are two parameters here, but let’s start by changing the second one: the number of threads in a thread block. CUDA GPUs run kernels using blocks of threads that are a multiple of 32 in size, so 256 threads is a reasonable size to choose.

Note this point… (threads are scheduled in warps of 32, so block sizes should be a multiple of 32; the per-block maximum is 1024 threads).

Each of the N threads that execute VecAdd() performs one pair-wise addition (VecAdd() is the example kernel from the CUDA Programming Guide).

#include <iostream>
#include <math.h>

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

__global__
void addT(int n, float *x, float *y)
{
  int index = threadIdx.x;
  int stride = blockDim.x;
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  // add<<<1, 1>>>(N, x, y);
  addT<<<1, 256>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}

(base) jk@jkDL:~/dev/ctst$


(base) jk@jkDL:~/dev/ctst$ nvprof ./jaddg
==3181== NVPROF is profiling process 3181, command: ./jaddg
Max error: 0
==3181== Profiling application: ./jaddg
==3181== Profiling result:
Type  Time(%)  Time  Calls  Avg  Min  Max  Name
GPU activities:  100.00%  7.0370ms  1  7.0370ms  7.0370ms  7.0370ms  addT(int, float*, float*)
API calls:  95.57%  174.27ms  2  87.133ms  24.782us  174.24ms  cudaMallocManaged
             3.84%  7.0000ms  1  7.0000ms  7.0000ms  7.0000ms  cudaDeviceSynchronize
             0.28%  504.46us  1  504.46us  504.46us  504.46us  cuDeviceTotalMem
             0.15%  268.99us  97  2.7730us  531ns  97.974us  cuDeviceGetAttribute
             0.11%  192.40us  2  96.200us  93.803us  98.597us  cudaFree
             0.04%  79.374us  1  79.374us  79.374us  79.374us  cudaLaunchKernel
             0.01%  23.772us  1  23.772us  23.772us  23.772us  cuDeviceGetName
             0.00%  3.8410us  3  1.2800us  652ns  2.4740us  cuDeviceGetCount
             0.00%  1.6260us  2  813ns  595ns  1.0310us  cuDeviceGet
             0.00%  795ns  1  795ns  795ns  795ns  cuDeviceGetUuid

==3181== Unified Memory profiling result:
Device "GeForce RTX 2070 (0)"
Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name
40  204.80KB  64.000KB  960.00KB  8.000000MB  5.505760ms  Host To Device
20  204.80KB  64.000KB  960.00KB  4.000000MB  2.532736ms  Device To Host
12  -  -  -  -  6.191616ms  Gpu page fault groups
Total CPU Page faults: 36
(base) jk@jkDL:~/dev/ctst$


Thread Hierarchy

threadIdx is a 3-component vector, so threads can be identified using a one-, two-, or three-dimensional thread index, forming a one-, two-, or three-dimensional block of threads, called a thread block.

The thread index and the linear thread ID relate as follows: for a one-dimensional block they are the same; for a two-dimensional block of size (Dx, Dy), the thread ID of the thread with index (x, y) is (x + y*Dx); for a three-dimensional block of size (Dx, Dy, Dz), the thread ID of the thread with index (x, y, z) is (x + y*Dx + z*Dx*Dy).
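The same flattening can be computed inside a kernel. A minimal sketch (the kernel name whoAmI and the launch shape are illustrative only; device-side printf needs a cudaDeviceSynchronize() afterwards to flush its output):

#include <cstdio>

__global__
void whoAmI(void)
{
  // Thread ID per the formulas above: x + y*Dx + z*Dx*Dy,
  // with (Dx, Dy, Dz) = blockDim
  int tid = threadIdx.x
          + threadIdx.y * blockDim.x
          + threadIdx.z * blockDim.x * blockDim.y;
  printf("thread (%d,%d,%d) -> ID %d\n",
         threadIdx.x, threadIdx.y, threadIdx.z, tid);
}

int main(void)
{
  dim3 threadsPerBlock(4, 2, 2); // 16 threads in a 3D block
  whoAmI<<<1, threadsPerBlock>>>();
  cudaDeviceSynchronize();
  return 0;
}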


The Programming Guide's matching 2D example, MatAdd(), adds two matrices A and B of size NxN and stores the result into matrix C; a sketch follows. After that, the full vector-add program below moves on to a grid of many blocks.
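A minimal sketch of that example in the style of the listings here, using unified memory; N is deliberately small because a single block must hold all N*N threads (so N*N <= 1024):

#include <iostream>

#define N 16 // one block of N*N threads, so N*N must be <= 1024

// Each thread adds one element pair, selected by its 2D thread index
__global__
void MatAdd(float (*A)[N], float (*B)[N], float (*C)[N])
{
  int i = threadIdx.x;
  int j = threadIdx.y;
  C[i][j] = A[i][j] + B[i][j];
}

int main(void)
{
  float (*A)[N], (*B)[N], (*C)[N];
  cudaMallocManaged(&A, N*N*sizeof(float));
  cudaMallocManaged(&B, N*N*sizeof(float));
  cudaMallocManaged(&C, N*N*sizeof(float));

  for (int i = 0; i < N; i++)
    for (int j = 0; j < N; j++) {
      A[i][j] = 1.0f;
      B[i][j] = 2.0f;
    }

  dim3 threadsPerBlock(N, N); // 2D block: N x N x 1 threads
  MatAdd<<<1, threadsPerBlock>>>(A, B, C);
  cudaDeviceSynchronize();

  std::cout << "C[0][0] = " << C[0][0] << std::endl; // expect 3

  cudaFree(A);
  cudaFree(B);
  cudaFree(C);
  return 0;
}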

#include <iostream>
#include <math.h>

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
  for (int i = 0; i < n; i++)
    y[i] = x[i] + y[i];
}

__global__
void addT(int n, float *x, float *y)
{
  int index = threadIdx.x;
  int stride = blockDim.x;
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

__global__
void addBT(int n, float *x, float *y)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  for (int i = index; i < n; i += stride)
    y[i] = x[i] + y[i];
}

int main(void)
{
  int N = 1<<20;
  float *x, *y;

  // Allocate Unified Memory – accessible from CPU or GPU
  cudaMallocManaged(&x, N*sizeof(float));
  cudaMallocManaged(&y, N*sizeof(float));

  // initialize x and y arrays on the host
  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  // Run kernel on 1M elements on the GPU
  // (earlier experiments, kept for reference)
  // add<<<1, 1>>>(N, x, y);
  // add<<<1, 256>>>(N, x, y);
  // add<<<6, 31>>>(N, x, y);
  // add<<<14, 32>>>(N, x, y);
  //
  // addT<<<14, 32>>>(N, x, y);
  // addT<<<1, 32>>>(N, x, y);
  // addT<<<2, 32>>>(N, x, y);

  int blockSize = 256;
  int numBlocks = (N + blockSize - 1) / blockSize;
  addBT<<<numBlocks, blockSize>>>(N, x, y);

  // Wait for GPU to finish before accessing on host
  cudaDeviceSynchronize();

  // Check for errors (all values should be 3.0f)
  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmax(maxError, fabs(y[i]-3.0f));
  std::cout << "Max error: " << maxError << std::endl;

  // Free memory
  cudaFree(x);
  cudaFree(y);

  return 0;
}
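The round-up division is worth spelling out: with N = 1<<20 = 1,048,576 and blockSize = 256, numBlocks = (1,048,576 + 255) / 256 = 4096, so the grid launches 4096 * 256 = 1,048,576 threads, one per element. The + blockSize - 1 term only matters when N is not an exact multiple of the block size, and the grid-stride loop in addBT (the i < n condition) keeps any surplus threads from writing past the end of the arrays.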


(base) jk@jkDL:~/dev/ctst$ nvprof ./jaddg
==3916== NVPROF is profiling process 3916, command: ./jaddg
Max error: 0
==3916== Profiling application: ./jaddg
==3916== Profiling result:
Type  Time(%)  Time  Calls  Avg  Min  Max  Name
GPU activities:  100.00%  7.9952ms  1  7.9952ms  7.9952ms  7.9952ms  addBT(int, float*, float*)
API calls:  95.18%  174.69ms  2  87.343ms  22.796us  174.66ms  cudaMallocManaged
             4.36%  7.9976ms  1  7.9976ms  7.9976ms  7.9976ms  cudaDeviceSynchronize
             0.21%  392.74us  1  392.74us  392.74us  392.74us  cuDeviceTotalMem
             0.11%  200.33us  97  2.0650us  384ns  73.144us  cuDeviceGetAttribute
             0.11%  194.54us  2  97.269us  96.630us  97.908us  cudaFree
             0.02%  37.817us  1  37.817us  37.817us  37.817us  cudaLaunchKernel
             0.01%  18.009us  1  18.009us  18.009us  18.009us  cuDeviceGetName
             0.00%  2.8130us  3  937ns  517ns  1.7770us  cuDeviceGetCount
             0.00%  1.1850us  2  592ns  476ns  709ns  cuDeviceGet
             0.00%  506ns  1  506ns  506ns  506ns  cuDeviceGetUuid
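Comparing the profiles above: the kernel time falls from 68.815 ms for add<<<1, 1>>> to 7.0370 ms for addT<<<1, 256>>>, while addBT with 4096 blocks of 256 threads measures 7.9952 ms here rather than dropping further. The Unified Memory sections, where shown, report roughly 12 MB of host/device migrations plus several milliseconds of GPU page fault time per run, which suggests memory migration rather than arithmetic has become the limiting factor at this point.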