The project consists of .cu files and two header files. A CMakeLists.txt was created and CMake was used to configure the build. CMake generated the Makefile successfully, but make failed with a permission error when it tried to invoke the CUDA compiler.


This issue needed to be resolved. Although CMake apparently worked, it did not generate a usable Makefile, so the problem in CMakeLists.txt had to be found and fixed. That effort is discussed in the following slides. The .bashrc file was edited and the CUDA path settings were added. After all this, the following commands worked successfully:


mat/bd$ cmake ..

mat/bd$ make


The matrix multiplication was then executed on the GPU. Everything went well, and all of this is discussed in the upcoming slides.

Note: cmake and make were both used successfully.


Resolved. I had an extra space after the CUDA_PATH value; the correct configuration is as follows, for future reference:

Someone gave this advice, and I followed it:

# Root of the CUDA toolkit installation -- change this to match your system.
# Keep this comment on its own line: make does not strip the whitespace that
# precedes a trailing '#', so "VALUE # comment" leaves a trailing space in
# CUDA_PATH and breaks every path derived from it below.
CUDA_PATH = /usr/local/cuda-9.0
#CUDA_PATH = /opt/sw/packages/cuda/7.5

# Locations derived from CUDA_PATH.
CUDA_INC_PATH = $(CUDA_PATH)/include
CUDA_BIN_PATH = $(CUDA_PATH)/bin
CUDA_LIB_PATH = $(CUDA_PATH)/lib64

# CUDA runtime and math libraries to link against.
LDFLAGS1 = -L$(CUDA_LIB_PATH) -lcudart -lcublas -lcurand -lcusparse

# Compiler flags. Each '\' must be the last character on its line and the
# continuation must follow immediately (no blank line in between), otherwise
# the -gencode options are not part of CFLAGS1.
CFLAGS1 = -Isrc/com -Isrc/data -Isrc/nnet -D__AZ_SMAT_SINGLE__ -D__AZ_GPU__ -I$(CUDA_INC_PATH) -O2 \
          -gencode arch=compute_30,code=sm_30 \
          -gencode arch=compute_32,code=sm_32 \
          -gencode arch=compute_35,code=sm_35 \
          -gencode arch=compute_37,code=sm_37 \
          -gencode arch=compute_50,code=sm_50 \
          -gencode arch=compute_52,code=sm_52 \
          -gencode arch=compute_53,code=sm_53

I had to remove the lines containing "arch=compute_20".


Remove the spaces in these settings, add them to .bashrc, and then reboot the POWER9 machine.

Test the .bashrc by typing:

exec .bashrc

These items were reported as not found in the installation while testing the .bashrc using:

exec .bashrc

This part was therefore removed from .bashrc.


CMakeLists.txt

The following CMakeLists.txt worked for the matrix multiplication project:


#https://stackoverflow.com/questions/36551469/triggering-c11-support-in-nvcc-with-cmake

cmake_minimum_required(VERSION 3.10.2)

# Declare the project before enabling languages or creating targets.
project(DLtrain LANGUAGES CXX)

# Solution: keep this line enabled so CMake locates nvcc itself (via PATH or
# the CUDA settings exported in .bashrc). Note that CMake comments start with
# '#'; a trailing '// ...' on a command line is a syntax error.
enable_language(CUDA)

# Solution: keep this line commented out. CMAKE_CUDA_COMPILER has to name the
# nvcc executable, not the toolkit directory; pointing it at a directory is
# presumably what made make fail with a permission error when it tried to
# run the "compiler".
#set(CMAKE_CUDA_COMPILER "/usr/local/cuda-10.2/")

# Build the DLtrain executable from the two CUDA sources.
add_executable(DLtrain ../matsrc/jkernel.cu ../matsrc/jmatMul.cu)

# Place the resulting binary in <source dir>/bin.
set_target_properties(DLtrain PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)

# Compile the CUDA sources as C++11.
set_property(TARGET DLtrain PROPERTY CUDA_STANDARD 11)


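I had originally added the set(CMAKE_CUDA_COMPILER ...) line and commented out the enable_language(CUDA) line before it, but that caused the problem described two pages earlier. The CUDA path is therefore set in .bashrc instead; the set(CMAKE_CUDA_COMPILER ...) line is now commented out and the earlier enable_language(CUDA) line is uncommented.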


/usr/local/cuda-10.2/

The CUDA SDK is installed in this folder.


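The objective is to create a function that multiplies matrices on the GPU. The call is made from the CPU, with data generated on the CPU. This project includes jkernel.cu, jkernel.h, jmatArray.h and jmatMul.cu (the #include directives in the listings refer to them as kernel.cu, kernel.h and dev_array.h, so the names on disk must match whichever convention is used).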



file Name : jkernel.cu

// May 23 2020 //

#include <math.h>

#include <iostream>

#include "cuda_runtime.h"

#include "kernel.h"

#include <stdlib.h>


using namespace std;


/**
 * Kernel: computes C = A * B for square N x N matrices stored row-major.
 *
 * Launch layout: 2D grid of 2D blocks. The thread's y coordinate selects the
 * output row and its x coordinate selects the output column, so each in-range
 * thread produces exactly one element of C. The grid is allowed to over-cover
 * the matrix; threads that fall outside N x N do nothing.
 *
 * @param A  left operand, N*N floats, dereferenced on the device
 * @param B  right operand, N*N floats, dereferenced on the device
 * @param C  output, N*N floats, written on the device
 * @param N  matrix dimension
 */
__global__ void matrixMultiplicationKernel(float* A, float* B, float* C, int N) {
    int ROW = blockIdx.y * blockDim.y + threadIdx.y;
    int COL = blockIdx.x * blockDim.x + threadIdx.x;

    // The guard must cover the store as well as the loads: threads past the
    // matrix edge would otherwise write outside C.
    if (ROW < N && COL < N) {
        float tmpSum = 0.0f;
        for (int i = 0; i < N; i++) {
            tmpSum += A[ROW * N + i] * B[i * N + COL];
        }
        C[ROW * N + COL] = tmpSum;
    }
}

/**
 * Host launcher for matrixMultiplicationKernel.
 *
 * Uses a fixed 16 x 16 thread block (256 threads, within the per-block
 * thread limit) and enough blocks in each dimension to cover N x N. The
 * launch is asynchronous; the caller must synchronise before reading C.
 * A failed launch is reported on stderr.
 *
 * @param A  device pointer to the left operand (N*N floats)
 * @param B  device pointer to the right operand (N*N floats)
 * @param C  device pointer to the output (N*N floats)
 * @param N  matrix dimension; nothing is launched when N <= 0
 */
void matrixMultiplication(float *A, float *B, float *C, int N){
    if (N <= 0) {
        return;  // empty problem: a zero-sized grid would be an invalid launch
    }

    const int kTile = 16;
    const int tilesPerSide = (N + kTile - 1) / kTile;  // ceil(N / kTile)

    dim3 threadsPerBlock(kTile, kTile);
    dim3 blocksPerGrid(tilesPerSide, tilesPerSide);

    matrixMultiplicationKernel<<<blocksPerGrid, threadsPerBlock>>>(A, B, C, N);

    // Kernel launches do not return a status; configuration errors are only
    // visible through cudaGetLastError().
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) {
        std::cerr << "matrixMultiplication: kernel launch failed: "
                  << cudaGetErrorString(launchStatus) << std::endl;
    }
}




file Name : jkernel.h

// May 23 2020 //


#ifndef KERNEL_CUH_

#define KERNEL_CUH_


// Host-side entry point that launches the GPU matrix-multiplication kernel
// for N x N matrices (C = A * B, row-major indexing in the kernel).
// A, B and C are forwarded unchanged to the kernel, which dereferences them
// on the device, so they must be device pointers to N*N floats each.
void matrixMultiplication(float *A, float *B, float *C, int N);


#endif


file Name : jmatArray.h

// May 23 2020 //


#ifndef _DEV_ARRAY_H_

#define _DEV_ARRAY_H_


#include <stdexcept>

#include <algorithm>

#include <cuda_runtime.h>


/**
 * Owning wrapper around a contiguous array of T in CUDA device memory.
 *
 * The buffer is obtained with cudaMalloc and released with cudaFree in the
 * destructor. Because the object owns a raw device pointer, copying is
 * disabled: an implicit copy would make two objects free the same buffer.
 */
template <class T>
class dev_array
{
public:
    // Creates an empty array that owns no device memory.
    explicit dev_array()
        : start_(nullptr),
          end_(nullptr)
    {}

    // Allocates room for `size` elements on the device.
    // Throws std::runtime_error if the allocation fails.
    explicit dev_array(size_t size)
        : start_(nullptr),
          end_(nullptr)
    {
        allocate(size);
    }

    // Releases the device buffer, if any.
    ~dev_array()
    {
        free();
    }

    // Non-copyable: the device buffer has a single owner.
    dev_array(const dev_array&) = delete;
    dev_array& operator=(const dev_array&) = delete;

    // Discards the current contents and allocates `size` fresh elements.
    // Throws std::runtime_error if the allocation fails; the array is then
    // left empty.
    void resize(size_t size)
    {
        free();
        allocate(size);
    }

    // Number of elements currently allocated.
    size_t getSize() const
    {
        return end_ - start_;
    }

    // Raw device pointer to the first element (null when empty).
    const T* getData() const
    {
        return start_;
    }

    T* getData()
    {
        return start_;
    }

    // Copies up to `size` elements from host memory `src` to the device.
    // At most getSize() elements are copied.
    // Throws std::runtime_error if the copy fails.
    void set(const T* src, size_t size)
    {
        const size_t count = std::min(size, getSize());
        const cudaError_t result =
            cudaMemcpy(start_, src, count * sizeof(T), cudaMemcpyHostToDevice);
        if (result != cudaSuccess)
        {
            throw std::runtime_error("failed to copy to device memory");
        }
    }

    // Copies up to `size` elements from the device to host memory `dest`.
    // At most getSize() elements are copied.
    // Throws std::runtime_error if the copy fails.
    void get(T* dest, size_t size)
    {
        const size_t count = std::min(size, getSize());
        const cudaError_t result =
            cudaMemcpy(dest, start_, count * sizeof(T), cudaMemcpyDeviceToHost);
        if (result != cudaSuccess)
        {
            throw std::runtime_error("failed to copy to host memory");
        }
    }

private:
    // Allocates `size` elements on the device and records the range.
    // On failure both pointers are reset and std::runtime_error is thrown.
    void allocate(size_t size)
    {
        const cudaError_t result = cudaMalloc((void**)&start_, size * sizeof(T));
        if (result != cudaSuccess)
        {
            start_ = end_ = nullptr;
            throw std::runtime_error("failed to allocate device memory");
        }
        end_ = start_ + size;
    }

    // Releases the device buffer (no-op when empty). The cudaFree status is
    // not inspected because this is also called from the destructor.
    void free()
    {
        if (start_ != nullptr)
        {
            cudaFree(start_);
            start_ = end_ = nullptr;
        }
    }

    T* start_;  // first element of the device buffer, or null
    T* end_;    // one past the last element, or null
};


#endif




File name: jmatMul.cu

// May 23 2020 //

#include <iostream>

#include <vector>

#include <stdlib.h>

#include <time.h>

#include <cuda_runtime.h>

#include "kernel.h"

#include "kernel.cu"

#include "dev_array.h"

#include <math.h>


using namespace std;


/**
 * Test driver: multiplies two N x N matrices on the GPU, repeats the product
 * on the CPU and prints the accumulated absolute difference between the two.
 *
 * Returns 0 on success and 1 if the GPU computation reports an error.
 *
 * NOTE(review): this file #includes "kernel.cu" directly. If the kernel
 * source is also compiled as its own translation unit (as the CMake target
 * lists both .cu files), matrixMultiplication ends up defined twice --
 * confirm which of the two is intended.
 */
int main()
{
    // Perform matrix multiplication C = A*B
    // where A, B and C are NxN matrices
    const int N = 16;
    const int SIZE = N * N;

    // Host-side operands and result buffer.
    vector<float> h_A(SIZE);
    vector<float> h_B(SIZE);
    vector<float> h_C(SIZE);

    // Fill the operands: every row of A is sin(row), every column of B is cos(col).
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            h_A[i * N + j] = sin(i);
            h_B[i * N + j] = cos(j);
        }
    }

    // Device-side buffers; released automatically when they go out of scope.
    dev_array<float> d_A(SIZE);
    dev_array<float> d_B(SIZE);
    dev_array<float> d_C(SIZE);

    d_A.set(&h_A[0], SIZE);
    d_B.set(&h_B[0], SIZE);

    matrixMultiplication(d_A.getData(), d_B.getData(), d_C.getData(), N);

    // The launch is asynchronous: errors raised while the kernel runs only
    // surface at the next synchronising call, so check its status here.
    cudaError_t syncStatus = cudaDeviceSynchronize();
    if (syncStatus != cudaSuccess) {
        cerr << "GPU matrix multiplication failed: "
             << cudaGetErrorString(syncStatus) << endl;
        return 1;
    }

    // cudaMemcpy inside get() blocks until the copy is done, so no further
    // synchronisation is needed afterwards.
    d_C.get(&h_C[0], SIZE);

    // Reference result computed on the CPU. A vector is used so the buffer
    // is released automatically.
    vector<float> cpu_C(SIZE);
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++) {
            float sum = 0.f;
            for (int n = 0; n < N; n++) {
                sum += h_A[row * N + n] * h_B[n * N + col];
            }
            cpu_C[row * N + col] = sum;
        }
    }

    // Accumulate the absolute element-wise difference. Summing signed
    // differences would let errors of opposite sign cancel out and hide a
    // wrong result.
    double err = 0;
    for (int k = 0; k < SIZE; k++) {
        err += fabs(double(cpu_C[k]) - double(h_C[k]));
    }

    cout << "Error: " << err << endl;

    return 0;
}