There are two .cu files and two header files. A CMakeLists.txt was created and cmake was used. cmake was able to generate a Makefile successfully, but make did not run: it failed with a permission error when invoking the CUDA compiler.
This issue needed to be resolved. Although cmake appeared to work well, it did not produce a usable Makefile, so the issue had to be found in CMakeLists.txt and fixed there. That effort is discussed in the following slides. The .bashrc file was edited and the CUDA path setup was added. After all of this, the following worked successfully:
mat/bd$ cmake ..
mat/bd$ make
Matrix multiplication was then executed on the GPU. Things went well, and all of this is discussed in the upcoming slides.
Note that cmake and make were both put to use successfully.
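For future reference, the full out-of-source build sequence looks roughly like this (assuming bd is a build directory created under mat, and that the binary lands in mat/bin as set in the CMakeLists.txt shown later):
cd mat
mkdir -p bd && cd bd   # create and enter the build directory
cmake ..               # generate the Makefile from ../CMakeLists.txt
make                   # build (plain "make", not "make ..")
../bin/DLtrain         # run the executable produced in mat/bin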
Resolved. I had an extra space after the CUDA_PATH value; the correct configuration is as follows, for future reference.
Someone gave me this advice and I followed it up:
CUDA_PATH = /usr/local/cuda-9.0/ # <= Change this
#CUDA_PATH = /opt/sw/packages/cuda/7.5
CUDA_INC_PATH = $(CUDA_PATH)/include
CUDA_BIN_PATH = $(CUDA_PATH)/bin
CUDA_LIB_PATH = $(CUDA_PATH)/lib64
LDFLAGS1 = -L$(CUDA_LIB_PATH) -lcudart -lcublas -lcurand -lcusparse
CFLAGS1 = -Isrc/com -Isrc/data -Isrc/nnet -D__AZ_SMAT_SINGLE__ -D__AZ_GPU__ -I$(CUDA_INC_PATH) -O2 \
-gencode arch=compute_30,code=sm_30 \
-gencode arch=compute_32,code=sm_32 \
-gencode arch=compute_35,code=sm_35 \
-gencode arch=compute_37,code=sm_37 \
-gencode arch=compute_50,code=sm_50 \
-gencode arch=compute_52,code=sm_52 \
-gencode arch=compute_53,code=sm_53
I had to remove the lines containing "arch=compute_20", since support for sm_20 (Fermi) was dropped in CUDA 9 and nvcc rejects those flags.
Remove the spaces around the "=" signs (bash assignments must not contain spaces), add these lines to .bashrc, and then reboot the POWER9 machine.
Test .bashrc by reloading it:
source ~/.bashrc
While testing .bashrc this way, some entries referred to items that are not present in this installation.
That part was removed from .bashrc.
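For reference, the CUDA entries added to .bashrc typically look like the following (a minimal sketch, assuming the toolkit lives in /usr/local/cuda-10.2 as noted later; adjust the version directory to match your installation):
# CUDA toolkit paths; note there are no spaces around "=" in bash assignments
export PATH=/usr/local/cuda-10.2/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH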
CMakeLists.txt
The following worked for matrix multiplication:
# https://stackoverflow.com/questions/36551469/triggering-c11-support-in-nvcc-with-cmake
cmake_minimum_required(VERSION 3.10.2)
project(DLtrain)   # added so CMake does not warn about a missing project() call
enable_language(CUDA)   # solution: uncomment this line
#set(CMAKE_CUDA_COMPILER "/usr/local/cuda-10.2/")   # solution: comment this line out
add_executable(DLtrain ../matsrc/jkernel.cu ../matsrc/jmatMul.cu)
set_target_properties(DLtrain PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/bin)
set_property(TARGET DLtrain PROPERTY CUDA_STANDARD 11)
I had originally commented out enable_language(CUDA) and set CMAKE_CUDA_COMPILER instead, but that caused the problem described two pages earlier: CMAKE_CUDA_COMPILER must point to the nvcc binary itself, not to the installation directory. So the CUDA path was set in .bashrc instead, the set(CMAKE_CUDA_COMPILER ...) line was commented out, and enable_language(CUDA) was uncommented again.
The CUDA SDK is installed in /usr/local/cuda-10.2/.
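With the path set in .bashrc, the toolkit can be sanity-checked from the shell before rerunning cmake:
which nvcc      # should print /usr/local/cuda-10.2/bin/nvcc
nvcc --version  # should report release 10.2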
The objective is to create a function that multiplies matrices on the GPU, while the call is made from the CPU with data generated on the CPU. This project includes jkernel.cu, jkernel.h, jmatArray.h, and jmatMul.cu.
File name: jkernel.cu
// May 23 2020 //
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"
#include "kernel.h"
#include <stdlib.h>
using namespace std;
__global__ void matrixMultiplicationKernel(float* A, float* B, float* C, int N) {
    int ROW = blockIdx.y*blockDim.y+threadIdx.y;
    int COL = blockIdx.x*blockDim.x+threadIdx.x;
    float tmpSum = 0;
    if (ROW < N && COL < N) {
        // each thread computes one element of the block sub-matrix
        for (int i = 0; i < N; i++) {
            tmpSum += A[ROW * N + i] * B[i * N + COL];
        }
        // write inside the bounds check so out-of-range threads do not store past the end of C
        C[ROW * N + COL] = tmpSum;
    }
}
void matrixMultiplication(float *A, float *B, float *C, int N){
    // declare the number of blocks per grid and the number of threads per block;
    // a block may hold at most 1024 threads in total, so cap it at 16x16 = 256
    dim3 threadsPerBlock(N, N);
    dim3 blocksPerGrid(1, 1);
    if (N*N > 512){
        threadsPerBlock.x = 16;
        threadsPerBlock.y = 16;
        blocksPerGrid.x = ceil(double(N)/double(threadsPerBlock.x));
        blocksPerGrid.y = ceil(double(N)/double(threadsPerBlock.y));
    }
    matrixMultiplicationKernel<<<blocksPerGrid,threadsPerBlock>>>(A, B, C, N);
}
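A kernel launch that fails (for example, because the block size exceeds the hardware limit) does so silently unless it is checked. A minimal sketch of how the launch in matrixMultiplication could be checked, using standard CUDA runtime calls:
matrixMultiplicationKernel<<<blocksPerGrid,threadsPerBlock>>>(A, B, C, N);
cudaError_t launchErr = cudaGetLastError();  // reports invalid launch configurations
if (launchErr != cudaSuccess) {
    std::cerr << "kernel launch failed: " << cudaGetErrorString(launchErr) << std::endl;
}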
File name: jkernel.h
// May 23 2020 //
#ifndef KERNEL_CUH_
#define KERNEL_CUH_
void matrixMultiplication(float *A, float *B, float *C, int N);
#endif
File name: jmatArray.h
// May 23 2020 //
#ifndef _DEV_ARRAY_H_
#define _DEV_ARRAY_H_
#include <stdexcept>
#include <algorithm>
#include <cuda_runtime.h>
template <class T>
class dev_array
{
// public functions
public:
    explicit dev_array()
        : start_(0),
          end_(0)
    {}
    // constructor
    explicit dev_array(size_t size)
    {
        allocate(size);
    }
    // destructor
    ~dev_array()
    {
        free();
    }
    // resize the vector
    void resize(size_t size)
    {
        free();
        allocate(size);
    }
    // get the size of the array
    size_t getSize() const
    {
        return end_ - start_;
    }
    // get data
    const T* getData() const
    {
        return start_;
    }
    T* getData()
    {
        return start_;
    }
    // set
    void set(const T* src, size_t size)
    {
        size_t min = std::min(size, getSize());
        cudaError_t result = cudaMemcpy(start_, src, min * sizeof(T), cudaMemcpyHostToDevice);
        if (result != cudaSuccess)
        {
            throw std::runtime_error("failed to copy to device memory");
        }
    }
    // get
    void get(T* dest, size_t size)
    {
        size_t min = std::min(size, getSize());
        cudaError_t result = cudaMemcpy(dest, start_, min * sizeof(T), cudaMemcpyDeviceToHost);
        if (result != cudaSuccess)
        {
            throw std::runtime_error("failed to copy to host memory");
        }
    }
// private functions
private:
    // allocate memory on the device
    void allocate(size_t size)
    {
        cudaError_t result = cudaMalloc((void**)&start_, size * sizeof(T));
        if (result != cudaSuccess)
        {
            start_ = end_ = 0;
            throw std::runtime_error("failed to allocate device memory");
        }
        end_ = start_ + size;
    }
    // free memory on the device
    void free()
    {
        if (start_ != 0)
        {
            cudaFree(start_);
            start_ = end_ = 0;
        }
    }
    T* start_;
    T* end_;
};
#endif
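To show how dev_array is used on its own, here is a minimal sketch (the function name example and the buffer name host are hypothetical):
#include <vector>
#include "jmatArray.h"

void example()
{
    std::vector<float> host(256, 1.0f);   // host-side buffer
    dev_array<float> d(host.size());      // allocates 256 floats on the device
    d.set(host.data(), host.size());      // copy host -> device
    d.get(host.data(), host.size());      // copy device -> host
}   // device memory is freed automatically when d goes out of scope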
File name: jmatMul.cu
// May 23 2020 //
#include <iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>
#include "kernel.h"
#include "kernel.cu"
#include "dev_array.h"
#include <math.h>
using namespace std;
int main()
{
    // Perform matrix multiplication C = A*B
    // where A, B and C are NxN matrices
    int N = 16;
    int SIZE = N*N;
    // Allocate memory on the host
    vector<float> h_A(SIZE);
    vector<float> h_B(SIZE);
    vector<float> h_C(SIZE);
    // Initialize matrices on the host
    for (int i=0; i<N; i++){
        for (int j=0; j<N; j++){
            h_A[i*N+j] = sin(i);
            h_B[i*N+j] = cos(j);
        }
    }
    // Allocate memory on the device
    dev_array<float> d_A(SIZE);
    dev_array<float> d_B(SIZE);
    dev_array<float> d_C(SIZE);
    d_A.set(&h_A[0], SIZE);
    d_B.set(&h_B[0], SIZE);
    matrixMultiplication(d_A.getData(), d_B.getData(), d_C.getData(), N);
    cudaDeviceSynchronize();
    d_C.get(&h_C[0], SIZE);
    cudaDeviceSynchronize();
    float *cpu_C;
    cpu_C = new float[SIZE];
    // Now do the matrix multiplication on the CPU
    float sum;
    for (int row=0; row<N; row++){
        for (int col=0; col<N; col++){
            sum = 0.f;
            for (int n=0; n<N; n++){
                sum += h_A[row*N+n]*h_B[n*N+col];
            }
            cpu_C[row*N+col] = sum;
        }
    }
    double err = 0;
    // Check the result: accumulate the absolute element-wise difference,
    // so that positive and negative errors cannot cancel out
    for (int ROW=0; ROW < N; ROW++){
        for (int COL=0; COL < N; COL++){
            err += fabs(cpu_C[ROW * N + COL] - h_C[ROW * N + COL]);
        }
    }
    cout << "Error: " << err << endl;
    delete[] cpu_C;
    return 0;
}
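To double-check the sources independently of CMake, a direct nvcc build should also work (a sketch, assuming all four files sit in one directory; adjust -arch to match your GPU):
nvcc -std=c++11 -arch=sm_35 jkernel.cu jmatMul.cu -o DLtrain
./DLtrain   # prints the accumulated error, which should be close to zero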