Hi
I'm trying to make DAV work with NVIDIA CUDA.
My approach is to compile a piece of CUDA C code into a shared library and hook it up as a normal DAV service, so that the CUDA code is completely encapsulated.
When I compile my code in "device emulation" mode (meaning it gets executed on the CPU) it works fine. But when I target a GPU, problems start.
I start the broker and the server, status shows my cuda library is running:
[root@localhost bin]# ./davServiceBroker -d &
[1] 4947
[root@localhost bin]# ./dav start
[root@localhost bin]# ./dav status
Service Port Status
Library 12300 Running
blas 12301 Running
cudaDavApp 12303 Running
I then launch a test app on the client (properly virtualized) and it fails, throwing a std::string exception with no text (empty string). On the server side, this error gets logged as:
[ERR] 4991-3087718096 : WB-Error in executing the request:
<0>:SIGNAL_IN_LIBRARY:Severe error in library (cudaDavApp) fuction (vectorAdd). Error code: 10000
After this, the process "davService cudaDavApp" hangs and doesn't get properly killed by ./davServiceBroker -s (I have to do this manually).
My CUDA code is here:
#include <stdio.h>

#include "cudaDavApp.h"
/*
 * Element-wise vector addition: result[i] = a[i] + b[i] for 0 <= i < size.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per element. Threads
 * whose flat index falls at or beyond `size` do nothing, so the grid may be
 * rounded up to a whole number of blocks. Uses no shared memory.
 */
__global__ void vectorAddKernel(float a[], float b[], float result[], int size)
{
    /* Flat global index of this thread within the 1-D launch. */
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    /* Tail threads of the last block have no element to process. */
    if (idx >= size)
        return;

    result[idx] = a[idx] + b[idx];
}
#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns 1 when `status` is an error, 0 when it is cudaSuccess.
 */
static int vectorAddCudaFailed(cudaError_t status, const char *step)
{
    if (status == cudaSuccess)
        return 0;
    fprintf(stderr, "vectorAdd: %s failed: %s\n", step, cudaGetErrorString(status));
    return 1;
}

/*
 * Computes output[i] = a[i] + b[i] for 0 <= i < size on the GPU.
 *
 * a, b    host input arrays of at least `size` floats
 * output  host array of at least `size` floats that receives the sums
 * size    number of elements; values <= 0 make the call a no-op
 *
 * The signature returns void, so failures cannot be handed back to the
 * caller. Instead every CUDA runtime call is checked, the first failure is
 * logged to stderr together with the CUDA error string, the remaining steps
 * are skipped, and all device buffers are released. After a logged failure
 * the contents of `output` must not be relied upon.
 */
void vectorAdd(float a[], float b[], float output[], int size)
{
    const int blockSize = 64;
    float *da = 0, *db = 0, *dout = 0;
    size_t bytes;
    int gridSize;

    /* A launch with zero blocks is an invalid configuration, and a negative
       size would turn into a huge allocation request; treat both as "nothing
       to do". */
    if (size <= 0)
        return;

    if (a == 0 || b == 0 || output == 0) {
        fprintf(stderr, "vectorAdd: null host buffer passed\n");
        return;
    }

    bytes = sizeof(float) * (size_t)size;

    /* Ceiling division written so it cannot overflow for large `size`. */
    gridSize = size / blockSize + ((size % blockSize) != 0 ? 1 : 0);

    if (vectorAddCudaFailed(cudaMalloc((void**) &da, bytes), "cudaMalloc(a)"))
        goto cleanup;
    if (vectorAddCudaFailed(cudaMalloc((void**) &db, bytes), "cudaMalloc(b)"))
        goto cleanup;
    if (vectorAddCudaFailed(cudaMalloc((void**) &dout, bytes), "cudaMalloc(output)"))
        goto cleanup;

    if (vectorAddCudaFailed(cudaMemcpy(da, a, bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(a, host->device)"))
        goto cleanup;
    if (vectorAddCudaFailed(cudaMemcpy(db, b, bytes, cudaMemcpyHostToDevice),
                            "cudaMemcpy(b, host->device)"))
        goto cleanup;

    vectorAddKernel<<<gridSize, blockSize>>>(da, db, dout, size);

    /* A kernel launch returns nothing; launch/configuration errors are only
       visible through cudaGetLastError(). */
    if (vectorAddCudaFailed(cudaGetLastError(), "vectorAddKernel launch"))
        goto cleanup;

    /* This blocking copy waits for the kernel, so an error raised while the
       kernel was executing is reported by this call. */
    if (vectorAddCudaFailed(cudaMemcpy(output, dout, bytes, cudaMemcpyDeviceToHost),
                            "cudaMemcpy(output, device->host)"))
        goto cleanup;

cleanup:
    /* cudaFree(0) is a no-op, so this is safe after a partial allocation. */
    cudaFree(da);
    cudaFree(db);
    cudaFree(dout);
}

#if defined(__cplusplus)
}
#endif
cudaDavApp.h is:
/*
 * Public interface of the cudaDavApp library.
 *
 * vectorAdd is declared with C linkage when this header is compiled as C++,
 * so the exported symbol name is not mangled.
 *
 * NOTE(review): the IBMDAV-tagged comment below looks like an annotation
 * consumed by the DAV tooling (it marks a and b as inputs and output as
 * in/out, each with `size` elements) - presumably it must stay exactly as
 * written; confirm before editing it.
 */
#if defined(__cplusplus)
extern "C" {
#endif
/**IBMDAV* @function vectorAdd
@param[in] a @dimensions [size]
@param[in] b @dimensions [size]
@param[inout] output @dimensions [size]
*/
void vectorAdd(float a[], float b[], float output[], int size);
#if defined(__cplusplus)
};
#endif
When I compile this as:
nvcc cudaDavApp.cu -o libcudaDavApp.so -shared -Xcompiler '-fPIC' -deviceemu
everything works (I can call vectorAdd from my client-side app and get proper results)
When I drop the -deviceemu flag, I get the error reported above.
I confirmed that CUDA works on my system and that the code itself can run on the GPU without problems.
I realize this is not a CUDA forum and I don't expect CUDA-specific insight, but knowing what this DAV error means might point me in the right direction.