OpenCV On CUDA

These are my takeaways from this video and its accompanying slides, which cover running OpenCV on a CUDA GPU. The seminar is based on OpenCV 2.4; the API in OpenCV 3.x is different. A few points worth noting before getting into the code:

  1. GpuMat is a padded image container; its upload/download operations are described on pages 15-17. Page 20 gives an example of template matching.

  2. Concurrent operation with CUDA is described on pages 24-28. CudaMem is page-locked CPU memory used for asynchronous data transfers. Stream is OpenCV's wrapper around the CUDA stream type cudaStream_t. It represents an asynchronous queue of operations, which frees up the CPU while the GPU works.

  3. The following is an example of converting a GpuMat to a common GPU format and writing a custom CUDA kernel to swap the R and B channels of the image.

swap_rb.cpp

#include <opencv2/gpu/stream_accessor.hpp>

// Forward declaration of the kernel launcher defined in swap_rb.cu.
// The signature must match that definition exactly so the host file links
// against it: a sized source view, an unsized destination view, and the raw
// CUDA stream to launch on.
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream);

void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())  // convert GpuMat to cuda format to process
{
    CV_Assert(src.type() == CV_8UC3);
    dst.create(src.size(), src.type()); // create if not allocated yet
    cudaStream_t s = StreamAccessor::getStream(stream);
    swap_rb_caller(src, dst, s);
}

swap_rb.cu

#include <opencv2/core/cuda_devptrs.hpp>
#include <opencv2/gpu/device/common.hpp>

/**
 * Writes each source pixel to the destination with its first and third
 * components exchanged (x <-> z), leaving the middle component in place.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per pixel. The thread's
 * x index selects the column and its y index selects the row. Threads that
 * land outside the source image do nothing, so the grid may overshoot the
 * image dimensions.
 *
 * @param src  source image view; its cols/rows bound the work
 * @param dst  destination image view, written at the same (y, x) that is
 *             read from src. Only src's dimensions are checked here, so the
 *             caller must provide a dst at least as large as src.
 */
__global__ void swap_rb_kernel(const PtrStepSz<uchar3> src, PtrStep<uchar3> dst)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the tail: the grid is rounded up, so some threads fall outside.
    if (x < src.cols && y < src.rows)
    {
        // Dereferencing the image view is valid here because this code runs on the device.
        const uchar3 v = src(y, x);
        dst(y, x) = make_uchar3(v.z, v.y, v.x);
    }
}

/**
 * Host-side launcher for swap_rb_kernel.
 *
 * Uses 32x8 thread blocks and a grid rounded up to cover every pixel of src.
 * Launch errors are reported through cudaSafeCall. When called on the default
 * stream (stream == 0) the function also waits for the device to finish and
 * reports any error from that wait; on any other stream it returns right
 * after the launch.
 *
 * @param src     source image view; its cols/rows determine the grid size
 * @param dst     destination image view passed through to the kernel
 * @param stream  CUDA stream to launch on (0 selects the default stream)
 */
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream)
{
    // An empty image would give a grid with a zero dimension, which is an
    // invalid launch configuration; there is nothing to process anyway.
    if (src.cols <= 0 || src.rows <= 0)
        return;

    const dim3 block(32, 8);
    // Ceil-divide so partially filled blocks at the right/bottom edges are included.
    const dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);

    swap_rb_kernel<<<grid, block, 0, stream>>>(src, dst);
    // A kernel launch returns no status of its own; fetch it explicitly.
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}