OpenCV On CUDA
These are the takeaways from this video and its accompanying slides, which cover running OpenCV on a CUDA GPU. The seminar is based on OpenCV 2.4; the API in OpenCV 3.x is different. A few points worth noting before getting into the code:
`GpuMat` is a padded image container; its upload/download operations are described on pages 15-17. Page 20 gives an example of template matching. Concurrent operation with CUDA is described on pages 24-28.
`CudaMem` is page-locked CPU memory for asynchronous data transfers.
`Stream` is an OpenCV wrapper around the CUDA type `cudaStream_t`. It represents an asynchronous queue of operations, which frees up the CPU.
The following is an example of converting a `GpuMat` to a common GPU format and writing custom CUDA code to swap the R and B channels of the image.
swap_rb.cpp
#include <opencv2/gpu/stream_accessor.hpp>
// Forward declaration of the host-side launcher implemented in swap_rb.cu.
// The signature must match that definition exactly (PtrStepSz source, PtrStep destination).
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream);
// Swaps the first and third channels of an 8-bit, 3-channel GpuMat.
//
//   src    - input image; must be of type CV_8UC3 (enforced by CV_Assert).
//   dst    - output image; (re)allocated to src's size and type when needed.
//   stream - OpenCV stream the work is queued on; defaults to Stream::Null().
//
// The GpuMat arguments are handed to swap_rb_caller, whose parameters are
// PtrStepSz / PtrStep, i.e. the form the custom CUDA code works with.
void swap_rb(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null())
{
    // Reject anything that is not an 8-bit, 3-channel image.
    CV_Assert(src.type() == CV_8UC3);

    // Pull the raw CUDA stream handle out of the OpenCV wrapper.
    cudaStream_t cudaStream = StreamAccessor::getStream(stream);

    // Allocate the destination only if it does not already match.
    dst.create(src.size(), src.type());

    swap_rb_caller(src, dst, cudaStream);
}
swap_rb.cu
#include <opencv2/core/cuda_devptrs.hpp>
// Kernel: copies every pixel of src into dst with its x and z components exchanged.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel. The thread at
// global position (x, y) handles pixel (row y, column x); threads that land
// outside src.cols x src.rows do nothing, so the grid may overshoot the image.
// Uses no shared memory.
//
//   src - source image view; cols/rows give the valid extent.
//   dst - destination image view; NOTE(review): assumed to be at least as large
//         as src - the kernel itself only bounds-checks against src.
__global__ void swap_rb_kernel(const PtrStepSz<uchar3> src, PtrStep<uchar3> dst)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x < src.cols && y < src.rows)
    {
        // Reading a pixel through the device pointer is valid here: we are on the GPU.
        const uchar3 v = src(y, x);
        dst(y, x) = make_uchar3(v.z, v.y, v.x);
    }
}
// Host-side launcher for swap_rb_kernel.
//
//   src    - source image view (cols/rows define the launch extent).
//   dst    - destination image view.
//   stream - CUDA stream to launch on. When it is the default stream (0) the
//            call blocks until the device is idle; otherwise it returns right
//            after queuing the kernel.
//
// NOTE(review): launch/execution errors are not checked here; consider adding
// a cudaGetLastError() check using the project's error-reporting helper.
void swap_rb_caller(const PtrStepSz<uchar3>& src, PtrStep<uchar3> dst, cudaStream_t stream)
{
    // Nothing to do for an empty image. A grid with a zero-sized dimension is
    // an invalid launch configuration, so never attempt that launch.
    if (src.cols <= 0 || src.rows <= 0)
        return;

    // 32x8 threads per block; the grid is the ceiling division of the image
    // size by the block size, so every pixel is covered.
    const dim3 block(32, 8);
    const dim3 grid((src.cols + block.x - 1) / block.x,
                    (src.rows + block.y - 1) / block.y);

    swap_rb_kernel<<<grid, block, 0, stream>>>(src, dst);

    // Default stream: keep the call synchronous for the caller.
    if (stream == 0)
        cudaDeviceSynchronize();
}