forked from Qortal/Brooklyn
100 lines
3.1 KiB
Plaintext
100 lines
3.1 KiB
Plaintext
#include "cuda.h"
|
|
#include "cuda_runtime.h"
|
|
#include "device_launch_parameters.h"
|
|
|
|
#include "color_conversion.h"
|
|
|
|
#include "stdio.h"
|
|
|
|
// Ceiling division: number of `grain`-sized chunks needed to cover `total`.
// Used to size a launch grid so that the blocks cover every element.
// Intended for total >= 0 and grain > 0.
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
    // Bias the numerator so that any non-zero remainder rounds the quotient up.
    const int biased = total + grain - 1;
    return biased / grain;
}
|
|
|
|
/*
 * RGB_to_jp - per-pixel colour conversion kernel.
 *
 * Launch layout: 2D grid of 2D blocks, one thread per input pixel
 * (x from the .x dimensions, y from the .y dimensions). Threads whose
 * coordinates fall outside width x height do nothing. No shared memory.
 *
 * input     : width*height packed 4-channel pixels, row-major, no row padding.
 *             px.x / px.y / px.z are weighted as the 1st / 2nd / 3rd colour
 *             channel; px.w is ignored.
 *             NOTE(review): whether px.x is R or B depends on the caller's
 *             pixel layout - confirm against the producer of this buffer.
 * yuv_luma  : width*height bytes, one Y sample per pixel.
 * yuv_cb,
 * yuv_cr    : (width>>1)*(height>>1) bytes each, one sample per complete
 *             2x2 pixel cell, row-major with a row length of (width>>1).
 *
 * Chroma handling:
 *  - Only the top-left pixel of each 2x2 cell writes the chroma sample.
 *    Previously all four pixels of a cell stored (different) values to the
 *    same address, so the surviving value was unspecified.
 *  - For odd width/height the trailing column/row has no complete 2x2 cell
 *    and produces no chroma sample. Previously those pixels computed an
 *    index that wrapped into the next chroma row or ran past the end of a
 *    width*height/4-byte plane.
 */
__global__ void RGB_to_jp(uchar4 *input, unsigned char *yuv_luma, unsigned char *yuv_cb, unsigned char *yuv_cr, int width, int height)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Guard: the launch grid may extend past the image edges.
    if (x >= width || y >= height) return;

    // Widen before multiplying so the linear index cannot overflow int.
    const size_t pixelIndex = static_cast<size_t>(y) * static_cast<size_t>(width) + static_cast<size_t>(x);
    const uchar4 px = input[pixelIndex];

    // Fixed-point weighted sums: coefficients are scaled by 256, +128 rounds
    // to nearest before the >> 8, then the result is offset (16 for Y,
    // 128 for the two chroma components).
    const int Y = ((  66 * px.x + 129 * px.y +  25 * px.z + 128) >> 8) + 16;
    const int U = (( -38 * px.x -  74 * px.y + 112 * px.z + 128) >> 8) + 128;
    const int V = (( 112 * px.x -  94 * px.y -  18 * px.z + 128) >> 8) + 128;

    yuv_luma[pixelIndex] = Y;

    // Exactly one writer per 2x2 cell: the pixel with even x and even y.
    if ((x & 1) != 0 || (y & 1) != 0) return;

    const int chromaWidth  = width  >> 1;
    const int chromaHeight = height >> 1;
    const int cx = x >> 1;
    const int cy = y >> 1;

    // Skip the incomplete cells along an odd right/bottom edge.
    if (cx >= chromaWidth || cy >= chromaHeight) return;

    const size_t pos = static_cast<size_t>(cy) * static_cast<size_t>(chromaWidth) + static_cast<size_t>(cx);

    // NOTE(review): U is stored through yuv_cr and V through yuv_cb. This
    // cross-assignment is kept exactly as in the original; confirm whether
    // it is intentional (e.g. swapped channel order in `input`).
    yuv_cr[pos] = U;
    yuv_cb[pos] = V;
}
|
|
|
|
// Reports a failed CUDA step on stderr (step name plus the runtime's error
// text) and returns false; returns true when `status` is cudaSuccess.
static bool cudaStepOk(cudaError_t status, const char *step)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "%s failed! (%s)\n", step, cudaGetErrorString(status));
    return false;
}

/*
 * RGB_to_YV12 - converts a host image of packed 4-byte pixels into three
 * separate host planes by running the RGB_to_jp kernel on the GPU.
 *
 * width, height : image size in pixels; must both be > 0.
 * pPixels       : host buffer of width*height uchar4 pixels (input).
 * yuv_luma      : host buffer receiving width*height bytes.
 * yuv_cb,yuv_cr : host buffers each receiving width*height/4 bytes.
 *
 * Returns true when every CUDA call succeeded and all three planes were
 * copied back; false otherwise (a diagnostic is printed to stderr).
 *
 * All device buffers are allocated per call and are released on every exit
 * path, including failures. The call blocks until the GPU work is finished.
 */
bool RGB_to_YV12(int width, int height, void *pPixels, void* yuv_luma, void* yuv_cb, void* yuv_cr)
{
    if (width <= 0 || height <= 0) {
        fprintf(stderr, "RGB_to_YV12 failed! (invalid image size %d x %d)\n", width, height);
        return false;
    }

    // Compute byte counts in size_t so width*height cannot overflow int.
    const size_t pixelCount  = static_cast<size_t>(width) * static_cast<size_t>(height);
    const size_t lumaBytes   = pixelCount * sizeof(unsigned char);
    const size_t chromaBytes = pixelCount * sizeof(unsigned char) / 4;
    const size_t inputBytes  = pixelCount * sizeof(uchar4);

    // One thread per pixel; the grid is rounded up to whole 32x8 blocks.
    const dim3 block(32, 8);
    const dim3 grid(divUp(width, block.x), divUp(height, block.y));

    // Start as NULL so the common cleanup below is safe no matter how far
    // we got (cudaFree on a null pointer performs no operation).
    unsigned char *yuv_luma_device = NULL;
    unsigned char *yuv_cb_device   = NULL;
    unsigned char *yuv_cr_device   = NULL;
    uchar4        *dev_pPixels     = NULL;

    // Device allocations. Each step runs only if all previous ones succeeded.
    bool ok = cudaStepOk(cudaMalloc((void**)&yuv_luma_device, lumaBytes), "cudaMalloc");
    ok = ok && cudaStepOk(cudaMalloc((void**)&yuv_cb_device, chromaBytes), "cudaMalloc");
    ok = ok && cudaStepOk(cudaMalloc((void**)&yuv_cr_device, chromaBytes), "cudaMalloc");
    ok = ok && cudaStepOk(cudaMalloc((void**)&dev_pPixels, inputBytes), "cudaMalloc");

    // Copy the input pixels from host memory to the GPU buffer.
    ok = ok && cudaStepOk(cudaMemcpy(dev_pPixels, pPixels, inputBytes, cudaMemcpyHostToDevice), "cudaMemcpy 1");

    if (ok) {
        RGB_to_jp<<< grid, block >>>(dev_pPixels, yuv_luma_device, yuv_cb_device, yuv_cr_device, width, height);

        // Launch-configuration errors are reported here ...
        ok = cudaStepOk(cudaGetLastError(), "RGB_to_jp");
    }

    // ... while faults raised during kernel execution surface at the sync.
    ok = ok && cudaStepOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    // Copy the three result planes back to the caller's host buffers.
    ok = ok && cudaStepOk(cudaMemcpy(yuv_luma, yuv_luma_device, lumaBytes, cudaMemcpyDeviceToHost), "cudaMemcpy 2");
    ok = ok && cudaStepOk(cudaMemcpy(yuv_cb, yuv_cb_device, chromaBytes, cudaMemcpyDeviceToHost), "cudaMemcpy 3");
    ok = ok && cudaStepOk(cudaMemcpy(yuv_cr, yuv_cr_device, chromaBytes, cudaMemcpyDeviceToHost), "cudaMemcpy 4");

    // Single cleanup path: runs on success and on every failure.
    cudaFree(yuv_luma_device);
    cudaFree(yuv_cb_device);
    cudaFree(yuv_cr_device);
    cudaFree(dev_pPixels);

    return ok;
}