forked from Qortal/Brooklyn
VC4Stdlib
This commit is contained in:
parent
d4be28e3f2
commit
1c4c363d5c
60
drivers/videocore4_stdlib/CMakeLists.txt
Normal file
60
drivers/videocore4_stdlib/CMakeLists.txt
Normal file
@ -0,0 +1,60 @@
|
||||
####
# VC4CLStdLib - OpenCL C standard-library headers for the VC4CL implementation.
# This project only installs headers; the STATIC library target exists mainly
# so IDEs list the headers in the project structure.
####
cmake_minimum_required(VERSION 3.1...3.16)

####
# General configuration
####
# Option whether to create deb package
option(BUILD_DEB_PACKAGE "Enables creating .deb package" ON)
# Option whether to compile for raspberry-pi (default: ON, for the compatibility)
option(CROSS_COMPILE "Cross compile for Raspbian" ON)
option(BUILD_EXPERIMENTAL "Build experimental test program" OFF)

# Allow a CI server to inject the build number; default to a placeholder.
if(NOT BUILD_NUMBER)
  set(BUILD_NUMBER 9999)
endif()

project(VC4CLStdLib VERSION 0.4.${BUILD_NUMBER})

# Include headers in the project structure.
# NOTE: file(GLOB) does not pick up newly added headers until re-configure.
file(GLOB HDRS "${PROJECT_SOURCE_DIR}/include/*.h")
add_library(VC4CLStdLib STATIC ${HDRS})
# Header-only content: tell CMake which linker to (nominally) use.
set_target_properties(VC4CLStdLib PROPERTIES LINKER_LANGUAGE C)

##
# Installation targets
##
# Adds the public headers to the target, so they are exported
target_include_directories(VC4CLStdLib PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include/vc4cl-stdlib>)
# Creates the install target for the headers
install(DIRECTORY "${PROJECT_SOURCE_DIR}/include/" DESTINATION include/vc4cl-stdlib FILES_MATCHING PATTERN "*.h")
# Adds custom uninstall command
# NOTE(review): the script path is resolved relative to the directory cmake is
# invoked from - consider "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake";
# kept as-is to preserve existing behavior.
add_custom_target(uninstall "${CMAKE_COMMAND}" -P "cmake_uninstall.cmake")

if(BUILD_EXPERIMENTAL)
  add_subdirectory(experimental)
endif()

####
# Building package
####
if(BUILD_DEB_PACKAGE)
  message(STATUS "build deb package...")

  set(CPACK_GENERATOR "DEB")
  set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
  set(CPACK_PACKAGE_NAME "vc4cl-stdlib")
  string(TIMESTAMP BUILD_TIMESTAMP "%Y-%m-%d")
  set(CPACK_PACKAGE_VERSION "${PROJECT_VERSION}-${BUILD_TIMESTAMP}")
  if(CROSS_COMPILE)
    set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "armhf")
  else()
    set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
  endif()
  set(CPACK_PACKAGE_VENDOR "doe300")
  set(CPACK_PACKAGE_CONTACT "doe300@web.de")
  set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenCL C headers for the VC4CL implementation (raspberrypi only)")
  set(CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/doe300/VC4CLStdLib")
  # Derive the artifact name from the project version instead of hardcoding
  # "0.4" (the literal was out of sync with project(... VERSION 0.4.x)).
  set(CPACK_PACKAGE_FILE_NAME "vc4cl-stdlib-${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}-Linux")
  include(CPack)
endif()
|
21
drivers/videocore4_stdlib/LICENSE
Normal file
21
drivers/videocore4_stdlib/LICENSE
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2022
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
4
drivers/videocore4_stdlib/Readme.md
Normal file
4
drivers/videocore4_stdlib/Readme.md
Normal file
@ -0,0 +1,4 @@
|
||||
# VC4CLStdLib
|
||||
|
||||
Implementation of the OpenCL standard library; it is required to build the [VC4C] compiler.
|
||||
|
7
drivers/videocore4_stdlib/experimental/CMakeLists.txt
Normal file
7
drivers/videocore4_stdlib/experimental/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
||||
# Experimental helper tool: compares accuracy and runtime of different OpenCL
# implementations of standard-library math functions against host <cmath>.
find_package(OpenCL REQUIRED)
find_package(Threads REQUIRED)

add_executable(compare_implementations compare_implementations.cpp)
target_compile_features(compare_implementations PRIVATE cxx_std_14)
# NOTE(review): debug flags are applied unconditionally; guard with
# $<$<CONFIG:Debug>:...> if optimized builds of this tool are ever needed.
target_compile_options(compare_implementations PRIVATE -g -Og)
# Use an explicit visibility keyword - the keyword-less signature of
# target_link_libraries has legacy semantics and must not be mixed in.
target_link_libraries(compare_implementations PRIVATE OpenCL::OpenCL Threads::Threads)
|
91
drivers/videocore4_stdlib/experimental/cbrt.cl
Normal file
91
drivers/videocore4_stdlib/experimental/cbrt.cl
Normal file
@ -0,0 +1,91 @@
|
||||
#define arg_t float16
#define result_t float16
#define int_t int16
#define uint_t uint16

#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

// Rough n-th root seed: divides the biased exponent by n and re-assembles the
// float, leaving sign and mantissa bits untouched.
// Adapted from: https://web.archive.org/web/20131227144655/http://metamerist.com/cbrt/cbrt.htm
result_t approx_rootn(arg_t x, int_t n)
{
    int_t bits = CAT(as_, int_t)(x);
    int_t biasedExp = (bits - (int_t) (127 << 23)) / n + (int_t) (127 << 23);
    return CAT(as_, result_t)((bits & (int_t) 0x807FFFFF) | (biasedExp));
}

// Integer bit-hack seed for the cube root.
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t approx_cbrt(arg_t f)
{
    uint_t bits = CAT(as_, uint_t)(f);
    bits = bits / 3 + 709921077;
    return CAT(as_, result_t)(bits);
}

// Single Halley refinement step towards cbrt(base).
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_halley_step(arg_t x, arg_t base)
{
    result_t cube = x * x * x;
    return x * (cube + base + base) / (cube + cube + base);
}

// Cube root via a fixed number of Halley iterations on an exponent-based seed.
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_halley(arg_t val)
{
    arg_t magnitude = fabs(val);
    arg_t seed = approx_rootn(magnitude, 3);

    // NOTE(review): the iteration refines against the *signed* value although
    // the seed is computed from |val| and copysign() is re-applied below -
    // confirm this is intentional for negative inputs.
    result_t estimate = seed;
#pragma loop unroll
    for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
    {
        estimate = cbrt_halley_step(estimate, val);
    }
    return copysign(estimate, val);
}

__kernel void cbrt_halley_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint gid = get_global_id(0);
    out[gid] = cbrt_halley(in[gid]);
}

// Single Newton refinement step towards cbrt(base).
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_newton_step(arg_t x, arg_t base)
{
    return x - (1.0f / 3.0f) * (x - base / (x * x));
}

// Cube root via a fixed number of Newton iterations on a bit-hack seed.
// Adapted from: https://web.archive.org/web/20120111112244/http://metamerist.com/cbrt/CubeRoot.cpp
result_t cbrt_newton(arg_t val)
{
    arg_t magnitude = fabs(val);
    arg_t seed = approx_cbrt(magnitude);

    result_t estimate = seed;
#pragma loop unroll
    for(int i = 0; i < 4; ++i) // TODO can be adapted for accuracy
    {
        estimate = cbrt_newton_step(estimate, val);
    }
    return copysign(estimate, val);
}

__kernel void cbrt_newton_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint gid = get_global_id(0);
    out[gid] = cbrt_newton(in[gid]);
}

// Baseline: the built-in cbrt() for comparison.
__kernel void cbrt_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint gid = get_global_id(0);
    out[gid] = cbrt(in[gid]);
}

// Baseline: cube root expressed via pow() for comparison.
__kernel void cbrt_pow_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint gid = get_global_id(0);
    out[gid] = pow(in[gid], 1.0f / 3.0f);
}
|
@ -0,0 +1,404 @@
|
||||
|
||||
#define CL_TARGET_OPENCL_VERSION 120
|
||||
#define CL_HPP_CL_1_2_DEFAULT_BUILD 1
|
||||
#define CL_HPP_ENABLE_EXCEPTIONS 1
|
||||
#define CL_HPP_TARGET_OPENCL_VERSION 120
|
||||
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
|
||||
#include <CL/cl.hpp>
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <random>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unistd.h> // geteuid()
|
||||
#include <vector>
|
||||
|
||||
static constexpr uint32_t DEFAULT_NUM_LINEAR = 12 * 16 * 8;
|
||||
static constexpr uint32_t DEFAULT_NUM_RANDOM = 12 * 16 * 8;
|
||||
|
||||
// VC4CL performance counters
|
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 10)
|
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_IDLE_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 11)
|
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL (CL_PROFILING_COMMAND_END + 12)
|
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 13)
|
||||
#define CL_PROFILING_PERFORMANCE_COUNTER_L2_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 14)
|
||||
|
||||
// Closed interval [min, max] of float inputs a test case is evaluated over.
struct Range
{
    float min;
    float max;
};
|
||||
|
||||
/*
 * Type-erased wrapper around a host reference implementation (e.g. logf,
 * fmaf) taking 1 to 3 float arguments and returning a float.
 */
struct ReferenceFunction
{
    ReferenceFunction(float (*func)(float)) : numParameters(1), ptr(reinterpret_cast<void *>(func)) {}
    ReferenceFunction(float (*func)(float, float)) : numParameters(2), ptr(reinterpret_cast<void *>(func)) {}
    ReferenceFunction(float (*func)(float, float, float)) : numParameters(3), ptr(reinterpret_cast<void *>(func)) {}

    // Shared arity guard for the call operators below.
    void checkArity(uint8_t expected) const
    {
        if(numParameters != expected)
            throw std::runtime_error{"Reference function called with the wrong number of arguments"};
    }

    // Unary call; throws std::runtime_error on arity mismatch.
    float operator()(float val) const
    {
        checkArity(1);
        return reinterpret_cast<float (*)(float)>(ptr)(val);
    }

    // Binary call; throws std::runtime_error on arity mismatch.
    float operator()(float val0, float val1) const
    {
        checkArity(2);
        return reinterpret_cast<float (*)(float, float)>(ptr)(val0, val1);
    }

    // Ternary call; throws std::runtime_error on arity mismatch.
    float operator()(float val0, float val1, float val2) const
    {
        checkArity(3);
        return reinterpret_cast<float (*)(float, float, float)>(ptr)(val0, val1, val2);
    }

    // Element-wise application over parallel input columns (one column per
    // function parameter); result has the size of the first column.
    std::vector<float> operator()(const std::vector<std::vector<float>> &inputs) const
    {
        std::vector<float> out(inputs.front().size());
        for(std::size_t i = 0; i < out.size(); ++i)
        {
            switch(numParameters)
            {
            case 1:
                out[i] = (*this)(inputs[0][i]);
                break;
            case 2:
                out[i] = (*this)(inputs[0][i], inputs[1][i]);
                break;
            case 3:
                out[i] = (*this)(inputs[0][i], inputs[1][i], inputs[2][i]);
                break;
            }
        }
        return out;
    }

    uint8_t numParameters; // arity of the wrapped function (1..3)
    void *ptr;             // erased function pointer, cast back on call
};
|
||||
|
||||
struct Test
|
||||
{
|
||||
std::string name;
|
||||
ReferenceFunction reference;
|
||||
uint32_t allowedErrorInUlp;
|
||||
std::string sourceFile;
|
||||
std::vector<Range> ranges;
|
||||
};
|
||||
|
||||
// Pass-through reference for the identity kernel: returns its input unchanged.
static float identity(float val)
{
    return val;
}
|
||||
|
||||
// XXX OpenCL-CTS calculates reference in double, thus is more accurate. So tests being accurate here might not be in
|
||||
// the CTS!
|
||||
static const std::vector<Test> floatTests = {
|
||||
Test{"log", logf, 4, "log.cl",
|
||||
{
|
||||
{0.5, 1.0}, // reduced range some implementations use
|
||||
{std::numeric_limits<float>::min(), std::numeric_limits<float>::max()} // full range
|
||||
}},
|
||||
Test{"exp", expf, 4, "exp.cl",
|
||||
{
|
||||
{0.0, 0.5f * logf(2.0f)}, // reduced range some implementations use
|
||||
{-87.0f /* everything below e^-87 is subnormal */, 89.0f /* everything above e^89 is Inf */} // full range
|
||||
}},
|
||||
Test{"identity", identity, 0, "identity.cl",
|
||||
{
|
||||
{-10.0f, 10.0f}, {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
|
||||
}},
|
||||
Test{"cbrt", cbrtf, 4, "cbrt.cl",
|
||||
{
|
||||
{-1.0, 1.0}, // limited range for precision testing
|
||||
{std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
|
||||
}},
|
||||
Test{"fma", fmaf, 0, "fma.cl",
|
||||
{
|
||||
{-100.0f, 100.0f}, // reduced range to not run into NaN/Inf
|
||||
{std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
|
||||
}}};
|
||||
|
||||
static std::vector<float> generateInputData(const Range &range, uint32_t numLinear, uint32_t numRandom)
|
||||
{
|
||||
std::vector<float> data{};
|
||||
data.reserve(numLinear + numRandom);
|
||||
auto step = (range.max - range.min) / static_cast<float>(numLinear); // TODO overflows on full ranges
|
||||
for(float val = range.min; val < range.max; val += step)
|
||||
data.emplace_back(val);
|
||||
|
||||
std::random_device rd{};
|
||||
std::default_random_engine gen(rd());
|
||||
std::uniform_real_distribution<> dist{range.min, range.max};
|
||||
|
||||
while(data.size() < (numLinear + numRandom))
|
||||
data.emplace_back(static_cast<float>(dist(gen)));
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static std::vector<std::vector<float>> generateInputData(
|
||||
const Range &range, uint32_t numLinear, uint32_t numRandom, uint8_t numInputs)
|
||||
{
|
||||
std::vector<std::vector<float>> data{};
|
||||
for(uint8_t i = 0; i < numInputs; ++i)
|
||||
data.emplace_back(generateInputData(range, numLinear, numRandom));
|
||||
return data;
|
||||
}
|
||||
|
||||
// Reads and compiles the test's OpenCL source file and returns every kernel
// it contains (one per implementation variant under test).
static std::vector<cl::Kernel> createKernels(const cl::Context &context, const Test &test)
{
    std::stringstream source;
    {
        // Scope closes the file as soon as its content is buffered.
        std::ifstream file{test.sourceFile};
        source << file.rdbuf();
    }
    // Third argument `true` builds the program immediately.
    cl::Program program(context, source.str(), true);

    std::vector<cl::Kernel> kernels;
    program.createKernels(&kernels);
    return kernels;
}
|
||||
|
||||
// One result exceeding the allowed error, with enough context to reproduce it.
struct ErrorResult
{
    std::vector<float> inputValues; // the argument tuple that failed
    float expected;                 // host reference value
    float actual;                   // device result
    uint32_t errorInUlp;            // measured deviation

    // Orders by descending error first, then by input values, so sorting puts
    // the "most wrong" entries at the front.
    bool operator<(const ErrorResult &other) const noexcept
    {
        if(errorInUlp != other.errorInUlp)
            return errorInUlp > other.errorInUlp;
        return inputValues < other.inputValues;
    }

    friend std::ostream &operator<<(std::ostream &os, const ErrorResult &error)
    {
        os << "Error of " << error.errorInUlp << " ULP for ";
        switch(error.inputValues.size())
        {
        case 1:
            os << std::scientific << error.inputValues.front();
            break;
        case 2:
            os << std::scientific << '{' << error.inputValues.front() << ", " << error.inputValues.back() << '}';
            break;
        case 3:
            os << std::scientific << '{' << error.inputValues[0] << ", " << error.inputValues[1] << ", "
               << error.inputValues[2] << '}';
            break;
        default:
            os << '{';
            for(auto input : error.inputValues)
                os << std::scientific << input << ", ";
            os << '}';
            break;
        }
        os << ", expected " << error.expected << ", got " << error.actual << std::defaultfloat;
        return os;
    }
};
|
||||
|
||||
/*
 * Reinterprets the bit pattern of `val` as type Out (e.g. float <-> int32_t).
 *
 * Uses std::memcpy instead of the previous union: reading a union member other
 * than the one last written is undefined behavior in C++ (only sanctioned in
 * C). Compilers emit the same single move instruction for the memcpy.
 */
template <typename Out, typename In>
static Out bit_cast(In val)
{
    static_assert(sizeof(Out) == sizeof(In), "bit_cast requires types of identical size");
    Out out;
    std::memcpy(&out, &val, sizeof(Out));
    return out;
}
|
||||
|
||||
/*
 * Measures the deviation between a host reference and a device result as the
 * distance between their raw IEEE-754 bit patterns ("ULP distance").
 * Matching infinities (same sign) and any pair of NaNs count as exact.
 *
 * NOTE: allowedErrorInUlp is currently unused here; it is kept for signature
 * compatibility with the callers.
 */
static uint32_t calculateError(float reference, float result, uint32_t allowedErrorInUlp)
{
    if(std::isinf(reference) && std::isinf(result) && std::signbit(reference) == std::signbit(result))
        return 0;
    if(std::isnan(reference) && std::isnan(result))
        return 0;
    // Reinterpret the bits via memcpy (defined behavior, same codegen as a cast).
    int32_t referenceBits = 0;
    int32_t resultBits = 0;
    std::memcpy(&referenceBits, &reference, sizeof(referenceBits));
    std::memcpy(&resultBits, &result, sizeof(resultBits));
    // Subtract in 64 bit: the 32-bit difference can overflow (undefined
    // behavior) when the operands have opposite signs, e.g. bit patterns
    // 0x80000000 vs. 0x7F000000. Clamp instead of truncating on conversion.
    int64_t diff = std::abs(static_cast<int64_t>(referenceBits) - static_cast<int64_t>(resultBits));
    int64_t limit = static_cast<int64_t>(std::numeric_limits<uint32_t>::max());
    return static_cast<uint32_t>(std::min(diff, limit));
}
|
||||
|
||||
static std::pair<std::vector<ErrorResult>, uint32_t> checkResults(const std::vector<std::vector<float>> &inputs,
|
||||
const std::vector<float> &reference, const std::vector<float> &result, uint32_t allowedErrorInUlp)
|
||||
{
|
||||
std::vector<ErrorResult> errors;
|
||||
uint32_t maxError = 0;
|
||||
|
||||
for(std::size_t i = 0; i < std::min(reference.size(), result.size()); ++i)
|
||||
{
|
||||
auto error = calculateError(reference.at(i), result.at(i), allowedErrorInUlp);
|
||||
maxError = std::max(maxError, error);
|
||||
if(error > allowedErrorInUlp)
|
||||
{
|
||||
std::vector<float> errorInputs;
|
||||
for(const auto &input : inputs)
|
||||
errorInputs.push_back(input.at(i));
|
||||
errors.emplace_back(ErrorResult{std::move(errorInputs), reference.at(i), result.at(i), error});
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(errors.begin(), errors.end());
|
||||
return std::make_pair(std::move(errors), maxError);
|
||||
}
|
||||
|
||||
static void runTest(
|
||||
const cl::Context &context, const cl::CommandQueue &queue, const Test &test, uint32_t numLinear, uint32_t numRandom)
|
||||
{
|
||||
std::cout << "Running test " << test.sourceFile << " ..." << std::endl;
|
||||
std::cout << "\tRunning " << test.ranges.size() << " ranges with " << (numLinear + numRandom) << " values"
|
||||
<< std::endl;
|
||||
auto kernels = createKernels(context, test);
|
||||
std::cout << "\tTesting " << kernels.size() << " implementations " << std::endl;
|
||||
|
||||
for(const auto &range : test.ranges)
|
||||
{
|
||||
auto inputs = generateInputData(range, numLinear, numRandom, test.reference.numParameters);
|
||||
auto inputSize = inputs.front().size();
|
||||
cl::NDRange globalSize(inputSize / 16);
|
||||
std::vector<float> reference = test.reference(inputs);
|
||||
|
||||
std::vector<cl::Buffer> inputBuffers;
|
||||
for(auto &input : inputs)
|
||||
inputBuffers.emplace_back(queue, input.begin(), input.end(), true);
|
||||
cl::Buffer outputBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, inputSize * sizeof(float));
|
||||
|
||||
for(auto &kernel : kernels)
|
||||
{
|
||||
kernel.setArg(0, outputBuffer);
|
||||
for(std::size_t i = 0; i < inputBuffers.size(); ++i)
|
||||
kernel.setArg(1 + i, inputBuffers[i]);
|
||||
|
||||
std::cout << "\tRunning kernel '" << kernel.getInfo<CL_KERNEL_FUNCTION_NAME>() << "' with "
|
||||
<< (inputSize / 16) << " work-items ... " << std::endl;
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
cl::Event kernelEvent{};
|
||||
queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, cl::NullRange, nullptr, &kernelEvent);
|
||||
kernelEvent.wait();
|
||||
auto end = std::chrono::steady_clock::now();
|
||||
std::cout << "\t- Finished in "
|
||||
<< std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us"
|
||||
<< std::endl;
|
||||
std::chrono::nanoseconds deviceDuration{kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
|
||||
kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()};
|
||||
std::cout << "\t- Executed for "
|
||||
<< std::chrono::duration_cast<std::chrono::microseconds>(deviceDuration).count() << " us"
|
||||
<< std::endl;
|
||||
if(geteuid() == 0) // TODO only on hardware
|
||||
{
|
||||
cl_ulong numInstructions = 0;
|
||||
kernelEvent.getProfilingInfo(
|
||||
CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL, &numInstructions);
|
||||
cl_ulong numCycles = 0;
|
||||
kernelEvent.getProfilingInfo(CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL, &numCycles);
|
||||
std::cout << "\t- Executed " << numInstructions << " instructions in " << numCycles << " cycles"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
std::vector<float> result(inputSize);
|
||||
queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, inputSize * sizeof(float), result.data());
|
||||
auto errors = checkResults(inputs, reference, result, test.allowedErrorInUlp);
|
||||
std::cout << "\t- Has " << errors.first.size() << " wrong results and a maximum error of " << errors.second
|
||||
<< " ULP (of allowed " << test.allowedErrorInUlp << " ULP)" << std::endl;
|
||||
for(std::size_t i = 0; i < std::min(errors.first.size(), std::size_t{8}); ++i)
|
||||
std::cout << "\t\t" << errors.first[i] << std::endl;
|
||||
if(errors.first.size() > 8)
|
||||
std::cout << "\t\t[...]" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void printHelp()
|
||||
{
|
||||
std::cout << "Usage: <program> [<options>] <test> [<test>...]" << std::endl;
|
||||
std::cout << "Options: " << std::endl;
|
||||
std::cout << "\t--help Shows this help message" << std::endl;
|
||||
std::cout << "\t--linear=<num> Specifies the number of linear test values, defaults to " << DEFAULT_NUM_LINEAR
|
||||
<< std::endl;
|
||||
std::cout << "\t--random=<num> Specifies the number of random test values, defaults to " << DEFAULT_NUM_RANDOM
|
||||
<< std::endl;
|
||||
std::cout << "Available tests: ";
|
||||
for(const auto &test : floatTests)
|
||||
std::cout << test.name << ", ";
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
uint32_t numLinear = DEFAULT_NUM_LINEAR;
|
||||
uint32_t numRandom = DEFAULT_NUM_RANDOM;
|
||||
|
||||
if(argc < 2)
|
||||
{
|
||||
printHelp();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
auto platform = cl::Platform::get();
|
||||
cl::Device device{};
|
||||
{
|
||||
std::vector<cl::Device> devices;
|
||||
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
|
||||
if(devices.empty())
|
||||
{
|
||||
std::cout << "No device found!" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
device = devices.front();
|
||||
}
|
||||
cl::Context context(device);
|
||||
cl::CommandQueue queue(context, CL_QUEUE_PROFILING_ENABLE);
|
||||
|
||||
std::vector<std::reference_wrapper<const Test>> selectedTests;
|
||||
for(int i = 1; i < argc; ++i)
|
||||
{
|
||||
if(argv[i][0] == '-')
|
||||
{
|
||||
if(std::string{"--help"} == argv[i])
|
||||
{
|
||||
printHelp();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
else if(strstr(argv[i], "--linear=") == argv[i])
|
||||
numLinear = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--linear=")));
|
||||
else if(strstr(argv[i], "--random=") == argv[i])
|
||||
numRandom = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--random=")));
|
||||
else
|
||||
{
|
||||
std::cout << "Unknown option: " << argv[i] << std::endl;
|
||||
printHelp();
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
auto testIt =
|
||||
std::find_if(floatTests.begin(), floatTests.end(), [&](const Test &test) { return test.name == argv[i]; });
|
||||
if(testIt != floatTests.end())
|
||||
selectedTests.emplace_back(std::cref(*testIt));
|
||||
else
|
||||
{
|
||||
std::cout << "No such test '" << argv[i] << "', available tests: ";
|
||||
for(const auto &test : floatTests)
|
||||
std::cout << test.name << ", ";
|
||||
std::cout << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
for(const auto &test : selectedTests)
|
||||
runTest(context, queue, test.get(), numLinear, numRandom);
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
364
drivers/videocore4_stdlib/experimental/exp.cl
Normal file
364
drivers/videocore4_stdlib/experimental/exp.cl
Normal file
@ -0,0 +1,364 @@
|
||||
|
||||
#define arg_t float16
#define result_t float16
#define int_t int16

#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

// vc4cl_split(double) of M_LN2
#define M_LN2_FF 0xB102E3083F317218

// Extended-precision arithmetic intrinsics provided by the VC4CL compiler.
float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));

// Builds 2^val by writing the biased exponent directly into the IEEE-754
// exponent bits (mantissa forced to zero -> exact power of two).
result_t pow2(int_t val)
{
    // y = 2^x = 1.0 [implied] * 2^(x + offset)
    int_t bits = val << 23;
    // alternative: bits = (val + 127) << 23;
    bits += (int_t) 0x3F800000;
    return CAT(as_, result_t)(bits & (int_t) 0x7F800000);
}

// Computes k for the range reduction e^x = 2^k * e^r.
int_t powerOfTwo(arg_t val)
{
    // Original code, produces Inf for e^(~10^38)
    // return CAT(convert_, int_t)(ceil((val / M_LN2_F) - 0.5f));
    // Using floor() instead of ceil(),
    // - fixes Inf for large exponents
    // - slightly reduces accuracy of Chebyshev implementations (by ~4 ULP),
    // - greatly reduces accuracy of Taylor (<10 ULP -> >1200 ULP) -> requires more iterations
    return CAT(convert_, int_t)(floor((val / M_LN2_F) - 0.5f));
}
|
||||
|
||||
/*
 * Taylor series with Horner's method and range reduction,
 *
 * https://www.pseudorandom.com/implementing-exp#section-6
 */
result_t exp_taylor(arg_t val)
{
    arg_t positive = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
    int_t k = powerOfTwo(positive);
    arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;

    arg_t term = 1.0f;
    arg_t sum = 1.0f;
#pragma loop unroll
    for(int i = 1; i < 10; i++) // TODO can adjust number of iterations
    {
        term *= r / i;
        sum += term;
    }

    sum = sum * pow2(k);
    // negative exponents via e^-x = 1 / e^x
    return val < 0 ? 1 / sum : sum;
}

__kernel void exp_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint gid = get_global_id(0);
    out[gid] = exp_taylor(in[gid]);
}
|
||||
|
||||
// Taylor series evaluated with VC4CL's extended-precision intrinsics; the hex
// constants are the split-representation of the reciprocal factors 1/i.
result_t exp_taylor_extended_precision_exact(arg_t val)
{
    arg_t positive = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
    int_t k = powerOfTwo(positive);
    ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

    ulong16 term = 0x000000003F800000; // 1.0
    ulong16 sum = 0x000000003F800000; // 1.0

    term = vc4cl_mul(term, r);
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0x000000003F000000)); // 1 / 2
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0xB22AAAAB3EAAAAAB)); // 1 / 3
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0x000000003E800000)); // 1 / 4
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0xB14CCCCD3E4CCCCD)); // 1 / 5
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0xB1AAAAAB3E2AAAAB)); // 1 / 6
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0xB1DB6DB73E124925)); // 1 / 7
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0x000000003E000000)); // 1 / 8
    sum = vc4cl_add(sum, term);

    term = vc4cl_mul(term, vc4cl_mul(r, 0xB0638E393DE38E39)); // 1 / 9
    sum = vc4cl_add(sum, term);
    // removing any iteration makes the result inaccurate (removing last iteration gives 19 ULP)

    result_t result = vc4cl_lossy(sum) * pow2(k);
    return val < 0 ? 1.0f / result : result;
}

// NOTE(review): kernel kept disabled, as in the original sources.
// __kernel void exp_taylor_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
//     uint gid = get_global_id(0);
//     out[gid] = exp_taylor_extended_precision_exact(in[gid]);
// }
|
||||
|
||||
// TODO Lagrange and Barycentric interpolations from https://www.pseudorandom.com/implementing-exp

/*
 * Chebyshev interpolation with range reduction,
 *
 * https://www.pseudorandom.com/implementing-exp#section-18
 */
result_t exp_chebyshev(arg_t val)
{
    // XXX could remove unneeded coefficients once we fix precision
    const float coefficients[] = {
        1.266065877752008335598244625214717537923,
        1.130318207984970054415392055219726613610,
        0.2714953395340765623657051399899818507081,
        0.04433684984866380495257149525979922986386,
        0.00547424044209373265027616843118645948703,
        0.000542926311913943750362147810307554678760,
        0.00004497732295429514665469032811091269841937,
        3.198436462401990505863872976602295688795e-6,
        1.992124806672795725961064384805589035648e-7,
        1.103677172551734432616996091335324170860e-8,
        5.50589607967374725047142040200552692791e-10,
        2.497956616984982522712010934218766985311e-11,
        1.039152230678570050499634672423840849837e-12,
        3.991263356414401512887720401532162026594e-14,
    };
    arg_t positive = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
    int_t k = powerOfTwo(positive);
    arg_t r = positive - CAT(convert_, arg_t)(k) * M_LN2_F;

    // Chebyshev recurrence: T(k) = 2 * r * T(j) - T(i)
    arg_t ti = 1.0f;
    arg_t tj = r;
    arg_t p = coefficients[0] + (coefficients[1] * r);
#pragma loop unroll
    for(int i = 2; i < 8; i++) // TODO can adjust number of iterations
    {
        arg_t tk = (2 * r * tj) - ti;
        p += coefficients[i] * tk;
        ti = tj;
        tj = tk;
    }

    p = p * pow2(k);
    return val < 0 ? 1 / p : p;
}

__kernel void exp_chebyshev_kernel(__global arg_t *out, const __global arg_t *in)
{
    uint gid = get_global_id(0);
    out[gid] = exp_chebyshev(in[gid]);
}
|
||||
|
||||
// Chebyshev interpolation evaluated with VC4CL's extended-precision
// intrinsics; hex constants are split-representations of the coefficients.
result_t exp_chebyshev_extended_precision_exact(arg_t val)
{
    arg_t positive = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * log2(x)]
    int_t k = powerOfTwo(positive);
    ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

    ulong16 ti = 0x000000003F800000; // 1.0
    ulong16 tj = r;
    // 1.266065877752008335598244625214717537923 and 1.130318207984970054415392055219726613610
    ulong16 p = vc4cl_add(0x333386C33FA20E72, vc4cl_mul(0x33395E683F90AE44, r));
    r = vc4cl_mul(r, 0x0000000040000000); // 2.0

    ulong16 tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
    p = vc4cl_add(p, vc4cl_mul(0xB13AF4A23E8B0170, tk)); // 0.2714953395340765623657051399899818507081
    ti = tj;
    tj = tk;

    tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
    p = vc4cl_add(p, vc4cl_mul(0xB0FC8DF03D359A8F, tk)); // 0.04433684984866380495257149525979922986386
    ti = tj;
    tj = tk;

    tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
    p = vc4cl_add(p, vc4cl_mul(0xAEA95A453BB36142, tk)); // 0.00547424044209373265027616843118645948703
    ti = tj;
    tj = tk;

    tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
    p = vc4cl_add(p, vc4cl_mul(0x2B7994663A0E532B, tk)); // 0.000542926311913943750362147810307554678760
    ti = tj;
    tj = tk;

    tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
    p = vc4cl_add(p, vc4cl_mul(0x2BC988B0383CA608, tk)); // 0.00004497732295429514665469032811091269841937
    ti = tj;
    tj = tk;

    tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
    p = vc4cl_add(p, vc4cl_mul(0x29A61EF43656A4B8, tk)); // 3.198436462401990505863872976602295688795e-6
    ti = tj;
    tj = tk;

    tk = vc4cl_sub(vc4cl_mul(r, tj), ti);
    p = vc4cl_add(p, vc4cl_mul(0x26B66C3C3455E71C, tk)); // 1.992124806672795725961064384805589035648e-7
    ti = tj;
    tj = tk;
    // removing any iteration makes the result inaccurate (removing last iteration gives 5 ULP)

    result_t result = vc4cl_lossy(p) * pow2(k);
    return val < 0 ? 1.0f / result : result;
}

// NOTE(review): kernel kept disabled, as in the original sources.
// __kernel void exp_chebyshev_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
// {
//     uint gid = get_global_id(0);
//     out[gid] = exp_chebyshev_extended_precision_exact(in[gid]);
// }
|
||||
|
||||
/*
 * Chebyshev interpolation with monomial basis and range reduction,
 *
 * https://www.pseudorandom.com/implementing-exp#section-18
 */
result_t exp_chebyshev_monomial(arg_t val)
{
    // XXX could remove unneeded coefficients once we fix precision
    // TODO invert order of coefficients and traversal ?!
    // Horner coefficients, lowest order first (traversed back-to-front below)
    const float horner[] = {
        1.000000000000000,
        1.000000000000000,
        0.500000000000002,
        0.166666666666680,
        0.041666666666727,
        0.008333333333342,
        0.001388888888388,
        1.984126978734782e-4,
        2.480158866546844e-5,
        2.755734045527853e-6,
        2.755715675968011e-7,
        2.504861486483735e-8,
        2.088459690899721e-9,
        1.632461784798319e-10,
    };
    arg_t magnitude = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t exponent = powerOfTwo(magnitude);
    arg_t rem = magnitude - CAT(convert_, arg_t)(exponent) * M_LN2_F;

    // Horner evaluation, seeded with the highest-order coefficient
    arg_t poly = 1.143364767943110e-11;
#pragma loop unroll
    for(int idx = 0; idx < 14; idx++)
    {
        poly = poly * rem + horner[13 - idx];
    }

    // undo the range reduction, restore the sign via e^-x = 1/e^x
    poly = poly * pow2(exponent);
    return val < 0 ? 1 / poly : poly;
}
|
||||
|
||||
// Test driver: one work-item evaluates exp_chebyshev_monomial() for one element.
__kernel void exp_chebyshev_monomial_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = exp_chebyshev_monomial(in[idx]);
}
|
||||
|
||||
// Fully unrolled monomial-basis Chebyshev interpolation of exp(); the range
// reduction subtraction is done with the extended-precision intrinsics to keep
// the reduced argument exact.
result_t exp_chebyshev_monomial_exact(arg_t val)
{
    // work on |x|; the sign is restored at the end via e^-x = 1/e^x
    arg_t magnitude = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(magnitude);
    arg_t kF = CAT(convert_, arg_t)(k);
    arg_t rem = vc4cl_lossy(vc4cl_sub(vc4cl_extend(magnitude), vc4cl_mul(vc4cl_extend(kF), M_LN2_FF)));

    // Horner scheme, highest-order coefficient first
    arg_t poly = 1.143364767943110e-11;

    poly = poly * rem + 1.632461784798319e-10f;
    poly = poly * rem + 2.088459690899721e-9f;
    poly = poly * rem + 2.504861486483735e-8f;
    poly = poly * rem + 2.755715675968011e-7f;
    poly = poly * rem + 2.755734045527853e-6f;
    poly = poly * rem + 2.480158866546844e-5f;
    poly = poly * rem + 1.984126978734782e-4f;
    poly = poly * rem + 0.001388888888388f;
    poly = poly * rem + 0.008333333333342f;
    poly = poly * rem + 0.041666666666727f;
    poly = poly * rem + 0.166666666666680f;
    poly = poly * rem + 0.500000000000002f;
    poly = poly * rem + 1.000000000000000f;
    poly = poly * rem + 1.000000000000000f;

    // undo the range reduction, restore the sign
    poly = poly * pow2(k);
    return val < 0 ? 1 / poly : poly;
}
|
||||
|
||||
__kernel void exp_chebyshev_monomial_exact_kernel(__global arg_t *out, const __global arg_t *in)
|
||||
{
|
||||
uint gid = get_global_id(0);
|
||||
out[gid] = exp_chebyshev_monomial_exact(in[gid]);
|
||||
}
|
||||
|
||||
/*
 * Fully unrolled monomial-basis Chebyshev interpolation of exp(), with the
 * whole Horner evaluation carried out in the extended-precision (ulong16)
 * soft-float format. The hex literals are the pre-encoded coefficients; their
 * decimal values are in the trailing comments.
 */
result_t exp_chebyshev_monomial_extended_precision_exact(arg_t val)
{
    // work on |x|; the sign is folded back in at the end via e^-x = 1/e^x
    arg_t positive = fabs(val);

    // range reduction: e^x = 2^k * e^r [with x = r + k * ln(2)]
    int_t k = powerOfTwo(positive);
    ulong16 r = vc4cl_sub(vc4cl_extend(positive), vc4cl_mul(vc4cl_extend(CAT(convert_, arg_t)(k)), M_LN2_FF));

    // Horner scheme in extended precision, highest-order coefficient first
    ulong16 pn = 0x209249252D492492; // 1.143364767943110e-11
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA21249252F337DB7); // 1.632461784798319e-10
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x24924925310F8492); // 2.088459690899721e-9
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA65B6DB732D72A7D); // 2.504861486483735e-8
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA85B6DB73493F245); // 2.755715675968011e-7
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xA9FDB6DB3638EF27); // 2.755734045527853e-6
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAB60000037D00D02); // 2.480158866546844e-5
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAC65BDB739500D01); // 1.984126978734782e-4
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAE161D323AB60B61); // 0.001388888888388
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xAFEEEDB73C088889); // 0.008333333333342
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xB0AAA88B3D2AAAAB); // 0.041666666666727
    pn = vc4cl_add(vc4cl_mul(pn, r), 0xB1AAAA8D3E2AAAAB); // 0.166666666666680
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x271000003F000000); // 0.500000000000002
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000
    pn = vc4cl_add(vc4cl_mul(pn, r), 0x000000003F800000); // 1.000000000000000

    // round back to single precision, undo the range reduction, restore the sign
    result_t result = vc4cl_lossy(pn) * pow2(k);
    return val < 0 ? 1.0f / result : result;
}
|
||||
|
||||
// __kernel void exp_chebyshev_monomial_extended_precision_exact_kernel(__global arg_t *out, const __global arg_t *in)
|
||||
// {
|
||||
// uint gid = get_global_id(0);
|
||||
// out[gid] = exp_chebyshev_monomial_extended_precision_exact(in[gid]);
|
||||
// }
|
||||
|
||||
// TODO Remes from www.netlib.org/fdlibm/e_exp.c
|
||||
|
||||
// TODO Matters computational (sections 32.2.2.2 and 32.2.3)
|
||||
// Pade Approximation (16 steps): (1680 + 840x + 180 x^2 + 20 x^3 + x^4) / (1680 - 840 x + 180 x^2 - 20 x^3 + x^4)
|
||||
|
||||
// TODO https://math.stackexchange.com/questions/1988901/approximating-the-exponential-function-with-taylor-series?rq=1
|
||||
// TODO http://www.netlib.org/fdlibm/
|
||||
|
||||
// Reference driver: evaluates the built-in exp() for comparison with the
// hand-written variants above.
__kernel void exp_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = exp(in[idx]);
}
|
46
drivers/videocore4_stdlib/experimental/fma.cl
Normal file
46
drivers/videocore4_stdlib/experimental/fma.cl
Normal file
@ -0,0 +1,46 @@
|
||||
#define arg_t float16
|
||||
#define result_t float16
|
||||
#define int_t int16
|
||||
#define uint_t uint16
|
||||
|
||||
#define CONCAT(a, b) a##b
|
||||
#define CAT(a, b) CONCAT(a, b)
|
||||
|
||||
float16 vc4cl_lossy(ulong16) __attribute__((overloadable));
|
||||
ulong16 vc4cl_add(ulong16, ulong16) __attribute__((overloadable));
|
||||
ulong16 vc4cl_sub(ulong16, ulong16) __attribute__((overloadable));
|
||||
ulong16 vc4cl_mul(float16, float16) __attribute__((overloadable));
|
||||
ulong16 vc4cl_mul(ulong16, ulong16) __attribute__((overloadable));
|
||||
ulong16 vc4cl_extend(float16 val) __attribute__((overloadable));
|
||||
|
||||
// Straight-forward single-rounding-per-operation reference for fma(a, b, c) = a * b + c.
// NOTE: the original returned in0 * in1 * in2 (a triple product), which does not
// match the FMA contract used by fma_extended_precision and fma_builtin_kernel below.
result_t fma_simple(arg_t in0, arg_t in1, arg_t in2)
{
    return in0 * in1 + in2;
}
|
||||
|
||||
// Test driver: one work-item runs fma_simple() on one triple of elements.
__kernel void fma_simple_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    const uint idx = get_global_id(0);
    out[idx] = fma_simple(in0[idx], in1[idx], in2[idx]);
}
|
||||
|
||||
// FMA via the extended-precision intrinsics: multiply into the ulong16
// soft-float format, add the (extended) addend, then round down to the
// result type only once.
result_t fma_extended_precision(arg_t in0, arg_t in1, arg_t in2)
{
    ulong16 product = vc4cl_mul(in0, in1);
    return vc4cl_lossy(vc4cl_add(product, vc4cl_extend(in2)));
}
|
||||
|
||||
// Test driver: one work-item runs fma_extended_precision() on one triple of elements.
__kernel void fma_extended_precision_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    const uint idx = get_global_id(0);
    out[idx] = fma_extended_precision(in0[idx], in1[idx], in2[idx]);
}
|
||||
|
||||
// Reference driver: evaluates the built-in fma() for comparison with the
// hand-written variants above.
__kernel void fma_builtin_kernel(
    __global arg_t *out, const __global arg_t *in0, const __global arg_t *in1, const __global arg_t *in2)
{
    const uint idx = get_global_id(0);
    out[idx] = fma(in0[idx], in1[idx], in2[idx]);
}
|
9
drivers/videocore4_stdlib/experimental/identity.cl
Normal file
9
drivers/videocore4_stdlib/experimental/identity.cl
Normal file
@ -0,0 +1,9 @@
|
||||
#define arg_t float16
|
||||
#define result_t float16
|
||||
#define int_t int16
|
||||
|
||||
// Pass-through kernel: copies the input buffer to the output buffer unchanged
// (useful as a baseline for measuring kernel overhead).
__kernel void identity_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = in[idx];
}
|
256
drivers/videocore4_stdlib/experimental/log.cl
Normal file
256
drivers/videocore4_stdlib/experimental/log.cl
Normal file
@ -0,0 +1,256 @@
|
||||
#define arg_t float16
|
||||
#define result_t float16
|
||||
#define int_t int16
|
||||
|
||||
/*
 * Helper: arithmetic-geometric mean of x and y,
 *
 * https://en.wikipedia.org/wiki/Arithmetic%E2%80%93geometric_mean
 *
 * Runs a fixed number of mean iterations and returns the arithmetic-side value.
 */
result_t agm(arg_t x, arg_t y)
{
    arg_t a = x; // arithmetic-mean sequence
    arg_t g = y; // geometric-mean sequence
    for(unsigned i = 0; i < 6; ++i) // TODO can adjust number of iterations
    {
        arg_t nextA = (a + g) / (arg_t) 2.0;
        g = sqrt(a * g);
        a = nextA;
    }
    return a;
}
|
||||
|
||||
// Token-pasting helpers used to build type-dependent identifiers (as_int16, convert_float16, ...)
#define CONCAT(a, b) a##b
#define CAT(a, b) CONCAT(a, b)

// Splits `val` = M * 2^E with M in [0.5, 1) by direct bit manipulation of the
// IEEE-754 single-precision representation. Declares two locals in the calling
// scope: `mantissa` (M, keeping val's sign bit) and `reduced` (= E * ln(2)),
// so that log(val) = log(mantissa) + reduced.
#define REDUCE_ARGUMENT_TO_0_1 \
    /* log(S * M * 2^E) = log(S * M) + E log(2) */ \
    int_t bitcast = CAT(as_, int_t)(val); \
    /* deduct exponent offset, we use -126, to go into the range [0.5, 1) */ \
    int_t exponent = ((bitcast >> 23) & 0xFF) - 126; \
    /* mask off exponent and replace with exponent for range [0.5, 1) */ \
    int_t signedMantissaBits = (bitcast & (int_t) 0x807FFFFF) | (int_t) 0x3F000000; \
    arg_t mantissa = CAT(as_, result_t)(signedMantissaBits); \
    result_t reduced = CAT(convert_, result_t)(exponent) * M_LN2_F;
|
||||
|
||||
/*
 * Taylor (Mercator) series,
 *
 * https://en.wikipedia.org/wiki/Mercator_series
 *
 * ln(1 + x) = x - x^2/2 + x^3/3 - x^4/4 + ...
 * converges for -1 < x <= 1, hence the argument reduction.
 */
result_t log1p_taylor(arg_t val)
{
    REDUCE_ARGUMENT_TO_0_1

    // n = 1: the series starts with the mantissa itself
    result_t sum = mantissa;
    arg_t term = mantissa;
#pragma loop unroll
    for(unsigned n = 2; n <= 26; ++n) // TODO can adjust number of iterations
    {
        term *= mantissa;
        // even-order terms are subtracted, odd-order terms are added
        arg_t s = n & 1 ? (arg_t) 1.0 : (arg_t) -1.0;
        sum = sum + s * (arg_t) (1.0 / n) * term;
    }
    return sum + reduced;
}
|
||||
|
||||
// Test driver: shifts the input by -1 so log1p_taylor(x - 1) computes log(x).
__kernel void log1p_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = log1p_taylor(in[idx] - (arg_t) 1.0f);
}
|
||||
|
||||
/*
 * Manually unrolled Mercator series for ln(1 + x):
 * ln(1 + x) = x - x^2/2 + x^3/3 - x^4/4 + ...
 * converges for -1 < x <= 1, hence the argument reduction.
 *
 * Same computation as log1p_taylor, with the loop written out explicitly.
 */
result_t log1p_taylor_unrolled(arg_t val)
{
    REDUCE_ARGUMENT_TO_0_1

    // n = 1: the series starts with the mantissa itself
    result_t sum = mantissa;
    arg_t term = mantissa;

    // n = 2 .. 26: term holds mantissa^n, weighted with alternating 1/n
    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 2.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 3.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 4.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 5.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 6.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 7.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 8.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 9.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 10.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 11.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 12.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 13.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 14.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 15.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 16.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 17.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 18.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 19.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 20.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 21.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 22.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 23.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 24.0) * term;

    term *= mantissa;
    sum = sum + (arg_t) (1.0 / 25.0) * term;

    term *= mantissa;
    sum = sum - (arg_t) (1.0 / 26.0) * term;

    // TODO can adjust number of iterations

    return sum + reduced;
}
|
||||
|
||||
// Test driver: shifts the input by -1 so log1p_taylor_unrolled(x - 1) computes log(x).
__kernel void log1p_taylor_unrolled_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = log1p_taylor_unrolled(in[idx] - (arg_t) 1.0f);
}
|
||||
|
||||
/*
 * Taylor series with optimization (area hyperbolic tangent form), requires
 * argument reduction,
 *
 * https://math.stackexchange.com/a/3383716
 *
 * ln(m) = 2 * atanh(y) = 2 * (y + y^3/3 + y^5/5 + ...) with y = (m - 1) / (m + 1)
 */
result_t log_taylor(arg_t val)
{
    REDUCE_ARGUMENT_TO_0_1

    result_t result = 0;
    // y = (m - 1) / (m + 1); the running term starts at 2*y and advances by
    // y^2 per iteration, so the i-th summand is 2*y^i / i (odd i only).
    // NOTE: the original advanced by (2*y)^2 = 4*y^2, inflating the i-th term
    // by a factor of 4^((i-1)/2) and breaking the series.
    arg_t ratio = (mantissa - (arg_t) 1.0) / (mantissa + (arg_t) 1.0);
    arg_t tmp = 2 * ratio;
    arg_t factor = ratio * ratio;
#pragma loop unroll
    for(unsigned iteration = 1; iteration <= 26; iteration += 2) // TODO can adjust number of iterations
    {
        result += tmp / (arg_t) iteration;
        tmp *= factor;
    }
    return result + reduced;
}
|
||||
|
||||
// Test driver: one work-item evaluates log_taylor() for one element.
__kernel void log_taylor_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = log_taylor(in[idx]);
}
|
||||
|
||||
/*
 * Using the arithmetic-geometric-mean,
 *
 * https://en.wikipedia.org/wiki/Natural_logarithm#High_precision
 *
 * ln(x) ~ pi / (2 * M(1, 4/s)) - m * ln(2), with s = x * 2^m
 */
result_t log_agm(arg_t val)
{
    const unsigned m = 8; // TODO can adjust for precision
    arg_t s = val * (arg_t) (1 << m);
    arg_t mean = agm(1.0, (arg_t) 4.0 / s);
    // NOTE: the original multiplied the numerator by val, which is not part of
    // the AGM formula above
    return ((arg_t) M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2);
}
|
||||
|
||||
// Test driver: one work-item evaluates log_agm() for one element.
__kernel void log_agm_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = log_agm(in[idx]);
}
|
||||
|
||||
/*
 * AGM-based logarithm applied to the reduced mantissa in [0.5, 1):
 * ln(val) = ln(mantissa) + E * ln(2), with ln(mantissa) computed via
 * ln(x) ~ pi / (2 * M(1, 4/s)) - m * ln(2), s = x * 2^m
 * (see log_agm and https://en.wikipedia.org/wiki/Natural_logarithm#High_precision)
 */
result_t log_agm_reduced(arg_t val)
{
    REDUCE_ARGUMENT_TO_0_1

    const unsigned m = 8; // TODO can adjust for precision
    arg_t s = mantissa * (arg_t) (1 << m);
    arg_t mean = agm(1.0, (arg_t) 4.0 / s);
    // NOTE: the original multiplied the numerator by mantissa, which is not
    // part of the AGM formula above
    return ((arg_t) M_PI_F) / (2 * mean) - (arg_t) (m * M_LN2) + reduced;
}
|
||||
|
||||
// Test driver: one work-item evaluates log_agm_reduced() for one element.
__kernel void log_agm_reduced_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = log_agm_reduced(in[idx]);
}
|
||||
|
||||
// Reference driver: evaluates the built-in log() for comparison with the
// hand-written variants above.
__kernel void log_builtin_kernel(__global arg_t *out, const __global arg_t *in)
{
    const uint idx = get_global_id(0);
    out[idx] = log(in[idx]);
}
|
77
drivers/videocore4_stdlib/include/VC4CLStdLib.h
Normal file
77
drivers/videocore4_stdlib/include/VC4CLStdLib.h
Normal file
@ -0,0 +1,77 @@
|
||||
/*
|
||||
* General header for the VC4CLStdlib implementation, contains all required headers
|
||||
*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CLSTDLIB_H
|
||||
#define VC4CLSTDLIB_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#include "_config.h"
|
||||
#include "_extensions.h"
|
||||
#include "_conversions.h"
|
||||
#include "_common.h"
|
||||
#include "_math.h"
|
||||
#include "_integer.h"
|
||||
#include "_geometric.h"
|
||||
#include "_relational.h"
|
||||
#include "_work_items.h"
|
||||
#include "_vector.h"
|
||||
#include "_synchronization.h"
|
||||
#include "_async.h"
|
||||
#include "_atomics.h"
|
||||
#include "_images.h"
|
||||
#include "_printf.h"
|
||||
#include "_spir_mangling.h"
|
||||
#include "_clcxx_mangling.h"
|
||||
|
||||
#undef ALL_BITS_SET
|
||||
#undef OVERLOADABLE
|
||||
#undef CONST
|
||||
#undef PURE
|
||||
#undef INLINE
|
||||
#undef FUNC_1
|
||||
#undef OVERLOAD_1
|
||||
#undef OVERLOAD_1_RETURN_SCALAR
|
||||
#undef FUNC_2
|
||||
#undef OVERLOAD_2
|
||||
#undef OVERLOAD_2_SCALAR
|
||||
#undef OVERLOAD_2_RETURN_SCALAR
|
||||
#undef OVERLOAD_2_SCALAR_RETURN_SCALAR
|
||||
#undef FUNC_3
|
||||
#undef OVERLOAD_3
|
||||
#undef OVERLOAD_3_SCALAR
|
||||
#undef FUNC_4
|
||||
#undef FUNC_5
|
||||
#undef SIMPLE_1
|
||||
#undef SIMPLE_1_RETURN_SCALAR
|
||||
#undef SIMPLE_2
|
||||
#undef SIMPLE_2_RETURN_SCALAR
|
||||
#undef SIMPLE_2_SCALAR
|
||||
#undef SIMPLE_3
|
||||
#undef SIMPLE_3_SCALAR
|
||||
#undef SIMPLE_3_TWO_SCALAR
|
||||
#undef COMPLEX_1
|
||||
#undef COMPLEX_1_RETURN_SCALAR
|
||||
#undef COMPLEX_2
|
||||
#undef COMPLEX_3
|
||||
#undef COMPLEX_3_SCALAR
|
||||
#undef OVERLOAD_ALL_IMAGE_TYPES
|
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_1
|
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_2
|
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_3
|
||||
#undef OVERLOAD_ALL_IMAGE_TYPES_4
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* VC4CLSTDLIB_H */
|
||||
|
245
drivers/videocore4_stdlib/include/_async.h
Normal file
245
drivers/videocore4_stdlib/include/_async.h
Normal file
@ -0,0 +1,245 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_ASYNC_H
|
||||
#define VC4CL_ASYNC_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_overloads.h"
|
||||
|
||||
|
||||
/*
|
||||
* This is a synchronous/blocking implementation.
|
||||
* The copy is "performed by all work-items in a work-group", so any work-item only has to copy a part of the area.
|
||||
* Or, since the copying of memory on different QPUs block each other, we can simply only execute the copying on the first work-item
|
||||
* (index 0, 0, 0). Idea taken from PoCL
|
||||
*/
|
||||
|
||||
/*
 * Body shared by all async_work_group_copy overloads: only work-item 0 performs
 * the copy (see the comment above - concurrent DMA copies on different QPUs
 * would block each other anyway), holding the hardware mutex for the duration
 * of the DMA transfer.
 */
#define ASYNC_COPY_INTERNAL \
    if(vc4cl_local_id(0) == 0) \
    { \
        vc4cl_mutex_lock(); \
        vc4cl_dma_copy(dst, src, num_elements); \
        vc4cl_mutex_unlock(); \
    }
|
||||
|
||||
/*
 * Generates the full set of async_work_group_copy() overloads for the given
 * base type: scalar and vector widths 2/3/4/8/16, in both directions
 * (global -> local and local -> global). Every variant performs the blocking
 * copy (ASYNC_COPY_INTERNAL) and returns the event via vc4cl_set_event().
 */
#define ASYNC_COPY(type) \
    INLINE event_t async_work_group_copy(__local type * dst, const __global type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type * dst, const __local type * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, event_t event) OVERLOADABLE \
    { \
        ASYNC_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
|
||||
|
||||
// Strided gather: dst is filled contiguously, src is read with a stride of
// src_stride elements. Unlike ASYNC_COPY_INTERNAL there is no work-item-0
// guard here, so every work-item encountering the call runs the whole loop.
#define ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
    for (size_t i = 0; i < num_elements; ++i) \
        dst[i] = src[i * src_stride];
//TODO better way, e.g. via vc4cl_dma_copy and stride-parameter?

// Strided scatter: src is read contiguously, dst is written with a stride of
// dst_stride elements.
#define ASYNC_STRIDED_DEST_COPY_INTERNAL \
    for (size_t i = 0; i < num_elements; ++i) \
        dst[i * dst_stride] = src[i];
|
||||
|
||||
/*
 * Generates the full set of async_work_group_strided_copy() overloads for the
 * given base type (scalar and vector widths 2/3/4/8/16, both directions).
 * Gathering variants (global -> local) apply src_stride to the source,
 * scattering variants (local -> global) apply dst_stride to the destination.
 */
#define ASYNC_STRIDED_COPY(type) \
    INLINE event_t async_work_group_strided_copy(__local type * dst, const __global type * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##2 * dst, const __global type##2 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##3 * dst, const __global type##3 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##4 * dst, const __global type##4 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##8 * dst, const __global type##8 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__local type##16 * dst, const __global type##16 * src, size_t num_elements, size_t src_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_SOURCE_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type * dst, const __local type * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##2 * dst, const __local type##2 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##3 * dst, const __local type##3 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##4 * dst, const __local type##4 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##8 * dst, const __local type##8 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    } \
    INLINE event_t async_work_group_strided_copy(__global type##16 * dst, const __local type##16 * src, size_t num_elements, size_t dst_stride, event_t event) OVERLOADABLE \
    { \
        ASYNC_STRIDED_DEST_COPY_INTERNAL \
        return vc4cl_set_event(event); \
    }
|
||||
|
||||
/*
 * Generates the prefetch() overloads for the given base type (scalar and
 * vector widths 2/3/4/8/16); all variants forward to vc4cl_prefetch.
 */
#define PREFETCH(type) \
    INLINE void prefetch(const __global type * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##2 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##3 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##4 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##8 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    } \
    INLINE void prefetch(const __global type##16 * ptr, size_t num_entries) OVERLOADABLE \
    { \
        vc4cl_prefetch(ptr, num_entries); \
    }
|
||||
|
||||
/*
 * OpenCL 1.2, page 278:
 * "Perform an async copy of num_gentypes gentype elements from src to dst.
 * The async copy is performed by all work-items in a work-group and this built-in
 * function must therefore be encountered by all work-items in a work-group executing the kernel with the same argument values."
 */
// Instantiate the (strided) copy overloads for all supported element types
ASYNC_COPY(uchar)
ASYNC_COPY(char)
ASYNC_COPY(ushort)
ASYNC_COPY(short)
ASYNC_COPY(uint)
ASYNC_COPY(int)
ASYNC_COPY(float)

ASYNC_STRIDED_COPY(uchar)
ASYNC_STRIDED_COPY(char)
ASYNC_STRIDED_COPY(ushort)
ASYNC_STRIDED_COPY(short)
ASYNC_STRIDED_COPY(uint)
ASYNC_STRIDED_COPY(int)
ASYNC_STRIDED_COPY(float)
|
||||
|
||||
/*
 * OpenCL 1.2, page 279:
 * "Wait for events that identify the async_work_group_copy operations to complete.
 * The event objects specified in event_list will be released after the wait is performed."
 */
INLINE void wait_group_events(int num_events, event_t* event_list) OVERLOADABLE
{
    // async_work_group_copy is blocking, so we don't need to wait for any asynchronous operation to finish
    // But: Since the copy is only performed on the first work-item, we need to wait for it to finish
    // (num_events/event_list are deliberately unused - there is no per-event state to release here)
    barrier(CLK_GLOBAL_MEM_FENCE);
}
|
||||
|
||||
/*
 * OpenCL 1.2, page 280:
 * "Prefetch num_gentypes * sizeof(gentype) bytes into the global cache.
 * The prefetch instruction is applied to a work-item in a work-group and does not affect the functional behavior of the kernel."
 *
 * -> Since it doesn't affect the functional behavior, the implementation is a no-op
 */
// Instantiate the prefetch overloads for all supported element types
PREFETCH(uchar)
PREFETCH(char)
PREFETCH(ushort)
PREFETCH(short)
PREFETCH(uint)
PREFETCH(int)
PREFETCH(float)
|
||||
|
||||
#undef ASYNC_COPY_INTERNAL
|
||||
#undef ASYNC_COPY
|
||||
#undef ASYNC_STRIDED_SOURCE_COPY_INTERNAL
|
||||
#undef ASYNC_STRIDED_DEST_COPY_INTERNAL
|
||||
#undef ASYNC_STRIDED_COPY
|
||||
#undef PREFETCH
|
||||
|
||||
#endif /* VC4CL_ASYNC_H */
|
||||
|
659
drivers/videocore4_stdlib/include/_atomics.h
Normal file
659
drivers/videocore4_stdlib/include/_atomics.h
Normal file
@ -0,0 +1,659 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_ATOMICS_H
|
||||
#define VC4CL_ATOMICS_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_overloads.h"
|
||||
#include "_intrinsics.h"
|
||||
|
||||
// atomic_add: emulated read-modify-write - the value at *ptr is read and
// re-written under the global hardware mutex. Returns the previous value.
INLINE int atomic_add(volatile __global int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int previous = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, previous + val);
    vc4cl_mutex_unlock();
    return previous;
}

INLINE unsigned int atomic_add(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int previous = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, previous + val);
    vc4cl_mutex_unlock();
    return previous;
}

INLINE int atomic_add(volatile __local int * ptr, int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    int previous = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, previous + val);
    vc4cl_mutex_unlock();
    return previous;
}

INLINE unsigned int atomic_add(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    vc4cl_mutex_lock();
    unsigned int previous = vc4cl_dma_read(ptr);
    vc4cl_dma_write(ptr, previous + val);
    vc4cl_mutex_unlock();
    return previous;
}

// Legacy atom_add aliases, forwarding to the atomic_add overloads above
INLINE int atom_add(volatile __global int *ptr, int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE unsigned int atom_add(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE int atom_add(volatile __local int *ptr, int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}

INLINE unsigned int atom_add(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_add(ptr, val);
}
|
||||
|
||||
INLINE int atomic_sub(volatile __global int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_sub(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_sub(volatile __local int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_sub(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_sub: legacy atom_* spelling; forwards directly to the corresponding
// atomic_sub overload.
INLINE int atom_sub(volatile __global int *ptr, int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE unsigned int atom_sub(volatile __global unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE int atom_sub(volatile __local int *ptr, int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}

INLINE unsigned int atom_sub(volatile __local unsigned int *ptr, unsigned int val) OVERLOADABLE
{
    return atomic_sub(ptr, val);
}
|
||||
|
||||
INLINE int atomic_xchg(volatile __global int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE float atomic_xchg(volatile __global float * ptr, float val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
float old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_xchg(volatile __local int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE float atomic_xchg(volatile __local float * ptr, float val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
float old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_xchg: legacy atom_* spelling; forwards directly to the corresponding
// atomic_xchg overload.
INLINE int atom_xchg(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE unsigned int atom_xchg(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE float atom_xchg(volatile __global float * ptr, float val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE int atom_xchg(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE unsigned int atom_xchg(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}

INLINE float atom_xchg(volatile __local float * ptr, float val) OVERLOADABLE
{
    return atomic_xchg(ptr, val);
}
|
||||
|
||||
INLINE int atomic_inc(volatile __global int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old + 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_inc(volatile __global unsigned int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old + 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_inc(volatile __local int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old + 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_inc(volatile __local unsigned int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old + 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_inc: legacy atom_* spelling; forwards directly to the corresponding
// atomic_inc overload.
INLINE int atom_inc(volatile __global int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE unsigned int atom_inc(volatile __global unsigned int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE int atom_inc(volatile __local int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}

INLINE unsigned int atom_inc(volatile __local unsigned int * ptr) OVERLOADABLE
{
    return atomic_inc(ptr);
}
|
||||
|
||||
INLINE int atomic_dec(volatile __global int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_dec(volatile __global unsigned int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_dec(volatile __local int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_dec(volatile __local unsigned int * ptr) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old - 1);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_dec: legacy atom_* spelling; forwards directly to the corresponding
// atomic_dec overload.
INLINE int atom_dec(volatile __global int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE unsigned int atom_dec(volatile __global unsigned int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE int atom_dec(volatile __local int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}

INLINE unsigned int atom_dec(volatile __local unsigned int * ptr) OVERLOADABLE
{
    return atomic_dec(ptr);
}
|
||||
|
||||
INLINE int atomic_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, (old == compare) ? val : old);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_cmpxchg: legacy atom_* spelling; forwards directly to the
// corresponding atomic_cmpxchg overload.
INLINE int atom_cmpxchg(volatile __global int * ptr, int compare, int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE unsigned int atom_cmpxchg(volatile __global unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE int atom_cmpxchg(volatile __local int * ptr, int compare, int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}

INLINE unsigned int atom_cmpxchg(volatile __local unsigned int * ptr, unsigned int compare, unsigned int val) OVERLOADABLE
{
    return atomic_cmpxchg(ptr, compare, val);
}
|
||||
|
||||
INLINE int atomic_min(volatile __global int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, min(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, min(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_min(volatile __local int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, min(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, min(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_min: legacy atom_* spelling; forwards directly to the corresponding
// atomic_min overload.
INLINE int atom_min(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE unsigned int atom_min(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE int atom_min(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}

INLINE unsigned int atom_min(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_min(ptr, val);
}
|
||||
|
||||
INLINE int atomic_max(volatile __global int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, max(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, max(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_max(volatile __local int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, max(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, max(old, val));
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_max: legacy atom_* spelling; forwards directly to the corresponding
// atomic_max overload.
INLINE int atom_max(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE unsigned int atom_max(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE int atom_max(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}

INLINE unsigned int atom_max(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_max(ptr, val);
}
|
||||
|
||||
INLINE int atomic_and(volatile __global int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old & val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old & val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_and(volatile __local int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old & val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old & val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_and: legacy atom_* spelling; forwards directly to the corresponding
// atomic_and overload.
INLINE int atom_and(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE unsigned int atom_and(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE int atom_and(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}

INLINE unsigned int atom_and(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_and(ptr, val);
}
|
||||
|
||||
INLINE int atomic_or(volatile __global int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old | val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old | val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_or(volatile __local int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old | val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old | val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_or: legacy atom_* spelling; forwards directly to the corresponding
// atomic_or overload.
INLINE int atom_or(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE unsigned int atom_or(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE int atom_or(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}

INLINE unsigned int atom_or(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_or(ptr, val);
}
|
||||
|
||||
INLINE int atomic_xor(volatile __global int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old ^ val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old ^ val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE int atomic_xor(volatile __local int * ptr, int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old ^ val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
INLINE unsigned int atomic_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
|
||||
{
|
||||
vc4cl_mutex_lock();
|
||||
unsigned int old = vc4cl_dma_read(ptr);
|
||||
vc4cl_dma_write(ptr, old ^ val);
|
||||
vc4cl_mutex_unlock();
|
||||
return old;
|
||||
}
|
||||
|
||||
// atom_xor: legacy atom_* spelling; forwards directly to the corresponding
// atomic_xor overload.
INLINE int atom_xor(volatile __global int * ptr, int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE unsigned int atom_xor(volatile __global unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE int atom_xor(volatile __local int * ptr, int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}

INLINE unsigned int atom_xor(volatile __local unsigned int * ptr, unsigned int val) OVERLOADABLE
{
    return atomic_xor(ptr, val);
}
|
||||
|
||||
#endif /* VC4CL_ATOMICS_H */
|
||||
|
411
drivers/videocore4_stdlib/include/_clcxx_mangling.h
Normal file
411
drivers/videocore4_stdlib/include/_clcxx_mangling.h
Normal file
@ -0,0 +1,411 @@
|
||||
/*
|
||||
* OpenCL 2.0 introduces the __generic address space, which is also used by C++ for OpenCL C.
|
||||
*
|
||||
* Since we do not actually care about address spaces (so far), we can just map those functions to one of the existing address spaces.
|
||||
*
|
||||
* Base list of affected functions generated with:
|
||||
* llvm-dis -o /dev/stdout ../VC4CLStdLib/include/VC4CLStdLib.bc | grep -oE 'spir_func .?* \S*AS1.*?\)' | sort
|
||||
*
|
||||
* This header contains wrappers mapping the SPIR-mangled functions to the real implementations.
|
||||
*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
#ifndef VC4CL_GENERIC_MANGLING
#define VC4CL_GENERIC_MANGLING

#include "_config.h"

// Math functions taking an output pointer (modf, fract, frexp, remquo,
// sincos, lgamma_r): the __generic-address-space (AS4) mangled symbol is a
// weak alias of the __global (AS1) implementation. The mangled names must
// match the SPIR/Itanium mangling exactly — do not reformat or "fix" them.
float _Z4modffPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z4modffPU3AS1f")));
float _Z5fractfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z5fractfPU3AS1f")));
float _Z5frexpfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z5frexpfPU3AS1i")));
float _Z6remquoffPU3AS4i(float, float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6remquoffPU3AS1i")));
float _Z6sincosfPU3AS4f(float, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6sincosfPU3AS1f")));
float _Z8lgamma_rfPU3AS4i(float, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8lgamma_rfPU3AS1i")));
// float2 vector variants
float2 _Z4modfDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z4modfDv2_fPU3AS1S_")));
float2 _Z5fractDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z5fractDv2_fPU3AS1S_")));
float2 _Z5frexpDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z5frexpDv2_fPU3AS1Dv2_i")));
float2 _Z6remquoDv2_fS_PU3AS4Dv2_i(float2, float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z6remquoDv2_fS_PU3AS1Dv2_i")));
float2 _Z6sincosDv2_fPU3AS4S_(float2, __attribute__((address_space(4))) float2*) __attribute__((weak, alias("_Z6sincosDv2_fPU3AS1S_")));
float2 _Z8lgamma_rDv2_fPU3AS4Dv2_i(float2, __attribute__((address_space(4))) int2*) __attribute__((weak, alias("_Z8lgamma_rDv2_fPU3AS1Dv2_i")));
// float3 vector variants
float3 _Z4modfDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z4modfDv3_fPU3AS1S_")));
float3 _Z5fractDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z5fractDv3_fPU3AS1S_")));
float3 _Z5frexpDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z5frexpDv3_fPU3AS1Dv3_i")));
float3 _Z6remquoDv3_fS_PU3AS4Dv3_i(float3, float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z6remquoDv3_fS_PU3AS1Dv3_i")));
float3 _Z6sincosDv3_fPU3AS4S_(float3, __attribute__((address_space(4))) float3*) __attribute__((weak, alias("_Z6sincosDv3_fPU3AS1S_")));
float3 _Z8lgamma_rDv3_fPU3AS4Dv3_i(float3, __attribute__((address_space(4))) int3*) __attribute__((weak, alias("_Z8lgamma_rDv3_fPU3AS1Dv3_i")));
// float4 vector variants
float4 _Z4modfDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z4modfDv4_fPU3AS1S_")));
float4 _Z5fractDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z5fractDv4_fPU3AS1S_")));
float4 _Z5frexpDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z5frexpDv4_fPU3AS1Dv4_i")));
float4 _Z6remquoDv4_fS_PU3AS4Dv4_i(float4, float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z6remquoDv4_fS_PU3AS1Dv4_i")));
float4 _Z6sincosDv4_fPU3AS4S_(float4, __attribute__((address_space(4))) float4*) __attribute__((weak, alias("_Z6sincosDv4_fPU3AS1S_")));
float4 _Z8lgamma_rDv4_fPU3AS4Dv4_i(float4, __attribute__((address_space(4))) int4*) __attribute__((weak, alias("_Z8lgamma_rDv4_fPU3AS1Dv4_i")));
// float8 vector variants
float8 _Z4modfDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z4modfDv8_fPU3AS1S_")));
float8 _Z5fractDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z5fractDv8_fPU3AS1S_")));
float8 _Z5frexpDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z5frexpDv8_fPU3AS1Dv8_i")));
float8 _Z6remquoDv8_fS_PU3AS4Dv8_i(float8, float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z6remquoDv8_fS_PU3AS1Dv8_i")));
float8 _Z6sincosDv8_fPU3AS4S_(float8, __attribute__((address_space(4))) float8*) __attribute__((weak, alias("_Z6sincosDv8_fPU3AS1S_")));
float8 _Z8lgamma_rDv8_fPU3AS4Dv8_i(float8, __attribute__((address_space(4))) int8*) __attribute__((weak, alias("_Z8lgamma_rDv8_fPU3AS1Dv8_i")));
// float16 vector variants
float16 _Z4modfDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z4modfDv16_fPU3AS1S_")));
float16 _Z5fractDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z5fractDv16_fPU3AS1S_")));
float16 _Z5frexpDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z5frexpDv16_fPU3AS1Dv16_i")));
float16 _Z6remquoDv16_fS_PU3AS4Dv16_i(float16, float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z6remquoDv16_fS_PU3AS1Dv16_i")));
float16 _Z6sincosDv16_fPU3AS4S_(float16, __attribute__((address_space(4))) float16*) __attribute__((weak, alias("_Z6sincosDv16_fPU3AS1S_")));
float16 _Z8lgamma_rDv16_fPU3AS4Dv16_i(float16, __attribute__((address_space(4))) int16*) __attribute__((weak, alias("_Z8lgamma_rDv16_fPU3AS1Dv16_i")));

// vloadN with a const __generic (AS4) source pointer, aliased to the
// __global (AS1) implementations. Element-type mangling codes:
// c=char, h=uchar, s=short, t=ushort, i=int, j=uint, l=long, m=ulong, f=float.
char2 _Z6vload2jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kc")));
uchar2 _Z6vload2jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kh")));
short2 _Z6vload2jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ks")));
ushort2 _Z6vload2jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kt")));
int2 _Z6vload2jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload2jPU3AS1Ki")));
uint2 _Z6vload2jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kj")));
long2 _Z6vload2jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kl")));
ulong2 _Z6vload2jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload2jPU3AS1Km")));
float2 _Z6vload2jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload2jPU3AS1Kf")));
char3 _Z6vload3jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kc")));
uchar3 _Z6vload3jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kh")));
short3 _Z6vload3jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ks")));
ushort3 _Z6vload3jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kt")));
int3 _Z6vload3jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload3jPU3AS1Ki")));
uint3 _Z6vload3jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kj")));
long3 _Z6vload3jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kl")));
ulong3 _Z6vload3jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload3jPU3AS1Km")));
float3 _Z6vload3jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload3jPU3AS1Kf")));
char4 _Z6vload4jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kc")));
uchar4 _Z6vload4jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kh")));
short4 _Z6vload4jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ks")));
ushort4 _Z6vload4jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kt")));
int4 _Z6vload4jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload4jPU3AS1Ki")));
uint4 _Z6vload4jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kj")));
long4 _Z6vload4jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kl")));
ulong4 _Z6vload4jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload4jPU3AS1Km")));
float4 _Z6vload4jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload4jPU3AS1Kf")));
char8 _Z6vload8jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kc")));
uchar8 _Z6vload8jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kh")));
short8 _Z6vload8jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ks")));
ushort8 _Z6vload8jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kt")));
int8 _Z6vload8jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z6vload8jPU3AS1Ki")));
uint8 _Z6vload8jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kj")));
long8 _Z6vload8jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kl")));
ulong8 _Z6vload8jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z6vload8jPU3AS1Km")));
float8 _Z6vload8jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z6vload8jPU3AS1Kf")));
char16 _Z7vload16jPU3AS4Kc(uint, const __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kc")));
uchar16 _Z7vload16jPU3AS4Kh(uint, const __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kh")));
short16 _Z7vload16jPU3AS4Ks(uint, const __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ks")));
ushort16 _Z7vload16jPU3AS4Kt(uint, const __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kt")));
int16 _Z7vload16jPU3AS4Ki(uint, const __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vload16jPU3AS1Ki")));
uint16 _Z7vload16jPU3AS4Kj(uint, const __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kj")));
long16 _Z7vload16jPU3AS4Kl(uint, const __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kl")));
ulong16 _Z7vload16jPU3AS4Km(uint, const __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vload16jPU3AS1Km")));
float16 _Z7vload16jPU3AS4Kf(uint, const __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vload16jPU3AS1Kf")));
|
||||
|
||||
void _Z7vstore2Dv2_cjPU3AS4c(char2, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore2Dv2_cjPU3AS1c")));
|
||||
/*
 * Weak aliases redirecting the address-space-4 overloads of the OpenCL
 * vstore2 / vstore3 / vstore4 / vstore8 / vstore16 built-ins to their
 * address-space-1 implementations.
 *
 * NOTE(review): AS1 is presumably the __global address space and AS4 the
 * __generic one — confirm against the VC4C address-space numbering.
 *
 * The symbol names are Itanium-ABI mangled and machine generated; do not
 * edit them by hand. Each alias is weak so a real AS4 definition, if one
 * is ever emitted, takes precedence at link time.
 */
void _Z7vstore2Dv2_hjPU3AS4h(uchar2, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore2Dv2_hjPU3AS1h")));
void _Z7vstore2Dv2_sjPU3AS4s(short2, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore2Dv2_sjPU3AS1s")));
void _Z7vstore2Dv2_tjPU3AS4t(ushort2, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore2Dv2_tjPU3AS1t")));
void _Z7vstore2Dv2_ijPU3AS4i(int2, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore2Dv2_ijPU3AS1i")));
void _Z7vstore2Dv2_jjPU3AS4j(uint2, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore2Dv2_jjPU3AS1j")));
void _Z7vstore2Dv2_ljPU3AS4l(long2, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore2Dv2_ljPU3AS1l")));
void _Z7vstore2Dv2_mjPU3AS4m(ulong2, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore2Dv2_mjPU3AS1m")));
void _Z7vstore2Dv2_fjPU3AS4f(float2, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore2Dv2_fjPU3AS1f")));
void _Z7vstore3Dv3_cjPU3AS4c(char3, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore3Dv3_cjPU3AS1c")));
void _Z7vstore3Dv3_hjPU3AS4h(uchar3, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore3Dv3_hjPU3AS1h")));
void _Z7vstore3Dv3_sjPU3AS4s(short3, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore3Dv3_sjPU3AS1s")));
void _Z7vstore3Dv3_tjPU3AS4t(ushort3, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore3Dv3_tjPU3AS1t")));
void _Z7vstore3Dv3_ijPU3AS4i(int3, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore3Dv3_ijPU3AS1i")));
void _Z7vstore3Dv3_jjPU3AS4j(uint3, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore3Dv3_jjPU3AS1j")));
void _Z7vstore3Dv3_ljPU3AS4l(long3, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore3Dv3_ljPU3AS1l")));
void _Z7vstore3Dv3_mjPU3AS4m(ulong3, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore3Dv3_mjPU3AS1m")));
void _Z7vstore3Dv3_fjPU3AS4f(float3, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore3Dv3_fjPU3AS1f")));
void _Z7vstore4Dv4_cjPU3AS4c(char4, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore4Dv4_cjPU3AS1c")));
void _Z7vstore4Dv4_hjPU3AS4h(uchar4, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore4Dv4_hjPU3AS1h")));
void _Z7vstore4Dv4_sjPU3AS4s(short4, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore4Dv4_sjPU3AS1s")));
void _Z7vstore4Dv4_tjPU3AS4t(ushort4, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore4Dv4_tjPU3AS1t")));
void _Z7vstore4Dv4_ijPU3AS4i(int4, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore4Dv4_ijPU3AS1i")));
void _Z7vstore4Dv4_jjPU3AS4j(uint4, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore4Dv4_jjPU3AS1j")));
void _Z7vstore4Dv4_ljPU3AS4l(long4, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore4Dv4_ljPU3AS1l")));
void _Z7vstore4Dv4_mjPU3AS4m(ulong4, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore4Dv4_mjPU3AS1m")));
void _Z7vstore4Dv4_fjPU3AS4f(float4, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore4Dv4_fjPU3AS1f")));
void _Z7vstore8Dv8_cjPU3AS4c(char8, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z7vstore8Dv8_cjPU3AS1c")));
void _Z7vstore8Dv8_hjPU3AS4h(uchar8, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z7vstore8Dv8_hjPU3AS1h")));
void _Z7vstore8Dv8_sjPU3AS4s(short8, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z7vstore8Dv8_sjPU3AS1s")));
void _Z7vstore8Dv8_tjPU3AS4t(ushort8, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z7vstore8Dv8_tjPU3AS1t")));
void _Z7vstore8Dv8_ijPU3AS4i(int8, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z7vstore8Dv8_ijPU3AS1i")));
void _Z7vstore8Dv8_jjPU3AS4j(uint8, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z7vstore8Dv8_jjPU3AS1j")));
void _Z7vstore8Dv8_ljPU3AS4l(long8, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z7vstore8Dv8_ljPU3AS1l")));
void _Z7vstore8Dv8_mjPU3AS4m(ulong8, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z7vstore8Dv8_mjPU3AS1m")));
void _Z7vstore8Dv8_fjPU3AS4f(float8, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z7vstore8Dv8_fjPU3AS1f")));
void _Z8vstore16Dv16_cjPU3AS4c(char16, uint, __attribute__((address_space(4))) char*) __attribute__((weak, alias("_Z8vstore16Dv16_cjPU3AS1c")));
void _Z8vstore16Dv16_hjPU3AS4h(uchar16, uint, __attribute__((address_space(4))) uchar*) __attribute__((weak, alias("_Z8vstore16Dv16_hjPU3AS1h")));
void _Z8vstore16Dv16_sjPU3AS4s(short16, uint, __attribute__((address_space(4))) short*) __attribute__((weak, alias("_Z8vstore16Dv16_sjPU3AS1s")));
void _Z8vstore16Dv16_tjPU3AS4t(ushort16, uint, __attribute__((address_space(4))) ushort*) __attribute__((weak, alias("_Z8vstore16Dv16_tjPU3AS1t")));
void _Z8vstore16Dv16_ijPU3AS4i(int16, uint, __attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8vstore16Dv16_ijPU3AS1i")));
void _Z8vstore16Dv16_jjPU3AS4j(uint16, uint, __attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8vstore16Dv16_jjPU3AS1j")));
void _Z8vstore16Dv16_ljPU3AS4l(long16, uint, __attribute__((address_space(4))) long*) __attribute__((weak, alias("_Z8vstore16Dv16_ljPU3AS1l")));
void _Z8vstore16Dv16_mjPU3AS4m(ulong16, uint, __attribute__((address_space(4))) ulong*) __attribute__((weak, alias("_Z8vstore16Dv16_mjPU3AS1m")));
void _Z8vstore16Dv16_fjPU3AS4f(float16, uint, __attribute__((address_space(4))) float*) __attribute__((weak, alias("_Z8vstore16Dv16_fjPU3AS1f")));
int _Z10atomic_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vii")));
|
||||
uint _Z10atomic_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_andPU3AS1Vjj")));
|
||||
int _Z8atom_andPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_andPU3AS1Vii")));
|
||||
uint _Z8atom_andPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_andPU3AS1Vjj")));
|
||||
int _Z9atomic_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vii")));
|
||||
uint _Z9atomic_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atomic_orPU3AS1Vjj")));
|
||||
int _Z7atom_orPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z7atom_orPU3AS1Vii")));
|
||||
uint _Z7atom_orPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z7atom_orPU3AS1Vjj")));
|
||||
int _Z10atomic_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vii")));
|
||||
uint _Z10atomic_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_xorPU3AS1Vjj")));
|
||||
int _Z8atom_xorPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vii")));
|
||||
uint _Z8atom_xorPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_xorPU3AS1Vjj")));
|
||||
int _Z10atomic_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vi")));
|
||||
uint _Z10atomic_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_decPU3AS1Vj")));
|
||||
int _Z8atom_decPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vi")));
|
||||
uint _Z8atom_decPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_decPU3AS1Vj")));
|
||||
int _Z10atomic_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vi")));
|
||||
uint _Z10atomic_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z10atomic_incPU3AS1Vj")));
|
||||
int _Z8atom_incPU3AS4Vi(__attribute__((address_space(4))) int*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vi")));
|
||||
uint _Z8atom_incPU3AS4Vj(__attribute__((address_space(4))) uint*) __attribute__((weak, alias("_Z8atom_incPU3AS1Vj")));
|
||||
int _Z10atomic_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vii")));
|
||||
uint _Z10atomic_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_maxPU3AS1Vjj")));
|
||||
int _Z8atom_maxPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vii")));
|
||||
uint _Z8atom_maxPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_maxPU3AS1Vjj")));
|
||||
int _Z10atomic_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vii")));
|
||||
uint _Z10atomic_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_minPU3AS1Vjj")));
|
||||
int _Z8atom_minPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_minPU3AS1Vii")));
|
||||
uint _Z8atom_minPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_minPU3AS1Vjj")));
|
||||
int _Z10atomic_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vii")));
|
||||
/*
 * Weak aliases mapping the address-space-4 overloads of the OpenCL atomic
 * builtins onto their address-space-1 implementations.
 *
 * Each declaration uses the Itanium-mangled name of the AS4 overload
 * (…PU3AS4V…) and aliases it to the already-defined AS1 overload
 * (…PU3AS1V…), so both mangled symbols resolve to the same code.
 * NOTE(review): presumably AS4 is the "generic"/constant address space that
 * shares storage with __global on the VideoCore IV — confirm against the
 * VC4CL address-space mapping.
 *
 * The aliases are `weak` so a later strong definition of any AS4 overload
 * can override the shared implementation without a link error.
 *
 * Covered builtins: atomic_add/atom_add, atomic_sub/atom_sub,
 * atomic_xchg/atom_xchg (int, uint; float for xchg only),
 * atom_cmpxchg/atomic_cmpxchg (int, uint).
 */
uint _Z10atomic_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_addPU3AS1Vjj")));
int _Z8atom_addPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_addPU3AS1Vii")));
uint _Z8atom_addPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_addPU3AS1Vjj")));
int _Z10atomic_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vii")));
uint _Z10atomic_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z10atomic_subPU3AS1Vjj")));
int _Z8atom_subPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z8atom_subPU3AS1Vii")));
uint _Z8atom_subPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z8atom_subPU3AS1Vjj")));
int _Z11atomic_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vii")));
uint _Z11atomic_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vjj")));
/* atomic_xchg is the only atomic with a float overload in OpenCL 1.x. */
float _Z11atomic_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z11atomic_xchgPU3AS1Vff")));
int _Z9atom_xchgPU3AS4Vii(__attribute__((address_space(4))) int*, int) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vii")));
uint _Z9atom_xchgPU3AS4Vjj(__attribute__((address_space(4))) uint*, uint) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vjj")));
float _Z9atom_xchgPU3AS4Vff(__attribute__((address_space(4))) float*, float) __attribute__((weak, alias("_Z9atom_xchgPU3AS1Vff")));
/* Compare-and-exchange takes (ptr, compare, value) — hence the extra int/uint argument. */
int _Z12atom_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Viii")));
uint _Z12atom_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z12atom_cmpxchgPU3AS1Vjjj")));
int _Z14atomic_cmpxchgPU3AS4Viii(__attribute__((address_space(4))) int*, int, int) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Viii")));
uint _Z14atomic_cmpxchgPU3AS4Vjjj(__attribute__((address_space(4))) uint*, uint, uint) __attribute__((weak, alias("_Z14atomic_cmpxchgPU3AS1Vjjj")));

/*
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1cPU3AS3Kcj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_cPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_fPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_hPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_iPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_jPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_sPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv16_tPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_cPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_fPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_hPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_iPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_jPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_sPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv2_tPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_cPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_fPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_hPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_iPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_jPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_sPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv3_tPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_cPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_fPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_hPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_iPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_jPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_sPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv4_tPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_cPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_fPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_hPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_iPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_jPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_sPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1Dv8_tPU3AS3KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1fPU3AS3Kfj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1hPU3AS3Khj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1iPU3AS3Kij9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1jPU3AS3Kjj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1sPU3AS3Ksj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS1tPU3AS3Ktj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3cPU3AS1Kcj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_cPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_fPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_hPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_iPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_jPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_sPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv16_tPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_cPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_fPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_hPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_iPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_jPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_sPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv2_tPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_cPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_fPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_hPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_iPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_jPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_sPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv3_tPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_cPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_fPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_hPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_iPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_jPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_sPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv4_tPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_cPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_fPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_hPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_iPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_jPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_sPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3Dv8_tPU3AS1KS_j9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3fPU3AS1Kfj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3hPU3AS1Khj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3iPU3AS1Kij9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3jPU3AS1Kjj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3sPU3AS1Ksj9ocl_event")));
|
||||
%opencl.event_t* _Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16*, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z21async_work_group_copyPU3AS3tPU3AS1Ktj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1cPU3AS3Kcjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_cPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float16*, float16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_fPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event(<16 x i8> __attribute__((address_space(4)))*, <16 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_hPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_iPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event(<16 x i32> __attribute__((address_space(4)))*, <16 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_jPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_sPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event(<16 x i16> __attribute__((address_space(4)))*, <16 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv16_tPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_cPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float2*, float2 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_fPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event(<2 x i8> __attribute__((address_space(4)))*, <2 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_hPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_iPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event(<2 x i32> __attribute__((address_space(4)))*, <2 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_jPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_sPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event(<2 x i16> __attribute__((address_space(4)))*, <2 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv2_tPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_cPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float3*, float3 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_fPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event(<3 x i8> __attribute__((address_space(4)))*, <3 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_hPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_iPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event(<3 x i32> __attribute__((address_space(4)))*, <3 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_jPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_sPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event(<3 x i16> __attribute__((address_space(4)))*, <3 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv3_tPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_cPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float4*, float4 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_fPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event(<4 x i8> __attribute__((address_space(4)))*, <4 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_hPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_iPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event(<4 x i32> __attribute__((address_space(4)))*, <4 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_jPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_sPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event(<4 x i16> __attribute__((address_space(4)))*, <4 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv4_tPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_cPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event(__attribute__((address_space(4))) float8*, float8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_fPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event(<8 x i8> __attribute__((address_space(4)))*, <8 x i8> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_hPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_iPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event(<8 x i32> __attribute__((address_space(4)))*, <8 x i32> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_jPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_sPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event(<8 x i16> __attribute__((address_space(4)))*, <8 x i16> __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1Dv8_tPU3AS3KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event(__attribute__((address_space(4))) float*, float __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1fPU3AS3Kfjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event(__attribute__((address_space(4))) i8*, i8 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1hPU3AS3Khjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1iPU3AS3Kijj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event(__attribute__((address_space(4))) i32*, i32 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1jPU3AS3Kjjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1sPU3AS3Ksjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event(__attribute__((address_space(4))) i16*, i16 __attribute__((address_space(3)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS1tPU3AS3Ktjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3cPU3AS1Kcjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_cPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event(float16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_fPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event(<16 x i8> __attribute__((address_space(3)))*, <16 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_hPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_iPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event(<16 x i32> __attribute__((address_space(3)))*, <16 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_jPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_sPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event(<16 x i16> __attribute__((address_space(3)))*, <16 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv16_tPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_cPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event(float2 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float2* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_fPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event(<2 x i8> __attribute__((address_space(3)))*, <2 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_hPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_iPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event(<2 x i32> __attribute__((address_space(3)))*, <2 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_jPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_sPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event(<2 x i16> __attribute__((address_space(3)))*, <2 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv2_tPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_cPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event(float3 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float3* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_fPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event(<3 x i8> __attribute__((address_space(3)))*, <3 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_hPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_iPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event(<3 x i32> __attribute__((address_space(3)))*, <3 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_jPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_sPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event(<3 x i16> __attribute__((address_space(3)))*, <3 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv3_tPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_cPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event(float4 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float4* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_fPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event(<4 x i8> __attribute__((address_space(3)))*, <4 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_hPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_iPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event(<4 x i32> __attribute__((address_space(3)))*, <4 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_jPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_sPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event(<4 x i16> __attribute__((address_space(3)))*, <4 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv4_tPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_cPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event(float8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) float8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_fPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event(<8 x i8> __attribute__((address_space(3)))*, <8 x i8> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_hPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_iPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event(<8 x i32> __attribute__((address_space(3)))*, <8 x i32> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_jPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_sPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event(<8 x i16> __attribute__((address_space(3)))*, <8 x i16> __attribute__((address_space(4)))* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3Dv8_tPU3AS1KS_jj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event(float __attribute__((address_space(3)))*, __attribute__((address_space(4))) float* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3fPU3AS1Kfjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event(i8 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i8* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3hPU3AS1Khjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3iPU3AS1Kijj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event(i32 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i32* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3jPU3AS1Kjjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3sPU3AS1Ksjj9ocl_event")));
|
||||
%opencl.event_t* _Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event(i16 __attribute__((address_space(3)))*, __attribute__((address_space(4))) i16* , i32, i32, %opencl.event_t* readnone) __attribute__((weak, alias("_Z29async_work_group_strided_copyPU3AS3tPU3AS1Ktjj9ocl_event")));
|
||||
|
||||
TODO missing wait_group_events function(s)
|
||||
|
||||
void _Z8prefetchPU3AS1Kcj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kcj")));
|
||||
void _Z8prefetchPU3AS1KDv16_cj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_cj")));
|
||||
void _Z8prefetchPU3AS1KDv16_fj(__attribute__((address_space(4))) float16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_fj")));
|
||||
void _Z8prefetchPU3AS1KDv16_hj(<16 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_hj")));
|
||||
void _Z8prefetchPU3AS1KDv16_ij(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_ij")));
|
||||
void _Z8prefetchPU3AS1KDv16_jj(<16 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_jj")));
|
||||
void _Z8prefetchPU3AS1KDv16_sj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_sj")));
|
||||
void _Z8prefetchPU3AS1KDv16_tj(<16 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv16_tj")));
|
||||
void _Z8prefetchPU3AS1KDv2_cj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_cj")));
|
||||
void _Z8prefetchPU3AS1KDv2_fj(__attribute__((address_space(4))) float2*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_fj")));
|
||||
void _Z8prefetchPU3AS1KDv2_hj(<2 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_hj")));
|
||||
void _Z8prefetchPU3AS1KDv2_ij(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_ij")));
|
||||
void _Z8prefetchPU3AS1KDv2_jj(<2 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_jj")));
|
||||
void _Z8prefetchPU3AS1KDv2_sj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_sj")));
|
||||
void _Z8prefetchPU3AS1KDv2_tj(<2 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv2_tj")));
|
||||
void _Z8prefetchPU3AS1KDv3_cj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_cj")));
|
||||
void _Z8prefetchPU3AS1KDv3_fj(__attribute__((address_space(4))) float3*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_fj")));
|
||||
void _Z8prefetchPU3AS1KDv3_hj(<3 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_hj")));
|
||||
void _Z8prefetchPU3AS1KDv3_ij(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_ij")));
|
||||
void _Z8prefetchPU3AS1KDv3_jj(<3 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_jj")));
|
||||
void _Z8prefetchPU3AS1KDv3_sj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_sj")));
|
||||
void _Z8prefetchPU3AS1KDv3_tj(<3 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv3_tj")));
|
||||
void _Z8prefetchPU3AS1KDv4_cj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_cj")));
|
||||
void _Z8prefetchPU3AS1KDv4_fj(__attribute__((address_space(4))) float4*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_fj")));
|
||||
void _Z8prefetchPU3AS1KDv4_hj(<4 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_hj")));
|
||||
void _Z8prefetchPU3AS1KDv4_ij(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_ij")));
|
||||
void _Z8prefetchPU3AS1KDv4_jj(<4 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_jj")));
|
||||
void _Z8prefetchPU3AS1KDv4_sj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_sj")));
|
||||
void _Z8prefetchPU3AS1KDv4_tj(<4 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv4_tj")));
|
||||
void _Z8prefetchPU3AS1KDv8_cj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_cj")));
|
||||
void _Z8prefetchPU3AS1KDv8_fj(__attribute__((address_space(4))) float8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_fj")));
|
||||
void _Z8prefetchPU3AS1KDv8_hj(<8 x i8> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_hj")));
|
||||
void _Z8prefetchPU3AS1KDv8_ij(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_ij")));
|
||||
void _Z8prefetchPU3AS1KDv8_jj(<8 x i32> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_jj")));
|
||||
void _Z8prefetchPU3AS1KDv8_sj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_sj")));
|
||||
void _Z8prefetchPU3AS1KDv8_tj(<8 x i16> __attribute__((address_space(4)))*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1KDv8_tj")));
|
||||
void _Z8prefetchPU3AS1Kfj(__attribute__((address_space(4))) float*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kfj")));
|
||||
void _Z8prefetchPU3AS1Khj(__attribute__((address_space(4))) i8*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Khj")));
|
||||
void _Z8prefetchPU3AS1Kij(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kij")));
|
||||
void _Z8prefetchPU3AS1Kjj(__attribute__((address_space(4))) i32*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Kjj")));
|
||||
void _Z8prefetchPU3AS1Ksj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ksj")));
|
||||
void _Z8prefetchPU3AS1Ktj(__attribute__((address_space(4))) i16*, i32) __attribute__((weak, alias("_Z8prefetchPU3AS1Ktj")));
|
||||
*/
|
||||
#endif /* VC4CL_GENERIC_MANGLING */
|
101
drivers/videocore4_stdlib/include/_common.h
Normal file
101
drivers/videocore4_stdlib/include/_common.h
Normal file
@ -0,0 +1,101 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_COMMON_H
|
||||
#define VC4CL_COMMON_H
|
||||
|
||||
#include "_overloads.h"
|
||||
#include "_intrinsics.h"
|
||||
|
||||
|
||||
/*
|
||||
* Common functions
|
||||
*
|
||||
* Some functions have no maximum error in the OpenCL specification, see here: https://github.com/KhronosGroup/OpenCL-Docs/issues/33
|
||||
*
|
||||
* degrees -> 2 ULP
|
||||
* radians -> 2 ULP
|
||||
* mix -> "implementation defined"
|
||||
* smoothstep -> "implementation defined"
|
||||
* clamp, min, max, step, sign -> 0 ULP
|
||||
*/
|
||||
|
||||
// clamp(x, minval, maxval): component-wise fmin(fmax(x, minval), maxval) (0 ULP requirement, see note above)
SIMPLE_3(float, clamp, float, x, float, minval, float, maxval, fmin(fmax(x, minval), maxval))
//TODO version with limits as scalar
|
||||
|
||||
// degrees(r) = r * (180 / pi)
// NOTE: using 0x1.ca5dc2p+5 (= 180/M_PI_F + 1 ULP) is slightly more accurate than using 0x1.ca5dcp+5 (180 / M_PI_F),
// but both are accurate enough for 2 ULP maximum error
SIMPLE_1(float, degrees, float, radians, 0x1.ca5dc2p+5 * radians)
|
||||
|
||||
// max/min: 0 ULP requirement (see note above), fulfilled by forwarding directly to the hardware intrinsics.
// Results are undefined for one of the inputs NaN or Inf,
// so we can directly call the intrinsic and don't need to handle these inputs explicitly
SIMPLE_2(float, max, float, x, float, y, vc4cl_fmax(x, y))
SIMPLE_2_SCALAR(float, max, float, x, float, y, vc4cl_fmax(x, y))

SIMPLE_2(float, min, float, x, float, y, vc4cl_fmin(x, y))
SIMPLE_2_SCALAR(float, min, float, x, float, y, vc4cl_fmin(x, y))
|
||||
|
||||
//" Returns the linear blend of x and y implemented as:
// x + (y - x) * a
// a must be a value in the range 0.0 ... 1.0. If a is not in the range 0.0 ... 1.0, the return values are undefined. "
// (maximum error for mix is "implementation defined" per the OpenCL specification, see note at the top of this file)
SIMPLE_3(float, mix, float, x, float, y, float, a, x + (y - x) * a)
SIMPLE_3_SCALAR(float, mix, float, x, float, y, float, a, x + (y - x) * a)
|
||||
|
||||
// radians(d) = d * (pi / 180); the (M_PI_F / 180) factor is a compile-time constant
SIMPLE_1(float, radians, float, degrees, (M_PI_F / 180) * degrees)
|
||||
|
||||
// step(edge, val): 0.0 for val < edge, else 1.0 (exact comparison, 0 ULP requirement per note above)
SIMPLE_2(float, step, float, edge, float, val, val < edge ? 0.0f : 1.0f)
|
||||
// Scalar-edge overloads of step(): widen the scalar edge to the matching vector
// width and delegate to the macro-generated vector-vector step() above.
INLINE float2 step(float edge, float2 val) OVERLOADABLE
{
    float2 edgeVec = (float2) edge;
    return step(edgeVec, val);
}
INLINE float3 step(float edge, float3 val) OVERLOADABLE
{
    float3 edgeVec = (float3) edge;
    return step(edgeVec, val);
}
INLINE float4 step(float edge, float4 val) OVERLOADABLE
{
    float4 edgeVec = (float4) edge;
    return step(edgeVec, val);
}
INLINE float8 step(float edge, float8 val) OVERLOADABLE
{
    float8 edgeVec = (float8) edge;
    return step(edgeVec, val);
}
INLINE float16 step(float edge, float16 val) OVERLOADABLE
{
    float16 edgeVec = (float16) edge;
    return step(edgeVec, val);
}
|
||||
|
||||
// smoothstep: Hermite interpolation between 0 and 1 as val goes from edge0 to edge1
// (maximum error is "implementation defined" per the OpenCL specification, see note at the top of this file)
COMPLEX_3(float, smoothstep, float, edge0, float, edge1, float, val,
{
    // t = clamp((val - edge0) / (edge1 - edge0), 0, 1), result = 3t^2 - 2t^3
    result_t tmp = clamp((result_t) (val - edge0) / (edge1 - edge0), (result_t)0.0f, (result_t)1.0f);
    return tmp * tmp * (3 - 2 * tmp);
})
|
||||
// Scalar-edge overloads of smoothstep(): broadcast both edges to the matching
// vector width and delegate to the macro-generated vector-vector version above.
INLINE float2 smoothstep(float edge0, float edge1, float2 val) OVERLOADABLE
{
    float2 lower = (float2) edge0;
    float2 upper = (float2) edge1;
    return smoothstep(lower, upper, val);
}
INLINE float3 smoothstep(float edge0, float edge1, float3 val) OVERLOADABLE
{
    float3 lower = (float3) edge0;
    float3 upper = (float3) edge1;
    return smoothstep(lower, upper, val);
}
INLINE float4 smoothstep(float edge0, float edge1, float4 val) OVERLOADABLE
{
    float4 lower = (float4) edge0;
    float4 upper = (float4) edge1;
    return smoothstep(lower, upper, val);
}
INLINE float8 smoothstep(float edge0, float edge1, float8 val) OVERLOADABLE
{
    float8 lower = (float8) edge0;
    float8 upper = (float8) edge1;
    return smoothstep(lower, upper, val);
}
INLINE float16 smoothstep(float edge0, float edge1, float16 val) OVERLOADABLE
{
    float16 lower = (float16) edge0;
    float16 upper = (float16) edge1;
    return smoothstep(lower, upper, val);
}
|
||||
|
||||
// sign(x): 1.0 for x > 0, -1.0 for x < 0, zero with the sign of x for x == +/-0, 0.0 for NaN.
// OpenCL 1.2 section 6.12.2 requires sign(-0.0f) == -0.0f; the previous expression
// collapsed both zeros to +0.0f. "val == val" is false only for NaN, so zeros are
// returned as-is (preserving the sign bit) while NaN still yields +0.0f (0 ULP requirement).
SIMPLE_1(float, sign, float, val, val > 0.0f ? 1.0f : val < 0.0f ? -1.0f : val == val ? val : 0.0f)
|
||||
|
||||
#endif /* VC4CL_COMMON_H */
|
||||
|
30
drivers/videocore4_stdlib/include/_config.h
Normal file
30
drivers/videocore4_stdlib/include/_config.h
Normal file
@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_CONFIG_H
|
||||
#define VC4CL_CONFIG_H
|
||||
|
||||
#include "defines.h"
|
||||
|
||||
#include "opencl-c.h"
|
||||
|
||||
#ifndef NULL
|
||||
#define NULL ((void *)0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Math constants
|
||||
*/
|
||||
/*
 * NOTE(review): the value 3.01029995663981195214 equals 10 * log10(2), NOT log_2(10)
 * (log_2(10) = 3.32192809488736234787...). Verify how M_LOG210 is actually used before
 * relying on the name/comment or changing the constant.
 */
#define M_LOG210 3.01029995663981195214f /* claimed to be log_2(10) - see review note above */
#undef NAN
#define NAN 0x7fffffffU /* same as defined in OpenCL C, but as integer (exponent all ones, non-zero mantissa => quiet NaN bit-pattern) */
#undef INF
#define INF 0x7f800000U /* single-precision +infinity bit-pattern, as integer */

#define ALL_BITS_SET 0xFFFFFFFFU /* 32-bit all-ones mask */
|
||||
|
||||
#endif /* VC4CL_CONFIG_H */
|
||||
|
1861
drivers/videocore4_stdlib/include/_conversions.h
Normal file
1861
drivers/videocore4_stdlib/include/_conversions.h
Normal file
File diff suppressed because it is too large
Load Diff
173
drivers/videocore4_stdlib/include/_extensions.h
Normal file
173
drivers/videocore4_stdlib/include/_extensions.h
Normal file
@ -0,0 +1,173 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_EXTENSIONS_H
|
||||
#define VC4CL_EXTENSIONS_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_overloads.h"
|
||||
#include "_intrinsics.h"
|
||||
|
||||
|
||||
/*
|
||||
* Loop unroll pragma extension
|
||||
*
|
||||
* Defines "#pragma unroll <factor>"
|
||||
*
|
||||
* CLang supports this natively, so we do not need to do anything
|
||||
*
|
||||
* See https://www.khronos.org/registry/OpenCL/extensions/nv/cl_nv_pragma_unroll.txt
|
||||
* See https://clang.llvm.org/docs/AttributeReference.html#pragma-unroll-pragma-nounroll
|
||||
*/
|
||||
#ifndef cl_nv_pragma_unroll
|
||||
#define cl_nv_pragma_unroll 1
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ARM core-ID extension
|
||||
*
|
||||
* Adds function
|
||||
* uint arm_get_core_id( void )
|
||||
* which returns the ID of the OpenCL Computation Unit, which is always zero
|
||||
*
|
||||
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_get_core_id.txt
|
||||
*/
|
||||
#ifndef cl_arm_core_id
|
||||
#define cl_arm_core_id 1
|
||||
#endif
|
||||
uint arm_get_core_id(void); //prototype, prevents warning
// cl_arm_core_id: this implementation reports a single OpenCL compute unit,
// so the core ID is constant zero
uint arm_get_core_id(void)
{
    return 0;
}
|
||||
|
||||
/*
|
||||
* 32-bit atomic counters
|
||||
*
|
||||
* Adds type
|
||||
* counter_32_t
|
||||
* which is a 32-bit type for atomic counters. counter32_t can only be passed as kernel parameter and cannot be read/assigned.
|
||||
*
|
||||
* Adds functions
|
||||
* uint atomic_inc(counter32_t counter)
|
||||
* uint atomic_dec(counter32_t counter)
|
||||
* increments/decrements the given counter32_t value atomically.
|
||||
*
|
||||
* NOTE: Since the syntax/semantics is exactly the same as for the uint version of the standard atomic_inc/atomic_dec functions, counter32_t is used as typedef to an uint pointer.
|
||||
*
|
||||
* See https://www.khronos.org/registry/OpenCL/extensions/ext/cl_ext_atomic_counters_32.txt
|
||||
*/
|
||||
#ifndef cl_ext_atomic_counters_32
|
||||
#define cl_ext_atomic_counters_32 1
|
||||
#endif
|
||||
// counter32_t is only valid as a kernel parameter; since its semantics match the
// uint pointer versions of atomic_inc/atomic_dec (see note above), it is a plain
// typedef to a volatile global uint pointer
typedef volatile __global uint* counter32_t;
//just the prototypes, the implementations reside in _atomics.h
uint atomic_inc(counter32_t counter) OVERLOADABLE;
uint atomic_dec(counter32_t counter) OVERLOADABLE;
|
||||
|
||||
/*
|
||||
* Integer dot products
|
||||
*
|
||||
* Adds functions
|
||||
* int arm_dot(char4 a, char4 b)
|
||||
* uint arm_dot(uchar4 a, uchar4 b)
|
||||
* int arm_dot_acc(char4 a, char4 b, int acc)
|
||||
* uint arm_dot_acc(uchar4 a, uchar4 b, uint acc)
|
||||
* int arm_dot_acc(short2 a, short2 b, int acc)
|
||||
* uint arm_dot_acc(ushort2 a, ushort2 b, uint acc)
|
||||
* int arm_dot_acc_sat(char4 a, char4 b, int acc)
|
||||
* uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc)
|
||||
* calculate integer dot product (and additionally adds the scalar value).
|
||||
* For the functions xxx_sat, the final addition is saturating.
|
||||
*
|
||||
* See https://www.khronos.org/registry/OpenCL/extensions/arm/cl_arm_integer_dot_product.txt
|
||||
*/
|
||||
#ifndef cl_arm_integer_dot_product_int8
|
||||
#define cl_arm_integer_dot_product_int8 1
|
||||
#endif
|
||||
#ifndef cl_arm_integer_dot_product_accumulate_int8
|
||||
#define cl_arm_integer_dot_product_accumulate_int8 1
|
||||
#endif
|
||||
#ifndef cl_arm_integer_dot_product_accumulate_int16
|
||||
#define cl_arm_integer_dot_product_accumulate_int16 1
|
||||
#endif
|
||||
#ifndef cl_arm_integer_dot_product_accumulate_saturate_int8
|
||||
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1
|
||||
#endif
|
||||
|
||||
// prototypes to prevent warnings
// (functions of the cl_arm_integer_dot_product_* extensions, implemented below)
int arm_dot(char4 a, char4 b) OVERLOADABLE;
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE;
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE;
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE;
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE;
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE;
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE;
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE;
|
||||
|
||||
/**
|
||||
* (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w)
|
||||
*/
|
||||
int arm_dot(char4 a, char4 b) OVERLOADABLE CONST
|
||||
{
|
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||||
return tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
|
||||
}
|
||||
// Unsigned 8-bit dot product, element-wise products summed horizontally
uint arm_dot(uchar4 a, uchar4 b) OVERLOADABLE CONST
{
    // 8-bit operands always fit into the 24-bit multiplication
    uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return (products.s0 + products.s1) + (products.s2 + products.s3);
}
|
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
|
||||
*/
|
||||
int arm_dot_acc(char4 a, char4 b, int acc) OVERLOADABLE CONST
|
||||
{
|
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||||
return acc + tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3;
|
||||
}
|
||||
|
||||
// Unsigned variant: acc + horizontal sum of the element-wise products
uint arm_dot_acc(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    uint sum = (products.s0 + products.s1) + (products.s2 + products.s3);
    return acc + sum;
}
|
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) ]
|
||||
*/
|
||||
int arm_dot_acc(short2 a, short2 b, int acc) OVERLOADABLE CONST
|
||||
{
|
||||
int2 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||||
return acc + tmp.s0 + tmp.s1;
|
||||
}
|
||||
|
||||
// Unsigned 16-bit variant: acc + (a.x * b.x) + (a.y * b.y)
uint arm_dot_acc(ushort2 a, ushort2 b, uint acc) OVERLOADABLE CONST
{
    uint2 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    return acc + (products.s0 + products.s1);
}
|
||||
|
||||
/**
|
||||
* acc + [ (a.x * b.x) + (a.y * b.y) + (a.z * b.z) + (a.w * b.w) ]
|
||||
*
|
||||
* The final accumulation is saturating.
|
||||
*/
|
||||
int arm_dot_acc_sat(char4 a, char4 b, int acc) OVERLOADABLE CONST
|
||||
{
|
||||
int4 tmp = vc4cl_mul24(a, b, VC4CL_SIGNED);
|
||||
return add_sat(acc, tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3);
|
||||
}
|
||||
|
||||
// Unsigned variant with saturating final accumulation
uint arm_dot_acc_sat(uchar4 a, uchar4 b, uint acc) OVERLOADABLE CONST
{
    uint4 products = vc4cl_mul24(a, b, VC4CL_UNSIGNED);
    uint sum = (products.s0 + products.s1) + (products.s2 + products.s3);
    return add_sat(acc, sum);
}
|
||||
|
||||
#endif /* VC4CL_EXTENSIONS_H */
|
||||
|
121
drivers/videocore4_stdlib/include/_float_float.h
Normal file
121
drivers/videocore4_stdlib/include/_float_float.h
Normal file
@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
/*
|
||||
* Implements a float-float floating point type providing improved accuracy over float32.
|
||||
*
|
||||
* Algorithms and ideas taken from:
|
||||
* - Guillaume da Gracca, David Defour. Implementation of float-float operators on graphics hardware. Real Numbers and
|
||||
* Computers 7, Jul 2006, Nancy, France. pp.23-32. hal-00021443
|
||||
* https://hal.archives-ouvertes.fr/hal-00021443 (https://hal.archives-ouvertes.fr/hal-00021443/document)
|
||||
* - https://andrewthall.org/papers/df64_qf128.pdf
|
||||
*/
|
||||
#ifndef VC4CL_FLOAT_FLOAT_H
|
||||
#define VC4CL_FLOAT_FLOAT_H
|
||||
|
||||
#include "_intrinsics.h"
|
||||
|
||||
/**
|
||||
* Type for extended precision floating point values.
|
||||
*
|
||||
* By combining two 32-bit floats, greatly increases accuracy. Value range is not increased!
|
||||
*
|
||||
* The "real" value calculates as UPPER + LOWER part.
|
||||
*
|
||||
* Using a native 64-bit type implicitly provides vector versions (and proper handling by compiler)
|
||||
*/
|
||||
// scalar and vector float-float types, packed into (vectors of) 64-bit integers
// so the compiler handles them as first-class values
typedef ulong FloatFloat;
typedef ulong2 FloatFloat2;
typedef ulong3 FloatFloat3;
typedef ulong4 FloatFloat4;
typedef ulong8 FloatFloat8;
typedef ulong16 FloatFloat16;
|
||||
|
||||
// extracts the upper part (stored in the low 32 bits of the packed value)
SIMPLE_1(float, vc4cl_upper, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val)))
// extracts the lower (error) part (stored in the high 32 bits of the packed value)
SIMPLE_1(float, vc4cl_lower, FloatFloat, val, vc4cl_bitcast_float(vc4cl_long_to_int(val >> 32)))
// collapses the float-float back to a single float, dropping the extra precision
SIMPLE_1(float, vc4cl_lossy, FloatFloat, val, vc4cl_upper(val) + vc4cl_lower(val))

// packs the two parts into one 64-bit value (upper -> low 32 bits, lower -> high 32 bits)
COMPLEX_2(FloatFloat, vc4cl_combine, float, upper, float, lower, {
    result_t upper_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(upper));
    result_t lower_extended = vc4cl_int_to_ulong(vc4cl_bitcast_uint(lower));
    return upper_extended | (lower_extended << 32);
})

// faster version of vc4cl_combine(val, 0)
SIMPLE_1(FloatFloat, vc4cl_extend, float, val, vc4cl_int_to_ulong(vc4cl_bitcast_uint(val)))
|
||||
|
||||
// Splits a single float into high/low mantissa parts (Veltkamp-style splitting,
// see the papers referenced in the file header).
// TODO avoid using this, since it runs against Inf, due to calculating val * 2^15
COMPLEX_1(FloatFloat, vc4cl_split, float, val, {
    // 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
    const float split = (float) (1u << 15); // TODO can be modified for precision
    arg_t c = (split + 1) * val;
    // high part: val with the lower mantissa bits rounded away
    arg_t high = c - (c - val);
    // low part: the exact remainder of the high part
    arg_t low = val - high;
    return vc4cl_combine(high, low);
})
|
||||
|
||||
// COMPLEX_1(FloatFloat, vc4cl_split, double, val, {
|
||||
// // 2^s where p/2 <= s <= p - 1 with (p = bits in mantissa = 23)
|
||||
// const double split = (double) (1u << 29); // TODO can be modified for precision
|
||||
// arg_t c = (split + 1) * val;
|
||||
// arg_t high = c - (c - val);
|
||||
// arg_t low = val - high;
|
||||
// return vc4cl_combine(high, low);
|
||||
// })
|
||||
|
||||
// Adds two plain floats into a float-float, capturing the rounding error of the
// sum in the lower part (statement order is significant, do not "simplify")
COMPLEX_2(FloatFloat, vc4cl_add, float, a, float, b, {
    float_t s = a + b;
    float_t v = s - a;
    // e is the exact rounding error of s = a + b
    float_t e = (a - (s - v)) + (b - v);
    return vc4cl_combine(s, e);
})

// Adds two float-float values (algorithm from the papers referenced in the file header)
COMPLEX_2(FloatFloat, vc4cl_add, FloatFloat, a, FloatFloat, b, {
    float_t r = vc4cl_upper(a) + vc4cl_upper(b);
    // both orderings of the error term are computed, the correct one is selected
    // below depending on which operand has the larger magnitude
    float_t s0 = (((vc4cl_upper(a) - r) + vc4cl_upper(b)) + vc4cl_lower(b)) + vc4cl_lower(a);
    float_t s1 = (((vc4cl_upper(b) - r) + vc4cl_upper(a)) + vc4cl_lower(a)) + vc4cl_lower(b);
    float_t s = fabs(vc4cl_upper(a)) >= fabs(vc4cl_upper(b)) ? s0 : s1;
    return vc4cl_add(r, s);
})

// a - b, implemented as a + (-b) by negating both components of b
SIMPLE_2(FloatFloat, vc4cl_sub, FloatFloat, a, FloatFloat, b, vc4cl_add(a, vc4cl_combine(-vc4cl_upper(b), -vc4cl_lower(b))))
|
||||
|
||||
// Multiplies two plain floats into a float-float, recovering the rounding error
// of the product from the split high/low parts (statement order is significant)
COMPLEX_2(FloatFloat, vc4cl_mul, float, a, float, b, {
    float_t x = a * b;
    result_t a_split = vc4cl_split(a);
    result_t b_split = vc4cl_split(b);
    // successively subtract the partial products to isolate the rounding error of x
    float_t error1 = x - (vc4cl_upper(a_split) * vc4cl_upper(b_split));
    float_t error2 = error1 - (vc4cl_lower(a_split) * vc4cl_upper(b_split));
    float_t error3 = error2 - (vc4cl_upper(a_split) * vc4cl_lower(b_split));
    float_t y = vc4cl_lower(a_split) * vc4cl_lower(b_split) - error3;
    return vc4cl_combine(x, y);
})

// Multiplies two float-float values
COMPLEX_2(FloatFloat, vc4cl_mul, FloatFloat, a, FloatFloat, b, {
    result_t t = vc4cl_mul(vc4cl_upper(a), vc4cl_upper(b));
    // the cross terms are only needed in single precision
    float_t t1 = vc4cl_upper(a) * vc4cl_lower(b) + vc4cl_lower(a) * vc4cl_upper(b) + vc4cl_lower(t);
    return vc4cl_add(vc4cl_upper(t), t1);
})
|
||||
|
||||
// Divides two float-float values: start from the single-precision quotient
// estimate and apply one refinement step with the residual
COMPLEX_2(FloatFloat, vc4cl_div, FloatFloat, a, FloatFloat, b, {
    // single-precision reciprocal and quotient estimates
    float_t xn = 1.0f / vc4cl_upper(b);
    float_t yn = vc4cl_upper(a) * xn;
    result_t y = vc4cl_extend(yn);
    // residual a - b * yn determines the correction term
    float_t diff = vc4cl_upper(vc4cl_sub(a, vc4cl_mul(b, y)));
    result_t prod = vc4cl_mul(xn, diff);
    return vc4cl_add(y, prod);
})
|
||||
|
||||
// Square root of a float-float: start from the single-precision estimate
// yn = a * rsqrt(a) and apply one Newton-Raphson step,
// y' = y + (a - y^2) * (1 / (2 * sqrt(a))) ~= y + diff * xn / 2
COMPLEX_1(FloatFloat, vc4cl_sqrt, FloatFloat, a, {
    float_t xn = rsqrt(vc4cl_upper(a));
    float_t yn = vc4cl_upper(a) * xn;
    result_t y = vc4cl_extend(yn);
    result_t ynsqr = vc4cl_mul(y, y); // yn^2
    float_t diff = vc4cl_upper(vc4cl_sub(a, ynsqr));
    // halve the float factor BEFORE the multiplication: the previous code divided
    // the packed 64-bit FloatFloat result by 2, which integer-shifts the packed
    // bit representation instead of halving the represented value
    result_t prod = vc4cl_mul(xn, diff * 0.5f);
    return vc4cl_add(y, prod);
})
|
||||
|
||||
#endif /* VC4CL_FLOAT_FLOAT_H */
|
93
drivers/videocore4_stdlib/include/_geometric.h
Normal file
93
drivers/videocore4_stdlib/include/_geometric.h
Normal file
@ -0,0 +1,93 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_GEOMETRY_H
|
||||
#define VC4CL_GEOMETRY_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_overloads.h"
|
||||
|
||||
/* a0 b0 a2 * b3 - a3 * b2
|
||||
* a x b = a1 x b1 = a3 * b1 - a1 * b3
|
||||
* a2 b2 a1 * b2 - a2 * b1
|
||||
*/
|
||||
// 3-component cross product, computed component-wise (see diagram above)
INLINE float3 cross(float3 p0, float3 p1) OVERLOADABLE CONST
{
    const float x = p0.y * p1.z - p0.z * p1.y;
    const float y = p0.z * p1.x - p0.x * p1.z;
    const float z = p0.x * p1.y - p0.y * p1.x;
    return (float3) (x, y, z);
}
|
||||
|
||||
// 4-component cross product: the xyz components as for float3, w is set to 0
INLINE float4 cross(float4 p0, float4 p1) OVERLOADABLE CONST
{
    const float x = p0.y * p1.z - p0.z * p1.y;
    const float y = p0.z * p1.x - p0.x * p1.z;
    const float z = p0.x * p1.y - p0.y * p1.x;
    return (float4) (x, y, z, 0.0f);
}
|
||||
|
||||
/* a0 b0
|
||||
* a * b = a1 * b1 = a1 * b1 + a2 * b2 + a3 * b3
|
||||
* a2 b2
|
||||
*/
|
||||
INLINE float dot(float p0, float p1) OVERLOADABLE CONST
|
||||
{
|
||||
return p0 * p1;
|
||||
}
|
||||
|
||||
// 2-component dot product: element-wise multiply, then horizontal add
INLINE float dot(float2 p0, float2 p1) OVERLOADABLE CONST
{
    const float2 products = p0 * p1;
    return products.x + products.y;
}
|
||||
|
||||
// 3-component dot product: element-wise multiply, then horizontal add
// (addition order kept left-to-right for bit-identical float results)
INLINE float dot(float3 p0, float3 p1) OVERLOADABLE CONST
{
    const float3 products = p0 * p1;
    return products.x + products.y + products.z;
}
|
||||
|
||||
// 4-component dot product: element-wise multiply, then horizontal add
// (addition order kept left-to-right for bit-identical float results)
INLINE float dot(float4 p0, float4 p1) OVERLOADABLE CONST
{
    const float4 products = p0 * p1;
    return products.x + products.y + products.z + products.w;
}
|
||||
|
||||
// prototypes only for the wide vector versions, implementations are not in this header
float dot(float8 p0, float8 p1) OVERLOADABLE CONST;
float dot(float16 p0, float16 p1) OVERLOADABLE CONST;
|
||||
|
||||
// Euclidean length sqrt(dot(p, p)), with pre-/post-scaling to avoid overflow or
// underflow of the intermediate squared sum
COMPLEX_1_RETURN_SCALAR(float, length, float, p, {
    float tmp = dot(p, p);

    // To mitigate overflow errors for edge-cases, reduce large/increase small numbers, this is taken from LLVM libclc
    // E.g. since dot(x, x) calculates element-wise x^2, every exponent >= 64 goes to Infinity and every exponent <= -64 to zero!
    float inputFactor = 1.0f;
    float outputFactor = 1.0f;
    // overflow case: scale the input down by 2^-65, undo with 2^+65 after the sqrt
    outputFactor = tmp == INFINITY ? 0x1.0p+65f : outputFactor;
    inputFactor = tmp == INFINITY ? 0x1.0p-65f : inputFactor;
    // underflow case: scale the input up by 2^+86, undo with 2^-86 after the sqrt
    outputFactor = vc4cl_is_zero(tmp) ? 0x1.0p-86f : outputFactor;
    inputFactor = vc4cl_is_zero(tmp) ? 0x1.0p+86f : inputFactor;

    return sqrt(dot(p * inputFactor, p * inputFactor)) * outputFactor;
})
|
||||
|
||||
// "Returns the distance between p0 and p1.
//  This is calculated as length(p0 - p1)."
SIMPLE_2_RETURN_SCALAR(float, distance, float, p0, float, p1, length(p0 - p1))
|
||||
|
||||
/**
 * Expected behavior:
 *
 * normalize(v) = v for all elements in v = 0
 * normalize(v) = vector of NaNs for all elements in v = NaN
 * TODO special case for Inf elements
 *
 * NOTE(review): for v = 0 this computes 0 / length(0) = 0 / 0 = NaN, which
 * contradicts the first expectation above - confirm intended behavior
 */
SIMPLE_1(float, normalize, float, p, p / length(p))
|
||||
|
||||
|
||||
// length approximation using the faster, less accurate half_sqrt (no overflow scaling)
SIMPLE_1_RETURN_SCALAR(float, fast_length, float, p, half_sqrt(dot(p, p)))

// distance approximation, calculated as fast_length(p0 - p1)
SIMPLE_2_RETURN_SCALAR(float, fast_distance, float, p0, float, p1, fast_length(p0 - p1))

// normalization approximation using the faster, less accurate half_rsqrt
SIMPLE_1(float, fast_normalize, float, p, p * half_rsqrt(dot(p, p)))
|
||||
|
||||
#endif /* VC4CL_GEOMETRY_H */
|
||||
|
1016
drivers/videocore4_stdlib/include/_images.h
Normal file
1016
drivers/videocore4_stdlib/include/_images.h
Normal file
File diff suppressed because it is too large
Load Diff
233
drivers/videocore4_stdlib/include/_integer.h
Normal file
233
drivers/videocore4_stdlib/include/_integer.h
Normal file
@ -0,0 +1,233 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_INTEGER_H
|
||||
#define VC4CL_INTEGER_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_intrinsics.h"
|
||||
|
||||
// Expands a 2-argument function for all 8/16/32-bit signed and unsigned integer types
#define SIMPLE_INTEGER_2(func, argName0, argName1, content) \
    SIMPLE_2(uchar, func, uchar, argName0, uchar, argName1, content) \
    SIMPLE_2(char, func, char, argName0, char, argName1, content) \
    SIMPLE_2(ushort, func, ushort, argName0, ushort, argName1, content) \
    SIMPLE_2(short, func, short, argName0, short, argName1, content) \
    SIMPLE_2(uint, func, uint, argName0, uint, argName1, content) \
    SIMPLE_2(int, func, int, argName0, int, argName1, content) \

// Expands a 3-argument function for all 8/16/32-bit signed and unsigned integer types
#define SIMPLE_INTEGER_3(func, argName0, argName1, argName2, content) \
    SIMPLE_3(uchar, func, uchar, argName0, uchar, argName1, uchar, argName2, content) \
    SIMPLE_3(char, func, char, argName0, char, argName1, char, argName2, content) \
    SIMPLE_3(ushort, func, ushort, argName0, ushort, argName1, ushort, argName2, content) \
    SIMPLE_3(short, func, short, argName0, short, argName1, short, argName2, content) \
    SIMPLE_3(uint, func, uint, argName0, uint, argName1, uint, argName2, content) \
    SIMPLE_3(int, func, int, argName0, int, argName1, int, argName2, content) \

|
||||
|
||||
|
||||
// absolute value: signed types select max(val, -val) (widened for 8/16-bit types),
// unsigned types are returned unchanged
SIMPLE_1(uchar, abs, char, val, vc4cl_bitcast_uchar(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(uchar, abs, uchar, val, val)
SIMPLE_1(ushort, abs, short, val, vc4cl_bitcast_ushort(max(vc4cl_extend(val), -vc4cl_extend(val))))
SIMPLE_1(ushort, abs, ushort, val, val)
SIMPLE_1(uint, abs, int, val, vc4cl_bitcast_uint(max(val, -val)))
SIMPLE_1(uint, abs, uint, val, val)
SIMPLE_1(ulong, abs, long, val, vc4cl_bitcast_ulong(max(val, -val)))
SIMPLE_1(ulong, abs, ulong, val, val)
|
||||
|
||||
//based on pocl (pocl/lib/kernel/abs_diff.cl)
// |x - y|: unsigned types subtract the smaller from the larger value; signed types
// only subtract directly when both operands have the same sign (no overflow possible),
// otherwise the result is |x| + |y|
SIMPLE_2(uchar, abs_diff, uchar, x, uchar, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(uchar, abs_diff, char, x, char, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = (result_t)abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ushort, abs_diff, ushort, x, ushort, y, (result_t)abs(x > y ? x - y : y - x))
COMPLEX_2(ushort, abs_diff, short, x, short, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = (result_t)abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(uint, abs_diff, uint, x, uint, y, abs(x > y ? x - y : y - x))
COMPLEX_2(uint, abs_diff, int, x, int, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
SIMPLE_2(ulong, abs_diff, ulong, x, ulong, y, abs(x > y ? x - y : y - x))
COMPLEX_2(ulong, abs_diff, long, x, long, y, {
    // explicitly calculate both variants to prevent clang from converting the ?:-operator to an if-else block
    result_t noflow = abs(x - y);
    result_t flow = abs(x) + abs(y);
    return (vc4cl_msb_set(x) == vc4cl_msb_set(y)) ? /* same sign -> no under/overflow */ noflow : /* different signs */ flow;
})
|
||||
|
||||
// saturating addition: uchar uses the hardware v8adds instruction, the other
// 8/16-bit types compute in 32 bits and clamp to the type's range
SIMPLE_2(uchar, add_sat, uchar, x, uchar, y, vc4cl_v8adds(x, y))
SIMPLE_2(char, add_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) + vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, add_sat, ushort, x, ushort, y, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) + vc4cl_extend(y), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_2(short, add_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) + vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
//based on pocl (pocl/lib/kernel/add_sat.cl)
SIMPLE_2(uint, add_sat, uint, x, uint, y, x > ((result_t)UINT_MAX) - y ? UINT_MAX : x + y)
SIMPLE_2(int, add_sat, int, x, int, y, vc4cl_saturated_add(x, y))
|
||||
|
||||
//"Returns (x + y) >> 1. The intermediate sum does not modulo overflow."
// 8/16-bit types: add in 32 bits, then shift (arithmetic shift for signed types) and truncate
SIMPLE_2(uchar, hadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(char, hadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
SIMPLE_2(ushort, hadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y)) >> 1))
SIMPLE_2(short, hadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y), 1)))
//based on pocl (pocl/lib/kernel/hadd.cl)
// 32/64-bit types: halve both operands first, then add back the carry lost by the shifts
SIMPLE_2(uint, hadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(int, hadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(ulong, hadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
SIMPLE_2(long, hadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + (x & y & (arg0_t)1))
|
||||
|
||||
//"Returns (x + y + 1) >> 1. The intermediate sum does not modulo overflow."
// 8/16-bit types: add (plus rounding bit) in 32 bits, then shift and truncate
SIMPLE_2(uchar, rhadd, uchar, x, uchar, y, vc4cl_pack_lsb((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(char, rhadd, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
SIMPLE_2(ushort, rhadd, ushort, x, ushort, y, vc4cl_bitcast_ushort((vc4cl_extend(x) + vc4cl_extend(y) + (uint)1) >> 1))
SIMPLE_2(short, rhadd, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_extend(x) + vc4cl_extend(y) + (int)1, 1)))
//based on pocl (pocl/lib/kernel/rhadd.cl)
// 32/64-bit types: halve both operands, then add the rounding bit ((x | y) & 1)
SIMPLE_2(uint, rhadd, uint, x, uint, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(int, rhadd, int, x, int, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(ulong, rhadd, ulong, x, ulong, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
SIMPLE_2(long, rhadd, long, x, long, y, (x >> (arg0_t)1) + (y >> (arg0_t)1) + ((x | y) & (arg0_t)1))
|
||||
|
||||
// clamp(val, minval, maxval) = min(max(val, minval), maxval) for all integer types,
// plus the mixed vector/scalar-limit overloads
SIMPLE_INTEGER_3(clamp, val, minval, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uchar, clamp, uchar, val, uchar, minval, uchar, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(char, clamp, char, val, char, minval, char, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ushort, clamp, ushort, val, ushort, minval, ushort, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(short, clamp, short, val, short, minval, short, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(uint, clamp, uint, val, uint, minval, uint, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(int, clamp, int, val, int, minval, int, maxval, min(max(val, minval), maxval))
SIMPLE_3(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(ulong, clamp, ulong, val, ulong, minval, ulong, maxval, min(max(val, minval), maxval))
SIMPLE_3(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
SIMPLE_3_TWO_SCALAR(long, clamp, long, val, long, minval, long, maxval, min(max(val, minval), maxval))
|
||||
|
||||
// count leading zeroes: the hardware clz works on 32 bits, so narrow types are
// shifted into the top bits and the remaining low bits are forced to ones so they
// are never counted (e.g. clz(0) correctly yields the type's bit width)
SIMPLE_1(uchar, clz, uchar, x, vc4cl_bitcast_uchar(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(char, clz, char, x, vc4cl_bitcast_char(vc4cl_clz((vc4cl_and(x, (arg_t)0xFF) << 24) | 0xFFFFFF)))
SIMPLE_1(ushort, clz, ushort, x, vc4cl_bitcast_ushort(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(short, clz, short, x, vc4cl_bitcast_short(vc4cl_clz((vc4cl_and(x, (arg_t)0xFFFF) << 16) | 0xFFFF)))
SIMPLE_1(uint, clz, uint, x, vc4cl_bitcast_uint(vc4cl_clz(x)))
SIMPLE_1(int, clz, int, x, vc4cl_bitcast_int(vc4cl_clz(x)))
|
||||
|
||||
// mad_hi(x, y, z) = mul_hi(x, y) + z for all integer types
SIMPLE_INTEGER_3(mad_hi, x, y, z, mul_hi(x, y) + z)

// saturating multiply-add: 8/16-bit types compute exactly in 32 bits and clamp;
// 32-bit types compute exactly in 64 bits and saturate on the narrowing conversion
SIMPLE_3(uchar, mad_sat, uchar, x, uchar, y, uchar, z, vc4cl_bitcast_uchar(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) UCHAR_MAX)))
SIMPLE_3(char, mad_sat, char, x, char, y, char, z, vc4cl_bitcast_char(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) CHAR_MIN, (int) CHAR_MAX)))
SIMPLE_3(ushort, mad_sat, ushort, x, ushort, y, ushort, z, vc4cl_bitcast_ushort(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (uint) 0, (uint) USHRT_MAX)))
SIMPLE_3(short, mad_sat, short, x, short, y, short, z, vc4cl_bitcast_short(clamp(vc4cl_extend(x) * vc4cl_extend(y) + vc4cl_extend(z), (int) SHRT_MIN, (int) SHRT_MAX)))
SIMPLE_3(uint, mad_sat, uint, x, uint, y, uint, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_UNSIGNED) + vc4cl_int_to_ulong(z), VC4CL_UNSIGNED))
SIMPLE_3(int, mad_sat, int, x, int, y, int, z, vc4cl_long_to_int_sat(vc4cl_mul_full(x, y, VC4CL_SIGNED) + vc4cl_int_to_long(z), VC4CL_SIGNED))
|
||||
|
||||
// max: uchar maps to the hardware v8max instruction, 16-bit types are widened to
// 32 bits, 32/64-bit types use the vc4cl_max intrinsic or explicit comparison
SIMPLE_2(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2_SCALAR(uchar, max, uchar, x, uchar, y, vc4cl_v8max(x, y))
SIMPLE_2(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, max, char, x, char, y, vc4cl_bitcast_char(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, max, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_max(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, max, short, x, short, y, vc4cl_bitcast_short(vc4cl_max(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2_SCALAR(uint, max, uint, x, uint, y, x > y ? x : y)
SIMPLE_2(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, max, int, x, int, y, vc4cl_max(x, y, VC4CL_SIGNED))
// 64-bit unsigned max: compare the high words first, fall back to the low words on a tie
COMPLEX_2(ulong, max, ulong, x, ulong, y,
{
    uint_t upX = vc4cl_long_to_int(x >> 32);
    uint_t upY = vc4cl_long_to_int(y >> 32);
    uint_t lowX = vc4cl_long_to_int(x);
    uint_t lowY = vc4cl_long_to_int(y);

    /* can't directly use this condition in return value, since for ?: operator, the condition and return value needs to have the same type */
    int_t selection = upX > upY ? 0 : (upX < upY ? 1 : (lowX > lowY ? 0 : 1));
    return vc4cl_int_to_long(selection) == 0 ? x : y;
})
SIMPLE_2_SCALAR(ulong, max, ulong, x, ulong, y, max(x, (arg0_t) y))
SIMPLE_2(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, max, long, x, long, y, vc4cl_max(x, y, VC4CL_SIGNED))
|
||||
|
||||
// min: mirror of the max implementations above (v8min for uchar, widening for
// 16-bit types, vc4cl_min intrinsic or explicit comparison for 32/64-bit types)
SIMPLE_2(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2_SCALAR(uchar, min, uchar, x, uchar, y, vc4cl_v8min(x, y))
SIMPLE_2(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(char, min, char, x, char, y, vc4cl_bitcast_char(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2_SCALAR(ushort, min, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_min(vc4cl_bitcast_int(vc4cl_zero_extend(x)), vc4cl_bitcast_int(vc4cl_zero_extend(y)), VC4CL_UNSIGNED)))
SIMPLE_2(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2_SCALAR(short, min, short, x, short, y, vc4cl_bitcast_short(vc4cl_min(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED)))
SIMPLE_2(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2_SCALAR(uint, min, uint, x, uint, y, x < y ? x : y)
SIMPLE_2(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(int, min, int, x, int, y, vc4cl_min(x, y, VC4CL_SIGNED))
// 64-bit unsigned min: compare the high words first, fall back to the low words on a tie
COMPLEX_2(ulong, min, ulong, x, ulong, y,
{
    uint_t upX = vc4cl_long_to_int(x >> 32);
    uint_t upY = vc4cl_long_to_int(y >> 32);
    uint_t lowX = vc4cl_long_to_int(x);
    uint_t lowY = vc4cl_long_to_int(y);

    /* can't directly use this condition in return value, since for ?: operator, the condition and return value needs to have the same type */
    int_t selection = upX < upY ? 0 : (upX > upY ? 1 : (lowX < lowY ? 0 : 1));
    return vc4cl_int_to_long(selection) == 0 ? x : y;
})
SIMPLE_2_SCALAR(ulong, min, ulong, x, ulong, y, min(x, (arg0_t) y))
SIMPLE_2(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
SIMPLE_2_SCALAR(long, min, long, x, long, y, vc4cl_min(x, y, VC4CL_SIGNED))
|
||||
|
||||
// high half of the product: 8/16-bit products fit into the 24/32-bit multiplication,
// so the high half is obtained by shifting by the type's bit width
SIMPLE_2(uchar, mul_hi, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 8))
SIMPLE_2(char, mul_hi, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_mul24(vc4cl_sign_extend(x), vc4cl_sign_extend(y), VC4CL_SIGNED), 8)))
SIMPLE_2(ushort, mul_hi, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED) >> 16))
SIMPLE_2(short, mul_hi, short, x, short, y, vc4cl_bitcast_short(vc4cl_asr(vc4cl_sign_extend(x) * vc4cl_sign_extend(y), 16)))
SIMPLE_2(uint, mul_hi, uint, x, uint, y, vc4cl_mul_hi(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul_hi, int, x, int, y, vc4cl_mul_hi(x, y, VC4CL_SIGNED))
|
||||
|
||||
//Since the rotation is over all 32-bits, for smaller types we need to replicate the value, rotate it and truncate/sign extend the result afterwards
// (the hardware rotates right, so a left-rotate by y is a right-rotate by -y)
SIMPLE_2(uchar, rotate, uchar, x, uchar, y, vc4cl_pack_lsb(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(char, rotate, char, x, char, y, vc4cl_bitcast_char(vc4cl_asr(vc4cl_ror(vc4cl_replicate_lsb(x), -vc4cl_extend(y)), 24)))
SIMPLE_2(ushort, rotate, ushort, x, ushort, y, vc4cl_pack_truncate(vc4cl_ror(vc4cl_zero_extend(x) | (vc4cl_zero_extend(x) << 16), -vc4cl_bitcast_int(vc4cl_zero_extend(y)))))
SIMPLE_2(short, rotate, short, x, short, y, vc4cl_bitcast_short(vc4cl_extend(vc4cl_bitcast_short(vc4cl_ror((vc4cl_sign_extend(x) & (int) 0xFFFF) | (vc4cl_sign_extend(x) << 16), -vc4cl_sign_extend(y))))))
SIMPLE_2(uint, rotate, uint, x, uint, y, vc4cl_bitcast_uint(vc4cl_ror(x, -vc4cl_bitcast_int(y))))
SIMPLE_2(int, rotate, int, x, int, y, vc4cl_bitcast_int(vc4cl_ror(x, -y)))
|
||||
|
||||
// saturating subtraction: uchar uses the hardware v8subs instruction, signed 8/16-bit
// types compute in 32 bits and clamp, unsigned types clamp to zero on underflow
SIMPLE_2(uchar, sub_sat, uchar, x, uchar, y, vc4cl_v8subs(x, y))
SIMPLE_2(char, sub_sat, char, x, char, y, vc4cl_bitcast_char(clamp(vc4cl_extend(x) - vc4cl_extend(y), SCHAR_MIN, SCHAR_MAX)))
SIMPLE_2(ushort, sub_sat, ushort, x, ushort, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(short, sub_sat, short, x, short, y, vc4cl_bitcast_short(clamp(vc4cl_extend(x) - vc4cl_extend(y), SHRT_MIN, SHRT_MAX)))
//based on pocl (pocl/lib/kernel/sub_sat.cl)
SIMPLE_2(uint, sub_sat, uint, x, uint, y, x < y ? (result_t)0 : x - y)
SIMPLE_2(int, sub_sat, int, x, int, y, vc4cl_saturated_sub(x, y))
|
||||
|
||||
// upsample(hi, lo): builds a value of twice the width, hi in the upper half,
// lo (always treated as unsigned) in the lower half
SIMPLE_2(short, upsample, char, hi, uchar, lo, vc4cl_bitcast_short((vc4cl_sign_extend(hi) << 8) | vc4cl_bitcast_int(vc4cl_zero_extend(lo))))
SIMPLE_2(ushort, upsample, uchar, hi, uchar, lo, vc4cl_bitcast_ushort((vc4cl_zero_extend(hi) << 8) | vc4cl_zero_extend(lo)))
SIMPLE_2(int, upsample, short, hi, ushort, lo, (vc4cl_sign_extend(hi) << 16) | vc4cl_bitcast_int(vc4cl_zero_extend(lo)))
SIMPLE_2(uint, upsample, ushort, hi, ushort, lo, (vc4cl_zero_extend(hi) << 16) | vc4cl_zero_extend(lo))
SIMPLE_2(long, upsample, int, hi, uint, lo, (vc4cl_int_to_long(hi) << 32) | vc4cl_bitcast_long(vc4cl_int_to_ulong(lo)))
SIMPLE_2(ulong, upsample, uint, hi, uint, lo, (vc4cl_int_to_ulong(hi) << 32) | vc4cl_int_to_ulong(lo))
|
||||
|
||||
//" Returns the number of non-zero bits in x. "
// all types forward directly to the vc4cl_popcount intrinsic
SIMPLE_1(uchar, popcount, uchar, val, vc4cl_popcount(val))
SIMPLE_1(char, popcount, char, val, vc4cl_popcount(val))
SIMPLE_1(ushort, popcount, ushort, val, vc4cl_popcount(val))
SIMPLE_1(short, popcount, short, val, vc4cl_popcount(val))
SIMPLE_1(uint, popcount, uint, val, vc4cl_popcount(val))
SIMPLE_1(int, popcount, int, val, vc4cl_popcount(val))
SIMPLE_1(ulong, popcount, ulong, val, vc4cl_popcount(val))
SIMPLE_1(long, popcount, long, val, vc4cl_popcount(val))
|
||||
|
||||
// 24-bit multiplication, mapped directly to the vc4cl_mul24 intrinsic
SIMPLE_2(uchar, mul24, uchar, x, uchar, y, vc4cl_bitcast_uchar(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(char, mul24, char, x, char, y, vc4cl_bitcast_char(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(ushort, mul24, ushort, x, ushort, y, vc4cl_bitcast_ushort(vc4cl_mul24(x, y, VC4CL_UNSIGNED)))
SIMPLE_2(short, mul24, short, x, short, y, vc4cl_bitcast_short(vc4cl_mul24(x, y, VC4CL_SIGNED)))
SIMPLE_2(uint, mul24, uint, x, uint, y, vc4cl_mul24(x, y, VC4CL_UNSIGNED))
SIMPLE_2(int, mul24, int, x, int, y, vc4cl_mul24(x, y, VC4CL_SIGNED))
// mad24(a, b, c) = mul24(a, b) + c for all integer types
SIMPLE_INTEGER_3(mad24, a, b, c, mul24(a, b) + c)
|
||||
|
||||
#undef SIMPLE_INTEGER_2
|
||||
#undef SIMPLE_INTEGER_3
|
||||
|
||||
#endif /* VC4CL_INTEGER_H */
|
||||
|
436
drivers/videocore4_stdlib/include/_intrinsics.h
Normal file
436
drivers/videocore4_stdlib/include/_intrinsics.h
Normal file
@ -0,0 +1,436 @@
|
||||
/* Declares interfaces for all intrinsic functions
 *
 * Author: doe300
 *
 * See the file "LICENSE" for the full license governing this code.
 */

#ifndef VC4CL_INTRINSICS_H
#define VC4CL_INTRINSICS_H

#include "_overloads.h"

// Flag values passed as the 'sign' parameter of several intrinsics below,
// selecting the signed or unsigned flavor of the underlying operation.
#define VC4CL_SIGNED 0
#define VC4CL_UNSIGNED 1
/*
 * ALU operations
 *
 * NOTE: These operations directly map to the machine instructions and do not
 * heed other data-types (e.g. vc4cl_clz will always return the leading zeroes to
 * full 32-bit width)
 */
OVERLOAD_2(float, vc4cl_fmax, float, x, float, y)
OVERLOAD_2(float, vc4cl_fmin, float, x, float, y)
OVERLOAD_2(float, vc4cl_fmaxabs, float, x, float, y)
OVERLOAD_2(float, vc4cl_fminabs, float, x, float, y)
OVERLOAD_1(int, vc4cl_ftoi, float, val)
OVERLOAD_1(float, vc4cl_itof, int, val)

// arithmetic shift right / rotate right by 'offset' bit positions
OVERLOAD_2(int, vc4cl_asr, uint, val, int, offset)
OVERLOAD_2(int, vc4cl_asr, int, val, int, offset)
OVERLOAD_2(uint, vc4cl_ror, uint, val, int, offset)
OVERLOAD_2(int, vc4cl_ror, int, val, int, offset)
// min/max with explicit signedness flag (VC4CL_SIGNED / VC4CL_UNSIGNED)
OVERLOAD_3_SCALAR(int, vc4cl_min, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_max, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(long, vc4cl_min, long, x, long, y, uchar, sign)
OVERLOAD_3_SCALAR(long, vc4cl_max, long, x, long, y, uchar, sign)
// bitwise AND; narrow-type variants return the full 32-bit result (see NOTE above)
OVERLOAD_2(uint, vc4cl_and, uchar, x, uchar, y)
OVERLOAD_2(int, vc4cl_and, char, x, char, y)
OVERLOAD_2(uint, vc4cl_and, ushort, x, ushort, y)
OVERLOAD_2(int, vc4cl_and, short, x, short, y)
SIMPLE_2(uint, vc4cl_and, uint, x, uint, y, x & y)
SIMPLE_2(int, vc4cl_and, int, x, int, y, x & y)
// count leading zeroes (always relative to full 32-bit width)
OVERLOAD_1(uint, vc4cl_clz, uint, val)
OVERLOAD_1(int, vc4cl_clz, int, val)

// multiplication of the low 24 bits with explicit signedness flag
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uchar, x, uchar, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_mul24, char, x, char, y, uchar, sign)
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, ushort, x, ushort, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_mul24, short, x, short, y, uchar, sign)
OVERLOAD_3_SCALAR(uint, vc4cl_mul24, uint, x, uint, y, uchar, sign)
OVERLOAD_3_SCALAR(int, vc4cl_mul24, int, x, int, y, uchar, sign)

// per-byte (v8) saturating add/sub and min/max operations
OVERLOAD_2(uchar, vc4cl_v8adds, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8adds, uint, x, uint, y)
OVERLOAD_2(uchar, vc4cl_v8subs, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8subs, uint, x, uint, y)
OVERLOAD_2(uchar, vc4cl_v8min, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8min, uint, x, uint, y)
OVERLOAD_2(uchar, vc4cl_v8max, uchar, x, uchar, y)
OVERLOAD_2(uint, vc4cl_v8max, uint, x, uint, y)

/*
 * Pack/unpack modes
 */
//TODO ALU needs to consume float for this to work
//unpacks half to float (UNPACK 1: 16a -> 32)
//OVERLOAD_1(float, vc4cl_unpack_half, half, val)
//sign-extends short to int (UNPACK 1: 16a -> 32)
OVERLOAD_1(int, vc4cl_unpack_sext, short, val)
//unpacks first byte [0, 1] to float (UNPACK 4: 8a -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte0, uchar, val)
//unpacks second byte [0, 1] to float (UNPACK 5: 8b -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte1, uchar, val)
//unpacks third byte [0, 1] to float (UNPACK 6: 8c -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte2, uchar, val)
//unpacks fourth byte [0, 1] to float (UNPACK 7: 8d -> 32)
OVERLOAD_1(float, vc4cl_unpack_color_byte3, uchar, val)
//zero-extend first byte to uint (UNPACK 4: 8a -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte0, uchar, val)
//zero-extend second byte to uint (UNPACK 5: 8b -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte1, uchar, val)
//zero-extend third byte to uint (UNPACK 6: 8c -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte2, uchar, val)
//zero-extend fourth byte to uint (UNPACK 7: 8d -> 32)
OVERLOAD_1(uint, vc4cl_unpack_byte3, uchar, val)

//TODO ALU needs to consume float for this to work
//packs float into half (PACK 1: 32 -> 16a)
//OVERLOAD_1(half, vc4cl_pack_half, float, val)
//converts to unsigned 16-bit integer, truncates the result (PACK 1: 32 -> 16a)
OVERLOAD_1(ushort, vc4cl_pack_truncate, int, val)
OVERLOAD_1(ushort, vc4cl_pack_truncate, uint, val)
//replicates the LSB into all four bytes (PACK 3: 32 -> 8888)
OVERLOAD_1(uint, vc4cl_replicate_lsb, char, val)
OVERLOAD_1(uint, vc4cl_replicate_lsb, uchar, val)
OVERLOAD_1(uint, vc4cl_replicate_lsb, uint, val)
//takes the LSB and writes it into LSB (PACK 4: 32 -> 8a)
OVERLOAD_1(uchar, vc4cl_pack_lsb, char, val)
OVERLOAD_1(uchar, vc4cl_pack_lsb, uchar, val)
OVERLOAD_1(uchar, vc4cl_pack_lsb, uint, val)
//calculates addition, but saturates the result afterwards (depending on signed integer over-/underflow of addition) (uses PACK 8: 32 -> 32)
OVERLOAD_2(int, vc4cl_saturated_add, int, x, int, y)
//NOTE: Since the 32 -> 32 saturation pack mode works differently for sub, the intrinsic is implemented differently than saturated_add
OVERLOAD_2(int, vc4cl_saturated_sub, int, x, int, y)
//saturates to unsigned byte (PACK 12: 32 -> 8a)
OVERLOAD_1(uchar, vc4cl_saturate_lsb, uint, val)


/*
 * SFU calls
 * (Special Function Unit approximations: reciprocal, reciprocal square root,
 * base-2 logarithm and base-2 exponential)
 */
OVERLOAD_1(float, vc4cl_sfu_recip, float, val)
OVERLOAD_1(float, vc4cl_sfu_rsqrt, float, val)
OVERLOAD_1(float, vc4cl_sfu_log2, float, val)
OVERLOAD_1(float, vc4cl_sfu_exp2, float, val)
/*
 * Periphery access
 */
// global hardware mutex guarding VPM/DMA access
void vc4cl_mutex_lock(void);
void vc4cl_mutex_unlock(void);
//read DMA without locking the mutex
OVERLOAD_1(int, vc4cl_dma_read, volatile __global int, * ptr)
OVERLOAD_1(uint, vc4cl_dma_read, volatile __global uint, * ptr)
OVERLOAD_1(float, vc4cl_dma_read, volatile __global float, * ptr)
OVERLOAD_1(int, vc4cl_dma_read, volatile __local int, * ptr)
OVERLOAD_1(uint, vc4cl_dma_read, volatile __local uint, * ptr)
OVERLOAD_1(float, vc4cl_dma_read, volatile __local float, * ptr)
//write DMA without locking the mutex
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global int, * ptr, int, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global uint, * ptr, uint, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __global float, * ptr, float, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local int, * ptr, int, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local uint, * ptr, uint, val)
OVERLOAD_2_RETURN_SCALAR(void, vc4cl_dma_write, volatile __local float, * ptr, float, val)
//copy DMA without locking the mutex
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uchar, *dest, const __local uchar, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global char, *dest, const __local char, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global ushort, *dest, const __local ushort, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global short, *dest, const __local short, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global uint, *dest, const __local uint, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global int, *dest, const __local int, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __global float, *dest, const __local float, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uchar, *dest, const __global uchar, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local char, *dest, const __global char, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local ushort, *dest, const __global ushort, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local short, *dest, const __global short, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local uint, *dest, const __global uint, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local int, *dest, const __global int, *src, size_t, num_elements)
OVERLOAD_3_SCALAR(int, vc4cl_dma_copy, __local float, *dest, const __global float, *src, size_t, num_elements)
//load into VPM without locking the mutex
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uchar, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global char, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global ushort, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global short, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global uint, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global int, *ptr, size_t, num_elements)
OVERLOAD_2_SCALAR_RETURN_SCALAR(void, vc4cl_prefetch, const __global float, *ptr, size_t, num_elements)
// special handling of 3-element load/store, since LLVM (compliant with the OpenCL standard) by default generates 4-element load/store
char3 vc4cl_vload3(const __global char* ptr) OVERLOADABLE;
char3 vc4cl_vload3(const __local char* ptr) OVERLOADABLE;
char3 vc4cl_vload3(const __private char* ptr) OVERLOADABLE;
char3 vc4cl_vload3(const __constant char* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __global uchar* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __local uchar* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __private uchar* ptr) OVERLOADABLE;
uchar3 vc4cl_vload3(const __constant uchar* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __global short* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __local short* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __private short* ptr) OVERLOADABLE;
short3 vc4cl_vload3(const __constant short* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __global ushort* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __local ushort* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __private ushort* ptr) OVERLOADABLE;
ushort3 vc4cl_vload3(const __constant ushort* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __global int* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __local int* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __private int* ptr) OVERLOADABLE;
int3 vc4cl_vload3(const __constant int* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __global uint* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __local uint* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __private uint* ptr) OVERLOADABLE;
uint3 vc4cl_vload3(const __constant uint* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __global float* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __local float* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __private float* ptr) OVERLOADABLE;
float3 vc4cl_vload3(const __constant float* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __global long* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __local long* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __private long* ptr) OVERLOADABLE;
long3 vc4cl_vload3(const __constant long* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __global ulong* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __local ulong* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __private ulong* ptr) OVERLOADABLE;
ulong3 vc4cl_vload3(const __constant ulong* ptr) OVERLOADABLE;

void vc4cl_vstore3(__global char* ptr, char3 val) OVERLOADABLE;
void vc4cl_vstore3(__local char* ptr, char3 val) OVERLOADABLE;
void vc4cl_vstore3(__private char* ptr, char3 val) OVERLOADABLE;
void vc4cl_vstore3(__global uchar* ptr, uchar3 val) OVERLOADABLE;
void vc4cl_vstore3(__local uchar* ptr, uchar3 val) OVERLOADABLE;
void vc4cl_vstore3(__private uchar* ptr, uchar3 val) OVERLOADABLE;
void vc4cl_vstore3(__global short* ptr, short3 val) OVERLOADABLE;
void vc4cl_vstore3(__local short* ptr, short3 val) OVERLOADABLE;
void vc4cl_vstore3(__private short* ptr, short3 val) OVERLOADABLE;
void vc4cl_vstore3(__global ushort* ptr, ushort3 val) OVERLOADABLE;
void vc4cl_vstore3(__local ushort* ptr, ushort3 val) OVERLOADABLE;
void vc4cl_vstore3(__private ushort* ptr, ushort3 val) OVERLOADABLE;
void vc4cl_vstore3(__global int* ptr, int3 val) OVERLOADABLE;
void vc4cl_vstore3(__local int* ptr, int3 val) OVERLOADABLE;
void vc4cl_vstore3(__private int* ptr, int3 val) OVERLOADABLE;
void vc4cl_vstore3(__global uint* ptr, uint3 val) OVERLOADABLE;
void vc4cl_vstore3(__local uint* ptr, uint3 val) OVERLOADABLE;
void vc4cl_vstore3(__private uint* ptr, uint3 val) OVERLOADABLE;
void vc4cl_vstore3(__global float* ptr, float3 val) OVERLOADABLE;
void vc4cl_vstore3(__local float* ptr, float3 val) OVERLOADABLE;
void vc4cl_vstore3(__private float* ptr, float3 val) OVERLOADABLE;
void vc4cl_vstore3(__global long* ptr, long3 val) OVERLOADABLE;
void vc4cl_vstore3(__local long* ptr, long3 val) OVERLOADABLE;
void vc4cl_vstore3(__private long* ptr, long3 val) OVERLOADABLE;
void vc4cl_vstore3(__global ulong* ptr, ulong3 val) OVERLOADABLE;
void vc4cl_vstore3(__local ulong* ptr, ulong3 val) OVERLOADABLE;
void vc4cl_vstore3(__private ulong* ptr, ulong3 val) OVERLOADABLE;
/*
 * Work-item functions
 * Mapped to UNIFORM reads
 *
 * local values are stored in a UNIFORM in this fashion:
 * | 0 | dim2 | dim1 | dim0 |
 * -> to read value of dimension x, calculate: (UNIFORM >> (dim * 8)) & 0xFF
 *
 * This can be compacted in such way, since for a maximum value of 12, the local ID and size fits into 1 Byte
 */
PURE uchar vc4cl_work_dimensions(void);
PURE uchar vc4cl_local_size(uint dim);
PURE uchar vc4cl_local_id(uint dim);
PURE uint vc4cl_num_groups(uint dim);
PURE uint vc4cl_group_id(uint dim);
PURE uint vc4cl_global_offset(uint dim);
PURE uint vc4cl_global_size(uint dim);
PURE uint vc4cl_global_id(uint dim);
PURE uchar vc4cl_local_linear_id(void);
PURE uint vc4cl_global_linear_id(void);

/*
 * Image functions
 * In CLang, read_only and write_only image-types are separate types.
 * Also in CLang, OpenCL image-types are built-in opaque types
 */
#ifdef __IMAGE_SUPPORT__
/*
 * Texture Config Parameter 0
 * Broadcom specification, table 15
 *
 * 0 - 3 | 4 bits | Number of mipmap levels minus 1
 * 4 - 7 | 4 bits | texture data type (high bit is on config parameter 1)
 * 8 | 1 bit | flip texture Y axis
 * 9 | 1 bit | cube map mode
 * 10 - 11 | 2 bits | cache swizzle
 * 12 - 31 | 20 bits | texture base pointer (multiple of 4KB)
 */
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_basic_setup)
/*
 * Texture Config Parameter 1
 * Broadcom specification, table 16
 *
 * 0 - 1 | 2 bits | S (x-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border)
 * 2 - 3 | 2 bits | T (y-coord) wrap mode (0 = repeat, 1 = clamp, 2 = mirror, 3 = border)
 * 4 - 6 | 3 bits | minification filter (interpolation)
 * 7 | 1 bit | magnification filter
 * 8 - 18 | 11 bits | image width (0 = 2048)
 * 19 | 1 bit | flip ETC Y (per block)
 * 20 - 30 | 11 bits | image height (0 = 2048)
 * 31 | 1 bit | high bit of texture type
 */
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_access_setup)
/*
 * Texture Config Parameters 2 and 3
 * Broadcom specification, table 17
 *
 * Cube map stride:
 * 0 | 1 bit | disable automatic LOD, use bias only
 * 12 - 29 | 18 bits | cube map stride (in multiples of 4KB)
 * 30 - 31 | 2 bits | value 1 for cube map stride
 *
 * Child image dimensions:
 * 0 - 10 | 11 bits | child image width (0 = 2048, does not work, see errata HW-2753)
 * 12 - 22 | 11 bits | child image height (0 = 2048, does not work, see errata HW-2753)
 * 30 - 31 | 2 bits | value 2 for child image dimensions
 *
 * Child image offsets:
 * 0 - 10 | 11 bits | child image X offset
 * 12 - 22 | 11 bits | child image Y offset
 * 30 - 31 | 2 bits | value 3 for child image offsets
 */
OVERLOAD_ALL_IMAGE_TYPES(CONST uint, vc4cl_image_extended_setup)
/*
 * To apply a sampler to an image, we need to override the image-access setup UNIFORM before a read with the magnification/minification filters and wrap modes to use
 */
OVERLOAD_ALL_IMAGE_TYPES_1(void, vc4cl_set_image_access_setup, uint, val)
// query individual sampler properties (normalized coordinates, addressing, filtering)
CONST uint vc4cl_sampler_get_normalized_coords(sampler_t sampler);
CONST uint vc4cl_sampler_get_addressing_mode(sampler_t sampler);
CONST uint vc4cl_sampler_get_filter_mode(sampler_t sampler);
/*
 * Image read functions
 *
 * The coordinates need to be floating-values in the range [0, 1] and are scaled to the width/height of the image.
 * The returned data is not necessarily <4 x int32>, but up to 4 components with up to 32 bits each, loaded according to the byte-sizes and number of components specified in the channel_type_size and channel_order_size.
 *
 * So, this functions return the data in the native format (as stored in the image-buffer), but correctly distributed across the 4 components.
 */
int4 vc4cl_image_read(read_only image1d_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image1d_buffer_t image, float coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image1d_array_t image, float coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image2d_t image, float2 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image2d_array_t image, float2 coords, int imageIndex, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
int4 vc4cl_image_read(read_only image3d_t image, float4 coords, uint channel_type_size, uint channel_order_size) OVERLOADABLE;
#endif
|
||||
/*
 * Type conversions
 */
// TODO use __builtin_convertvector ?? https://clang.llvm.org/docs/LanguageExtensions.html#builtin-convertvector
// check available on all compiler versions, generated LLVM IR code!
//component-wise bitcasts
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, uint, val)
OVERLOAD_1(uchar, vc4cl_bitcast_uchar, int, val)
OVERLOAD_1(char, vc4cl_bitcast_char, uint, val)
OVERLOAD_1(char, vc4cl_bitcast_char, int, val)
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, uint, val)
OVERLOAD_1(ushort, vc4cl_bitcast_ushort, int, val)
OVERLOAD_1(short, vc4cl_bitcast_short, uint, val)
OVERLOAD_1(short, vc4cl_bitcast_short, int, val)
SIMPLE_1(uint, vc4cl_bitcast_uint, uint, val, val)
OVERLOAD_1(uint, vc4cl_bitcast_uint, int, val)
OVERLOAD_1(int, vc4cl_bitcast_int, uint, val)
SIMPLE_1(int, vc4cl_bitcast_int, int, val, val)

OVERLOAD_1(uint, vc4cl_bitcast_uint, float, val)
OVERLOAD_1(float, vc4cl_bitcast_float, uint, val)
OVERLOAD_1(int, vc4cl_bitcast_int, float, val)
OVERLOAD_1(float, vc4cl_bitcast_float, int, val)

// sign-extension to 32 bits: char via shift-left + arithmetic-shift-right, short via the 16a unpack mode
SIMPLE_1(int, vc4cl_sign_extend, char, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFF) << 24, 24))
//SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_asr(vc4cl_and(val, (arg_t)0xFFFF) << 16, 16))
SIMPLE_1(int, vc4cl_sign_extend, short, val, vc4cl_unpack_sext(val))

// zero-extension to 32 bits by masking off the upper bits
SIMPLE_1(uint, vc4cl_zero_extend, uchar, val, vc4cl_and(val, (arg_t) (0xFFU)))
SIMPLE_1(uint, vc4cl_zero_extend, ushort, val, vc4cl_and(val, (arg_t) (0xFFFFU)))

// vc4cl_extend: extension to 32/64 bits honoring the signedness of the argument type
SIMPLE_1(uint, vc4cl_extend, uchar, val, vc4cl_zero_extend(val))
SIMPLE_1(int, vc4cl_extend, char, val, vc4cl_sign_extend(val))
SIMPLE_1(uint, vc4cl_extend, ushort, val, vc4cl_zero_extend(val))
SIMPLE_1(int, vc4cl_extend, short, val, vc4cl_sign_extend(val))
SIMPLE_1(uint, vc4cl_extend, uint, val, val)
SIMPLE_1(int, vc4cl_extend, int, val, val)
SIMPLE_1(ulong, vc4cl_extend, ulong, val, val)
SIMPLE_1(long, vc4cl_extend, long, val, val)

// 64-bit bitcasts and 32 <-> 64 bit conversions
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, long, val)
OVERLOAD_1(ulong, vc4cl_bitcast_ulong, ulong, val)
OVERLOAD_1(long, vc4cl_bitcast_long, ulong, val)
OVERLOAD_1(long, vc4cl_bitcast_long, long, val)
OVERLOAD_1(uint, vc4cl_long_to_int, ulong, val)
OVERLOAD_1(int, vc4cl_long_to_int, long, val)
OVERLOAD_1(ulong, vc4cl_int_to_ulong, uint, val)
OVERLOAD_1(long, vc4cl_int_to_long, int, val)
SIMPLE_1(ulong, vc4cl_extend_to_long, uint, val, vc4cl_int_to_ulong(val))
SIMPLE_1(long, vc4cl_extend_to_long, int, val, vc4cl_int_to_long(val))
// 64 -> 32 bit conversion with saturation; 'sign' selects signed/unsigned semantics
OVERLOAD_2_SCALAR(int, vc4cl_long_to_int_sat, long, val, uchar, sign)
OVERLOAD_2_SCALAR(uint, vc4cl_long_to_int_sat, ulong, val, uchar, sign)
OVERLOAD_1(float, vc4cl_long_to_float, long, val)
OVERLOAD_1(float, vc4cl_ulong_to_float, ulong, val)

/*
 * Other functions
 */
// vc4cl_msb_set: tests whether the most-significant bit of the argument is set
// NOTE(review): the ulong variant is declared with return type 'long' (not 'ulong') —
// presumably intentional to match the signed comparison result, but verify against the uses
SIMPLE_1(uchar, vc4cl_msb_set, uchar, val, vc4cl_bitcast_uchar(vc4cl_extend(val >> 7 == (arg_t)1)))
SIMPLE_1(char, vc4cl_msb_set, char, val, vc4cl_bitcast_char(vc4cl_and((arg_t)(val >> 7), (arg_t)1)) == (arg_t)1)
SIMPLE_1(ushort, vc4cl_msb_set, ushort, val, vc4cl_bitcast_ushort(vc4cl_extend(val >> 15 == (arg_t)1)))
SIMPLE_1(short, vc4cl_msb_set, short, val, vc4cl_bitcast_short(vc4cl_and((arg_t)(val >> 15), (arg_t)1)) == (arg_t)1)
SIMPLE_1(uint, vc4cl_msb_set, uint, val, vc4cl_bitcast_uint(val >> 31 == 1))
SIMPLE_1(int, vc4cl_msb_set, int, val, (val < (arg_t)0))
SIMPLE_1(long, vc4cl_msb_set, ulong, val, (val >> 63 == 1))
SIMPLE_1(long, vc4cl_msb_set, long, val, (val < (arg_t)0))

// floating-point classification helpers
OVERLOAD_1(int, vc4cl_is_nan, float, val)
OVERLOAD_1(int, vc4cl_is_inf_nan, float, val)
OVERLOAD_1(int, vc4cl_is_zero, float, val)

// full/high-half 32-bit multiplication with explicit signedness flag
OVERLOAD_3_SCALAR(int, vc4cl_mul_hi, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(uint, vc4cl_mul_hi, uint, x, uint, y, uchar, sign)
OVERLOAD_3_SCALAR(long, vc4cl_mul_full, int, x, int, y, uchar, sign)
OVERLOAD_3_SCALAR(ulong, vc4cl_mul_full, uint, x, uint, y, uchar, sign)

// population count (number of set bits), see popcount() in _integer.h
OVERLOAD_1(uchar, vc4cl_popcount, uchar, val)
OVERLOAD_1(char, vc4cl_popcount, char, val)
OVERLOAD_1(ushort, vc4cl_popcount, ushort, val)
OVERLOAD_1(short, vc4cl_popcount, short, val)
OVERLOAD_1(uint, vc4cl_popcount, uint, val)
OVERLOAD_1(int, vc4cl_popcount, int, val)
OVERLOAD_1(ulong, vc4cl_popcount, ulong, val)
OVERLOAD_1(long, vc4cl_popcount, long, val)

event_t vc4cl_set_event(event_t ev) CONST;

void vc4cl_barrier(cl_mem_fence_flags);

/*
 * Vector functions
 */
//Rotates the vector-elements according to the offset (-15 .. +15)
//an offset of 5 means rotate up 5 positions (e.g. x.s0 -> y.s5, x.s10 -> y.s15, x.s12 -> y.s1)
//NOTE: the rotation is always all 16 elements!! So functions with vector-size of less than 16 MUST not use the positions shifted in from the remaining vector-elements
OVERLOAD_2_SCALAR(uchar, vc4cl_vector_rotate, uchar, val, char, offset)
OVERLOAD_2_SCALAR(char, vc4cl_vector_rotate, char, val, char, offset)
OVERLOAD_2_SCALAR(ushort, vc4cl_vector_rotate, ushort, val, char, offset)
OVERLOAD_2_SCALAR(short, vc4cl_vector_rotate, short, val, char, offset)
OVERLOAD_2_SCALAR(uint, vc4cl_vector_rotate, uint, val, char, offset)
OVERLOAD_2_SCALAR(int, vc4cl_vector_rotate, int, val, char, offset)
OVERLOAD_2_SCALAR(ulong, vc4cl_vector_rotate, ulong, val, char, offset)
OVERLOAD_2_SCALAR(long, vc4cl_vector_rotate, long, val, char, offset)
OVERLOAD_2_SCALAR(float, vc4cl_vector_rotate, float, val, char, offset)

/*
 * For debugging purposes
 */
//The vector element number (0 .. 15)
CONST uchar16 vc4cl_element_number(void);
//the ID of the QPU (the processor)
CONST uchar vc4cl_qpu_number(void);

#endif /* VC4CL_INTRINSICS_H */
1666
drivers/videocore4_stdlib/include/_math.h
Normal file
1666
drivers/videocore4_stdlib/include/_math.h
Normal file
File diff suppressed because it is too large
Load Diff
819
drivers/videocore4_stdlib/include/_overloads.h
Normal file
819
drivers/videocore4_stdlib/include/_overloads.h
Normal file
@ -0,0 +1,819 @@
|
||||
/*
 * Author: doe300
 *
 * See the file "LICENSE" for the full license governing this code.
 */

#ifndef VC4CL_OVERLOADS_H
#define VC4CL_OVERLOADS_H

#include "_config.h"

// marks a function as overloadable by argument types (Clang extension)
#ifndef OVERLOADABLE
#define OVERLOADABLE __attribute__((overloadable))
#endif
/*
 * "__attribute__((const)) function attribute
 * Many functions examine only the arguments passed to them, and have no effects except for the return value.
 * This is a much stricter class than __attribute__((pure)), because a function is not permitted to read global memory.
 * If a function is known to operate only on its arguments then it can be subject to common sub-expression elimination and loop optimizations."
 *
 * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0491c/Cacgigch.html
 */
#ifndef CONST
#define CONST __attribute__((const)) //tells the compiler, that result won't change
#endif
/*
 * "__attribute__((pure)) function attribute
 * Many functions have no effects except to return a value, and their return value depends only on the parameters and global variables.
 * Functions of this kind can be subject to data flow analysis and might be eliminated."
 *
 * http://infocenter.arm.com/help/topic/com.arm.doc.dui0491c/Cacigdac.html
 */
#define PURE __attribute__((pure))
#define INLINE __attribute__((always_inline)) __attribute__((flatten)) inline //flatten inlines all call within this function
||||
// FUNC_n: expands to a single overloadable function prototype with n parameters
#define FUNC_1(ret, func, argType, argName) ret func(argType argName) OVERLOADABLE
// OVERLOAD_n: declares prototypes for all vector widths (16, 8, 4, 3, 2) and the scalar type.
// The *_RETURN_SCALAR variants keep the return type scalar for all argument widths,
// the *_SCALAR variants keep the last argument scalar for all widths.
#ifndef OVERLOAD_1
#define OVERLOAD_1(ret, func, argType, argName) \
FUNC_1(ret##16, func, argType##16, argName); \
FUNC_1(ret##8, func, argType##8, argName); \
FUNC_1(ret##4, func, argType##4, argName); \
FUNC_1(ret##3, func, argType##3, argName); \
FUNC_1(ret##2, func, argType##2, argName); \
FUNC_1(ret, func, argType, argName);
#endif

#ifndef OVERLOAD_1_RETURN_SCALAR
#define OVERLOAD_1_RETURN_SCALAR(ret, func, argType, argName) \
FUNC_1(ret, func, argType##16, argName); \
FUNC_1(ret, func, argType##8, argName); \
FUNC_1(ret, func, argType##4, argName); \
FUNC_1(ret, func, argType##3, argName); \
FUNC_1(ret, func, argType##2, argName); \
FUNC_1(ret, func, argType, argName);
#endif

#define FUNC_2(ret, func, argType0, argName0, argType1, argName1) ret func(argType0 argName0, argType1 argName1) OVERLOADABLE
#ifndef OVERLOAD_2
#define OVERLOAD_2(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1); \
FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1); \
FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1); \
FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1); \
FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif

#ifndef OVERLOAD_2_SCALAR
#define OVERLOAD_2_SCALAR(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1); \
FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1); \
FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1); \
FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1); \
FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif

#ifndef OVERLOAD_2_RETURN_SCALAR
#define OVERLOAD_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1); \
FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1); \
FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1); \
FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1); \
FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif

#ifndef OVERLOAD_2_SCALAR_RETURN_SCALAR
#define OVERLOAD_2_SCALAR_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1) \
FUNC_2(ret, func, argType0##16, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##8, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##4, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##3, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0##2, argName0, argType1, argName1); \
FUNC_2(ret, func, argType0, argName0, argType1, argName1);
#endif

#define FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) ret func(argType0 argName0, argType1 argName1, argType2 argName2) OVERLOADABLE
// NOTE(review): unlike OVERLOAD_1/OVERLOAD_2, the scalar-width prototype in OVERLOAD_3 and
// OVERLOAD_3_SCALAR is prefixed with 'inline' — presumably intentional, but verify against
// the definitions in the implementation headers before changing it.
#ifndef OVERLOAD_3
#define OVERLOAD_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2); \
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2); \
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2); \
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2); \
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2); \
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2);
#endif

#ifndef OVERLOAD_3_SCALAR
#define OVERLOAD_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2); \
FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2); \
FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2); \
FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2); \
FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2); \
inline FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2);
#endif

#define FUNC_4(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3) OVERLOADABLE

#define FUNC_5(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, argType3, argName3, arg4Type, arg4Name) ret func(argType0 argName0, argType1 argName1, argType2 argName2, argType3 argName3, arg4Type arg4Name) OVERLOADABLE
||||
|
||||
#ifndef SIMPLE_1
|
||||
#define SIMPLE_1(ret, func, argType, argName, content) \
|
||||
INLINE FUNC_1(ret##16, func, argType##16, argName) \
|
||||
{ \
|
||||
typedef argType##16 arg_t;\
|
||||
typedef ret##16 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret##8, func, argType##8, argName) \
|
||||
{ \
|
||||
typedef argType##8 arg_t;\
|
||||
typedef ret##8 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret##4, func, argType##4, argName) \
|
||||
{ \
|
||||
typedef argType##4 arg_t;\
|
||||
typedef ret##4 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret##3, func, argType##3, argName) \
|
||||
{ \
|
||||
typedef argType##3 arg_t;\
|
||||
typedef ret##3 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret##2, func, argType##2, argName) \
|
||||
{ \
|
||||
typedef argType##2 arg_t;\
|
||||
typedef ret##2 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType, argName) \
|
||||
{ \
|
||||
typedef argType arg_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SIMPLE_1_RETURN_SCALAR
|
||||
#define SIMPLE_1_RETURN_SCALAR(ret, func, argType, argName, content) \
|
||||
INLINE FUNC_1(ret, func, argType##16, argName) \
|
||||
{ \
|
||||
typedef argType##16 arg_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##8, argName) \
|
||||
{ \
|
||||
typedef argType##8 arg_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##4, argName) \
|
||||
{ \
|
||||
typedef argType##4 arg_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##3, argName) \
|
||||
{ \
|
||||
typedef argType##3 arg_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##2, argName) \
|
||||
{ \
|
||||
typedef argType##2 arg_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType, argName) \
|
||||
{ \
|
||||
typedef argType arg_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SIMPLE_2
|
||||
#define SIMPLE_2(ret, func, argType0, argName0, argType1, argName1, content) \
|
||||
INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1##16 arg1_t;\
|
||||
typedef ret##16 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1##8 arg1_t;\
|
||||
typedef ret##8 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1##4 arg1_t;\
|
||||
typedef ret##4 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1##3 arg1_t;\
|
||||
typedef ret##3 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1##2 arg1_t;\
|
||||
typedef ret##2 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SIMPLE_2_RETURN_SCALAR
|
||||
#define SIMPLE_2_RETURN_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
|
||||
INLINE FUNC_2(ret, func, argType0##16, argName0, argType1##16, argName1) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1##16 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret, func, argType0##8, argName0, argType1##8, argName1) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1##8 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret, func, argType0##4, argName0, argType1##4, argName1) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1##4 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret, func, argType0##3, argName0, argType1##3, argName1) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1##3 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret, func, argType0##2, argName0, argType1##2, argName1) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1##2 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SIMPLE_2_SCALAR
|
||||
#define SIMPLE_2_SCALAR(ret, func, argType0, argName0, argType1, argName1, content) \
|
||||
INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret##16 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret##8 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret##4 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret##3 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret##2 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
//scalar part is skipped, since it is too often already defined for e.g. a version taking two vectors
|
||||
#endif
|
||||
|
||||
#ifndef SIMPLE_3
|
||||
#define SIMPLE_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
|
||||
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1##16 arg1_t;\
|
||||
typedef argType2##16 arg2_t;\
|
||||
typedef ret##16 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1##8 arg1_t;\
|
||||
typedef argType2##8 arg2_t;\
|
||||
typedef ret##8 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1##4 arg1_t;\
|
||||
typedef argType2##4 arg2_t;\
|
||||
typedef ret##4 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1##3 arg1_t;\
|
||||
typedef argType2##3 arg2_t;\
|
||||
typedef ret##3 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1##2 arg1_t;\
|
||||
typedef argType2##2 arg2_t;\
|
||||
typedef ret##2 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret result_t;\
|
||||
return content; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SIMPLE_3_SCALAR
|
||||
#define SIMPLE_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
|
||||
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1##16 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##16 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1##8 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##8 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1##4 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##4 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1##3 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##3 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1##2 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##2 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
//scalar version is skipped, since it is already defined by the vector-vector-vector version with "vector" of 1 element
|
||||
#endif
|
||||
|
||||
#ifndef SIMPLE_3_TWO_SCALAR
|
||||
#define SIMPLE_3_TWO_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
|
||||
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##16 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##8 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##4 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##3 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##2 result_t;\
|
||||
return content; \
|
||||
} \
|
||||
//scalar version is skipped, since it is already defined by the vector-vector-vector version with "vector" of 1 element
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX_1
|
||||
#define COMPLEX_1(ret, func, argType, argName, content) \
|
||||
INLINE FUNC_1(ret##16, func, argType##16, argName) \
|
||||
{ \
|
||||
typedef argType##16 arg_t;\
|
||||
typedef ret##16 result_t;\
|
||||
typedef int##16 int_t; \
|
||||
typedef float##16 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret##8, func, argType##8, argName) \
|
||||
{ \
|
||||
typedef argType##8 arg_t;\
|
||||
typedef ret##8 result_t;\
|
||||
typedef int##8 int_t; \
|
||||
typedef float##8 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret##4, func, argType##4, argName) \
|
||||
{ \
|
||||
typedef argType##4 arg_t;\
|
||||
typedef ret##4 result_t;\
|
||||
typedef int##4 int_t; \
|
||||
typedef float##4 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret##3, func, argType##3, argName) \
|
||||
{ \
|
||||
typedef argType##3 arg_t;\
|
||||
typedef ret##3 result_t;\
|
||||
typedef int##3 int_t; \
|
||||
typedef float##3 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret##2, func, argType##2, argName) \
|
||||
{ \
|
||||
typedef argType##2 arg_t;\
|
||||
typedef ret##2 result_t;\
|
||||
typedef int##2 int_t; \
|
||||
typedef float##2 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType, argName) \
|
||||
{ \
|
||||
typedef argType arg_t;\
|
||||
typedef ret result_t;\
|
||||
typedef int int_t; \
|
||||
typedef float float_t; \
|
||||
content \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX_1_RETURN_SCALAR
|
||||
#define COMPLEX_1_RETURN_SCALAR(ret, func, argType, argName, content) \
|
||||
INLINE FUNC_1(ret, func, argType##16, argName) \
|
||||
{ \
|
||||
typedef argType##16 arg_t;\
|
||||
typedef int##16 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##8, argName) \
|
||||
{ \
|
||||
typedef argType##8 arg_t;\
|
||||
typedef int##8 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##4, argName) \
|
||||
{ \
|
||||
typedef argType##4 arg_t;\
|
||||
typedef int##4 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##3, argName) \
|
||||
{ \
|
||||
typedef argType##3 arg_t;\
|
||||
typedef int##3 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType##2, argName) \
|
||||
{ \
|
||||
typedef argType##2 arg_t;\
|
||||
typedef int##2 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_1(ret, func, argType, argName) \
|
||||
{ \
|
||||
typedef argType arg_t;\
|
||||
typedef int int_t; \
|
||||
content \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX_2
|
||||
#define COMPLEX_2(ret, func, argType0, argName0, argType1, argName1, content) \
|
||||
INLINE FUNC_2(ret##16, func, argType0##16, argName0, argType1##16, argName1) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1##16 arg1_t;\
|
||||
typedef ret##16 result_t;\
|
||||
typedef int##16 int_t; \
|
||||
typedef uint##16 uint_t; \
|
||||
typedef float##16 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_2(ret##8, func, argType0##8, argName0, argType1##8, argName1) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1##8 arg1_t;\
|
||||
typedef ret##8 result_t;\
|
||||
typedef int##8 int_t; \
|
||||
typedef uint##8 uint_t; \
|
||||
typedef float##8 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_2(ret##4, func, argType0##4, argName0, argType1##4, argName1) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1##4 arg1_t;\
|
||||
typedef ret##4 result_t;\
|
||||
typedef int##4 int_t; \
|
||||
typedef uint##4 uint_t; \
|
||||
typedef float##4 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_2(ret##3, func, argType0##3, argName0, argType1##3, argName1) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1##3 arg1_t;\
|
||||
typedef ret##3 result_t;\
|
||||
typedef int##3 int_t; \
|
||||
typedef uint##3 uint_t; \
|
||||
typedef float##3 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_2(ret##2, func, argType0##2, argName0, argType1##2, argName1) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1##2 arg1_t;\
|
||||
typedef ret##2 result_t;\
|
||||
typedef int##2 int_t; \
|
||||
typedef uint##2 uint_t; \
|
||||
typedef float##2 float_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_2(ret, func, argType0, argName0, argType1, argName1) \
|
||||
{ \
|
||||
typedef argType0 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef ret result_t;\
|
||||
typedef int int_t; \
|
||||
typedef uint uint_t; \
|
||||
typedef float float_t; \
|
||||
content \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX_3
|
||||
#define COMPLEX_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
|
||||
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2##16, argName2) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1##16 arg1_t;\
|
||||
typedef argType2##16 arg2_t;\
|
||||
typedef ret##16 result_t;\
|
||||
typedef int##16 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2##8, argName2) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1##8 arg1_t;\
|
||||
typedef argType2##8 arg2_t;\
|
||||
typedef ret##8 result_t;\
|
||||
typedef int##8 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2##4, argName2) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1##4 arg1_t;\
|
||||
typedef argType2##4 arg2_t;\
|
||||
typedef ret##4 result_t;\
|
||||
typedef int##4 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2##3, argName2) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1##3 arg1_t;\
|
||||
typedef argType2##3 arg2_t;\
|
||||
typedef ret##3 result_t;\
|
||||
typedef int##3 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2##2, argName2) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1##2 arg1_t;\
|
||||
typedef argType2##2 arg2_t;\
|
||||
typedef ret##2 result_t;\
|
||||
typedef int##2 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret result_t;\
|
||||
typedef int int_t; \
|
||||
content \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX_3_SCALAR
|
||||
#define COMPLEX_3_SCALAR(ret, func, argType0, argName0, argType1, argName1, argType2, argName2, content) \
|
||||
INLINE FUNC_3(ret##16, func, argType0##16, argName0, argType1##16, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##16 arg0_t;\
|
||||
typedef argType1##16 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##16 result_t;\
|
||||
typedef int##16 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##8, func, argType0##8, argName0, argType1##8, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##8 arg0_t;\
|
||||
typedef argType1##8 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##8 result_t;\
|
||||
typedef int##8 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##4, func, argType0##4, argName0, argType1##4, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##4 arg0_t;\
|
||||
typedef argType1##4 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##4 result_t;\
|
||||
typedef int##4 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##3, func, argType0##3, argName0, argType1##3, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##3 arg0_t;\
|
||||
typedef argType1##3 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##3 result_t;\
|
||||
typedef int##3 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret##2, func, argType0##2, argName0, argType1##2, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0##2 arg0_t;\
|
||||
typedef argType1##2 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret##2 result_t;\
|
||||
typedef int##2 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(ret, func, argType0, argName0, argType1, argName1, argType2, argName2) \
|
||||
{ \
|
||||
typedef argType0 arg0_t;\
|
||||
typedef argType1 arg1_t;\
|
||||
typedef argType2 arg2_t;\
|
||||
typedef ret result_t;\
|
||||
typedef int int_t; \
|
||||
content \
|
||||
}
|
||||
#endif
|
||||
|
||||
#define OVERLOAD_ALL_IMAGE_TYPES(ret, func) \
|
||||
CONST FUNC_1(ret, func, read_only image1d_t, image); \
|
||||
CONST FUNC_1(ret, func, write_only image1d_t, image); \
|
||||
CONST FUNC_1(ret, func, read_only image2d_t, image); \
|
||||
CONST FUNC_1(ret, func, write_only image2d_t, image); \
|
||||
CONST FUNC_1(ret, func, read_only image3d_t, image); \
|
||||
/* XXX CONST FUNC_1(ret, func, write_only image3d_t, image); */ \
|
||||
CONST FUNC_1(ret, func, read_only image1d_buffer_t, image); \
|
||||
CONST FUNC_1(ret, func, write_only image1d_buffer_t, image); \
|
||||
CONST FUNC_1(ret, func, read_only image1d_array_t, image); \
|
||||
CONST FUNC_1(ret, func, write_only image1d_array_t, image); \
|
||||
CONST FUNC_1(ret, func, read_only image2d_array_t, image); \
|
||||
CONST FUNC_1(ret, func, write_only image2d_array_t, image);
|
||||
|
||||
#define OVERLOAD_ALL_IMAGE_TYPES_1(ret, func, argType, argName) \
|
||||
FUNC_2(ret, func, read_only image1d_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, write_only image1d_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, read_only image2d_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, write_only image2d_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, read_only image3d_t, image, argType, argName); \
|
||||
/* XXX FUNC_2(ret, func, write_only image3d_t, image, argType, argName); */ \
|
||||
FUNC_2(ret, func, read_only image1d_buffer_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, write_only image1d_buffer_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, read_only image1d_array_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, write_only image1d_array_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, read_only image2d_array_t, image, argType, argName); \
|
||||
FUNC_2(ret, func, write_only image2d_array_t, image, argType, argName);
|
||||
|
||||
#define OVERLOAD_ALL_IMAGE_TYPES_2(ret, func, arg0Type, arg0Name, arg1Type, arg1Name) \
|
||||
FUNC_3(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
/* XXX FUNC_3(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name); */ \
|
||||
FUNC_3(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name); \
|
||||
FUNC_3(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name);
|
||||
|
||||
#define OVERLOAD_ALL_IMAGE_TYPES_3(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name) \
|
||||
FUNC_4(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
/* XXX FUNC_4(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); */ \
|
||||
FUNC_4(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name); \
|
||||
FUNC_4(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name);
|
||||
|
||||
#define OVERLOAD_ALL_IMAGE_TYPES_4(ret, func, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name) \
|
||||
FUNC_5(ret, func, read_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, write_only image1d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, read_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, write_only image2d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, read_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
/* XXX FUNC_5(ret, func, write_only image3d_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); */ \
|
||||
FUNC_5(ret, func, read_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, write_only image1d_buffer_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, read_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, write_only image1d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, read_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name); \
|
||||
FUNC_5(ret, func, write_only image2d_array_t, image, arg0Type, arg0Name, arg1Type, arg1Name, arg2Type, arg2Name, arg3Type, arg3Name);
|
||||
|
||||
#endif /* VC4CL_OVERLOADS_H */
|
||||
|
43
drivers/videocore4_stdlib/include/_printf.h
Normal file
43
drivers/videocore4_stdlib/include/_printf.h
Normal file
@ -0,0 +1,43 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_PRINTF
|
||||
#define VC4CL_PRINTF
|
||||
|
||||
#include "_config.h"
|
||||
|
||||
//void* vc4cl_get_param(uint);
|
||||
//void vc4cl_print_char(char);
|
||||
//
|
||||
//INLINE int printf(__constant const char * restrict format, ...)
|
||||
//{
|
||||
// __constant const char* formatPtr = format;
|
||||
// uint paramIndex = 1;
|
||||
// while(*format != '\0')
|
||||
// {
|
||||
// if(*format == '%')
|
||||
// {
|
||||
// ++formatPtr;
|
||||
// switch(*formatPtr)
|
||||
// {
|
||||
// case '%':
|
||||
// vc4cl_print_char('%');
|
||||
// break;
|
||||
// case 'c':
|
||||
// vc4cl_print_char(*vc4cl_get_param(paramIndex));
|
||||
// case 's':
|
||||
//
|
||||
// }
|
||||
// }
|
||||
// else
|
||||
// vc4cl_print_char(*formatPtr);
|
||||
// ++formatPtr;
|
||||
// }
|
||||
// //TODO
|
||||
// return -1;
|
||||
//}
|
||||
|
||||
#endif /* VC4CL_PRINTF */
|
341
drivers/videocore4_stdlib/include/_relational.h
Normal file
341
drivers/videocore4_stdlib/include/_relational.h
Normal file
@ -0,0 +1,341 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_RELATIONAL_H
|
||||
#define VC4CL_RELATIONAL_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_overloads.h"
|
||||
#include "_intrinsics.h"
|
||||
|
||||
#ifndef COMPARISON_1
|
||||
#define COMPARISON_1(func, content) \
|
||||
INLINE FUNC_1(int##16, func, float##16, val) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int##8, func, float##8, val) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int##4, func, float##4, val) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int##3, func, float##3, val) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int##2, func, float##2, val) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int, func, float, val) CONST \
|
||||
{ /* 1 instead of -1 here on purpose! */ \
|
||||
return (content) ? 1 : 0; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef COMPARISON_2
|
||||
#define COMPARISON_2(func, content) \
|
||||
INLINE FUNC_2(int##16, func, float##16, x, float##16, y) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_2(int##8, func, float##8, x, float##8, y) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_2(int##4, func, float##4, x, float##4, y) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_2(int##3, func, float##3, x, float##3, y) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_2(int##2, func, float##2, x, float##2, y) CONST \
|
||||
{ \
|
||||
return (content) ? -1 : 0; \
|
||||
} \
|
||||
INLINE FUNC_2(int, func, float, x, float, y) CONST \
|
||||
{ /* 1 instead of -1 here on purpose! */ \
|
||||
return (content) ? 1 : 0; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef FOR_ALL_ELEMENTS
|
||||
#define FOR_ALL_ELEMENTS(func, type, op, conv) \
|
||||
INLINE FUNC_1(int, func, type##16, x) CONST \
|
||||
{ \
|
||||
/* (s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sa, sb, sc, sd, se, sf) */ \
|
||||
type##16 val0 = conv(x); \
|
||||
/* (s0 op s1, s1 op s2, s2 op s3, s3 op s4, s4 op s5, s5 op s6, s6 op s7, s7 op s8, s8 op s9, s9 op sa, sa op sb, sb op sc, sc op sd, sd op se, se op sf, sf op s0) */ \
|
||||
val0 = val0 op vc4cl_vector_rotate(val0, -1); \
|
||||
/* (s0 op s1 op s2 op s3, s1 op s2 op s3 op s4, s2 op s3 op s4 op s5, s3 op s4 op s5 op s6, s4 op s5 op s6 op s7, s5 op s6 op s7 op s8, s6 op s7 op s8 op s9, s7 op s8 op s9 op sa, s8 op s9 op sa op sb, s9 op sa op sb op sc, sa op sb op sc op sd, sb op sc op sd op se, sc op sd op se op sf, ...) */ \
|
||||
const type##16 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
|
||||
/* (s0 op s1 op s2 op s3 op s4 op s5 op s6 op s7, ..., s8 op s9 op sa op ab op sc op sd op se op sf, ...) */ \
|
||||
const type##16 val2 = val1 op vc4cl_vector_rotate(val1, -4); \
|
||||
/* s0 op s1 op s2 op s3 op s4 op s5 op s6 op s7 op s8 op s9 op sa op ab op sc op sd op se op sf */ \
|
||||
return (val2 op val1 op vc4cl_vector_rotate(val2, -8)).x != 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int, func, type##8, x) CONST \
|
||||
{ \
|
||||
/* (s0, s1, s2, s3, s4, s5, s6, s7) */ \
|
||||
type##8 val0 = conv(x); \
|
||||
/* (s0 op s1, s1 op s2, s2 op s3, s3 op s4, s4 op s5, s5 op s6, s6 op s7, s7 op ?) */ \
|
||||
val0 = val0 op vc4cl_vector_rotate(val0, -1); \
|
||||
/* (s0 op s1 op s2 op s3, s1 op s2 op s3 op s4, s2 op s3 op s4 op s5, s3 op s4 op s5 op s6, s4 op s5 op s6 op s7, s5 op s6 op s7 op ?, s6 op s7 op ? op ?, s7 op ? op ? op ?) */ \
|
||||
const type##8 val1 = val0 op vc4cl_vector_rotate(val0, -2); \
|
||||
/* s0 op s1 op s2 op s3 op s4 op s5 op s6 op s7 */ \
|
||||
return (val1 op vc4cl_vector_rotate(val1, -4)).x != 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int, func, type##4, x) CONST \
|
||||
{ \
|
||||
/* (x, y, z, w) */ \
|
||||
type##4 val0 = conv(x); \
|
||||
/* (x op y, y op z, z op w, w op ?) */ \
|
||||
val0 = val0 op vc4cl_vector_rotate(val0, -1); \
|
||||
/* (z op w, w op ?, ? op ?, ? op ?) */ \
|
||||
const type##4 val1 = vc4cl_vector_rotate(val0, -2); \
|
||||
/* (x op y op z op w, ...) */ \
|
||||
return (val0 op val1).x != 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int, func, type##3, x) CONST \
|
||||
{ \
|
||||
type##3 val = conv(x); \
|
||||
return (val.x op val.y op val.z) != 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int, func, type##2, x) CONST \
|
||||
{ \
|
||||
type##2 val = conv(x); \
|
||||
return (val.x op val.y) != 0; \
|
||||
} \
|
||||
INLINE FUNC_1(int, func, type, x) CONST \
|
||||
{ \
|
||||
type val = conv(x); \
|
||||
return val != 0; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SELECT_SCALAR
|
||||
#define SELECT_SCALAR(type, maskType, content) \
|
||||
INLINE FUNC_3(type, select, type, a, type, b, maskType, c) CONST \
|
||||
{ \
|
||||
return content; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef SELECT_VECTOR
|
||||
#define SELECT_VECTOR(type, maskType, content) \
|
||||
INLINE FUNC_3(type##2, select, type##2, a, type##2, b, maskType##2, c) CONST \
|
||||
{ \
|
||||
typedef int##2 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(type##3, select, type##3, a, type##3, b, maskType##3, c) CONST \
|
||||
{ \
|
||||
typedef int##3 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(type##4, select, type##4, a, type##4, b, maskType##4, c) CONST \
|
||||
{ \
|
||||
typedef int##4 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(type##8, select, type##8, a, type##8, b, maskType##8, c) CONST \
|
||||
{ \
|
||||
typedef int##8 int_t; \
|
||||
content \
|
||||
} \
|
||||
INLINE FUNC_3(type##16, select, type##16, a, type##16, b, maskType##16, c) CONST \
|
||||
{ \
|
||||
typedef int##16 int_t; \
|
||||
content \
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The checks for NaNs as defined in the specification are done in the intrinsic of the comparison operators:
|
||||
*
|
||||
* "The relational functions isequal, isgreater, isgreaterequal, isless, islessequal, and islessgreater
|
||||
* always return 0 if either argument is not a number (NaN). isnotequal returns 1 if one or both
|
||||
* arguments are not a number (NaN) and the argument type is a scalar [...]"
|
||||
* - OpenCL 1.2, section 6.12.6 Relational Functions
|
||||
*/
|
||||
COMPARISON_2(isequal, x == y)
|
||||
COMPARISON_2(isnotequal, x != y)
|
||||
COMPARISON_2(isgreater, x > y)
|
||||
COMPARISON_2(isgreaterequal, x >= y)
|
||||
COMPARISON_2(isless, x < y)
|
||||
COMPARISON_2(islessequal, x <= y)
|
||||
COMPARISON_2(islessgreater, (x < y) || (x > y))
|
||||
|
||||
// From <cmath>: "A finite value is any floating-point value that is neither infinite nor NaN (Not-A-Number)."
|
||||
COMPARISON_1(isfinite, !vc4cl_is_inf_nan(val))
|
||||
COMPARISON_1(isinf, (vc4cl_bitcast_uint(val) & NAN) == INF)
|
||||
COMPARISON_1(isnan, vc4cl_is_nan(val))
|
||||
// From <cmath>: "Returns whether x is a normal value: i.e., whether it is neither infinity, NaN, zero or subnormal."
|
||||
COMPARISON_1(isnormal, !isinf(val) && !isnan(val) && ((vc4cl_bitcast_uint(val) & 0x7F800000) != 0) /* neither zero nor denormal */)
|
||||
COMPARISON_2(isordered, isequal(x, x) && isequal(y, y))
|
||||
COMPARISON_2(isunordered, isnan(x) || isnan(y))
|
||||
|
||||
// for vector,directly use asr, for scalar shr. This is way more efficient than everything else (1 instruction)
|
||||
INLINE FUNC_1(int16, signbit, float16, val) CONST
|
||||
{
|
||||
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
|
||||
}
|
||||
INLINE FUNC_1(int8, signbit, float8, val) CONST
|
||||
{
|
||||
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
|
||||
}
|
||||
INLINE FUNC_1(int4, signbit, float4, val) CONST
|
||||
{
|
||||
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
|
||||
}
|
||||
INLINE FUNC_1(int3, signbit, float3, val) CONST
|
||||
{
|
||||
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
|
||||
}
|
||||
INLINE FUNC_1(int2, signbit, float2, val) CONST
|
||||
{
|
||||
return vc4cl_asr(vc4cl_bitcast_uint(val), 31);
|
||||
}
|
||||
INLINE FUNC_1(int, signbit, float, val) CONST
|
||||
{
|
||||
return vc4cl_bitcast_uint(val) >> 31;
|
||||
}
|
||||
|
||||
FOR_ALL_ELEMENTS(any, char, |, vc4cl_msb_set)
|
||||
FOR_ALL_ELEMENTS(any, short, |, vc4cl_msb_set)
|
||||
FOR_ALL_ELEMENTS(any, int, |, vc4cl_msb_set)
|
||||
FOR_ALL_ELEMENTS(any, long, |, vc4cl_msb_set)
|
||||
|
||||
FOR_ALL_ELEMENTS(all, char, &, vc4cl_msb_set)
|
||||
FOR_ALL_ELEMENTS(all, short, &, vc4cl_msb_set)
|
||||
FOR_ALL_ELEMENTS(all, int, &, vc4cl_msb_set)
|
||||
FOR_ALL_ELEMENTS(all, long, &, vc4cl_msb_set)
|
||||
|
||||
|
||||
//"Each bit of the result is the corresponding bit of a if the corresponding bit of c is 0.
|
||||
// Otherwise it is the corresponding bit of b."
|
||||
//based on pocl (pocl/lib/kernel/bitselect.cl)
|
||||
SIMPLE_3(uchar, bitselect, uchar, a, uchar, b, uchar, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(char, bitselect, char, a, char, b, char, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(ushort, bitselect, ushort, a, ushort, b, ushort, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(short, bitselect, short, a, short, b, short, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(uint, bitselect, uint, a, uint, b, uint, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(int, bitselect, int, a, int, b, int, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(ulong, bitselect, ulong, a, ulong, b, ulong, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(long, bitselect, long, a, long, b, long, c, (~c & a) | (c & b))
|
||||
SIMPLE_3(float, bitselect, float, a, float, b, float, c, vc4cl_bitcast_float((~vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(a)) | (vc4cl_bitcast_uint(c) & vc4cl_bitcast_uint(b))))
|
||||
|
||||
//"For a scalar type, result = c ? b : a."
|
||||
SELECT_SCALAR(uchar, uchar, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(uchar, char, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(char, uchar, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(char, char, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(ushort, ushort, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(ushort, short, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(short, ushort, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(short, short, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(uint, uint, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(uint, int, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(int, uint, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(int, int, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(ulong, ulong, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(ulong, long, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(long, ulong, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(long, long, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(float, uint, vc4cl_extend(c) ? b : a)
|
||||
SELECT_SCALAR(float, int, vc4cl_extend(c) ? b : a)
|
||||
|
||||
//"For each component of a vector type, result[i] = if MSB of c[i] is set ? b[i] : a[i]"
|
||||
SELECT_VECTOR(uchar, uchar,
|
||||
{
|
||||
int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
|
||||
return vc4cl_bitcast_uchar(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
|
||||
})
|
||||
SELECT_VECTOR(uchar, char,
|
||||
{
|
||||
int_t mask = vc4cl_asr(vc4cl_extend(c) << 24, 31);
|
||||
return vc4cl_bitcast_uchar(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
|
||||
})
|
||||
SELECT_VECTOR(char, char,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(char, uchar,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(ushort, ushort,
|
||||
{
|
||||
int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
|
||||
return vc4cl_bitcast_ushort(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
|
||||
})
|
||||
SELECT_VECTOR(ushort, short,
|
||||
{
|
||||
int_t mask = vc4cl_asr(vc4cl_extend(c) << 16, 31);
|
||||
return vc4cl_bitcast_ushort(mask & vc4cl_bitcast_int(vc4cl_extend(b)) | (~mask & vc4cl_bitcast_int(vc4cl_extend(a))));
|
||||
})
|
||||
SELECT_VECTOR(short, short,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(short, ushort,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(uint, uint,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(uint, int,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(int, int,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(int, uint,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(ulong, ulong,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(ulong, long,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(long, long,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(long, ulong,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(float, uint,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
SELECT_VECTOR(float, int,
|
||||
{
|
||||
return vc4cl_msb_set(c) ? b : a;
|
||||
})
|
||||
|
||||
#undef COMPARISON_1
|
||||
#undef COMPARISON_2
|
||||
#undef FOR_ALL_ELEMENTS
|
||||
#undef SELECT_SCALAR
|
||||
#undef SELECT_VECTOR
|
||||
|
||||
#endif /* VC4CL_RELATIONAL_H */
|
||||
|
1716
drivers/videocore4_stdlib/include/_spir_mangling.h
Normal file
1716
drivers/videocore4_stdlib/include/_spir_mangling.h
Normal file
File diff suppressed because it is too large
Load Diff
24
drivers/videocore4_stdlib/include/_synchronization.h
Normal file
24
drivers/videocore4_stdlib/include/_synchronization.h
Normal file
@ -0,0 +1,24 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_SYNCHRONIZATION_H
|
||||
#define VC4CL_SYNCHRONIZATION_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_work_items.h"
|
||||
|
||||
INLINE void barrier(cl_mem_fence_flags flags) OVERLOADABLE
|
||||
{
|
||||
vc4cl_barrier(flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* We do not declare read_mem_fence() and write_mem_fence(), since:
|
||||
* - The SPIRV-LLVM-Translator (in older versions, e.g. 7.0) can't handle them passing a non-const flags to the mem_fence() function
|
||||
* - We anyway handle mem_fence(), read_mem_fence() and write_mem_fence() in both front-ends the exact same way
|
||||
*/
|
||||
#endif /* VC4CL_SYNCHRONIZATION_H */
|
||||
|
265
drivers/videocore4_stdlib/include/_vector.h
Normal file
265
drivers/videocore4_stdlib/include/_vector.h
Normal file
@ -0,0 +1,265 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_VECTOR_H
|
||||
#define VC4CL_VECTOR_H
|
||||
|
||||
#include "_config.h"
|
||||
#include "_overloads.h"
|
||||
|
||||
#ifndef VECTOR_LOAD
|
||||
#define VECTOR_LOAD(type) \
|
||||
INLINE type##2 vload2(size_t offset, const __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __global type##2 *)(ptr + offset * 2)); \
|
||||
} \
|
||||
INLINE type##3 vload3(size_t offset, const __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return vc4cl_vload3(ptr + offset * 3); \
|
||||
} \
|
||||
INLINE type##4 vload4(size_t offset, const __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __global type##4 *)(ptr + offset * 4)); \
|
||||
} \
|
||||
INLINE type##8 vload8(size_t offset, const __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __global type##8 *)(ptr + offset * 8)); \
|
||||
} \
|
||||
INLINE type##16 vload16(size_t offset, const __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __global type##16 *)(ptr + offset * 16)); \
|
||||
} \
|
||||
INLINE type##2 vload2(size_t offset, const __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __local type##2 *)(ptr + offset * 2)); \
|
||||
} \
|
||||
INLINE type##3 vload3(size_t offset, const __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return vc4cl_vload3(ptr + offset * 3); \
|
||||
} \
|
||||
INLINE type##4 vload4(size_t offset, const __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __local type##4 *)(ptr + offset * 4)); \
|
||||
} \
|
||||
INLINE type##8 vload8(size_t offset, const __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __local type##8 *)(ptr + offset * 8)); \
|
||||
} \
|
||||
INLINE type##16 vload16(size_t offset, const __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __local type##16 *)(ptr + offset * 16)); \
|
||||
} \
|
||||
INLINE type##2 vload2(size_t offset, const __constant type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __constant type##2 *)(ptr + offset * 2)); \
|
||||
} \
|
||||
INLINE type##3 vload3(size_t offset, const __constant type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return vc4cl_vload3(ptr + offset * 3); \
|
||||
} \
|
||||
INLINE type##4 vload4(size_t offset, const __constant type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __constant type##4 *)(ptr + offset * 4)); \
|
||||
} \
|
||||
INLINE type##8 vload8(size_t offset, const __constant type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __constant type##8 *)(ptr + offset * 8)); \
|
||||
} \
|
||||
INLINE type##16 vload16(size_t offset, const __constant type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __constant type##16 *)(ptr + offset * 16)); \
|
||||
} \
|
||||
INLINE type##2 vload2(size_t offset, const __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __private type##2 *)(ptr + offset * 2)); \
|
||||
} \
|
||||
INLINE type##3 vload3(size_t offset, const __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return vc4cl_vload3(ptr + offset * 3); \
|
||||
} \
|
||||
INLINE type##4 vload4(size_t offset, const __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __private type##4 *)(ptr + offset * 4)); \
|
||||
} \
|
||||
INLINE type##8 vload8(size_t offset, const __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __private type##8 *)(ptr + offset * 8)); \
|
||||
} \
|
||||
INLINE type##16 vload16(size_t offset, const __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
return *((const __private type##16 *)(ptr + offset * 16)); \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef VECTOR_STORE
|
||||
#define VECTOR_STORE(type) \
|
||||
INLINE void vstore2(type##2 data, size_t offset, __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__global type##2 *)(ptr + offset * 2)) = data; \
|
||||
} \
|
||||
INLINE void vstore3(type##3 data, size_t offset, __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
vc4cl_vstore3(ptr + offset * 3, data); \
|
||||
} \
|
||||
INLINE void vstore4(type##4 data, size_t offset, __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__global type##4 *)(ptr + offset * 4)) = data; \
|
||||
} \
|
||||
INLINE void vstore8(type##8 data, size_t offset, __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__global type##8 *)(ptr + offset * 8)) = data; \
|
||||
} \
|
||||
INLINE void vstore16(type##16 data, size_t offset, __global type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__global type##16 *)(ptr + offset * 16)) = data; \
|
||||
} \
|
||||
INLINE void vstore2(type##2 data, size_t offset, __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__local type##2 *)(ptr + offset * 2)) = data; \
|
||||
} \
|
||||
INLINE void vstore3(type##3 data, size_t offset, __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
vc4cl_vstore3(ptr + offset * 3, data); \
|
||||
} \
|
||||
INLINE void vstore4(type##4 data, size_t offset, __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__local type##4 *)(ptr + offset * 4)) = data; \
|
||||
} \
|
||||
INLINE void vstore8(type##8 data, size_t offset, __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__local type##8 *)(ptr + offset * 8)) = data; \
|
||||
} \
|
||||
INLINE void vstore16(type##16 data, size_t offset, __local type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__local type##16 *)(ptr + offset * 16)) = data; \
|
||||
} \
|
||||
INLINE void vstore2(type##2 data, size_t offset, __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__private type##2 *)(ptr + offset * 2)) = data; \
|
||||
} \
|
||||
INLINE void vstore3(type##3 data, size_t offset, __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
vc4cl_vstore3(ptr + offset * 3, data); \
|
||||
} \
|
||||
INLINE void vstore4(type##4 data, size_t offset, __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__private type##4 *)(ptr + offset * 4)) = data; \
|
||||
} \
|
||||
INLINE void vstore8(type##8 data, size_t offset, __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__private type##8 *)(ptr + offset * 8)) = data; \
|
||||
} \
|
||||
INLINE void vstore16(type##16 data, size_t offset, __private type * ptr) OVERLOADABLE \
|
||||
{ \
|
||||
*((__private type##16 *)(ptr + offset * 16)) = data; \
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef VECTOR_SHUFFLE_2
|
||||
#define VECTOR_SHUFFLE_2_INTERNAL(type, maskType, num) \
|
||||
INLINE type##2 shuffle2(type##num x, type##num y, maskType##2 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return __builtin_shufflevector(x, y, mask.x, mask.y); \
|
||||
} \
|
||||
INLINE type##4 shuffle2(type##num x, type##num y, maskType##4 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return __builtin_shufflevector(x, y, mask.x, mask.y, mask.z, mask.w); \
|
||||
} \
|
||||
INLINE type##8 shuffle2(type##num x, type##num y, maskType##8 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7); \
|
||||
} \
|
||||
INLINE type##16 shuffle2(type##num x, type##num y, maskType##16 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return __builtin_shufflevector(x, y, mask.s0, mask.s1, mask.s2, mask.s3, mask.s4, mask.s5, mask.s6, mask.s7, mask.s8, mask.s9, mask.sa, mask.sb, mask.sc, mask.sd, mask.se, mask.sf); \
|
||||
} \
|
||||
|
||||
#define VECTOR_SHUFFLE_2(type, maskType) \
|
||||
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 2) \
|
||||
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 4) \
|
||||
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 8) \
|
||||
VECTOR_SHUFFLE_2_INTERNAL(type, maskType, 16)
|
||||
#endif
|
||||
|
||||
#ifndef VECTOR_SHUFFLE
|
||||
#define VECTOR_SHUFFLE_INTERNAL(type, maskType, num) \
|
||||
INLINE type##2 shuffle(type##num val, maskType##2 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return shuffle2(val, val, mask); \
|
||||
} \
|
||||
INLINE type##4 shuffle(type##num val, maskType##4 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return shuffle2(val, val, mask); \
|
||||
} \
|
||||
INLINE type##8 shuffle(type##num val, maskType##8 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return shuffle2(val, val, mask); \
|
||||
} \
|
||||
INLINE type##16 shuffle(type##num val, maskType##16 mask) OVERLOADABLE \
|
||||
{ \
|
||||
return shuffle2(val, val, mask); \
|
||||
} \
|
||||
|
||||
#define VECTOR_SHUFFLE(type, maskType) \
|
||||
VECTOR_SHUFFLE_INTERNAL(type, maskType, 2) \
|
||||
VECTOR_SHUFFLE_INTERNAL(type, maskType, 4) \
|
||||
VECTOR_SHUFFLE_INTERNAL(type, maskType, 8) \
|
||||
VECTOR_SHUFFLE_INTERNAL(type, maskType, 16)
|
||||
#endif
|
||||
|
||||
VECTOR_LOAD(uchar)
|
||||
VECTOR_LOAD(char)
|
||||
VECTOR_LOAD(ushort)
|
||||
VECTOR_LOAD(short)
|
||||
VECTOR_LOAD(uint)
|
||||
VECTOR_LOAD(int)
|
||||
VECTOR_LOAD(float)
|
||||
VECTOR_LOAD(ulong)
|
||||
VECTOR_LOAD(long)
|
||||
|
||||
VECTOR_STORE(uchar)
|
||||
VECTOR_STORE(char)
|
||||
VECTOR_STORE(ushort)
|
||||
VECTOR_STORE(short)
|
||||
VECTOR_STORE(uint)
|
||||
VECTOR_STORE(int)
|
||||
VECTOR_STORE(float)
|
||||
VECTOR_STORE(ulong)
|
||||
VECTOR_STORE(long)
|
||||
|
||||
//TODO vload(a)_half, vload(a)_halfn (+rounding) (load half and return converted to float, possible with unpack-modes)
|
||||
//TODO vstore(a)_half, vstore(a)_halfn (+rounding) (store float as half in memory, possible with pack modes)
|
||||
|
||||
/*
|
||||
* TODO shuffle2, but LLVM fails, since the indices for the __builtin intrinsic need to be constant integers!
|
||||
VECTOR_SHUFFLE_2(uchar, uchar)
|
||||
VECTOR_SHUFFLE_2(char, uchar)
|
||||
VECTOR_SHUFFLE_2(ushort, ushort)
|
||||
VECTOR_SHUFFLE_2(short, ushort)
|
||||
VECTOR_SHUFFLE_2(uint, uint)
|
||||
VECTOR_SHUFFLE_2(int, uint)
|
||||
VECTOR_SHUFFLE_2(float, uint)
|
||||
|
||||
VECTOR_SHUFFLE(uchar, uchar)
|
||||
VECTOR_SHUFFLE(char, uchar)
|
||||
VECTOR_SHUFFLE(ushort, ushort)
|
||||
VECTOR_SHUFFLE(short, ushort)
|
||||
VECTOR_SHUFFLE(uint, uint)
|
||||
VECTOR_SHUFFLE(int, uint)
|
||||
VECTOR_SHUFFLE(float, uint)
|
||||
*/
|
||||
|
||||
//shuffle/shuffle2 are handled via intrinsifying the OpenCL function
|
||||
|
||||
#undef VECTOR_LOAD
|
||||
#undef VECTOR_STORE
|
||||
#undef VECTOR_SHUFFLE_2_INTERNAL
|
||||
#undef VECTOR_SHUFFLE_2
|
||||
#undef VECTOR_SHUFFLE_INTERNAL
|
||||
#undef VECTOR_SHUFFLE
|
||||
|
||||
#endif /* VC4CL_VECTOR_H */
|
||||
|
70
drivers/videocore4_stdlib/include/_work_items.h
Normal file
70
drivers/videocore4_stdlib/include/_work_items.h
Normal file
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_WORK_ITEMS_H
|
||||
#define VC4CL_WORK_ITEMS_H
|
||||
|
||||
#include "_intrinsics.h"
|
||||
#include "_overloads.h"
|
||||
|
||||
INLINE uint get_work_dim(void) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_work_dimensions();
|
||||
}
|
||||
|
||||
INLINE size_t get_global_size(uint dim) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_global_size(dim);
|
||||
}
|
||||
|
||||
INLINE size_t get_global_id(uint dim) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_global_id(dim);
|
||||
}
|
||||
|
||||
INLINE size_t get_local_size(uint dim) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_local_size(dim);
|
||||
}
|
||||
|
||||
INLINE size_t get_enqueued_local_size(uint dimindx) OVERLOADABLE CONST
|
||||
{
|
||||
// "Returns the same value as that returned by get_local_size(dimindx) if the kernel is executed with a uniform
|
||||
// work-group size."
|
||||
return vc4cl_local_size(dimindx);
|
||||
}
|
||||
|
||||
INLINE size_t get_local_id(uint dim) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_local_id(dim);
|
||||
}
|
||||
|
||||
INLINE size_t get_num_groups(uint dim) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_num_groups(dim);
|
||||
}
|
||||
|
||||
INLINE size_t get_group_id(uint dim) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_group_id(dim);
|
||||
}
|
||||
|
||||
INLINE size_t get_global_offset(uint dim) OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_global_offset(dim);
|
||||
}
|
||||
|
||||
INLINE size_t get_global_linear_id() OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_global_linear_id();
|
||||
}
|
||||
|
||||
INLINE size_t get_local_linear_id() OVERLOADABLE CONST
|
||||
{
|
||||
return vc4cl_local_linear_id();
|
||||
}
|
||||
|
||||
#endif /* VC4CL_WORK_ITEMS_H */
|
105
drivers/videocore4_stdlib/include/defines.h
Normal file
105
drivers/videocore4_stdlib/include/defines.h
Normal file
@ -0,0 +1,105 @@
|
||||
/*
|
||||
* Author: doe300
|
||||
*
|
||||
* See the file "LICENSE" for the full license governing this code.
|
||||
*/
|
||||
|
||||
#ifndef VC4CL_DEFINES_H
|
||||
#define VC4CL_DEFINES_H
|
||||
|
||||
#ifndef CL_VERSION_1_0
|
||||
#define CL_VERSION_1_0 100
|
||||
#endif
|
||||
#ifndef CL_VERSION_1_1
|
||||
#define CL_VERSION_1_1 110
|
||||
#endif
|
||||
#ifndef CL_VERSION_1_2
|
||||
#define CL_VERSION_1_2 120
|
||||
#endif
|
||||
#ifndef CL_VERSION_2_0
|
||||
#define CL_VERSION_2_0 200
|
||||
#endif
|
||||
#ifndef CL_VERSION_2_1
|
||||
#define CL_VERSION_2_1 210
|
||||
#endif
|
||||
#ifndef CL_VERSION_2_2
|
||||
#define CL_VERSION_2_2 220
|
||||
#endif
|
||||
|
||||
#undef __OPENCL_VERSION__
|
||||
#define __OPENCL_VERSION__ CL_VERSION_1_2
|
||||
#undef __OPENCL_C_VERSION__
|
||||
#define __OPENCL_C_VERSION__ CL_VERSION_1_2
|
||||
#ifndef __ENDIAN_LITTLE__
|
||||
#define __ENDIAN_LITTLE__ 1
|
||||
#endif
|
||||
#ifndef __EMBEDDED_PROFILE__
|
||||
#define __EMBEDDED_PROFILE__ 1
|
||||
#endif
|
||||
//#ifndef __IMAGE_SUPPORT__
|
||||
//#define __IMAGE_SUPPORT__ 1
|
||||
//#endif
|
||||
#undef __IMAGE_SUPPORT__
|
||||
|
||||
#ifndef cl_khr_global_int32_base_atomics
|
||||
#define cl_khr_global_int32_base_atomics
|
||||
#endif
|
||||
#ifndef cl_khr_local_int32_base_atomics
|
||||
#define cl_khr_local_int32_base_atomics
|
||||
#endif
|
||||
#ifndef cl_khr_global_int32_extended_atomics
|
||||
#define cl_khr_global_int32_extended_atomics
|
||||
#endif
|
||||
#ifndef cl_khr_local_int32_extended_atomics
|
||||
#define cl_khr_local_int32_extended_atomics
|
||||
#endif
|
||||
#ifndef cl_khr_byte_addressable_store
|
||||
#define cl_khr_byte_addressable_store
|
||||
#endif
|
||||
#ifndef cl_khr_initialize_memory
|
||||
#define cl_khr_initialize_memory
|
||||
#endif
|
||||
|
||||
#ifdef __IMAGE_SUPPORT__
|
||||
#ifndef cl_khr_3d_image_writes
|
||||
#define cl_khr_3d_image_writes
|
||||
#endif
|
||||
#ifndef cl_intel_packed_yuv
|
||||
#define cl_intel_packed_yuv
|
||||
#endif
|
||||
#else
|
||||
#undef cl_khr_3d_image_writes
|
||||
#undef cl_intel_packed_yuv
|
||||
#endif
|
||||
|
||||
// additional supported extensions (need to set flag here, since the module is loaded too late)
|
||||
#define cl_nv_pragma_unroll 1
|
||||
#define cl_arm_core_id 1
|
||||
#define cl_ext_atomic_counters_32 1
|
||||
#define cl_arm_integer_dot_product_int8 1
|
||||
#define cl_arm_integer_dot_product_accumulate_int8 1
|
||||
#define cl_arm_integer_dot_product_accumulate_int16 1
|
||||
#define cl_arm_integer_dot_product_accumulate_saturate_int8 1
|
||||
|
||||
// unsupported extensions or optional core features
|
||||
#undef cl_khr_fp16
|
||||
#undef cl_khr_fp64
|
||||
#undef cl_khr_int64_base_atomics
|
||||
#undef cl_khr_int64_extended_atomics
|
||||
#undef cl_khr_depth_images
|
||||
#undef cl_khr_gl_depth_images
|
||||
#undef cl_khr_gl_msaa_sharing
|
||||
#undef cl_amd_media_ops
|
||||
#undef cl_amd_media_ops2
|
||||
// unsupported host-only extensions (disable for safety)
|
||||
#undef cl_khr_gl_sharing
|
||||
#undef cl_khr_gl_event
|
||||
#undef cl_khr_d3d10_sharing
|
||||
#undef cl_khr_dx9_media_sharing
|
||||
#undef cl_khr_d3d11_sharing
|
||||
#undef cl_khr_image2d_from_buffer
|
||||
#undef cl_khr_terminate_context
|
||||
#undef cl_khr_egl_image
|
||||
#undef cl_khr_egl_event
|
||||
|
||||
#endif /* VC4CL_DEFINES_H */
|
16914
drivers/videocore4_stdlib/include/opencl-c.h
Normal file
16914
drivers/videocore4_stdlib/include/opencl-c.h
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user