Brooklyn/drivers/videocore4_stdlib/experimental/compare_implementations.cpp


#define CL_TARGET_OPENCL_VERSION 120
#define CL_HPP_CL_1_2_DEFAULT_BUILD 1
#define CL_HPP_ENABLE_EXCEPTIONS 1
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#include <CL/cl.hpp>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <random>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unistd.h> // geteuid()
#include <vector>

static constexpr uint32_t DEFAULT_NUM_LINEAR = 12 * 16 * 8;
static constexpr uint32_t DEFAULT_NUM_RANDOM = 12 * 16 * 8;

// VC4CL performance counters
#define CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 10)
#define CL_PROFILING_PERFORMANCE_COUNTER_IDLE_CYCLES_VC4CL (CL_PROFILING_COMMAND_END + 11)
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL (CL_PROFILING_COMMAND_END + 12)
#define CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 13)
#define CL_PROFILING_PERFORMANCE_COUNTER_L2_CACHE_MISSES_VC4CL (CL_PROFILING_COMMAND_END + 14)

struct Range
{
	float min;
	float max;
};

struct ReferenceFunction
{
	ReferenceFunction(float (*func)(float)) : numParameters(1), ptr(reinterpret_cast<void *>(func)) {}
	ReferenceFunction(float (*func)(float, float)) : numParameters(2), ptr(reinterpret_cast<void *>(func)) {}
	ReferenceFunction(float (*func)(float, float, float)) : numParameters(3), ptr(reinterpret_cast<void *>(func)) {}

	float operator()(float val) const
	{
		if(numParameters != 1)
			throw std::runtime_error{"Reference function called with the wrong number of arguments"};
		return reinterpret_cast<float (*)(float)>(ptr)(val);
	}

	float operator()(float val0, float val1) const
	{
		if(numParameters != 2)
			throw std::runtime_error{"Reference function called with the wrong number of arguments"};
		return reinterpret_cast<float (*)(float, float)>(ptr)(val0, val1);
	}

	float operator()(float val0, float val1, float val2) const
	{
		if(numParameters != 3)
			throw std::runtime_error{"Reference function called with the wrong number of arguments"};
		return reinterpret_cast<float (*)(float, float, float)>(ptr)(val0, val1, val2);
	}

	std::vector<float> operator()(const std::vector<std::vector<float>> &inputs) const
	{
		std::vector<float> out(inputs.front().size());
		for(std::size_t i = 0; i < out.size(); ++i)
		{
			if(numParameters == 1)
				out[i] = (*this)(inputs[0][i]);
			if(numParameters == 2)
				out[i] = (*this)(inputs[0][i], inputs[1][i]);
			if(numParameters == 3)
				out[i] = (*this)(inputs[0][i], inputs[1][i], inputs[2][i]);
		}
		return out;
	}

	uint8_t numParameters;
	void *ptr;
};

struct Test
{
	std::string name;
	ReferenceFunction reference;
	uint32_t allowedErrorInUlp;
	std::string sourceFile;
	std::vector<Range> ranges;
};

static float identity(float val)
{
	return val;
}

// XXX OpenCL-CTS calculates reference in double, thus is more accurate. So tests being accurate here might not be in
// the CTS!
static const std::vector<Test> floatTests = {
	Test{"log", logf, 4, "log.cl",
		{
			{0.5, 1.0}, // reduced range some implementations use
			{std::numeric_limits<float>::min(), std::numeric_limits<float>::max()} // full range
		}},
	Test{"exp", expf, 4, "exp.cl",
		{
			{0.0, 0.5f * logf(2.0f)}, // reduced range some implementations use
			{-87.0f /* everything below e^-87 is subnormal */, 89.0f /* everything above e^89 is Inf */} // full range
		}},
	Test{"identity", identity, 0, "identity.cl",
		{
			{-10.0f, 10.0f}, {std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
		}},
	Test{"cbrt", cbrtf, 4, "cbrt.cl",
		{
			{-1.0, 1.0}, // limited range for precision testing
			{std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
		}},
	Test{"fma", fmaf, 0, "fma.cl",
		{
			{-100.0f, 100.0f}, // reduced range to not run into NaN/Inf
			{std::numeric_limits<float>::lowest(), std::numeric_limits<float>::max()} // full range
		}}};

static std::vector<float> generateInputData(const Range &range, uint32_t numLinear, uint32_t numRandom)
{
	std::vector<float> data{};
	data.reserve(numLinear + numRandom);
	auto step = (range.max - range.min) / static_cast<float>(numLinear); // TODO overflows on full ranges
	for(float val = range.min; val < range.max; val += step)
		data.emplace_back(val);

	std::random_device rd{};
	std::default_random_engine gen(rd());
	std::uniform_real_distribution<> dist{range.min, range.max};

	while(data.size() < (numLinear + numRandom))
		data.emplace_back(static_cast<float>(dist(gen)));

	return data;
}

static std::vector<std::vector<float>> generateInputData(
	const Range &range, uint32_t numLinear, uint32_t numRandom, uint8_t numInputs)
{
	std::vector<std::vector<float>> data{};
	for(uint8_t i = 0; i < numInputs; ++i)
		data.emplace_back(generateInputData(range, numLinear, numRandom));
	return data;
}

static std::vector<cl::Kernel> createKernels(const cl::Context &context, const Test &test)
{
	std::stringstream ss;
	{
		std::ifstream fis{test.sourceFile};
		ss << fis.rdbuf();
	}
	cl::Program program(context, ss.str(), true);

	std::vector<cl::Kernel> kernels;
	program.createKernels(&kernels);
	return kernels;
}

struct ErrorResult
{
	std::vector<float> inputValues;
	float expected;
	float actual;
	uint32_t errorInUlp;

	// ordered by "most wrong" first
	bool operator<(const ErrorResult &other) const noexcept
	{
		if(errorInUlp > other.errorInUlp)
			return true;
		if(errorInUlp < other.errorInUlp)
			return false;
		return inputValues < other.inputValues;
	}

	friend std::ostream &operator<<(std::ostream &os, const ErrorResult &error)
	{
		os << "Error of " << error.errorInUlp << " ULP for ";
		if(error.inputValues.size() == 1)
			os << std::scientific << error.inputValues.front();
		else if(error.inputValues.size() == 2)
			os << std::scientific << '{' << error.inputValues.front() << ", " << error.inputValues.back() << '}';
		else if(error.inputValues.size() == 3)
			os << std::scientific << '{' << error.inputValues[0] << ", " << error.inputValues[1] << ", "
			   << error.inputValues[2] << '}';
		else
		{
			os << '{';
			for(auto input : error.inputValues)
				os << std::scientific << input << ", ";
			os << '}';
		}
		os << ", expected " << error.expected << ", got " << error.actual << std::defaultfloat;
		return os;
	}
};

template <typename Out, typename In>
static Out bit_cast(In val)
{
	union
	{
		In in;
		Out out;
	} u;
	u.in = val;
	return u.out;
}

static uint32_t calculateError(float reference, float result, uint32_t allowedErrorInUlp)
{
	if(std::isinf(reference) && std::isinf(result) && std::signbit(reference) == std::signbit(result))
		return 0;
	if(std::isnan(reference) && std::isnan(result))
		return 0;
	// auto ulp = std::abs(reference * std::numeric_limits<float>::epsilon());
	// float difference = std::abs(result - reference);
	// if(difference > static_cast<float>(allowedErrorInUlp))
	// 	return static_cast<uint32_t>(std::ceil(difference / ulp));
	// return 0;
	return static_cast<uint32_t>(std::abs(bit_cast<int32_t>(reference) - bit_cast<int32_t>(result)));
}

static std::pair<std::vector<ErrorResult>, uint32_t> checkResults(const std::vector<std::vector<float>> &inputs,
	const std::vector<float> &reference, const std::vector<float> &result, uint32_t allowedErrorInUlp)
{
	std::vector<ErrorResult> errors;
	uint32_t maxError = 0;

	for(std::size_t i = 0; i < std::min(reference.size(), result.size()); ++i)
	{
		auto error = calculateError(reference.at(i), result.at(i), allowedErrorInUlp);
		maxError = std::max(maxError, error);
		if(error > allowedErrorInUlp)
		{
			std::vector<float> errorInputs;
			for(const auto &input : inputs)
				errorInputs.push_back(input.at(i));
			errors.emplace_back(ErrorResult{std::move(errorInputs), reference.at(i), result.at(i), error});
		}
	}

	std::sort(errors.begin(), errors.end());
	return std::make_pair(std::move(errors), maxError);
}

static void runTest(
	const cl::Context &context, const cl::CommandQueue &queue, const Test &test, uint32_t numLinear, uint32_t numRandom)
{
	std::cout << "Running test " << test.sourceFile << " ..." << std::endl;
	std::cout << "\tRunning " << test.ranges.size() << " ranges with " << (numLinear + numRandom) << " values"
			  << std::endl;
	auto kernels = createKernels(context, test);
	std::cout << "\tTesting " << kernels.size() << " implementations " << std::endl;

	for(const auto &range : test.ranges)
	{
		auto inputs = generateInputData(range, numLinear, numRandom, test.reference.numParameters);
		auto inputSize = inputs.front().size();
		cl::NDRange globalSize(inputSize / 16);
		std::vector<float> reference = test.reference(inputs);

		std::vector<cl::Buffer> inputBuffers;
		for(auto &input : inputs)
			inputBuffers.emplace_back(queue, input.begin(), input.end(), true);
		cl::Buffer outputBuffer(context, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, inputSize * sizeof(float));

		for(auto &kernel : kernels)
		{
			kernel.setArg(0, outputBuffer);
			for(std::size_t i = 0; i < inputBuffers.size(); ++i)
				kernel.setArg(1 + i, inputBuffers[i]);

			std::cout << "\tRunning kernel '" << kernel.getInfo<CL_KERNEL_FUNCTION_NAME>() << "' with "
					  << (inputSize / 16) << " work-items ... " << std::endl;
			auto start = std::chrono::steady_clock::now();
			cl::Event kernelEvent{};
			queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, cl::NullRange, nullptr, &kernelEvent);
			kernelEvent.wait();
			auto end = std::chrono::steady_clock::now();
			std::cout << "\t- Finished in "
					  << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() << " us"
					  << std::endl;
			std::chrono::nanoseconds deviceDuration{kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() -
				kernelEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>()};
			std::cout << "\t- Executed for "
					  << std::chrono::duration_cast<std::chrono::microseconds>(deviceDuration).count() << " us"
					  << std::endl;
			if(geteuid() == 0) // TODO only on hardware
			{
				cl_ulong numInstructions = 0;
				kernelEvent.getProfilingInfo(
					CL_PROFILING_PERFORMANCE_COUNTER_INSTRUCTION_COUNT_VC4CL, &numInstructions);
				cl_ulong numCycles = 0;
				kernelEvent.getProfilingInfo(CL_PROFILING_PERFORMANCE_COUNTER_EXECUTION_CYCLES_VC4CL, &numCycles);
				std::cout << "\t- Executed " << numInstructions << " instructions in " << numCycles << " cycles"
						  << std::endl;
			}

			std::vector<float> result(inputSize);
			queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, inputSize * sizeof(float), result.data());
			auto errors = checkResults(inputs, reference, result, test.allowedErrorInUlp);
			std::cout << "\t- Has " << errors.first.size() << " wrong results and a maximum error of " << errors.second
					  << " ULP (of allowed " << test.allowedErrorInUlp << " ULP)" << std::endl;
			for(std::size_t i = 0; i < std::min(errors.first.size(), std::size_t{8}); ++i)
				std::cout << "\t\t" << errors.first[i] << std::endl;
			if(errors.first.size() > 8)
				std::cout << "\t\t[...]" << std::endl;
		}
	}
}

static void printHelp()
{
	std::cout << "Usage: <program> [<options>] <test> [<test>...]" << std::endl;
	std::cout << "Options: " << std::endl;
	std::cout << "\t--help         Shows this help message" << std::endl;
	std::cout << "\t--linear=<num> Specifies the number of linear test values, defaults to " << DEFAULT_NUM_LINEAR
			  << std::endl;
	std::cout << "\t--random=<num> Specifies the number of random test values, defaults to " << DEFAULT_NUM_RANDOM
			  << std::endl;
	std::cout << "Available tests: ";
	for(const auto &test : floatTests)
		std::cout << test.name << ", ";
	std::cout << std::endl;
}

int main(int argc, char **argv)
{
	uint32_t numLinear = DEFAULT_NUM_LINEAR;
	uint32_t numRandom = DEFAULT_NUM_RANDOM;

	if(argc < 2)
	{
		printHelp();
		return EXIT_SUCCESS;
	}

	auto platform = cl::Platform::get();
	cl::Device device{};
	{
		std::vector<cl::Device> devices;
		platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
		if(devices.empty())
		{
			std::cout << "No device found!" << std::endl;
			return EXIT_FAILURE;
		}
		device = devices.front();
	}
	cl::Context context(device);
	cl::CommandQueue queue(context, CL_QUEUE_PROFILING_ENABLE);

	std::vector<std::reference_wrapper<const Test>> selectedTests;
	for(int i = 1; i < argc; ++i)
	{
		if(argv[i][0] == '-')
		{
			if(std::string{"--help"} == argv[i])
			{
				printHelp();
				return EXIT_SUCCESS;
			}
			else if(strstr(argv[i], "--linear=") == argv[i])
				numLinear = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--linear=")));
			else if(strstr(argv[i], "--random=") == argv[i])
				numRandom = static_cast<uint32_t>(std::atoi(argv[i] + strlen("--random=")));
			else
			{
				std::cout << "Unknown option: " << argv[i] << std::endl;
				printHelp();
				return EXIT_FAILURE;
			}
		}
		auto testIt =
			std::find_if(floatTests.begin(), floatTests.end(), [&](const Test &test) { return test.name == argv[i]; });
		if(testIt != floatTests.end())
			selectedTests.emplace_back(std::cref(*testIt));
		else
		{
			std::cout << "No such test '" << argv[i] << "', available tests: ";
			for(const auto &test : floatTests)
				std::cout << test.name << ", ";
			std::cout << std::endl;
			return EXIT_FAILURE;
		}
	}

	for(const auto &test : selectedTests)
		runTest(context, queue, test.get(), numLinear, numRandom);

	return EXIT_SUCCESS;
}