gpu_burn-drv.cpp

/* 
 * Copyright (c) 2016, Ville Timonen
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * 
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the FreeBSD Project.
 */

#define SIZE 2048ul // Matrices are SIZE*SIZE..  2048^2 should be efficiently implemented in CUBLAS
#define USEMEM 0.9 // Try to allocate 90% of memory

// Used to report op/s, measured through Visual Profiler, CUBLAS from CUDA 7.5
// (Seems that they indeed take the naive dim^3 approach)
#define OPS_PER_MUL 17188257792ul

#include <cstdio>
#include <cstring>
#include <string>
#include <map>
#include <vector>
#include <sys/types.h>
#include <signal.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include <fstream>

#include <cuda.h>
#include "cublas_v2.h"

void checkError(int rCode, std::string desc = "") {
	static std::map<int, std::string> g_errorStrings;
	if (!g_errorStrings.size()) {
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_VALUE, "CUDA_ERROR_INVALID_VALUE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_OUT_OF_MEMORY, "CUDA_ERROR_OUT_OF_MEMORY"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_INITIALIZED, "CUDA_ERROR_NOT_INITIALIZED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_DEINITIALIZED, "CUDA_ERROR_DEINITIALIZED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NO_DEVICE, "CUDA_ERROR_NO_DEVICE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_DEVICE, "CUDA_ERROR_INVALID_DEVICE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_IMAGE, "CUDA_ERROR_INVALID_IMAGE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_CONTEXT, "CUDA_ERROR_INVALID_CONTEXT"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_MAP_FAILED, "CUDA_ERROR_MAP_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNMAP_FAILED, "CUDA_ERROR_UNMAP_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ARRAY_IS_MAPPED, "CUDA_ERROR_ARRAY_IS_MAPPED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ALREADY_MAPPED, "CUDA_ERROR_ALREADY_MAPPED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NO_BINARY_FOR_GPU, "CUDA_ERROR_NO_BINARY_FOR_GPU"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ALREADY_ACQUIRED, "CUDA_ERROR_ALREADY_ACQUIRED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED, "CUDA_ERROR_NOT_MAPPED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED_AS_POINTER, "CUDA_ERROR_NOT_MAPPED_AS_POINTER"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNSUPPORTED_LIMIT, "CUDA_ERROR_UNSUPPORTED_LIMIT"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_SOURCE, "CUDA_ERROR_INVALID_SOURCE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_FILE_NOT_FOUND, "CUDA_ERROR_FILE_NOT_FOUND"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_OPERATING_SYSTEM, "CUDA_ERROR_OPERATING_SYSTEM"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_HANDLE, "CUDA_ERROR_INVALID_HANDLE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_FOUND, "CUDA_ERROR_NOT_FOUND"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_READY, "CUDA_ERROR_NOT_READY"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_FAILED, "CUDA_ERROR_LAUNCH_FAILED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_TIMEOUT, "CUDA_ERROR_LAUNCH_TIMEOUT"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_CONTEXT_IS_DESTROYED, "CUDA_ERROR_CONTEXT_IS_DESTROYED"));
		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNKNOWN, "CUDA_ERROR_UNKNOWN"));
	}

	if (rCode != CUDA_SUCCESS)
		throw ((desc == "") ? 
				std::string("Error: ") : 
				(std::string("Error in \"") + desc + std::string("\": "))) + 
			g_errorStrings[rCode];
}

void checkError(cublasStatus_t rCode, std::string desc = "") {
	static std::map<cublasStatus_t, std::string> g_errorStrings;
	if (!g_errorStrings.size()) {
		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS_STATUS_NOT_INITIALIZED"));
		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_ALLOC_FAILED, "CUBLAS_STATUS_ALLOC_FAILED"));
		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS_STATUS_INVALID_VALUE"));
		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_ARCH_MISMATCH, "CUBLAS_STATUS_ARCH_MISMATCH"));
		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_MAPPING_ERROR, "CUBLAS_STATUS_MAPPING_ERROR"));
		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_EXECUTION_FAILED, "CUBLAS_STATUS_EXECUTION_FAILED"));
		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_INTERNAL_ERROR, "CUBLAS_STATUS_INTERNAL_ERROR"));
	}

	if (rCode != CUBLAS_STATUS_SUCCESS)
		throw ((desc == "") ? 
				std::string("Error: ") : 
				(std::string("Error in \"") + desc + std::string("\": "))) + 
			g_errorStrings[rCode];
}

double getTime()
{
	struct timeval t;
	gettimeofday(&t, NULL);
	return (double)t.tv_sec + (double)t.tv_usec / 1e6;
}

bool g_running = false;

template <class T> class GPU_Test {
	public:
	GPU_Test(int dev, bool doubles, bool tensors) : 
			d_devNumber(dev), d_doubles(doubles), d_tensors(tensors) {
		checkError(cuDeviceGet(&d_dev, d_devNumber));
		checkError(cuCtxCreate(&d_ctx, 0, d_dev));

		bind();

		//checkError(cublasInit());
		checkError(cublasCreate(&d_cublas), "init");

		if(d_tensors)
			checkError(cublasSetMathMode(d_cublas, CUBLAS_TENSOR_OP_MATH));

		checkError(cuMemAllocHost((void**)&d_faultyElemsHost, sizeof(int)));
		d_error = 0;

		g_running = true;

		struct sigaction action;
		memset(&action, 0, sizeof(struct sigaction));
		action.sa_handler = termHandler;
		sigaction(SIGTERM, &action, NULL);
	}
	~GPU_Test() {
		bind();
		checkError(cuMemFree(d_Cdata), "Free A");
		checkError(cuMemFree(d_Adata), "Free B");
		checkError(cuMemFree(d_Bdata), "Free C");
		cuMemFreeHost(d_faultyElemsHost);
		printf("Freed memory for dev %d\n", d_devNumber);

		cublasDestroy(d_cublas);
		printf("Uninitted cublas\n");
	}

	static void termHandler(int signum)
	{
		g_running = false;
	}

	unsigned long long int getErrors() {
		if (*d_faultyElemsHost) {
			d_error += (long long int)*d_faultyElemsHost;
		}
		unsigned long long int tempErrs = d_error;
		d_error = 0;
		return tempErrs;
	}

	size_t getIters() {
		return d_iters;
	}

	void bind() {
		checkError(cuCtxSetCurrent(d_ctx), "Bind CTX");
	}

	size_t totalMemory() {
		bind();
		size_t freeMem, totalMem;
		checkError(cuMemGetInfo(&freeMem, &totalMem));
		return totalMem;
	}

	size_t availMemory() {
		bind();
		size_t freeMem, totalMem;
		checkError(cuMemGetInfo(&freeMem, &totalMem));
		return freeMem;
	}

	void initBuffers(T *A, T *B, ssize_t useBytes=0) {
		bind();

		if(useBytes == 0)
            useBytes = (ssize_t)((double)availMemory()*USEMEM);
		if(useBytes < 0)
           useBytes = (ssize_t)((double)availMemory()*(-useBytes/100.0));

		printf("Initialized device %d with %lu MB of memory (%lu MB available, using %lu MB of it), %s%s\n",
				d_devNumber, totalMemory()/1024ul/1024ul, availMemory()/1024ul/1024ul, useBytes/1024ul/1024ul,
				d_doubles ? "using DOUBLES" : "using FLOATS", d_tensors ? ", using Tensor Cores" : "");
		size_t d_resultSize = sizeof(T)*SIZE*SIZE;
		d_iters = (useBytes - 2*d_resultSize)/d_resultSize; // We remove A and B sizes
		printf("Results are %zu bytes each, thus performing %zu iterations\n", d_resultSize, d_iters);
        if((size_t)useBytes < 3*d_resultSize)
            throw std::string("Low mem for result. aborting.\n");
		checkError(cuMemAlloc(&d_Cdata, d_iters*d_resultSize), "C alloc");
		checkError(cuMemAlloc(&d_Adata, d_resultSize), "A alloc");
		checkError(cuMemAlloc(&d_Bdata, d_resultSize), "B alloc");

		checkError(cuMemAlloc(&d_faultyElemData, sizeof(int)), "faulty data");

		// Populating matrices A and B
		checkError(cuMemcpyHtoD(d_Adata, A, d_resultSize), "A -> device");
		checkError(cuMemcpyHtoD(d_Bdata, B, d_resultSize), "A -> device");

		initCompareKernel();
	}

	void compute() {
		bind();
		static const float alpha = 1.0f;
		static const float beta = 0.0f;
		static const double alphaD = 1.0;
		static const double betaD = 0.0;

		for (size_t i = 0; i < d_iters; ++i) {
			if (d_doubles)
				checkError(cublasDgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N,
							SIZE, SIZE, SIZE, &alphaD,
							(const double*)d_Adata, SIZE,
							(const double*)d_Bdata, SIZE,
							&betaD, 
							(double*)d_Cdata + i*SIZE*SIZE, SIZE), "DGEMM");
			else
				checkError(cublasSgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N,
							SIZE, SIZE, SIZE, &alpha,
							(const float*)d_Adata, SIZE,
							(const float*)d_Bdata, SIZE,
							&beta, 
							(float*)d_Cdata + i*SIZE*SIZE, SIZE), "SGEMM");
		}
	}

	void initCompareKernel() {
		const char *kernelFile = "compare.ptx";
		{
			std::ifstream f(kernelFile);
			checkError(f.good() ? CUDA_SUCCESS : CUDA_ERROR_NOT_FOUND, std::string("couldn't find file \"") + kernelFile + "\" from working directory");
		}
		checkError(cuModuleLoad(&d_module, kernelFile), "load module");
		checkError(cuModuleGetFunction(&d_function, d_module, 
					d_doubles ? "compareD" : "compare"), "get func");

		checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_L1), "L1 config");
		checkError(cuParamSetSize(d_function, __alignof(T*) + __alignof(int*) + __alignof(size_t)), "set param size");
		checkError(cuParamSetv(d_function, 0, &d_Cdata, sizeof(T*)), "set param");
		checkError(cuParamSetv(d_function, __alignof(T*), &d_faultyElemData, sizeof(T*)), "set param");
		checkError(cuParamSetv(d_function, __alignof(T*) + __alignof(int*), &d_iters, sizeof(size_t)), "set param");

		checkError(cuFuncSetBlockShape(d_function, g_blockSize, g_blockSize, 1), "set block size");
	}

	void compare() {
		checkError(cuMemsetD32Async(d_faultyElemData, 0, 1, 0), "memset");
		checkError(cuLaunchGridAsync(d_function, SIZE/g_blockSize, SIZE/g_blockSize, 0), "Launch grid");
		checkError(cuMemcpyDtoHAsync(d_faultyElemsHost, d_faultyElemData, sizeof(int), 0), "Read faultyelemdata");
	}

	bool shouldRun()
	{
		return g_running;
	}

	private:
	bool d_doubles;
	bool d_tensors;
	int d_devNumber;
	size_t d_iters;
	size_t d_resultSize;

	long long int d_error;

	static const int g_blockSize = 16;

	CUdevice d_dev;
	CUcontext d_ctx;
	CUmodule d_module;
	CUfunction d_function;

	CUdeviceptr d_Cdata;
	CUdeviceptr d_Adata;
	CUdeviceptr d_Bdata;
	CUdeviceptr d_faultyElemData;
	int *d_faultyElemsHost;

	cublasHandle_t d_cublas;
};

// Returns the number of devices
int initCuda() {
	checkError(cuInit(0));
	int deviceCount = 0;
	checkError(cuDeviceGetCount(&deviceCount));

	if (!deviceCount)
		throw std::string("No CUDA devices");

	#ifdef USEDEV
	if (USEDEV >= deviceCount)
		throw std::string("Not enough devices for USEDEV");
	#endif

	return deviceCount;
}

template<class T> void startBurn(int index, int writeFd, T *A, T *B, bool doubles, bool tensors, ssize_t useBytes) {
	GPU_Test<T> *our;
	try {
		our = new GPU_Test<T>(index, doubles, tensors);
		our->initBuffers(A, B, useBytes);
	} catch (std::string e) {
		fprintf(stderr, "Couldn't init a GPU test: %s\n", e.c_str());
		exit(124);
	}

	// The actual work
	try {
		int eventIndex = 0;
		const int maxEvents = 2;
		CUevent events[maxEvents];
		for (int i = 0; i < maxEvents; ++i)
			cuEventCreate(events + i, 0);

		int nonWorkIters = maxEvents;

		while (our->shouldRun()) {
			our->compute();
			our->compare();
			checkError(cuEventRecord(events[eventIndex], 0), "Record event");

			eventIndex = ++eventIndex % maxEvents;

			while (cuEventQuery(events[eventIndex]) != CUDA_SUCCESS) usleep(1000);

			if (--nonWorkIters > 0) continue;

			int ops = our->getIters();
			write(writeFd, &ops, sizeof(int));
			ops = our->getErrors();
			write(writeFd, &ops, sizeof(int));
		}

		for (int i = 0; i < maxEvents; ++i)
			cuEventSynchronize(events[i]);
		delete our;
	} catch (std::string e) {
		fprintf(stderr, "Failure during compute: %s\n", e.c_str());
		int ops = -1;
		// Signalling that we failed
		write(writeFd, &ops, sizeof(int));
		write(writeFd, &ops, sizeof(int));
		exit(111);
	}
}

int pollTemp(pid_t *p) {
	int tempPipe[2];
	pipe(tempPipe);
	
	pid_t myPid = fork();

	if (!myPid) {
		close(tempPipe[0]);
		dup2(tempPipe[1], STDOUT_FILENO); // Stdout
		execlp("nvidia-smi", "nvidia-smi", "-l", "5", "-q", "-d", "TEMPERATURE", NULL);
		fprintf(stderr, "Could not invoke nvidia-smi, no temps available\n");
		
		exit(0);
	}

	*p = myPid;
	close(tempPipe[1]);

	return tempPipe[0];
}

void updateTemps(int handle, std::vector<int> *temps) {
	const int readSize = 10240;
	static int gpuIter = 0;
	char data[readSize+1];

	int curPos = 0;
	do {
		read(handle, data+curPos, sizeof(char));
	} while (data[curPos++] != '\n');

	data[curPos-1] = 0;

	int tempValue;
	// FIXME: The syntax of this print might change in the future..
	if (sscanf(data, "        GPU Current Temp            : %d C", &tempValue) == 1) {
		//printf("read temp val %d\n", tempValue);
		temps->at(gpuIter) = tempValue;
		gpuIter = (gpuIter+1)%(temps->size());
	} else if (!strcmp(data, "        Gpu                     : N/A"))
		gpuIter = (gpuIter+1)%(temps->size()); // We rotate the iterator for N/A values as well
}

void listenClients(std::vector<int> clientFd, std::vector<pid_t> clientPid, int runTime) {
	fd_set waitHandles;
	
	pid_t tempPid;
	int tempHandle = pollTemp(&tempPid);
	int maxHandle = tempHandle;

	FD_ZERO(&waitHandles);
	FD_SET(tempHandle, &waitHandles);

	for (size_t i = 0; i < clientFd.size(); ++i) {
		if (clientFd.at(i) > maxHandle)
			maxHandle = clientFd.at(i);
		FD_SET(clientFd.at(i), &waitHandles);
	}

	std::vector<int> clientTemp;
	std::vector<int> clientErrors;
	std::vector<int> clientCalcs;
	std::vector<struct timespec> clientUpdateTime;
	std::vector<float> clientGflops;
	std::vector<bool> clientFaulty;

	time_t startTime = time(0);

	for (size_t i = 0; i < clientFd.size(); ++i) {
		clientTemp.push_back(0);
		clientErrors.push_back(0);
		clientCalcs.push_back(0);
		struct timespec thisTime;
		clock_gettime(CLOCK_REALTIME, &thisTime);
		clientUpdateTime.push_back(thisTime);
		clientGflops.push_back(0.0f);
		clientFaulty.push_back(false);
	}
	
	int changeCount;
	float nextReport = 10.0f;
	bool childReport = false;
	while ((changeCount = select(maxHandle+1, &waitHandles, NULL, NULL, NULL))) {
		size_t thisTime = time(0);
		struct timespec thisTimeSpec;
		clock_gettime(CLOCK_REALTIME, &thisTimeSpec);

		//printf("got new data! %d\n", changeCount);
		// Going through all descriptors
		for (size_t i = 0; i < clientFd.size(); ++i)
			if (FD_ISSET(clientFd.at(i), &waitHandles)) {
				// First, reading processed
				int processed, errors;
				int res = read(clientFd.at(i), &processed, sizeof(int));
                if(res < sizeof(int))
                {
                    fprintf(stderr,"read[%zu] error %d", i, res);
                    processed = -1;
                }
				// Then errors
				read(clientFd.at(i), &errors, sizeof(int));

				clientErrors.at(i) += errors;
				if (processed == -1)
					clientCalcs.at(i) = -1;
				else
				{
					double flops = (double)processed * (double)OPS_PER_MUL;
					struct timespec clientPrevTime = clientUpdateTime.at(i);
					double clientTimeDelta = (double)thisTimeSpec.tv_sec + (double)thisTimeSpec.tv_nsec / 1000000000.0 - ((double)clientPrevTime.tv_sec + (double)clientPrevTime.tv_nsec / 1000000000.0);
					clientUpdateTime.at(i) = thisTimeSpec;

					clientGflops.at(i) = (double)((unsigned long long int)processed * OPS_PER_MUL) / clientTimeDelta / 1000.0 / 1000.0 / 1000.0;
					clientCalcs.at(i) += processed;
				}

				childReport = true;
			}

		if (FD_ISSET(tempHandle, &waitHandles))
			updateTemps(tempHandle, &clientTemp);
		
		// Resetting the listeners
		FD_ZERO(&waitHandles);
		FD_SET(tempHandle, &waitHandles);
		for (size_t i = 0; i < clientFd.size(); ++i)
			FD_SET(clientFd.at(i), &waitHandles);

		// Printing progress (if a child has initted already)
		if (childReport) {
			float elapsed = fminf((float)(thisTime-startTime)/(float)runTime*100.0f, 100.0f);
			printf("\r%.1f%%  ", elapsed);
			printf("proc'd: ");
			for (size_t i = 0; i < clientCalcs.size(); ++i) {
				printf("%d (%.0f Gflop/s) ", clientCalcs.at(i), clientGflops.at(i));
				if (i != clientCalcs.size() - 1)
					printf("- ");
			}
			printf("  errors: ");
			for (size_t i = 0; i < clientErrors.size(); ++i) {
				std::string note = "%d ";
				if (clientCalcs.at(i) == -1)
					note += " (DIED!)";
				else if (clientErrors.at(i))
					note += " (WARNING!)";

				printf(note.c_str(), clientErrors.at(i));
				if (i != clientCalcs.size() - 1)
					printf("- ");
			}
			printf("  temps: ");
			for (size_t i = 0; i < clientTemp.size(); ++i) {
				printf(clientTemp.at(i) != 0 ? "%d C " : "-- ", clientTemp.at(i));
				if (i != clientCalcs.size() - 1)
					printf("- ");
			}
			
			fflush(stdout);

			if (nextReport < elapsed) {
				nextReport = elapsed + 10.0f;
				printf("\n\tSummary at:   ");
				fflush(stdout);
				system("date"); // Printing a date
				fflush(stdout);
				printf("\n");
				//printf("\t(checkpoint)\n");
				for (size_t i = 0; i < clientErrors.size(); ++i) {
					if (clientErrors.at(i))
						clientFaulty.at(i) = true;
					clientErrors.at(i) = 0;
				}
			}
		}

		// Checking whether all clients are dead
		bool oneAlive = false;
		for (size_t i = 0; i < clientCalcs.size(); ++i)
			if (clientCalcs.at(i) != -1)
				oneAlive = true;
		if (!oneAlive) {
			fprintf(stderr, "\n\nNo clients are alive!  Aborting\n");
			exit(123);
		}

		if (startTime + runTime < thisTime)
			break;
	}

	printf("\nKilling processes.. ");
	fflush(stdout);
	for (size_t i = 0; i < clientPid.size(); ++i)
		kill(clientPid.at(i), 15);
	
	kill(tempPid, 15);
	close(tempHandle);

	while (wait(NULL) != -1);
	printf("done\n");

	printf("\nTested %d GPUs:\n", (int)clientPid.size());
	for (size_t i = 0; i < clientPid.size(); ++i)
		printf("\tGPU %d: %s\n", (int)i, clientFaulty.at(i) ? "FAULTY" : "OK");
}

template<class T> void launch(int runLength, bool useDoubles, bool useTensorCores, ssize_t useBytes) {
	system("nvidia-smi -L");

	// Initting A and B with random data
	T *A = (T*) malloc(sizeof(T)*SIZE*SIZE);
	T *B = (T*) malloc(sizeof(T)*SIZE*SIZE);
	srand(10);
	for (size_t i = 0; i < SIZE*SIZE; ++i) {
		A[i] = (T)((double)(rand()%1000000)/100000.0);
		B[i] = (T)((double)(rand()%1000000)/100000.0);
	}

	// Forking a process..  This one checks the number of devices to use,
	// returns the value, and continues to use the first one.
	int mainPipe[2];
	pipe(mainPipe);
	int readMain = mainPipe[0];
	std::vector<int> clientPipes;
	std::vector<pid_t> clientPids;
	clientPipes.push_back(readMain);

	pid_t myPid = fork();
	if (!myPid) {
		// Child
		close(mainPipe[0]);
		int writeFd = mainPipe[1];
		int devCount = initCuda();
		write(writeFd, &devCount, sizeof(int));

		startBurn<T>(0, writeFd, A, B, useDoubles, useTensorCores, useBytes);

		close(writeFd);
		return;
	} else {
		clientPids.push_back(myPid);

		close(mainPipe[1]);
		int devCount;
	    read(readMain, &devCount, sizeof(int));

		if (!devCount) {
			fprintf(stderr, "No CUDA devices\n");
			exit(EXIT_FAILURE);
		} else {

			for (int i = 1; i < devCount; ++i) {
				int slavePipe[2];
				pipe(slavePipe);
				clientPipes.push_back(slavePipe[0]);

				pid_t slavePid = fork();

				if (!slavePid) {
					// Child
					close(slavePipe[0]);
					initCuda();
					startBurn<T>(i, slavePipe[1], A, B, useDoubles, useTensorCores, useBytes);

					close(slavePipe[1]);
					return;
				} else {
					clientPids.push_back(slavePid);
					close(slavePipe[1]);
				}
			}
			
			listenClients(clientPipes, clientPids, runLength);
		}
	}

	for (size_t i = 0; i < clientPipes.size(); ++i)
		close(clientPipes.at(i));

	free(A);
	free(B);
}

void showHelp() {
	printf("GPU Burn\n");
	printf("Usage: gpu_burn [OPTIONS] [TIME]\n\n");
    printf("-m <MBytes>\tUse such mem. >48/96M.\n");
    printf("-m <N>%%\tUse such %% of free mem. Default is%d%%\n", (int)(USEMEM*100));
	printf("-d\tUse doubles\n");
	printf("-tc\tUse Tensor cores\n");
	printf("-h\tShow this help message\n\n");
	printf("Example:\n");
	printf("gpu-burn -d 3600\n");
}

// NNN MB
// NN% <0
// 0 --- error
ssize_t decodeUSEMEM(const char* s){
    char* s2;
    int64_t r = strtoll(s,&s2,10);
    if(s==s2)
        return 0;
    if(*s2 == '%')
        return (s2[1] == 0) ? -r : 0;
    return (*s2 == 0) ? r*1024*1024 : 0;
}

int main(int argc, char **argv) {
	int runLength = 10;
	bool useDoubles = false;
	bool useTensorCores = false;
	int thisParam = 0;
    ssize_t useBytes = 0; // 0 == use USEMEM% of free mem

	std::vector<std::string> args(argv, argv + argc);
	for (size_t i = 1; i < args.size(); ++i)
	{
		if (argc >= 2 && std::string(argv[i]).find("-h") != std::string::npos)
		{
			showHelp();
			return 0;
		}
		if (argc >= 2 && std::string(argv[i]).find("-d") != std::string::npos)
		{
			useDoubles = true;
			thisParam++;
		}
		if (argc >= 2 && std::string(argv[i]).find("-tc") != std::string::npos)
		{
			useTensorCores = true;
			thisParam++;
		}
		if (argc >= 2 && strncmp(argv[i],"-m",2)==0)
		{
            thisParam++;

            // -mNNN[%]
            // -m NNN[%]
            if(argv[i][2]){
                useBytes = decodeUSEMEM(argv[i]+2);

            } else if(i+1 < args.size()){
                i++;
			    thisParam++;
                useBytes = decodeUSEMEM(argv[i]);
            }else{
                fprintf(stderr,"invalid format of memusage param (-m)\n");
                exit(1);
            }
            if(useBytes == 0)
            {
                fprintf(stderr,"Can't decode memusage param (-m)\n");
                exit(1);
            }
		}
	}
	
	if (argc-thisParam < 2)
		printf("Run length not specified in the command line.  Burning for 10 secs\n");
	else 
		runLength = atoi(argv[1+thisParam]);

	if (useDoubles)
		launch<double>(runLength, useDoubles, useTensorCores, useBytes);
	else
		launch<float>(runLength, useDoubles, useTensorCores, useBytes);

	return 0;
}