diff --git a/README.md b/README.md
new file mode 100644
index 0000000..95f1956
--- /dev/null
+++ b/README.md
@@ -0,0 +1,8 @@
+# SBCBinaryFormat
+
+Here you will find all the code required to write and read the SBC binary format.
+
+Languages supported:
+
+* C++
+* Python
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
new file mode 100644
index 0000000..1392aa4
--- /dev/null
+++ b/cpp/CMakeLists.txt
@@ -0,0 +1,35 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+# ---- Project ----
+
+# Note: update this to your new project's name and version
+project(
+  SBCBinaryFormat
+  VERSION 0.5.0
+  LANGUAGES CXX C
+)
+
+option(SBC_BINARY_FORMAT_TESTS "Enable the SBCBinaryFormat tests" OFF)
+
+# Note: globbing sources is considered bad practice as CMake's generators may not detect new files
+# automatically. Keep that in mind when changing files, or explicitly mention them here.
+file(GLOB_RECURSE headers CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/include/*.h")
+
+# ---- Create library ----
+
+# Note: for header-only libraries change all PUBLIC flags to INTERFACE and create an interface
+# target: add_library(${PROJECT_NAME} INTERFACE)
+add_library(${PROJECT_NAME} INTERFACE ${headers})
+
+target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_20)
+#set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
+
+target_include_directories(
+  ${PROJECT_NAME} INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+)
+
+add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
+
+if(SBC_BINARY_FORMAT_TESTS)
+  add_subdirectory(test)
+endif()
diff --git a/cpp/README.md b/cpp/README.md
new file mode 100644
index 0000000..21f58e7
--- /dev/null
+++ b/cpp/README.md
@@ -0,0 +1,24 @@
+# C++ SBC Binary Format driver
+
+A header-only library for the SBC binary format.
+
+To install this library, there are two options:
+
+1. Copy everything inside the "include" folder into your project include folder.
+   Done!
+2. Clone this GitHub repository anywhere in your project and add
+   `add_subdirectory(LOCATION_OF_THIS_LIBRARY)` to your "CMakeLists.txt". Then, in
+   `target_link_libraries`, add `SBCBinaryFormat::SBCBinaryFormat`. Done!
+
+# Examples
+
+There are tests (or examples) inside "test/source/basic_test.cpp" that can be compiled to verify the library works. Their original intent is to make sure the code compiles on your computer and actually runs with little to no bugs.
+
+They use a library called doctest, so some of the code might look unusual.
+
+# TODO:
+
+* Better examples.
+* Complete tests.
+* Bug hunt, are there bugs?
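+
+Below is a minimal write-then-read sketch distilled from
+"test/source/basic_test.cpp". The file name `example.sbc.bin` and the column
+layout are arbitrary choices for illustration, not part of the library:
+
+```cpp
+#include <array>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "SBCBinaryFormat/Reader.hpp"
+#include "SBCBinaryFormat/Writer.hpp"
+
+int main() {
+    using namespace SBC::BinaryFormat;
+
+    {
+        // Two columns: a scalar int32 "t" (rank 1, size 1) and a
+        // 3x2 double array "x" (rank 2, sizes 3 and 2).
+        std::array<std::string, 2> names = {"t", "x"};
+        std::array<std::size_t, 2> ranks = {1, 2};
+        std::vector<std::size_t> sizes = {1, 3, 2};
+        DynamicWriter<int32_t, double> writer("example.sbc.bin",
+                                              names, ranks, sizes);
+
+        std::vector<int32_t> t = {1};
+        std::vector<double> x = {3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+        writer.save(t, x);
+    }   // the writer flushes and closes here
+
+    // Read the file back with the matching types and column names.
+    using Types = Tools::STDVectorColumnTypes<int32_t, double>;
+    using Names = Tools::ColumnNames<"t", "x">;
+    DynamicStreamer<Types, Names> streamer("example.sbc.bin");
+    auto row = streamer.at(0);  // a tuple of vectors, one per column
+}
+```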
\ No newline at end of file
diff --git a/cpp/include/SBCBinaryFormat/Reader.hpp b/cpp/include/SBCBinaryFormat/Reader.hpp
new file mode 100644
index 0000000..bcf5d7d
--- /dev/null
+++ b/cpp/include/SBCBinaryFormat/Reader.hpp
@@ -0,0 +1,444 @@
+#ifndef SBC_BINARYFORMAT_READER_H
+#define SBC_BINARYFORMAT_READER_H
+#pragma once
+
+// C STD includes
+// C 3rd party includes
+// C++ STD includes
+#include <algorithm>
+#include <array>
+#include <bit>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+// C++ 3rd party includes
+// my includes
+#include "SBCBinaryFormat/Utilities.hpp"
+
+namespace SBC::BinaryFormat {
+
+struct StreamerTraits {
+    using index_type = std::size_t;
+    // The max size of any dimension read by DynamicStreamer
+    constexpr static uint32_t MAX_BUFFER_SIZE = 32000;
+
+    constexpr static uint64_t MAX_FILE_SIZE_RAM_MB = 8000;  // MB
+
+    constexpr static double FILE_TOLERANCE = 1e-3;
+};
+
+template<typename Types, typename ColumnNames, typename Traits = StreamerTraits>
+class DynamicStreamer {
+    using index_type = typename Traits::index_type;
+
+    // These members are defined entirely by their types
+    static Types _types;
+    static ColumnNames _column_names;
+
+    using data_type = typename Types::data_type;
+
+    // Checks that Types and ColumnNames have the same size
+    static_assert(_types.size == _column_names.size);
+
+    std::string _file_name;
+    std::fstream _stream;
+    bool _swap_buffer = false;
+
+    bool _is_all_in_ram = false;
+    // This should be a tuple of vectors...
+    // where all our dynamically allocated memory should be.
+    // Note: this data is UNROLLED. In other words: only 1 dimensionality to
+    // the arrays. The sizes and dimensions are stored in the vectors below
+    data_type _data;
+    std::vector<char> _data_buffer;
+
+    // Sizes of each dimension of the data vectors: read from the file.
+    std::vector<index_type> _type_sizes;
+    // Pseudo-dimensions of each data vector: read directly from the file.
+    std::vector<index_type> _type_dims;
+
+    std::vector<index_type> _type_total_size;
+    // Product of all sizes times the sizeof of the type, in bytes
+    std::vector<index_type> _type_total_sizes_in_B;
+    // Positions in bytes of each parameter in the line
+    std::vector<index_type> _type_offset_in_B;
+
+    std::uintmax_t _file_size_in_B;
+    index_type _line_size_in_B = 0;
+    index_type _header_size_in_B = 0;
+    index_type _num_elements;
+    index_type _block_size_in_lines;
+    index_type _block_size_in_B;
+    index_type _num_blocks;
+    index_type _current_block;
+
+    // Helper for the function below
+    template<std::size_t... I>
+    void _allocate_memory_helper(std::index_sequence<I...>) {
+        (std::get<I>(_data).resize(_block_size_in_lines * _type_total_size[I]), ...);
+    }
+
+    // Allocates the data buffers to the sizes found in _type_total_sizes_in_B
+    // times the block size, which should be all the memory we need.
+    void _allocate_memory() {
+        _allocate_memory_helper(std::make_index_sequence<num_columns>{});
+    }
+
+    template<std::size_t I, typename T>
+    void _decode_to_vector(T& column, const std::size_t& lines,
+                           std::span<const char> data_buffer_span) {
+        const auto& offset_in_line_in_B = _type_offset_in_B[I];
+        const auto& length = _type_total_sizes_in_B[I];
+        const auto& column_size = _type_total_size[I];
+
+        for (std::size_t i = 0; i < lines; i++) {
+//            auto curr_data = data_buffer_span.subspan(i*_line_size_in_B + offset_in_line_in_B, length);
+            std::memcpy(&column[i*column_size],
+                        &data_buffer_span[i*_line_size_in_B + offset_in_line_in_B],
+                        length);
+        }
+    }
+
+    template<std::size_t... I>
+    void _decode_buffer_helper(const std::size_t& lines,
+                               std::index_sequence<I...>) {
+        (_decode_to_vector<I>(std::get<I>(_data), lines, _data_buffer), ...);
+    }
+
+    void _decode_buffer(const std::size_t& lines) {
+        _decode_buffer_helper(lines, std::make_index_sequence<num_columns>{});
+    }
+
+    // Helper function to read a value of type T from the file
+    template<typename T>
+    T _get_type() {
+        constexpr std::size_t size = sizeof(T);
+        std::array<char, size> buff;
+        _stream.read(buff.data(), size);
+
+        if (_swap_buffer) {
+            std::reverse(buff.begin(), buff.end());
+        }
+
+        T out;
+        std::memcpy(&out, buff.data(), size);
+        return out;
+    }
+ public:
+    constexpr static auto sbc_type_strings = Types::type_str;
+    constexpr static auto column_names = _column_names.names;
+    constexpr static auto type_sizes = Types::type_sizes;
+    constexpr static std::size_t num_columns = _types.size;
+    constexpr static std::endian system_endian = std::endian::native;
+    constexpr static uint32_t system_sbc_endian = 0x01020304;
+
+    explicit DynamicStreamer(std::string_view file_name,
+                             const std::size_t& max_size = 1000,
+                             const std::size_t& block_size = 65536) :
+        _file_name{file_name},
+        _block_size_in_lines{block_size}
+    {
+        // Reads the header and verifies it is compatible with the template
+        if (not std::filesystem::exists(file_name)) {
+            throw std::invalid_argument("File does not exist.");
+        }
+
+        if (std::filesystem::is_empty(file_name)) {
+            throw std::invalid_argument("File exists but it is empty.");
+        }
+
+        _stream.open(_file_name, std::ios::in | std::ios::binary);
+
+        if (not _stream.is_open()) {
+            throw std::runtime_error("File could not be opened.");
+        }
+
+        _file_size_in_B = std::filesystem::file_size(_file_name);
+
+        std::uintmax_t actual_max_size_in_B =
+            max_size > Traits::MAX_FILE_SIZE_RAM_MB ?
+                Traits::MAX_FILE_SIZE_RAM_MB : max_size;
+
+        // We transform to bytes and divide by 2 because the memory
+        // is split in two buffers: one for the decoded data, and one for
+        // the raw buffer, which is the same size as the decoded data.
+        actual_max_size_in_B *= 0.5e6;
+        _is_all_in_ram = _file_size_in_B < actual_max_size_in_B;
+
+        uint32_t sbc_file_endian = _get_type<uint32_t>();
+        uint16_t header_length = _get_type<uint16_t>();
+
+        _swap_buffer = sbc_file_endian != system_sbc_endian;
+
+        std::vector<char> buff(header_length);
+        _stream.read(buff.data(), header_length);
+        std::string_view header(buff.data(), header_length);
+
+        auto header_split = _split(header, ";");
+        if ((header_split.size() - 1) % 3 != 0) {
+            throw std::runtime_error("The number of items found in the header "
+                                     "should always come in multiples of 3.");
+        }
+
+        if (static_cast<std::size_t>(header_split.size() / 3) != num_columns) {
+            throw std::runtime_error("Incompatible number of elements in the "
+                                     "header: " + std::string(header));
+        }
+
+        for (std::size_t i = 0; i < header_split.size() - 1; i += 3) {
+            auto j = static_cast<std::size_t>(i / 3);
+            auto column_name = header_split[i];
+            auto type = header_split[i + 1];
+            auto raw_sizes_str = header_split[i + 2];
+
+            // Checks to see if the header is compatible with what
+            // was provided
+            if (column_name != column_names[j]) {
+                throw std::runtime_error("Incompatible column name: "
+                                         + column_name + " with "
+                                         + std::string(column_names[j])
+                                         + " at position " + std::to_string(j));
+            }
+
+            auto type_str = std::string(sbc_type_strings[j]);
+            if (type != type_str) {
+                throw std::runtime_error("Incompatible type: "
+                                         + type + " with " + type_str
+                                         + " at position " + std::to_string(j));
+            }
+
+            auto sizes_str = _split(raw_sizes_str, ",");
+            for (const auto& size_str : sizes_str) {
+                index_type size = std::atoi(size_str.c_str());
+
+                if (size == 0) {
+                    throw std::runtime_error("Header indicated a size of 0. "
+                                             "That is not allowed by the "
+                                             "format.");
+                }
+
+                if (size > Traits::MAX_BUFFER_SIZE) {
+                    throw std::runtime_error("Size of column " + column_name
+                                             + " is too big.");
+                }
+
+                _type_sizes.emplace_back(size);
+            }
+
+            _type_dims.emplace_back(sizes_str.size());
+        }
+
+        // The last piece of information to read from the header is the number
+        // of lines the file holds. If it's 0, the file can be any size.
+        auto file_num_elements = static_cast<index_type>(_get_type<int32_t>());
+
+        // Done with the header! Time to deal with the data...
+        // But first, let's check the file size is a multiple
+        // of what we estimated from the header.
+        _header_size_in_B = header_length + 10;
+
+        // Routine to find the size of a line in the file.
+        index_type offset = 0;
+        _line_size_in_B = 0;
+        for (index_type i = 0; i < column_names.size(); i++) {
+            _type_offset_in_B.emplace_back(_line_size_in_B);
+
+            index_type prod = 1;
+            for (index_type dim_j = 0; dim_j < _type_dims[i]; dim_j++) {
+                prod *= _type_sizes[dim_j + offset];
+            }
+
+            offset += _type_dims[i];
+            _line_size_in_B += type_sizes[i]*prod;
+            _type_total_size.emplace_back(prod);
+            _type_total_sizes_in_B.emplace_back(type_sizes[i]*prod);
+        }
+
+        auto data_size = _file_size_in_B - _header_size_in_B;
+        if ((data_size % _line_size_in_B) != 0) {
+            throw std::runtime_error("After doing the math, the remaining "
+                                     "file is not evenly divided by the "
+                                     "given parameters.");
+        }
+
+        // Expected number of lines in the file
+        _num_elements = static_cast<index_type>(data_size / _line_size_in_B);
+        if (file_num_elements != 0) {
+            if (_num_elements != file_num_elements) {
+                throw std::runtime_error("Number of elements in the file does "
+                                         "not match the estimated one. "
" + "Maybe a corrupted file?"); + } + } + + if (_is_all_in_ram) { + _block_size_in_lines = _num_elements; + } else { + // If the file does not fit in RAM, we calculate the best + // power of 2 for block size that it fits in actual_max_size + while (_block_size_in_lines*_line_size_in_B > actual_max_size_in_B) { + _block_size_in_lines /= 2; + } + } + // This holds 1 block of raw data + _block_size_in_B = _block_size_in_lines*_line_size_in_B; + _num_blocks = static_cast(data_size/_block_size_in_B); + // Done with header checking, now we can move to allocating data! + _allocate_memory(); + + _data_buffer.reserve(_block_size_in_B); + // Technically done, but let's load the first block to memory + // Start at number of blocks so the next function thinks we + // are at the end of the buffer + _current_block = _num_blocks; + load_next_block(); + } + + ~DynamicStreamer() = default; + + // Indicates if all the file line of data are stored all in ram + bool is_all_in_ram() { return _is_all_in_ram; } + // Total lines of data found inside the file. + index_type size() { return _num_elements; } + // Returns the internal buffer size in bytes + index_type get_buffer_size() { return _block_size_in_B; } + // Returns the internal buffer number of lines. + // Equal to size() if is_all_in_ram() returns true + index_type get_num_lines() { return _block_size_in_lines;} + + // "Normal" container functions + // These are meant to work as a "normal" dictionary + // Returns a tuple of vectors of the elements at i + data_type at(const index_type& i) { + auto item_block = static_cast(i / (_num_blocks * _block_size_in_lines)); + if (item_block != _current_block) { + // We load the previous block as load_nex_block will + // increment _current_block by 1. + _current_block = item_block - 1; + load_next_block(); + } + + data_type out_data; + _copy_to_line(out_data, i % _block_size_in_lines); + // Check bounds, if beyond: retrieve data. If within, turn i to + // internal bounds index. + return out_data; + } + + template + auto at(const index_type& i, + TransformFunc&& transform_func) { + auto index_tuple = at(i); + return std::apply(transform_func, index_tuple); + } + + // "Efficient" functions + // These are more efficient in terms of speed. + template + const auto& get() { + return std::get(_data); + } + + template + const auto& get() { + return get<_find_index()>(); + } + + void load_next_block() { + // Here is the deal: + // _header_size_in_B is the start of the data in file + // _file_size is the end of the data in file. + // We load 1 block size worth of memory + // Except for the last buffer + _current_block++; + if (_current_block >= _num_blocks) { + _current_block = 0; + } + // Start of the block position + std::size_t start = _current_block*_block_size_in_B + \ + _header_size_in_B; + _stream.seekg(start); + + // Try to read 1 block size worth of lines + _stream.read(_data_buffer.data(), _block_size_in_B); + // The above can "fail". We check how many bytes it actually read + auto read_bytes = static_cast(_stream.gcount()); + auto lines = _block_size_in_lines; + if (read_bytes != _block_size_in_B) { + // TODO(Any): think what to do here. 
+        }
+
+        _decode_buffer(lines);
+    }
+
+ private:
+    template<std::size_t I, typename T>
+    void _copy_to_column(T& column, const index_type& block_line) {
+        const auto& data_column = std::get<I>(_data);
+        column.resize(_type_total_size[I]);
+        for (index_type i = 0; i < _type_total_size[I]; i++) {
+            // The data is unrolled, so line block_line starts at
+            // block_line*_type_total_size[I] within this column.
+            column[i] = data_column[i + block_line*_type_total_size[I]];
+        }
+    }
+
+    // Helper for the function below
+    template<std::size_t... I>
+    void _copy_to_line_helper(data_type& line, const index_type& block_line,
+                              std::index_sequence<I...>) {
+        (_copy_to_column<I>(std::get<I>(line), block_line), ...);
+    }
+
+    // Copies line block_line of the currently loaded block into line,
+    // one column at a time.
+    void _copy_to_line(data_type& line, const index_type& block_line) {
+        _copy_to_line_helper(line, block_line,
+                             std::make_index_sequence<num_columns>{});
+    }
+
+    // Other functions:
+
+    template<Tools::ColumnName column_name>
+    constexpr static std::size_t _find_index() {
+        for (std::size_t i = 0; i < column_names.size(); i++) {
+            if (column_names[i] == column_name.value) {
+                return i;
+            }
+        }
+
+        throw std::logic_error("Element was not found");
+    }
+
+    // This should be a STD library thing...
+    // https://stackoverflow.com/questions/14265581/parse-split-a-string-in-c-using-string-delimiter-standard-c
+    std::vector<std::string> _split(std::string_view s,
+                                    std::string_view delimiter) {
+        size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+        std::string token;
+        std::vector<std::string> res;
+
+        while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) {
+            token = s.substr(pos_start, pos_end - pos_start);
+            pos_start = pos_end + delim_len;
+            res.push_back(token);
+        }
+
+        token = s.substr(pos_start);
+        res.push_back(token);
+        return res;
+    }
+};
+
+}  // namespace SBC::BinaryFormat
+
+#endif  // SBC_BINARYFORMAT_READER_H
diff --git a/cpp/include/SBCBinaryFormat/Utilities.hpp b/cpp/include/SBCBinaryFormat/Utilities.hpp
new file mode 100644
index 0000000..01ce7d8
--- /dev/null
+++ b/cpp/include/SBCBinaryFormat/Utilities.hpp
@@ -0,0 +1,109 @@
+/*
+ * Author: Hector Hawley Herrera
+ *
+ * This file contains a bunch of C++20 utilities used throughout all of the
+ * files to do some template magic.
+ *
+ * Examples: a struct that turns strings into a template parameter, a
+ * type container, and an integral type to sbc type string function
+ *
+ * */
+#ifndef SBC_BINARYFORMAT_TOOLS_H
+#define SBC_BINARYFORMAT_TOOLS_H
+#pragma once
+
+// C STD includes
+// C 3rd party includes
+// C++ STD includes
+#include <algorithm>
+#include <array>
+#include <string_view>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+// C++ 3rd party includes
+// my includes
+
+namespace SBC::BinaryFormat::Tools {
+// The BinaryFormat only accepts arithmetic types and pointers to them.
+// ex: const char* returns true
+//     std::string returns false
+//     double* returns true
+template<typename T>
+concept is_arithmetic_ptr = std::is_arithmetic_v<
+    std::remove_cvref_t<std::remove_pointer_t<T>>>
+    and not std::is_pointer_v<std::remove_pointer_t<T>>;
+
+template<typename... T>
+concept is_arithmetic_ptr_unpack = (...
+    and is_arithmetic_ptr<T>);
+
+template<typename T>
+requires is_arithmetic_ptr<T>
+constexpr static std::string_view type_to_string() {
+    // We get the most pure essence of T: no consts, no references,
+    // and no []
+    using T_no_const = std::remove_pointer_t<
+        std::remove_cvref_t<std::remove_all_extents_t<T>>>;
+    if constexpr (std::is_same_v<T_no_const, char>) {
+        return "char";
+    } else if constexpr (std::is_same_v<T_no_const, uint8_t>) {
+        return "uint8";
+    } else if constexpr (std::is_same_v<T_no_const, uint16_t>) {
+        return "uint16";
+    } else if constexpr (std::is_same_v<T_no_const, uint32_t>) {
+        return "uint32";
+    } else if constexpr (std::is_same_v<T_no_const, uint64_t>) {
+        return "uint64";
+    } else if constexpr (std::is_same_v<T_no_const, int8_t>) {
+        return "int8";
+    } else if constexpr (std::is_same_v<T_no_const, int16_t>) {
+        return "int16";
+    } else if constexpr (std::is_same_v<T_no_const, int32_t>) {
+        return "int32";
+    } else if constexpr (std::is_same_v<T_no_const, int64_t>) {
+        return "int64";
+    } else if constexpr (std::is_same_v<T_no_const, float>) {
+        return "single";
+    } else if constexpr (std::is_same_v<T_no_const, double>) {
+        return "double";
+    } else if constexpr (std::is_same_v<T_no_const, long double>) {
+        return "float128";
+    }
+    // TODO(All): maybe the default should be uint32? or no default?
+}
+
+template<std::size_t N>
+struct ColumnName {
+    constexpr ColumnName(const char (&str)[N]) {
+        // Making this constructor explicit breaks the deduction;
+        // maybe we can find a more suitable solution?
+        std::copy_n(str, N, value);
+    }
+
+    char value[N];
+};
+
+template<ColumnName... column_names>
+struct ColumnNames {
+    static constexpr auto names = std::to_array<std::string_view>({column_names.value...});
+    static constexpr std::size_t size = names.size();
+};
+
+template<template<typename...> class container, typename... T>
+struct ColumnTypes {
+    using types = std::tuple<T...>;
+    static constexpr std::size_t size = sizeof...(T);
+    static constexpr auto type_sizes = std::to_array({sizeof(T)...});
+    static constexpr auto type_str = std::to_array({type_to_string<T>()...});
+    using data_type = std::tuple<container<T>...>;
+
+    constexpr ColumnTypes() {}
+};
+
+template<typename... T>
+struct STDVectorColumnTypes : public ColumnTypes<std::vector, T...> {};
+
+
+}  // namespace SBC::BinaryFormat::Tools
+
+#endif  // SBC_BINARYFORMAT_TOOLS_H
diff --git a/cpp/include/SBCBinaryFormat/Writer.hpp b/cpp/include/SBCBinaryFormat/Writer.hpp
new file mode 100644
index 0000000..90af1ad
--- /dev/null
+++ b/cpp/include/SBCBinaryFormat/Writer.hpp
@@ -0,0 +1,426 @@
+//
+// Created by Hector Hawley Herrera on 2023-02-19.
+//
+
+#ifndef SBC_BINARYFORMAT_WRITER_H
+#define SBC_BINARYFORMAT_WRITER_H
+#pragma once
+
+// C STD includes
+// C 3rd party includes
+// C++ STD includes
+#include <array>
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <numeric>
+#include <span>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+// C++ 3rd party includes
+//#include
+
+// my includes
+#include "SBCBinaryFormat/Utilities.hpp"
+
+namespace SBC::BinaryFormat {
+/* SBC Binary Header description:
+ * The header of the binary format is divided in 4 parts:
+ * 1.- Endianness - always 4 bytes long (uint32_t)
+ * 2.- Data Header size - always 2 bytes long (uint16_t)
+ *     and is the length of the next part of the header
+ * 3.- Data Header - Data Header size bytes long.
+ *     Contains the structure of each line. It is always found as a raw
+ *     string in the form "{name_col};{type_col};{size1},{size2}...;...;"
+ *     Cannot be longer than 65536 bytes.
+ * 4.- Number of lines - always 4 bytes long (int32_t)
+ *     Number of lines in the file. If 0, it is indefinitely long.
+*/
+template<typename... DataTypes>
+requires Tools::is_arithmetic_ptr_unpack<DataTypes...>
+struct DynamicWriter {
+    // TODO(Any): make it possible to take both normal arithmetic types
+    //  - and their corresponding array types. Ex: int and int[]
+    //  - that is, to assume that if int is passed, it means we want a scalar
+    //  - and int[] would mean an array.
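+    // Worked example (illustrative, not from the original source): for a
+    // DynamicWriter<int32_t, double> with column names {"t", "x"},
+    // ranks {1, 2} and sizes {1, 3, 2}, the header laid out by
+    // _build_header() below would be:
+    //     uint32  0x01020304               (endianness, 4 bytes)
+    //     uint16  23                       (data header length, 2 bytes)
+    //     "t;int32;1;x;double;3,2;"        (data header, 23 bytes)
+    //     int32   0                        (number of lines; 0 = dynamic)
+    // and every line written afterwards occupies 4*1 + 8*(3*2) = 52 bytes.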
+    using tuple_type = std::tuple<std::span<DataTypes>...>;
+    constexpr static std::size_t n_cols = sizeof...(DataTypes);
+    constexpr static std::array<std::size_t, n_cols> size_of_types = { sizeof(DataTypes)... };
+    constexpr static std::array<std::string_view, n_cols> parameters_types_str = { Tools::type_to_string<DataTypes>()... };
+
+ private:
+    const std::string _file_name;
+    const std::array<std::string, n_cols> _names;
+    const std::array<std::size_t, n_cols> _ranks;
+    const std::vector<std::size_t> _sizes;
+
+    std::size_t total_ranks = 0;
+    bool _open = false;
+    std::fstream _stream;
+
+    std::size_t _line_byte_size = 0;
+    std::size_t _line_param_order = 0;
+    std::size_t _line_buffer_loc = 0;
+    std::string _line_buffer;
+
+    template<typename T>
+    void _copy_number_to_buff(const T& num,
+                              std::string& buffer,
+                              std::size_t& loc,
+                              const std::size_t& size = sizeof(T)) {
+        const char* tmpstr = reinterpret_cast<const char*>(&num);
+        for (std::size_t i = 0; i < size; i++) {
+            buffer[i + loc] = tmpstr[i];
+        }
+
+        loc += size;
+    }
+
+    void _copy_str_to_buff(std::string_view source,
+                           std::string& buffer,
+                           std::size_t& loc) {
+        source.copy(&buffer[loc], source.length(), 0);
+        loc += source.length();
+    }
+
+    std::string _build_header() {
+        // Endianness first
+        uint32_t endianess = 0x01020304;
+
+        // Calculates the total header size
+        std::size_t total_header_size = 0;
+        // Has to be uint16_t because we are saving it to the file later
+        uint16_t binary_header_size = 0;
+        std::size_t total_ranks_so_far = 0;
+        for (std::size_t i = 0; i < n_cols; i++) {
+            auto column_rank = _ranks[i];
+            // + 1 for the ; character
+            binary_header_size += _names[i].length() + 1;
+            // + 1 for the ; character
+            binary_header_size += parameters_types_str[i].length() + 1;
+
+            std::size_t total_rank_size = 1;
+            for (std::size_t j = 0; j < column_rank; j++) {
+                auto size = _sizes[total_ranks_so_far + j];
+                total_rank_size *= size;
+                // It is always + 1 because there is either a ',' or a ';'
+                binary_header_size += std::to_string(size).length() + 1;
+            }
+
+            total_ranks_so_far += column_rank;
+            // We also calculate the line size here; it is very useful once
+            // we start saving data.
+            _line_byte_size += size_of_types[i]*total_rank_size;
+        }
+
+        total_header_size += sizeof(uint32_t);    // 1.
+        total_header_size += sizeof(uint16_t);    // 2.
+        total_header_size += binary_header_size;  // 3.
+        total_header_size += sizeof(int32_t);     // 4.
+
+        // Now we allocate the memory!
+        std::size_t buffer_loc = 0;
+        auto buffer = std::string(total_header_size, 'A');
+        _line_buffer = std::string(_line_byte_size, 'A');
+
+        // Now we fill the buffer.
+        total_ranks_so_far = 0;
+        _copy_number_to_buff(endianess, buffer, buffer_loc);           // 1.
+        _copy_number_to_buff(binary_header_size, buffer, buffer_loc);  // 2.
+        for (std::size_t i = 0; i < n_cols; i++) {                     // 3.
+            auto column_name = _names[i];
+            auto column_type = parameters_types_str[i];
+            auto column_rank = _ranks[i];
+
+            _copy_str_to_buff(column_name, buffer, buffer_loc);
+            _copy_str_to_buff(";", buffer, buffer_loc);
+            _copy_str_to_buff(column_type, buffer, buffer_loc);
+            _copy_str_to_buff(";", buffer, buffer_loc);
+            for (std::size_t size_j = 0; size_j < column_rank; size_j++) {
+                _copy_str_to_buff(std::to_string(_sizes[total_ranks_so_far + size_j]),
+                                  buffer, buffer_loc);
+
+                if (size_j != column_rank - 1) {
+                    _copy_str_to_buff(",", buffer, buffer_loc);
+                }
+            }
+
+            _copy_str_to_buff(";", buffer, buffer_loc);
+            total_ranks_so_far += column_rank;
+        }
+
+        // For dynamic files, this is always 0!
+        const int32_t num_lines = 0x00000000;
+        _copy_number_to_buff(num_lines, buffer, buffer_loc);           // 4.
+        // Done!
+        return buffer;
+    }
+
+    // Save the item from the tuple at position i
+    template<std::size_t i>
+    void _save_item(const tuple_type& items, std::size_t& loc) {
+        auto item = std::get<i>(items);
+
+        auto rank = _ranks[i];
+        auto total_rank_up_to_i = std::accumulate(_ranks.begin(),
+                                                  _ranks.begin() + i,
+                                                  std::size_t{0});
+
+        const auto& size_index_start = total_rank_up_to_i;
+        const std::size_t expected_size = std::accumulate(&_sizes[size_index_start],
+                                                          &_sizes[size_index_start + rank],
+                                                          std::size_t{1},
+                                                          std::multiplies<>());
+
+        if (expected_size != item.size()) {
+            throw std::out_of_range("Passed data does not match the expected "
+                                    "size of this column.");
+        }
+
+//        std::memcpy(&_line_buffer[loc], item.data(), item.size_bytes());
+        auto item_bytes = std::as_bytes(item);
+        for (std::size_t byte = 0; byte < item.size_bytes(); byte++) {
+            _line_buffer.at(loc + byte) = static_cast<char>(item_bytes[byte]);
+        }
+        loc += item.size_bytes();
+    }
+
+    // Think of this function as a wrapper between _save_item
+    // and _save_event
+    template<std::size_t... i>
+    void _save_item_helper(const tuple_type& data, std::index_sequence<i...>) {
+        (_save_item<i>(data, _line_buffer_loc), ...);
+    }
+
+    void _save_event(const tuple_type& data) {
+        _line_buffer_loc = 0;
+        _save_item_helper(data, std::make_index_sequence<n_cols>{});
+        _stream << _line_buffer;
+    }
+
+ public:
+    DynamicWriter(std::string_view file_name,                            // "out.sbc.bin"
+                  const std::array<std::string, n_cols>& columns_names,  // {"x", "y"}
+                  const std::array<std::size_t, n_cols>& columns_ranks,  // {1, 2}
+                  const std::vector<std::size_t>& columns_sizes) :       // {1, 3, 2}
+        _file_name{file_name},
+        _names{columns_names},
+        _ranks{columns_ranks},
+        _sizes{columns_sizes}
+    {
+        total_ranks = std::accumulate(columns_ranks.begin(),
+                                      columns_ranks.end(), std::size_t{0});
+
+        if (std::filesystem::exists(file_name)) {
+            if (std::filesystem::is_empty(file_name)) {
+                // If the file is empty, we open it and write the header.
+                _stream.open(_file_name, std::ios::app | std::ios::binary);
+                if (_stream.is_open()) {
+                    _open = true;
+                    _stream << _build_header();
+                }
+            } else {
+                std::ifstream peeker(_file_name, std::ios::binary);
+
+                std::string header = _build_header();
+                std::string current_file_header(header.length(), '\0');
+                peeker.seekg(0);
+                peeker.read(&current_file_header[0], header.length());
+
+                if (current_file_header != header) {
+                    throw std::runtime_error("File being written to has an "
+                                             "incompatible header format. "
+                                             "Details:\n\t File = " + _file_name);
+                }
+
+                // We do not write anything.
+                _stream.open(_file_name, std::ios::app | std::ios::binary);
+                _open = _stream.is_open();
+            }
+        } else {
+            // If the file does not exist, we create it and write the header.
+            _stream.open(_file_name, std::ios::app | std::ios::binary);
+            if (_stream.is_open()) {
+                _open = true;
+                _stream << _build_header();
+            }
+        }
+    }
+
+    bool isOpen() { return _open; }
+
+    ~DynamicWriter() {
+        _open = false;
+        _stream.flush();
+        _stream.close();
+    }
+
+    // {"x", "y"}
+    // {1, 2}
+    // {1, 3, 2}
+    void save(std::span<DataTypes>...
data) { + // save(std::vector{1}, std::vector{1, 2, 3, 4, 4, 6}) + if(_open) { + _save_event(std::make_tuple(data...)); + } + } +}; + +//class SiPMDynamicWriter { +// using SiPMDW = DynamicWriter< double, // sample rate +// uint8_t, // Enabled Channels +// uint64_t, // Trigger Mask +// uint16_t, // Thresholds +// uint16_t, // DC Offsets +// uint8_t, // DC Corrections +// float, // DC Range +// uint32_t, // Time stamp +// uint32_t, // Trigger source +// uint16_t>; // Waveforms +// +// constexpr static std::size_t num_cols = 10; +// constexpr static std::array sipm_ranks = +// {1, 1, 1, 1, 1, 1, 1, 1, 1, 2}; +// const inline static std::array column_names = +// {"sample_rate", "en_chs", "trg_mask", "thresholds", "dc_offsets", +// "dc_corrections", "dc_range", "time_stamp", "trg_source", "sipm_traces"}; +// +// +// double _sample_rate[1] = {0.0}; +// std::vector _en_chs; +// uint64_t _trigger_mask[1] = {0}; +// std::vector _thresholds; +// std::vector _dc_offsets; +// std::vector _dc_corrections; +// std::vector _dc_ranges; +// +// uint32_t _trigger_tag[1] = {0}; +// uint32_t _trigger_source[1] = {0}; +// +// uint32_t _record_length; +// SiPMDW _streamer; +// public: +// /* Details of each parameters: +// Name | type | length (in Bytes) | is a constant?| +// --------------------------------------------------------------- +// sample_rate | double | 8 | Y +// en_chs | uint8 | 1*ch_size | Y +// trg_mask | uint64 | 8 | Y +// thresholds | uint16 | 2*ch_size | Y +// dc_offsets | uint16 | 2*ch_size | Y +// dc_corrections| uint8 | 1*ch_size | Y +// dc_range | single | 4*ch_size | Y +// time_stamp | uint32 | 4 | N +// trg_source | uint32 | 4 | N +// data | uint16 | 2*rl*ch_size | N +// --------------------------------------------------------------- +// rl -> record length of the waveforms +// ch_size -> number of enabled channels +// en_chs -> the channels # that were enabled +// +// Total length = 24 + ch_size*(10 + 2*record_length) +// */ +// +// SiPMDynamicWriter(std::string_view file_name, +// const CAENDigitizerFamilies& fam, +// const CAENDigitizerModelConstants& model_consts, +// const CAENGlobalConfig& global_config, +// const std::array& group_configs) : +// _sample_rate{model_consts.AcquisitionRate}, +// _en_chs{_get_en_chs(model_consts, group_configs)}, +// _record_length{global_config.RecordLength}, +// _streamer{file_name, column_names, sipm_ranks, _form_sizes(global_config)} +// { +// // Only for these families there is a decimation factor +// if (fam == CAENDigitizerFamilies::x740 or fam == CAENDigitizerFamilies::x724) { +// _sample_rate[0] /= global_config.DecimationFactor; +// } +// +// for(auto ch : _en_chs) { +// CAENGroupConfig group; +// if (model_consts.NumberOfGroups == 0) { +// group = group_configs[ch]; +// _dc_corrections.push_back(group.DCCorrections[0]); +// } else { +// group = group_configs[ch % 8]; +// _dc_corrections.push_back(group.DCCorrections[ch % 8]); +// } +// +// _thresholds.push_back(group.TriggerThreshold); +// _dc_offsets.push_back(group.DCOffset); +// _dc_ranges.push_back(static_cast( +// model_consts.VoltageRanges.at(group.DCRange))); +// } +// // _trigger_mask[0] is set in _get_en_chs +// } +// +// ~SiPMDynamicWriter() = default; +// +// bool isOpen() { return _streamer.isOpen(); } +// +// void save_waveform(const std::shared_ptr>& waveform) { +// _trigger_tag[0] = waveform->getInfo().TriggerTimeTag; +// _trigger_source[0] = waveform->getInfo().Pattern; +// _streamer.save(_sample_rate, +// _en_chs, +// _trigger_mask, +// _thresholds, +// _dc_offsets, +// 
_dc_corrections,
+//                       _dc_ranges,
+//                       _trigger_tag,
+//                       _trigger_source,
+//                       waveform->getData());
+//    }
+//
+// private:
+//
+//    std::vector<std::size_t> _form_sizes(
+//            const CAENGlobalConfig& caen_global_config) {
+//
+//        auto num_en_chs = _en_chs.size();
+//        return {1, num_en_chs, 1, num_en_chs, num_en_chs, num_en_chs, num_en_chs,
+//                1, 1, num_en_chs, caen_global_config.RecordLength};
+//    }
+//
+//    std::vector<uint8_t> _get_en_chs(
+//            const CAENDigitizerModelConstants& model_constants,
+//            const std::array& groups) {
+//        std::vector<uint8_t> out;
+//        for (std::size_t group_num = 0; group_num < groups.size(); group_num++) {
+//            const auto& group = groups[group_num];
+//            if (not group.Enabled) {
+//                continue;
+//            }
+//
+//            // If the digitizer does not support groups, group_num = ch
+//            if (model_constants.NumberOfGroups == 0) {
+//                out.push_back(group_num);
+//                continue;
+//            }
+//
+//            // Otherwise, calculate using the AcquisitionMask
+//            for (std::size_t ch = 0; ch < model_constants.NumChannelsPerGroup; ch++) {
+//                // If the acq mask or trg mask is enabled, that means we are
+//                // saving that ch to the file.
+//                if (group.AcquisitionMask.at(ch) or group.TriggerMask.at(ch)) {
+//                    out.push_back(ch + model_constants.NumChannelsPerGroup * group_num);
+//                }
+//
+//                // However, only trg mask channels are saved to _trigger_mask
+//                if (group.TriggerMask.at(ch)) {
+//                    auto g_ch = ch + model_constants.NumChannelsPerGroup * group_num;
+//                    _trigger_mask[0] |= (1 << g_ch);
+//                }
+//            }
+//        }
+//        return out;
+//    }
+//};

}  // namespace SBC::BinaryFormat

#endif  // SBC_BINARYFORMAT_WRITER_H
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
new file mode 100644
index 0000000..c0c00af
--- /dev/null
+++ b/cpp/test/CMakeLists.txt
@@ -0,0 +1,63 @@
+cmake_minimum_required(VERSION 3.14...3.22)
+
+project(SBCBinaryFormatTests LANGUAGES CXX C)
+
+# ---- Options ----
+
+option(ENABLE_TEST_COVERAGE "Enable test coverage" OFF)
+#option(TEST_INSTALLED_VERSION "Test the version found by find_package" OFF)
+
+# --- Import tools ----
+
+include(cmake/tools.cmake)
+
+# ---- Dependencies ----
+
+include(cmake/CPM.cmake)
+
+CPMAddPackage("gh:onqtam/doctest@2.4.8")
+CPMAddPackage("gh:TheLartians/Format.cmake@1.7.3")
+
+if(TEST_INSTALLED_VERSION)
+  find_package(SBCBinaryFormat REQUIRED)
+else()
+  CPMAddPackage(NAME SBCBinaryFormat SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/..)
+endif()
+
+# ---- Create binary ----
+
+file(GLOB sources CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/source/*.cpp)
+add_executable(${PROJECT_NAME} ${sources})
+target_link_libraries(${PROJECT_NAME} PUBLIC doctest::doctest SBCBinaryFormat::SBCBinaryFormat)
+set_target_properties(${PROJECT_NAME} PROPERTIES CXX_STANDARD 20)
+
+file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/data/test.sbc.bin
+     ${CMAKE_CURRENT_SOURCE_DIR}/data/empty.sbc.bin
+     DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/data)
+
+# enable compiler warnings
+if(NOT TEST_INSTALLED_VERSION)
+  if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")
+    target_compile_options(SBCBinaryFormatTests PUBLIC -Wall -Wpedantic -Wextra)
+  elseif(MSVC)
+    target_compile_options(SBCBinaryFormatTests PUBLIC /W4 /WX)
+    target_compile_definitions(${PROJECT_NAME} PUBLIC DOCTEST_CONFIG_USE_STD_HEADERS)
+  endif()
+endif()
+
+# ---- Add SBCBinaryFormatTests ----
+
+enable_testing()
+
+# Note: doctest and similar testing frameworks can automatically configure CMake tests.
For other
+# testing frameworks add the tests target instead: add_test(NAME ${PROJECT_NAME} COMMAND
+# ${PROJECT_NAME})
+
+include(${doctest_SOURCE_DIR}/scripts/cmake/doctest.cmake)
+doctest_discover_tests(${PROJECT_NAME})
+
+# ---- code coverage ----
+
+if(ENABLE_TEST_COVERAGE)
+  target_compile_options(SBCBinaryFormatTests PUBLIC -O0 -g -fprofile-arcs -ftest-coverage)
+  target_link_options(SBCBinaryFormatTests PUBLIC -fprofile-arcs -ftest-coverage)
+endif()
diff --git a/cpp/test/cmake/CPM.cmake b/cpp/test/cmake/CPM.cmake
new file mode 100644
index 0000000..e51b761
--- /dev/null
+++ b/cpp/test/cmake/CPM.cmake
@@ -0,0 +1,21 @@
+set(CPM_DOWNLOAD_VERSION 0.36.0)
+
+if(CPM_SOURCE_CACHE)
+  # Expand relative path. This is important if the provided path contains a tilde (~)
+  get_filename_component(CPM_SOURCE_CACHE ${CPM_SOURCE_CACHE} ABSOLUTE)
+  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+elseif(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+else()
+  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+endif()
+
+if(NOT (EXISTS ${CPM_DOWNLOAD_LOCATION}))
+  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
+  file(DOWNLOAD
+       https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+       ${CPM_DOWNLOAD_LOCATION}
+  )
+endif()
+
+include(${CPM_DOWNLOAD_LOCATION})
diff --git a/cpp/test/cmake/tools.cmake b/cpp/test/cmake/tools.cmake
new file mode 100644
index 0000000..2364da9
--- /dev/null
+++ b/cpp/test/cmake/tools.cmake
@@ -0,0 +1,66 @@
+# this file contains a list of tools that can be activated and downloaded on-demand each tool is
+# enabled during configuration by passing an additional `-DUSE_<TOOL>=<VALUE>` argument to CMake
+
+# only activate tools for top level project
+if(NOT PROJECT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+  return()
+endif()
+
+include(${CMAKE_CURRENT_LIST_DIR}/CPM.cmake)
+
+# enables sanitizers support using the `USE_SANITIZER` flag available values are: Address,
+# Memory, MemoryWithOrigins, Undefined, Thread, Leak, 'Address;Undefined'
+if(USE_SANITIZER OR USE_STATIC_ANALYZER)
+  CPMAddPackage("gh:StableCoder/cmake-scripts#1f822d1fc87c8d7720c074cde8a278b44963c354")
+
+  if(USE_SANITIZER)
+    include(${cmake-scripts_SOURCE_DIR}/sanitizers.cmake)
+  endif()
+
+  if(USE_STATIC_ANALYZER)
+    if("clang-tidy" IN_LIST USE_STATIC_ANALYZER)
+      set(CLANG_TIDY
+          ON
+          CACHE INTERNAL ""
+      )
+    else()
+      set(CLANG_TIDY
+          OFF
+          CACHE INTERNAL ""
+      )
+    endif()
+    if("iwyu" IN_LIST USE_STATIC_ANALYZER)
+      set(IWYU
+          ON
+          CACHE INTERNAL ""
+      )
+    else()
+      set(IWYU
+          OFF
+          CACHE INTERNAL ""
+      )
+    endif()
+    if("cppcheck" IN_LIST USE_STATIC_ANALYZER)
+      set(CPPCHECK
+          ON
+          CACHE INTERNAL ""
+      )
+    else()
+      set(CPPCHECK
+          OFF
+          CACHE INTERNAL ""
+      )
+    endif()
+
+    include(${cmake-scripts_SOURCE_DIR}/tools.cmake)
+
+    clang_tidy(${CLANG_TIDY_ARGS})
+    include_what_you_use(${IWYU_ARGS})
+    cppcheck(${CPPCHECK_ARGS})
+  endif()
+endif()
+
+# enables CCACHE support through the USE_CCACHE flag possible values are: YES, NO or equivalent
+if(USE_CCACHE)
+  CPMAddPackage("gh:TheLartians/Ccache.cmake@1.2.3")
+endif()
diff --git a/cpp/test/data/empty.sbc.bin b/cpp/test/data/empty.sbc.bin
new file mode 100644
index 0000000..e69de29
diff --git a/cpp/test/data/test.sbc.bin b/cpp/test/data/test.sbc.bin
new file mode 100644
index 0000000..26ad2a8
Binary files /dev/null and b/cpp/test/data/test.sbc.bin differ
diff --git
a/cpp/test/source/basic_test.cpp b/cpp/test/source/basic_test.cpp
new file mode 100644
index 0000000..d0b52f0
--- /dev/null
+++ b/cpp/test/source/basic_test.cpp
@@ -0,0 +1,120 @@
+//
+// Created by Hector Hawley Herrera on 2023-04-26.
+//
+
+// C STD includes
+// C 3rd party includes
+// C++ STD includes
+// C++ 3rd party includes
+#include <doctest/doctest.h>
+
+// my includes
+#include "SBCBinaryFormat/Reader.hpp"
+#include "SBCBinaryFormat/Writer.hpp"
+
+using namespace SBC::BinaryFormat;
+
+// This struct contains a row of data in the binary data.
+struct TestHolder {
+    int32_t t;
+    double x;
+    double y;
+    double z;
+    std::vector<double> momentum;
+};
+
+TEST_CASE("DYNAMIC_READER_TESTS")
+{
+    // This prepares the code for the column types of the files.
+    // Note the last one: despite it being an array, it is still a double.
+    using TestTypes = Tools::STDVectorColumnTypes<int32_t, double, double,
+                                                  double, double>;
+    // This prepares the code for the column names found within the file.
+    using TestColumnNames = Tools::ColumnNames<"t", "x", "y", "z", "momentum">;
+
+    SUBCASE("Common IO errors") {
+        // Check for the expected throws
+        // File not found
+        CHECK_THROWS([&]() {
+            DynamicStreamer<TestTypes, TestColumnNames> streamer("data/imaginary.sbc.bin");
+        }());
+
+        // File is empty with no header
+        CHECK_THROWS([&]() {
+            DynamicStreamer<TestTypes, TestColumnNames> streamer("data/empty.sbc.bin");
+        }());
+
+        // File is a directory
+        CHECK_THROWS([&]() {
+            DynamicStreamer<TestTypes, TestColumnNames> streamer("data");
+        }());
+    }
+
+    SUBCASE("Opposite endianness check") {
+
+    }
+
+    SUBCASE("Bad header checks") {
+
+    }
+
+
+    SUBCASE("Small good file checks") {
+        DynamicStreamer<TestTypes, TestColumnNames> dB("data/test.sbc.bin");
+
+        // There are 129 elements in the file; did the reader find them all?
+        CHECK(dB.size() == 129);
+
+//        auto data = streamer.get<"x">();
+        auto data = dB.at(0);
+
+        auto data_too = dB.at(0,
+            [](std::vector<int32_t> t, std::vector<double> x,
+               std::vector<double> y, std::vector<double> z,
+               std::vector<double> momentum) -> TestHolder
+            {
+                return TestHolder{.t = t[0], .x = x[0], .y = y[0], .z = z[0],
+                                  .momentum{momentum.begin(), momentum.end()}};
+            }
+        );
+
+        CHECK(data_too.t == 1);
+        CHECK(data_too.x == 2.0);
+        CHECK(data_too.y == 3.0);
+        CHECK(data_too.z == 4.0);
+        CHECK(data_too.momentum.size() == 6);
+        CHECK(data_too.momentum == std::vector<double>({1.0, 2.0, 4.0, 5.0, 7.0, 8.0}));
+    }
+
+    SUBCASE("Big good file checks") {
+
+    }
+
+}
+
+TEST_CASE("WRITER_TEST") {
+    SUBCASE("Small example") {
+        // First, create the writer type with the types of the members
+        // between the <...>. The only allowed members are the integer types
+        // (int16_t, int8_t, int32_t, int64_t and their corresponding
+        // unsigned types); char, double, and float are also allowed.
+        // If a type is not allowed, the program won't compile!
+        using TestDW = SBC::BinaryFormat::DynamicWriter<int32_t, double>;
+
+        // Names and ranks have to be arrays whose length is equal
+        // to the number of types passed to the writer.
+        std::array<std::string, 2> names = {"t", "x"};
+        std::array<std::size_t, 2> ranks = {1, 2};
+        // sizes does not need to be an array, but its length has to be equal
+        // to the sum of all the members of ranks.
+        std::vector<std::size_t> sizes = {1, 3, 2};
+        // First the file name, then the names, ranks, and sizes...
+        TestDW writer("out_test.sbc.bin", names, ranks, sizes);
+
+        // The data to save has to be a span (vector, array, ...)
+        // of the same underlying type, in the same order as it was passed
+        // to the writer types.
+        std::vector<int32_t> t = {1};
+        std::vector<double> x = {3.0, 4.0, 5.0, 6.0, 7.0, 8.0};
+        writer.save(t, x);
+    }
+}
\ No newline at end of file
diff --git a/cpp/test/source/main.cpp b/cpp/test/source/main.cpp
new file mode 100644
index 0000000..815236f
--- /dev/null
+++ b/cpp/test/source/main.cpp
@@ -0,0 +1,7 @@
+//
+// Created by Hector Hawley Herrera on 2023-04-26.
+//
+
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+
+#include <doctest/doctest.h>
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..b3833d4
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,9 @@
+# Python SBC Binary Format driver
+
+Use this library either directly (copy sbcbinaryformat/files.py into your source code) or install it as a library.
+
+Example usage files can be found under the examples folder.
+
+# TODO:
+* Reader example
+* Combined Writer + Reader example
\ No newline at end of file
diff --git a/python/examples/test.sbc.bin b/python/examples/test.sbc.bin
new file mode 100644
index 0000000..26ad2a8
Binary files /dev/null and b/python/examples/test.sbc.bin differ
diff --git a/python/examples/writer_example.py b/python/examples/writer_example.py
new file mode 100644
index 0000000..1c8ebe3
--- /dev/null
+++ b/python/examples/writer_example.py
@@ -0,0 +1,23 @@
+import numpy as np
+
+from sbcbinaryformat.files import Writer
+
+with Writer("test.sbc.bin",
+            ["t", "x", "y", "z", "momentum"],
+            ['i4', 'd', 'd', 'd', 'd'],
+            [[1], [1], [1], [1], [3, 2]]) as sbc_writer_example:
+
+    sbc_writer_example.write({'t': [1],
+                              'x': [2.0],
+                              'y': [3.0],
+                              'z': [4.0],
+                              'momentum': [[1, 2], [4, 5], [7, 8]]})
+
+    rng = np.random.default_rng()
+
+    for _ in range(128):
+        sbc_writer_example.write({'t': rng.integers(-10, 10, (1)),
+                                  'x': rng.random((1)),
+                                  'y': rng.random((1)),
+                                  'z': rng.random((1)),
+                                  'momentum': rng.random((3, 2))})
diff --git a/python/sbcbinaryformat/__init__.py b/python/sbcbinaryformat/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/sbcbinaryformat/files.py b/python/sbcbinaryformat/files.py
new file mode 100644
index 0000000..2050720
--- /dev/null
+++ b/python/sbcbinaryformat/files.py
@@ -0,0 +1,483 @@
+"""
+Contains two classes:
+
+* Streamer: reads a file and either streams it from disk or loads
+  everything into RAM. No multiprocessing features.
+* Writer: creates SBC binary files given the passed parameters.
+  Very simplistic and does not have any multiprocessing features.
+"""
+import sys
+import os
+
+import numpy as np
+
+
+class Streamer:
+    """
+    This class specializes in opening and managing an sbc-binary file.
+    If a file is too big to load into RAM, this code will manage reading
+    it through a more tolerable internal buffer.
+    """
+    def __init__(self, file, block_size=65536, max_size=1000):
+        """
+        :param file: File location and name
+        :param block_size: Size, in lines, of the internal buffer
+        :param max_size: Max size (in MB) of a file that will be directly
+                         loaded into RAM. Beyond this value the streamer
+                         will default to a block style of reading.
+        :raises OSError: if the endianness is not supported, if the header
+                         is not consistent, or the contents do not match
+                         the header.
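+
+        A minimal usage sketch (illustrative, not from the original
+        docstring; it assumes a "test.sbc.bin" such as the one produced
+        by examples/writer_example.py):
+
+            with Streamer("test.sbc.bin") as data:
+                first_row = data[0]   # dict mapping column name -> values
+                xs = data["x"]        # currently buffered "x" column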
+ """ + self.__data = None + self.__binary_data = None + self.system_edianess = sys.byteorder + self.file_size = os.path.getsize(file) / 1e6 + self.is_all_in_ram = self.file_size < max_size + + # This will throw if the file is not found + self.file_resource = open(file, "rb") + + # Read the constants from the file + # First its Endianess + file_endianess = np.fromfile(self.file_resource, + dtype=np.uint32, count=1) + + if file_endianess == 0x01020304: + self.file_endianess = "little" + elif file_endianess == 0x04030201: + self.file_endianess = "big" + else: + raise OSError(f"Endianess not supported: {file_endianess}") + + # Now the length of the header + self.header_length = np.fromfile(self.file_resource, + dtype=np.uint16, count=1)[0] + + header = self.file_resource.read(self.header_length).decode('ascii') + header = header.split(';') + + if (len(header) - 1) % 3 != 0: + raise OSError(f"The number of items found in the header should \ + always come in multiples of 3. It is {len(header) - 1}") + + self.num_columns = int(len(header) / 3) + header = np.resize(header, (self.num_columns, 3)) + + self.columns = header[:, 0] + self.dtypes = [sbcstring_to_type(type_s, self.file_endianess) + for type_s in header[:, 1]] + self.sizes = [[int(lenght) for lenght in length.split(',')] + for length in header[:, 2]] + + self.expected_num_elems = np.fromfile(self.file_resource, + dtype=np.int32, count=1)[0] + + # 4 for endianess, 2 for header length and 4 for num elems + self.header_size_in_bytes = self.header_length + 10 + # Address in the file where the data starts + self.__start_data = self.header_size_in_bytes + # Address in the file where data ends + self.__end_data = self.file_size + + header_size_in_megbytes = self.header_size_in_bytes*1e-6 + # We need to calculate how many elements are in the file + bytes_each = [dtype.itemsize*np.prod(sizes) for i, (dtype, sizes) + in enumerate(zip(self.dtypes, self.sizes))] + + self.line_size_in_bytes = np.sum(bytes_each) + line_size_in_megbytes = self.line_size_in_bytes*1.0e-6 + + self.num_elems = self.file_size - header_size_in_megbytes + # TODO(Any): this check has a lot of flaws... float rounding errors + # mess this up. Solution: check to integers and deal with bytes + if self.num_elems % line_size_in_megbytes > 1e-3: + raise OSError(f"""After doing the math, the remaining file is +not evenly distributed by the given parameters. +Header or data written incorrectly. +- Header size = {header_size_in_megbytes} MB. +- File size = {self.file_size } MB. +- Expected line size = {line_size_in_megbytes} MB""") + + self.num_elems = int(self.num_elems / line_size_in_megbytes) + + if self.expected_num_elems != 0: + if self.num_elems != self.expected_num_elems: + raise OSError(f"Expected number of elements in file, \ + {self.expected_num_elems}, does not match \ + the calculated number of element in file: \ + {self.num_elems}") + + if (block_size * line_size_in_megbytes) > max_size: + print(f"Warning: Block size \ +({block_size * line_size_in_megbytes}MB) is bigger than the amount of memory \ +this streamer can allocate which is equal to {max_size}MB. 
Reducing until reasonable.")
+
+        if self.is_all_in_ram:
+            self.block_size = self.num_elems
+        else:
+            self.block_size = block_size
+            while (self.block_size * line_size_in_megabytes) > max_size:
+                self.block_size = int(0.5*self.block_size)
+
+            print(f"Final block size = {self.block_size}")
+
+        self.__create_df()
+
+        self.__start_line_in_memory = 0
+        self.__end_line_in_memory = 0
+        self.__current_line = 0
+        self.__load_data()
+
+        for column in self.columns:
+            setattr(self, column, self.__data[column])
+
+    def __enter__(self):
+        """
+        This allows the use of with()
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        This allows the use of with() and properly closes the resources
+        """
+        self.file_resource.close()
+
+    def __process_line_data(self, i):
+        """
+        Helper function to process the data and save it to the
+        allocated memory
+        """
+        position_in_array = 0
+        for name, dtype, sizes in zip(self.columns, self.dtypes, self.sizes):
+            length = dtype.itemsize*np.prod(sizes)
+            s_index = position_in_array + i*self.line_size_in_bytes
+            e_index = length + s_index
+            if len(sizes) > 1:
+                self.__data[name][i] \
+                    = self.__binary_data[s_index:e_index].view(dtype).reshape(sizes)
+            elif len(sizes) == 1:
+                self.__data[name][i] \
+                    = self.__binary_data[s_index:e_index].view(dtype)
+
+            position_in_array += length
+
+    def __create_df(self):
+        """
+        Here is where we allocate the memory for this streamer
+        """
+        df_dtypes = {}
+        if self.__binary_data is None:
+            self.__binary_data = np.zeros(
+                self.line_size_in_bytes*self.block_size, dtype=np.uint8)
+
+        if self.__data is None:
+            self.__data = dict.fromkeys(self.columns)
+            # self.__data = pd.DataFrame()
+            for name, dtype, sizes in zip(self.columns, self.dtypes, self.sizes):
+                self.__data[name] = ()
+                if len(sizes) > 1:
+                    df_dtypes[name] = list
+                    sizes = np.append(self.block_size, sizes)
+                    self.__data[name] = np.zeros(sizes, dtype=dtype)
+                elif len(sizes) == 1:
+                    df_dtypes[name] = dtype
+                    self.__data[name] = np.zeros((self.block_size, sizes[0]),
+                                                 dtype=dtype)
+
+    def __get_row(self, i):
+        return i*self.line_size_in_bytes + self.header_size_in_bytes
+
+    def __set_line_file(self, i):
+        self.file_resource.seek(self.__get_row(i))
+
+    def __load_data(self):
+        """
+        Loads data at self.__current_line
+        """
+        self.__set_line_file(self.__current_line)
+        start_in_bytes = self.file_resource.tell()
+
+        self.__binary_data = np.fromfile(self.file_resource, dtype=np.uint8,
+                                         count=self.line_size_in_bytes
+                                         * self.block_size)
+
+        end_in_bytes = self.file_resource.tell()
+
+        bytes_moved = end_in_bytes - start_in_bytes
+        lines_moved = bytes_moved / self.line_size_in_bytes
+
+        if int(lines_moved) != lines_moved:
+            raise ValueError(f'File did not move by an integer multiple of \
+the line size ({self.line_size_in_bytes} bytes)')
+
+        lines_moved = int(lines_moved)
+
+        self.__start_line_in_memory = self.__current_line
+        self.__end_line_in_memory = self.__start_line_in_memory + lines_moved
+        for i in range(lines_moved):
+            self.__process_line_data(i)
+
+    def __len__(self):
+        return self.num_elems
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.__current_line == self.num_elems:
+            self.__current_line = 0
+            raise StopIteration
+
+        return self.__getitem__(self.__current_line)
+
+    def __getitem(self, i):
+        self.__current_line = i
+        if self.__current_line >= self.__end_line_in_memory or \
+           self.__current_line < self.__start_line_in_memory:
+            # if outside these limits, we don't have that data in memory,
+            # so we need to load it from the file
+            print("Loading data...")
+            self.__load_data()
+
+        # 
now, we load the data
+        out = dict.fromkeys(self.columns)
+        internal_index = self.__current_line - self.__start_line_in_memory
+
+        for column in self.columns:
+            out[column] = self.__data[column][internal_index]
+
+        self.__current_line += 1
+        return out
+
+    def __getitem__(self, indexes):
+        if isinstance(indexes, (int, np.integer)):
+            return self.__getitem(indexes)
+        if isinstance(indexes, str):
+            return self.__data[indexes]
+
+        return np.array([self.__getitem(i) for i in indexes])
+
+
+class Writer:
+    """
+    SBC Binary Header description:
+    * The header of the binary format is divided in 4 parts:
+    * 1.- Endianness - always 4 bytes long (uint32_t)
+    * 2.- Data Header size - always 2 bytes long (uint16_t)
+    *     and is the length of the next part of the header
+    * 3.- Data Header - Data Header size bytes long.
+    *     Contains the structure of each line. It is always found as a raw
+    *     string in the form "{name_col};{type_col};{size1},{size2}...;...;"
+    *     Cannot be longer than 65536 bytes.
+    * 4.- Number of lines - always 4 bytes long (int32_t)
+    *     Number of lines in the file. If 0, it is indefinitely long.
+    """
+
+    def __init__(self, file_name, columns_names, dtypes, sizes):
+        self.file_name = file_name
+        self.num_elems_saved = 0
+        self.system_endianess = sys.byteorder
+
+        if len(columns_names) != len(dtypes):
+            raise ValueError("columns names and dtypes should be of the \
+same length")
+
+        # if the file does not exist OR its size is 0, we create the header
+        if not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
+            self.__create_header(file_name, columns_names, dtypes, sizes)
+        # Otherwise, we read the header and check whether it is compatible
+        # with the current parameters
+        else:
+            if not self.__has_compatible_header(file_name, columns_names,
+                                                dtypes, sizes):
+                raise ValueError(f"Header of already existing file must match \
+columns_names ({columns_names}), dtypes ({dtypes}), \
+and sizes ({sizes})")
+
+        self.file_resource = open(file_name, 'ab')
+
+    def __len__(self):
+        return self.num_elems_saved
+
+    def __enter__(self):
+        """
+        This allows the use of with()
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        This allows the use of with() and properly closes the resources
+        """
+        self.file_resource.close()
+
+    def write(self, data_line):
+        """
+        Write data_line to the file.
+
+        :raises ValueError: if data_line is not a dictionary, or the keys or
+                            lengths do not match the columns of the file.
+        """
+        if not isinstance(data_line, dict):
+            raise ValueError("data must be a dictionary")
+
+        if (list(data_line) != self.columns).any():
+            raise ValueError(f'Data keys must match the column names. 
This \
+file column names: {self.columns}; the data provided keys: \
+{list(data_line)}')
+
+        sizes = [list(np.array(value).shape) for _, value in data_line.items()]
+        if not self.__check_sizes(sizes):
+            raise ValueError("Data passed does not match the expected \
+sizes")
+
+        for column_name, dtype, _ in zip(self.columns, self.dtypes, self.sizes):
+            np.array(data_line[column_name],
+                     dtype=dtype).tofile(self.file_resource)
+
+    def __create_header(self, file_name, columns_names, dtypes, sizes):
+        self.file_resource = open(file_name, 'wb')
+        # first the endianness
+        np.array(0x01020304, dtype='u4').tofile(self.file_resource)
+
+        # then we need two things: the header and its length
+        self.header = ""
+        for column_name, dtype, size in zip(columns_names, dtypes, sizes):
+            header_str = ""
+            if isinstance(size, int):
+                header_str = f"{size}"
+            else:
+                for i, size_i in enumerate(size):
+                    if i == 0:
+                        header_str += f"{size_i}"
+                    else:
+                        header_str += f",{size_i}"
+
+            self.header += f"{column_name};{type_to_sbcstring(dtype)};{header_str};"
+
+        self.header_length = len(self.header)
+        np.array(self.header_length, dtype='u2').tofile(self.file_resource)
+
+        self.file_resource.write(self.header.encode('ascii'))
+
+        np.array(0, dtype='i4').tofile(self.file_resource)
+
+        self.columns = np.array(columns_names)
+        self.dtypes = np.array(dtypes)
+        self.sizes = sizes
+
+    def __has_compatible_header(self, file_name, columns_names, dtypes, sizes):
+        # open the file for read only
+        with open(file_name, "rb") as file:
+            file_endianess = np.fromfile(file,
+                                         dtype=np.uint32, count=1)[0]
+
+            if file_endianess not in (0x1020304, 0x4030201):
+                raise OSError(f"Endianness not supported: 0x{file_endianess:X}")
+
+            file_endian_str = 'little' if file_endianess == 0x01020304 \
+                else 'big'
+
+            self.correct_for_endian = False
+            if (file_endianess == 0x04030201
+                    and self.system_endianess == 'little') \
+                    or (file_endianess == 0x01020304
+                        and self.system_endianess == 'big'):
+                self.correct_for_endian = True
+
+            # Now the length of the header
+            self.header_length = np.fromfile(file,
+                                             dtype=np.uint16,
+                                             count=1)[0]
+
+            header = file.read(self.header_length).decode('ascii')
+            header = header.split(';')
+
+            if (len(header) - 1) % 3 != 0:
+                raise OSError(f"The number of items found in the header \
+should always come in multiples of 3. It is \
+{len(header) - 1}")
+
+            self.num_columns = int(len(header) / 3)
+            header = np.resize(header, (self.num_columns, 3))
+
+            self.columns = header[:, 0]
+
+            if not (self.columns == columns_names).all():
+                print(f"SBCWriter: columns did not match {self.columns} and \
+{columns_names}")
+                return False
+
+            # Note: sbcstring_to_type expects the endianness string
+            # ('little'/'big'), not the file handle.
+            self.dtypes = np.array([sbcstring_to_type(type_s, file_endian_str)
+                                    for type_s in header[:, 1]])
+
+            np_dtypes = [np.dtype(dtype) for dtype in dtypes]
+            if not (self.dtypes == np_dtypes).all():
+                print("SBCWriter: dtypes did not match")
+                return False
+
+            self.sizes = [[int(length) for length in lengths.split(',')]
+                          for lengths in header[:, 2]]
+
+            if not self.__check_sizes(sizes):
+                print("SBCWriter: sizes did not match")
+                return False
+
+            self.num_elems = np.fromfile(file,
+                                         dtype=np.int32,
+                                         count=1)[0]
+        # If it passed all the tests, then they match!
+        return True
+
+    def __check_sizes(self, sizes):
+        return np.array([(np.array(x) == y).all()
+                         for x, y in zip(self.sizes, sizes)]).all()
+
+
+def sbcstring_to_type(type_str, endianess):
+    out_type_str = ""
+    if endianess == 'little':
+        out_type_str += '<'
+    elif endianess == 'big':
+        out_type_str += '>'
+
+    string_to_type = {'char': 'i1',
+                      'int8': 'i1',
+                      'int16': 'i2',
+                      'int32': 'i4',
+                      'int64': 'i8',
+                      'uint8': 'u1',
+                      'uint16': 'u2',
+                      'uint32': 'u4',
+                      'uint64': 'u8',
+                      'single': 'f',
+                      'float32': 'f',
+                      'double': 'd',
+                      'float64': 'd',
+                      'float128': 'f16'}
+
+    return np.dtype(out_type_str+string_to_type[type_str])
+
+
+def type_to_sbcstring(type_str):
+    # Note: 'f' maps to 'single' (not 'float32') so files written from
+    # Python carry the same type names the C++ driver writes and expects.
+    type_to_string = {'i1': 'int8',
+                      'i2': 'int16',
+                      'i4': 'int32',
+                      'i8': 'int64',
+                      'u1': 'uint8',
+                      'u2': 'uint16',
+                      'u4': 'uint32',
+                      'u8': 'uint64',
+                      'f': 'single',
+                      'd': 'double',
+                      'f16': 'float128'}
+
+    return type_to_string[type_str]
+
+# with Writer("test.bin", ("a", "b"), ('i1', 'i2'), [[1], [1]]) as out:
+#     print(out.columns, out.dtypes, out.sizes)
+#     out.write({'a': [0], 'b': [2]})
+
+# with Streamer('test.bin') as data:
+#     print(data.columns, data.dtypes, data.sizes)
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..10b5424
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,10 @@
+from setuptools import find_packages, setup
+
+setup(
+    name='sbcbinaryformat',
+    packages=find_packages(include=['sbcbinaryformat']),
+    version='0.1.3',
+    description='SBC Binary Format library',
+    author='Hector Hawley Herrera',
+    install_requires=['numpy']
+)