diff --git a/Makefile b/Makefile
index e607bb3cbe..715d7aff40 100644
--- a/Makefile
+++ b/Makefile
@@ -86,11 +86,20 @@ endif
 CHPL_FLAGS += -lhdf5 -lhdf5_hl -lzmq -liconv -lidn2 -lparquet -larrow
 
-ARROW_FILE_NAME += $(ARKOUDA_SOURCE_DIR)/ArrowFunctions
-ARROW_CPP += $(ARROW_FILE_NAME).cpp
-ARROW_H += $(ARROW_FILE_NAME).h
-ARROW_O += $(ARROW_FILE_NAME).o
+ARROW_READ_FILE_NAME += $(ARKOUDA_SOURCE_DIR)/parquet/ReadParquet
+ARROW_READ_CPP += $(ARROW_READ_FILE_NAME).cpp
+ARROW_READ_H += $(ARROW_READ_FILE_NAME).h
+ARROW_READ_O += $(ARKOUDA_SOURCE_DIR)/ReadParquet.o
+ARROW_WRITE_FILE_NAME += $(ARKOUDA_SOURCE_DIR)/parquet/WriteParquet
+ARROW_WRITE_CPP += $(ARROW_WRITE_FILE_NAME).cpp
+ARROW_WRITE_H += $(ARROW_WRITE_FILE_NAME).h
+ARROW_WRITE_O += $(ARKOUDA_SOURCE_DIR)/WriteParquet.o
+
+ARROW_UTIL_FILE_NAME += $(ARKOUDA_SOURCE_DIR)/parquet/UtilParquet
+ARROW_UTIL_CPP += $(ARROW_UTIL_FILE_NAME).cpp
+ARROW_UTIL_H += $(ARROW_UTIL_FILE_NAME).h
+ARROW_UTIL_O += $(ARKOUDA_SOURCE_DIR)/UtilParquet.o
 
 .PHONY: install-deps
 install-deps: install-zmq install-hdf5 install-arrow install-iconv install-idn2
@@ -203,10 +212,30 @@ endif
 .PHONY: compile-arrow-cpp
 compile-arrow-cpp:
-	$(CHPL_CXX) -O3 -std=c++17 -c $(ARROW_CPP) -o $(ARROW_O) $(INCLUDE_FLAGS) $(ARROW_SANITIZE)
+	make compile-arrow-write
+	make compile-arrow-read
+	make compile-arrow-util
 
-$(ARROW_O): $(ARROW_CPP) $(ARROW_H)
-	make compile-arrow-cpp
+.PHONY: compile-arrow-write
+compile-arrow-write:
+	$(CHPL_CXX) -O3 -std=c++17 -c $(ARROW_WRITE_CPP) -o $(ARROW_WRITE_O) $(INCLUDE_FLAGS) $(ARROW_SANITIZE)
+
+.PHONY: compile-arrow-read
+compile-arrow-read:
+	$(CHPL_CXX) -O3 -std=c++17 -c $(ARROW_READ_CPP) -o $(ARROW_READ_O) $(INCLUDE_FLAGS) $(ARROW_SANITIZE)
+
+.PHONY: compile-arrow-util
+compile-arrow-util:
+	$(CHPL_CXX) -O3 -std=c++17 -c $(ARROW_UTIL_CPP) -o $(ARROW_UTIL_O) $(INCLUDE_FLAGS) $(ARROW_SANITIZE)
+
+$(ARROW_UTIL_O): $(ARROW_UTIL_CPP) $(ARROW_UTIL_H)
+	make compile-arrow-util
+
+$(ARROW_READ_O): $(ARROW_READ_CPP) $(ARROW_READ_H)
+	make compile-arrow-read
+
+$(ARROW_WRITE_O): $(ARROW_WRITE_CPP) $(ARROW_WRITE_H)
+	make compile-arrow-write
 
 CHPL_MAJOR := $(shell $(CHPL) --version | sed -n "s/chpl version \([0-9]\)\.[0-9]*.*/\1/p")
 CHPL_MINOR := $(shell $(CHPL) --version | sed -n "s/chpl version [0-9]\.\([0-9]*\).*/\1/p")
@@ -243,10 +272,10 @@ check-re2: $(RE2_CHECK)
 	@rm -f $(DEP_INSTALL_DIR)/$@ $(DEP_INSTALL_DIR)/$@_real
 
 ARROW_CHECK = $(DEP_INSTALL_DIR)/checkArrow.chpl
-check-arrow: $(ARROW_CHECK) $(ARROW_O)
+check-arrow: $(ARROW_CHECK) $(ARROW_UTIL_O) $(ARROW_READ_O) $(ARROW_WRITE_O)
 	@echo "Checking for Arrow"
 	make compile-arrow-cpp
-	@$(CHPL) $(CHPL_FLAGS) $(ARKOUDA_COMPAT_MODULES) $< $(ARROW_M) -M $(ARKOUDA_SOURCE_DIR) -o $(DEP_INSTALL_DIR)/$@ && ([ $$? -eq 0 ] && echo "Success compiling program") || echo "\nERROR: Please ensure that dependencies have been installed correctly (see -> https://github.com/Bears-R-Us/arkouda/blob/master/pydoc/setup/BUILD.md)\n"
+	@$(CHPL) $(CHPL_FLAGS) $(ARKOUDA_COMPAT_MODULES) $< $(ARROW_M) -M $(ARKOUDA_SOURCE_DIR) -I $(ARKOUDA_SOURCE_DIR)/parquet -o $(DEP_INSTALL_DIR)/$@ && ([ $$? -eq 0 ] && echo "Success compiling program") || echo "\nERROR: Please ensure that dependencies have been installed correctly (see -> https://github.com/Bears-R-Us/arkouda/blob/master/pydoc/setup/BUILD.md)\n"
 	$(DEP_INSTALL_DIR)/$@ -nl 1
 	@rm -f $(DEP_INSTALL_DIR)/$@ $(DEP_INSTALL_DIR)/$@_real
@@ -349,15 +378,15 @@ endif
 SERVER_CONFIG_SCRIPT=$(ARKOUDA_SOURCE_DIR)/parseServerConfig.py
 
 # This is the main compilation statement section
-$(ARKOUDA_MAIN_MODULE): check-deps $(ARROW_O) $(ARKOUDA_SOURCES) $(ARKOUDA_MAKEFILES)
+$(ARKOUDA_MAIN_MODULE): check-deps $(ARROW_UTIL_O) $(ARROW_READ_O) $(ARROW_WRITE_O) $(ARKOUDA_SOURCES) $(ARKOUDA_MAKEFILES)
 	$(eval MOD_GEN_OUT=$(shell python3 $(SERVER_CONFIG_SCRIPT) $(ARKOUDA_CONFIG_FILE) $(ARKOUDA_SOURCE_DIR)))
-	$(CHPL) $(CHPL_DEBUG_FLAGS) $(PRINT_PASSES_FLAGS) $(REGEX_MAX_CAPTURES_FLAG) $(OPTIONAL_SERVER_FLAGS) $(CHPL_FLAGS_WITH_VERSION) $(CHPL_COMPAT_FLAGS) $(ARKOUDA_MAIN_SOURCE) $(ARKOUDA_COMPAT_MODULES) $(ARKOUDA_SERVER_USER_MODULES) $(MOD_GEN_OUT) $(ARKOUDA_RW_DEFAULT_FLAG) -o $@
+	$(CHPL) $(CHPL_DEBUG_FLAGS) $(PRINT_PASSES_FLAGS) $(REGEX_MAX_CAPTURES_FLAG) $(OPTIONAL_SERVER_FLAGS) $(CHPL_FLAGS_WITH_VERSION) $(CHPL_COMPAT_FLAGS) $(ARKOUDA_MAIN_SOURCE) $(ARKOUDA_COMPAT_MODULES) $(ARKOUDA_SERVER_USER_MODULES) $(MOD_GEN_OUT) $(ARKOUDA_RW_DEFAULT_FLAG) -I$(ARKOUDA_SOURCE_DIR)/parquet -o $@
 
 CLEAN_TARGETS += arkouda-clean
 .PHONY: arkouda-clean
 arkouda-clean:
-	$(RM) $(ARKOUDA_MAIN_MODULE) $(ARKOUDA_MAIN_MODULE)_real $(ARROW_O)
+	$(RM) $(ARKOUDA_MAIN_MODULE) $(ARKOUDA_MAIN_MODULE)_real $(ARROW_UTIL_O) $(ARROW_READ_O) $(ARROW_WRITE_O)
 
 .PHONY: tags
 tags:
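[Note, not part of the diff] For orientation, a hypothetical sketch of the kind of C-compatible surface the three new translation units divide up. The real declarations live in src/parquet/UtilParquet.h, ReadParquet.h, and WriteParquet.h; the names and signatures below are illustrative only:

```cpp
#include <cstdint>

// Because Chapel binds these symbols directly, each header's public
// surface is extern "C"; the three .o files can then be compiled
// independently, as the new Makefile rules do.
extern "C" {
  // UtilParquet: metadata/type queries shared by readers and writers
  int64_t c_getNumRows(const char* filename, char** errMsg);
  // ReadParquet: column readers
  int c_readColumnByName(const char* filename, void* chpl_arr, const char* colname,
                         int64_t numElems, int64_t startIdx, int64_t batchSize,
                         char** errMsg);
  // WriteParquet: column writers
  int c_writeColumnToParquet(const char* filename, void* chpl_arr, int64_t colnum,
                             const char* dsetname, int64_t numelems,
                             int64_t rowGroupSize, int64_t dtype,
                             int64_t compression, char** errMsg);
}
```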
diff --git a/dep/checkArrow.chpl b/dep/checkArrow.chpl
index d1a47524ea..f70b72736e 100644
--- a/dep/checkArrow.chpl
+++ b/dep/checkArrow.chpl
@@ -1,8 +1,12 @@
 use IO;
 use CTypes;
 
-require "../src/ArrowFunctions.h";
-require "../src/ArrowFunctions.o";
+require "../src/parquet/WriteParquet.h";
+require "../src/WriteParquet.o";
+require "../src/parquet/ReadParquet.h";
+require "../src/ReadParquet.o";
+require "../src/parquet/UtilParquet.h";
+require "../src/UtilParquet.o";
 
 proc getVersionInfo() {
   extern proc c_getVersionInfo(): c_ptrConst(c_char);
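[Note, not part of the diff] The new `require` statements bind one header/object pair per concern. A minimal sketch of the interop pattern they rely on, modeled on the `c_getVersionInfo` symbol used just above (the actual wrapper body may differ):

```cpp
#include <cstring>
#include <arrow/util/config.h>

// C++ side: free to use Arrow internally, but must hand Chapel something
// C-compatible (here, a malloc'd C string the caller is expected to free).
static const char* cpp_getVersionInfo(void) {
  return strdup(ARROW_VERSION_STRING);
}

// C side: the unmangled symbol that checkArrow.chpl binds with
// `extern proc c_getVersionInfo(): c_ptrConst(c_char);`
extern "C" const char* c_getVersionInfo(void) {
  return cpp_getVersionInfo();
}
```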
diff --git a/src/ArrowFunctions.cpp b/src/ArrowFunctions.cpp
deleted file mode 100644
index 31bfa6f2a5..0000000000
--- a/src/ArrowFunctions.cpp
+++ /dev/null
@@ -1,2200 +0,0 @@
-#include "ArrowFunctions.h"
-
-static std::map<std::string, std::shared_ptr<parquet::ParquetFileReader>> globalFiles;
-static std::map<std::string, std::shared_ptr<parquet::RowGroupReader>> globalRowGroupReaders;
-static std::map<std::string, std::shared_ptr<parquet::ColumnReader>> globalColumnReaders;
-
-/*
-  Arrow Error Helpers
-  -------------------
-  Arrow provides PARQUET_ASSIGN_OR_THROW and other similar macros
-  to help with error handling, but since we are doing something
-  unique (passing back the error message to Chapel to be displayed),
-  these helpers are similar to the provided macros but matching our
-  functionality.
-*/
-
-// The `ARROWRESULT_OK` macro should be used when trying to
-// assign the result of an Arrow/Parquet function to a value that can
-// potentially throw an error, so the argument `cmd` is the Arrow
-// command to execute and `res` is the desired variable to store the
-// result
-#define ARROWRESULT_OK(cmd, res)                                \
-  {                                                             \
-    auto result = cmd;                                          \
-    if(!result.ok()) {                                          \
-      *errMsg = strdup(result.status().message().c_str());      \
-      return ARROWERROR;                                        \
-    }                                                           \
-    res = result.ValueOrDie();                                  \
-  }
-
-// The `ARROWSTATUS_OK` macro should be used when calling an
-// Arrow/Parquet function that returns a status. The `cmd`
-// argument should be the Arrow function to execute.
-#define ARROWSTATUS_OK(cmd)                     \
-  if(!check_status_ok(cmd, errMsg))             \
-    return ARROWERROR;
-
-bool check_status_ok(arrow::Status status, char** errMsg) {
-  if(!status.ok()) {
-    *errMsg = strdup(status.message().c_str());
-    return false;
-  }
-  return true;
-}
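[Note, not part of the diff] A minimal sketch of how the two helpers compose inside a wrapper (hypothetical function, shaped like cpp_getNumRows below): Result values go through ARROWRESULT_OK, Status values through ARROWSTATUS_OK, and anything thrown is caught and copied into *errMsg for Chapel to surface.

```cpp
int64_t cpp_exampleNumRows(const char* filename, char** errMsg) {
  try {
    std::shared_ptr<arrow::io::ReadableFile> infile;
    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
                   infile);                  // Result<T>: unwrap, or set *errMsg and bail

    std::unique_ptr<parquet::arrow::FileReader> reader;
    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
                                             // Status: continue, or set *errMsg and bail
    return reader->parquet_reader()->metadata()->num_rows();
  } catch (const std::exception& e) {
    *errMsg = strdup(e.what());              // exceptions also become error strings
    return ARROWERROR;
  }
}
```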
-
-/*
-  C++ functions
-  -------------
-  These C++ functions are used to call into the Arrow library
-  and are then called to by their corresponding C functions to
-  allow interoperability with Chapel. This means that all of the
-  C++ functions must return types that are C compatible.
-*/
-
-int64_t cpp_getNumRows(const char* filename, char** errMsg) {
-  try {
-    std::shared_ptr<arrow::io::ReadableFile> infile;
-    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
-                   infile);
-
-    std::unique_ptr<parquet::arrow::FileReader> reader;
-    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
-
-    return reader -> parquet_reader() -> metadata() -> num_rows();
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_getPrecision(const char* filename, const char* colname, char** errMsg) {
-  try {
-    std::shared_ptr<arrow::io::ReadableFile> infile;
-    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
-                   infile);
-
-    std::unique_ptr<parquet::arrow::FileReader> reader;
-    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
-
-    std::shared_ptr<arrow::Schema> sc;
-    std::shared_ptr<arrow::Schema>* out = &sc;
-    ARROWSTATUS_OK(reader->GetSchema(out));
-
-    int idx = sc -> GetFieldIndex(colname);
-
-    const auto& decimal_type = static_cast<const arrow::Decimal128Type&>(*sc->field(idx)->type());
-    const int64_t precision = decimal_type.precision();
-
-    return precision;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_getType(const char* filename, const char* colname, char** errMsg) {
-  try {
-    std::shared_ptr<arrow::io::ReadableFile> infile;
-    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
-                   infile);
-
-    std::unique_ptr<parquet::arrow::FileReader> reader;
-    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
-
-    std::shared_ptr<arrow::Schema> sc;
-    std::shared_ptr<arrow::Schema>* out = &sc;
-    ARROWSTATUS_OK(reader->GetSchema(out));
-
-    int idx = sc -> GetFieldIndex(colname);
-    // Since this doesn't actually throw a Parquet error, we have to generate
-    // our own error message for this case
-    if(idx == -1) {
-      std::string fname(filename);
-      std::string dname(colname);
-      std::string msg = "Dataset: " + dname + " does not exist in file: " + filename;
-      *errMsg = strdup(msg.c_str());
-      return ARROWERROR;
-    }
-    auto myType = sc -> field(idx) -> type();
-
-    if(myType->id() == arrow::Type::INT64)
-      return ARROWINT64;
-    else if(myType->id() == arrow::Type::INT32 || myType->id() == arrow::Type::INT16)
-      return ARROWINT32; // int16 is logical type, stored as int32
-    else if(myType->id() == arrow::Type::UINT64)
-      return ARROWUINT64;
-    else if(myType->id() == arrow::Type::UINT32 ||
-            myType->id() == arrow::Type::UINT16)
-      return ARROWUINT32; // uint16 is logical type, stored as uint32
-    else if(myType->id() == arrow::Type::TIMESTAMP)
-      return ARROWTIMESTAMP;
-    else if(myType->id() == arrow::Type::BOOL)
-      return ARROWBOOLEAN;
-    else if(myType->id() == arrow::Type::STRING ||
-            myType->id() == arrow::Type::BINARY ||
-            myType->id() == arrow::Type::LARGE_STRING)
-      return ARROWSTRING;
-    else if(myType->id() == arrow::Type::FLOAT)
-      return ARROWFLOAT;
-    else if(myType->id() == arrow::Type::DOUBLE)
-      return ARROWDOUBLE;
-    else if(myType->id() == arrow::Type::LIST)
-      return ARROWLIST;
-    else if(myType->id() == arrow::Type::DECIMAL)
-      return ARROWDECIMAL;
-    else {
-      std::string fname(filename);
-      std::string dname(colname);
-      std::string msg = "Unsupported type on column: " + dname + " in " + filename;
-      *errMsg = strdup(msg.c_str());
-      return ARROWERROR;
-    }
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_getListType(const char* filename, const char* colname, char** errMsg) {
-  try {
-    std::shared_ptr<arrow::io::ReadableFile> infile;
-    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
-                   infile);
-
-    std::unique_ptr<parquet::arrow::FileReader> reader;
-    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
-
-    std::shared_ptr<arrow::Schema> sc;
-    std::shared_ptr<arrow::Schema>* out = &sc;
-    ARROWSTATUS_OK(reader->GetSchema(out));
-
-    int idx = sc -> GetFieldIndex(colname);
-    // Since this doesn't actually throw a Parquet error, we have to generate
-    // our own error message for this case
-    if(idx == -1) {
-      std::string fname(filename);
-      std::string dname(colname);
-      std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
-      *errMsg = strdup(msg.c_str());
-      return ARROWERROR;
-    }
-    auto myType = sc -> field(idx) -> type();
-
-    if (myType->id() == arrow::Type::LIST) {
-      if (myType->num_fields() != 1) {
-        std::string fname(filename);
-        std::string dname(colname);
-        std::string msg = "Column " + dname + " in " + fname + " cannot be read by Arkouda.";
-        *errMsg = strdup(msg.c_str());
-        return ARROWERROR;
-      }
-      else {
-        // fields returns a vector of fields, but here we are expecting lists so should only contain 1 item here
-        auto field = myType->fields()[0];
-        auto f_type = field->type();
-        if(f_type->id() == arrow::Type::INT64)
-          return ARROWINT64;
-        else if(f_type->id() == arrow::Type::INT32 || f_type->id() == arrow::Type::INT16)
-          return ARROWINT32;
-        else if(f_type->id() == arrow::Type::UINT64)
-          return ARROWUINT64;
-        else if(f_type->id() == arrow::Type::UINT32 || f_type->id() == arrow::Type::UINT16)
-          return ARROWUINT32;
-        else if(f_type->id() == arrow::Type::TIMESTAMP)
-          return ARROWTIMESTAMP;
-        else if(f_type->id() == arrow::Type::BOOL)
-          return ARROWBOOLEAN;
-        else if(f_type->id() == arrow::Type::STRING ||
-                f_type->id() == arrow::Type::BINARY ||
-                f_type->id() == arrow::Type::LARGE_STRING) // Verify that this is functional as expected
-          return ARROWSTRING;
-        else if(f_type->id() == arrow::Type::FLOAT)
-          return ARROWFLOAT;
-        else if(f_type->id() == arrow::Type::DOUBLE)
-          return ARROWDOUBLE;
-        else {
-          std::string fname(filename);
-          std::string dname(colname);
-          std::string msg = "Unsupported type on column: " + dname + " in " + fname;
-          *errMsg = strdup(msg.c_str());
-          return ARROWERROR;
-        }
-      }
-    }
-    else {
-      std::string fname(filename);
-      std::string dname(colname);
-      std::string msg = "Column " + dname + " in " + fname + " is not a List";
-      *errMsg = strdup(msg.c_str());
-      return ARROWERROR;
-    }
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
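[Note, not part of the diff] A small standalone sketch of the schema inspection these functions rely on; it prints each column's reported Arrow type, which makes the logical-versus-physical distinction above visible (e.g., an int16 column reports int32 storage). The filename is a placeholder:

```cpp
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/arrow/reader.h>
#include <parquet/exception.h>
#include <iostream>

int main() {
  // Open the file and build an Arrow-level reader (error handling via
  // Parquet's own macros here, not the Chapel-facing helpers above).
  std::shared_ptr<arrow::io::ReadableFile> infile =
      arrow::io::ReadableFile::Open("data.parquet").ValueOrDie();
  std::unique_ptr<parquet::arrow::FileReader> reader;
  PARQUET_THROW_NOT_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));

  std::shared_ptr<arrow::Schema> sc;
  PARQUET_THROW_NOT_OK(reader->GetSchema(&sc));
  for (const auto& field : sc->fields())
    std::cout << field->name() << ": " << field->type()->ToString() << "\n";
}
```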
-
-int64_t cpp_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) {
-  try {
-    int64_t ty = cpp_getType(filename, colname, errMsg);
-    int64_t dty; // used to store the type of data so we can handle lists
-    if (ty == ARROWLIST) { // get the type of the list so we can verify it is ARROWSTRING
-      dty = cpp_getListType(filename, colname, errMsg);
-    }
-    else {
-      dty = ty;
-    }
-    auto offsets = (int64_t*)chpl_offsets;
-    int64_t byteSize = 0;
-
-    if(dty == ARROWSTRING) {
-      std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
-        parquet::ParquetFileReader::OpenFile(filename, false);
-
-      std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
-      int num_row_groups = file_metadata->num_row_groups();
-
-      int64_t i = 0;
-      for (int r = 0; r < num_row_groups; r++) {
-        std::shared_ptr<parquet::RowGroupReader> row_group_reader =
-          parquet_reader->RowGroup(r);
-
-        int64_t values_read = 0;
-
-        std::shared_ptr<parquet::ColumnReader> column_reader;
-
-        int64_t idx;
-        if (ty == ARROWLIST) {
-          idx = file_metadata -> schema() -> group_node() -> FieldIndex(colname);
-        } else {
-          idx = file_metadata -> schema() -> ColumnIndex(colname);
-        }
-
-        if(idx < 0) {
-          std::string dname(colname);
-          std::string fname(filename);
-          std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
-          *errMsg = strdup(msg.c_str());
-          return ARROWERROR;
-        }
-        column_reader = row_group_reader->Column(idx);
-
-        int16_t definition_level;
-        parquet::ByteArrayReader* ba_reader =
-          static_cast<parquet::ByteArrayReader*>(column_reader.get());
-
-        int64_t numRead = 0;
-        while (ba_reader->HasNext() && numRead < numElems) {
-          parquet::ByteArray value;
-          (void)ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
-          if ((ty == ARROWLIST && definition_level == 3) || ty == ARROWSTRING) {
-            if(values_read > 0) {
-              offsets[i] = value.len + 1;
-              byteSize += value.len + 1;
-              numRead += values_read;
-            } else {
-              offsets[i] = 1;
-              byteSize+=1;
-              numRead+=1;
-            }
-            i++;
-          }
-        }
-      }
-      return byteSize;
-    }
-    return ARROWERROR;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
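[Note, not part of the diff] The sizing rule above in isolation, as a minimal sketch (hypothetical helper): each value costs len + 1 bytes to leave room for a null terminator, and a null entry still costs 1 byte so the packed buffer keeps one terminated segment per element. The Chapel side then scans these per-element sizes into offsets.

```cpp
#include <cstdint>
#include <vector>

// lens[i] is the byte length of string i, or -1 for a null entry.
// sizes[i] receives the per-element byte budget; returns the buffer total.
int64_t stringColumnBytes(const std::vector<int64_t>& lens,
                          std::vector<int64_t>& sizes) {
  int64_t byteSize = 0;
  for (size_t i = 0; i < lens.size(); i++) {
    sizes[i] = (lens[i] < 0 ? 0 : lens[i]) + 1;  // payload + terminator
    byteSize += sizes[i];
  }
  return byteSize;
}
```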
-
-int64_t cpp_getListColumnSize(const char* filename, const char* colname, void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg) {
-  try {
-    int64_t ty = cpp_getType(filename, colname, errMsg);
-    auto seg_sizes = (int64_t*)chpl_seg_sizes;
-    int64_t listSize = 0;
-
-    if (ty == ARROWLIST){
-      int64_t lty = cpp_getListType(filename, colname, errMsg);
-      std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
-        parquet::ParquetFileReader::OpenFile(filename, false);
-
-      std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
-      int num_row_groups = file_metadata->num_row_groups();
-
-      auto idx = file_metadata -> schema() -> group_node() -> FieldIndex(colname);
-      if(idx < 0) {
-        std::string dname(colname);
-        std::string fname(filename);
-        std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
-        *errMsg = strdup(msg.c_str());
-        return ARROWERROR;
-      }
-
-      int64_t i = 0;
-      int64_t vct = 0;
-      int64_t seg_size = 0;
-      int64_t off = 0;
-      bool first = true;
-      for (int r = 0; r < num_row_groups; r++) {
-        std::shared_ptr<parquet::RowGroupReader> row_group_reader =
-          parquet_reader->RowGroup(r);
-
-        int64_t values_read = 0;
-
-        std::shared_ptr<parquet::ColumnReader> column_reader;
-
-        column_reader = row_group_reader->Column(idx);
-        int16_t definition_level;
-        int16_t rep_lvl;
-
-        if(lty == ARROWINT64 || lty == ARROWUINT64) {
-          parquet::Int64Reader* int_reader =
-            static_cast<parquet::Int64Reader*>(column_reader.get());
-
-          while (int_reader->HasNext()) {
-            int64_t value;
-            (void)int_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read);
-            if (values_read == 0 || (!first && rep_lvl == 0)) {
-              seg_sizes[i] = seg_size;
-              i++;
-              seg_size = 0;
-            }
-            if (values_read != 0) {
-              seg_size++;
-              vct++;
-              if (first) {
-                first = false;
-              }
-            }
-            if (values_read != 0 && !int_reader->HasNext()){
-              seg_sizes[i] = seg_size;
-            }
-          }
-        } else if(lty == ARROWINT32 || lty == ARROWUINT32) {
-          parquet::Int32Reader* int_reader =
-            static_cast<parquet::Int32Reader*>(column_reader.get());
-
-          while (int_reader->HasNext()) {
-            int32_t value;
-            (void)int_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read);
-            if (values_read == 0 || (!first && rep_lvl == 0)) {
-              seg_sizes[i] = seg_size;
-              i++;
-              seg_size = 0;
-            }
-            if (values_read != 0) {
-              seg_size++;
-              vct++;
-              if (first) {
-                first = false;
-              }
-            }
-            if (values_read != 0 && !int_reader->HasNext()){
-              seg_sizes[i] = seg_size;
-            }
-          }
-        } else if (lty == ARROWSTRING) {
-          parquet::ByteArrayReader* reader =
-            static_cast<parquet::ByteArrayReader*>(column_reader.get());
-
-          while (reader->HasNext()) {
-            parquet::ByteArray value;
-            (void)reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read);
-            if (values_read == 0 || (!first && rep_lvl == 0)) {
-              seg_sizes[i] = seg_size;
-              i++;
-              seg_size = 0;
-            }
-            if (values_read != 0) {
-              seg_size++;
-              vct++;
-              if (first) {
-                first = false;
-              }
-            }
-            if (values_read != 0 && !reader->HasNext()){
-              seg_sizes[i] = seg_size;
-            }
-          }
-        } else if(lty == ARROWBOOLEAN) {
-          parquet::BoolReader* bool_reader =
-            static_cast<parquet::BoolReader*>(column_reader.get());
-
-          while (bool_reader->HasNext()) {
-            bool value;
-            (void)bool_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read);
-            if (values_read == 0 || (!first && rep_lvl == 0)) {
-              seg_sizes[i] = seg_size;
-              i++;
-              seg_size = 0;
-            }
-            if (values_read != 0) {
-              seg_size++;
-              vct++;
-              if (first) {
-                first = false;
-              }
-            }
-            if (values_read != 0 && !bool_reader->HasNext()){
-              seg_sizes[i] = seg_size;
-            }
-          }
-        } else if (lty == ARROWFLOAT) {
-          parquet::FloatReader* float_reader =
-            static_cast<parquet::FloatReader*>(column_reader.get());
-
-          int64_t numRead = 0;
-          while (float_reader->HasNext()) {
-            float value;
-            (void)float_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read);
-            if ((values_read == 0 && definition_level != 2) || (!first && rep_lvl == 0)) {
-              seg_sizes[i] = seg_size;
-              i++;
-              seg_size = 0;
-            }
-            if (values_read != 0 || (values_read == 0 && definition_level == 2)) {
-              seg_size++;
-              vct++;
-              if (first) {
-                first = false;
-              }
-            }
-            if ((values_read != 0 || (values_read == 0 && definition_level == 2)) && !float_reader->HasNext()){
-              seg_sizes[i] = seg_size;
-            }
-          }
-        } else if(lty == ARROWDOUBLE) {
-          parquet::DoubleReader* dbl_reader =
-            static_cast<parquet::DoubleReader*>(column_reader.get());
-
-          while (dbl_reader->HasNext()) {
-            double value;
-            (void)dbl_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read);
-            if ((values_read == 0 && definition_level != 2) || (!first && rep_lvl == 0)) {
-              seg_sizes[i] = seg_size;
-              i++;
-              seg_size = 0;
-            }
-            if (values_read != 0 || (values_read == 0 && definition_level == 2)) {
-              seg_size++;
-              vct++;
-              if (first) {
-                first = false;
-              }
-            }
-            if ((values_read != 0 || (values_read == 0 && definition_level == 2)) && !dbl_reader->HasNext()){
-              seg_sizes[i] = seg_size;
-            }
-          }
-        }
-      }
-      return vct;
-    }
-    return ARROWERROR;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
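[Note, not part of the diff] A worked example of the definition/repetition levels this loop decodes, assuming the standard three-level LIST encoding used throughout this file:

```cpp
// A list column with rows [[1, 2], [], [3]] is flattened to:
//
//   values:  1  2  (none)  3
//   rep_lvl: 0  1    0     0   // 0 = first value of a row, 1 = continuation
//   def_lvl: 3  3    1     3   // 3 = item present, 1 = row exists but is empty
//
// Hence the reader closes a segment when rep_lvl == 0 on a non-first value
// (a new row began) or when ReadBatch yields no value (a null/empty row).
```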
-
-int64_t cpp_getStringColumnNullIndices(const char* filename, const char* colname, void* chpl_nulls, char** errMsg) {
-  try {
-    int64_t ty = cpp_getType(filename, colname, errMsg);
-    auto null_indices = (int64_t*)chpl_nulls;
-    int64_t byteSize = 0;
-
-    if(ty == ARROWSTRING) {
-      std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
-        parquet::ParquetFileReader::OpenFile(filename, false);
-
-      std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
-      int num_row_groups = file_metadata->num_row_groups();
-
-      int64_t i = 0;
-      for (int r = 0; r < num_row_groups; r++) {
-        std::shared_ptr<parquet::RowGroupReader> row_group_reader =
-          parquet_reader->RowGroup(r);
-
-        int64_t values_read = 0;
-
-        std::shared_ptr<parquet::ColumnReader> column_reader;
-
-        auto idx = file_metadata -> schema() -> ColumnIndex(colname);
-
-        if(idx < 0) {
-          std::string dname(colname);
-          std::string fname(filename);
-          std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
-          *errMsg = strdup(msg.c_str());
-          return ARROWERROR;
-        }
-        column_reader = row_group_reader->Column(idx);
-        int16_t definition_level;
-        parquet::ByteArrayReader* ba_reader =
-          static_cast<parquet::ByteArrayReader*>(column_reader.get());
-
-        while (ba_reader->HasNext()) {
-          parquet::ByteArray value;
-          (void)ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
-          if(values_read == 0)
-            null_indices[i] = 1;
-          i++;
-        }
-      }
-      return 0;
-    }
-    return ARROWERROR;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_readListColumnByName(const char* filename, void* chpl_arr, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) {
-  try {
-    int64_t ty = cpp_getType(filename, colname, errMsg);
-    if (ty == ARROWLIST){
-      int64_t lty = cpp_getListType(filename, colname, errMsg);
-      std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
-        parquet::ParquetFileReader::OpenFile(filename, false);
-
-      std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
-      int num_row_groups = file_metadata->num_row_groups();
-
-      auto idx = file_metadata -> schema() -> group_node() -> FieldIndex(colname);
-      if(idx < 0) {
-        std::string dname(colname);
-        std::string fname(filename);
-        std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
-        *errMsg = strdup(msg.c_str());
-        return ARROWERROR;
-      }
-
-      int64_t i = 0;
-      int64_t arrayIdx = 0;
-      for (int r = 0; r < num_row_groups; r++) {
-        std::shared_ptr<parquet::RowGroupReader> row_group_reader =
-          parquet_reader->RowGroup(r);
-
-        int64_t values_read = 0;
-        int16_t definition_level; // needed for any type that is nullable
-
-        std::shared_ptr<parquet::ColumnReader> column_reader = row_group_reader->Column(idx);
-        if(lty == ARROWINT64 || lty == ARROWUINT64) {
-          int16_t definition_level; // nullable type and only reading single records in batch
-          auto chpl_ptr = (int64_t*)chpl_arr;
-          parquet::Int64Reader* reader =
-            static_cast<parquet::Int64Reader*>(column_reader.get());
-          startIdx -= reader->Skip(startIdx);
-
-          while (reader->HasNext() && arrayIdx < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[arrayIdx], &values_read);
-            // if values_read is 0, that means that it was an empty seg
-            if (values_read != 0) {
-              arrayIdx++;
-            }
-            i++;
-          }
-        } else if(lty == ARROWINT32 || lty == ARROWUINT32) {
-          int16_t definition_level; // nullable type and only reading single records in batch
-          auto chpl_ptr = (int64_t*)chpl_arr;
-          parquet::Int32Reader* reader =
-            static_cast<parquet::Int32Reader*>(column_reader.get());
-          startIdx -= reader->Skip(startIdx);
-
-          int32_t tmp;
-          while (reader->HasNext() && arrayIdx < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &tmp, &values_read);
-            // if values_read is 0, that means that it was an empty seg
-            if (values_read != 0) {
-              chpl_ptr[arrayIdx] = (int64_t)tmp;
-              arrayIdx++;
-            }
-            i++;
-          }
-        } else if (lty == ARROWSTRING) {
-          int16_t definition_level; // nullable type and only reading single records in batch
-          auto chpl_ptr = (unsigned char*)chpl_arr;
-          parquet::ByteArrayReader* reader =
-            static_cast<parquet::ByteArrayReader*>(column_reader.get());
-
-          while (reader->HasNext()) {
-            parquet::ByteArray value;
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
-            // if values_read is 0, that means that it was a null value
-            if(values_read > 0 && definition_level == 3) {
-              for(int j = 0; j < value.len; j++) {
-                chpl_ptr[i] = value.ptr[j];
-                i++;
-              }
-              i++; // skip one space so the strings are null terminated with a 0
-            }
-          }
-        } else if(lty == ARROWBOOLEAN) {
-          int16_t definition_level; // nullable type and only reading single records in batch
-          auto chpl_ptr = (bool*)chpl_arr;
-          parquet::BoolReader* reader =
-            static_cast<parquet::BoolReader*>(column_reader.get());
-          startIdx -= reader->Skip(startIdx);
-
-          while (reader->HasNext() && arrayIdx < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[arrayIdx], &values_read);
-            // if values_read is 0, that means that it was an empty seg
-            if (values_read != 0) {
-              arrayIdx++;
-            }
-            i++;
-          }
-        } else if(lty == ARROWFLOAT) {
-          // convert to simpler single batch to sidestep this seemingly architecture dependent (see issue #3234)
-          int16_t definition_level; // nullable type and only reading single records in batch
-          auto chpl_ptr = (double*)chpl_arr;
-          parquet::FloatReader* reader =
-            static_cast<parquet::FloatReader*>(column_reader.get());
-
-          float tmp;
-          while (reader->HasNext() && arrayIdx < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &tmp, &values_read);
-            // if values_read is 0, that means that it was a null value or empty seg
-            if (values_read != 0) {
-              chpl_ptr[arrayIdx] = (double) tmp;
-              arrayIdx++;
-            }
-            else {
-              // check if nan otherwise it's an empty seg
-              if (definition_level == 2) {
-                chpl_ptr[arrayIdx] = NAN;
-                arrayIdx++;
-              }
-            }
-            i++;
-          }
-        } else if(lty == ARROWDOUBLE) {
-          // convert to simpler single batch to sidestep this seemingly architecture dependent (see issue #3234)
-          int16_t definition_level; // nullable type and only reading single records in batch
-          auto chpl_ptr = (double*)chpl_arr;
-          parquet::DoubleReader* reader =
-            static_cast<parquet::DoubleReader*>(column_reader.get());
-
-          while (reader->HasNext() && arrayIdx < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[arrayIdx], &values_read);
-            // if values_read is 0, that means that it was a null value or empty seg
-            if (values_read != 0) {
-              arrayIdx++;
-            }
-            else {
-              // check if nan otherwise it's an empty seg
-              if (definition_level == 2) {
-                chpl_ptr[arrayIdx] = NAN;
-                arrayIdx++;
-              }
-            }
-            i++;
-          }
-        }
-      }
-      return 0;
-    }
-    return ARROWERROR;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg) {
-  try {
-    int64_t ty = cpp_getType(filename, colname, errMsg);
-
-    std::unique_ptr<parquet::ParquetFileReader> parquet_reader =
-      parquet::ParquetFileReader::OpenFile(filename, false);
-
-    std::shared_ptr<parquet::FileMetaData> file_metadata = parquet_reader->metadata();
-    int num_row_groups = file_metadata->num_row_groups();
-
-    int64_t i = 0;
-    for (int r = 0; r < num_row_groups; r++) {
-      std::shared_ptr<parquet::RowGroupReader> row_group_reader =
-        parquet_reader->RowGroup(r);
-
-      int64_t values_read = 0;
-
-      std::shared_ptr<parquet::ColumnReader> column_reader;
-
-      auto idx = file_metadata -> schema() -> ColumnIndex(colname);
-      auto max_def = file_metadata -> schema() -> Column(idx) -> max_definition_level(); // needed to determine if nulls are allowed
-
-      if(idx < 0) {
-        std::string dname(colname);
-        std::string fname(filename);
-        std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
-        *errMsg = strdup(msg.c_str());
-        return ARROWERROR;
-      }
-
-      column_reader = row_group_reader->Column(idx);
-
-      // Since int64 and uint64 Arrow dtypes share a physical type and only differ
-      // in logical type, they must be read from the file in the same way
-      if(ty == ARROWINT64 || ty == ARROWUINT64) {
-        int16_t definition_level; // nullable type and only reading single records in batch
-        auto chpl_ptr = (int64_t*)chpl_arr;
-        parquet::Int64Reader* reader =
-          static_cast<parquet::Int64Reader*>(column_reader.get());
-        startIdx -= reader->Skip(startIdx);
-
-        if (not hasNonFloatNulls) {
-          while (reader->HasNext() && i < numElems) {
-            if((numElems - i) < batchSize) // adjust batchSize if needed
-              batchSize = numElems - i;
-            (void)reader->ReadBatch(batchSize, nullptr, nullptr, &chpl_ptr[i], &values_read);
-            i+=values_read;
-          }
-        }
-        else {
-          while (reader->HasNext() && i < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[i], &values_read);
-            // if values_read is 0, that means that it was a null value
-            if(values_read == 0) {
-              where_null_chpl[i] = true;
-            }
-            i++;
-          }
-        }
-      } else if(ty == ARROWINT32 || ty == ARROWUINT32) {
-        int16_t definition_level; // nullable type and only reading single records in batch
-        auto chpl_ptr = (int64_t*)chpl_arr;
-        parquet::Int32Reader* reader =
-          static_cast<parquet::Int32Reader*>(column_reader.get());
-        startIdx -= reader->Skip(startIdx);
-
-        if (not hasNonFloatNulls) {
-          int32_t* tmpArr = (int32_t*)malloc(batchSize * sizeof(int32_t));
-          while (reader->HasNext() && i < numElems) {
-            if((numElems - i) < batchSize) // adjust batchSize if needed
-              batchSize = numElems - i;
-
-            // Can't read directly into chpl_ptr because it is int64
-            (void)reader->ReadBatch(batchSize, nullptr, nullptr, tmpArr, &values_read);
-            for (int64_t j = 0; j < values_read; j++)
-              chpl_ptr[i+j] = (int64_t)tmpArr[j];
-            i+=values_read;
-          }
-          free(tmpArr);
-        }
-        else {
-          int32_t tmp;
-          while (reader->HasNext() && i < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &tmp, &values_read);
-            // if values_read is 0, that means that it was a null value
-            if(values_read == 0) {
-              where_null_chpl[i] = true;
-            }
-            else {
-              chpl_ptr[i] = (int64_t)tmp;
-            }
-            i++;
-          }
-        }
-      } else if(ty == ARROWBOOLEAN) {
-        int16_t definition_level; // nullable type and only reading single records in batch
-        auto chpl_ptr = (bool*)chpl_arr;
-        parquet::BoolReader* reader =
-          static_cast<parquet::BoolReader*>(column_reader.get());
-        startIdx -= reader->Skip(startIdx);
-
-        if (not hasNonFloatNulls) {
-          while (reader->HasNext() && i < numElems) {
-            if((numElems - i) < batchSize) // adjust batchSize if needed
-              batchSize = numElems - i;
-            (void)reader->ReadBatch(batchSize, nullptr, nullptr, &chpl_ptr[i], &values_read);
-            i+=values_read;
-          }
-        }
-        else {
-          while (reader->HasNext() && i < numElems) {
-            (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[i], &values_read);
-            // if values_read is 0, that means that it was a null value
-            if(values_read == 0) {
-              where_null_chpl[i] = true;
-            }
-            i++;
-          }
-        }
-      } else if(ty == ARROWSTRING) {
-        int16_t definition_level; // nullable type and only reading single records in batch
-        auto chpl_ptr = (unsigned char*)chpl_arr;
-        parquet::ByteArrayReader* reader =
-          static_cast<parquet::ByteArrayReader*>(column_reader.get());
-
-        while (reader->HasNext()) {
-          parquet::ByteArray value;
-          (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
-          // if values_read is 0, that means that it was a null value
-          if(values_read > 0) {
-            for(int j = 0; j < value.len; j++) {
-              chpl_ptr[i] = value.ptr[j];
-              i++;
-            }
-          }
-          i++; // skip one space so the strings are null terminated with a 0
-        }
-      } else if(ty == ARROWFLOAT) {
-        int16_t definition_level; // nullable type and only reading single records in batch
-        auto chpl_ptr = (double*)chpl_arr;
-        parquet::FloatReader* reader =
-          static_cast<parquet::FloatReader*>(column_reader.get());
-        startIdx -= reader->Skip(startIdx);
-
-        while (reader->HasNext() && i < numElems) {
-          float value;
-          (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
-          // if values_read is 0, that means that it was a null value
-          if(values_read > 0) {
-            // this means it wasn't null
-            chpl_ptr[i] = (double) value;
-          } else {
-            chpl_ptr[i] = NAN;
-          }
-          i++;
-        }
-      } else if(ty == ARROWDOUBLE) {
-        int16_t definition_level; // nullable type and only reading single records in batch
-        auto chpl_ptr = (double*)chpl_arr;
-        parquet::DoubleReader* reader =
-          static_cast<parquet::DoubleReader*>(column_reader.get());
-        startIdx -= reader->Skip(startIdx);
-
-        while (reader->HasNext() && i < numElems) {
-          double value;
-          (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read);
-          // if values_read is 0, that means that it was a null value
-          if(values_read > 0) {
-            // this means it wasn't null
-            chpl_ptr[i] = value;
-          } else {
-            chpl_ptr[i] = NAN;
-          }
-          i++;
-        }
-      } else if(ty == ARROWDECIMAL) {
-        auto chpl_ptr = (double*)chpl_arr;
-        parquet::FixedLenByteArray value;
-        parquet::FixedLenByteArrayReader* reader =
-          static_cast<parquet::FixedLenByteArrayReader*>(column_reader.get());
-        startIdx -= reader->Skip(startIdx);
-
-        while (reader->HasNext() && i < numElems) {
-          (void)reader->ReadBatch(1, nullptr, nullptr, &value, &values_read);
-          arrow::Decimal128 v;
-          PARQUET_ASSIGN_OR_THROW(v,
-                                  ::arrow::Decimal128::FromBigEndian(value.ptr, byteLength));
-
-          chpl_ptr[i] = v.ToDouble(0);
-          i+=values_read;
-        }
-      }
-    }
-    return 0;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
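[Note, not part of the diff] The batched-read pattern above, reduced to a minimal standalone sketch for one int64 column with no nulls (hypothetical helper; column index 0 assumed). Skip() positions the reader at startIdx, ReadBatch pulls up to batchSize values per call, and the final batch is shrunk so exactly numElems values land in the output:

```cpp
#include <parquet/file_reader.h>
#include <parquet/column_reader.h>
#include <algorithm>
#include <cstdint>

void readInt64Column(const char* path, int64_t startIdx, int64_t numElems,
                     int64_t batchSize, int64_t* out) {
  auto reader = parquet::ParquetFileReader::OpenFile(path, false);
  int64_t i = 0;
  for (int r = 0; r < reader->metadata()->num_row_groups() && i < numElems; r++) {
    auto col = reader->RowGroup(r)->Column(0);
    auto* ir = static_cast<parquet::Int64Reader*>(col.get());
    startIdx -= ir->Skip(startIdx);          // burn off the leading rows once
    int64_t values_read = 0;
    while (ir->HasNext() && i < numElems) {
      int64_t n = std::min(batchSize, numElems - i);  // shrink the last batch
      ir->ReadBatch(n, nullptr, nullptr, &out[i], &values_read);
      i += values_read;
    }
  }
}
```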
-
-// configure the schema for a multicolumn file
-std::shared_ptr<parquet::schema::GroupNode> SetupSchema(void* column_names, void* objTypes, void* datatypes, int64_t colnum) {
-  parquet::schema::NodeVector fields;
-  auto cname_ptr = (char**)column_names;
-  auto dtypes_ptr = (int64_t*) datatypes;
-  auto objType_ptr = (int64_t*) objTypes;
-  for (int64_t i = 0; i < colnum; i++){
-    if(dtypes_ptr[i] == ARROWINT64) {
-      if (objType_ptr[i] == SEGARRAY){
-        auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::NONE);
-        auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-        fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-      } else {
-        fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::NONE));
-      }
-    } else if(dtypes_ptr[i] == ARROWUINT64) {
-      if (objType_ptr[i] == SEGARRAY){
-        auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::UINT_64);
-        auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-        fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-      } else {
-        fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::UINT_64));
-      }
-    } else if(dtypes_ptr[i] == ARROWBOOLEAN) {
-      if (objType_ptr[i] == SEGARRAY){
-        auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE);
-        auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-        fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-      } else {
-        fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE));
-      }
-    } else if(dtypes_ptr[i] == ARROWDOUBLE) {
-      if (objType_ptr[i] == SEGARRAY) {
-        auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::ConvertedType::NONE);
-        auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-        fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-      } else {
-        fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE));
-      }
-    } else if(dtypes_ptr[i] == ARROWSTRING) {
-      if (objType_ptr[i] == SEGARRAY) {
-        auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE);
-        auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-        fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-      } else {
-        fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE));
-      }
-    }
-  }
-  return std::static_pointer_cast<parquet::schema::GroupNode>(
-    parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
-}
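[Note, not part of the diff] The three-level LIST structure SetupSchema builds for SEGARRAY columns, as a minimal sketch for one int64 list column (hypothetical helper). The outer OPTIONAL group admits a null row, the REPEATED "list" group carries the repetition, and the OPTIONAL "item" leaf lets individual items (or empty segments) be encoded as nulls:

```cpp
#include <parquet/schema.h>
#include <string>

parquet::schema::NodePtr makeInt64List(const std::string& name) {
  auto element = parquet::schema::PrimitiveNode::Make(
      "item", parquet::Repetition::OPTIONAL, parquet::Type::INT64,
      parquet::ConvertedType::NONE);
  auto list = parquet::schema::GroupNode::Make(
      "list", parquet::Repetition::REPEATED, {element});
  // optional group <name> (LIST) { repeated group list { optional int64 item } }
  return parquet::schema::GroupNode::Make(
      name, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST);
}
```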
-
-int cpp_writeMultiColToParquet(const char* filename, void* column_names,
-                               void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes,
-                               void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize,
-                               int64_t compression, char** errMsg) {
-  try {
-    // initialize the file to write to
-    using FileClass = ::arrow::io::FileOutputStream;
-    std::shared_ptr<FileClass> out_file;
-    ARROWRESULT_OK(FileClass::Open(filename), out_file);
-
-    // Setup the parquet schema
-    std::shared_ptr<parquet::schema::GroupNode> schema = SetupSchema(column_names, objTypes, datatypes, colnum);
-
-    parquet::WriterProperties::Builder builder;
-    // assign the proper compression
-    if(compression == SNAPPY_COMP) {
-      builder.compression(parquet::Compression::SNAPPY);
-    } else if (compression == GZIP_COMP) {
-      builder.compression(parquet::Compression::GZIP);
-    } else if (compression == BROTLI_COMP) {
-      builder.compression(parquet::Compression::BROTLI);
-    } else if (compression == ZSTD_COMP) {
-      builder.compression(parquet::Compression::ZSTD);
-    } else if (compression == LZ4_COMP) {
-      builder.compression(parquet::Compression::LZ4);
-    }
-    std::shared_ptr<parquet::WriterProperties> props = builder.build();
-
-    std::shared_ptr<parquet::ParquetFileWriter> file_writer =
-      parquet::ParquetFileWriter::Open(out_file, schema, props);
-
-    std::queue<int64_t> idxQueue_str; // queue used to track string byteIdx
-    std::queue<int64_t> idxQueue_segarray; // queue used to track index into the offsets
-
-    auto dtypes_ptr = (int64_t*) datatypes;
-    auto objType_ptr = (int64_t*) objTypes;
-    auto saSizes_ptr = (int64_t*) segArr_sizes;
-    int64_t numLeft = numelems; // number of elements remaining to write (rows)
-    int64_t x = 0; // index to start writing batch from
-    while (numLeft > 0) {
-      // Append a RowGroup with a specific number of rows.
-      parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
-      int64_t batchSize = rowGroupSize;
-      if(numLeft < rowGroupSize)
-        batchSize = numLeft;
-
-      // loop the columns and write the row groups
-      for(int64_t i = 0; i < colnum; i++){
-        int64_t dtype = dtypes_ptr[i];
-        if (dtype == ARROWINT64 || dtype == ARROWUINT64) {
-          auto data_ptr = (int64_t*)ptr_arr[i];
-          parquet::Int64Writer* writer =
-            static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
-
-          if (objType_ptr[i] == SEGARRAY) {
-            auto offset_ptr = (int64_t*)offset_arr[i];
-            int64_t offIdx = 0; // index into offsets
-
-            if (x > 0){
-              offIdx = idxQueue_segarray.front();
-              idxQueue_segarray.pop();
-            }
-
-            int64_t count = 0;
-            while (count < batchSize) { // ensures rowGroupSize maintained
-              int64_t segSize;
-              if (offIdx == (numelems - 1)) {
-                segSize = saSizes_ptr[i] - offset_ptr[offIdx];
-              }
-              else {
-                segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx];
-              }
-              if (segSize > 0) {
-                int16_t* def_lvl = new int16_t[segSize] { 3 };
-                int16_t* rep_lvl = new int16_t[segSize] { 0 };
-                for (int64_t s = 0; s < segSize; s++){
-                  // if the value is first in the segment rep_lvl = 0, otherwise 1
-                  // all values defined at the item level (3)
-                  rep_lvl[s] = (s == 0) ? 0 : 1;
-                  def_lvl[s] = 3;
-                }
-                int64_t valIdx = offset_ptr[offIdx];
-                writer->WriteBatch(segSize, def_lvl, rep_lvl, &data_ptr[valIdx]);
-                delete[] def_lvl;
-                delete[] rep_lvl;
-              }
-              else {
-                // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1)
-                segSize = 1; // even though segment is length=0, write null to hold the empty segment
-                int16_t def_lvl = 1;
-                int16_t rep_lvl = 0;
-                writer->WriteBatch(segSize, &def_lvl, &rep_lvl, nullptr);
-              }
-              offIdx++;
-              count++;
-            }
-            if (numLeft - count > 0) {
-              idxQueue_segarray.push(offIdx);
-            }
-          } else {
-            writer->WriteBatch(batchSize, nullptr, nullptr, &data_ptr[x]);
-          }
-        } else if(dtype == ARROWBOOLEAN) {
-          auto data_ptr = (bool*)ptr_arr[i];
-          parquet::BoolWriter* writer =
-            static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
-          if (objType_ptr[i] == SEGARRAY) {
-            auto offset_ptr = (int64_t*)offset_arr[i];
-            int64_t offIdx = 0; // index into offsets
-
-            if (x > 0){
-              offIdx = idxQueue_segarray.front();
-              idxQueue_segarray.pop();
-            }
-
-            int64_t count = 0;
-            while (count < batchSize) { // ensures rowGroupSize maintained
-              int64_t segSize;
-              if (offIdx == numelems - 1) {
-                segSize = saSizes_ptr[i] - offset_ptr[offIdx];
-              }
-              else {
-                segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx];
-              }
-              if (segSize > 0) {
-                int16_t* def_lvl = new int16_t[segSize] { 3 };
-                int16_t* rep_lvl = new int16_t[segSize] { 0 };
-                for (int64_t s = 0; s < segSize; s++){
-                  // if the value is first in the segment rep_lvl = 0, otherwise 1
-                  // all values defined at the item level (3)
-                  rep_lvl[s] = (s == 0) ? 0 : 1;
-                  def_lvl[s] = 3;
-                }
-                int64_t valIdx = offset_ptr[offIdx];
-                writer->WriteBatch(segSize, def_lvl, rep_lvl, &data_ptr[valIdx]);
-                delete[] def_lvl;
-                delete[] rep_lvl;
-              }
-              else {
-                // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1)
-                segSize = 1; // even though segment is length=0, write null to hold the empty segment
-                int16_t def_lvl = 1;
-                int16_t rep_lvl = 0;
-                writer->WriteBatch(segSize, &def_lvl, &rep_lvl, nullptr);
-              }
-              offIdx++;
-              count++;
-            }
-            if (numLeft - count > 0) {
-              idxQueue_segarray.push(offIdx);
-            }
-          } else {
-            writer->WriteBatch(batchSize, nullptr, nullptr, &data_ptr[x]);
-          }
-        } else if(dtype == ARROWDOUBLE) {
-          auto data_ptr = (double*)ptr_arr[i];
-          parquet::DoubleWriter* writer =
-            static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
-          if (objType_ptr[i] == SEGARRAY) {
-            auto offset_ptr = (int64_t*)offset_arr[i];
-            int64_t offIdx = 0; // index into offsets
-
-            if (x > 0){
-              offIdx = idxQueue_segarray.front();
-              idxQueue_segarray.pop();
-            }
-
-            int64_t count = 0;
-            while (count < batchSize) { // ensures rowGroupSize maintained
-              int64_t segSize;
-              if (offIdx == numelems - 1) {
-                segSize = saSizes_ptr[i] - offset_ptr[offIdx];
-              }
-              else {
-                segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx];
-              }
-              if (segSize > 0) {
-                int16_t* def_lvl = new int16_t[segSize] { 3 };
-                int16_t* rep_lvl = new int16_t[segSize] { 0 };
-                for (int64_t s = 0; s < segSize; s++){
-                  // if the value is first in the segment rep_lvl = 0, otherwise 1
-                  // all values defined at the item level (3)
-                  rep_lvl[s] = (s == 0) ? 0 : 1;
-                  def_lvl[s] = 3;
-                }
-                int64_t valIdx = offset_ptr[offIdx];
-                writer->WriteBatch(segSize, def_lvl, rep_lvl, &data_ptr[valIdx]);
-                delete[] def_lvl;
-                delete[] rep_lvl;
-              }
-              else {
-                // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1)
-                segSize = 1; // even though segment is length=0, write null to hold the empty segment
-                int16_t def_lvl = 1;
-                int16_t rep_lvl = 0;
-                writer->WriteBatch(segSize, &def_lvl, &rep_lvl, nullptr);
-              }
-              offIdx++;
-              count++;
-            }
-            if (numLeft - count > 0) {
-              idxQueue_segarray.push(offIdx);
-            }
-          } else {
-            writer->WriteBatch(batchSize, nullptr, nullptr, &data_ptr[x]);
-          }
-        } else if(dtype == ARROWSTRING) {
-          auto data_ptr = (uint8_t*)ptr_arr[i];
-          parquet::ByteArrayWriter* ba_writer =
-            static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
-          if (objType_ptr[i] == SEGARRAY) {
-            auto offset_ptr = (int64_t*)offset_arr[i];
-            int64_t byteIdx = 0;
-            int64_t offIdx = 0; // index into offsets
-
-            // identify the starting byte index
-            if (x > 0){
-              byteIdx = idxQueue_str.front();
-              idxQueue_str.pop();
-
-              offIdx = idxQueue_segarray.front();
-              idxQueue_segarray.pop();
-            }
-
-            int64_t count = 0;
-            while (count < batchSize) { // ensures rowGroupSize maintained
-              int64_t segSize;
-              if (offIdx == numelems - 1) {
-                segSize = saSizes_ptr[i] - offset_ptr[offIdx];
-              }
-              else {
-                segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx];
-              }
-              if (segSize > 0) {
-                for (int64_t s = 0; s < segSize; s++){
-                  int16_t def_lvl = 3;
-                  int16_t rep_lvl = (s == 0) ? 0 : 1;
-                  parquet::ByteArray value;
-                  value.ptr = reinterpret_cast<const uint8_t*>(&data_ptr[byteIdx]);
-                  int64_t nextIdx = byteIdx;
-                  while (data_ptr[nextIdx] != 0x00){
-                    nextIdx++;
-                  }
-                  value.len = nextIdx - byteIdx;
-                  ba_writer->WriteBatch(1, &def_lvl, &rep_lvl, &value);
-                  byteIdx = nextIdx + 1; // increment to start of next word
-                }
-              }
-              else {
-                // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1)
-                segSize = 1; // even though segment is length=0, write null to hold the empty segment
-                int16_t* def_lvl = new int16_t[segSize] { 1 };
-                int16_t* rep_lvl = new int16_t[segSize] { 0 };
-                ba_writer->WriteBatch(segSize, def_lvl, rep_lvl, nullptr);
-              }
-              offIdx++;
-              count++;
-            }
-            if (numLeft - count > 0) {
-              idxQueue_str.push(byteIdx);
-              idxQueue_segarray.push(offIdx);
-            }
-          }
-          else {
-            int64_t count = 0;
-            int64_t byteIdx = 0;
-
-            // identify the starting byte index
-            if (x > 0){
-              byteIdx = idxQueue_str.front();
-              idxQueue_str.pop();
-            }
-
-            while(count < batchSize) {
-              parquet::ByteArray value;
-              int16_t definition_level = 1;
-              value.ptr = reinterpret_cast<const uint8_t*>(&data_ptr[byteIdx]);
-              int64_t nextIdx = byteIdx;
-              while (data_ptr[nextIdx] != 0x00){
-                nextIdx++;
-              }
-              // subtract 1 since we have the null terminator
-              value.len = nextIdx - byteIdx;
-              ba_writer->WriteBatch(1, &definition_level, nullptr, &value);
-              count++;
-              byteIdx = nextIdx + 1;
-            }
-            if (numLeft - count > 0) {
-              idxQueue_str.push(byteIdx);
-            }
-          }
-        } else {
-          return ARROWERROR;
-        }
-      }
-      numLeft -= batchSize;
-      x += batchSize;
-    }
-
-    file_writer->Close();
-    ARROWSTATUS_OK(out_file->Close());
-
-    return 0;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
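[Note, not part of the diff] The queue-based carry-over above, reduced to a sketch (hypothetical helper): when a row group fills before a column's data runs out, the column parks its resume index in a queue and pops it when it comes around again in the next row group.

```cpp
#include <cstdint>
#include <queue>

void chunkedWrite(int64_t numelems, int64_t rowGroupSize) {
  std::queue<int64_t> idxQueue;            // per-column resume points
  int64_t numLeft = numelems, x = 0;
  while (numLeft > 0) {
    int64_t batch = numLeft < rowGroupSize ? numLeft : rowGroupSize;
    int64_t idx = 0;
    if (x > 0) { idx = idxQueue.front(); idxQueue.pop(); }  // resume
    idx += batch;                          // stand-in for writing `batch` rows
    if (numLeft - batch > 0) idxQueue.push(idx);            // park for next group
    numLeft -= batch;
    x += batch;
  }
}
```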
-
-int cpp_writeColumnToParquet(const char* filename, void* chpl_arr,
-                             int64_t colnum, const char* dsetname, int64_t numelems,
-                             int64_t rowGroupSize, int64_t dtype, int64_t compression,
-                             char** errMsg) {
-  try {
-    using FileClass = ::arrow::io::FileOutputStream;
-    std::shared_ptr<FileClass> out_file;
-    ARROWRESULT_OK(FileClass::Open(filename), out_file);
-
-    parquet::schema::NodeVector fields;
-    if(dtype == ARROWINT64)
-      fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::NONE));
-    else if(dtype == ARROWUINT64)
-      fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::UINT_64));
-    else if(dtype == ARROWBOOLEAN)
-      fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE));
-    else if(dtype == ARROWDOUBLE)
-      fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE));
-    std::shared_ptr<parquet::schema::GroupNode> schema = std::static_pointer_cast<parquet::schema::GroupNode>
-      (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
-
-    parquet::WriterProperties::Builder builder;
-    // assign the proper compression
-    if(compression == SNAPPY_COMP) {
-      builder.compression(parquet::Compression::SNAPPY);
-    } else if (compression == GZIP_COMP) {
-      builder.compression(parquet::Compression::GZIP);
-    } else if (compression == BROTLI_COMP) {
-      builder.compression(parquet::Compression::BROTLI);
-    } else if (compression == ZSTD_COMP) {
-      builder.compression(parquet::Compression::ZSTD);
-    } else if (compression == LZ4_COMP) {
-      builder.compression(parquet::Compression::LZ4);
-    }
-    std::shared_ptr<parquet::WriterProperties> props = builder.build();
-
-    std::shared_ptr<parquet::ParquetFileWriter> file_writer =
-      parquet::ParquetFileWriter::Open(out_file, schema, props);
-
-    int64_t i = 0;
-    int64_t numLeft = numelems;
-
-    if (chpl_arr == NULL) {
-      // early out to prevent bad memory access
-      return 0;
-    }
-
-    if(dtype == ARROWINT64 || dtype == ARROWUINT64) {
-      auto chpl_ptr = (int64_t*)chpl_arr;
-      while(numLeft > 0) {
-        parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
-        parquet::Int64Writer* int64_writer =
-          static_cast<parquet::Int64Writer*>(rg_writer->NextColumn());
-
-        int64_t batchSize = rowGroupSize;
-        if(numLeft < rowGroupSize)
-          batchSize = numLeft;
-        int64_writer->WriteBatch(batchSize, nullptr, nullptr, &chpl_ptr[i]);
-        numLeft -= batchSize;
-        i += batchSize;
-      }
-    } else if(dtype == ARROWBOOLEAN) {
-      auto chpl_ptr = (bool*)chpl_arr;
-      while(numLeft > 0) {
-        parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
-        parquet::BoolWriter* writer =
-          static_cast<parquet::BoolWriter*>(rg_writer->NextColumn());
-
-        int64_t batchSize = rowGroupSize;
-        if(numLeft < rowGroupSize)
-          batchSize = numLeft;
-        writer->WriteBatch(batchSize, nullptr, nullptr, &chpl_ptr[i]);
-        numLeft -= batchSize;
-        i += batchSize;
-      }
-    } else if(dtype == ARROWDOUBLE) {
-      auto chpl_ptr = (double*)chpl_arr;
-      while(numLeft > 0) {
-        parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
-        parquet::DoubleWriter* writer =
-          static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn());
-
-        int64_t batchSize = rowGroupSize;
-        if(numLeft < rowGroupSize)
-          batchSize = numLeft;
-        writer->WriteBatch(batchSize, nullptr, nullptr, &chpl_ptr[i]);
-        numLeft -= batchSize;
-        i += batchSize;
-      }
-    } else {
-      return ARROWERROR;
-    }
-
-    file_writer->Close();
-    ARROWSTATUS_OK(out_file->Close());
-
-    return 0;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_writeStrColumnToParquet(const char* filename, void* chpl_arr, void* chpl_offsets,
-                                const char* dsetname, int64_t numelems,
-                                int64_t rowGroupSize, int64_t dtype, int64_t compression,
-                                char** errMsg) {
-  try {
-    using FileClass = ::arrow::io::FileOutputStream;
-    std::shared_ptr<FileClass> out_file;
-    PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename));
-
-    parquet::schema::NodeVector fields;
-
-    fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE));
-    std::shared_ptr<parquet::schema::GroupNode> schema = std::static_pointer_cast<parquet::schema::GroupNode>
-      (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
-
-    parquet::WriterProperties::Builder builder;
-    // assign the proper compression
-    if(compression == SNAPPY_COMP) {
-      builder.compression(parquet::Compression::SNAPPY);
-    } else if (compression == GZIP_COMP) {
-      builder.compression(parquet::Compression::GZIP);
-    } else if (compression == BROTLI_COMP) {
-      builder.compression(parquet::Compression::BROTLI);
-    } else if (compression == ZSTD_COMP) {
-      builder.compression(parquet::Compression::ZSTD);
-    } else if (compression == LZ4_COMP) {
-      builder.compression(parquet::Compression::LZ4);
-    }
-    std::shared_ptr<parquet::WriterProperties> props = builder.build();
-
-    std::shared_ptr<parquet::ParquetFileWriter> file_writer =
-      parquet::ParquetFileWriter::Open(out_file, schema, props);
-
-    int64_t i = 0;
-    int64_t numLeft = numelems;
-
-    if(dtype == ARROWSTRING) {
-      auto chpl_ptr = (uint8_t*)chpl_arr;
-      auto offsets = (int64_t*)chpl_offsets;
-      int64_t byteIdx = 0;
-      int64_t offIdx = 0;
-
-      while(numLeft > 0) {
-        parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
-        parquet::ByteArrayWriter* ba_writer =
-          static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
-        int64_t count = 0;
-        while(numLeft > 0 && count < rowGroupSize) {
-          parquet::ByteArray value;
-          int16_t definition_level = 1;
-          value.ptr = reinterpret_cast<const uint8_t*>(&chpl_ptr[byteIdx]);
-          // subtract 1 since we have the null terminator
-          value.len = offsets[offIdx+1] - offsets[offIdx] - 1;
-          if (value.len == 0)
-            definition_level = 0;
-          ba_writer->WriteBatch(1, &definition_level, nullptr, &value);
-          numLeft--;count++;
-          offIdx++;
-          byteIdx+=offsets[offIdx] - offsets[offIdx-1];
-        }
-      }
-    } else {
-      return ARROWERROR;
-    }
-
-    file_writer->Close();
-    ARROWSTATUS_OK(out_file->Close());
-
-    return 0;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
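[Note, not part of the diff] The offset-to-ByteArray conversion used above, as a minimal sketch (hypothetical helper): Arkouda hands the writer one packed buffer of null-terminated strings plus cumulative offsets, each ByteArray points into that buffer with len equal to the offset delta minus the terminator, and an empty string is encoded as null by dropping the definition level to 0.

```cpp
#include <cstdint>
#include <parquet/types.h>

parquet::ByteArray nthString(const uint8_t* buf, const int64_t* offsets,
                             int64_t n, int16_t& def_lvl) {
  parquet::ByteArray value;
  value.ptr = buf + offsets[n];
  // subtract 1: the stored segment includes its null terminator
  value.len = static_cast<uint32_t>(offsets[n + 1] - offsets[n] - 1);
  def_lvl = (value.len == 0) ? 0 : 1;   // 0 encodes the empty string as null
  return value;
}
```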
-
-int cpp_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, void* chpl_arr,
-                                    const char* dsetname, int64_t numelems,
-                                    int64_t rowGroupSize, int64_t dtype, int64_t compression,
-                                    char** errMsg) {
-  try {
-    if(dtype == ARROWSTRING) { // check the type here so if it is wrong we don't create a bad file
-      using FileClass = ::arrow::io::FileOutputStream;
-      std::shared_ptr<FileClass> out_file;
-      PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename));
-
-      parquet::schema::NodeVector fields;
-
-      auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE);
-      auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-      fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-      std::shared_ptr<parquet::schema::GroupNode> schema = std::static_pointer_cast<parquet::schema::GroupNode>
-        (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
-
-      parquet::WriterProperties::Builder builder;
-      // assign the proper compression
-      if(compression == SNAPPY_COMP) {
-        builder.compression(parquet::Compression::SNAPPY);
-      } else if (compression == GZIP_COMP) {
-        builder.compression(parquet::Compression::GZIP);
-      } else if (compression == BROTLI_COMP) {
-        builder.compression(parquet::Compression::BROTLI);
-      } else if (compression == ZSTD_COMP) {
-        builder.compression(parquet::Compression::ZSTD);
-      } else if (compression == LZ4_COMP) {
-        builder.compression(parquet::Compression::LZ4);
-      }
-      std::shared_ptr<parquet::WriterProperties> props = builder.build();
-
-      std::shared_ptr<parquet::ParquetFileWriter> file_writer =
-        parquet::ParquetFileWriter::Open(out_file, schema, props);
-
-      int64_t i = 0;
-      int64_t numLeft = numelems;
-      auto segments = (int64_t*)chpl_segs;
-      int64_t segIdx = 0; // index into segarray segments
-      int64_t offIdx = 0; // index into the segstring segments
-      int64_t valIdx = 0; // index into chpl_arr
-
-      while(numLeft > 0) { // write all local values to the file
-        parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup();
-        parquet::ByteArrayWriter* ba_writer =
-          static_cast<parquet::ByteArrayWriter*>(rg_writer->NextColumn());
-        int64_t count = 0;
-        while (numLeft > 0 && count < rowGroupSize) { // ensures rowGroupSize maintained
-          int64_t segmentLength = segments[segIdx+1] - segments[segIdx];
-          if (segmentLength > 0) {
-            auto offsets = (int64_t*)chpl_offsets;
-            auto chpl_ptr = (uint8_t*)chpl_arr;
-            for (int64_t x = 0; x < segmentLength; x++){
-              int16_t rep_lvl = (x == 0) ? 0 : 1;
-              int16_t def_lvl = 3;
-              parquet::ByteArray value;
-              value.ptr = reinterpret_cast<const uint8_t*>(&chpl_ptr[valIdx]);
-              value.len = offsets[offIdx+1] - offsets[offIdx] - 1;
-              ba_writer->WriteBatch(1, &def_lvl, &rep_lvl, &value);
-              offIdx++;
-              valIdx+=offsets[offIdx] - offsets[offIdx-1];
-            }
-          } else {
-            // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1)
-            segmentLength = 1; // even though segment is length=0, write null to hold the empty segment
-            int16_t def_lvl = 1;
-            int16_t rep_lvl = 0;
-            ba_writer->WriteBatch(segmentLength, &def_lvl, &rep_lvl, nullptr);
-          }
-          segIdx++;
-          numLeft--;count++;
-        }
-      }
-
-      file_writer->Close();
-      ARROWSTATUS_OK(out_file->Close());
-      return 0;
-    } else {
-      return ARROWERROR;
-    }
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_writeListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_arr,
-                                 const char* dsetname, int64_t numelems,
-                                 int64_t rowGroupSize, int64_t dtype, int64_t compression,
-                                 char** errMsg) {
-  try {
-    using FileClass = ::arrow::io::FileOutputStream;
-    std::shared_ptr<FileClass> out_file;
-    PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename));
-
-    parquet::schema::NodeVector fields;
-
-    // create the list schema. List containing the dtype
-    if (dtype == ARROWINT64) {
-      auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::NONE);
-      auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-      fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-    }
-    else if (dtype == ARROWUINT64) {
-      auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::UINT_64);
-      auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-      fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-    }
-    else if (dtype == ARROWBOOLEAN) {
-      auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE);
-      auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-      fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-    }
-    else if (dtype == ARROWDOUBLE) {
-      auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::ConvertedType::NONE);
-      auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element});
-      fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST));
-    }
-    std::shared_ptr<parquet::schema::GroupNode> schema = std::static_pointer_cast<parquet::schema::GroupNode>
-      (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields));
-
-    parquet::WriterProperties::Builder builder;
-    // assign the proper compression
-    if(compression == SNAPPY_COMP) {
-      builder.compression(parquet::Compression::SNAPPY);
-    } else if (compression == GZIP_COMP) {
-      builder.compression(parquet::Compression::GZIP);
-    } else if (compression == BROTLI_COMP) {
-      builder.compression(parquet::Compression::BROTLI);
-    } else if (compression == ZSTD_COMP) {
builder.compression(parquet::Compression::ZSTD); - } else if (compression == LZ4_COMP) { - builder.compression(parquet::Compression::LZ4); - } - std::shared_ptr props = builder.build(); - - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema, props); - - int64_t i = 0; - int64_t numLeft = numelems; - auto segments = (int64_t*)chpl_segs; - int64_t valIdx = 0; // index into chpl_arr - int64_t segIdx = 0; // index into offsets - - if(dtype == ARROWINT64 || dtype == ARROWUINT64) { - while(numLeft > 0) { // write all local values to the file - parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); - parquet::Int64Writer* writer = - static_cast(rg_writer->NextColumn()); - int64_t count = 0; - while (numLeft > 0 && count < rowGroupSize) { // ensures rowGroupSize maintained - int64_t batchSize = segments[segIdx+1] - segments[segIdx]; - if (batchSize > 0) { - auto chpl_ptr = (int64_t*)chpl_arr; - int16_t* def_lvl = new int16_t[batchSize] { 3 }; // all values defined at the item level (3) - int16_t* rep_lvl = new int16_t[batchSize] { 0 }; - for (int64_t x = 0; x < batchSize; x++){ - // if the value is first in the segment rep_lvl = 0, otherwise 1 - rep_lvl[x] = (x == 0) ? 0 : 1; - def_lvl[x] = 3; - } - writer->WriteBatch(batchSize, def_lvl, rep_lvl, &chpl_ptr[valIdx]); - valIdx += batchSize; - delete[] def_lvl; - delete[] rep_lvl; - } - else { - // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) - batchSize = 1; // even though segment is length=0, write null to hold the empty segment - int16_t def_lvl = 1; - int16_t rep_lvl = 0; - writer->WriteBatch(batchSize, &def_lvl, &rep_lvl, nullptr); - } - count++; - segIdx++; - numLeft--; - } - } - } - else if (dtype == ARROWBOOLEAN) { - while(numLeft > 0) { - parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); - parquet::BoolWriter* writer = - static_cast(rg_writer->NextColumn()); - int64_t count = 0; - while (numLeft > 0 && count < rowGroupSize) { - int64_t batchSize = segments[segIdx+1] - segments[segIdx]; - if (batchSize > 0) { - auto chpl_ptr = (bool*)chpl_arr; - // if the value is first in the segment rep_lvl = 0, otherwise 1 - // all values defined at the item level (3) - int16_t* def_lvl = new int16_t[batchSize] { 3 }; - int16_t* rep_lvl = new int16_t[batchSize] { 0 }; - for (int64_t x = 0; x < batchSize; x++){ - rep_lvl[x] = (x == 0) ? 
0 : 1; - def_lvl[x] = 3; - } - writer->WriteBatch(batchSize, def_lvl, rep_lvl, &chpl_ptr[valIdx]); - valIdx += batchSize; - delete[] def_lvl; - delete[] rep_lvl; - } - else { - // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) - batchSize = 1; // even though segment is length=0, write null to hold the empty segment - int16_t def_lvl = 1; - int16_t rep_lvl = 0; - writer->WriteBatch(batchSize, &def_lvl, &rep_lvl, nullptr); - } - count++; - segIdx++; - numLeft--; - } - } - } - else if (dtype == ARROWDOUBLE) { - while(numLeft > 0) { - parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); - parquet::DoubleWriter* writer = - static_cast(rg_writer->NextColumn()); - int64_t count = 0; - while (numLeft > 0 && count < rowGroupSize) { - int64_t batchSize = segments[segIdx+1] - segments[segIdx]; - if (batchSize > 0) { - auto chpl_ptr = (double*)chpl_arr; - // if the value is first in the segment rep_lvl = 0, otherwise 1 - // all values defined at the item level (3) - int16_t* def_lvl = new int16_t[batchSize] { 3 }; - int16_t* rep_lvl = new int16_t[batchSize] { 0 }; - for (int64_t x = 0; x < batchSize; x++){ - rep_lvl[x] = (x == 0) ? 0 : 1; - def_lvl[x] = 3; - } - writer->WriteBatch(batchSize, def_lvl, rep_lvl, &chpl_ptr[valIdx]); - valIdx += batchSize; - delete[] def_lvl; - delete[] rep_lvl; - } - else { - // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) - batchSize = 1; // even though segment is length=0, write null to hold the empty segment - int16_t def_lvl = 1; - int16_t rep_lvl = 0; - writer->WriteBatch(batchSize, &def_lvl, &rep_lvl, nullptr); - } - count++; - segIdx++; - numLeft--; - } - } - } - else { - return ARROWERROR; - } - - file_writer->Close(); - ARROWSTATUS_OK(out_file->Close()); - - return 0; - } catch (const std::exception& e) { - *errMsg = strdup(e.what()); - return ARROWERROR; - } -} - -int cpp_createEmptyListParquetFile(const char* filename, const char* dsetname, int64_t dtype, - int64_t compression, char** errMsg) { - try { - using FileClass = ::arrow::io::FileOutputStream; - std::shared_ptr out_file; - PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename)); - - parquet::schema::NodeVector fields; - if (dtype == ARROWINT64) { - auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::NONE); - auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); - fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); - } - else if (dtype == ARROWUINT64) { - auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::UINT_64); - auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); - fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); - } - else if (dtype == ARROWBOOLEAN) { - auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE); - auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); - fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); - } - else if (dtype == 
ARROWDOUBLE) { - auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::ConvertedType::NONE); - auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); - fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); - } - std::shared_ptr schema = std::static_pointer_cast - (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); - - parquet::WriterProperties::Builder builder; - // assign the proper compression - if(compression == SNAPPY_COMP) { - builder.compression(parquet::Compression::SNAPPY); - } else if (compression == GZIP_COMP) { - builder.compression(parquet::Compression::GZIP); - } else if (compression == BROTLI_COMP) { - builder.compression(parquet::Compression::BROTLI); - } else if (compression == ZSTD_COMP) { - builder.compression(parquet::Compression::ZSTD); - } else if (compression == LZ4_COMP) { - builder.compression(parquet::Compression::LZ4); - } - std::shared_ptr props = builder.build(); - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema, props); - - file_writer->Close(); - ARROWSTATUS_OK(out_file->Close()); - - return 0; - } catch (const std::exception& e) { - *errMsg = strdup(e.what()); - return ARROWERROR; - } -} - -int cpp_createEmptyParquetFile(const char* filename, const char* dsetname, int64_t dtype, - int64_t compression, char** errMsg) { - try { - using FileClass = ::arrow::io::FileOutputStream; - std::shared_ptr out_file; - PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename)); - - parquet::schema::NodeVector fields; - if(dtype == ARROWINT64) - fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::NONE)); - else if(dtype == ARROWUINT64) - fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::UINT_64)); - else if(dtype == ARROWBOOLEAN) - fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE)); - else if(dtype == ARROWDOUBLE) - fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE)); - else if(dtype == ARROWSTRING) - fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE)); - std::shared_ptr schema = std::static_pointer_cast - (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); - - parquet::WriterProperties::Builder builder; - // assign the proper compression - if(compression == SNAPPY_COMP) { - builder.compression(parquet::Compression::SNAPPY); - } else if (compression == GZIP_COMP) { - builder.compression(parquet::Compression::GZIP); - } else if (compression == BROTLI_COMP) { - builder.compression(parquet::Compression::BROTLI); - } else if (compression == ZSTD_COMP) { - builder.compression(parquet::Compression::ZSTD); - } else if (compression == LZ4_COMP) { - builder.compression(parquet::Compression::LZ4); - } - std::shared_ptr props = builder.build(); - std::shared_ptr file_writer = - parquet::ParquetFileWriter::Open(out_file, schema, props); - - file_writer->Close(); - ARROWSTATUS_OK(out_file->Close()); - - return 0; - } catch (const 
std::exception& e) { - *errMsg = strdup(e.what()); - return ARROWERROR; - } -} - -int cpp_appendColumnToParquet(const char* filename, void* chpl_arr, - const char* dsetname, int64_t numelems, - int64_t dtype, int64_t compression, - char** errMsg) { - try { - if (chpl_arr == NULL){ - // early out to prevent bad memory access - return 0; - } - std::shared_ptr infile; - ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), - infile); - std::unique_ptr reader; - ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); - // Use threads for case when reading a table with many columns - reader->set_use_threads(true); - - std::shared_ptr table; - std::shared_ptr* hold_table = &table; - ARROWSTATUS_OK(reader->ReadTable(hold_table)); - - arrow::ArrayVector arrays; - std::shared_ptr values; - auto chunk_type = arrow::int64(); - if(dtype == ARROWINT64) { - chunk_type = arrow::int64(); - arrow::Int64Builder builder; - auto chpl_ptr = (int64_t*)chpl_arr; - ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr)) - ARROWSTATUS_OK(builder.Finish(&values)); - } else if(dtype == ARROWUINT64) { - chunk_type = arrow::uint64(); - arrow::UInt64Builder builder; - auto chpl_ptr = (uint64_t*)chpl_arr; - ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr)) - ARROWSTATUS_OK(builder.Finish(&values)); - } else if(dtype == ARROWBOOLEAN) { - chunk_type = arrow::boolean(); - arrow::BooleanBuilder builder; - auto chpl_ptr = (uint8_t*)chpl_arr; - ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr)) - ARROWSTATUS_OK(builder.Finish(&values)); - } else if(dtype == ARROWSTRING) { - chunk_type = arrow::utf8(); - arrow::StringBuilder builder; - auto chpl_ptr = (uint8_t*)chpl_arr; - int64_t j = 0; - for(int64_t i = 0; i < numelems; i++) { - std::string tmp_str = ""; - while(chpl_ptr[j] != 0x00) { - tmp_str += chpl_ptr[j++]; - } - j++; - - auto const status = builder.Append(tmp_str); - if (status.IsCapacityError()) { - // Reached current chunk's capacity limit, so start a new one... - ARROWSTATUS_OK(builder.Finish(&values)); - arrays.push_back(values); - values.reset(); - builder.Reset(); - - // ...with this string as its first item. 
-          ARROWSTATUS_OK(builder.Append(tmp_str));
-        } else {
-          ARROWSTATUS_OK(status);
-        }
-      }
-      ARROWSTATUS_OK(builder.Finish(&values));
-    } else if(dtype == ARROWDOUBLE) {
-      chunk_type = arrow::float64();
-      arrow::DoubleBuilder builder;
-      auto chpl_ptr = (double*)chpl_arr;
-      ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr))
-      ARROWSTATUS_OK(builder.Finish(&values));
-    } else {
-      std::string msg = "Unrecognized Parquet dtype";
-      *errMsg = strdup(msg.c_str());
-      return ARROWERROR;
-    }
-    arrays.push_back(values);
-
-    std::shared_ptr<arrow::ChunkedArray> chunk_sh_ptr;
-    ARROWRESULT_OK(arrow::ChunkedArray::Make({arrays}, chunk_type), chunk_sh_ptr);
-
-    auto newField = arrow::field(dsetname, chunk_type);
-    std::shared_ptr<arrow::Table> fin_table;
-    ARROWRESULT_OK(table -> AddColumn(0, newField, chunk_sh_ptr), fin_table);
-
-    using FileClass = ::arrow::io::FileOutputStream;
-    std::shared_ptr<FileClass> out_file;
-    ARROWRESULT_OK(FileClass::Open(filename), out_file);
-    ARROWSTATUS_OK(parquet::arrow::WriteTable(*fin_table, arrow::default_memory_pool(), out_file, numelems));
-
-    return 0;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-const char* cpp_getVersionInfo(void) {
-  return strdup(arrow::GetBuildInfo().version_string.c_str());
-}
-
-int cpp_getDatasetNames(const char* filename, char** dsetResult, bool readNested, char** errMsg) {
-  try {
-    std::shared_ptr<arrow::io::ReadableFile> infile;
-    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
-                   infile);
-    std::unique_ptr<parquet::arrow::FileReader> reader;
-    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
-
-    std::shared_ptr<arrow::Schema> sc;
-    std::shared_ptr<arrow::Schema>* out = &sc;
-    ARROWSTATUS_OK(reader->GetSchema(out));
-
-    std::string fields = "";
-    bool first = true;
-
-    for(int i = 0; i < sc->num_fields(); i++) {
-      // only add fields of supported types
-      if(sc->field(i)->type()->id() == arrow::Type::INT64 ||
-         sc->field(i)->type()->id() == arrow::Type::INT32 ||
-         sc->field(i)->type()->id() == arrow::Type::INT16 ||
-         sc->field(i)->type()->id() == arrow::Type::UINT64 ||
-         sc->field(i)->type()->id() == arrow::Type::UINT32 ||
-         sc->field(i)->type()->id() == arrow::Type::UINT16 ||
-         sc->field(i)->type()->id() == arrow::Type::TIMESTAMP ||
-         sc->field(i)->type()->id() == arrow::Type::BOOL ||
-         sc->field(i)->type()->id() == arrow::Type::STRING ||
-         sc->field(i)->type()->id() == arrow::Type::BINARY ||
-         sc->field(i)->type()->id() == arrow::Type::FLOAT ||
-         sc->field(i)->type()->id() == arrow::Type::DOUBLE ||
-         (sc->field(i)->type()->id() == arrow::Type::LIST && readNested) ||
-         sc->field(i)->type()->id() == arrow::Type::DECIMAL ||
-         sc->field(i)->type()->id() == arrow::Type::LARGE_STRING
-         ) {
-        if(!first)
-          fields += ("," + sc->field(i)->name());
-        else
-          fields += (sc->field(i)->name());
-        first = false;
-      } else if (sc->field(i)->type()->id() == arrow::Type::LIST && !readNested) {
-        continue;
-      } else {
-        std::string fname(filename);
-        std::string dname(sc->field(i)->ToString());
-        std::string msg = "Unsupported type on column: " + dname + " in " + filename;
-        *errMsg = strdup(msg.c_str());
-        return ARROWERROR;
-      }
-    }
-    *dsetResult = strdup(fields.c_str());
-
-    return 0;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-void cpp_free_string(void* ptr) {
-  free(ptr);
-}
-
-void cpp_openFile(const char* filename, int64_t idx) {
-  std::shared_ptr<parquet::ParquetFileReader> parquet_reader =
-    parquet::ParquetFileReader::OpenFile(filename, false);
-  globalFiles[idx] = parquet_reader;
-}
-
-void cpp_createRowGroupReader(int64_t rowGroup, int64_t readerIdx) {
-  std::shared_ptr<parquet::RowGroupReader> row_group_reader =
-    globalFiles[readerIdx]->RowGroup(rowGroup);
-  globalRowGroupReaders[readerIdx] = row_group_reader;
-}
-
-void cpp_createColumnReader(const char* colname, int64_t readerIdx) {
-
-  std::shared_ptr<parquet::FileMetaData> file_metadata = globalFiles[readerIdx]->metadata();
-  auto idx = file_metadata -> schema() -> ColumnIndex(colname);
-
-  std::shared_ptr<parquet::ColumnReader> column_reader;
-  column_reader = globalRowGroupReaders[readerIdx]->Column(idx);
-  globalColumnReaders[readerIdx] = column_reader;
-}
-
-int cpp_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems,
-                                int64_t readerIdx, int64_t* numRead,
-                                void** outData, bool* containsNulls, char** errMsg) {
-  try {
-    auto reader = static_cast<parquet::ByteArrayReader*>(globalColumnReaders[readerIdx].get());
-    parquet::ByteArray* string_values =
-      (parquet::ByteArray*)malloc(numElems*sizeof(parquet::ByteArray));
-    std::vector<int16_t> definition_level(batchSize);
-    int64_t values_read = 0;
-    int64_t total_read = 0;
-    while(reader->HasNext() && total_read < numElems) {
-      if((numElems - total_read) < batchSize)
-        batchSize = numElems - total_read;
-      // a definition level of 0 indicates a null value
-      (void)reader->ReadBatch(batchSize, definition_level.data(), nullptr, string_values + total_read, &values_read);
-      for(int i = 0; i < values_read; i++) {
-        if(definition_level[i] == 0)
-          *containsNulls = true;
-      }
-      total_read += values_read;
-    }
-    *numRead = total_read;
-    *outData = (void*)string_values;
-    return 0;
-  } catch (const std::exception& e) {
-    *errMsg = strdup(e.what());
-    return ARROWERROR;
-  }
-}
-
-int cpp_getNumRowGroups(int64_t readerIdx) {
-  std::shared_ptr<parquet::FileMetaData> file_metadata = globalFiles[readerIdx]->metadata();
-  return file_metadata->num_row_groups();
-}
-
-void cpp_freeMapValues(void* row) {
-  parquet::ByteArray* string_values =
-    static_cast<parquet::ByteArray*>(row);
-  free(string_values);
-  globalColumnReaders.clear();
-  globalRowGroupReaders.clear();
-  globalFiles.clear();
-}
-
-/*
-  C functions
-  -----------
-  These C functions provide no functionality of their own; since the C++
-  Arrow library is being used, they merely call the corresponding C++
-  functions so that Chapel can reach them through C interoperability.
-  Each C++ function must have a corresponding C function if it is to be
-  called from Chapel.
-*/ - -extern "C" { - int64_t c_getNumRows(const char* chpl_str, char** errMsg) { - return cpp_getNumRows(chpl_str, errMsg); - } - - int c_readListColumnByName(const char* filename, void* chpl_arr, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) { - return cpp_readListColumnByName(filename, chpl_arr, colname, numElems, startIdx, batchSize, errMsg); - } - - int c_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg) { - return cpp_readColumnByName(filename, chpl_arr, where_null_chpl, colname, numElems, startIdx, batchSize, byteLength, hasNonFloatNulls, errMsg); - } - - int c_getType(const char* filename, const char* colname, char** errMsg) { - return cpp_getType(filename, colname, errMsg); - } - - int c_getListType(const char* filename, const char* colname, char** errMsg) { - return cpp_getListType(filename, colname, errMsg); - } - - int c_writeColumnToParquet(const char* filename, void* chpl_arr, - int64_t colnum, const char* dsetname, int64_t numelems, - int64_t rowGroupSize, int64_t dtype, int64_t compression, - char** errMsg) { - return cpp_writeColumnToParquet(filename, chpl_arr, colnum, dsetname, - numelems, rowGroupSize, dtype, compression, - errMsg); - } - - int c_writeStrColumnToParquet(const char* filename, void* chpl_arr, void* chpl_offsets, - const char* dsetname, int64_t numelems, - int64_t rowGroupSize, int64_t dtype, int64_t compression, - char** errMsg) { - return cpp_writeStrColumnToParquet(filename, chpl_arr, chpl_offsets, - dsetname, numelems, rowGroupSize, dtype, compression, errMsg); - } - - int c_writeListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_arr, - const char* dsetname, int64_t numelems, - int64_t rowGroupSize, int64_t dtype, int64_t compression, - char** errMsg) { - return cpp_writeListColumnToParquet(filename, chpl_segs, chpl_arr, - dsetname, numelems, rowGroupSize, dtype, compression, errMsg); - } - - int c_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, void* chpl_arr, - const char* dsetname, int64_t numelems, - int64_t rowGroupSize, int64_t dtype, int64_t compression, - char** errMsg) { - return cpp_writeStrListColumnToParquet(filename, chpl_segs, chpl_offsets, chpl_arr, - dsetname, numelems, rowGroupSize, dtype, compression, errMsg); - } - - int c_createEmptyParquetFile(const char* filename, const char* dsetname, int64_t dtype, - int64_t compression, char** errMsg) { - return cpp_createEmptyParquetFile(filename, dsetname, dtype, compression, errMsg); - } - - int c_createEmptyListParquetFile(const char* filename, const char* dsetname, int64_t dtype, - int64_t compression, char** errMsg) { - return cpp_createEmptyListParquetFile(filename, dsetname, dtype, compression, errMsg); - } - - int c_appendColumnToParquet(const char* filename, void* chpl_arr, - const char* dsetname, int64_t numelems, - int64_t dtype, int64_t compression, - char** errMsg) { - return cpp_appendColumnToParquet(filename, chpl_arr, - dsetname, numelems, - dtype, compression, - errMsg); - } - - int64_t c_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) { - return cpp_getStringColumnNumBytes(filename, colname, chpl_offsets, numElems, startIdx, batchSize, errMsg); - } - - int64_t c_getListColumnSize(const char* filename, const 
char* colname, void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg) { - return cpp_getListColumnSize(filename, colname, chpl_seg_sizes, numElems, startIdx, errMsg); - } - - int64_t c_getStringColumnNullIndices(const char* filename, const char* colname, void* chpl_nulls, char** errMsg) { - return cpp_getStringColumnNullIndices(filename, colname, chpl_nulls, errMsg); - } - - const char* c_getVersionInfo(void) { - return cpp_getVersionInfo(); - } - - int c_getDatasetNames(const char* filename, char** dsetResult, bool readNested, char** errMsg) { - return cpp_getDatasetNames(filename, dsetResult, readNested, errMsg); - } - - void c_free_string(void* ptr) { - cpp_free_string(ptr); - } - - int c_writeMultiColToParquet(const char* filename, void* column_names, - void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes, - void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize, - int64_t compression, char** errMsg){ - return cpp_writeMultiColToParquet(filename, column_names, ptr_arr, offset_arr, objTypes, datatypes, segArr_sizes, colnum, numelems, rowGroupSize, compression, errMsg); - } - - int c_getPrecision(const char* filename, const char* colname, char** errMsg) { - return cpp_getPrecision(filename, colname, errMsg); - } - - void c_openFile(const char* filename, int64_t idx) { - cpp_openFile(filename, idx); - } - - void c_createRowGroupReader(int64_t rowGroup, int64_t readerIdx) { - return cpp_createRowGroupReader(rowGroup, readerIdx); - } - - void c_createColumnReader(const char* colname, int64_t readerIdx) { - cpp_createColumnReader(colname, readerIdx); - } - - int c_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems, - int64_t readerIdx, int64_t* numRead, - void** outData, bool* containsNulls, char** errMsg) { - return cpp_readParquetColumnChunks(filename, batchSize, numElems, readerIdx, - numRead, outData, containsNulls, errMsg); - } - - int c_getNumRowGroups(int64_t readerIdx) { - return cpp_getNumRowGroups(readerIdx); - } - - void c_freeMapValues(void* row) { - cpp_freeMapValues(row); - } -} diff --git a/src/ParquetMsg.chpl b/src/ParquetMsg.chpl index 25baaebd6c..1c13bfafa3 100644 --- a/src/ParquetMsg.chpl +++ b/src/ParquetMsg.chpl @@ -31,8 +31,12 @@ module ParquetMsg { // Use reflection for error information use Reflection; - require "ArrowFunctions.h"; - require "ArrowFunctions.o"; + require "ReadParquet.h"; + require "ReadParquet.o"; + require "WriteParquet.h"; + require "WriteParquet.o"; + require "UtilParquet.h"; + require "UtilParquet.o"; private config const logLevel = ServerConfig.logLevel; private config const logChannel = ServerConfig.logChannel; diff --git a/src/parquet/ReadParquet.cpp b/src/parquet/ReadParquet.cpp new file mode 100644 index 0000000000..80ca27bd91 --- /dev/null +++ b/src/parquet/ReadParquet.cpp @@ -0,0 +1,638 @@ +#include "ReadParquet.h" +#include "UtilParquet.h" + +int cpp_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg) { + try { + int64_t ty = cpp_getType(filename, colname, errMsg); + + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(filename, false); + + std::shared_ptr file_metadata = parquet_reader->metadata(); + int num_row_groups = file_metadata->num_row_groups(); + + int64_t i = 0; + for (int r = 0; r < num_row_groups; r++) { + std::shared_ptr row_group_reader = + 
parquet_reader->RowGroup(r); + + int64_t values_read = 0; + + std::shared_ptr column_reader; + + auto idx = file_metadata -> schema() -> ColumnIndex(colname); + auto max_def = file_metadata -> schema() -> Column(idx) -> max_definition_level(); // needed to determine if nulls are allowed + + if(idx < 0) { + std::string dname(colname); + std::string fname(filename); + std::string msg = "Dataset: " + dname + " does not exist in file: " + fname; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + + column_reader = row_group_reader->Column(idx); + + // Since int64 and uint64 Arrow dtypes share a physical type and only differ + // in logical type, they must be read from the file in the same way + if(ty == ARROWINT64 || ty == ARROWUINT64) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (int64_t*)chpl_arr; + parquet::Int64Reader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + if (not hasNonFloatNulls) { + while (reader->HasNext() && i < numElems) { + if((numElems - i) < batchSize) // adjust batchSize if needed + batchSize = numElems - i; + (void)reader->ReadBatch(batchSize, nullptr, nullptr, &chpl_ptr[i], &values_read); + i+=values_read; + } + } + else { + while (reader->HasNext() && i < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[i], &values_read); + // if values_read is 0, that means that it was a null value + if(values_read == 0) { + where_null_chpl[i] = true; + } + i++; + } + } + } else if(ty == ARROWINT32 || ty == ARROWUINT32) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (int64_t*)chpl_arr; + parquet::Int32Reader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + if (not hasNonFloatNulls) { + int32_t* tmpArr = (int32_t*)malloc(batchSize * sizeof(int32_t)); + while (reader->HasNext() && i < numElems) { + if((numElems - i) < batchSize) // adjust batchSize if needed + batchSize = numElems - i; + + // Can't read directly into chpl_ptr because it is int64 + (void)reader->ReadBatch(batchSize, nullptr, nullptr, tmpArr, &values_read); + for (int64_t j = 0; j < values_read; j++) + chpl_ptr[i+j] = (int64_t)tmpArr[j]; + i+=values_read; + } + free(tmpArr); + } + else { + int32_t tmp; + while (reader->HasNext() && i < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &tmp, &values_read); + // if values_read is 0, that means that it was a null value + if(values_read == 0) { + where_null_chpl[i] = true; + } + else { + chpl_ptr[i] = (int64_t)tmp; + } + i++; + } + } + } else if(ty == ARROWBOOLEAN) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (bool*)chpl_arr; + parquet::BoolReader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + if (not hasNonFloatNulls) { + while (reader->HasNext() && i < numElems) { + if((numElems - i) < batchSize) // adjust batchSize if needed + batchSize = numElems - i; + (void)reader->ReadBatch(batchSize, nullptr, nullptr, &chpl_ptr[i], &values_read); + i+=values_read; + } + } + else { + while (reader->HasNext() && i < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[i], &values_read); + // if values_read is 0, that means that it was a null value + if(values_read == 0) { + where_null_chpl[i] = true; + } + i++; + } + } + } else if(ty == ARROWSTRING) { + int16_t definition_level; // nullable type and only 
reading single records in batch + auto chpl_ptr = (unsigned char*)chpl_arr; + parquet::ByteArrayReader* reader = + static_cast(column_reader.get()); + + while (reader->HasNext()) { + parquet::ByteArray value; + (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // if values_read is 0, that means that it was a null value + if(values_read > 0) { + for(int j = 0; j < value.len; j++) { + chpl_ptr[i] = value.ptr[j]; + i++; + } + } + i++; // skip one space so the strings are null terminated with a 0 + } + } else if(ty == ARROWFLOAT) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (double*)chpl_arr; + parquet::FloatReader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + while (reader->HasNext() && i < numElems) { + float value; + (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // if values_read is 0, that means that it was a null value + if(values_read > 0) { + // this means it wasn't null + chpl_ptr[i] = (double) value; + } else { + chpl_ptr[i] = NAN; + } + i++; + } + } else if(ty == ARROWDOUBLE) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (double*)chpl_arr; + parquet::DoubleReader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + while (reader->HasNext() && i < numElems) { + double value; + (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // if values_read is 0, that means that it was a null value + if(values_read > 0) { + // this means it wasn't null + chpl_ptr[i] = value; + } else { + chpl_ptr[i] = NAN; + } + i++; + } + } else if(ty == ARROWDECIMAL) { + auto chpl_ptr = (double*)chpl_arr; + parquet::FixedLenByteArray value; + parquet::FixedLenByteArrayReader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + while (reader->HasNext() && i < numElems) { + (void)reader->ReadBatch(1, nullptr, nullptr, &value, &values_read); + arrow::Decimal128 v; + PARQUET_ASSIGN_OR_THROW(v, + ::arrow::Decimal128::FromBigEndian(value.ptr, byteLength)); + + chpl_ptr[i] = v.ToDouble(0); + i+=values_read; + } + } + } + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int cpp_readListColumnByName(const char* filename, void* chpl_arr, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) { + try { + int64_t ty = cpp_getType(filename, colname, errMsg); + if (ty == ARROWLIST){ + int64_t lty = cpp_getListType(filename, colname, errMsg); + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(filename, false); + + std::shared_ptr file_metadata = parquet_reader->metadata(); + int num_row_groups = file_metadata->num_row_groups(); + + auto idx = file_metadata -> schema() -> group_node() -> FieldIndex(colname); + if(idx < 0) { + std::string dname(colname); + std::string fname(filename); + std::string msg = "Dataset: " + dname + " does not exist in file: " + fname; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + + int64_t i = 0; + int64_t arrayIdx = 0; + for (int r = 0; r < num_row_groups; r++) { + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + int16_t definition_level; // needed for any type that is nullable + + std::shared_ptr column_reader = row_group_reader->Column(idx); + if(lty == ARROWINT64 || lty == ARROWUINT64) { + int16_t 
definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (int64_t*)chpl_arr; + parquet::Int64Reader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + while (reader->HasNext() && arrayIdx < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[arrayIdx], &values_read); + // if values_read is 0, that means that it was an empty seg + if (values_read != 0) { + arrayIdx++; + } + i++; + } + } else if(lty == ARROWINT32 || lty == ARROWUINT32) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (int64_t*)chpl_arr; + parquet::Int32Reader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + int32_t tmp; + while (reader->HasNext() && arrayIdx < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &tmp, &values_read); + // if values_read is 0, that means that it was an empty seg + if (values_read != 0) { + chpl_ptr[arrayIdx] = (int64_t)tmp; + arrayIdx++; + } + i++; + } + } else if (lty == ARROWSTRING) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (unsigned char*)chpl_arr; + parquet::ByteArrayReader* reader = + static_cast(column_reader.get()); + + while (reader->HasNext()) { + parquet::ByteArray value; + (void)reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + // if values_read is 0, that means that it was a null value + if(values_read > 0 && definition_level == 3) { + for(int j = 0; j < value.len; j++) { + chpl_ptr[i] = value.ptr[j]; + i++; + } + i++; // skip one space so the strings are null terminated with a 0 + } + } + } else if(lty == ARROWBOOLEAN) { + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (bool*)chpl_arr; + parquet::BoolReader* reader = + static_cast(column_reader.get()); + startIdx -= reader->Skip(startIdx); + + while (reader->HasNext() && arrayIdx < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[arrayIdx], &values_read); + // if values_read is 0, that means that it was an empty seg + if (values_read != 0) { + arrayIdx++; + } + i++; + } + } else if(lty == ARROWFLOAT) { + // convert to simpler single batch to sidestep this seemingly architecture dependent (see issue #3234) + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (double*)chpl_arr; + parquet::FloatReader* reader = + static_cast(column_reader.get()); + + float tmp; + while (reader->HasNext() && arrayIdx < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &tmp, &values_read); + // if values_read is 0, that means that it was a null value or empty seg + if (values_read != 0) { + chpl_ptr[arrayIdx] = (double) tmp; + arrayIdx++; + } + else { + // check if nan otherwise it's an empty seg + if (definition_level == 2) { + chpl_ptr[arrayIdx] = NAN; + arrayIdx++; + } + } + i++; + } + } else if(lty == ARROWDOUBLE) { + // convert to simpler single batch to sidestep this seemingly architecture dependent (see issue #3234) + int16_t definition_level; // nullable type and only reading single records in batch + auto chpl_ptr = (double*)chpl_arr; + parquet::DoubleReader* reader = + static_cast(column_reader.get()); + + while (reader->HasNext() && arrayIdx < numElems) { + (void)reader->ReadBatch(1, &definition_level, nullptr, &chpl_ptr[arrayIdx], &values_read); + // if values_read is 0, that means that it was 
a null value or empty seg + if (values_read != 0) { + arrayIdx++; + } + else { + // check if nan otherwise it's an empty seg + if (definition_level == 2) { + chpl_ptr[arrayIdx] = NAN; + arrayIdx++; + } + } + i++; + } + } + } + return 0; + } + return ARROWERROR; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int64_t cpp_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) { + try { + int64_t ty = cpp_getType(filename, colname, errMsg); + int64_t dty; // used to store the type of data so we can handle lists + if (ty == ARROWLIST) { // get the type of the list so we can verify it is ARROWSTRING + dty = cpp_getListType(filename, colname, errMsg); + } + else { + dty = ty; + } + auto offsets = (int64_t*)chpl_offsets; + int64_t byteSize = 0; + + if(dty == ARROWSTRING) { + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(filename, false); + + std::shared_ptr file_metadata = parquet_reader->metadata(); + int num_row_groups = file_metadata->num_row_groups(); + + int64_t i = 0; + for (int r = 0; r < num_row_groups; r++) { + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + + std::shared_ptr column_reader; + + int64_t idx; + if (ty == ARROWLIST) { + idx = file_metadata -> schema() -> group_node() -> FieldIndex(colname); + } else { + idx = file_metadata -> schema() -> ColumnIndex(colname); + } + + if(idx < 0) { + std::string dname(colname); + std::string fname(filename); + std::string msg = "Dataset: " + dname + " does not exist in file: " + fname; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + column_reader = row_group_reader->Column(idx); + + int16_t definition_level; + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + + int64_t numRead = 0; + while (ba_reader->HasNext() && numRead < numElems) { + parquet::ByteArray value; + (void)ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + if ((ty == ARROWLIST && definition_level == 3) || ty == ARROWSTRING) { + if(values_read > 0) { + offsets[i] = value.len + 1; + byteSize += value.len + 1; + numRead += values_read; + } else { + offsets[i] = 1; + byteSize+=1; + numRead+=1; + } + i++; + } + } + } + return byteSize; + } + return ARROWERROR; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int64_t cpp_getListColumnSize(const char* filename, const char* colname, void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg) { + try { + int64_t ty = cpp_getType(filename, colname, errMsg); + auto seg_sizes = (int64_t*)chpl_seg_sizes; + int64_t listSize = 0; + + if (ty == ARROWLIST){ + int64_t lty = cpp_getListType(filename, colname, errMsg); + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(filename, false); + + std::shared_ptr file_metadata = parquet_reader->metadata(); + int num_row_groups = file_metadata->num_row_groups(); + + auto idx = file_metadata -> schema() -> group_node() -> FieldIndex(colname); + if(idx < 0) { + std::string dname(colname); + std::string fname(filename); + std::string msg = "Dataset: " + dname + " does not exist in file: " + fname; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + + int64_t i = 0; + int64_t vct = 0; + int64_t seg_size = 0; + int64_t off = 0; + bool first = true; + for (int r = 0; r < num_row_groups; r++) { + std::shared_ptr row_group_reader = + 
parquet_reader->RowGroup(r); + + int64_t values_read = 0; + + std::shared_ptr column_reader; + + column_reader = row_group_reader->Column(idx); + int16_t definition_level; + int16_t rep_lvl; + + if(lty == ARROWINT64 || lty == ARROWUINT64) { + parquet::Int64Reader* int_reader = + static_cast(column_reader.get()); + + while (int_reader->HasNext()) { + int64_t value; + (void)int_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read); + if (values_read == 0 || (!first && rep_lvl == 0)) { + seg_sizes[i] = seg_size; + i++; + seg_size = 0; + } + if (values_read != 0) { + seg_size++; + vct++; + if (first) { + first = false; + } + } + if (values_read != 0 && !int_reader->HasNext()){ + seg_sizes[i] = seg_size; + } + } + } else if(lty == ARROWINT32 || lty == ARROWUINT32) { + parquet::Int32Reader* int_reader = + static_cast(column_reader.get()); + + while (int_reader->HasNext()) { + int32_t value; + (void)int_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read); + if (values_read == 0 || (!first && rep_lvl == 0)) { + seg_sizes[i] = seg_size; + i++; + seg_size = 0; + } + if (values_read != 0) { + seg_size++; + vct++; + if (first) { + first = false; + } + } + if (values_read != 0 && !int_reader->HasNext()){ + seg_sizes[i] = seg_size; + } + } + } else if (lty == ARROWSTRING) { + parquet::ByteArrayReader* reader = + static_cast(column_reader.get()); + + while (reader->HasNext()) { + parquet::ByteArray value; + (void)reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read); + if (values_read == 0 || (!first && rep_lvl == 0)) { + seg_sizes[i] = seg_size; + i++; + seg_size = 0; + } + if (values_read != 0) { + seg_size++; + vct++; + if (first) { + first = false; + } + } + if (values_read != 0 && !reader->HasNext()){ + seg_sizes[i] = seg_size; + } + } + } else if(lty == ARROWBOOLEAN) { + parquet::BoolReader* bool_reader = + static_cast(column_reader.get()); + + while (bool_reader->HasNext()) { + bool value; + (void)bool_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read); + if (values_read == 0 || (!first && rep_lvl == 0)) { + seg_sizes[i] = seg_size; + i++; + seg_size = 0; + } + if (values_read != 0) { + seg_size++; + vct++; + if (first) { + first = false; + } + } + if (values_read != 0 && !bool_reader->HasNext()){ + seg_sizes[i] = seg_size; + } + } + } else if (lty == ARROWFLOAT) { + parquet::FloatReader* float_reader = + static_cast(column_reader.get()); + + int64_t numRead = 0; + while (float_reader->HasNext()) { + float value; + (void)float_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read); + if ((values_read == 0 && definition_level != 2) || (!first && rep_lvl == 0)) { + seg_sizes[i] = seg_size; + i++; + seg_size = 0; + } + if (values_read != 0 || (values_read == 0 && definition_level == 2)) { + seg_size++; + vct++; + if (first) { + first = false; + } + } + if ((values_read != 0 || (values_read == 0 && definition_level == 2)) && !float_reader->HasNext()){ + seg_sizes[i] = seg_size; + } + } + } else if(lty == ARROWDOUBLE) { + parquet::DoubleReader* dbl_reader = + static_cast(column_reader.get()); + + while (dbl_reader->HasNext()) { + double value; + (void)dbl_reader->ReadBatch(1, &definition_level, &rep_lvl, &value, &values_read); + if ((values_read == 0 && definition_level != 2) || (!first && rep_lvl == 0)) { + seg_sizes[i] = seg_size; + i++; + seg_size = 0; + } + if (values_read != 0 || (values_read == 0 && definition_level == 2)) { + seg_size++; + vct++; + if (first) { + first = false; + } + } + if 
((values_read != 0 || (values_read == 0 && definition_level == 2)) && !dbl_reader->HasNext()){ + seg_sizes[i] = seg_size; + } + } + } + } + return vct; + } + return ARROWERROR; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +extern "C" { + int c_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg) { + return cpp_readColumnByName(filename, chpl_arr, where_null_chpl, colname, numElems, startIdx, batchSize, byteLength, hasNonFloatNulls, errMsg); + } + + int c_readListColumnByName(const char* filename, void* chpl_arr, const char* colname, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) { + return cpp_readListColumnByName(filename, chpl_arr, colname, numElems, startIdx, batchSize, errMsg); + } + + int64_t c_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg) { + return cpp_getStringColumnNumBytes(filename, colname, chpl_offsets, numElems, startIdx, batchSize, errMsg); + } + + int64_t c_getListColumnSize(const char* filename, const char* colname, void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg) { + return cpp_getListColumnSize(filename, colname, chpl_seg_sizes, numElems, startIdx, errMsg); + } +} diff --git a/src/parquet/ReadParquet.h b/src/parquet/ReadParquet.h new file mode 100644 index 0000000000..52e1871f0c --- /dev/null +++ b/src/parquet/ReadParquet.h @@ -0,0 +1,46 @@ +#include +#include + +// Wrap functions in C extern if compiling C++ object file +#ifdef __cplusplus +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +extern "C" { +#endif + + + int c_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, + const char* colname, int64_t numElems, int64_t startIdx, + int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg); + int cpp_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, + const char* colname, int64_t numElems, int64_t startIdx, + int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg); + + int c_readListColumnByName(const char* filename, void* chpl_arr, + const char* colname, int64_t numElems, + int64_t startIdx, int64_t batchSize, char** errMsg); + int cpp_readListColumnByName(const char* filename, void* chpl_arr, + const char* colname, int64_t numElems, + int64_t startIdx, int64_t batchSize, char** errMsg); + + int64_t cpp_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, + int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg); + int64_t c_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, + int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg); + + int64_t c_getListColumnSize(const char* filename, const char* colname, + void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg); + int64_t cpp_getListColumnSize(const char* filename, const char* colname, + void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg); + +#ifdef __cplusplus +} +#endif diff --git a/src/parquet/UtilParquet.cpp b/src/parquet/UtilParquet.cpp new file mode 100644 index 0000000000..e91b727d18 --- /dev/null +++ b/src/parquet/UtilParquet.cpp @@ -0,0 +1,774 @@ +#include "UtilParquet.h" 
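+
+/*
+  (Editorial sketch, not part of the original patch.) The ARROWRESULT_OK and
+  ARROWSTATUS_OK macros defined below give every Arrow/Parquet call in this
+  file a uniform error path back to Chapel. A minimal consumer of that
+  contract, mirroring cpp_getNumRows below ("countRows" is a hypothetical
+  name used only for illustration):
+
+      int64_t countRows(const char* filename, char** errMsg) {
+        std::shared_ptr<arrow::io::ReadableFile> infile;
+        // on failure the macro copies the status message into *errMsg
+        // (via strdup) and returns ARROWERROR
+        ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename), infile);
+
+        std::unique_ptr<parquet::arrow::FileReader> reader;
+        ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+
+        return reader->parquet_reader()->metadata()->num_rows();
+      }
+
+  The Chapel side is then expected to surface the message and release it
+  (see cpp_free_string).
+*/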
+
+  /*
+    Arrow Error Helpers
+    -------------------
+    Arrow provides PARQUET_ASSIGN_OR_THROW and other similar macros
+    to help with error handling, but since we are doing something
+    unique (passing back the error message to Chapel to be displayed),
+    these helpers are similar to the provided macros but matching our
+    functionality.
+  */
+
+  // The `ARROWRESULT_OK` macro should be used when trying to
+  // assign the result of an Arrow/Parquet function to a value that can
+  // potentially throw an error, so the argument `cmd` is the Arrow
+  // command to execute and `res` is the desired variable to store the
+  // result
+#define ARROWRESULT_OK(cmd, res)                           \
+  {                                                        \
+    auto result = cmd;                                     \
+    if(!result.ok()) {                                     \
+      *errMsg = strdup(result.status().message().c_str()); \
+      return ARROWERROR;                                   \
+    }                                                      \
+    res = result.ValueOrDie();                             \
+  }
+
+  // The `ARROWSTATUS_OK` macro should be used when calling an
+  // Arrow/Parquet function that returns a status. The `cmd`
+  // argument should be the Arrow function to execute.
+#define ARROWSTATUS_OK(cmd)           \
+  if(!check_status_ok(cmd, errMsg))   \
+    return ARROWERROR;
+
+static std::map<int64_t, std::shared_ptr<parquet::ParquetFileReader>> globalFiles;
+static std::map<int64_t, std::shared_ptr<parquet::RowGroupReader>> globalRowGroupReaders;
+static std::map<int64_t, std::shared_ptr<parquet::ColumnReader>> globalColumnReaders;
+
+bool check_status_ok(arrow::Status status, char** errMsg) {
+  if(!status.ok()) {
+    *errMsg = strdup(status.message().c_str());
+    return false;
+  }
+  return true;
+}
+
+/*
+  C++ functions
+  -------------
+  These C++ functions are used to call into the Arrow library
+  and are then called by their corresponding C functions to
+  allow interoperability with Chapel. This means that all of the
+  C++ functions must return types that are C compatible.
+*/
+
+int64_t cpp_getNumRows(const char* filename, char** errMsg) {
+  try {
+    std::shared_ptr<arrow::io::ReadableFile> infile;
+    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
+                   infile);
+
+    std::unique_ptr<parquet::arrow::FileReader> reader;
+    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+
+    return reader -> parquet_reader() -> metadata() -> num_rows();
+  } catch (const std::exception& e) {
+    *errMsg = strdup(e.what());
+    return ARROWERROR;
+  }
+}
+
+int cpp_getPrecision(const char* filename, const char* colname, char** errMsg) {
+  try {
+    std::shared_ptr<arrow::io::ReadableFile> infile;
+    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
+                   infile);
+
+    std::unique_ptr<parquet::arrow::FileReader> reader;
+    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+
+    std::shared_ptr<arrow::Schema> sc;
+    std::shared_ptr<arrow::Schema>* out = &sc;
+    ARROWSTATUS_OK(reader->GetSchema(out));
+
+    int idx = sc -> GetFieldIndex(colname);
+
+    const auto& decimal_type = static_cast<const arrow::DecimalType&>(*sc->field(idx)->type());
+    const int64_t precision = decimal_type.precision();
+
+    return precision;
+  } catch (const std::exception& e) {
+    *errMsg = strdup(e.what());
+    return ARROWERROR;
+  }
+}
+
+int cpp_getType(const char* filename, const char* colname, char** errMsg) {
+  try {
+    std::shared_ptr<arrow::io::ReadableFile> infile;
+    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
+                   infile);
+
+    std::unique_ptr<parquet::arrow::FileReader> reader;
+    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+
+    std::shared_ptr<arrow::Schema> sc;
+    std::shared_ptr<arrow::Schema>* out = &sc;
+    ARROWSTATUS_OK(reader->GetSchema(out));
+
+    int idx = sc -> GetFieldIndex(colname);
+    // Since this doesn't actually throw a Parquet error, we have to generate
+    // our own error message for this case
+    if(idx == -1) {
+      std::string fname(filename);
+      std::string dname(colname);
+      std::string msg = "Dataset: " + dname + " does not exist in file: " + filename;
+      *errMsg = strdup(msg.c_str());
+      return ARROWERROR;
+    }
+    auto myType = sc -> field(idx) -> type();
+
+    if(myType->id() == arrow::Type::INT64)
+      return ARROWINT64;
+    else if(myType->id() == arrow::Type::INT32 || myType->id() == arrow::Type::INT16)
+      return ARROWINT32; // int16 is logical type, stored as int32
+    else if(myType->id() == arrow::Type::UINT64)
+      return ARROWUINT64;
+    else if(myType->id() == arrow::Type::UINT32 ||
+            myType->id() == arrow::Type::UINT16)
+      return ARROWUINT32; // uint16 is logical type, stored as uint32
+    else if(myType->id() == arrow::Type::TIMESTAMP)
+      return ARROWTIMESTAMP;
+    else if(myType->id() == arrow::Type::BOOL)
+      return ARROWBOOLEAN;
+    else if(myType->id() == arrow::Type::STRING ||
+            myType->id() == arrow::Type::BINARY ||
+            myType->id() == arrow::Type::LARGE_STRING)
+      return ARROWSTRING;
+    else if(myType->id() == arrow::Type::FLOAT)
+      return ARROWFLOAT;
+    else if(myType->id() == arrow::Type::DOUBLE)
+      return ARROWDOUBLE;
+    else if(myType->id() == arrow::Type::LIST)
+      return ARROWLIST;
+    else if(myType->id() == arrow::Type::DECIMAL)
+      return ARROWDECIMAL;
+    else {
+      std::string fname(filename);
+      std::string dname(colname);
+      std::string msg = "Unsupported type on column: " + dname + " in " + filename;
+      *errMsg = strdup(msg.c_str());
+      return ARROWERROR;
+    }
+  } catch (const std::exception& e) {
+    *errMsg = strdup(e.what());
+    return ARROWERROR;
+  }
+}
+
+int cpp_getListType(const char* filename, const char* colname, char** errMsg) {
+  try {
+    std::shared_ptr<arrow::io::ReadableFile> infile;
+    ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()),
+                   infile);
+
+    std::unique_ptr<parquet::arrow::FileReader> reader;
+    ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader));
+
+    std::shared_ptr<arrow::Schema> sc;
+    std::shared_ptr<arrow::Schema>* out = &sc;
+    ARROWSTATUS_OK(reader->GetSchema(out));
+
+    int idx = sc -> GetFieldIndex(colname);
+    // Since this doesn't actually throw a Parquet error, we have to generate
+    // our own error message for this case
+    if(idx == -1) {
+      std::string fname(filename);
+      std::string dname(colname);
+      std::string msg = "Dataset: " + dname + " does not exist in file: " + fname;
+      *errMsg = strdup(msg.c_str());
+      return ARROWERROR;
+    }
+    auto myType = sc -> field(idx) -> type();
+
+    if (myType->id() == arrow::Type::LIST) {
+      if (myType->num_fields() != 1) {
+        std::string fname(filename);
+        std::string dname(colname);
+        std::string msg = "Column " + dname + " in " + fname + " cannot be read by Arkouda.";
+        *errMsg = strdup(msg.c_str());
+        return ARROWERROR;
+      }
+      else {
+        // fields() returns a vector of fields; since we are expecting a list,
+        // it should only contain one item here
+        auto field = myType->fields()[0];
+        auto f_type = field->type();
+        if(f_type->id() == arrow::Type::INT64)
+          return ARROWINT64;
+        else if(f_type->id() == arrow::Type::INT32 || f_type->id() == arrow::Type::INT16)
+          return ARROWINT32;
+        else if(f_type->id() == arrow::Type::UINT64)
+          return ARROWUINT64;
+        else if(f_type->id() == arrow::Type::UINT32 || f_type->id() == arrow::Type::UINT16)
+          return ARROWUINT32;
+        else if(f_type->id() == arrow::Type::TIMESTAMP)
+          return ARROWTIMESTAMP;
+        else if(f_type->id() == arrow::Type::BOOL)
+          return ARROWBOOLEAN;
+        else if(f_type->id() == arrow::Type::STRING ||
+                f_type->id() == arrow::Type::BINARY ||
+                f_type->id() == arrow::Type::LARGE_STRING) // Verify that this is functional as expected
+          return ARROWSTRING;
+        else if(f_type->id() == arrow::Type::FLOAT)
+          return ARROWFLOAT;
else if(f_type->id() == arrow::Type::DOUBLE) + return ARROWDOUBLE; + else { + std::string fname(filename); + std::string dname(colname); + std::string msg = "Unsupported type on column: " + dname + " in " + fname; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + } + } + else { + std::string fname(filename); + std::string dname(colname); + std::string msg = "Column " + dname + " in " + fname + " is not a List"; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int64_t cpp_getStringColumnNullIndices(const char* filename, const char* colname, void* chpl_nulls, char** errMsg) { + try { + int64_t ty = cpp_getType(filename, colname, errMsg); + auto null_indices = (int64_t*)chpl_nulls; + int64_t byteSize = 0; + + if(ty == ARROWSTRING) { + std::unique_ptr parquet_reader = + parquet::ParquetFileReader::OpenFile(filename, false); + + std::shared_ptr file_metadata = parquet_reader->metadata(); + int num_row_groups = file_metadata->num_row_groups(); + + int64_t i = 0; + for (int r = 0; r < num_row_groups; r++) { + std::shared_ptr row_group_reader = + parquet_reader->RowGroup(r); + + int64_t values_read = 0; + + std::shared_ptr column_reader; + + auto idx = file_metadata -> schema() -> ColumnIndex(colname); + + if(idx < 0) { + std::string dname(colname); + std::string fname(filename); + std::string msg = "Dataset: " + dname + " does not exist in file: " + fname; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + column_reader = row_group_reader->Column(idx); + int16_t definition_level; + parquet::ByteArrayReader* ba_reader = + static_cast(column_reader.get()); + + while (ba_reader->HasNext()) { + parquet::ByteArray value; + (void)ba_reader->ReadBatch(1, &definition_level, nullptr, &value, &values_read); + if(values_read == 0) + null_indices[i] = 1; + i++; + } + } + return 0; + } + return ARROWERROR; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +// configure the schema for a multicolumn file +std::shared_ptr SetupSchema(void* column_names, void * objTypes, void* datatypes, int64_t colnum) { + parquet::schema::NodeVector fields; + auto cname_ptr = (char**)column_names; + auto dtypes_ptr = (int64_t*) datatypes; + auto objType_ptr = (int64_t*) objTypes; + for (int64_t i = 0; i < colnum; i++){ + if(dtypes_ptr[i] == ARROWINT64) { + if (objType_ptr[i] == SEGARRAY){ + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } else { + fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::NONE)); + } + } else if(dtypes_ptr[i] == ARROWUINT64) { + if (objType_ptr[i] == SEGARRAY){ + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::UINT_64); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } else { + 
fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::UINT_64)); + } + } else if(dtypes_ptr[i] == ARROWBOOLEAN) { + if (objType_ptr[i] == SEGARRAY){ + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } else { + fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE)); + } + } else if(dtypes_ptr[i] == ARROWDOUBLE) { + if (objType_ptr[i] == SEGARRAY) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } else { + fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE)); + } + } else if(dtypes_ptr[i] == ARROWSTRING) { + if (objType_ptr[i] == SEGARRAY) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } else { + fields.push_back(parquet::schema::PrimitiveNode::Make(cname_ptr[i], parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE)); + } + } + } + return std::static_pointer_cast( + parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); +} + +int cpp_createEmptyListParquetFile(const char* filename, const char* dsetname, int64_t dtype, + int64_t compression, char** errMsg) { + try { + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename)); + + parquet::schema::NodeVector fields; + if (dtype == ARROWINT64) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + else if (dtype == ARROWUINT64) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::UINT_64); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + else if (dtype == ARROWBOOLEAN) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE); + auto list 
= parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + else if (dtype == ARROWDOUBLE) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + std::shared_ptr schema = std::static_pointer_cast + (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + + parquet::WriterProperties::Builder builder; + // assign the proper compression + if(compression == SNAPPY_COMP) { + builder.compression(parquet::Compression::SNAPPY); + } else if (compression == GZIP_COMP) { + builder.compression(parquet::Compression::GZIP); + } else if (compression == BROTLI_COMP) { + builder.compression(parquet::Compression::BROTLI); + } else if (compression == ZSTD_COMP) { + builder.compression(parquet::Compression::ZSTD); + } else if (compression == LZ4_COMP) { + builder.compression(parquet::Compression::LZ4); + } + std::shared_ptr props = builder.build(); + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + file_writer->Close(); + ARROWSTATUS_OK(out_file->Close()); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int cpp_createEmptyParquetFile(const char* filename, const char* dsetname, int64_t dtype, + int64_t compression, char** errMsg) { + try { + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename)); + + parquet::schema::NodeVector fields; + if(dtype == ARROWINT64) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::NONE)); + else if(dtype == ARROWUINT64) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::UINT_64)); + else if(dtype == ARROWBOOLEAN) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE)); + else if(dtype == ARROWDOUBLE) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE)); + else if(dtype == ARROWSTRING) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE)); + std::shared_ptr schema = std::static_pointer_cast + (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + + parquet::WriterProperties::Builder builder; + // assign the proper compression + if(compression == SNAPPY_COMP) { + builder.compression(parquet::Compression::SNAPPY); + } else if (compression == GZIP_COMP) { + builder.compression(parquet::Compression::GZIP); + } else if (compression == BROTLI_COMP) { + builder.compression(parquet::Compression::BROTLI); + } else if (compression == ZSTD_COMP) { + builder.compression(parquet::Compression::ZSTD); + } else if (compression == LZ4_COMP) { + 
builder.compression(parquet::Compression::LZ4); + } + std::shared_ptr<parquet::WriterProperties> props = builder.build(); + std::shared_ptr<parquet::ParquetFileWriter> file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + file_writer->Close(); + ARROWSTATUS_OK(out_file->Close()); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int cpp_appendColumnToParquet(const char* filename, void* chpl_arr, + const char* dsetname, int64_t numelems, + int64_t dtype, int64_t compression, + char** errMsg) { + try { + if (chpl_arr == NULL){ + // early out to prevent bad memory access + return 0; + } + std::shared_ptr<arrow::io::ReadableFile> infile; + ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), + infile); + std::unique_ptr<parquet::arrow::FileReader> reader; + ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + // Use threads for the case of reading a table with many columns + reader->set_use_threads(true); + + std::shared_ptr<arrow::Table> table; + std::shared_ptr<arrow::Table>* hold_table = &table; + ARROWSTATUS_OK(reader->ReadTable(hold_table)); + + arrow::ArrayVector arrays; + std::shared_ptr<arrow::Array> values; + auto chunk_type = arrow::int64(); + if(dtype == ARROWINT64) { + chunk_type = arrow::int64(); + arrow::Int64Builder builder; + auto chpl_ptr = (int64_t*)chpl_arr; + ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr)) + ARROWSTATUS_OK(builder.Finish(&values)); + } else if(dtype == ARROWUINT64) { + chunk_type = arrow::uint64(); + arrow::UInt64Builder builder; + auto chpl_ptr = (uint64_t*)chpl_arr; + ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr)) + ARROWSTATUS_OK(builder.Finish(&values)); + } else if(dtype == ARROWBOOLEAN) { + chunk_type = arrow::boolean(); + arrow::BooleanBuilder builder; + auto chpl_ptr = (uint8_t*)chpl_arr; + ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr)) + ARROWSTATUS_OK(builder.Finish(&values)); + } else if(dtype == ARROWSTRING) { + chunk_type = arrow::utf8(); + arrow::StringBuilder builder; + auto chpl_ptr = (uint8_t*)chpl_arr; + int64_t j = 0; + for(int64_t i = 0; i < numelems; i++) { + std::string tmp_str = ""; + while(chpl_ptr[j] != 0x00) { + tmp_str += chpl_ptr[j++]; + } + j++; + + auto const status = builder.Append(tmp_str); + if (status.IsCapacityError()) { + // Reached the current chunk's capacity limit, so start a new one... + ARROWSTATUS_OK(builder.Finish(&values)); + arrays.push_back(values); + values.reset(); + builder.Reset(); + + // ...with this string as its first item.
+ ARROWSTATUS_OK(builder.Append(tmp_str)); + } else { + ARROWSTATUS_OK(status); + } + } + ARROWSTATUS_OK(builder.Finish(&values)); + } else if(dtype == ARROWDOUBLE) { + chunk_type = arrow::float64(); + arrow::DoubleBuilder builder; + auto chpl_ptr = (double*)chpl_arr; + ARROWSTATUS_OK(builder.AppendValues(chpl_ptr, numelems, nullptr)) + ARROWSTATUS_OK(builder.Finish(&values)); + } else { + std::string msg = "Unrecognized Parquet dtype"; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + arrays.push_back(values); + + std::shared_ptr<arrow::ChunkedArray> chunk_sh_ptr; + ARROWRESULT_OK(arrow::ChunkedArray::Make({arrays}, chunk_type), chunk_sh_ptr); + + auto newField = arrow::field(dsetname, chunk_type); + std::shared_ptr<arrow::Table> fin_table; + ARROWRESULT_OK(table->AddColumn(0, newField, chunk_sh_ptr), fin_table); + + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr<FileClass> out_file; + ARROWRESULT_OK(FileClass::Open(filename), out_file); + ARROWSTATUS_OK(parquet::arrow::WriteTable(*fin_table, arrow::default_memory_pool(), out_file, numelems)); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +const char* cpp_getVersionInfo(void) { + return strdup(arrow::GetBuildInfo().version_string.c_str()); +} + +int cpp_getDatasetNames(const char* filename, char** dsetResult, bool readNested, char** errMsg) { + try { + std::shared_ptr<arrow::io::ReadableFile> infile; + ARROWRESULT_OK(arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), + infile); + std::unique_ptr<parquet::arrow::FileReader> reader; + ARROWSTATUS_OK(parquet::arrow::OpenFile(infile, arrow::default_memory_pool(), &reader)); + + std::shared_ptr<arrow::Schema> sc; + std::shared_ptr<arrow::Schema>* out = &sc; + ARROWSTATUS_OK(reader->GetSchema(out)); + + std::string fields = ""; + bool first = true; + + for(int i = 0; i < sc->num_fields(); i++) { + // only add fields of supported types + if(sc->field(i)->type()->id() == arrow::Type::INT64 || + sc->field(i)->type()->id() == arrow::Type::INT32 || + sc->field(i)->type()->id() == arrow::Type::INT16 || + sc->field(i)->type()->id() == arrow::Type::UINT64 || + sc->field(i)->type()->id() == arrow::Type::UINT32 || + sc->field(i)->type()->id() == arrow::Type::UINT16 || + sc->field(i)->type()->id() == arrow::Type::TIMESTAMP || + sc->field(i)->type()->id() == arrow::Type::BOOL || + sc->field(i)->type()->id() == arrow::Type::STRING || + sc->field(i)->type()->id() == arrow::Type::BINARY || + sc->field(i)->type()->id() == arrow::Type::FLOAT || + sc->field(i)->type()->id() == arrow::Type::DOUBLE || + (sc->field(i)->type()->id() == arrow::Type::LIST && readNested) || + sc->field(i)->type()->id() == arrow::Type::DECIMAL || + sc->field(i)->type()->id() == arrow::Type::LARGE_STRING + ) { + if(!first) + fields += ("," + sc->field(i)->name()); + else + fields += (sc->field(i)->name()); + first = false; + } else if (sc->field(i)->type()->id() == arrow::Type::LIST && !readNested) { + continue; + } else { + std::string fname(filename); + std::string dname(sc->field(i)->ToString()); + std::string msg = "Unsupported type on column: " + dname + " in " + fname; + *errMsg = strdup(msg.c_str()); + return ARROWERROR; + } + } + *dsetResult = strdup(fields.c_str()); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +void cpp_free_string(void* ptr) { + free(ptr); +} + +void cpp_openFile(const char* filename, int64_t idx) { + std::shared_ptr<parquet::ParquetFileReader> parquet_reader = + parquet::ParquetFileReader::OpenFile(filename, false); + globalFiles[idx] = parquet_reader; +} + +void
cpp_createRowGroupReader(int64_t rowGroup, int64_t readerIdx) { + std::shared_ptr<parquet::RowGroupReader> row_group_reader = + globalFiles[readerIdx]->RowGroup(rowGroup); + globalRowGroupReaders[readerIdx] = row_group_reader; +} + +void cpp_createColumnReader(const char* colname, int64_t readerIdx) { + std::shared_ptr<parquet::FileMetaData> file_metadata = globalFiles[readerIdx]->metadata(); + auto idx = file_metadata->schema()->ColumnIndex(colname); + + std::shared_ptr<parquet::ColumnReader> column_reader; + column_reader = globalRowGroupReaders[readerIdx]->Column(idx); + globalColumnReaders[readerIdx] = column_reader; +} + +int cpp_getNumRowGroups(int64_t readerIdx) { + std::shared_ptr<parquet::FileMetaData> file_metadata = globalFiles[readerIdx]->metadata(); + return file_metadata->num_row_groups(); +} + +void cpp_freeMapValues(void* row) { + parquet::ByteArray* string_values = + static_cast<parquet::ByteArray*>(row); + free(string_values); + globalColumnReaders.clear(); + globalRowGroupReaders.clear(); + globalFiles.clear(); +} + +int cpp_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems, + int64_t readerIdx, int64_t* numRead, + void** outData, bool* containsNulls, char** errMsg) { + try { + auto reader = static_cast<parquet::ByteArrayReader*>(globalColumnReaders[readerIdx].get()); + parquet::ByteArray* string_values = + (parquet::ByteArray*)malloc(numElems*sizeof(parquet::ByteArray)); + std::vector<int16_t> definition_level(batchSize); + int64_t values_read = 0; + int64_t total_read = 0; + while(reader->HasNext() && total_read < numElems) { + if((numElems - total_read) < batchSize) + batchSize = numElems - total_read; + // read the definition levels along with the values; a definition level of 0 indicates a null value + (void)reader->ReadBatch(batchSize, definition_level.data(), nullptr, string_values + total_read, &values_read); + for(int i = 0; i < values_read; i++) { + if(definition_level[i] == 0) + *containsNulls = true; + } + total_read += values_read; + } + *numRead = total_read; + *outData = (void*)string_values; + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +/* + C functions + ----------- + These C functions provide no functionality of their own; the C++ + Arrow library does the real work, and these wrappers merely forward + to the C++ functions so that Chapel can reach them through its C + interoperability layer. Every C++ function that Chapel needs to call + must therefore have a corresponding C wrapper.
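+ For example, cpp_getNumRowGroups below is exposed to Chapel as c_getNumRowGroups; Chapel declares only the c_ wrapper as an extern proc and never references the C++ symbol directly.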
+*/ + +extern "C" { + int64_t c_getNumRows(const char* chpl_str, char** errMsg) { + return cpp_getNumRows(chpl_str, errMsg); + } + + int c_getType(const char* filename, const char* colname, char** errMsg) { + return cpp_getType(filename, colname, errMsg); + } + + int c_getListType(const char* filename, const char* colname, char** errMsg) { + return cpp_getListType(filename, colname, errMsg); + } + + int c_createEmptyParquetFile(const char* filename, const char* dsetname, int64_t dtype, + int64_t compression, char** errMsg) { + return cpp_createEmptyParquetFile(filename, dsetname, dtype, compression, errMsg); + } + + int c_createEmptyListParquetFile(const char* filename, const char* dsetname, int64_t dtype, + int64_t compression, char** errMsg) { + return cpp_createEmptyListParquetFile(filename, dsetname, dtype, compression, errMsg); + } + + int c_appendColumnToParquet(const char* filename, void* chpl_arr, + const char* dsetname, int64_t numelems, + int64_t dtype, int64_t compression, + char** errMsg) { + return cpp_appendColumnToParquet(filename, chpl_arr, + dsetname, numelems, + dtype, compression, + errMsg); + } + + int64_t c_getStringColumnNullIndices(const char* filename, const char* colname, void* chpl_nulls, char** errMsg) { + return cpp_getStringColumnNullIndices(filename, colname, chpl_nulls, errMsg); + } + + const char* c_getVersionInfo(void) { + return cpp_getVersionInfo(); + } + + int c_getDatasetNames(const char* filename, char** dsetResult, bool readNested, char** errMsg) { + return cpp_getDatasetNames(filename, dsetResult, readNested, errMsg); + } + + void c_free_string(void* ptr) { + cpp_free_string(ptr); + } + + int c_getPrecision(const char* filename, const char* colname, char** errMsg) { + return cpp_getPrecision(filename, colname, errMsg); + } + + void c_openFile(const char* filename, int64_t idx) { + cpp_openFile(filename, idx); + } + + void c_createRowGroupReader(int64_t rowGroup, int64_t readerIdx) { + return cpp_createRowGroupReader(rowGroup, readerIdx); + } + + void c_createColumnReader(const char* colname, int64_t readerIdx) { + cpp_createColumnReader(colname, readerIdx); + } + + int c_getNumRowGroups(int64_t readerIdx) { + return cpp_getNumRowGroups(readerIdx); + } + + void c_freeMapValues(void* row) { + cpp_freeMapValues(row); + } + + int c_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems, + int64_t readerIdx, int64_t* numRead, + void** outData, bool* containsNulls, char** errMsg) { + return cpp_readParquetColumnChunks(filename, batchSize, numElems, readerIdx, + numRead, outData, containsNulls, errMsg); + } +} diff --git a/src/ArrowFunctions.h b/src/parquet/UtilParquet.h similarity index 67% rename from src/ArrowFunctions.h rename to src/parquet/UtilParquet.h index 4f73487eec..8bf67256a7 100644 --- a/src/ArrowFunctions.h +++ b/src/parquet/UtilParquet.h @@ -13,6 +13,9 @@ #include #include #include + +std::shared_ptr SetupSchema(void* column_names, void * objTypes, void* datatypes, int64_t colnum); + extern "C" { #endif @@ -55,13 +58,6 @@ extern "C" { void c_createColumnReader(const char* colname, int64_t readerIdx); void cpp_createColumnReader(const char* colname, int64_t readerIdx); - int c_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems, - int64_t readerIdx, int64_t* numRead, - void** outData, bool* containsNulls, char** errMsg); - int cpp_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems, - int64_t readerIdx, int64_t* numRead, - void** outData, bool* 
containsNulls, char** errMsg); - int c_getNumRowGroups(int64_t readerIdx); int cpp_getNumRowGroups(int64_t readerIdx); @@ -71,30 +67,6 @@ extern "C" { // is no C++ interoperability supported in Chapel today. int64_t c_getNumRows(const char*, char** errMsg); int64_t cpp_getNumRows(const char*, char** errMsg); - - int c_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, - const char* colname, int64_t numElems, int64_t startIdx, - int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg); - int cpp_readColumnByName(const char* filename, void* chpl_arr, bool* where_null_chpl, - const char* colname, int64_t numElems, int64_t startIdx, - int64_t batchSize, int64_t byteLength, bool hasNonFloatNulls, char** errMsg); - - int c_readListColumnByName(const char* filename, void* chpl_arr, - const char* colname, int64_t numElems, - int64_t startIdx, int64_t batchSize, char** errMsg); - int cpp_readListColumnByName(const char* filename, void* chpl_arr, - const char* colname, int64_t numElems, - int64_t startIdx, int64_t batchSize, char** errMsg); - - int64_t cpp_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, - int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg); - int64_t c_getStringColumnNumBytes(const char* filename, const char* colname, void* chpl_offsets, - int64_t numElems, int64_t startIdx, int64_t batchSize, char** errMsg); - - int64_t c_getListColumnSize(const char* filename, const char* colname, - void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg); - int64_t cpp_getListColumnSize(const char* filename, const char* colname, - void* chpl_seg_sizes, int64_t numElems, int64_t startIdx, char** errMsg); int64_t c_getStringColumnNullIndices(const char* filename, const char* colname, void* chpl_nulls, char** errMsg); int64_t cpp_getStringColumnNullIndices(const char* filename, const char* colname, void* chpl_nulls, char** errMsg); @@ -135,15 +107,6 @@ extern "C" { const char* dsetname, int64_t numelems, int64_t rowGroupSize, int64_t dtype, int64_t compression, char** errMsg); - - int c_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, - void* chpl_arr, const char* dsetname, int64_t numelems, - int64_t rowGroupSize, int64_t dtype, int64_t compression, - char** errMsg); - int cpp_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, - void* chpl_arr, const char* dsetname, int64_t numelems, - int64_t rowGroupSize, int64_t dtype, int64_t compression, - char** errMsg); int c_createEmptyParquetFile(const char* filename, const char* dsetname, int64_t dtype, int64_t compression, char** errMsg); @@ -158,16 +121,6 @@ extern "C" { const char* dsetname, int64_t numelems, int64_t dtype, int64_t compression, char** errMsg); - - int c_writeMultiColToParquet(const char* filename, void* column_names, - void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes, - void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize, - int64_t compression, char** errMsg); - - int cpp_writeMultiColToParquet(const char* filename, void* column_names, - void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes, - void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize, - int64_t compression, char** errMsg); int c_getPrecision(const char* filename, const char* colname, char** errMsg); int cpp_getPrecision(const char* filename, const char* colname, char** errMsg); @@ -183,7 +136,15 @@ extern "C" 
{ void c_freeMapValues(void* row); void cpp_freeMapValues(void* row); + + int c_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems, + int64_t readerIdx, int64_t* numRead, + void** outData, bool* containsNulls, char** errMsg); + int cpp_readParquetColumnChunks(const char* filename, int64_t batchSize, int64_t numElems, + int64_t readerIdx, int64_t* numRead, + void** outData, bool* containsNulls, char** errMsg); #ifdef __cplusplus + bool check_status_ok(arrow::Status status, char** errMsg); } #endif diff --git a/src/parquet/WriteParquet.cpp b/src/parquet/WriteParquet.cpp new file mode 100644 index 0000000000..b8c6a07d39 --- /dev/null +++ b/src/parquet/WriteParquet.cpp @@ -0,0 +1,829 @@ +#include "WriteParquet.h" +#include "UtilParquet.h" + + /* + Arrow Error Helpers + ------------------- + Arrow provides PARQUET_ASSIGN_OR_THROW and other similar macros + to help with error handling, but since we are doing something + unique (passing the error message back to Chapel to be displayed), + these helpers mirror the provided macros while matching our + functionality. + */ + + // The `ARROWRESULT_OK` macro should be used when trying to + // assign the result of an Arrow/Parquet call that can + // potentially fail, so the argument `cmd` is the Arrow + // command to execute and `res` is the desired variable to store the + // result +#define ARROWRESULT_OK(cmd, res) \ + { \ + auto result = cmd; \ + if(!result.ok()) { \ + *errMsg = strdup(result.status().message().c_str()); \ + return ARROWERROR; \ + } \ + res = result.ValueOrDie(); \ + } + + // The `ARROWSTATUS_OK` macro should be used when calling an + // Arrow/Parquet function that returns a status. The `cmd` + // argument should be the Arrow function to execute.
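+ // `ARROWSTATUS_OK` below delegates to check_status_ok, which is declared in
+ // UtilParquet.h but whose definition is not shown in this diff. A plausible
+ // implementation, mirroring how `ARROWRESULT_OK` captures errors (a sketch,
+ // not the verbatim source):
+ //
+ //   bool check_status_ok(arrow::Status status, char** errMsg) {
+ //     if(!status.ok()) {
+ //       *errMsg = strdup(status.message().c_str());
+ //       return false;
+ //     }
+ //     return true;
+ //   }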
+#define ARROWSTATUS_OK(cmd) \ + if(!check_status_ok(cmd, errMsg)) \ + return ARROWERROR; + +int cpp_writeColumnToParquet(const char* filename, void* chpl_arr, + int64_t colnum, const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + try { + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr<FileClass> out_file; + ARROWRESULT_OK(FileClass::Open(filename), out_file); + + parquet::schema::NodeVector fields; + if(dtype == ARROWINT64) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::NONE)); + else if(dtype == ARROWUINT64) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::INT64, parquet::ConvertedType::UINT_64)); + else if(dtype == ARROWBOOLEAN) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE)); + else if(dtype == ARROWDOUBLE) + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::REQUIRED, parquet::Type::DOUBLE, parquet::ConvertedType::NONE)); + std::shared_ptr<parquet::schema::GroupNode> schema = std::static_pointer_cast<parquet::schema::GroupNode> + (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + + parquet::WriterProperties::Builder builder; + // assign the proper compression + if(compression == SNAPPY_COMP) { + builder.compression(parquet::Compression::SNAPPY); + } else if (compression == GZIP_COMP) { + builder.compression(parquet::Compression::GZIP); + } else if (compression == BROTLI_COMP) { + builder.compression(parquet::Compression::BROTLI); + } else if (compression == ZSTD_COMP) { + builder.compression(parquet::Compression::ZSTD); + } else if (compression == LZ4_COMP) { + builder.compression(parquet::Compression::LZ4); + } + std::shared_ptr<parquet::WriterProperties> props = builder.build(); + + std::shared_ptr<parquet::ParquetFileWriter> file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + int64_t i = 0; + int64_t numLeft = numelems; + + if (chpl_arr == NULL) { + // early out to prevent bad memory access + return 0; + } + + if(dtype == ARROWINT64 || dtype == ARROWUINT64) { + auto chpl_ptr = (int64_t*)chpl_arr; + while(numLeft > 0) { + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::Int64Writer* int64_writer = + static_cast<parquet::Int64Writer*>(rg_writer->NextColumn()); + + int64_t batchSize = rowGroupSize; + if(numLeft < rowGroupSize) + batchSize = numLeft; + int64_writer->WriteBatch(batchSize, nullptr, nullptr, &chpl_ptr[i]); + numLeft -= batchSize; + i += batchSize; + } + } else if(dtype == ARROWBOOLEAN) { + auto chpl_ptr = (bool*)chpl_arr; + while(numLeft > 0) { + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::BoolWriter* writer = + static_cast<parquet::BoolWriter*>(rg_writer->NextColumn()); + + int64_t batchSize = rowGroupSize; + if(numLeft < rowGroupSize) + batchSize = numLeft; + writer->WriteBatch(batchSize, nullptr, nullptr, &chpl_ptr[i]); + numLeft -= batchSize; + i += batchSize; + } + } else if(dtype == ARROWDOUBLE) { + auto chpl_ptr = (double*)chpl_arr; + while(numLeft > 0) { + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::DoubleWriter* writer = + static_cast<parquet::DoubleWriter*>(rg_writer->NextColumn()); + + int64_t batchSize = rowGroupSize; + if(numLeft < rowGroupSize) + batchSize = numLeft; + writer->WriteBatch(batchSize, nullptr, nullptr, &chpl_ptr[i]); + numLeft -= batchSize; + i += batchSize; + } + } else { + return ARROWERROR; +
} + + file_writer->Close(); + ARROWSTATUS_OK(out_file->Close()); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int cpp_writeStrColumnToParquet(const char* filename, void* chpl_arr, void* chpl_offsets, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + try { + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename)); + + parquet::schema::NodeVector fields; + + fields.push_back(parquet::schema::PrimitiveNode::Make(dsetname, parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE)); + std::shared_ptr schema = std::static_pointer_cast + (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + + parquet::WriterProperties::Builder builder; + // assign the proper compression + if(compression == SNAPPY_COMP) { + builder.compression(parquet::Compression::SNAPPY); + } else if (compression == GZIP_COMP) { + builder.compression(parquet::Compression::GZIP); + } else if (compression == BROTLI_COMP) { + builder.compression(parquet::Compression::BROTLI); + } else if (compression == ZSTD_COMP) { + builder.compression(parquet::Compression::ZSTD); + } else if (compression == LZ4_COMP) { + builder.compression(parquet::Compression::LZ4); + } + std::shared_ptr props = builder.build(); + + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + int64_t i = 0; + int64_t numLeft = numelems; + + if(dtype == ARROWSTRING) { + auto chpl_ptr = (uint8_t*)chpl_arr; + auto offsets = (int64_t*)chpl_offsets; + int64_t byteIdx = 0; + int64_t offIdx = 0; + + while(numLeft > 0) { + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + int64_t count = 0; + while(numLeft > 0 && count < rowGroupSize) { + parquet::ByteArray value; + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&chpl_ptr[byteIdx]); + // subtract 1 since we have the null terminator + value.len = offsets[offIdx+1] - offsets[offIdx] - 1; + if (value.len == 0) + definition_level = 0; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + numLeft--;count++; + offIdx++; + byteIdx+=offsets[offIdx] - offsets[offIdx-1]; + } + } + } else { + return ARROWERROR; + } + + file_writer->Close(); + ARROWSTATUS_OK(out_file->Close()); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int cpp_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, void* chpl_arr, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + try { + if(dtype == ARROWSTRING) { // check the type here so if it is wrong we don't create a bad file + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename)); + + parquet::schema::NodeVector fields; + + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BYTE_ARRAY, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + 
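// The nodes above form Parquet's standard three-level list layout: an optional group annotated LIST, wrapping a repeated "list" group, wrapping an optional "item" element; the optional levels are what allow null and empty segments to be encoded. +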
std::shared_ptr schema = std::static_pointer_cast + (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + + parquet::WriterProperties::Builder builder; + // assign the proper compression + if(compression == SNAPPY_COMP) { + builder.compression(parquet::Compression::SNAPPY); + } else if (compression == GZIP_COMP) { + builder.compression(parquet::Compression::GZIP); + } else if (compression == BROTLI_COMP) { + builder.compression(parquet::Compression::BROTLI); + } else if (compression == ZSTD_COMP) { + builder.compression(parquet::Compression::ZSTD); + } else if (compression == LZ4_COMP) { + builder.compression(parquet::Compression::LZ4); + } + std::shared_ptr props = builder.build(); + + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + int64_t i = 0; + int64_t numLeft = numelems; + auto segments = (int64_t*)chpl_segs; + int64_t segIdx = 0; // index into segarray segments + int64_t offIdx = 0; // index into the segstring segments + int64_t valIdx = 0; // index into chpl_arr + + while(numLeft > 0) { // write all local values to the file + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + int64_t count = 0; + while (numLeft > 0 && count < rowGroupSize) { // ensures rowGroupSize maintained + int64_t segmentLength = segments[segIdx+1] - segments[segIdx]; + if (segmentLength > 0) { + auto offsets = (int64_t*)chpl_offsets; + auto chpl_ptr = (uint8_t*)chpl_arr; + for (int64_t x = 0; x < segmentLength; x++){ + int16_t rep_lvl = (x == 0) ? 0 : 1; + int16_t def_lvl = 3; + parquet::ByteArray value; + value.ptr = reinterpret_cast(&chpl_ptr[valIdx]); + value.len = offsets[offIdx+1] - offsets[offIdx] - 1; + ba_writer->WriteBatch(1, &def_lvl, &rep_lvl, &value); + offIdx++; + valIdx+=offsets[offIdx] - offsets[offIdx-1]; + } + } else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + segmentLength = 1; // even though segment is length=0, write null to hold the empty segment + int16_t def_lvl = 1; + int16_t rep_lvl = 0; + ba_writer->WriteBatch(segmentLength, &def_lvl, &rep_lvl, nullptr); + } + segIdx++; + numLeft--;count++; + } + } + + file_writer->Close(); + ARROWSTATUS_OK(out_file->Close()); + return 0; + } else { + return ARROWERROR; + } + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int cpp_writeListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_arr, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + try { + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + PARQUET_ASSIGN_OR_THROW(out_file, FileClass::Open(filename)); + + parquet::schema::NodeVector fields; + + // create the list schema. 
List containing the dtype + if (dtype == ARROWINT64) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + else if (dtype == ARROWUINT64) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::INT64, parquet::ConvertedType::UINT_64); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + else if (dtype == ARROWBOOLEAN) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::BOOLEAN, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + else if (dtype == ARROWDOUBLE) { + auto element = parquet::schema::PrimitiveNode::Make("item", parquet::Repetition::OPTIONAL, parquet::Type::DOUBLE, parquet::ConvertedType::NONE); + auto list = parquet::schema::GroupNode::Make("list", parquet::Repetition::REPEATED, {element}); + fields.push_back(parquet::schema::GroupNode::Make(dsetname, parquet::Repetition::OPTIONAL, {list}, parquet::ConvertedType::LIST)); + } + std::shared_ptr schema = std::static_pointer_cast + (parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED, fields)); + + parquet::WriterProperties::Builder builder; + // assign the proper compression + if(compression == SNAPPY_COMP) { + builder.compression(parquet::Compression::SNAPPY); + } else if (compression == GZIP_COMP) { + builder.compression(parquet::Compression::GZIP); + } else if (compression == BROTLI_COMP) { + builder.compression(parquet::Compression::BROTLI); + } else if (compression == ZSTD_COMP) { + builder.compression(parquet::Compression::ZSTD); + } else if (compression == LZ4_COMP) { + builder.compression(parquet::Compression::LZ4); + } + std::shared_ptr props = builder.build(); + + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + int64_t i = 0; + int64_t numLeft = numelems; + auto segments = (int64_t*)chpl_segs; + int64_t valIdx = 0; // index into chpl_arr + int64_t segIdx = 0; // index into offsets + + if(dtype == ARROWINT64 || dtype == ARROWUINT64) { + while(numLeft > 0) { // write all local values to the file + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::Int64Writer* writer = + static_cast(rg_writer->NextColumn()); + int64_t count = 0; + while (numLeft > 0 && count < rowGroupSize) { // ensures rowGroupSize maintained + int64_t batchSize = segments[segIdx+1] - segments[segIdx]; + if (batchSize > 0) { + auto chpl_ptr = (int64_t*)chpl_arr; + int16_t* def_lvl = new int16_t[batchSize] { 3 }; // all values defined at the item level (3) + int16_t* rep_lvl = new int16_t[batchSize] { 0 }; + for (int64_t x = 0; x < batchSize; x++){ + // if the value is first in the segment rep_lvl = 0, otherwise 1 + rep_lvl[x] = (x == 0) ? 
0 : 1; + def_lvl[x] = 3; + } + writer->WriteBatch(batchSize, def_lvl, rep_lvl, &chpl_ptr[valIdx]); + valIdx += batchSize; + delete[] def_lvl; + delete[] rep_lvl; + } + else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + batchSize = 1; // even though segment is length=0, write null to hold the empty segment + int16_t def_lvl = 1; + int16_t rep_lvl = 0; + writer->WriteBatch(batchSize, &def_lvl, &rep_lvl, nullptr); + } + count++; + segIdx++; + numLeft--; + } + } + } + else if (dtype == ARROWBOOLEAN) { + while(numLeft > 0) { + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::BoolWriter* writer = + static_cast(rg_writer->NextColumn()); + int64_t count = 0; + while (numLeft > 0 && count < rowGroupSize) { + int64_t batchSize = segments[segIdx+1] - segments[segIdx]; + if (batchSize > 0) { + auto chpl_ptr = (bool*)chpl_arr; + // if the value is first in the segment rep_lvl = 0, otherwise 1 + // all values defined at the item level (3) + int16_t* def_lvl = new int16_t[batchSize] { 3 }; + int16_t* rep_lvl = new int16_t[batchSize] { 0 }; + for (int64_t x = 0; x < batchSize; x++){ + rep_lvl[x] = (x == 0) ? 0 : 1; + def_lvl[x] = 3; + } + writer->WriteBatch(batchSize, def_lvl, rep_lvl, &chpl_ptr[valIdx]); + valIdx += batchSize; + delete[] def_lvl; + delete[] rep_lvl; + } + else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + batchSize = 1; // even though segment is length=0, write null to hold the empty segment + int16_t def_lvl = 1; + int16_t rep_lvl = 0; + writer->WriteBatch(batchSize, &def_lvl, &rep_lvl, nullptr); + } + count++; + segIdx++; + numLeft--; + } + } + } + else if (dtype == ARROWDOUBLE) { + while(numLeft > 0) { + parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + parquet::DoubleWriter* writer = + static_cast(rg_writer->NextColumn()); + int64_t count = 0; + while (numLeft > 0 && count < rowGroupSize) { + int64_t batchSize = segments[segIdx+1] - segments[segIdx]; + if (batchSize > 0) { + auto chpl_ptr = (double*)chpl_arr; + // if the value is first in the segment rep_lvl = 0, otherwise 1 + // all values defined at the item level (3) + int16_t* def_lvl = new int16_t[batchSize] { 3 }; + int16_t* rep_lvl = new int16_t[batchSize] { 0 }; + for (int64_t x = 0; x < batchSize; x++){ + rep_lvl[x] = (x == 0) ? 
0 : 1; + def_lvl[x] = 3; + } + writer->WriteBatch(batchSize, def_lvl, rep_lvl, &chpl_ptr[valIdx]); + valIdx += batchSize; + delete[] def_lvl; + delete[] rep_lvl; + } + else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + batchSize = 1; // even though segment is length=0, write null to hold the empty segment + int16_t def_lvl = 1; + int16_t rep_lvl = 0; + writer->WriteBatch(batchSize, &def_lvl, &rep_lvl, nullptr); + } + count++; + segIdx++; + numLeft--; + } + } + } + else { + return ARROWERROR; + } + + file_writer->Close(); + ARROWSTATUS_OK(out_file->Close()); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +int cpp_writeMultiColToParquet(const char* filename, void* column_names, + void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes, + void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize, + int64_t compression, char** errMsg) { + try { + // initialize the file to write to + using FileClass = ::arrow::io::FileOutputStream; + std::shared_ptr out_file; + ARROWRESULT_OK(FileClass::Open(filename), out_file); + + // Setup the parquet schema + std::shared_ptr schema = SetupSchema(column_names, objTypes, datatypes, colnum); + + parquet::WriterProperties::Builder builder; + // assign the proper compression + if(compression == SNAPPY_COMP) { + builder.compression(parquet::Compression::SNAPPY); + } else if (compression == GZIP_COMP) { + builder.compression(parquet::Compression::GZIP); + } else if (compression == BROTLI_COMP) { + builder.compression(parquet::Compression::BROTLI); + } else if (compression == ZSTD_COMP) { + builder.compression(parquet::Compression::ZSTD); + } else if (compression == LZ4_COMP) { + builder.compression(parquet::Compression::LZ4); + } + std::shared_ptr props = builder.build(); + + std::shared_ptr file_writer = + parquet::ParquetFileWriter::Open(out_file, schema, props); + + std::queue idxQueue_str; // queue used to track string byteIdx + std::queue idxQueue_segarray; // queue used to track index into the offsets + + auto dtypes_ptr = (int64_t*) datatypes; + auto objType_ptr = (int64_t*) objTypes; + auto saSizes_ptr = (int64_t*) segArr_sizes; + int64_t numLeft = numelems; // number of elements remaining to write (rows) + int64_t x = 0; // index to start writing batch from + while (numLeft > 0) { + // Append a RowGroup with a specific number of rows. 
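+ // Each iteration of this loop emits one row group holding at most rowGroupSize rows per column; the idxQueue_str and idxQueue_segarray queues carry each string column's byte position and each segarray column's offset position forward into the next row group.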
+ parquet::RowGroupWriter* rg_writer = file_writer->AppendRowGroup(); + int64_t batchSize = rowGroupSize; + if(numLeft < rowGroupSize) + batchSize = numLeft; + + // loop the columns and write the row groups + for(int64_t i = 0; i < colnum; i++){ + int64_t dtype = dtypes_ptr[i]; + if (dtype == ARROWINT64 || dtype == ARROWUINT64) { + auto data_ptr = (int64_t*)ptr_arr[i]; + parquet::Int64Writer* writer = + static_cast(rg_writer->NextColumn()); + + if (objType_ptr[i] == SEGARRAY) { + auto offset_ptr = (int64_t*)offset_arr[i]; + int64_t offIdx = 0; // index into offsets + + if (x > 0){ + offIdx = idxQueue_segarray.front(); + idxQueue_segarray.pop(); + } + + int64_t count = 0; + while (count < batchSize) { // ensures rowGroupSize maintained + int64_t segSize; + if (offIdx == (numelems - 1)) { + segSize = saSizes_ptr[i] - offset_ptr[offIdx]; + } + else { + segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx]; + } + if (segSize > 0) { + int16_t* def_lvl = new int16_t[segSize] { 3 }; + int16_t* rep_lvl = new int16_t[segSize] { 0 }; + for (int64_t s = 0; s < segSize; s++){ + // if the value is first in the segment rep_lvl = 0, otherwise 1 + // all values defined at the item level (3) + rep_lvl[s] = (s == 0) ? 0 : 1; + def_lvl[s] = 3; + } + int64_t valIdx = offset_ptr[offIdx]; + writer->WriteBatch(segSize, def_lvl, rep_lvl, &data_ptr[valIdx]); + delete[] def_lvl; + delete[] rep_lvl; + } + else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + segSize = 1; // even though segment is length=0, write null to hold the empty segment + int16_t def_lvl = 1; + int16_t rep_lvl = 0; + writer->WriteBatch(segSize, &def_lvl, &rep_lvl, nullptr); + } + offIdx++; + count++; + } + if (numLeft - count > 0) { + idxQueue_segarray.push(offIdx); + } + } else { + writer->WriteBatch(batchSize, nullptr, nullptr, &data_ptr[x]); + } + } else if(dtype == ARROWBOOLEAN) { + auto data_ptr = (bool*)ptr_arr[i]; + parquet::BoolWriter* writer = + static_cast(rg_writer->NextColumn()); + if (objType_ptr[i] == SEGARRAY) { + auto offset_ptr = (int64_t*)offset_arr[i]; + int64_t offIdx = 0; // index into offsets + + if (x > 0){ + offIdx = idxQueue_segarray.front(); + idxQueue_segarray.pop(); + } + + int64_t count = 0; + while (count < batchSize) { // ensures rowGroupSize maintained + int64_t segSize; + if (offIdx == numelems - 1) { + segSize = saSizes_ptr[i] - offset_ptr[offIdx]; + } + else { + segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx]; + } + if (segSize > 0) { + int16_t* def_lvl = new int16_t[segSize] { 3 }; + int16_t* rep_lvl = new int16_t[segSize] { 0 }; + for (int64_t s = 0; s < segSize; s++){ + // if the value is first in the segment rep_lvl = 0, otherwise 1 + // all values defined at the item level (3) + rep_lvl[s] = (s == 0) ? 
0 : 1; + def_lvl[s] = 3; + } + int64_t valIdx = offset_ptr[offIdx]; + writer->WriteBatch(segSize, def_lvl, rep_lvl, &data_ptr[valIdx]); + delete[] def_lvl; + delete[] rep_lvl; + } + else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + segSize = 1; // even though segment is length=0, write null to hold the empty segment + int16_t def_lvl = 1; + int16_t rep_lvl = 0; + writer->WriteBatch(segSize, &def_lvl, &rep_lvl, nullptr); + } + offIdx++; + count++; + } + if (numLeft - count > 0) { + idxQueue_segarray.push(offIdx); + } + } else { + writer->WriteBatch(batchSize, nullptr, nullptr, &data_ptr[x]); + } + } else if(dtype == ARROWDOUBLE) { + auto data_ptr = (double*)ptr_arr[i]; + parquet::DoubleWriter* writer = + static_cast(rg_writer->NextColumn()); + if (objType_ptr[i] == SEGARRAY) { + auto offset_ptr = (int64_t*)offset_arr[i]; + int64_t offIdx = 0; // index into offsets + + if (x > 0){ + offIdx = idxQueue_segarray.front(); + idxQueue_segarray.pop(); + } + + int64_t count = 0; + while (count < batchSize) { // ensures rowGroupSize maintained + int64_t segSize; + if (offIdx == numelems - 1) { + segSize = saSizes_ptr[i] - offset_ptr[offIdx]; + } + else { + segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx]; + } + if (segSize > 0) { + int16_t* def_lvl = new int16_t[segSize] { 3 }; + int16_t* rep_lvl = new int16_t[segSize] { 0 }; + for (int64_t s = 0; s < segSize; s++){ + // if the value is first in the segment rep_lvl = 0, otherwise 1 + // all values defined at the item level (3) + rep_lvl[s] = (s == 0) ? 0 : 1; + def_lvl[s] = 3; + } + int64_t valIdx = offset_ptr[offIdx]; + writer->WriteBatch(segSize, def_lvl, rep_lvl, &data_ptr[valIdx]); + delete[] def_lvl; + delete[] rep_lvl; + } + else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + segSize = 1; // even though segment is length=0, write null to hold the empty segment + int16_t def_lvl = 1; + int16_t rep_lvl =0; + writer->WriteBatch(segSize, &def_lvl, &rep_lvl, nullptr); + } + offIdx++; + count++; + } + if (numLeft - count > 0) { + idxQueue_segarray.push(offIdx); + } + } else { + writer->WriteBatch(batchSize, nullptr, nullptr, &data_ptr[x]); + } + } else if(dtype == ARROWSTRING) { + auto data_ptr = (uint8_t*)ptr_arr[i]; + parquet::ByteArrayWriter* ba_writer = + static_cast(rg_writer->NextColumn()); + if (objType_ptr[i] == SEGARRAY) { + auto offset_ptr = (int64_t*)offset_arr[i]; + int64_t byteIdx = 0; + int64_t offIdx = 0; // index into offsets + + // identify the starting byte index + if (x > 0){ + byteIdx = idxQueue_str.front(); + idxQueue_str.pop(); + + offIdx = idxQueue_segarray.front(); + idxQueue_segarray.pop(); + } + + int64_t count = 0; + while (count < batchSize) { // ensures rowGroupSize maintained + int64_t segSize; + if (offIdx == numelems - 1) { + segSize = saSizes_ptr[i] - offset_ptr[offIdx]; + } + else { + segSize = offset_ptr[offIdx+1] - offset_ptr[offIdx]; + } + if (segSize > 0) { + for (int64_t s=0; s(&data_ptr[byteIdx]); + int64_t nextIdx = byteIdx; + while (data_ptr[nextIdx] != 0x00){ + nextIdx++; + } + value.len = nextIdx - byteIdx; + ba_writer->WriteBatch(1, &def_lvl, &rep_lvl, &value); + byteIdx = nextIdx + 1; // increment to start of next word + } + } + else { + // empty segment denoted by null value that is not repeated (first of segment) defined at the list level (1) + segSize = 1; // even though segment is length=0, write null to hold the empty segment + int16_t* def_lvl = new 
int16_t[segSize] { 1 }; + int16_t* rep_lvl = new int16_t[segSize] { 0 }; + ba_writer->WriteBatch(segSize, def_lvl, rep_lvl, nullptr); + } + offIdx++; + count++; + } + if (numLeft - count > 0) { + idxQueue_str.push(byteIdx); + idxQueue_segarray.push(offIdx); + } + } + else { + int64_t count = 0; + int64_t byteIdx = 0; + + // identify the starting byte index + if (x > 0){ + byteIdx = idxQueue_str.front(); + idxQueue_str.pop(); + } + + while(count < batchSize) { + parquet::ByteArray value; + int16_t definition_level = 1; + value.ptr = reinterpret_cast(&data_ptr[byteIdx]); + int64_t nextIdx = byteIdx; + while (data_ptr[nextIdx] != 0x00){ + nextIdx++; + } + // subtract 1 since we have the null terminator + value.len = nextIdx - byteIdx; + ba_writer->WriteBatch(1, &definition_level, nullptr, &value); + count++; + byteIdx = nextIdx + 1; + } + if (numLeft - count > 0) { + idxQueue_str.push(byteIdx); + } + } + } else { + return ARROWERROR; + } + } + numLeft -= batchSize; + x += batchSize; + } + + file_writer->Close(); + ARROWSTATUS_OK(out_file->Close()); + + return 0; + } catch (const std::exception& e) { + *errMsg = strdup(e.what()); + return ARROWERROR; + } +} + +extern "C" { + int c_writeColumnToParquet(const char* filename, void* chpl_arr, + int64_t colnum, const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + return cpp_writeColumnToParquet(filename, chpl_arr, colnum, dsetname, + numelems, rowGroupSize, dtype, compression, + errMsg); + } + int c_writeStrColumnToParquet(const char* filename, void* chpl_arr, void* chpl_offsets, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + return cpp_writeStrColumnToParquet(filename, chpl_arr, chpl_offsets, + dsetname, numelems, rowGroupSize, dtype, compression, errMsg); + } + + int c_writeListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_arr, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + return cpp_writeListColumnToParquet(filename, chpl_segs, chpl_arr, + dsetname, numelems, rowGroupSize, dtype, compression, errMsg); + } + + int c_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, void* chpl_arr, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg) { + return cpp_writeStrListColumnToParquet(filename, chpl_segs, chpl_offsets, chpl_arr, + dsetname, numelems, rowGroupSize, dtype, compression, errMsg); + } + + int c_writeMultiColToParquet(const char* filename, void* column_names, + void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes, + void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize, + int64_t compression, char** errMsg){ + return cpp_writeMultiColToParquet(filename, column_names, ptr_arr, offset_arr, objTypes, datatypes, segArr_sizes, colnum, numelems, rowGroupSize, compression, errMsg); + } +} diff --git a/src/parquet/WriteParquet.h b/src/parquet/WriteParquet.h new file mode 100644 index 0000000000..7f42d462c3 --- /dev/null +++ b/src/parquet/WriteParquet.h @@ -0,0 +1,57 @@ +#include +#include + +// Wrap functions in C extern if compiling C++ object file +#ifdef __cplusplus +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +extern "C" { +#endif + + int cpp_writeColumnToParquet(const char* filename, void* chpl_arr, + int64_t colnum, const 
char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg); + int c_writeColumnToParquet(const char* filename, void* chpl_arr, + int64_t colnum, const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, char** errMsg); + + int c_writeStrColumnToParquet(const char* filename, void* chpl_arr, void* chpl_offsets, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg); + int cpp_writeStrColumnToParquet(const char* filename, void* chpl_arr, void* chpl_offsets, + const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg); + + int c_writeMultiColToParquet(const char* filename, void* column_names, + void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes, + void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize, + int64_t compression, char** errMsg); + + int cpp_writeMultiColToParquet(const char* filename, void* column_names, + void** ptr_arr, void** offset_arr, void* objTypes, void* datatypes, + void* segArr_sizes, int64_t colnum, int64_t numelems, int64_t rowGroupSize, + int64_t compression, char** errMsg); + + int c_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, + void* chpl_arr, const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg); + int cpp_writeStrListColumnToParquet(const char* filename, void* chpl_segs, void* chpl_offsets, + void* chpl_arr, const char* dsetname, int64_t numelems, + int64_t rowGroupSize, int64_t dtype, int64_t compression, + char** errMsg); + +#ifdef __cplusplus +} +#endif
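
A note on the definition and repetition levels used by the list writers above: a value that begins a new segment is written with repetition level 0 and every subsequent value in the same segment with repetition level 1; values that are present carry definition level 3 (defined at the item level), while an empty segment is recorded as a single null at definition level 1 (defined only at the list level). A minimal standalone sketch of that encoding, assuming `writer` is the `parquet::Int64Writer*` obtained from `rg_writer->NextColumn()` and using made-up segment data (an illustration, not part of the diff):

    // segments = {0, 3, 3, 5} encodes the lists {1, 2, 3}, {}, {4, 5}
    int64_t segments[] = {0, 3, 3, 5};
    int64_t vals[] = {1, 2, 3, 4, 5};
    for (int64_t seg = 0; seg < 3; seg++) {
      int64_t len = segments[seg + 1] - segments[seg];
      if (len == 0) {
        // a lone null, defined at the list level, marks the empty segment
        int16_t def_lvl = 1;
        int16_t rep_lvl = 0;
        writer->WriteBatch(1, &def_lvl, &rep_lvl, nullptr);
      } else {
        for (int64_t x = 0; x < len; x++) {
          int16_t def_lvl = 3;                 // value present at the item level
          int16_t rep_lvl = (x == 0) ? 0 : 1;  // 0 starts a new list entry
          writer->WriteBatch(1, &def_lvl, &rep_lvl, &vals[segments[seg] + x]);
        }
      }
    }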