From 15daf92e52b10db473e0361fab9ebbbf59d08fc6 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 20 Jul 2024 19:24:42 +0200 Subject: [PATCH] [writing] provide openxlsx2.export_with_pugi = FALSE option --- R/RcppExports.R | 4 + R/class-workbook.R | 33 ++++-- src/RcppExports.cpp | 14 +++ src/write_file.cpp | 194 +++++++++++++++++++++++++++++++++++- tests/testthat/test-write.R | 23 +++++ 5 files changed, 260 insertions(+), 8 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 9b9d52fe0..e858d868c 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -348,6 +348,10 @@ set_sst <- function(sharedStrings) { .Call(`_openxlsx2_set_sst`, sharedStrings) } +write_worksheet_slim <- function(sheet_data, prior, post, fl) { + invisible(.Call(`_openxlsx2_write_worksheet_slim`, sheet_data, prior, post, fl)) +} + write_worksheet <- function(prior, post, sheet_data) { .Call(`_openxlsx2_write_worksheet`, prior, post, sheet_data) } diff --git a/R/class-workbook.R b/R/class-workbook.R index f16401af6..0cf5d0ae4 100644 --- a/R/class-workbook.R +++ b/R/class-workbook.R @@ -9318,14 +9318,33 @@ wbWorkbook <- R6::R6Class( ws$sheet_data$cc_out <- NULL } - # create entire sheet prior to writing it - sheet_xml <- write_worksheet( - prior = prior, - post = post, - sheet_data = ws$sheet_data - ) ws_file <- file.path(xlworksheetsDir, sprintf("sheet%s.xml", i)) - write_xmlPtr(doc = sheet_xml, fl = ws_file) + + use_pugixml_export <- getOption("openxlsx2.export_with_pugi", default = TRUE) + + if (use_pugixml_export) { + + # create entire sheet prior to writing it + sheet_xml <- write_worksheet( + prior = prior, + post = post, + sheet_data = ws$sheet_data + ) + write_xmlPtr(doc = sheet_xml, fl = ws_file) + + } else { + + if (grepl("", prior)) + prior <- substr(prior, 1, nchar(prior) - 13) # remove " " + + write_worksheet_slim( + sheet_data = ws$sheet_data, + prior = prior, + post = post, + fl = ws_file + ) + + } ## write worksheet rels if (length(self$worksheets_rels[[i]]) || hasHL) { diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 1d089a13e..dc7904fa1 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -880,6 +880,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// write_worksheet_slim +void write_worksheet_slim(Rcpp::Environment sheet_data, std::string prior, std::string post, std::string fl); +RcppExport SEXP _openxlsx2_write_worksheet_slim(SEXP sheet_dataSEXP, SEXP priorSEXP, SEXP postSEXP, SEXP flSEXP) { +BEGIN_RCPP + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::Environment >::type sheet_data(sheet_dataSEXP); + Rcpp::traits::input_parameter< std::string >::type prior(priorSEXP); + Rcpp::traits::input_parameter< std::string >::type post(postSEXP); + Rcpp::traits::input_parameter< std::string >::type fl(flSEXP); + write_worksheet_slim(sheet_data, prior, post, fl); + return R_NilValue; +END_RCPP +} // write_worksheet XPtrXML write_worksheet(std::string prior, std::string post, Rcpp::Environment sheet_data); RcppExport SEXP _openxlsx2_write_worksheet(SEXP priorSEXP, SEXP postSEXP, SEXP sheet_dataSEXP) { @@ -1068,6 +1081,7 @@ static const R_CallMethodDef CallEntries[] = { {"_openxlsx2_read_colors", (DL_FUNC) &_openxlsx2_read_colors, 1}, {"_openxlsx2_write_colors", (DL_FUNC) &_openxlsx2_write_colors, 1}, {"_openxlsx2_set_sst", (DL_FUNC) &_openxlsx2_set_sst, 1}, + {"_openxlsx2_write_worksheet_slim", (DL_FUNC) &_openxlsx2_write_worksheet_slim, 4}, {"_openxlsx2_write_worksheet", (DL_FUNC) &_openxlsx2_write_worksheet, 3}, {"_openxlsx2_write_xmlPtr", (DL_FUNC) &_openxlsx2_write_xmlPtr, 2}, {"_openxlsx2_styles_bin", (DL_FUNC) &_openxlsx2_styles_bin, 3}, diff --git a/src/write_file.cpp b/src/write_file.cpp index cc069cd5b..c6eb9a980 100644 --- a/src/write_file.cpp +++ b/src/write_file.cpp @@ -21,6 +21,199 @@ Rcpp::CharacterVector set_sst(Rcpp::CharacterVector sharedStrings) { return sst; } +// write xml by streaming to files. this takes whatever input we provide and +// dumps it into the file. no xml checking, no unicode checking +void xml_sheet_data_slim( + Rcpp::DataFrame row_attr, + Rcpp::DataFrame cc, + std::string prior, + std::string post, + std::string fl +) { + + std::ofstream file(fl); + + auto lastrow = 0; // integer value of the last row with column data + auto thisrow = 0; // integer value of the current row with column data + auto row_idx = 0; // the index of the row_attr file. this is != rowid + auto rowid = 0; // integer value of the r field in row_attr + + std::string xml_preserver = " "; + + file << "\n"; + file << prior; + + if (cc.nrow() && cc.ncol()) { + // we cannot access rows directly in the dataframe. + // Have to extract the columns and use these + Rcpp::CharacterVector cc_row_r = cc["row_r"]; // 1 + Rcpp::CharacterVector cc_r = cc["r"]; // A1 + Rcpp::CharacterVector cc_v = cc["v"]; + Rcpp::CharacterVector cc_c_t = cc["c_t"]; + Rcpp::CharacterVector cc_c_s = cc["c_s"]; + Rcpp::CharacterVector cc_c_cm = cc["c_cm"]; + Rcpp::CharacterVector cc_c_ph = cc["c_ph"]; + Rcpp::CharacterVector cc_c_vm = cc["c_vm"]; + Rcpp::CharacterVector cc_f = cc["f"]; + Rcpp::CharacterVector cc_f_t = cc["f_t"]; + Rcpp::CharacterVector cc_f_ref = cc["f_ref"]; + Rcpp::CharacterVector cc_f_ca = cc["f_ca"]; + Rcpp::CharacterVector cc_f_si = cc["f_si"]; + Rcpp::CharacterVector cc_is = cc["is"]; + + Rcpp::CharacterVector row_r = row_attr["r"]; + + + file << ""; + for (auto i = 0; i < cc.nrow(); ++i) { + + thisrow = std::stoi(Rcpp::as(cc_row_r[i])); + + if (lastrow < thisrow) { + + // there might be entirely empty rows in between. this is the case for + // loadExample. We check the rowid and write the line and skip until we + // have every row and only then continue writing the column + while (rowid < thisrow) { + + rowid = std::stoi(Rcpp::as( + row_r[row_idx] + )); + + if (row_idx) file << ""; + file << "(row_attr[j])[row_idx]; + + if (cv_s[0] != "") { + const std::string val_strl = Rcpp::as(cv_s); + file << " " << attrnams[j] << "=\"" << val_strl.c_str() << "\""; + } + } + file << ">"; // end + + // read the next row_idx when visiting again + ++row_idx; + } + } + + // create node + file << " + file << " r" << "=\"" << to_string(cc_r[i]).c_str() << "\""; + + if (!to_string(cc_c_s[i]).empty()) + file << " s" << "=\"" << to_string(cc_c_s[i]).c_str() << "\""; + + // assign type if not aka numeric + if (!to_string(cc_c_t[i]).empty()) + file << " t" << "=\"" << to_string(cc_c_t[i]).c_str() << "\""; + + // CellMetaIndex: suppress curly brackets in spreadsheet software + if (!to_string(cc_c_cm[i]).empty()) + file << " cm" << "=\"" << to_string(cc_c_cm[i]).c_str() << "\""; + + // phonetics spelling + if (!to_string(cc_c_ph[i]).empty()) + file << " ph" << "=\"" << to_string(cc_c_ph[i]).c_str() << "\""; + + // suppress curly brackets in spreadsheet software + if (!to_string(cc_c_vm[i]).empty()) + file << " vm" << "=\"" << to_string(cc_c_vm[i]).c_str() << "\""; + + file << ">"; // end + + bool f_si = false; + + // ... + // f node: formula to be evaluated + if (!to_string(cc_f[i]).empty() || !to_string(cc_f_t[i]).empty() || !to_string(cc_f_si[i]).empty()) { + file << ""; + + file << to_string(cc_f[i]).c_str(); + + file << ""; + } + + // v node: value stored from evaluated formula + if (!to_string(cc_v[i]).empty()) { + if (!f_si & (to_string(cc_v[i]).compare(xml_preserver.c_str()) == 0)) { + // this looks strange + file << ""; + file << " "; + file << ""; + } else { + file << "" << to_string(cc_v[i]).c_str() << ""; + } + } + + // ... + if (to_string(cc_c_t[i]).compare("inlineStr") == 0) { + if (!to_string(cc_is[i]).empty()) { + file << to_string(cc_is[i]).c_str(); + } + } + + file << ""; + + // update lastrow + lastrow = thisrow; + } + + file << ""; + file << ""; + } else { + file << ""; + } + + + file << post; + file << ""; + + file.close(); + +} + +// export worksheet without pugixml +// this should be way quicker, uses far less memory, but also skips all of the checks pugi does +// +// [[Rcpp::export]] +void write_worksheet_slim( + Rcpp::Environment sheet_data, + std::string prior, + std::string post, + std::string fl +){ + // sheet_data will be in order, just need to check for row_heights + // CharacterVector cell_col = int_to_col(sheet_data.field("cols")); + Rcpp::DataFrame row_attr = Rcpp::as(sheet_data["row_attr"]); + Rcpp::DataFrame cc = Rcpp::as(sheet_data["cc_out"]); + + xml_sheet_data_slim(row_attr, cc, prior, post, fl); +} + // creates an xml row // data in xml is ordered row wise. therefore we need the row attributes and @@ -178,7 +371,6 @@ pugi::xml_document xml_sheet_data(Rcpp::DataFrame row_attr, Rcpp::DataFrame cc) return doc; } - // TODO: convert to pugi // function that creates the xml worksheet // uses preparated data and writes it. It passes data to set_row() which will diff --git a/tests/testthat/test-write.R b/tests/testthat/test-write.R index 8028a29a1..c513c9f19 100644 --- a/tests/testthat/test-write.R +++ b/tests/testthat/test-write.R @@ -1239,3 +1239,26 @@ test_that("sheet is a valid argument in write_xlsx", { wb2 <- write_xlsx(x = mtcars, sheet = "data") expect_equal(wb1$get_sheet_names(), wb2$get_sheet_names()) }) + +test_that("writing without pugixml works", { + + temp <- temp_xlsx() + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) + + temp <- temp_xlsx() + options("openxlsx2.export_with_pugi" = FALSE) + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) + + temp <- temp_xlsx() + options("openxlsx2.export_with_pugi" = TRUE) + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) + + temp <- temp_xlsx() + options("openxlsx2.export_with_pugi" = NULL) + expect_silent(write_xlsx(x = mtcars, file = temp)) + expect_silent(wb <- wb_load(temp)) + +})