Skip to content

Commit

Permalink
[xlsb] parse some table formulas (#1052)
Browse files Browse the repository at this point in the history
  • Loading branch information
JanMarvin authored Jun 16, 2024
1 parent ea27595 commit d06c655
Show file tree
Hide file tree
Showing 5 changed files with 206 additions and 22 deletions.
12 changes: 8 additions & 4 deletions R/class-workbook.R
Original file line number Diff line number Diff line change
Expand Up @@ -7251,18 +7251,22 @@ wbWorkbook <- R6::R6Class(
#' @description get tables
#' @return The sheet tables. `character()` if empty
get_tables = function(sheet = current_sheet()) {
if (length(sheet) != 1) {
if (!is.null(sheet) && length(sheet) != 1) {
stop("sheet argument must be length 1")
}

if (is.null(self$tables)) {
return(character())
}

sheet <- private$get_sheet_index(sheet)
if (is.na(sheet)) stop("No such sheet in workbook")
if (!is.null(sheet)) {
sheet <- private$get_sheet_index(sheet)
if (is.na(sheet)) stop("No such sheet in workbook")

sel <- self$tables$tab_sheet == sheet & self$tables$tab_act == 1
sel <- self$tables$tab_sheet == sheet & self$tables$tab_act == 1
} else {
sel <- self$tables$tab_act == 1
}
self$tables[sel, c("tab_name", "tab_ref")]
},

Expand Down
69 changes: 67 additions & 2 deletions R/wb_load.R
Original file line number Diff line number Diff line change
Expand Up @@ -1561,6 +1561,16 @@ wb_load <- function(
}
}

for (i in seq_along(wb$tables$tab_xml)) {
wb$tables$tab_xml[i] <-
stringi::stri_replace_all_fixed(
wb$tables$tab_xml[i],
xti$name_id,
xti$sheets,
vectorize_all = FALSE
)
}

### for external references we need to get the required sheet names first
# For now this is all a little guess work

Expand Down Expand Up @@ -1623,10 +1633,13 @@ wb_load <- function(

# replace named region in formulas
nri <- wb$get_named_regions()
nri$name_id <- paste0("openxlsx2defnam_", sprintf("%012d", as.integer(rownames(nri))))
nri$name_id <- paste0("openxlsx2defnam_", sprintf("%012d", as.integer(nri$id)))

if (debug)
print(nri)

for (j in seq_along(wb$worksheets)) {
if (any(sel <- wb$worksheets[[j]]$sheet_data$cc$f != "")) {
if (any(sel <- grepl(paste0(nri$name_id, collapse = "|"), wb$worksheets[[j]]$sheet_data$cc$f))) {
wb$worksheets[[j]]$sheet_data$cc$f[sel] <-
stringi::stri_replace_all_fixed(
wb$worksheets[[j]]$sheet_data$cc$f[sel],
Expand All @@ -1639,6 +1652,58 @@ wb_load <- function(

}

if (length(wb$tables)) {
# replace named region in formulas
tri <- wb$get_tables(sheet = NULL)
tri$id <- as.integer(rbindlist(xml_attr(wb$tables$tab_xml, "table"))$id) # - 1L
tri$name_id <- paste0("openxlsx2tab_", sprintf("%012d", tri$id))
tri$vars <- lapply(wb$tables$tab_xml, function(x) rbindlist(xml_attr(x, "table", "tableColumns", "tableColumn"))$name)

tri <- tri[order(tri$id), ]

if (debug)
print(tri)

for (j in seq_along(wb$worksheets)) {
if (any(sel <- grepl(paste0(tri$name_id, collapse = "|"), wb$worksheets[[j]]$sheet_data$cc$f))) {

for (i in seq_len(nrow(tri))) {

sel <- grepl(paste0(tri$name_id, collapse = "|"), wb$worksheets[[j]]$sheet_data$cc$f)

from_xlsb <- c(tri$name_id[i], paste0("[openxlsx2col_", tri$id[i], "_", seq_along(unlist(tri$vars[i])) - 1L, "]"))
to_xlsx <- c(tri$tab_name[i], paste0("[", unlist(tri$vars[i]), "]"))

# always on all?
wb$tables$tab_xml <-
stringi::stri_replace_all_fixed(
wb$tables$tab_xml,
from_xlsb,
to_xlsx,
vectorize_all = FALSE
)

wb$worksheets[[j]]$sheet_data$cc$f[sel] <-
stringi::stri_replace_all_fixed(
wb$worksheets[[j]]$sheet_data$cc$f[sel],
from_xlsb,
to_xlsx,
vectorize_all = FALSE
)

wb$workbook$definedNames <-
stringi::stri_replace_all_fixed(
wb$workbook$definedNames,
from_xlsb,
to_xlsx,
vectorize_all = FALSE
)
}
}
}

}

# this might be terribly slow!
for (j in seq_along(wb$worksheets)) {
if (any(sel <- wb$worksheets[[j]]$sheet_data$cc$f != "")) {
Expand Down
19 changes: 19 additions & 0 deletions src/xlsb_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,25 @@ typedef struct {
uint8_t reserved2 : 2;
} PtgListFields;

enum PtgRowType
{
data = 0x00,
all = 0x01, // #All
headers = 0x02, // #Headers
data2 = 0x04, // #Data
dataheaders = 0x06,
totals = 0x08, // #Totals
datatotals = 0x0C,
current = 0x10 // #This Row
};

enum PtgDataType
{
reference = 0x1,
value = 0x2,
array = 0x3
};

enum RgbExtra
{
PtgExtraArray = 0,
Expand Down
114 changes: 101 additions & 13 deletions src/xlsb_funs.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,14 @@ std::string escape_xml(const std::string& input) {
return result;
}

std::string wrap_xml(const std::string& str) {
if (!str.empty() && isspace(str[0])) {
return "<t xml:space=\"preserve\">" + str + "</t>";
} else {
return "<t>" + str + "</t>";
}
}

std::string to_utf8(const std::u16string& u16str) {

std::string utf8str;
Expand Down Expand Up @@ -485,7 +493,7 @@ std::string to_rich_text(const std::string& str, const std::vector<std::pair<int

std::string part = utf8_substr(str, start, len);

result += "<r><FONT_" + std::to_string(str_runs[str_run].second) + "/><t xml:space=\"preserve\">" + escape_xml(part) + "</t></r>";
result += "<r><FONT_" + std::to_string(str_runs[str_run].second) + "/>" + wrap_xml(escape_xml(part)) + "</r>";
}

start = str_runs[str_run].first;
Expand All @@ -496,7 +504,7 @@ std::string to_rich_text(const std::string& str, const std::vector<std::pair<int

std::string part = utf8_substr(str, start, len);

result += "<r><FONT_" + std::to_string(str_runs[str_run].second) + "/><t xml:space=\"preserve\">" + escape_xml(part) + "</t></r>";
result += "<r><FONT_" + std::to_string(str_runs[str_run].second) + "/>" + wrap_xml(escape_xml(part)) + "</r>";
}

return result;
Expand Down Expand Up @@ -542,7 +550,7 @@ std::string RichStr(std::istream& sas, bool swapit) {

str = to_rich_text(str, str_run);
} else {
str = "<t>" + escape_xml(str) + "</t>";
str = wrap_xml(escape_xml(str));
}

if (B) {
Expand Down Expand Up @@ -1164,33 +1172,113 @@ std::string CellParsedFormula(std::istream& sas, bool swapit, bool debug, int co
uint32_t listIndex = 0;
int16_t colFirst = 0, colLast = 0;

// this is a reference to a table column something like "tab[col]"
// ixti = location of table
ixti = readbin(ixti, sas, swapit);
flags = readbin(flags, sas, swapit);

// B:
// 0x00 columns consist of all columns in table
// 0x01 one column wide, only colFirst required
// 0x02 columns from colFirst to colLast
// rowType = PtgRowType()
// squareBracketSpace: spacing?
// commaSpace: comma space?
// unused
// type: PtgDataType()
// invalid: bool
// nonresident: bool
flags = readbin(flags, sas, swapit);

// table identifier: unused if invalid=1 || nonresident=1
listIndex = readbin(listIndex, sas, swapit);
colFirst = ColShort(sas, swapit);
colLast = ColShort(sas, swapit);

// cols: unused if invalid = 1 || nonresident = 1 || columns = 0
colFirst = ColShort(sas, swapit);
colLast = ColShort(sas, swapit);


PtgListFields *fields = (PtgListFields *)&flags;

// if (debug)
// Rprintf("%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
// (uint32_t)fields->columns,
// (uint32_t)fields->commaSpace,
// (uint32_t)fields->invalid,
// (uint32_t)fields->nonresident,
// (uint32_t)fields->reserved2,
// (uint32_t)fields->rowType,
// (uint32_t)fields->squareBracketSpace,
// (uint32_t)fields->type,
// (uint32_t)fields->unused
// );

if (fields->nonresident) // different workbook and invalid == 0
ptgextra.push_back(typ);

std::stringstream paddedStr;
paddedStr << std::setw(12) << std::setfill('0') << ixti;
paddedStr << std::setw(12) << std::setfill('0') << listIndex; // << ixti;

// A1 notation cell
// fml_out += "openxlsx2xlsb_" + paddedStr.str();
// maybe [ ]
fml_out += "#REF!";
// something like this: Table1[[#This Row],[a]]
fml_out += "openxlsx2tab_" + paddedStr.str();

bool no_row_type = fields->invalid == 1 || fields->nonresident == 1;

fml_out += "[";

bool need_bracket = fields->columns > 0 ||
(fields->columns == 0 &&
(fields->rowType == dataheaders ||
fields->rowType == datatotals)
);


// if rowType == 0 no #Data etc is added
if (!no_row_type && fields->rowType) {
if (need_bracket) fml_out += "[";
if (fields->rowType == data) fml_out += "";
if (fields->rowType == all) fml_out += "#All";
if (fields->rowType == headers) fml_out += "#Headers";
if (fields->rowType == data2) fml_out += "#Data";
if (fields->rowType == dataheaders) fml_out += "#Headers],[#Data";
if (fields->rowType == totals) fml_out += "#Totals";
if (fields->rowType == datatotals) fml_out += "#Data],[#Totals";
if (fields->rowType == current) fml_out += "#This Row";
if (need_bracket) fml_out += "]";
if (fields->columns > 0) fml_out += ",";
}

// not sure what is supposed to happen in this case?
// have to replace colFirst with a variable name
if (!(fields->invalid == 1 || fields->nonresident == 1 || fields->columns == 0)) {
// Rcpp::Rcout << "colFirst" << std::endl;
if (fields->columns > 1 || fields->rowType > data) fml_out += "[";
fml_out += "openxlsx2col_";
fml_out += std::to_string(listIndex);
fml_out += "_";
fml_out += std::to_string(colFirst);
if (fields->columns > 1 || fields->rowType > data) fml_out += "]";
}

// have to replace colLast with a variable name
if ((colFirst < colLast) && !(fields->invalid == 1 || fields->nonresident == 1 || fields->columns == 0)) {
// Rcpp::Rcout << "colLast" << std::endl;
fml_out += ":[openxlsx2col_";
fml_out += std::to_string(listIndex);
fml_out += "_";
fml_out += std::to_string(colLast);
if (fields->columns > 1 || fields->rowType > data) fml_out += "]";
}

fml_out += "]";
fml_out += "\n";

// Do something with this, just ... what?
if (debug) Rprintf("PtgList: %d, %d, %d, %d\n",
if (debug)
Rprintf("PtgList: %d, %d, %d, %d\n",
ixti, listIndex, colFirst, colLast);

Rcpp::warning("formulas with table references are not implemented.");
// if (debug)
// Rcpp::warning("formulas with table references are not implemented.");

break;
}
Expand Down
14 changes: 11 additions & 3 deletions tests/testthat/test-read_xlsb.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ test_that("reading complex xlsb works", {

# comments
exp <- c(
"<t xml:space=\"preserve\">Jan Marvin Garbuszus:</t>",
"<t>Jan Marvin Garbuszus:</t>",
"<t xml:space=\"preserve\">\n</t>",
"<t xml:space=\"preserve\">A new note!</t>"
"<t>A new note!</t>"
)
got <- wb$comments[[1]][[1]]$comment
expect_equal(exp, got)
Expand Down Expand Up @@ -132,8 +132,16 @@ test_that("xlsb formulas", {
exp <- c("", "D2:E2", "A1,B1", "A1 A2", "1+1", "1-1", "1*1", "1/1",
"1%", "1^1", "1=1", "1&gt;1", "1&gt;=1", "1&lt;1", "1&lt;=1",
"1&lt;&gt;1", "+A3", "-R2", "(1)", "SUM(1, )", "1", "2.500000",
"\"a\"", "\"A\"&amp;\"B\"")
"\"a\"", "\"A\"&amp;\"B\"", "Sheet2!B2", "'[1]Sheet3'!A2")
got <- unique(wb$worksheets[[1]]$sheet_data$cc$f)
expect_equal(exp, got)

fl <- testfile_path("formula_checks.xlsx")
xl <- wb_load(fl)

expect_equal(
xl$worksheets[[2]]$sheet_data$cc$f,
wb$worksheets[[2]]$sheet_data$cc$f
)

})

0 comments on commit d06c655

Please sign in to comment.