Skip to content

Commit

Permalink
Merge pull request #661 from SebKrantz/development
Browse files Browse the repository at this point in the history
Development
  • Loading branch information
SebKrantz authored Nov 2, 2024
2 parents 0d096f7 + 0059772 commit e7f1159
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 43 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: collapse
Title: Advanced and Fast Data Transformation
Version: 2.0.17
Date: 2024-10-27
Date: 2024-11-03
Authors@R: c(
person("Sebastian", "Krantz", role = c("aut", "cre"),
email = "sebastian.krantz@graduateinstitute.ch",
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

* Fixed an inaccuracy in `fquantile()/fnth()` with weights: As per documentation the target sum is `sumwp = (sum(w) - min(w)) * p`, however, in practice, the weight of the minimum element of `x` was used instead of the minimum weight. Since the smallest element in the sample usually has a small weight this was unnoticed for a long while, but thanks to @Jahnic-kb now reported and fixed (#659).

* Fixed a bug in `recode_char()` when `regex = TRUE` and the `default` argument was used. Thanks @alinacherkas for both reporing and fixing (#654).

# collapse 2.0.16

* Fixes an installation bug on some Linux systems (conflicting types) (#613).
Expand Down
17 changes: 17 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,23 @@
#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s))
#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))

// Needed for vector length manipulation
// https://github.com/wch/r-source/blob/48f06c1071fea6a6e7e365ad3d745217268e2175/src/include/Defn.h#L675
// Until data.table fixes this: https://github.com/Rdatatable/data.table/issues/6180
#define SET_TRULEN(x, v) (STDVEC_TRUELENGTH(x)=(v))
// ALTREP_TRUELENGTH is 0: https://github.com/wch/r-source/blob/48f06c1071fea6a6e7e365ad3d745217268e2175/src/main/altrep.c#L345
#define TRULEN(x) (ALTREP(x) ? 0 : STDVEC_TRUELENGTH(x))
#define STDVEC_LENGTH(x) (((VECSEXP) (x))->vecsxp.length)
// Needed for SETLENGTH
#define SETSCAL(x, v) ((((SEXPREC_partial *)(x))->sxpinfo.scalar) = (v))
#define SET_STDVEC_LENGTH(x,v) do { \
SEXP __x__ = (x); \
R_xlen_t __v__ = (v); \
STDVEC_LENGTH(__x__) = __v__; \
SETSCAL(__x__, __v__ == 1 ? 1 : 0); \
} while (0)
#define SET_LEN(x, v) SET_STDVEC_LENGTH((x), (v))

// for use with bit64::integer64
#define NA_INTEGER64 INT64_MIN
#define MAX_INTEGER64 INT64_MAX
Expand Down
2 changes: 1 addition & 1 deletion src/data.table_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ SEXP collapse_init(SEXP mess) // void SEXP mess DllInfo *info

SEXP tmp = PROTECT(allocVector(INTSXP,2));
if (LENGTH(tmp)!=2) error("Checking LENGTH(allocVector(INTSXP,2)) [%d] is 2 %s", LENGTH(tmp), msg);
if (TRUELENGTH(tmp)!=0) error("Checking TRUELENGTH(allocVector(INTSXP,2)) [%d] is 0 %s", (int)TRUELENGTH(tmp), msg);
if (TRULEN(tmp)!=0) error("Checking TRUELENGTH(allocVector(INTSXP,2)) [%d] is 0 %s", (int)TRULEN(tmp), msg);
UNPROTECT(1);

// According to IEEE (http://en.wikipedia.org/wiki/IEEE_754-1985#Zero) we can rely on 0.0 being all 0 bits.
Expand Down
50 changes: 25 additions & 25 deletions src/data.table_rbindlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -172,14 +172,14 @@ void savetl(SEXP s)
savedtl = (R_len_t *)tmp;
}
saveds[nsaved] = s;
savedtl[nsaved] = TRUELENGTH(s);
savedtl[nsaved] = TRULEN(s);
nsaved++;
}

void savetl_end(void) {
// Can get called if nothing has been saved yet (nsaved==0), or even if _init() hasn't been called yet (pointers NULL). Such
// as to clear up before error. Also, it might be that nothing needed to be saved anyway.
for (int i=0; i<nsaved; i++) SET_TRUELENGTH(saveds[i],savedtl[i]);
for (int i=0; i<nsaved; i++) SET_TRULEN(saveds[i],savedtl[i]);
free(saveds); // possible free(NULL) which is safe no-op
saveds = NULL;
free(savedtl);
Expand Down Expand Up @@ -265,10 +265,10 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
const SEXP *cnp = SEXPPTR(cn);
for (int j=0; j<thisncol; j++) {
SEXP s = cnp[j];
if (TRUELENGTH(s)<0) continue; // seen this name before
if (TRUELENGTH(s)>0) savetl(s);
if (TRULEN(s)<0) continue; // seen this name before
if (TRULEN(s)>0) savetl(s);
uniq[nuniq++] = s;
SET_TRUELENGTH(s,-nuniq);
SET_TRULEN(s,-nuniq);
}
}
if (nuniq>0) {
Expand All @@ -281,7 +281,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
int *maxdup = (int *)calloc(nuniq, sizeof(int)); // the most number of dups for any name within one colname vector
if (!counts || !maxdup) {
// # nocov start
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
for (int i=0; i<nuniq; ++i) SET_TRULEN(uniq[i], 0);
free(uniq); free(counts); free(maxdup);
savetl_end();
error("Failed to allocate nuniq=%d items working memory in rbindlist.c", nuniq);
Expand All @@ -297,7 +297,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
memset(counts, 0, nuniq*sizeof(int));
for (int j=0; j<thisncol; j++) {
SEXP s = cnp[j];
counts[ -TRUELENGTH(s)-1 ]++;
counts[ -TRULEN(s)-1 ]++;
}
for (int u=0; u<nuniq; u++) {
if (counts[u] > maxdup[u]) maxdup[u] = counts[u];
Expand All @@ -315,7 +315,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
int *dupLink = (int *)malloc(ncol * sizeof(int)); // if a colname has occurred before (a dup) links from the 1st to the 2nd time in the final result, 2nd to 3rd, etc
if (!colMapRaw || !uniqMap || !dupLink) {
// # nocov start
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0);
for (int i=0; i<nuniq; ++i) SET_TRULEN(uniq[i], 0);
free(uniq); free(counts); free(colMapRaw); free(uniqMap); free(dupLink);
savetl_end();
error("Failed to allocate ncol=%d items working memory in rbindlist.c", ncol);
Expand All @@ -337,7 +337,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
memset(counts, 0, nuniq*sizeof(int));
for (int j=0; j<thisncol; j++) {
SEXP s = cnp[j];
int w = -TRUELENGTH(s)-1;
int w = -TRULEN(s)-1;
int wi = counts[w]++; // how many dups have we seen before of this name within this item
if (uniqMap[w]==-1) {
// first time seen this name across all items
Expand All @@ -354,7 +354,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
}
}
}
for (int i=0; i<nuniq; ++i) SET_TRUELENGTH(uniq[i], 0); // zero out our usage of tl
for (int i=0; i<nuniq; ++i) SET_TRULEN(uniq[i], 0); // zero out our usage of tl
free(uniq); free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
savetl_end(); // restore R's usage

Expand Down Expand Up @@ -546,9 +546,9 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
if (!levelsRaw) { savetl_end(); error("Failed to allocate working memory for %d ordered factor levels of result column %d", nLevel, idcol+j+1); }
for (int k=0; k<longestLen; ++k) {
SEXP s = sd[k];
if (TRUELENGTH(s)>0) savetl(s);
if (TRULEN(s)>0) savetl(s);
levelsRaw[k] = s;
SET_TRUELENGTH(s,-k-1);
SET_TRULEN(s,-k-1);
}
for (int i=0; i<ll; ++i) {
int w = usenames ? colMap[i*ncol + j] : j;
Expand All @@ -560,7 +560,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
const int n = length(levels);
for (int k=0, last=0; k<n; ++k) {
SEXP s = levelsD[k];
const int tl = TRUELENGTH(s);
const int tl = TRULEN(s);
if (tl>=last) { // if tl>=0 then also tl>=last because last<=0
if (tl>=0) {
snprintf(warnStr, sizeof(warnStr), // not direct warning as we're inside tl region
Expand Down Expand Up @@ -599,8 +599,8 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
for (int k=0; k<n; ++k) {
SEXP s = thisColStrD[k];
if (s==NA_STRING || // remove NA from levels; test 1979 found by package emil when revdep testing 1.12.2 (#3473)
TRUELENGTH(s)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns
if (TRUELENGTH(s)>0) savetl(s);
TRULEN(s)<0) continue; // seen this level before; handles removing dups from levels as well as finding unique of character columns
if (TRULEN(s)>0) savetl(s);
if (allocLevel==nLevel) { // including initial time when allocLevel==nLevel==0
SEXP *tt = NULL;
if (allocLevel<INT_MAX) {
Expand All @@ -611,15 +611,15 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
if (tt==NULL) {
// # nocov start
// C spec states that if realloc() fails (above) the original block (levelsRaw) is left untouched: it is not freed or moved. We ...
for (int k=0; k<nLevel; k++) SET_TRUELENGTH(levelsRaw[k], 0); // ... rely on that in this loop which uses levelsRaw.
for (int k=0; k<nLevel; k++) SET_TRULEN(levelsRaw[k], 0); // ... rely on that in this loop which uses levelsRaw.
free(levelsRaw);
savetl_end();
error("Failed to allocate working memory for %d factor levels of result column %d when reading item %d of item %d", allocLevel, idcol+j+1, w+1, i+1);
// # nocov end
}
levelsRaw = tt;
}
SET_TRUELENGTH(s,-(++nLevel));
SET_TRULEN(s,-(++nLevel));
levelsRaw[nLevel-1] = s;
}
int *targetd = INTEGER(target);
Expand All @@ -628,7 +628,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
if (length(thisCol)<=1) {
// recycle length-1, or NA-fill length-0
SEXP lev;
const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -TRUELENGTH(lev) : NA_INTEGER;
const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -TRULEN(lev) : NA_INTEGER;
// ^^ #3915 and tests 2015.2-5
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
} else {
Expand All @@ -639,22 +639,22 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
// retain the position of NA level (if any) and the integer mappings to it
for (int k=0; k<n; ++k) {
SEXP s = thisColStrD[k];
if (s!=NA_STRING && -TRUELENGTH(s)!=k+1) { hop=true; break; }
if (s!=NA_STRING && -TRULEN(s)!=k+1) { hop=true; break; }
}
} else {
for (int k=0; k<n; ++k) {
SEXP s = thisColStrD[k];
if (s==NA_STRING || -TRUELENGTH(s)!=k+1) { hop=true; break; }
if (s==NA_STRING || -TRULEN(s)!=k+1) { hop=true; break; }
}
}
if (hop) {
if (orderedFactor) {
for (int r=0; r<thisnrow; ++r)
targetd[ansloc+r] = id[r]==NA_INTEGER ? NA_INTEGER : -TRUELENGTH(thisColStrD[id[r]-1]);
targetd[ansloc+r] = id[r]==NA_INTEGER ? NA_INTEGER : -TRULEN(thisColStrD[id[r]-1]);
} else {
for (int r=0; r<thisnrow; ++r) {
SEXP lev;
targetd[ansloc+r] = id[r]==NA_INTEGER || (lev=thisColStrD[id[r]-1])==NA_STRING ? NA_INTEGER : -TRUELENGTH(lev);
targetd[ansloc+r] = id[r]==NA_INTEGER || (lev=thisColStrD[id[r]-1])==NA_STRING ? NA_INTEGER : -TRULEN(lev);
}
}
} else {
Expand All @@ -664,16 +664,16 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
} else {
const SEXP *sd = SEXPPTR(thisColStr);
if (length(thisCol)<=1) {
const int val = (length(thisCol)==1 && sd[0]!=NA_STRING) ? -TRUELENGTH(sd[0]) : NA_INTEGER;
const int val = (length(thisCol)==1 && sd[0]!=NA_STRING) ? -TRULEN(sd[0]) : NA_INTEGER;
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = val;
} else {
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = sd[r]==NA_STRING ? NA_INTEGER : -TRUELENGTH(sd[r]);
for (int r=0; r<thisnrow; ++r) targetd[ansloc+r] = sd[r]==NA_STRING ? NA_INTEGER : -TRULEN(sd[r]);
}
}
}
ansloc += thisnrow;
}
for (int k=0; k<nLevel; ++k) SET_TRUELENGTH(levelsRaw[k], 0);
for (int k=0; k<nLevel; ++k) SET_TRULEN(levelsRaw[k], 0);
savetl_end();
if (warnStr[0]) warning("%s", warnStr); // now savetl_end() has happened it's safe to call warning (could error if options(warn=2))
copyMostAttrib(firstCol, target); // all but names,dim and dimnames; mainly for class. And if so, we want a copy here, not keepattr's SET_ATTRIB.
Expand Down
26 changes: 13 additions & 13 deletions src/data.table_subset.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ static void finalizer(SEXP p)
p = R_ExternalPtrTag(p);
if (!isString(p)) error("Internal error: finalizer's ExternalPtr doesn't see names in tag"); // # nocov
l = LENGTH(p);
tl = TRUELENGTH(p);
tl = TRULEN(p);
if (l<0 || tl<l) error("Internal error: finalizer sees l=%d, tl=%d",l,tl); // # nocov
n = tl-l;
if (n==0) {
Expand All @@ -26,7 +26,7 @@ static void finalizer(SEXP p)
}
x = PROTECT(allocVector(INTSXP, 50)); // 50 so it's big enough to be on LargeVector heap. See NodeClassSize in memory.c:allocVector
// INTSXP rather than VECSXP so that GC doesn't inspect contents after LENGTH (thanks to Karl Miller, Jul 2015)
SETLENGTH(x,50+n*2*sizeof(void *)/4); // 1*n for the names, 1*n for the VECSXP itself (both are over allocated).
SET_LEN(x,50+n*2*sizeof(void *)/4); // 1*n for the names, 1*n for the VECSXP itself (both are over allocated).
UNPROTECT(1);
return;
}
Expand Down Expand Up @@ -125,10 +125,10 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n)
setAttrib(newdt, R_NamesSymbol, newnames);
// setAttrib appears to change length and truelength, so need to do that first _then_ SET next,
// otherwise (if the SET were were first) the 100 tl is assigned to length.
SETLENGTH(newnames,l);
SET_TRUELENGTH(newnames,n);
SETLENGTH(newdt,l);
SET_TRUELENGTH(newdt,n);
SET_LEN(newnames,l);
SET_TRULEN(newnames,n);
SET_LEN(newdt,l);
SET_TRULEN(newdt,n);
setselfref(newdt);
UNPROTECT(protecti);
return(newdt);
Expand All @@ -140,7 +140,7 @@ SEXP Calloccol(SEXP dt) // , SEXP Rn
R_len_t tl, n, l;
l = LENGTH(dt);
n = l + 100; // asInteger(GetOption1(sym_collapse_DT_alloccol)); // asInteger(Rn);
tl = TRUELENGTH(dt);
tl = TRULEN(dt);
// R <= 2.13.2 and we didn't catch uninitialized tl somehow
if (tl < 0) error("Internal error, tl of class is marked but tl<0."); // # nocov
// better disable these...
Expand All @@ -153,8 +153,8 @@ SEXP Calloccol(SEXP dt) // , SEXP Rn
return shallow(dt, R_NilValue, n); // usual case (increasing alloc)

// SEXP nam = PROTECT(getAttrib(dt, R_NamesSymbol));
// if(LENGTH(nam) != l) SETLENGTH(nam, l);
// SET_TRUELENGTH(nam, n);
// if(LENGTH(nam) != l) SET_LEN(nam, l);
// SET_TRULEN(nam, n);
// setselfref(dt); // better, otherwise may be invalid !!
// UNPROTECT(1);
// return(dt);
Expand Down Expand Up @@ -544,8 +544,8 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols, SEXP checkrows) { // , SEXP fastret
// class is also copied here which retains superclass name in class vector as has been the case for many years; e.g. tests 1228.* for #5296

// This is because overalloc.. creating columns by reference stuff..
// SET_TRUELENGTH(ans, LENGTH(ans));
// SETLENGTH(ans, LENGTH(cols));
// SET_TRULEN(ans, LENGTH(ans));
// SET_LEN(ans, LENGTH(cols));
int ansn;
const SEXP *px = SEXPPTR_RO(x);
SEXP *pans = SEXPPTR(ans);
Expand Down Expand Up @@ -574,8 +574,8 @@ SEXP subsetDT(SEXP x, SEXP rows, SEXP cols, SEXP checkrows) { // , SEXP fastret
if(TYPEOF(colnam) == STRSXP) {
PROTECT(colnam);
SEXP tmp = PROTECT(allocVector(STRSXP, ncol)); nprotect++;
// SET_TRUELENGTH(tmp, LENGTH(tmp));
// SETLENGTH(tmp, LENGTH(cols));
// SET_TRULEN(tmp, LENGTH(tmp));
// SET_LEN(tmp, LENGTH(cols));
setAttrib(ans, R_NamesSymbol, tmp);
subsetVectorRaw(tmp, colnam, cols, /*anyNA=*/false);
UNPROTECT(1);
Expand Down
6 changes: 3 additions & 3 deletions src/data.table_utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ SEXP coerceUtf8IfNeeded(SEXP x) {
SEXP setnames(SEXP x, SEXP nam) {
if(TYPEOF(nam) != STRSXP) error("names need to be character typed");
if(INHERITS(x, char_datatable)) {
int n = TRUELENGTH(x), l = LENGTH(nam);
int n = TRULEN(x), l = LENGTH(nam);
if(n < l) { // error("Invalid data.table (underallocated), use qDT(data) to make valid.");
setAttrib(x, R_NamesSymbol, nam);
// setselfref(x);
Expand All @@ -44,8 +44,8 @@ SEXP setnames(SEXP x, SEXP nam) {
SEXP newnam = PROTECT(allocVector(STRSXP, n)),
*pnn = SEXPPTR(newnam), *pn = SEXPPTR(nam);
for(int i = 0; i < l; ++i) pnn[i] = pn[i];
SETLENGTH(newnam, l);
SET_TRUELENGTH(newnam, n);
SET_LEN(newnam, l);
SET_TRULEN(newnam, n);
setAttrib(x, R_NamesSymbol, newnam);
setselfref(x);
UNPROTECT(1);
Expand Down

0 comments on commit e7f1159

Please sign in to comment.