Merge pull request #98 from MicheleNuijten/update-pdftext
Update pdftext
MicheleNuijten authored Jul 26, 2024
2 parents 6604b84 + 88f85b9 commit 0038858
Showing 21 changed files with 849 additions and 104 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -47,7 +47,8 @@ Imports:
rlang,
rmarkdown,
stringi,
tcltk
tcltk,
pdftools
Suggests:
testthat
ByteCompile: yes
5 changes: 5 additions & 0 deletions NAMESPACE
@@ -33,3 +33,8 @@ importFrom(graphics,plot.default)
importFrom(graphics,points)
importFrom(graphics,text)
importFrom(rlang,.data)
importFrom(stringi,stri_enc_toutf32)
importFrom(stringi,stri_enc_fromutf32)
importFrom(stringi,stri_split_lines)
importFrom(stringi,stri_split_regex)
importFrom(pdftools,pdf_text)
10 changes: 10 additions & 0 deletions NEWS.md
@@ -1,3 +1,13 @@
# statcheck 1.6.0

## Major changes

## Small updates
* Changed the way PDF files are converted to text. Previously, statcheck relied on the external program Xpdf, which had to be installed separately and added to the path. To simplify the workflow (and to improve the quality of the PDF conversions), statcheck now uses the R package pdftools.
* Updated the test files for file-to-text conversion. Previously, the true values statcheck should extract were hard coded, which was error prone. Now there is a spreadsheet with manually extracted values (the gold standard) that is used as a reference. This is easier to update when necessary: updates only need to happen in one place (the spreadsheet) and no longer in multiple places in the code.

## Bug fixes

# statcheck 1.5.0

## Major changes
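The switch described in the first bullet above comes down to the conversion call itself. A minimal sketch of the old and the new approach (illustrative only; "paper.pdf" is a hypothetical file name, not a file in this repository):

# old: shell out to Xpdf's pdftotext, which writes a .txt file next to the pdf
system('pdftotext -q -enc "ASCII7" "paper.pdf"')
old_txt <- readChar("paper.txt", file.info("paper.txt")$size)

# new: convert inside R with pdftools; returns one string per page, no external install
new_txt <- pdftools::pdf_text("paper.pdf")
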
20 changes: 0 additions & 20 deletions R/doc-checkdirs.R

This file was deleted.

14 changes: 0 additions & 14 deletions R/doc-checkfiles.R

This file was deleted.

165 changes: 144 additions & 21 deletions R/file-to-txt.R
@@ -91,27 +91,150 @@ getHTML <- function(x){
# PDF TO TXT -------------------------------------------------------------------
getPDF <- function(x){

txtfiles <- character(length(x))
for (i in 1:length(x)){

system(paste('pdftotext -q -enc "ASCII7" "', x[i], '"', sep = ""))
if (file.exists(gsub("\\.pdf$", "\\.txt", x[i]))) {
fileName <- gsub("\\.pdf$", "\\.txt", x[i])
strings <- readChar(fileName, file.info(fileName)$size)

# remove carriage returns and new lines
strings <- gsub(x = strings, pattern = "[\r\n]", replacement = "")

# save result in vector
txtfiles[i] <- strings

} else{

warning(paste("Failure in file", x[i]))
txtfiles[i] <- ""

}
}
txtfiles <- sapply(x, pdftools::pdf_text)

# encode everything in UTF-32
# this should ensure the same output across multiple operating systems
txtfiles <- stringi::stri_enc_toutf32(txtfiles)

# Replace known weird characters

# substitute double solidous (UTF-32 Decimal 11005) with equal sign (UTF-32
# Decimal 61) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "11005",
replacement = "61", fixed = TRUE)

# substitute 1/4 (UTF-32 decimal 188) with equal sign (UTF-32 Decimal 61);
# [issue in Elsevier journal: Journal of Environmental Psychology]
txtfiles <- lapply(txtfiles, gsub, pattern = "188",
replacement = "61", fixed = TRUE)

# substitute U+2B0D (C++ \u2b0d; UTF-32 Decimal 11021) with less than
# sign (UTF-32 Decimal 60) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "11021",
replacement = "60", fixed = TRUE)

# substitute ! (UTF-32 decimal 33) with less than sign (UTF-32 Decimal 60);
# [issue in Oxford journal: Journal of Consumer Research]
txtfiles <- lapply(txtfiles, gsub, pattern = "33",
replacement = "60", fixed = TRUE)

# substitute U+2AFA (UTF-32 Decimal 11002) with HYPHEN-MINUS sign (UTF-32
# Decimal 45) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "11002",
replacement = "45", fixed = TRUE)

# substitute U+2439 (C++ \u2439; UTF-32 Decimal 9273) with small greek chi
# (UTF-32 Decimal 967) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "9273",
replacement = "967", fixed = TRUE)

# Revert to UTF-8 encoding
txtfiles <- stringi::stri_enc_fromutf32(txtfiles)


# Arrange text according to paper column layout
txtfiles <- pdf_columns(txtfiles)

# Paste the different pages together, so that each pdf is converted to
# one string of text
txtfiles <- stringr::str_c(unlist(txtfiles), collapse = "")



# substitute the letter "b" in a NHST result for a "<". This is not feasible
# in utf32 encoding, because making a regex that only substitutes the b in
# a statistical result instead of ALL b's in the paper is very hard in
# utf32 encoding. [issue in Elsevier journal: JESP]
txtfiles <- lapply(txtfiles, gsub,
# don't match a b preceded by =<>, because the b itself
# should be the comparison sign.
# only match a b followed by a number, that gives further
# proof that the b is in fact the comparison sign.
pattern = RGX_B_SMALLER,
replacement = "<", perl = TRUE)

# substitute the letter "N" in a NHST result for a ">", for the same reason
# as above. [issue in Elsevier journal: JESP]
txtfiles <- lapply(txtfiles, gsub,
# don't match a N preceded by =<>, because the N itself
# should be the comparison sign.
# only match a N followed by a number, that gives further
# proof that the N is in fact the comparison sign.
pattern = RGX_N_LARGER,
replacement = ">", perl = TRUE)

# substitute the letter "p" that should be a "=". [issue in Oxford journal:
# journal of consumer research]
txtfiles <- lapply(txtfiles, gsub,
# don't match a p preceded by a "," or a ",\\s", because
# that is the actual p-value.
# only match a p followed by a number, that gives further
# proof that the p is in fact the comparison sign.
pattern = RGX_P_EQUAL,
replacement = "=", perl = TRUE)

# substitute the letter "B" that should be a '"'. [issue in BRM]
txtfiles <- lapply(txtfiles, gsub,
# only match a B followed by a letter that could indicate
# a test statistic
pattern = RGX_B_QUOTE,
replacement = '"', perl = TRUE)


return(txtfiles)
}
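
As a rough illustration of the UTF-32 round trip that getPDF() performs above, the sketch below fixes one problematic code point by editing the code-point vector directly (the committed code instead runs gsub() over the code-point strings; the input string here is a made-up example):

library(stringi)
txt <- "F(1, 38) \u2b0d 4.25"           # U+2B0D sometimes appears where "<" was meant
codes <- stri_enc_toutf32(txt)[[1]]     # integer vector of Unicode code points
codes[codes == 11021L] <- 60L           # 11021 = U+2B0D, 60 = "<"
stri_enc_fromutf32(list(codes))         # "F(1, 38) < 4.25"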


# helper function for getPDF() -------------------------------------------------

# This function helps maintain the format of pdf files with a multi-column
# layout.
# Credits to:
# https://github.com/fsingletonthorn/EffectSizeScraping/blob/master/R/pdf_process.R
# for original function

true_false <- function(x, chars) {
x > chars
}

pdf_columns <- function(x, pattern = "\\p{WHITE_SPACE}{3,}") {
# \p{WHITE_SPACE} matches a single whitespace code point.
# {3,} three or more

# This function is slightly adapted from pdfsearch
# see: https://github.com/lebebr01/pdfsearch/blob/master/R/split_pdf.r

x_lines <- stringi::stri_split_lines(x)
x_lines <- lapply(x_lines, gsub,
pattern = "^\\s{1,20}",
# ^ anchors at the start of the string
# \s matches any whitespace character
# {1,20} matches between 1 and 20 of these,
# i.e., strip up to 20 leading whitespace characters
replacement = "")

x_page <- lapply(
x_lines,
stringi::stri_split_regex,
pattern = pattern,
omit_empty = NA,
simplify = TRUE
)

page_lines <- unlist(lapply(x_page, nrow))
columns <- unlist(lapply(x_page, ncol))

num_chars <- lapply(x_page, base::nchar)
num_chars_tf <- lapply(num_chars, true_false, chars = 3)

for (xx in seq_along(num_chars_tf)) {
num_chars_tf[[xx]][is.na(num_chars_tf[[xx]])] <- FALSE
}

output <- lapply(seq_along(x_page), function(xx)
x_page[[xx]][num_chars_tf[[xx]]])

output <- lapply(output, paste, collapse = " ")
return(output)
}
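To see what the whitespace split in pdf_columns() does, a small sketch (the two-column line is a constructed example, not taken from a real paper):

line <- "the effect was significant           t(48) = 2.10, p = .041"
stringi::stri_split_regex(line, pattern = "\\p{WHITE_SPACE}{3,}", simplify = TRUE)
# returns a 1 x 2 matrix holding "the effect was significant" and
# "t(48) = 2.10, p = .041", i.e. the left and the right column of that line
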
41 changes: 41 additions & 0 deletions R/helper-load-manual-reference.R
@@ -0,0 +1,41 @@
# this script contains a helper function for the unit tests to load the
# manual reference file and select relevant rows to compare statcheck output to


load_manual <- function(
path_manual, # path to the reference file
apa = TRUE, # only consider apa reported stats
pdf_conversion_issues = FALSE, # exclude cases where pdf conversion led to weird characters
typesetting_issues = FALSE, # exclude cases where typesetting issues led to weird situations
file_type = c("all", "pdf", "html"), # select specific file types
file_id = NULL # select specific files based on file_id variable
){

# load the reference file with manually extracted statistics
manual <- read.csv2(system.file(path_manual, package = "statcheck"), header = TRUE)

# row selection based on arguments
if(apa == TRUE){
manual <- manual[manual$extract_apa == 1, ]
}

if(pdf_conversion_issues == FALSE){
manual <- manual[manual$pdf_conversion_issues == 0, ]
}

if(typesetting_issues == FALSE){
manual <- manual[manual$typesetting_issues == 0, ]
}

if(file_type[1] == "pdf"){
manual <- manual[manual$file_type == "pdf", ]
} else if(file_type[1] == "html") {
manual <- manual[manual$file_type == "pdf", ]
}

if(!is.null(file_id)){
manual <- manual[manual$file_id == file_id, ]
}

return(manual)
}
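
A hypothetical call from a unit test (the path is an illustrative assumption; the actual reference file shipped with the package may be named and located differently):

manual_pdf <- load_manual(path_manual = "test/manual_reference.csv",
file_type = "pdf")
nrow(manual_pdf)  # number of manually coded, APA-reported results for the pdf files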
20 changes: 20 additions & 0 deletions R/regex.R
@@ -96,4 +96,24 @@ RGX_WEIRD_MINUS <- "\\s?[^\\d\\.\\s]+(?=\\d|\\.)"
# F-tests and when df1 == 1, it gets typeset as the letter l or I
RGX_DF1_I_L <- "I|l"

################################################################################
###################### REGEXES FOR WEIRD PDF ENCODING ##########################
################################################################################

# in some JESP articles, a < is translated with a b
# this regex is used in file-to-txt.R to replace it
RGX_B_SMALLER <- "(?<![=<>])b(?=\\s?-?\\s?\\.?\\d)"

# in some JESP articles, a > is translated with a N
# this regex is used in file-to-txt.R to replace it
RGX_N_LARGER <- "(?<![=<>])N(?=\\s?-?\\s?\\.?\\d)"

# in the journal of consumer research, a = is translated with a p
# this regex is used in file-to-txt.R to replace it
RGX_P_EQUAL <- "(?<!(,\\s)|,)p(?=\\s?-?\\s?\\.?\\d)"

# in the Nuijten et al. 2016 article, quotes are translated as B
# this means that tests between quotes are not detected, because
# statcheck can only find tests if they are not preceded by other
# letters. Find upper case B followed by a test statistic
RGX_B_QUOTE <- "B(?=(t|F|r|Q|z|Z))"
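
A quick check of how the first of these look-around regexes behaves (the input mimics the JESP conversion issue; perl = TRUE is needed for the look-behind, as in file-to-txt.R):

gsub("(?<![=<>])b(?=\\s?-?\\s?\\.?\\d)", "<", "t(28) b 2.21, p b .05", perl = TRUE)
# [1] "t(28) < 2.21, p < .05"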