Merge pull request #98 from MicheleNuijten/update-pdftext
Update pdftext
MicheleNuijten authored Jul 26, 2024
2 parents 6604b84 + 88f85b9 commit 0038858
Showing 21 changed files with 849 additions and 104 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -47,7 +47,8 @@ Imports:
rlang,
rmarkdown,
stringi,
tcltk
tcltk,
pdftools
Suggests:
testthat
ByteCompile: yes
5 changes: 5 additions & 0 deletions NAMESPACE
@@ -33,3 +33,8 @@ importFrom(graphics,plot.default)
importFrom(graphics,points)
importFrom(graphics,text)
importFrom(rlang,.data)
importFrom(stringi,stri_enc_toutf32)
importFrom(stringi,stri_enc_fromutf32)
importFrom(stringi,stri_split_lines)
importFrom(stringi,stri_split_regex)
importFrom(pdftools,pdf_text)
10 changes: 10 additions & 0 deletions NEWS.md
@@ -1,3 +1,13 @@
# statcheck 1.6.0

## Major changes

## Small updates
* Changed the way PDF files are converted to text. Previously, statcheck relied on the external program Xpdf, which had to be installed separately and added to the path. To simplify the workflow (and to improve the quality of the PDF conversions), statcheck now uses the R package pdftools.
* Updated the test files for file-to-text conversion. Previously, the true values statcheck should extract were hard coded, which was error prone. Now there is a spreadsheet with manually extracted values (the gold standard) that is used as a reference. This is easier to update when necessary: updates only need to happen in one place (the spreadsheet) and no longer in multiple places in the code.

## Bug fixes

# statcheck 1.5.0

## Major changes
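The switch described in the first bullet above comes down to the conversion call itself. A minimal sketch of the old and the new approach (illustrative only; "paper.pdf" is a hypothetical file name, not a file in this repository):

# old: shell out to Xpdf's pdftotext, which writes a .txt file next to the pdf
system('pdftotext -q -enc "ASCII7" "paper.pdf"')
old_txt <- readChar("paper.txt", file.info("paper.txt")$size)

# new: convert inside R with pdftools; returns one string per page, no external install
new_txt <- pdftools::pdf_text("paper.pdf")
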
20 changes: 0 additions & 20 deletions R/doc-checkdirs.R

This file was deleted.

14 changes: 0 additions & 14 deletions R/doc-checkfiles.R

This file was deleted.

165 changes: 144 additions & 21 deletions R/file-to-txt.R
@@ -91,27 +91,150 @@ getHTML <- function(x){
# PDF TO TXT -------------------------------------------------------------------
getPDF <- function(x){

txtfiles <- character(length(x))
for (i in 1:length(x)){

system(paste('pdftotext -q -enc "ASCII7" "', x[i], '"', sep = ""))
if (file.exists(gsub("\\.pdf$", "\\.txt", x[i]))) {
fileName <- gsub("\\.pdf$", "\\.txt", x[i])
strings <- readChar(fileName, file.info(fileName)$size)

# remove carriage returns and new lines
strings <- gsub(x = strings, pattern = "[\r\n]", replacement = "")

# save result in vector
txtfiles[i] <- strings

} else{

warning(paste("Failure in file", x[i]))
txtfiles[i] <- ""

}
}
txtfiles <- sapply(x, pdftools::pdf_text)

# encode everything in UTF-32
# this should ensure the same output across multiple operating systems
txtfiles <- stringi::stri_enc_toutf32(txtfiles)

# Replace known weird characters

# substitute double solidous (UTF-32 Decimal 11005) with equal sign (UTF-32
# Decimal 61) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "11005",
replacement = "61", fixed = TRUE)

# substitute 1/4 (UTF-32 decimal 188) with equal sign (UTF-32 Decimal 61);
# [issue in Elsevier journal: Journal of Environmental Psychology]
txtfiles <- lapply(txtfiles, gsub, pattern = "188",
replacement = "61", fixed = TRUE)

# substitute U+2B0D (C++ \u2b0d; UTF-32 Decimal 11021) with less than
# sign (UTF-32 Decimal 60) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "11021",
replacement = "60", fixed = TRUE)

# substitute ! (UTF-32 decimal 33) with less than sign (UTF-32 Decimal 60);
# [issue in Oxford journal: Journal of Consumer Research]
txtfiles <- lapply(txtfiles, gsub, pattern = "33",
replacement = "60", fixed = TRUE)

# substitute U+2AFA (UTF-32 Decimal 11002) with HYPHEN-MINUS sign (UTF-32
# Decimal 45) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "11002",
replacement = "45", fixed = TRUE)

# substitute U+2439 (C++ \u2439; UTF-32 Decimal 9273) with small greek chi
# (UTF-32 Decimal 967) [issue in APA journals]
txtfiles <- lapply(txtfiles, gsub, pattern = "9273",
replacement = "967", fixed = TRUE)

# Revert to UTF-8 encoding
txtfiles <- stringi::stri_enc_fromutf32(txtfiles)


# Arrange text according to paper column layout
txtfiles <- pdf_columns(txtfiles)

# Paste the different pages together, so that each pdf is converted to
# one string of text
txtfiles <- stringr::str_c(unlist(txtfiles), collapse = "")



# substitute the letter "b" in a NHST result for a "<". This is not feasible
# in utf32 encoding, because making a regex that only substitutes the b in
# a statistical result instead of ALL b's in the paper is very hard in
# utf32 encoding. [issue in Elsevier journal: JESP]
txtfiles <- lapply(txtfiles, gsub,
# don't match a b preceded by =<>, because the b itself
# should be the comparison sign.
# only match a b followed by a number, that gives further
# proof that the b is in fact the comparison sign.
pattern = RGX_B_SMALLER,
replacement = "<", perl = TRUE)

# substitute the letter "N" in a NHST result for a ">", for the same reason
# as above. [issue in Elsevier journal: JESP]
txtfiles <- lapply(txtfiles, gsub,
# don't match a N preceded by =<>, because the N itself
# should be the comparison sign.
# only match a N followed by a number, that gives further
# proof that the N is in fact the comparison sign.
pattern = RGX_N_LARGER,
replacement = ">", perl = TRUE)

# substitute the letter "p" that should be a "=". [issue in Oxford journal:
# journal of consumer research]
txtfiles <- lapply(txtfiles, gsub,
# don't match a p preceded by a "," or a ",\\s", because
# that is the actual p-value.
# only match a p followed by a number, that gives further
# proof that the p is in fact the comparison sign.
pattern = RGX_P_EQUAL,
replacement = "=", perl = TRUE)

# substitute the letter "B" that should be a '"'. [issue in BRM]
txtfiles <- lapply(txtfiles, gsub,
# only match a B followed by a letter that could indicate
# a test statistic
pattern = RGX_B_QUOTE,
replacement = '"', perl = TRUE)


return(txtfiles)
}
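
As a rough illustration of the UTF-32 round trip that getPDF() performs above, the sketch below fixes one problematic code point by editing the code-point vector directly (the committed code instead runs gsub() over the code-point strings; the input string here is a made-up example):

library(stringi)
txt <- "F(1, 38) \u2b0d 4.25"           # U+2B0D sometimes appears where "<" was meant
codes <- stri_enc_toutf32(txt)[[1]]     # integer vector of Unicode code points
codes[codes == 11021L] <- 60L           # 11021 = U+2B0D, 60 = "<"
stri_enc_fromutf32(list(codes))         # "F(1, 38) < 4.25"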


# helper function for getPDF() -------------------------------------------------

# This function helps maintain the format of pdf files with a multi-column
# layout.
# Credits to:
# https://github.com/fsingletonthorn/EffectSizeScraping/blob/master/R/pdf_process.R
# for original function

true_false <- function(x, chars) {
x > chars
}

pdf_columns <- function(x, pattern = "\\p{WHITE_SPACE}{3,}") {
# \p{WHITE_SPACE} matches a single whitespace code point.
# {3,} three or more

# This function is slightly adapted from pdfsearch
# see: https://github.com/lebebr01/pdfsearch/blob/master/R/split_pdf.r

x_lines <- stringi::stri_split_lines(x)
x_lines <- lapply(x_lines, gsub,
pattern = "^\\s{1,20}",
# ^ anchors at the start of the string
# \s matches any whitespace character
# {1,20} matches between 1 and 20 of these,
# i.e., strip up to 20 leading whitespace characters
replacement = "")

x_page <- lapply(
x_lines,
stringi::stri_split_regex,
pattern = pattern,
omit_empty = NA,
simplify = TRUE
)

page_lines <- unlist(lapply(x_page, nrow))
columns <- unlist(lapply(x_page, ncol))

num_chars <- lapply(x_page, base::nchar)
num_chars_tf <- lapply(num_chars, true_false, chars = 3)

for (xx in seq_along(num_chars_tf)) {
num_chars_tf[[xx]][is.na(num_chars_tf[[xx]])] <- FALSE
}

output <- lapply(seq_along(x_page), function(xx)
x_page[[xx]][num_chars_tf[[xx]]])

output <- lapply(output, paste, collapse = " ")
return(output)
}
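To see what the whitespace split in pdf_columns() does, a small sketch (the two-column line is a constructed example, not taken from a real paper):

line <- "the effect was significant           t(48) = 2.10, p = .041"
stringi::stri_split_regex(line, pattern = "\\p{WHITE_SPACE}{3,}", simplify = TRUE)
# returns a 1 x 2 matrix holding "the effect was significant" and
# "t(48) = 2.10, p = .041", i.e. the left and the right column of that line
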
41 changes: 41 additions & 0 deletions R/helper-load-manual-reference.R
@@ -0,0 +1,41 @@
# this script contains a helper function for the unit tests to load the
# manual reference file and select relevant rows to compare statcheck output to


load_manual <- function(
path_manual, # path to the reference file
apa = TRUE, # only consider apa reported stats
pdf_conversion_issues = FALSE, # exclude cases where pdf conversion led to weird characters
typesetting_issues = FALSE, # exclude cases where typesetting issues led to weird situations
file_type = c("all", "pdf", "html"), # select specific file types
file_id = NULL # select specific files based on file_id variable
){

# load the reference file with manually extracted statistics
manual <- read.csv2(system.file(path_manual, package = "statcheck"), header = TRUE)

# row selection based on arguments
if(apa == TRUE){
manual <- manual[manual$extract_apa == 1, ]
}

if(pdf_conversion_issues == FALSE){
manual <- manual[manual$pdf_conversion_issues == 0, ]
}

if(typesetting_issues == FALSE){
manual <- manual[manual$typesetting_issues == 0, ]
}

if(file_type[1] == "pdf"){
manual <- manual[manual$file_type == "pdf", ]
} else if(file_type[1] == "html") {
manual <- manual[manual$file_type == "pdf", ]
}

if(!is.null(file_id)){
manual <- manual[manual$file_id == file_id, ]
}

return(manual)
}
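
A hypothetical call from a unit test (the path is an illustrative assumption; the actual reference file shipped with the package may be named and located differently):

manual_pdf <- load_manual(path_manual = "test/manual_reference.csv",
file_type = "pdf")
nrow(manual_pdf)  # number of manually coded, APA-reported results for the pdf files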
20 changes: 20 additions & 0 deletions R/regex.R
@@ -96,4 +96,24 @@ RGX_WEIRD_MINUS <- "\\s?[^\\d\\.\\s]+(?=\\d|\\.)"
# F-tests and when df1 == 1, it gets typeset as the letter l or I
RGX_DF1_I_L <- "I|l"

################################################################################
###################### REGEXES FOR WEIRD PDF ENCODING ##########################
################################################################################

# in some JESP articles, a < is translated with a b
# this regex is used in file-to-txt.R to replace it
RGX_B_SMALLER <- "(?<![=<>])b(?=\\s?-?\\s?\\.?\\d)"

# in some JESP articles, a > is translated with a N
# this regex is used in file-to-txt.R to replace it
RGX_N_LARGER <- "(?<![=<>])N(?=\\s?-?\\s?\\.?\\d)"

# in the journal of consumer research, a = is translated with a p
# this regex is used in file-to-txt.R to replace it
RGX_P_EQUAL <- "(?<!(,\\s)|,)p(?=\\s?-?\\s?\\.?\\d)"

# in the Nuijten et al. 2016 article, quotes are translated as B
# this means that tests between quotes are not detected, because
# statcheck can only find tests if they are not preceded by other
# letters. Find upper case B followed by a test statistic
RGX_B_QUOTE <- "B(?=(t|F|r|Q|z|Z))"
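
A quick check of how the first of these look-around regexes behaves (the input mimics the JESP conversion issue; perl = TRUE is needed for the look-behind, as in file-to-txt.R):

gsub("(?<![=<>])b(?=\\s?-?\\s?\\.?\\d)", "<", "t(28) b 2.21, p b .05", perl = TRUE)
# [1] "t(28) < 2.21, p < .05"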