From 9d984969672e5edcdd91ec5fb822030100ccc996 Mon Sep 17 00:00:00 2001 From: MHCZ Date: Thu, 5 Nov 2020 21:36:19 +0000 Subject: [PATCH 1/2] added compatibility with raw vectors containing PDF data (as accepted by the pdf functions in the pdftools package) --- DESCRIPTION | 2 +- R/extract_tables.R | 2 +- R/utils.R | 24 +++++++++++++++--------- man/extract_areas.Rd | 9 +++++++-- man/extract_tables.Rd | 21 +++++++++++++++------ man/extract_text.Rd | 10 ++++++++-- man/make_thumbnails.Rd | 11 +++++++++-- 7 files changed, 56 insertions(+), 23 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d424d0b..be0623d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,4 +41,4 @@ Suggests: testthat SystemRequirements: Java (>= 7.0) VignetteBuilder: knitr -RoxygenNote: 6.0.1 +RoxygenNote: 7.1.1 diff --git a/R/extract_tables.R b/R/extract_tables.R index 59fc343..f9cdd58 100644 --- a/R/extract_tables.R +++ b/R/extract_tables.R @@ -1,6 +1,6 @@ #' @title extract_tables #' @description Extract tables from a file -#' @param file A character string specifying the path or URL to a PDF file. +#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data. #' @param pages An optional integer vector specifying pages to extract from. #' @param area An optional list, of length equal to the number of pages specified, where each entry contains a four-element numeric vector of coordinates (top,left,bottom,right) containing the table for the corresponding page. As a convenience, a list of length 1 can be used to extract the same area from all (specified) pages. Only specify \code{area} xor \code{columns}. #' @param columns An optional list, of length equal to the number of pages specified, where each entry contains a numeric vector of horizontal (x) coordinates separating columns of data for the corresponding page. As a convenience, a list of length 1 can be used to specify the same columns for all (specified) pages. Only specify \code{area} xor \code{columns}. 
diff --git a/R/utils.R b/R/utils.R index 4125907..f5997fa 100644 --- a/R/utils.R +++ b/R/utils.R @@ -19,17 +19,23 @@ localize_file <- function(path, copy = FALSE, quiet = TRUE) { path } -load_doc <- function(file, password = NULL, copy = FALSE) { +load_doc <- function(file = NULL, password = NULL, copy = FALSE) { + pdfDocument <- new(J("org.apache.pdfbox.pdmodel.PDDocument")) + if(typeof(file) != "raw"){ localfile <- localize_file(path = file, copy = copy) - pdfDocument <- new(J("org.apache.pdfbox.pdmodel.PDDocument")) fileInputStream <- new(J("java.io.FileInputStream"), name <- localfile) - if (is.null(password)) { - doc <- pdfDocument$load(input = fileInputStream) - } else { - doc <- pdfDocument$load(input = fileInputStream, password = password) - } - pdfDocument$close() - doc + } + else { + fileInputStream <- new(J("java.io.ByteArrayInputStream"), buf = rJava::.jbyte(file)) + } + + if (is.null(password)) { + doc <- pdfDocument$load(input = fileInputStream) + } else { + doc <- pdfDocument$load(input = fileInputStream, password = password) + } + pdfDocument$close() + doc } make_pages <- function(pages, oe) { diff --git a/man/extract_areas.Rd b/man/extract_areas.Rd index 41a7fea..df2d1c1 100644 --- a/man/extract_areas.Rd +++ b/man/extract_areas.Rd @@ -5,8 +5,13 @@ \alias{extract_areas} \title{extract_areas} \usage{ -locate_areas(file, pages = NULL, resolution = 60L, widget = c("shiny", - "native", "reduced"), copy = FALSE) +locate_areas( + file, + pages = NULL, + resolution = 60L, + widget = c("shiny", "native", "reduced"), + copy = FALSE +) extract_areas(file, pages = NULL, guess = FALSE, copy = FALSE, ...) 
} diff --git a/man/extract_tables.Rd b/man/extract_tables.Rd index 03b9d4b..03c6b7d 100644 --- a/man/extract_tables.Rd +++ b/man/extract_tables.Rd @@ -4,14 +4,23 @@ \alias{extract_tables} \title{extract_tables} \usage{ -extract_tables(file, pages = NULL, area = NULL, columns = NULL, - guess = TRUE, method = c("decide", "lattice", "stream"), - output = c("matrix", "data.frame", "character", "asis", "csv", "tsv", - "json"), outdir = NULL, password = NULL, encoding = NULL, - copy = FALSE, ...) +extract_tables( + file, + pages = NULL, + area = NULL, + columns = NULL, + guess = TRUE, + method = c("decide", "lattice", "stream"), + output = c("matrix", "data.frame", "character", "asis", "csv", "tsv", "json"), + outdir = NULL, + password = NULL, + encoding = NULL, + copy = FALSE, + ... +) } \arguments{ -\item{file}{A character string specifying the path or URL to a PDF file.} +\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.} \item{pages}{An optional integer vector specifying pages to extract from.} diff --git a/man/extract_text.Rd b/man/extract_text.Rd index 3e40802..269f0f9 100644 --- a/man/extract_text.Rd +++ b/man/extract_text.Rd @@ -4,8 +4,14 @@ \alias{extract_text} \title{extract_text} \usage{ -extract_text(file, pages = NULL, area = NULL, password = NULL, - encoding = NULL, copy = FALSE) +extract_text( + file, + pages = NULL, + area = NULL, + password = NULL, + encoding = NULL, + copy = FALSE +) } \arguments{ \item{file}{A character string specifying the path or URL to a PDF file.} diff --git a/man/make_thumbnails.Rd b/man/make_thumbnails.Rd index 2f00dd1..1a29924 100644 --- a/man/make_thumbnails.Rd +++ b/man/make_thumbnails.Rd @@ -4,8 +4,15 @@ \alias{make_thumbnails} \title{make_thumbnails} \usage{ -make_thumbnails(file, outdir = NULL, pages = NULL, format = c("png", - "jpeg", "bmp", "gif"), resolution = 72, password = NULL, copy = FALSE) +make_thumbnails( + file, + outdir = NULL, + pages = NULL, + format = 
c("png", "jpeg", "bmp", "gif"), + resolution = 72, + password = NULL, + copy = FALSE +) } \arguments{ \item{file}{A character string specifying the path or URL to a PDF file.} From f69af0500220f704959ea02059becefb5851bbf1 Mon Sep 17 00:00:00 2001 From: MHCZ Date: Thu, 5 Nov 2020 21:41:54 +0000 Subject: [PATCH 2/2] updated documentation for all functions compatible with raw vectors --- R/extract_metadata.R | 2 +- R/extract_text.R | 2 +- R/get_page_dims.R | 2 +- man/extract_metadata.Rd | 2 +- man/extract_text.Rd | 2 +- man/get_page_dims.Rd | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/extract_metadata.R b/R/extract_metadata.R index ab83936..e6c490b 100644 --- a/R/extract_metadata.R +++ b/R/extract_metadata.R @@ -1,6 +1,6 @@ #' @title extract_metadata #' @description Extract metadata from a file -#' @param file A character string specifying the path or URL to a PDF file. +#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data. #' @param password Optionally, a character string containing a user password to access a secured PDF. #' @param copy Specifies whether the original local file(s) should be copied to #' \code{tempdir()} before processing. \code{FALSE} by default. The argument is diff --git a/R/extract_text.R b/R/extract_text.R index 019b9a5..c299234 100644 --- a/R/extract_text.R +++ b/R/extract_text.R @@ -1,6 +1,6 @@ #' @title extract_text #' @description Extract text from a file -#' @param file A character string specifying the path or URL to a PDF file. +#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data. #' @param pages An optional integer vector specifying pages to extract from. #' @param area An optional list, of length equal to the number of pages specified, where each entry contains a four-element numeric vector of coordinates (top,left,bottom,right) containing the table for the corresponding page. 
As a convenience, a list of length 1 can be used to extract the same area from all (specified) pages. #' @param password Optionally, a character string containing a user password to access a secured PDF. diff --git a/R/get_page_dims.R b/R/get_page_dims.R index 50076e2..3b7f40a 100644 --- a/R/get_page_dims.R +++ b/R/get_page_dims.R @@ -1,7 +1,7 @@ #' @rdname get_page_dims #' @title Page length and dimensions #' @description Get Page Length and Dimensions -#' @param file A character string specifying the path or URL to a PDF file. +#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data. #' @param pages An optional integer vector specifying pages to extract from. #' @param doc Optionally,, in lieu of \code{file}, an rJava reference to a PDDocument Java object. #' @param password Optionally, a character string containing a user password to access a secured PDF. diff --git a/man/extract_metadata.Rd b/man/extract_metadata.Rd index b48b3cf..f54ac86 100644 --- a/man/extract_metadata.Rd +++ b/man/extract_metadata.Rd @@ -7,7 +7,7 @@ extract_metadata(file, password = NULL, copy = FALSE) } \arguments{ -\item{file}{A character string specifying the path or URL to a PDF file.} +\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.} \item{password}{Optionally, a character string containing a user password to access a secured PDF.} diff --git a/man/extract_text.Rd b/man/extract_text.Rd index 269f0f9..21e4261 100644 --- a/man/extract_text.Rd +++ b/man/extract_text.Rd @@ -14,7 +14,7 @@ extract_text( ) } \arguments{ -\item{file}{A character string specifying the path or URL to a PDF file.} +\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.} \item{pages}{An optional integer vector specifying pages to extract from.} diff --git a/man/get_page_dims.Rd b/man/get_page_dims.Rd index 1f7c480..62c8ae9 100644 --- a/man/get_page_dims.Rd +++ 
b/man/get_page_dims.Rd @@ -10,7 +10,7 @@ get_page_dims(file, doc, pages = NULL, password = NULL, copy = FALSE) get_n_pages(file, doc, password = NULL, copy = FALSE) } \arguments{ -\item{file}{A character string specifying the path or URL to a PDF file.} +\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.} \item{doc}{Optionally,, in lieu of \code{file}, an rJava reference to a PDDocument Java object.}