From 9d984969672e5edcdd91ec5fb822030100ccc996 Mon Sep 17 00:00:00 2001
From: MHCZ <mhcz@novonordisk.com>
Date: Thu, 5 Nov 2020 21:36:19 +0000
Subject: [PATCH 1/2] Add support for raw vectors containing PDF data (as
 accepted by the pdf functions in the pdftools package)

---
 DESCRIPTION            |  2 +-
 R/extract_tables.R     |  2 +-
 R/utils.R              | 24 +++++++++++++++---------
 man/extract_areas.Rd   |  9 +++++++--
 man/extract_tables.Rd  | 21 +++++++++++++++------
 man/extract_text.Rd    | 10 ++++++++--
 man/make_thumbnails.Rd | 11 +++++++++--
 7 files changed, 56 insertions(+), 23 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d424d0b..be0623d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -41,4 +41,4 @@ Suggests:
     testthat
 SystemRequirements: Java (>= 7.0)
 VignetteBuilder: knitr
-RoxygenNote: 6.0.1
+RoxygenNote: 7.1.1
diff --git a/R/extract_tables.R b/R/extract_tables.R
index 59fc343..f9cdd58 100644
--- a/R/extract_tables.R
+++ b/R/extract_tables.R
@@ -1,6 +1,6 @@
 #' @title extract_tables
 #' @description Extract tables from a file
-#' @param file A character string specifying the path or URL to a PDF file.
+#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.
 #' @param pages An optional integer vector specifying pages to extract from.
 #' @param area An optional list, of length equal to the number of pages specified, where each entry contains a four-element numeric vector of coordinates (top,left,bottom,right) containing the table for the corresponding page. As a convenience, a list of length 1 can be used to extract the same area from all (specified) pages. Only specify \code{area} xor \code{columns}.
 #' @param columns An optional list, of length equal to the number of pages specified, where each entry contains a numeric vector of horizontal (x) coordinates separating columns of data for the corresponding page. As a convenience, a list of length 1 can be used to specify the same columns for all (specified) pages. Only specify \code{area} xor \code{columns}.
diff --git a/R/utils.R b/R/utils.R
index 4125907..f5997fa 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -19,17 +19,23 @@ localize_file <- function(path, copy = FALSE, quiet = TRUE) {
     path
 }
 
-load_doc <- function(file, password = NULL, copy = FALSE) {
+load_doc <- function(file = NULL, password = NULL, copy = FALSE) {
+  pdfDocument <- new(J("org.apache.pdfbox.pdmodel.PDDocument"))
+  if (!is.raw(file)) {
     localfile <- localize_file(path = file, copy = copy)
-    pdfDocument <- new(J("org.apache.pdfbox.pdmodel.PDDocument"))
     fileInputStream <- new(J("java.io.FileInputStream"), name <- localfile)
-    if (is.null(password)) {
-        doc <- pdfDocument$load(input = fileInputStream)
-    } else {
-        doc <- pdfDocument$load(input = fileInputStream, password = password)
-    }
-    pdfDocument$close()
-    doc
+  } else {
+    # `file` is a raw vector holding the PDF bytes: stream it from memory.
+    fileInputStream <- new(J("java.io.ByteArrayInputStream"), buf = rJava::.jbyte(file))
+  }
+
+  if (is.null(password)) {
+    doc <- pdfDocument$load(input = fileInputStream)
+  } else {
+    doc <- pdfDocument$load(input = fileInputStream, password = password)
+  }
+  pdfDocument$close()
+  doc
 }
 
 make_pages <- function(pages, oe) {
diff --git a/man/extract_areas.Rd b/man/extract_areas.Rd
index 41a7fea..df2d1c1 100644
--- a/man/extract_areas.Rd
+++ b/man/extract_areas.Rd
@@ -5,8 +5,13 @@
 \alias{extract_areas}
 \title{extract_areas}
 \usage{
-locate_areas(file, pages = NULL, resolution = 60L, widget = c("shiny",
-  "native", "reduced"), copy = FALSE)
+locate_areas(
+  file,
+  pages = NULL,
+  resolution = 60L,
+  widget = c("shiny", "native", "reduced"),
+  copy = FALSE
+)
 
 extract_areas(file, pages = NULL, guess = FALSE, copy = FALSE, ...)
 }
diff --git a/man/extract_tables.Rd b/man/extract_tables.Rd
index 03b9d4b..03c6b7d 100644
--- a/man/extract_tables.Rd
+++ b/man/extract_tables.Rd
@@ -4,14 +4,23 @@
 \alias{extract_tables}
 \title{extract_tables}
 \usage{
-extract_tables(file, pages = NULL, area = NULL, columns = NULL,
-  guess = TRUE, method = c("decide", "lattice", "stream"),
-  output = c("matrix", "data.frame", "character", "asis", "csv", "tsv",
-  "json"), outdir = NULL, password = NULL, encoding = NULL,
-  copy = FALSE, ...)
+extract_tables(
+  file,
+  pages = NULL,
+  area = NULL,
+  columns = NULL,
+  guess = TRUE,
+  method = c("decide", "lattice", "stream"),
+  output = c("matrix", "data.frame", "character", "asis", "csv", "tsv", "json"),
+  outdir = NULL,
+  password = NULL,
+  encoding = NULL,
+  copy = FALSE,
+  ...
+)
 }
 \arguments{
-\item{file}{A character string specifying the path or URL to a PDF file.}
+\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.}
 
 \item{pages}{An optional integer vector specifying pages to extract from.}
 
diff --git a/man/extract_text.Rd b/man/extract_text.Rd
index 3e40802..269f0f9 100644
--- a/man/extract_text.Rd
+++ b/man/extract_text.Rd
@@ -4,8 +4,14 @@
 \alias{extract_text}
 \title{extract_text}
 \usage{
-extract_text(file, pages = NULL, area = NULL, password = NULL,
-  encoding = NULL, copy = FALSE)
+extract_text(
+  file,
+  pages = NULL,
+  area = NULL,
+  password = NULL,
+  encoding = NULL,
+  copy = FALSE
+)
 }
 \arguments{
 \item{file}{A character string specifying the path or URL to a PDF file.}
diff --git a/man/make_thumbnails.Rd b/man/make_thumbnails.Rd
index 2f00dd1..1a29924 100644
--- a/man/make_thumbnails.Rd
+++ b/man/make_thumbnails.Rd
@@ -4,8 +4,15 @@
 \alias{make_thumbnails}
 \title{make_thumbnails}
 \usage{
-make_thumbnails(file, outdir = NULL, pages = NULL, format = c("png",
-  "jpeg", "bmp", "gif"), resolution = 72, password = NULL, copy = FALSE)
+make_thumbnails(
+  file,
+  outdir = NULL,
+  pages = NULL,
+  format = c("png", "jpeg", "bmp", "gif"),
+  resolution = 72,
+  password = NULL,
+  copy = FALSE
+)
 }
 \arguments{
 \item{file}{A character string specifying the path or URL to a PDF file.}

From f69af0500220f704959ea02059becefb5851bbf1 Mon Sep 17 00:00:00 2001
From: MHCZ <mhcz@novonordisk.com>
Date: Thu, 5 Nov 2020 21:41:54 +0000
Subject: [PATCH 2/2] Update documentation for all functions that accept
 raw vectors

---
 R/extract_metadata.R    | 2 +-
 R/extract_text.R        | 2 +-
 R/get_page_dims.R       | 2 +-
 man/extract_metadata.Rd | 2 +-
 man/extract_text.Rd     | 2 +-
 man/get_page_dims.Rd    | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/extract_metadata.R b/R/extract_metadata.R
index ab83936..e6c490b 100644
--- a/R/extract_metadata.R
+++ b/R/extract_metadata.R
@@ -1,6 +1,6 @@
 #' @title extract_metadata
 #' @description Extract metadata from a file
-#' @param file A character string specifying the path or URL to a PDF file.
+#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.
 #' @param password Optionally, a character string containing a user password to access a secured PDF.
 #' @param copy Specifies whether the original local file(s) should be copied to
 #' \code{tempdir()} before processing. \code{FALSE} by default. The argument is
diff --git a/R/extract_text.R b/R/extract_text.R
index 019b9a5..c299234 100644
--- a/R/extract_text.R
+++ b/R/extract_text.R
@@ -1,6 +1,6 @@
 #' @title extract_text
 #' @description Extract text from a file
-#' @param file A character string specifying the path or URL to a PDF file.
+#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.
 #' @param pages An optional integer vector specifying pages to extract from.
 #' @param area An optional list, of length equal to the number of pages specified, where each entry contains a four-element numeric vector of coordinates (top,left,bottom,right) containing the table for the corresponding page. As a convenience, a list of length 1 can be used to extract the same area from all (specified) pages.
 #' @param password Optionally, a character string containing a user password to access a secured PDF.
diff --git a/R/get_page_dims.R b/R/get_page_dims.R
index 50076e2..3b7f40a 100644
--- a/R/get_page_dims.R
+++ b/R/get_page_dims.R
@@ -1,7 +1,7 @@
 #' @rdname get_page_dims
 #' @title Page length and dimensions
 #' @description Get Page Length and Dimensions
-#' @param file A character string specifying the path or URL to a PDF file.
+#' @param file A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.
 #' @param pages An optional integer vector specifying pages to extract from.
 #' @param doc Optionally,, in lieu of \code{file}, an rJava reference to a PDDocument Java object.
 #' @param password Optionally, a character string containing a user password to access a secured PDF.
diff --git a/man/extract_metadata.Rd b/man/extract_metadata.Rd
index b48b3cf..f54ac86 100644
--- a/man/extract_metadata.Rd
+++ b/man/extract_metadata.Rd
@@ -7,7 +7,7 @@
 extract_metadata(file, password = NULL, copy = FALSE)
 }
 \arguments{
-\item{file}{A character string specifying the path or URL to a PDF file.}
+\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.}
 
 \item{password}{Optionally, a character string containing a user password to access a secured PDF.}
 
diff --git a/man/extract_text.Rd b/man/extract_text.Rd
index 269f0f9..21e4261 100644
--- a/man/extract_text.Rd
+++ b/man/extract_text.Rd
@@ -14,7 +14,7 @@ extract_text(
 )
 }
 \arguments{
-\item{file}{A character string specifying the path or URL to a PDF file.}
+\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.}
 
 \item{pages}{An optional integer vector specifying pages to extract from.}
 
diff --git a/man/get_page_dims.Rd b/man/get_page_dims.Rd
index 1f7c480..62c8ae9 100644
--- a/man/get_page_dims.Rd
+++ b/man/get_page_dims.Rd
@@ -10,7 +10,7 @@ get_page_dims(file, doc, pages = NULL, password = NULL, copy = FALSE)
 get_n_pages(file, doc, password = NULL, copy = FALSE)
 }
 \arguments{
-\item{file}{A character string specifying the path or URL to a PDF file.}
+\item{file}{A character string specifying the path or URL to a PDF file, or a raw vector containing PDF data.}
 
 \item{doc}{Optionally,, in lieu of \code{file}, an rJava reference to a PDDocument Java object.}