Merge pull request #1932 from haxtibal/tdmg/system_chromedriver

HTML2PDF: System chromedriver for PDF export
strictdoc-project · Aug 7, 2024 · 1b8f3cf · 1b8f3cf
2 parents c639d25 + 8b651d8
commit 1b8f3cf
Show file tree

Hide file tree

Showing 16 changed files with 116 additions and 24 deletions.
diff --git a/docs/strictdoc_01_user_guide.sdoc b/docs/strictdoc_01_user_guide.sdoc
@@ -2642,7 +2642,10 @@ There are three methods of PDF printing available:
 
 3. Also in the web interface, by navigating to a 'PDF' view of a document and using the browser's built-in Print function.
 
-The first two methods require the Chrome browser to be installed on the user's computer.
+The first two methods require the Chrome browser and chromedriver to be installed on the user's computer.
+By default, StrictDoc downloads chromedriver on demand; it uses a pre-installed executable instead if
+``strictdoc export --chromedriver=/path/to/chromedriver`` or the equivalent ``strictdoc.toml`` option
+is given.
 
 The third method, the PDF screen, presents a version of the document that is optimized for browser printing. This approach allows for the creation of neatly formatted PDF documents or directly printed documents. Although this method is compatible with any browser, Chrome is recommended for the best printing results. Unlike Firefox and Safari, Chrome maintains the document's internal hyperlinks in the printed PDF.
 

diff --git a/strictdoc/cli/cli_arg_parser.py b/strictdoc/cli/cli_arg_parser.py
@@ -146,6 +146,7 @@ def __init__(
  reqif_multiline_is_xhtml: bool,
  reqif_enable_mid: bool,
  view: Optional[str],
+ chromedriver: Optional[str],
  ):
  assert isinstance(input_paths, list), f"{input_paths}"
  self.input_paths: List[str] = input_paths
@@ -165,6 +166,7 @@ def __init__(
  self.reqif_enable_mid: bool = reqif_enable_mid
  self.view: Optional[str] = view
  self.output_html_root: str = os.path.join(output_dir, "html")
+ self.chromedriver: Optional[str] = chromedriver
 
  def get_path_to_config(self) -> str:
  # FIXME: The control flow can be improved.
@@ -298,6 +300,7 @@ def get_export_config(self) -> ExportCommandConfig:
  self.args.reqif_multiline_is_xhtml,
  self.args.reqif_enable_mid,
  self.args.view,
+ self.args.chromedriver,
  )
 
  def get_import_config_reqif(self, _) -> ImportReqIFCommandConfig:

diff --git a/strictdoc/cli/command_parser_builder.py b/strictdoc/cli/command_parser_builder.py
@@ -243,6 +243,13 @@ def add_export_command(parent_command_parser):
  type=str,
  help="Choose which view will be exported.",
  )
+ command_parser_export.add_argument(
+ "--chromedriver",
+ type=str,
+ help="Path to pre installed chromedriver for html2pdf. "
+ "If not given, chromedriver is downloaded and saved to "
+ "strictdoc cache.",
+ )
  add_config_argument(command_parser_export)
 
  @staticmethod

diff --git a/strictdoc/core/project_config.py b/strictdoc/core/project_config.py
@@ -95,6 +95,7 @@ def __init__(
  reqif_enable_mid: bool,
  reqif_import_markup: Optional[str],
  config_last_update: Optional[datetime.datetime],
+ chromedriver: Optional[str],
  ):
  assert isinstance(environment, SDocRuntimeEnvironment)
  if source_root_path is not None:
@@ -149,6 +150,7 @@ def __init__(
  )
  self.is_running_on_server: bool = False
  self.view: Optional[str] = None
+ self.chromedriver: Optional[str] = chromedriver
 
  @staticmethod
  def default_config(environment: SDocRuntimeEnvironment):
@@ -172,6 +174,7 @@ def default_config(environment: SDocRuntimeEnvironment):
  reqif_enable_mid=False,
  reqif_import_markup=None,
  config_last_update=None,
+ chromedriver=None,
  )
 
  # Some server command settings can override the project config settings.
@@ -194,6 +197,7 @@ def integrate_export_config(self, export_config: ExportCommandConfig):
  self.filter_sections = export_config.filter_sections
  self.excel_export_fields = export_config.fields
  self.view = export_config.view
+ self.chromedriver = export_config.chromedriver
  if self.source_root_path is None:
  source_root_path = export_config.input_paths[0]
  if not os.path.abspath(source_root_path):
@@ -368,6 +372,7 @@ def _load_from_dictionary(
  reqif_multiline_is_xhtml = False
  reqif_enable_mid = False
  reqif_import_markup: Optional[str] = None
+ chromedriver: Optional[str] = None
 
  if "project" in config_dict:
  project_content = config_dict["project"]
@@ -507,6 +512,13 @@ def _load_from_dictionary(
  assert relation_tuple is not None
  traceability_matrix_relation_columns.append(relation_tuple)
 
+ chromedriver = project_content.get("chromedriver", chromedriver)
+ if chromedriver is not None and not os.path.isfile(chromedriver):
+ print( # noqa: T201
+ f"warning: strictdoc.toml: chromedriver {chromedriver} "
+ "not found."
+ )
+
  if "server" in config_dict:
  # FIXME: Introduce at least a basic validation for the host/port.
  server_content = config_dict["server"]
@@ -554,4 +566,5 @@ def _load_from_dictionary(
  reqif_enable_mid=reqif_enable_mid,
  reqif_import_markup=reqif_import_markup,
  config_last_update=config_last_update,
+ chromedriver=chromedriver,
  )
diff --git a/strictdoc/export/html2pdf/html2pdf.py b/strictdoc/export/html2pdf/html2pdf.py
@@ -164,18 +164,20 @@ def get_pdf_from_html(driver, url) -> bytes:
  return data
 
 
-def create_webdriver():
+def create_webdriver(chromedriver: Optional[str]):
  print("HTML2PDF: creating Chrome Driver service.", flush=True) # noqa: T201
+ if chromedriver is None:
+ cache_manager = HTML2PDF_CacheManager(
+ file_manager=FileManager(os_system_manager=OperationSystemManager())
+ )
 
- cache_manager = HTML2PDF_CacheManager(
- file_manager=FileManager(os_system_manager=OperationSystemManager())
- )
-
- http_client = HTML2PDF_HTTPClient()
- download_manager = WDMDownloadManager(http_client)
- path_to_chrome = ChromeDriverManager(
- download_manager=download_manager, cache_manager=cache_manager
- ).install()
+ http_client = HTML2PDF_HTTPClient()
+ download_manager = WDMDownloadManager(http_client)
+ path_to_chrome = ChromeDriverManager(
+ download_manager=download_manager, cache_manager=cache_manager
+ ).install()
+ else:
+ path_to_chrome = chromedriver
  print(f"HTML2PDF: Chrome Driver available at path: {path_to_chrome}") # noqa: T201
 
  service = Service(path_to_chrome)
@@ -211,14 +213,19 @@ def main():
  os.environ["WDM_LOCAL"] = "1"
 
  parser = argparse.ArgumentParser(description="HTML2PDF printer script.")
+ parser.add_argument(
+ "--chromedriver",
+ type=str,
+ help="Optional chromedriver path. Downloaded if not given.",
+ )
  parser.add_argument("paths", help="Paths to input HTML file.")
  args = parser.parse_args()
 
  paths = args.paths
 
  separate_path_pairs = paths.split(";")
 
- driver = create_webdriver()
+ driver = create_webdriver(args.chromedriver)
 
  @atexit.register
  def exit_handler():

diff --git a/strictdoc/export/html2pdf/html2pdf_generator.py b/strictdoc/export/html2pdf/html2pdf_generator.py
@@ -106,6 +106,9 @@ def export_tree(
  )
  pdf_print_driver = PDFPrintDriver()
  try:
- pdf_print_driver.get_pdf_from_html(paths_to_print_argument)
+ pdf_print_driver.get_pdf_from_html(
+ project_config,
+ paths_to_print_argument,
+ )
  except TimeoutError:
  print("error: HTML2PDF: timeout error.") # noqa: T201
diff --git a/strictdoc/export/html2pdf/pdf_print_driver.py b/strictdoc/export/html2pdf/pdf_print_driver.py
@@ -3,26 +3,33 @@
 from subprocess import CompletedProcess, TimeoutExpired, run
 
 from strictdoc import environment
+from strictdoc.core.project_config import ProjectConfig
 from strictdoc.helpers.timing import measure_performance
 
 
 class PDFPrintDriver:
  @staticmethod
- def get_pdf_from_html(paths_to_print: str):
+ def get_pdf_from_html(
+ project_config: ProjectConfig,
+ paths_to_print: str,
+ ):
  assert isinstance(paths_to_print, str)
+ cmd = [
+ # Using sys.executable instead of "python" is important because
+ # venv subprocess call to python resolves to wrong interpreter,
+ # https://github.com/python/cpython/issues/86207
+ sys.executable,
+ environment.get_path_to_html2pdf(),
+ paths_to_print,
+ ]
+ if project_config.chromedriver is not None:
+ cmd.extend(["--chromedriver", project_config.chromedriver])
  with measure_performance(
  "PDFPrintDriver: printing HTML to PDF using HTML2PDF and Chrome Driver"
  ):
  try:
  _: CompletedProcess = run(
- [
- # Using sys.executable instead of "python" is important because
- # venv subprocess call to python resolves to wrong interpreter,
- # https://github.com/python/cpython/issues/86207
- sys.executable,
- environment.get_path_to_html2pdf(),
- paths_to_print,
- ],
+ cmd,
  capture_output=False,
  check=False,
  )

diff --git a/strictdoc/server/routers/main_router.py b/strictdoc/server/routers/main_router.py
@@ -157,6 +157,7 @@ def create_main_router(
  reqif_multiline_is_xhtml=False,
  reqif_enable_mid=False,
  view=None,
+ chromedriver=None,
  )
  project_config.integrate_export_config(_export_config)
  project_config.is_running_on_server = True
@@ -2589,7 +2590,8 @@ def get_export_html2pdf(document_mid: str): # noqa: ARG001
 
  try:
  pdf_print_driver.get_pdf_from_html(
- f"{path_to_output_html},{path_to_output_pdf}"
+ project_config,
+ f"{path_to_output_html},{path_to_output_pdf}",
  )
  except TimeoutError:
  return Response(

diff --git a/tasks.py b/tasks.py
@@ -332,10 +332,16 @@ def test_integration(
  if not html2pdf:
  parallelize_opts = "" if not no_parallelization else "--threads 1"
  html2pdf_param = ""
+ chromedriver_param = ""
  test_folder = f"{cwd}/tests/integration"
  else:
  parallelize_opts = "--threads 1"
  html2pdf_param = "--param TEST_HTML2PDF=1"
+ chromedriver_path = os.environ.get("CHROMEWEBDRIVER")
+ assert (
+ chromedriver_path is not None
+ ), "TEST_HTML2PDF expects path to chromedriver in environment variable CHROMEWEBDRIVER"
+ chromedriver_param = f"--param CHROMEDRIVER={os.path.join(chromedriver_path, 'chromedriver')}"
  test_folder = f"{cwd}/tests/integration/features/html2pdf"
 
  strictdoc_cache_dir = os.path.join(tempfile.gettempdir(), "strictdoc_cache")
@@ -345,6 +351,7 @@ def test_integration(
  --param STRICTDOC_EXEC="{strictdoc_exec}"
  --param STRICTDOC_CACHE_DIR="{strictdoc_cache_dir}"
  {html2pdf_param}
+ {chromedriver_param}
  -v
  {debug_opts}
  {focus_or_none}

diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc b/tests/integration/features/html2pdf/06_system_chromedriver/input.sdoc
@@ -0,0 +1,6 @@
+[DOCUMENT]
+TITLE: Dummy Software Requirements Specification #1
+
+[FREETEXT]
+Hello world! 😊😊😊
+[/FREETEXT]
diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml b/tests/integration/features/html2pdf/06_system_chromedriver/strictdoc.toml
@@ -0,0 +1,5 @@
+[project]
+
+features = [
+ "HTML2PDF",
+]
diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/test.itest b/tests/integration/features/html2pdf/06_system_chromedriver/test.itest
@@ -0,0 +1,16 @@
+REQUIRES: TEST_HTML2PDF
+
+# FIXME: Getting timeouts on Windows CI all the time. Needs to be checked or tested by users.
+REQUIRES: PLATFORM_IS_NOT_WINDOWS
+
+# GitHub runner images provide a chromedriver and export its install location in CHROMEWEBDRIVER, see
+# https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md#browsers-and-drivers
+RUN: STRICTDOC_CACHE_DIR=%strictdoc_cache_dir %strictdoc export %S --formats=html2pdf --chromedriver=%chromedriver --output-dir Output | filecheck %s --dump-input=fail
+CHECK: HTML2PDF: JS logs from the print session
+CHECK-NOT: HTML2PDF: Chrome Driver available at path: {{.*}}strictdoc_cache{{.*}}
+
+RUN: %check_exists --file %S/Output/html2pdf/pdf/input.pdf
+
+RUN: %check_exists --file %S/Output/html2pdf/html/06_system_chromedriver/input.html
+
+RUN: python %S/test_pdf.py
diff --git a/tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py b/tests/integration/features/html2pdf/06_system_chromedriver/test_pdf.py
@@ -0,0 +1,8 @@
+from pypdf import PdfReader
+
+reader = PdfReader("Output/html2pdf/pdf/input.pdf")
+
+assert len(reader.pages) == 3, reader.pages
+
+# page2_text = reader.pages[1].extract_text() # noqa: ERA001
+# assert "Table of contents" not in page2_text # noqa: ERA001
diff --git a/tests/integration/lit.cfg b/tests/integration/lit.cfg
@@ -41,5 +41,8 @@ if not lit_config.isWindows:
  config.available_features.add('PLATFORM_IS_NOT_WINDOWS')
 
 if "TEST_HTML2PDF" in lit_config.params:
+ chromedriver = lit_config.params['CHROMEDRIVER']
+ assert(chromedriver)
  config.available_features.add('TEST_HTML2PDF')
+ config.substitutions.append(('%chromedriver', chromedriver))
  config.name = "StrictDoc HTML2PDF integration tests"
diff --git a/tests/unit/strictdoc/cli/test_cli_arg_parser.py b/tests/unit/strictdoc/cli/test_cli_arg_parser.py
@@ -8,7 +8,7 @@
 FAKE_STRICTDOC_ROOT_PATH = "/tmp/strictdoc-123"
 
 
-TOTAL_EXPORT_ARGS = 17
+TOTAL_EXPORT_ARGS = 18
 
 
 def cli_args_parser():

diff --git a/tox.ini b/tox.ini
@@ -27,6 +27,8 @@ skip_install = true
 deps =
  -rrequirements.bootstrap.txt
  -rrequirements.check.txt
+pass_env=
+ CHROMEWEBDRIVER
 commands =
  python developer/pip_install_strictdoc_deps.py
  {posargs}