From 991a3a1012433baafe7b09f03a04168c4e8c6b67 Mon Sep 17 00:00:00 2001
From: Dusty White
Date: Mon, 25 Oct 2021 12:34:40 -0500
Subject: [PATCH 1/2] trying to force timeout for urls

---
 pdfx/downloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pdfx/downloader.py b/pdfx/downloader.py
index 92f0ecb..8852648 100644
--- a/pdfx/downloader.py
+++ b/pdfx/downloader.py
@@ -46,7 +46,7 @@ def get_status_code(url):
             "Mozilla/5.0 (compatible; MSIE 9.0; " "Windows NT 6.1; Trident/5.0)",
         )
         request.get_method = lambda: "HEAD"
-        response = urlopen(request, context=ssl_unverified_context)
+        response = urlopen(request, context=ssl_unverified_context, timeout=10)
         # print response.info()
         return response.getcode()
     except HTTPError as e:

From 87933705dde179b0836753039021e79e62475bd2 Mon Sep 17 00:00:00 2001
From: Dusty White
Date: Tue, 26 Oct 2021 11:04:54 -0500
Subject: [PATCH 2/2] added cli interface for timeout parameter

---
 pdfx/cli.py        | 12 +++++++++++-
 pdfx/downloader.py | 16 ++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/pdfx/cli.py b/pdfx/cli.py
index a872415..ae7f86d 100644
--- a/pdfx/cli.py
+++ b/pdfx/cli.py
@@ -94,6 +94,13 @@ def create_parser():
         action="version",
         version="%(prog)s v{version}".format(version=pdfx.__version__),
     )
+
+    parser.add_argument(
+        "-f",
+        "--fast-timeout",
+        action="store_true",
+        help="Apply 10 second timeout for URLOPEN to prevent hanging processes",
+    )
     return parser
 
 
@@ -201,7 +208,10 @@ def main():
         refs_all = pdf.get_references()
         refs = [ref for ref in refs_all if ref.reftype in ["url", "pdf"]]
         print("\nChecking %s URLs for broken links..." % len(refs))
-        check_refs(refs)
+        if args.fast_timeout:
+            check_refs(refs, timeout=True)
+        else:
+            check_refs(refs)
 
     try:
         if args.download_pdfs:
diff --git a/pdfx/downloader.py b/pdfx/downloader.py
index 8852648..5a3d8f1 100644
--- a/pdfx/downloader.py
+++ b/pdfx/downloader.py
@@ -3,6 +3,7 @@
 from .colorprint import colorprint, OKGREEN, FAIL
 from .threadpool import ThreadPool
 from collections import defaultdict
+from itertools import repeat
 import ssl
 import os
 import sys
@@ -37,7 +38,7 @@ def sanitize_url(url):
     return url
 
 
-def get_status_code(url):
+def get_status_code(url, timeout):
     """ Perform HEAD request and return status code """
     try:
         request = Request(sanitize_url(url))
@@ -46,7 +47,10 @@ def get_status_code(url):
             "Mozilla/5.0 (compatible; MSIE 9.0; " "Windows NT 6.1; Trident/5.0)",
         )
         request.get_method = lambda: "HEAD"
-        response = urlopen(request, context=ssl_unverified_context, timeout=10)
+        if timeout:
+            response = urlopen(request, context=ssl_unverified_context, timeout=10)
+        else:
+            response = urlopen(request, context=ssl_unverified_context)
         # print response.info()
         return response.getcode()
     except HTTPError as e:
@@ -58,13 +62,13 @@ def get_status_code(url):
         return None
 
 
-def check_refs(refs, verbose=True, max_threads=MAX_THREADS_DEFAULT):
+def check_refs(refs, verbose=True, max_threads=MAX_THREADS_DEFAULT, timeout=False):
     """ Check if urls exist """
     codes = defaultdict(list)
 
-    def check_url(ref):
+    def check_url(ref, timeout):
         url = ref.ref
-        status_code = str(get_status_code(url))
+        status_code = str(get_status_code(url, timeout))
         codes[status_code].append(ref)
         if verbose:
             if status_code == "200":
@@ -75,7 +79,7 @@ def check_url(ref):
     # Start a threadpool and add the check-url tasks
     try:
         pool = ThreadPool(5)
-        pool.map(check_url, refs)
+        pool.starmap(check_url, zip(refs, repeat(timeout)))
         pool.wait_completion()
     except Exception as e: