diff --git a/CHANGELOG.md b/CHANGELOG.md index 46ccea5f..58cabc45 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ ## Changelog + +## 1.6.0 +- focus on precision, stricter extraction patterns (#103, #105, #106, #112) +- simplified code base (#108, #109) +- replaced lxml.html.Cleaner (#104) +- extended evaluation + ## 1.5.2 - fix for missing months keys in custom extractor (#100) - fix for None in `try_date_expr()` (#101) diff --git a/README.rst b/README.rst index 6720bc26..5443aa98 100644 --- a/README.rst +++ b/README.rst @@ -97,17 +97,17 @@ Performance ----------- =============================== ========= ========= ========= ========= ======= -500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8) +1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10) ------------------------------------------------------------------------------- Python Package Precision Recall Accuracy F-Score Time =============================== ========= ========= ========= ========= ======= -articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x -date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x -goose3 3.1.12 0.821 0.453 0.412 0.584 14x -htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x** -htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x -newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x -news-please 1.5.22 0.769 0.691 0.572 0.728 38x +articleDateExtractor 0.20 0.803 0.734 0.622 0.767 5x +date_guesser 2.1.4 0.781 0.600 0.514 0.679 18x +goose3 3.1.17 0.869 0.532 0.493 0.660 15x +htmldate[all] 1.6.0 (fast) **0.883** 0.924 0.823 0.903 **1x** +htmldate[all] 1.6.0 (extensive) 0.870 **0.993** **0.865** **0.928** 1.7x +newspaper3k 0.2.8 0.769 0.667 0.556 0.715 15x +news-please 1.5.35 0.801 0.768 0.645 0.784 34x =============================== ========= ========= ========= ========= ======= For complete results and explanations see the `evaluation page <https://htmldate.readthedocs.io/en/latest/evaluation.html>`_. 
diff --git a/docs/evaluation.rst b/docs/evaluation.rst index 51dedbdd..a3638b82 100644 --- a/docs/evaluation.rst +++ b/docs/evaluation.rst @@ -42,17 +42,17 @@ The results below show that **date extraction is not a completely solved task** =============================== ========= ========= ========= ========= ======= -500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8) +1000 web pages containing identifiable dates (as of 2023-11-13 on Python 3.10) ------------------------------------------------------------------------------- Python Package Precision Recall Accuracy F-Score Time =============================== ========= ========= ========= ========= ======= -articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x -date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x -goose3 3.1.12 0.821 0.453 0.412 0.584 14x -htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x** -htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x -newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x -news-please 1.5.22 0.769 0.691 0.572 0.728 38x +articleDateExtractor 0.20 0.803 0.734 0.622 0.767 5x +date_guesser 2.1.4 0.781 0.600 0.514 0.679 18x +goose3 3.1.17 0.869 0.532 0.493 0.660 15x +htmldate[all] 1.6.0 (fast) **0.883** 0.924 0.823 0.903 **1x** +htmldate[all] 1.6.0 (extensive) 0.870 **0.993** **0.865** **0.928** 1.7x +newspaper3k 0.2.8 0.769 0.667 0.556 0.715 15x +news-please 1.5.35 0.801 0.768 0.645 0.784 34x =============================== ========= ========= ========= ========= ======= @@ -72,6 +72,21 @@ Note on the different versions: Older Results ------------- +=============================== ========= ========= ========= ========= ======= +500 web pages containing identifiable dates (as of 2022-11-28 on Python 3.8) +------------------------------------------------------------------------------- +Python Package Precision Recall Accuracy F-Score Time +=============================== ========= ========= ========= ========= ======= 
+articleDateExtractor 0.20 0.769 0.691 0.572 0.728 4x +date_guesser 2.1.4 0.738 0.544 0.456 0.626 16x +goose3 3.1.12 0.821 0.453 0.412 0.584 14x +htmldate[all] 1.4.0 (fast) **0.856** 0.921 0.798 0.888 **1x** +htmldate[all] 1.4.0 (extensive) 0.847 **0.991** **0.840** **0.913** 2.2x +newspaper3k 0.2.8 0.729 0.630 0.510 0.675 13x +news-please 1.5.22 0.769 0.691 0.572 0.728 38x +=============================== ========= ========= ========= ========= ======= + + =============================== ========= ========= ========= ========= ======= 500 web pages containing identifiable dates (as of 2022-03-23 on Python 3.8) diff --git a/htmldate/__init__.py b/htmldate/__init__.py index 83b1c2af..72d3db4b 100644 --- a/htmldate/__init__.py +++ b/htmldate/__init__.py @@ -7,7 +7,7 @@ __author__ = "Adrien Barbaresi" __license__ = "GNU GPL v3" __copyright__ = "Copyright 2017-2023, Adrien Barbaresi" -__version__ = "1.5.2" +__version__ = "1.6.0" import logging diff --git a/setup.py b/setup.py index d2a2de78..27eb26a1 100644 --- a/setup.py +++ b/setup.py @@ -14,8 +14,7 @@ extras = { "speed": [ "backports-datetime-fromisoformat; python_version < '3.11'", - "cchardet >= 2.1.7; python_version < '3.11'", # build issue - "faust-cchardet >= 2.1.19; python_version >= '3.11'", # fix for build + "faust-cchardet >= 2.1.19", "urllib3[brotli]", ], } @@ -34,7 +33,7 @@ def get_long_description(): def get_version(package): "Return package version as listed in `__version__` in `init.py`" - initfile = Path(package, "__init__.py").read_text() # Python >= 3.5 + initfile = Path(package, "__init__.py").read_text() return re.search("__version__ = ['\"]([^'\"]+)['\"]", initfile)[1] @@ -117,7 +116,7 @@ def get_version(package): install_requires=[ "backports-datetime-fromisoformat; python_version < '3.7'", "charset_normalizer >= 3.0.1; python_version < '3.7'", - "charset_normalizer >= 3.3.0; python_version >= '3.7'", + "charset_normalizer >= 3.3.2; python_version >= '3.7'", "dateparser >= 1.1.2", # 1.1.3+ 
slower "lxml >= 4.9.3 ; platform_system != 'Darwin'", "lxml == 4.9.2 ; platform_system == 'Darwin'", diff --git a/tests/comparison.py b/tests/comparison.py index cfcc46ef..ea4b54a7 100644 --- a/tests/comparison.py +++ b/tests/comparison.py @@ -86,17 +86,14 @@ def run_newspaper(htmlstring): # throws error on the eval_default dataset try: myarticle = Article(htmlstring) - except (TypeError, UnicodeDecodeError): - return None - myarticle.html = htmlstring - myarticle.download_state = ArticleDownloadState.SUCCESS - try: + myarticle.html = htmlstring + myarticle.download_state = ArticleDownloadState.SUCCESS myarticle.parse() - except UnicodeEncodeError: + except (UnicodeDecodeError, UnicodeEncodeError): return None if myarticle.publish_date is None or myarticle.publish_date == "": return None - return convert_date(myarticle.publish_date, "%Y-%m-%d %H:%M:%S", "%Y-%m-%d") + return str(myarticle.publish_date)[0:10] def run_newsplease(htmlstring): @@ -129,11 +126,14 @@ def run_dateguesser(htmlstring): def run_goose(htmlstring): """try with the goose algorithm""" - article = G.extract(raw_html=htmlstring) + try: + article = G.extract(raw_html=htmlstring) + except (AttributeError, UnicodeDecodeError): + return None if article.publish_date is None: return None - datematch = re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", article.publish_date) try: + datematch = re.match(r"[0-9]{4}-[0-9]{2}-[0-9]{2}", article.publish_date) return datematch[0] # illogical result except TypeError: diff --git a/tests/eval-requirements.txt b/tests/eval-requirements.txt index 09833779..6ce9379f 100644 --- a/tests/eval-requirements.txt +++ b/tests/eval-requirements.txt @@ -1,5 +1,5 @@ # package -htmldate>=1.5.0 +htmldate>=1.6.0 # alternatives articleDateExtractor==0.20