From 43aa155660bd12cfec9433ba5aca0c14d395d3fa Mon Sep 17 00:00:00 2001
From: Kiran Jonnalagadda <kiran@hasgeek.com>
Date: Mon, 13 May 2024 23:32:25 +0530
Subject: [PATCH] Move over NLTK code from Coaster; revise Ruff config

---
 hasjob/tagging.py |  32 ++++++++++++++-
 pyproject.toml    | 102 +++++++++++++++++-----------------------------
 requirements.txt  |   2 +-
 3 files changed, 70 insertions(+), 66 deletions(-)

diff --git a/hasjob/tagging.py b/hasjob/tagging.py
index c2a5b156..ed174916 100644
--- a/hasjob/tagging.py
+++ b/hasjob/tagging.py
@@ -1,9 +1,10 @@
 from collections import defaultdict
+from collections.abc import Iterable
 from urllib.parse import urljoin
 
+import nltk
 import requests
 
-from coaster.nlp import extract_named_entities
 from coaster.utils import text_blocks
 
 from . import app, rq
@@ -25,6 +26,35 @@
 
 
 @rq.job('hasjob')
+def extract_named_entities(text_blocks: Iterable[str]) -> set[str]:
+    """Return a set of named entities extracted from the provided text blocks."""
+    sentences = []
+    for text in text_blocks:
+        sentences.extend(nltk.sent_tokenize(text))
+
+    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
+    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
+    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
+
+    def extract_entity_names(tree: nltk.Tree) -> list[str]:
+        entity_names = []
+
+        if hasattr(tree, "label"):
+            if tree.label() == "NE":
+                entity_names.append(" ".join(child[0] for child in tree))
+            else:
+                for child in tree:
+                    entity_names.extend(extract_entity_names(child))
+
+        return entity_names
+
+    entity_names = []
+    for tree in chunked_sentences:
+        entity_names.extend(extract_entity_names(tree))
+
+    return set(entity_names)
+
+
 def tag_locations(jobpost_id):
     with app.test_request_context():
         post = JobPost.query.get(jobpost_id)
diff --git a/pyproject.toml b/pyproject.toml
index 12c2a135..3bc2c7ee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,64 +86,6 @@ exclude_dirs = ['node_modules', 'build/lib']
 skips = ['*/*_test.py', '*/test_*.py']
 
 [tool.ruff]
-# This is a slight customisation of the default rules
-# 1. Hasjob still supports Python 3.7 pending its EOL
-# 2. Rule E402 (module-level import not top-level) is disabled as isort handles it
-# 3. Rule E501 (line too long) is left to Black; some strings are worse for wrapping
-
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E402", "E501"]
-
-# Allow autofix for all enabled rules (when `--fix`) is provided.
-fixable = [
-  "A",
-  "B",
-  "C",
-  "D",
-  "E",
-  "F",
-  "G",
-  "I",
-  "N",
-  "Q",
-  "S",
-  "T",
-  "W",
-  "ANN",
-  "ARG",
-  "BLE",
-  "COM",
-  "DJ",
-  "DTZ",
-  "EM",
-  "ERA",
-  "EXE",
-  "FBT",
-  "ICN",
-  "INP",
-  "ISC",
-  "NPY",
-  "PD",
-  "PGH",
-  "PIE",
-  "PL",
-  "PT",
-  "PTH",
-  "PYI",
-  "RET",
-  "RSE",
-  "RUF",
-  "SIM",
-  "SLF",
-  "TCH",
-  "TID",
-  "TRY",
-  "UP",
-  "YTT",
-]
-unfixable = []
-
 # Exclude a variety of commonly ignored directories.
 exclude = [
   ".bzr",
@@ -171,23 +113,48 @@ exclude = [
 # Same as Black.
 line-length = 88
 
+# Target Python 3.11
+target-version = "py311"
+
+[tool.ruff.format]
+docstring-code-format = true
+quote-style = "preserve"
+
+[tool.ruff.lint]
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 
-# Target Python 3.11
-target-version = "py311"
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+select = ["E", "F"]
+ignore = ["E402", "E501"]
 
-[tool.ruff.mccabe]
+# Allow autofix for all enabled rules (when `--fix` is provided).
+fixable = ["ALL"]
+unfixable = []
+
+# Allow these characters in strings
+allowed-confusables = ["‘", "’"]
+
+[tool.ruff.lint.mccabe]
 # Unlike Flake8, default to a complexity level of 10.
 max-complexity = 10
 
-[tool.ruff.isort]
+[tool.ruff.lint.extend-per-file-ignores]
+"__init__.py" = ["E402"] # Allow non-top-level imports
+"tests/**.py" = [
+  "S101",   # Allow assert
+  "ANN001", # Args don't need types (usually fixtures)
+  "N802",   # Fixture returning a class may be named per class name convention
+  "N803",   # Args don't require naming convention (fixture could be a class)
+]
+
+[tool.ruff.lint.isort]
 # These config options should match isort config above under [tool.isort]
 combine-as-imports = true
 extra-standard-library = ['typing_extensions']
 split-on-trailing-comma = false
 relative-imports-order = 'furthest-to-closest'
-known-first-party = ['coaster']
+known-first-party = ['coaster', 'baseframe', 'flask_lastuser']
 section-order = [
   'future',
   'standard-library',
@@ -197,5 +164,12 @@ section-order = [
   'local-folder',
 ]
 
-[tool.ruff.isort.sections]
+[tool.ruff.lint.isort.sections]
 repo = ['hasjob']
+
+[tool.ruff.lint.flake8-pytest-style]
+fixture-parentheses = false
+mark-parentheses = false
+
+[tool.ruff.lint.pyupgrade]
+keep-runtime-typing = true
diff --git a/requirements.txt b/requirements.txt
index 8eee7d60..1bf9cc8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,7 +11,6 @@ Flask-Migrate
 Flask-Redis
 Flask-RQ2
 Flask-SQLAlchemy
-Flask-Testing
 git+https://github.com/maxcountryman/flask-uploads.git#egg=Flask-Uploads
 Flask-WTF
 geoip2
@@ -19,6 +18,7 @@ gunicorn
 html2text
 jsmin
 langid
+nltk
 Pillow
 premailer
 progressbar2