Skip to content

Commit

Permalink
add averaged_perceptron_tagger to nltk downloadables
Browse files Browse the repository at this point in the history
  • Loading branch information
fonhorst committed Jun 8, 2024
1 parent d33f84e commit ee8245e
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion autotm/preprocessing/text_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,16 @@
logger = logging.getLogger(__name__)

# TODO: make transformer class and prep function to download all files
nltk_components = ['corpora/stopwords', 'corpora/wordnet.zip']
nltk_components = ['corpora/stopwords', 'corpora/wordnet.zip', 'taggers/averaged_perceptron_tagger']

# Make sure every NLTK resource listed in nltk_components is available
# locally, fetching the ones that are missing.
for nltk_component in nltk_components:
    try:
        # nltk.data.find raises LookupError when the resource is not installed.
        nltk.data.find(nltk_component)
    except LookupError:
        # Entries are resource paths such as "corpora/wordnet.zip"; the
        # downloader is given the bare package id ("wordnet"), so strip the
        # directory and the extension.
        nltk_component_name = os.path.splitext(os.path.basename(nltk_component))[0]
        logger.warning(f"NLTK {nltk_component_name} component is not found. Downloading it...")
        # With default arguments nltk.download signals failure through a
        # falsy return value instead of raising, so check it before claiming
        # success. Failure is only logged (not raised) to keep this step
        # best-effort, as it was before.
        if nltk.download(nltk_component_name):
            logger.debug(f"NLTK {nltk_component_name} component has been downloaded.")
        else:
            logger.error(f"NLTK {nltk_component_name} component could not be downloaded.")

stop = stopwords.words("russian") + [" "] + stopwords.words("english")

Expand Down

0 comments on commit ee8245e

Please sign in to comment.