Skip to content

Commit

Permalink
Merge pull request #99 from meghdadFar/meghdadFar/fix-nltk-download-i…
Browse files Browse the repository at this point in the history
…ssue

Fix nltk download issue
  • Loading branch information
meghdadFar authored Aug 11, 2023
2 parents cc2c6a4 + 823424e commit 6d44278
Show file tree
Hide file tree
Showing 14 changed files with 1,049 additions and 990 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
python -m pip install poetry
- name: Install Dependencies
run: poetry install
- name: Download NLTK Resources
run: poetry run nltk_download_script
# - name: Download NLTK Resources
# run: poetry run nltk_download_script
- name: Run Tests
run: poetry run pytest --ignore=tests/clustering/
7 changes: 7 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
Version 1.1.2
-------------
- Automatic check and download of NLTK missing resources.
- Remove the CI step for downloading NLTK resources.
- Simplify configuration of the Text & Label Analysis plots by introducing new, clearer arguments.


Version 1.1.1
-------------
- Fix minor bugs in bias analysis.
Expand Down
7 changes: 0 additions & 7 deletions bin/downloads.py

This file was deleted.

20 changes: 20 additions & 0 deletions bin/nltk_resources.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import nltk
import os
from wordview import logger


def check_nltk_resources():
    """Download any NLTK resource required by wordview that is not installed.

    Each resource is looked up with ``nltk.data.find``, which searches every
    location on NLTK's data search path rather than only ``~/nltk_data/``.
    This avoids needless re-downloads when the data lives in a custom
    ``NLTK_DATA`` directory, a system-wide location, or is stored zipped.

    Returns:
        None. Missing resources are fetched with ``nltk.download`` as a
        side effect.
    """
    # Maps the resource path NLTK resolves to the package name to download.
    resources = {
        'tokenizers/punkt': 'punkt',
        'corpora/stopwords': 'stopwords',
        'taggers/averaged_perceptron_tagger': 'averaged_perceptron_tagger',
    }

    for path, package in resources.items():
        try:
            nltk.data.find(path)
        except LookupError:
            # Raised by nltk.data.find when the resource is in none of the
            # searched locations.
            logger.info(f"Downloading NLTK resource: {package}")
            nltk.download(package)
4 changes: 2 additions & 2 deletions docs/source/mwes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ the documentation.
# you can do it as follows:
from wordview.preprocessing import NgramExtractor
import pandas as pd
imdb_train = pd.read_csv("data/IMDB_Dataset_sample.csv")
extractor = NgramExtractor(imdb_train, "review")
imdb_corpus = pd.read_csv("data/IMDB_Dataset_sample.csv")
extractor = NgramExtractor(imdb_corpus, "review")
extractor.extract_ngrams()
extractor.get_ngram_counts(ngram_count_file_path="data/ngram_counts.json")
Expand Down
1,962 changes: 991 additions & 971 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "wordview"
version = "1.1.1"
version = "1.1.2"
description = "Wordview is a Python package for text analysis."
authors = ["meghdadFar <meghdad.farahmand@gmail.com>"]
readme = "README.rst"
Expand Down
13 changes: 6 additions & 7 deletions tests/mwe/test_mwe.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd
from wordview.mwes.mwe import MWE, MWEPatternAssociation
import nltk
from wordview.mwes.mwe import MWE


@pytest.fixture
Expand Down Expand Up @@ -64,11 +63,11 @@ def dummy_text_pandas_with_no_noun_compund():
dummy_pos_tags_without_noun_compund = [("no", "XXX"),("sequence", "XXX"),("of", "XXX"),("nouns", "XXX"),("in", "XXX"),("this", "XXX"),("one", "XXX")]


@pytest.fixture
def tagged_sentence_fixture():
sentence = "The very quick brown fox swiftly jumps over the lazy dog that is extremely lazy while John Doe attentively watches the lazy dog."
tokens = nltk.word_tokenize(sentence)
return tokens
# @pytest.fixture
# def tagged_sentence_fixture():
# sentence = "The very quick brown fox swiftly jumps over the lazy dog that is extremely lazy while John Doe attentively watches the lazy dog."
# tokens = nltk.word_tokenize(sentence)
# return tokens


class TestMweInitialisation:
Expand Down
3 changes: 3 additions & 0 deletions wordview/bias_analysis/bias.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,13 @@
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer

from bin.nltk_resources import check_nltk_resources
from wordview import logger
from wordview.bias_analysis import bias_terms
from wordview.io.dataframe_reader import DataFrameReader

check_nltk_resources()


class BiasDetector:
def __init__(self, df, text_column):
Expand Down
3 changes: 3 additions & 0 deletions wordview/io/dataframe_reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from nltk.tokenize import sent_tokenize

from bin.nltk_resources import check_nltk_resources
from wordview import logger

check_nltk_resources()


class DataFrameReader:
"""Reads a dataframe column and returns sentences."""
Expand Down
4 changes: 4 additions & 0 deletions wordview/mwes/mwe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from tabulate import tabulate # type: ignore
from tqdm import tqdm

from bin.nltk_resources import check_nltk_resources
from wordview import logger
from wordview.io.dataframe_reader import DataFrameReader
from wordview.mwes.association_measures import PMICalculator
Expand All @@ -21,6 +22,9 @@ def is_alphanumeric_latinscript_multigram(word: str) -> Optional[Match[str]]:
return match


check_nltk_resources()


class MWEPatternAssociation:
"""Extract MWE candidates from a list of tokens based on a given pattern."""

Expand Down
4 changes: 4 additions & 0 deletions wordview/preprocessing/cleaning.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

from nltk import word_tokenize

from bin.nltk_resources import check_nltk_resources

check_nltk_resources()


def clean_text(
text: str,
Expand Down
3 changes: 3 additions & 0 deletions wordview/preprocessing/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

from bin.nltk_resources import check_nltk_resources
from wordview import logger
from wordview.io.dataframe_reader import DataFrameReader

check_nltk_resources()


class NgramExtractor:
"""Extracts n-grams from a dataframe.
Expand Down
3 changes: 3 additions & 0 deletions wordview/text_analysis/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
from tqdm import tqdm
from wordcloud import WordCloud, get_single_color_func

from bin.nltk_resources import check_nltk_resources
from wordview import logger

check_nltk_resources()


def plotly_wordcloud(
token_count_dic: dict, plot_settings: Dict = {}
Expand Down

0 comments on commit 6d44278

Please sign in to comment.