diff --git a/.gitignore b/.gitignore
index 2f92791..517598d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,4 @@ sphinx/.buildinfo
 tx2.egg-info
 dist/*
 build/*
-data/*
\ No newline at end of file
+data/*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b11bb31..5b87462 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,6 +7,6 @@ repos:
       - id: trailing-whitespace
       - id: flake8
   - repo: https://github.com/psf/black
-    rev: 19.3b0
+    rev: 22.6.0
     hooks:
       - id: black
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f3fa56..9977bae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Updating example jupyter notebooks to use new versions of packages.
 - Datasources in jupyter example notebooks.
 
-### Fixed 
+### Fixed
 
 - Updated to patched numpy version 1.22.
 - Potential issue in calc.frequent_words_in_cluster() where clusters of empty string values would stop computation.
diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py
index 827f3dc..8530523 100644
--- a/tests/test_wrapper.py
+++ b/tests/test_wrapper.py
@@ -48,7 +48,7 @@ def test_wrapper_prepare_no_crash(
 
     wrapper.prepare(umap_args=dict(n_neighbors=2))
 
-    
+
 def test_wrapper_np_prepare_no_crash(
     dummy_np_data, dummy_encodings, dummy_model, clear_files_teardown
 ):
diff --git a/tx2/__init__.py b/tx2/__init__.py
index 73bca4a..76e72e8 100644
--- a/tx2/__init__.py
+++ b/tx2/__init__.py
@@ -1,3 +1,184 @@
 __version__ = "1.1.0"
 
-STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "via"]
+STOPWORDS = [
+    "i",
+    "me",
+    "my",
+    "myself",
+    "we",
+    "our",
+    "ours",
+    "ourselves",
+    "you",
+    "you're",
+    "you've",
+    "you'll",
+    "you'd",
+    "your",
+    "yours",
+    "yourself",
+    "yourselves",
+    "he",
+    "him",
+    "his",
+    "himself",
+    "she",
+    "she's",
+    "her",
+    "hers",
+    "herself",
+    "it",
+    "it's",
+    "its",
+    "itself",
+    "they",
+    "them",
+    "their",
+    "theirs",
+    "themselves",
+    "what",
+    "which",
+    "who",
+    "whom",
+    "this",
+    "that",
+    "that'll",
+    "these",
+    "those",
+    "am",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+ "been", + "being", + "have", + "has", + "had", + "having", + "do", + "does", + "did", + "doing", + "a", + "an", + "the", + "and", + "but", + "if", + "or", + "because", + "as", + "until", + "while", + "of", + "at", + "by", + "for", + "with", + "about", + "against", + "between", + "into", + "through", + "during", + "before", + "after", + "above", + "below", + "to", + "from", + "up", + "down", + "in", + "out", + "on", + "off", + "over", + "under", + "again", + "further", + "then", + "once", + "here", + "there", + "when", + "where", + "why", + "how", + "all", + "any", + "both", + "each", + "few", + "more", + "most", + "other", + "some", + "such", + "no", + "nor", + "not", + "only", + "own", + "same", + "so", + "than", + "too", + "very", + "s", + "t", + "can", + "will", + "just", + "don", + "don't", + "should", + "should've", + "now", + "d", + "ll", + "m", + "o", + "re", + "ve", + "y", + "ain", + "aren", + "aren't", + "couldn", + "couldn't", + "didn", + "didn't", + "doesn", + "doesn't", + "hadn", + "hadn't", + "hasn", + "hasn't", + "haven", + "haven't", + "isn", + "isn't", + "ma", + "mightn", + "mightn't", + "mustn", + "mustn't", + "needn", + "needn't", + "shan", + "shan't", + "shouldn", + "shouldn't", + "wasn", + "wasn't", + "weren", + "weren't", + "won", + "won't", + "wouldn", + "wouldn't", + "via", +] diff --git a/tx2/calc.py b/tx2/calc.py index 668ed29..f82ff7c 100644 --- a/tx2/calc.py +++ b/tx2/calc.py @@ -137,7 +137,7 @@ def frequent_words_in_cluster( :return: A list of tuples, each tuple containing the word and the number of times it appears in that cluster. """ - freq_words = [("",0)] + freq_words = [("", 0)] try: counter = CountVectorizer(stop_words=STOPWORDS) cv_fit = counter.fit_transform(texts) @@ -146,8 +146,10 @@ def frequent_words_in_cluster( key=lambda x: x[1], reverse=True, ) - except ValueError as e: - logging.warning(f"ValueError in frequent_words_in_cluster. Could be caused by cluster of empty text.") + except ValueError: + logging.warning( + "ValueError in frequent_words_in_cluster. Could be caused by cluster of empty text." 
+        )
     finally:
         return freq_words
 
@@ -186,9 +188,7 @@ def frequent_words_by_class_in_cluster(
     # iterate through each classification and get the number of entries with that word in it
     for classification in encodings.values():
         local_df = working_df[working_df.target == classification]
-        counter = CountVectorizer(
-            stop_words=STOPWORDS, vocabulary=vocab
-        )
+        counter = CountVectorizer(stop_words=STOPWORDS, vocabulary=vocab)
         cv_fit = counter.fit_transform(local_df.text.values)
         class_freq_words = list(
             zip(counter.get_feature_names_out(), cv_fit.toarray().sum(axis=0))
diff --git a/tx2/utils.py b/tx2/utils.py
index f77750e..b575129 100644
--- a/tx2/utils.py
+++ b/tx2/utils.py
@@ -1,5 +1,4 @@
 import asyncio
-import logging
 
 import numpy as np
 from torch import cuda, has_mps
diff --git a/tx2/visualization.py b/tx2/visualization.py
index 217fec4..1752ef9 100644
--- a/tx2/visualization.py
+++ b/tx2/visualization.py
@@ -3,13 +3,11 @@ import itertools
 import math
 from typing import Dict, List, Union
 
-import logging
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 from IPython.core.interactiveshell import InteractiveShell
-from IPython.display import clear_output, display
 from matplotlib.lines import Line2D
 from sklearn.metrics import confusion_matrix
 from wordcloud import WordCloud
@@ -384,8 +382,8 @@ def plot_embedding_projections(text, dashboard, prediction=None):
 
     # if text differs from the selected index, render new point
     if (
-        dashboard.prior_reference_point is None or
-        dashboard.prior_reference_text != text
+        dashboard.prior_reference_point is None
+        or dashboard.prior_reference_text != text
     ):
         text_projection = dashboard.transformer_wrapper.project([text])[0]
         if prediction is None:
@@ -471,11 +469,7 @@ def plot_embedding_projections(text, dashboard, prediction=None):
     fmt = InteractiveShell.instance().display_formatter.format
     data, metadata = fmt(fig)
    dashboard.out_projection_scatter.outputs = (
-        {
-            'output_type': 'display_data',
-            'data': data,
-            'metadata': metadata
-        },
+        {"output_type": "display_data", "data": data, "metadata": metadata},
     )
 
     dashboard.html_graph_status.value = (
diff --git a/tx2/wrapper.py b/tx2/wrapper.py
index c2b0217..2241ca8 100644
--- a/tx2/wrapper.py
+++ b/tx2/wrapper.py
@@ -61,8 +61,8 @@ class
        is being used and has a layer representing the output of just the language
        `_. Note that **this argument is not required**, if the user intends to manually
        specify encode and classification functions.
-    :param device: Set the device for pytorch to place tensors on, pass either "cpu", 
-        "cuda", or "mps". This variable is used by the default embedding function. 
+    :param device: Set the device for pytorch to place tensors on, pass either "cpu",
+        "cuda", or "mps". This variable is used by the default embedding function.
         If unspecified, "cuda" or "mps" will be used if GPU is found, otherwise it defaults to "cpu".
     :param cache_path: The directory path to cache intermediate outputs from the
         :meth:`tx2.wrapper.Wrapper.prepare` function. This allows the wrapper to precompute