Fix pre-commit and re-run linters/fixers
WarmCyan committed Aug 30, 2022
1 parent f1d5e88 commit 656f481
Showing 9 changed files with 197 additions and 23 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -11,4 +11,4 @@ sphinx/.buildinfo
tx2.egg-info
dist/*
build/*
data/*
data/*
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -7,6 +7,6 @@ repos:
- id: trailing-whitespace
- id: flake8
- repo: https://github.com/psf/black
rev: 19.3b0
rev: 22.6.0
hooks:
- id: black
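Bumping black from 19.3b0 to 22.6.0 and re-running the hooks (for example with `pre-commit run --all-files`) is presumably what produced the mechanical reformatting in the Python files below.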
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Updating example jupyter notebooks to use new versions of packages.
- Datasources in jupyter example notebooks.

### Fixed
### Fixed
- Updated to patched numpy version 1.22.
- Potential issue in calc.frequent_words_in_cluster() where clusters of empty
string values would stop computation.
2 changes: 1 addition & 1 deletion tests/test_wrapper.py
@@ -48,7 +48,7 @@ def test_wrapper_prepare_no_crash(

wrapper.prepare(umap_args=dict(n_neighbors=2))


def test_wrapper_np_prepare_no_crash(
dummy_np_data, dummy_encodings, dummy_model, clear_files_teardown
):
183 changes: 182 additions & 1 deletion tx2/__init__.py
@@ -1,3 +1,184 @@
__version__ = "1.1.0"

STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "via"]
STOPWORDS = [
"i",
"me",
"my",
"myself",
"we",
"our",
"ours",
"ourselves",
"you",
"you're",
"you've",
"you'll",
"you'd",
"your",
"yours",
"yourself",
"yourselves",
"he",
"him",
"his",
"himself",
"she",
"she's",
"her",
"hers",
"herself",
"it",
"it's",
"its",
"itself",
"they",
"them",
"their",
"theirs",
"themselves",
"what",
"which",
"who",
"whom",
"this",
"that",
"that'll",
"these",
"those",
"am",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"having",
"do",
"does",
"did",
"doing",
"a",
"an",
"the",
"and",
"but",
"if",
"or",
"because",
"as",
"until",
"while",
"of",
"at",
"by",
"for",
"with",
"about",
"against",
"between",
"into",
"through",
"during",
"before",
"after",
"above",
"below",
"to",
"from",
"up",
"down",
"in",
"out",
"on",
"off",
"over",
"under",
"again",
"further",
"then",
"once",
"here",
"there",
"when",
"where",
"why",
"how",
"all",
"any",
"both",
"each",
"few",
"more",
"most",
"other",
"some",
"such",
"no",
"nor",
"not",
"only",
"own",
"same",
"so",
"than",
"too",
"very",
"s",
"t",
"can",
"will",
"just",
"don",
"don't",
"should",
"should've",
"now",
"d",
"ll",
"m",
"o",
"re",
"ve",
"y",
"ain",
"aren",
"aren't",
"couldn",
"couldn't",
"didn",
"didn't",
"doesn",
"doesn't",
"hadn",
"hadn't",
"hasn",
"hasn't",
"haven",
"haven't",
"isn",
"isn't",
"ma",
"mightn",
"mightn't",
"mustn",
"mustn't",
"needn",
"needn't",
"shan",
"shan't",
"shouldn",
"shouldn't",
"wasn",
"wasn't",
"weren",
"weren't",
"won",
"won't",
"wouldn",
"wouldn't",
"via",
]
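A minimal usage sketch for this list, mirroring how tx2.calc feeds it to scikit-learn's CountVectorizer in the hunks below (the sample texts here are made up for illustration):

from sklearn.feature_extraction.text import CountVectorizer

from tx2 import STOPWORDS

# Count non-stopword tokens across a tiny illustrative corpus.
counter = CountVectorizer(stop_words=STOPWORDS)
counts = counter.fit_transform(["the cat sat on the mat", "a dog chased the cat"])
print(list(zip(counter.get_feature_names_out(), counts.toarray().sum(axis=0))))
# [('cat', 2), ('chased', 1), ('dog', 1), ('mat', 1), ('sat', 1)]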
12 changes: 6 additions & 6 deletions tx2/calc.py
@@ -137,7 +137,7 @@ def frequent_words_in_cluster(
:return: A list of tuples, each tuple containing the word and the number of times
it appears in that cluster.
"""
freq_words = [("",0)]
freq_words = [("", 0)]
try:
counter = CountVectorizer(stop_words=STOPWORDS)
cv_fit = counter.fit_transform(texts)
@@ -146,8 +146,10 @@
key=lambda x: x[1],
reverse=True,
)
except ValueError as e:
logging.warning(f"ValueError in frequent_words_in_cluster. Could be caused by cluster of empty text.")
except ValueError:
logging.warning(
"ValueError in frequent_words_in_cluster. Could be caused by cluster of empty text."
)
finally:
return freq_words
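The except branch matters because scikit-learn raises a ValueError when every text in a cluster is empty or consists only of stop words, which is the failure the CHANGELOG entry above describes. A quick illustration using standard scikit-learn behavior:

from sklearn.feature_extraction.text import CountVectorizer

try:
    # A cluster made entirely of empty strings leaves nothing to vectorize.
    CountVectorizer().fit_transform(["", ""])
except ValueError as err:
    print(err)  # "empty vocabulary; perhaps the documents only contain stop words"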

@@ -186,9 +188,7 @@ def frequent_words_by_class_in_cluster(
# iterate through each classification and get the number of entries with that word in it
for classification in encodings.values():
local_df = working_df[working_df.target == classification]
counter = CountVectorizer(
stop_words=STOPWORDS, vocabulary=vocab
)
counter = CountVectorizer(stop_words=STOPWORDS, vocabulary=vocab)
cv_fit = counter.fit_transform(local_df.text.values)
class_freq_words = list(
zip(counter.get_feature_names_out(), cv_fit.toarray().sum(axis=0))
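The vocabulary= argument in the call above is what keeps per-class counts aligned with the cluster-wide word list; a small sketch of that pattern (the vocab and texts are hypothetical):

from sklearn.feature_extraction.text import CountVectorizer

from tx2 import STOPWORDS

vocab = ["cat", "dog", "mat"]  # hypothetical cluster-wide vocabulary
counter = CountVectorizer(stop_words=STOPWORDS, vocabulary=vocab)
counts = counter.fit_transform(["the cat sat on the mat", "dog dog"])
# Columns follow the fixed vocabulary, so counts computed per class stay comparable.
print(dict(zip(counter.get_feature_names_out(), counts.toarray().sum(axis=0))))
# {'cat': 1, 'dog': 2, 'mat': 1}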
1 change: 0 additions & 1 deletion tx2/utils.py
@@ -1,5 +1,4 @@
import asyncio
import logging
import numpy as np
from torch import cuda, has_mps
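These torch imports back the device fallback documented in the wrapper.py docstring below. A plausible sketch of that check (the helper name is invented here; this is not necessarily the wrapper's exact code):

from torch import cuda, has_mps

def pick_device() -> str:
    # Documented fallback order: "cuda" if a GPU is found, then "mps", else "cpu".
    if cuda.is_available():
        return "cuda"
    if has_mps:
        return "mps"
    return "cpu"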

12 changes: 3 additions & 9 deletions tx2/visualization.py
@@ -3,13 +3,11 @@
import itertools
import math
from typing import Dict, List, Union
import logging

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import clear_output, display
from matplotlib.lines import Line2D
from sklearn.metrics import confusion_matrix
from wordcloud import WordCloud
@@ -384,8 +382,8 @@ def plot_embedding_projections(text, dashboard, prediction=None):

# if text differs from the selected index, render new point
if (
dashboard.prior_reference_point is None or
dashboard.prior_reference_text != text
dashboard.prior_reference_point is None
or dashboard.prior_reference_text != text
):
text_projection = dashboard.transformer_wrapper.project([text])[0]
if prediction is None:
@@ -471,11 +469,7 @@ def plot_embedding_projections(text, dashboard, prediction=None):
fmt = InteractiveShell.instance().display_formatter.format
data, metadata = fmt(fig)
dashboard.out_projection_scatter.outputs = (
{
'output_type': 'display_data',
'data': data,
'metadata': metadata
},
{"output_type": "display_data", "data": data, "metadata": metadata},
)

dashboard.html_graph_status.value = (
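The reformatted dict in the last hunk is the raw display-message payload that an ipywidgets Output widget accepts. A self-contained sketch of the pattern (the figure contents are illustrative; assigning outputs directly is presumably why the clear_output/display imports above were dropped):

import ipywidgets
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell

out = ipywidgets.Output()
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1])

# Format the figure into MIME data/metadata, then set the widget's outputs
# directly rather than clearing and re-displaying inside a context manager.
fmt = InteractiveShell.instance().display_formatter.format
data, metadata = fmt(fig)
out.outputs = ({"output_type": "display_data", "data": data, "metadata": metadata},)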
4 changes: 2 additions & 2 deletions tx2/wrapper.py
@@ -61,8 +61,8 @@ class is being used and has a layer representing the output of just the language
<https://huggingface.co/transformers/main_classes/tokenizer.html>`_. Note that **this
argument is not required**, if the user intends to manually specify encode and
classification functions.
:param device: Set the device for pytorch to place tensors on, pass either "cpu",
"cuda", or "mps". This variable is used by the default embedding function.
:param device: Set the device for pytorch to place tensors on, pass either "cpu",
"cuda", or "mps". This variable is used by the default embedding function.
If unspecified, "cuda" or "mps" will be used if GPU is found, otherwise it defaults to "cpu".
:param cache_path: The directory path to cache intermediate outputs from the
:meth:`tx2.wrapper.Wrapper.prepare` function. This allows the wrapper to precompute
