diff --git a/sphinx-docs/source/chat.rst b/sphinx-docs/source/chat.rst index 435e2dc..75f6ae6 100644 --- a/sphinx-docs/source/chat.rst +++ b/sphinx-docs/source/chat.rst @@ -19,7 +19,7 @@ call the `chat` method to interact with the data and get insights from it via Na from wordview.text_analysis import TextStatsPlots imdb_df = pd.read_csv("data/IMDB_Dataset_sample_5k.csv") - with open("wordview/chat/secrets/openai_api_key.json", "r") as f: + with open("your_secrets_dir/openai_api_key.json", "r") as f: credentials = json.load(f) tsp = TextStatsPlots(df=imdb_df, text_column="review") @@ -27,6 +27,41 @@ call the `chat` method to interact with the data and get insights from it via Na The chat UI is available under http://127.0.0.1:5000/ + +Chat with MWEs +~~~~~~~~~~~~~~ + +After allowing Wordview to extract MWEs, you can call the `chat` method to get insights from this extraction through Natural Language. + +.. code:: python + + import json + + import pandas as pd + + from wordview.mwes.mwe import MWE + from wordview.preprocessing import NgramExtractor + + imdb_df = pd.read_csv("data/IMDB_Dataset_sample_5k.csv") + with open("your_secrets_dir/openai_api_key.json", "r") as f: + credentials = json.load(f) + + extractor = NgramExtractor(imdb_df, "review") + extractor.extract_ngrams() + extractor.get_ngram_counts(ngram_count_file_path="ngram_counts.json") + + mwe_obj = MWE(imdb_df, 'review', + ngram_count_file_path='ngram_counts.json', + language='EN', + custom_patterns="NP: {
?*}", + only_custom_patterns=False, + ) + mwe_obj.extract_mwes(sort=True, top_n=10) + mwe_obj.chat(api_key=credentials.get("openai_api_key")) + +The chat UI for MWEs is available under http://127.0.0.1:5001/ + + |chat| .. |chat| image:: ../figs/chat.png \ No newline at end of file diff --git a/sphinx-docs/source/mwes.rst b/sphinx-docs/source/mwes.rst index 003fec7..d7a0c19 100644 --- a/sphinx-docs/source/mwes.rst +++ b/sphinx-docs/source/mwes.rst @@ -32,16 +32,20 @@ the documentation. custom_patterns="NP: {
?*}", only_custom_patterns=False, ) - mwes = mwe_obj.extract_mwes(sort=True, top_n=10) - json.dump(mwes, open('data/mwes.json', 'w'), indent=4) - + mwe_obj.extract_mwes(sort=True, top_n=10) + json.dump(mwe_obj.mwes, open('data/mwes.json', 'w'), indent=4) -The above returns the results in a dictionary, that in this example we stored in `mwes.json` file. +The above stores the results as a dictionary in `mwe_obj.mwes`, which in this example we saved to a JSON file called `data/mwes.json`. You can also return the result in a table: .. code-block:: python mwe_obj.print_mwe_table() + +Which will return a table like this: + +.. code-block:: text + ╔═════════════════════════╦═══════════════╗ ║ LVC ║ Association ║ ╠═════════════════════════╬═══════════════╣ diff --git a/wordview/text_analysis/chat/chat.html b/wordview/chat_ui/chat.html similarity index 84% rename from wordview/text_analysis/chat/chat.html rename to wordview/chat_ui/chat.html index be22e24..e976723 100644 --- a/wordview/text_analysis/chat/chat.html +++ b/wordview/chat_ui/chat.html @@ -26,12 +26,45 @@ display: flex; flex-direction: column; } - .message-container { + /* .message-container { padding: 20px; overflow-y: auto; flex-grow: 1; - margin-bottom: 10px; /* Adjusted to add space at the bottom */ + margin-bottom: 10px; + } */ + .message-container { + overflow-y: auto; /* Enables vertical scrolling */ + max-height: 500px; /* Set a max-height that fits your design */ + padding: 10px; + margin-bottom: 10px; + width: 100%; /* Ensure it fills the container */ + box-sizing: border-box; /* Include padding and border in the width and height */ + position: relative; + } + /* Styling the scrollbar itself */ + .message-container::-webkit-scrollbar { + width: 10px; /* Adjust the width of the scrollbar */ + } + /* Styling the track (part the thumb slides within) */ + .message-container::-webkit-scrollbar-track { + background: #f1f1f1; /* Light grey background on the track */ + border-radius: 10px; /* Rounded corners on the track */ + } 
+ + /* Styling the thumb (the part that you drag) */ + .message-container::-webkit-scrollbar-thumb { + background: #888; /* Dark grey thumb */ + border-radius: 10px; /* Rounded corners on the thumb */ + } + + /* Handle on hover */ + .message-container::-webkit-scrollbar-thumb:hover { + background: #555; /* Darker grey on hover */ } + + + + /* Standard Oval Style of Message Bubbles */ /* .message { @@ -147,7 +180,7 @@ button:hover { background-color: #f0f0f0; } - ::-webkit-scrollbar { + /* ::-webkit-scrollbar { width: 5px; } ::-webkit-scrollbar-track { @@ -155,10 +188,10 @@ } ::-webkit-scrollbar-thumb { background: #888; - } - ::-webkit-scrollbar-thumb:hover { + } */ + /* ::-webkit-scrollbar-thumb:hover { background: #555; - } + } */ diff --git a/wordview/mwes/mwe.py b/wordview/mwes/mwe.py index 3a2535c..4249bcb 100644 --- a/wordview/mwes/mwe.py +++ b/wordview/mwes/mwe.py @@ -1,11 +1,14 @@ import re import string +import threading from re import Match from typing import Optional import nltk import pandas +from flask import Flask, jsonify, request, send_from_directory from nltk import RegexpParser, word_tokenize +from openai import OpenAI from tabulate import tabulate # type: ignore from tqdm import tqdm @@ -26,7 +29,9 @@ def is_alphanumeric_latinscript_multigram(word: str) -> Optional[Match[str]]: class MWE: - """Extract MWEs of type LVC, VPC, Noun Compounds, Adjective Compounds, and custom patterns from a text corpus.""" + """Extract MWEs of types: + LVC, VPC, Noun Compounds, Adjective Compounds, and custom patterns from a text corpus. + """ def __init__( self, @@ -99,19 +104,75 @@ def __init__( custom_pattern=mwe_patterns, ) + def chat(self, api_key: str = ""): + """Chat with OpenAI's gpt-3.5-turbo model about the extracted MWEs. + Access the chat UI in your localhost under http://127.0.0.1:5001/ + + Args: + api_key: OpenAI API key. 
+ + Returns: + None + """ + self.api_key = api_key + self.chat_client = OpenAI(api_key=api_key) + base_content = f"""Answer any questions about the Multiword Expressions (MWEs) that extracted from the uploaded text corpus by Wordview and are presented in the following MWEs dictionary. + \n\n + ------------------------------ + MWEs dictionary: + ------------------------------ + {self.mwes} + \n\n + Important Points:\n + - Answer the questions without including "According/based on to MWEs dictionary".\n + - The format of the above dictionary is as follows:\n + "MWE Type": "MWE instance 1": "Association measure", "MWE instance 2": "Association measure", ...\n + - There could be other custom types in which case you should just mention the dictionary key.\n + - Depending on a parameter N set by the user, each MWE type contains at most N instances. But it can contain less or even 0. + """ + chat_history = [ + {"role": "system", "content": base_content}, + ] + app = Flask(__name__, static_folder="path_to_your_ui_folder") + + @app.route("/") + def index(): + return send_from_directory("../chat_ui", "chat.html") + + @app.route("/chat", methods=["POST"]) + def chat(): + user_input = request.json["message"] + chat_history.append({"role": "user", "content": user_input}) + response = ( + self.chat_client.chat.completions.create( + model="gpt-3.5-turbo", + messages=chat_history, + ) + .choices[0] + .message.content + ) + chat_history.append({"role": "assistant", "content": response}) + return jsonify({"reply": response}) + + def run(): + app.run(port=5001) + + flask_thread = threading.Thread(target=run) + flask_thread.start() + def extract_mwes( self, sort: bool = True, top_n: Optional[int] = None, ) -> dict[str, dict[str, float]]: - """Extract MWEs from the text corpus. + """Extract MWEs from the text corpus and add them to self.mwes. Args: sort: If True, the MWEs will be sorted in descending order of association measure. 
top_n: If provided, only the top n MWEs will be returned. Returns: - A dictionary containing the MWEs and their association measures. + None. """ for sentence in tqdm(self.reader.get_sentences()): try: diff --git a/wordview/mwes/patterns.py b/wordview/mwes/patterns.py index 2aaa155..d1a88ae 100644 --- a/wordview/mwes/patterns.py +++ b/wordview/mwes/patterns.py @@ -4,56 +4,80 @@ class EnMWEPatterns: patterns: Dict[str, List[str]] = {} - def __init__(self, mwe_types=["LVC", "NC2", "NC3", "ANC2", "ANC3", "VPC"]): - if "LVC" in mwe_types: - self.patterns["LVC"] = [ - "LVC: {
<\\w+>}", + def __init__( + self, + mwe_types=[ + "Light Verb Constructions", + "Noun Noun Compounds", + "Noun Noun Noun Compounds", + "Adjective Noun Compounds", + "Adjective Adjective Noun Compounds", + "Verb Particle Constructions", + ], + ): + if "Light Verb Constructions" in mwe_types: + self.patterns["Light Verb Constructions"] = [ + "Light Verb Constructions: {
<\\w+>}", ] - if "NC2" in mwe_types: - self.patterns["NC2"] = [ - "NC2: {}", + if "Noun Noun Compounds" in mwe_types: + self.patterns["Noun Noun Compounds"] = [ + "Noun Noun Compounds: {}", ] - if "NC3" in mwe_types: - self.patterns["NC3"] = [ - "NC3: {}", + if "Noun Noun Noun Compounds" in mwe_types: + self.patterns["Noun Noun Noun Compounds"] = [ + "Noun Noun Noun Compounds: {}", ] - if "ANC2" in mwe_types: - self.patterns["ANC2"] = [ - "ANC2: {}", + if "Adjective Noun Compounds" in mwe_types: + self.patterns["Adjective Noun Compounds"] = [ + "Adjective Noun Compounds: {}", ] - if "ANC3" in mwe_types: - self.patterns["ANC3"] = ["ANC3: {}"] - if "VPC" in mwe_types: - self.patterns["VPC"] = [ - "VPC: {}", + if "Adjective Adjective Noun Compounds" in mwe_types: + self.patterns["Adjective Adjective Noun Compounds"] = [ + "Adjective Adjective Noun Compounds: {}" + ] + if "Verb Particle Constructions" in mwe_types: + self.patterns["Verb Particle Constructions"] = [ + "Verb Particle Constructions: {}", ] class DeMWEPatterns: patterns: Dict[str, List[str]] = {} - def __init__(self, mwe_types=["LVC", "NC2", "NC3", "ANC2", "ANC3", "VPC"]): - if "LVC" in mwe_types: - self.patterns["LVC"] = [ - "LVC: {
<\\w+>}", + def __init__( + self, + mwe_types=[ + "Light Verb Constructions", + "Noun Noun Compounds", + "Noun Noun Noun Compounds", + "Adjective Noun Compounds", + "Adjective Adjective Noun Compounds", + "Verb Particle Constructions", + ], + ): + if "Light Verb Constructions" in mwe_types: + self.patterns["Light Verb Constructions"] = [ + "Light Verb Constructions: {
<\\w+>}", ] # Define the patterns for 2 and 3-word noun compounds (e.g., "Hausaufgaben", "Fußballplatz") - if "NC2" in mwe_types: - self.patterns["NC2"] = [ - "NC2: {}", + if "Noun Noun Compounds" in mwe_types: + self.patterns["Noun Noun Compounds"] = [ + "Noun Noun Compounds: {}", + ] + if "Noun Noun Noun Compounds" in mwe_types: + self.patterns["Noun Noun Noun Compounds"] = [ + "Noun Noun Noun Compounds: {}", ] - if "NC3" in mwe_types: - self.patterns["NC3"] = [ - "NC3: {}", + if "Adjective Noun Compounds" in mwe_types: + self.patterns["Adjective Noun Compounds"] = [ + "Adjective Noun Compounds: {}", ] - if "ANC2" in mwe_types: - self.patterns["ANC2"] = [ - "ANC2: {}", + if "Adjective Adjective Noun Compounds" in mwe_types: + self.patterns["Adjective Adjective Noun Compounds"] = [ + "Adjective Adjective Noun Compounds: {}" ] - if "ANC3" in mwe_types: - self.patterns["ANC3"] = ["ANC3: {}"] # Define the patterns for verb particle constructions (e.g., "aufstehen", "zurückkommen") - if "VPC" in mwe_types: - self.patterns["VPC"] = [ - "VPC: {}", + if "Verb Particle Constructions" in mwe_types: + self.patterns["Verb Particle Constructions"] = [ + "Verb Particle Constructions: {}", ] diff --git a/wordview/text_analysis/wrapper.py b/wordview/text_analysis/wrapper.py index b12055a..a5c1c70 100644 --- a/wordview/text_analysis/wrapper.py +++ b/wordview/text_analysis/wrapper.py @@ -127,7 +127,7 @@ def chat(self, api_key: str = ""): @app.route("/") def index(): - return send_from_directory("chat", "chat.html") + return send_from_directory("../chat_ui", "chat.html") @app.route("/chat", methods=["POST"]) def chat():