diff --git a/sphinx-docs/source/chat.rst b/sphinx-docs/source/chat.rst
index 435e2dc..75f6ae6 100644
--- a/sphinx-docs/source/chat.rst
+++ b/sphinx-docs/source/chat.rst
@@ -19,7 +19,7 @@ call the `chat` method to interact with the data and get insights from it via Na
from wordview.text_analysis import TextStatsPlots
imdb_df = pd.read_csv("data/IMDB_Dataset_sample_5k.csv")
- with open("wordview/chat/secrets/openai_api_key.json", "r") as f:
+ with open("your_secrets_dir/openai_api_key.json", "r") as f:
credentials = json.load(f)
tsp = TextStatsPlots(df=imdb_df, text_column="review")
@@ -27,6 +27,41 @@ call the `chat` method to interact with the data and get insights from it via Na
The chat UI is available under http://127.0.0.1:5000/
+
+Chat with MWEs
+~~~~~~~~~~~~~~
+
+After allowing Wordview to extract MWEs, you can call the `chat` method to get insights from this extraction through Natural Language.
+
+.. code:: python
+
+ import json
+
+ import pandas as pd
+
+ from wordview.mwes.mwe import MWE
+ from wordview.preprocessing import NgramExtractor
+
+ imdb_df = pd.read_csv("data/IMDB_Dataset_sample_5k.csv")
+ with open("your_secrets_dir/openai_api_key.json", "r") as f:
+ credentials = json.load(f)
+
+ extractor = NgramExtractor(imdb_df, "review")
+ extractor.extract_ngrams()
+ extractor.get_ngram_counts(ngram_count_file_path="ngram_counts.json")
+
+ mwe_obj = MWE(imdb_df, 'review',
+ ngram_count_file_path='ngram_counts.json',
+ language='EN',
+ custom_patterns="NP: {<DT>?<JJ>*<NN>}",
+ only_custom_patterns=False,
+ )
+ mwe_obj.extract_mwes(sort=True, top_n=10)
+ mwe_obj.chat(api_key=credentials.get("openai_api_key"))
+
+The chat UI for MWEs is available under http://127.0.0.1:5001/
+
+
|chat|
.. |chat| image:: ../figs/chat.png
\ No newline at end of file
diff --git a/sphinx-docs/source/mwes.rst b/sphinx-docs/source/mwes.rst
index 003fec7..d7a0c19 100644
--- a/sphinx-docs/source/mwes.rst
+++ b/sphinx-docs/source/mwes.rst
@@ -32,16 +32,20 @@ the documentation.
custom_patterns="NP: {<DT>?<JJ>*<NN>}",
only_custom_patterns=False,
)
- mwes = mwe_obj.extract_mwes(sort=True, top_n=10)
- json.dump(mwes, open('data/mwes.json', 'w'), indent=4)
-
+ mwe_obj.extract_mwes(sort=True, top_n=10)
+ json.dump(mwe_obj.mwes, open('data/mwes.json', 'w'), indent=4)
-The above returns the results in a dictionary, that in this example we stored in `mwes.json` file.
+The above stores the results in a dictionary (`mwe_obj.mwes`), which in this example we saved to a JSON file called `data/mwes.json`.
You can also return the result in a table:
.. code-block:: python
mwe_obj.print_mwe_table()
+
+This produces a table like the following:
+
+.. code-block:: text
+
╔═════════════════════════╦═══════════════╗
║ LVC ║ Association ║
╠═════════════════════════╬═══════════════╣
diff --git a/wordview/text_analysis/chat/chat.html b/wordview/chat_ui/chat.html
similarity index 84%
rename from wordview/text_analysis/chat/chat.html
rename to wordview/chat_ui/chat.html
index be22e24..e976723 100644
--- a/wordview/text_analysis/chat/chat.html
+++ b/wordview/chat_ui/chat.html
@@ -26,12 +26,45 @@
display: flex;
flex-direction: column;
}
- .message-container {
+ /* .message-container {
padding: 20px;
overflow-y: auto;
flex-grow: 1;
- margin-bottom: 10px; /* Adjusted to add space at the bottom */
+ margin-bottom: 10px;
+ } */
+ .message-container {
+ overflow-y: auto; /* Enables vertical scrolling */
+ max-height: 500px; /* Set a max-height that fits your design */
+ padding: 10px;
+ margin-bottom: 10px;
+ width: 100%; /* Ensure it fills the container */
+ box-sizing: border-box; /* Include padding and border in the width and height */
+ position: relative;
+ }
+ /* Styling the scrollbar itself */
+ .message-container::-webkit-scrollbar {
+ width: 10px; /* Adjust the width of the scrollbar */
+ }
+ /* Styling the track (part the thumb slides within) */
+ .message-container::-webkit-scrollbar-track {
+ background: #f1f1f1; /* Light grey background on the track */
+ border-radius: 10px; /* Rounded corners on the track */
+ }
+
+ /* Styling the thumb (the part that you drag) */
+ .message-container::-webkit-scrollbar-thumb {
+ background: #888; /* Dark grey thumb */
+ border-radius: 10px; /* Rounded corners on the thumb */
+ }
+
+ /* Handle on hover */
+ .message-container::-webkit-scrollbar-thumb:hover {
+ background: #555; /* Darker grey on hover */
}
+
+
+
+
/* Standard Oval Style of Message Bubbles */
/*
.message {
@@ -147,7 +180,7 @@
button:hover {
background-color: #f0f0f0;
}
- ::-webkit-scrollbar {
+ /* ::-webkit-scrollbar {
width: 5px;
}
::-webkit-scrollbar-track {
@@ -155,10 +188,10 @@
}
::-webkit-scrollbar-thumb {
background: #888;
- }
- ::-webkit-scrollbar-thumb:hover {
+ } */
+ /* ::-webkit-scrollbar-thumb:hover {
background: #555;
- }
+ } */
diff --git a/wordview/mwes/mwe.py b/wordview/mwes/mwe.py
index 3a2535c..4249bcb 100644
--- a/wordview/mwes/mwe.py
+++ b/wordview/mwes/mwe.py
@@ -1,11 +1,14 @@
import re
import string
+import threading
from re import Match
from typing import Optional
import nltk
import pandas
+from flask import Flask, jsonify, request, send_from_directory
from nltk import RegexpParser, word_tokenize
+from openai import OpenAI
from tabulate import tabulate # type: ignore
from tqdm import tqdm
@@ -26,7 +29,9 @@ def is_alphanumeric_latinscript_multigram(word: str) -> Optional[Match[str]]:
class MWE:
- """Extract MWEs of type LVC, VPC, Noun Compounds, Adjective Compounds, and custom patterns from a text corpus."""
+ """Extract MWEs of typeS:
+ LVC, VPC, Noun Compounds, Adjective Compounds, and custom patterns from a text corpus.
+ """
def __init__(
self,
@@ -99,19 +104,75 @@ def __init__(
custom_pattern=mwe_patterns,
)
+ def chat(self, api_key: str = ""):
+ """Start a local chat UI for asking an OpenAI model (gpt-3.5-turbo) about the MWEs held in self.mwes.
+ A Flask app is started on a background thread; open http://127.0.0.1:5001/ to use the chat UI.
+
+ Args:
+ api_key: OpenAI API key; stored on self.api_key and used to build the OpenAI client.
+
+ Returns:
+ None. The call returns once the server thread is started; the conversation history is kept in memory and grows with each request.
+ """
+ self.api_key = api_key
+ self.chat_client = OpenAI(api_key=api_key)
+ base_content = f"""Answer any questions about the Multiword Expressions (MWEs) that extracted from the uploaded text corpus by Wordview and are presented in the following MWEs dictionary.
+ \n\n
+ ------------------------------
+ MWEs dictionary:
+ ------------------------------
+ {self.mwes}
+ \n\n
+ Important Points:\n
+ - Answer the questions without including "According/based on to MWEs dictionary".\n
+ - The format of the above dictionary is as follows:\n
+ "MWE Type": "MWE instance 1": "Association measure", "MWE instance 2": "Association measure", ...\n
+ - There could be other custom types in which case you should just mention the dictionary key.\n
+ - Depending on a parameter N set by the user, each MWE type contains at most N instances. But it can contain less or even 0.
+ """
+ chat_history = [  # shared by all requests: the system prompt first, then alternating user/assistant turns
+ {"role": "system", "content": base_content},
+ ]
+ app = Flask(__name__, static_folder="path_to_your_ui_folder")  # NOTE(review): static_folder looks like a placeholder value — confirm
+
+ @app.route("/")
+ def index():
+ return send_from_directory("../chat_ui", "chat.html")  # presumably resolved against this module's directory (Flask root path) — TODO confirm
+
+ @app.route("/chat", methods=["POST"])
+ def chat():
+ user_input = request.json["message"]  # expects a JSON body with a "message" key; NOTE(review): no validation if it is missing
+ chat_history.append({"role": "user", "content": user_input})
+ response = (
+ self.chat_client.chat.completions.create(
+ model="gpt-3.5-turbo",
+ messages=chat_history,  # the full history is sent on every call
+ )
+ .choices[0]
+ .message.content
+ )
+ chat_history.append({"role": "assistant", "content": response})
+ return jsonify({"reply": response})
+
+ def run():
+ app.run(port=5001)
+
+ flask_thread = threading.Thread(target=run)  # NOTE(review): not marked daemon, so it may keep the process alive — confirm intended
+ flask_thread.start()
+
def extract_mwes(
self,
sort: bool = True,
top_n: Optional[int] = None,
) -> dict[str, dict[str, float]]:
- """Extract MWEs from the text corpus.
+ """Extract MWEs from the text corpus and add them to self.mwes.
Args:
sort: If True, the MWEs will be sorted in descending order of association measure.
top_n: If provided, only the top n MWEs will be returned.
Returns:
- A dictionary containing the MWEs and their association measures.
+ None.
"""
for sentence in tqdm(self.reader.get_sentences()):
try:
diff --git a/wordview/mwes/patterns.py b/wordview/mwes/patterns.py
index 2aaa155..d1a88ae 100644
--- a/wordview/mwes/patterns.py
+++ b/wordview/mwes/patterns.py
@@ -4,56 +4,80 @@
class EnMWEPatterns:
patterns: Dict[str, List[str]] = {}
- def __init__(self, mwe_types=["LVC", "NC2", "NC3", "ANC2", "ANC3", "VPC"]):
- if "LVC" in mwe_types:
- self.patterns["LVC"] = [
- "LVC: {<\\w+>}",
+ def __init__(
+ self,
+ mwe_types=[
+ "Light Verb Constructions",
+ "Noun Noun Compounds",
+ "Noun Noun Noun Compounds",
+ "Adjective Noun Compounds",
+ "Adjective Adjective Noun Compounds",
+ "Verb Particle Constructions",
+ ],
+ ):
+ if "Light Verb Constructions" in mwe_types:
+ self.patterns["Light Verb Constructions"] = [
+ "Light Verb Constructions: {<\\w+>}",
]
- if "NC2" in mwe_types:
- self.patterns["NC2"] = [
- "NC2: {}",
+ if "Noun Noun Compounds" in mwe_types:
+ self.patterns["Noun Noun Compounds"] = [
+ "Noun Noun Compounds: {}",
]
- if "NC3" in mwe_types:
- self.patterns["NC3"] = [
- "NC3: {}",
+ if "Noun Noun Noun Compounds" in mwe_types:
+ self.patterns["Noun Noun Noun Compounds"] = [
+ "Noun Noun Noun Compounds: {}",
]
- if "ANC2" in mwe_types:
- self.patterns["ANC2"] = [
- "ANC2: {}",
+ if "Adjective Noun Compounds" in mwe_types:
+ self.patterns["Adjective Noun Compounds"] = [
+ "Adjective Noun Compounds: {}",
]
- if "ANC3" in mwe_types:
- self.patterns["ANC3"] = ["ANC3: {}"]
- if "VPC" in mwe_types:
- self.patterns["VPC"] = [
- "VPC: {