Skip to content

Commit

Permalink
Merge pull request #140 from meghdadFar/feature/chat-for-mwes
Browse files Browse the repository at this point in the history
Support chat with MWE
  • Loading branch information
meghdadFar authored Apr 8, 2024
2 parents 2253615 + c1f86cb commit a312e78
Show file tree
Hide file tree
Showing 6 changed files with 208 additions and 51 deletions.
37 changes: 36 additions & 1 deletion sphinx-docs/source/chat.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,49 @@ call the `chat` method to interact with the data and get insights from it via Na
from wordview.text_analysis import TextStatsPlots
imdb_df = pd.read_csv("data/IMDB_Dataset_sample_5k.csv")
with open("wordview/chat/secrets/openai_api_key.json", "r") as f:
with open("your_secrets_dir/openai_api_key.json", "r") as f:
credentials = json.load(f)
tsp = TextStatsPlots(df=imdb_df, text_column="review")
tsp.chat(api_key=credentials.get("openai_api_key"))
The chat UI is available under http://127.0.0.1:5000/


Chat with MWEs
~~~~~~~~~~~~~~

After allowing Wordview to extract MWEs, you can call the `chat` method to get insights from this extraction through Natural Language.

.. code:: python
import json
import pandas as pd
from wordview.mwes.mwe import MWE
from wordview.preprocessing import NgramExtractor
imdb_df = pd.read_csv("data/IMDB_Dataset_sample_5k.csv")
with open("your_secrets_dir/openai_api_key.json", "r") as f:
credentials = json.load(f)
extractor = NgramExtractor(imdb_df, "review")
extractor.extract_ngrams()
extractor.get_ngram_counts(ngram_count_file_path="ngram_counts.json")
mwe_obj = MWE(imdb_df, 'review',
ngram_count_file_path='ngram_counts.json',
language='EN',
custom_patterns="NP: {<DT>?<JJ>*<NN>}",
only_custom_patterns=False,
)
mwe_obj.extract_mwes(sort=True, top_n=10)
mwe_obj.chat(api_key=credentials.get("openai_api_key"))
The chat UI for MWEs is available under http://127.0.0.1:5001/


|chat|

.. |chat| image:: ../figs/chat.png
12 changes: 8 additions & 4 deletions sphinx-docs/source/mwes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,20 @@ the documentation.
custom_patterns="NP: {<DT>?<JJ>*<NN>}",
only_custom_patterns=False,
)
mwes = mwe_obj.extract_mwes(sort=True, top_n=10)
json.dump(mwes, open('data/mwes.json', 'w'), indent=4)
mwe_obj.extract_mwes(sort=True, top_n=10)
json.dump(mwe_obj.mwes, open('data/mwes.json', 'w'), indent=4)
The above returns the results in a dictionary, that in this example we stored in `mwes.json` file.
The above stores the results in a dictionary (`mwe_obj.mwes`), which in this example we saved to a JSON file called `data/mwes.json`.
You can also print the result as a table:

.. code-block:: python
mwe_obj.print_mwe_table()
This prints a table like this:

.. code-block:: text
╔═════════════════════════╦═══════════════╗
║ LVC ║ Association ║
╠═════════════════════════╬═══════════════╣
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,45 @@
display: flex;
flex-direction: column;
}
.message-container {
/* .message-container {
padding: 20px;
overflow-y: auto;
flex-grow: 1;
margin-bottom: 10px; /* Adjusted to add space at the bottom */
margin-bottom: 10px;
} */
.message-container {
overflow-y: auto; /* Enables vertical scrolling */
max-height: 500px; /* Set a max-height that fits your design */
padding: 10px;
margin-bottom: 10px;
width: 100%; /* Ensure it fills the container */
box-sizing: border-box; /* Include padding and border in the width and height */
position: relative;
}
/* Styling the scrollbar itself */
.message-container::-webkit-scrollbar {
width: 10px; /* Adjust the width of the scrollbar */
}
/* Styling the track (part the thumb slides within) */
.message-container::-webkit-scrollbar-track {
background: #f1f1f1; /* Light grey background on the track */
border-radius: 10px; /* Rounded corners on the track */
}

/* Styling the thumb (the part that you drag) */
.message-container::-webkit-scrollbar-thumb {
background: #888; /* Dark grey thumb */
border-radius: 10px; /* Rounded corners on the thumb */
}

/* Handle on hover */
.message-container::-webkit-scrollbar-thumb:hover {
background: #555; /* Darker grey on hover */
}




/* Standard Oval Style of Message Bubbles */
/*
.message {
Expand Down Expand Up @@ -147,18 +180,18 @@
button:hover {
background-color: #f0f0f0;
}
::-webkit-scrollbar {
/* ::-webkit-scrollbar {
width: 5px;
}
::-webkit-scrollbar-track {
background: #f1f1f1;
}
::-webkit-scrollbar-thumb {
background: #888;
}
::-webkit-scrollbar-thumb:hover {
} */
/* ::-webkit-scrollbar-thumb:hover {
background: #555;
}
} */
</style>
</head>
<body>
Expand Down
67 changes: 64 additions & 3 deletions wordview/mwes/mwe.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import re
import string
import threading
from re import Match
from typing import Optional

import nltk
import pandas
from flask import Flask, jsonify, request, send_from_directory
from nltk import RegexpParser, word_tokenize
from openai import OpenAI
from tabulate import tabulate # type: ignore
from tqdm import tqdm

Expand All @@ -26,7 +29,9 @@ def is_alphanumeric_latinscript_multigram(word: str) -> Optional[Match[str]]:


class MWE:
"""Extract MWEs of type LVC, VPC, Noun Compounds, Adjective Compounds, and custom patterns from a text corpus."""
"""Extract MWEs of typeS:
LVC, VPC, Noun Compounds, Adjective Compounds, and custom patterns from a text corpus.
"""

def __init__(
self,
Expand Down Expand Up @@ -99,19 +104,75 @@ def __init__(
custom_pattern=mwe_patterns,
)

def chat(self, api_key: str = ""):
    """Serve a local chat UI for asking an OpenAI model about the extracted MWEs.

    A system prompt embedding ``self.mwes`` is built once, then a Flask app is
    started on port 5001 in a background thread. Access the chat UI in your
    localhost under http://127.0.0.1:5001/

    Args:
        api_key: OpenAI API key.

    Returns:
        None
    """
    self.api_key = api_key
    self.chat_client = OpenAI(api_key=api_key)
    # The prompt text below is sent to the model verbatim; ``self.mwes`` is
    # interpolated at the moment chat() is called.
    base_content = f"""Answer any questions about the Multiword Expressions (MWEs) that extracted from the uploaded text corpus by Wordview and are presented in the following MWEs dictionary.
\n\n
------------------------------
MWEs dictionary:
------------------------------
{self.mwes}
\n\n
Important Points:\n
- Answer the questions without including "According/based on to MWEs dictionary".\n
- The format of the above dictionary is as follows:\n
"MWE Type": "MWE instance 1": "Association measure", "MWE instance 2": "Association measure", ...\n
- There could be other custom types in which case you should just mention the dictionary key.\n
- Depending on a parameter N set by the user, each MWE type contains at most N instances. But it can contain less or even 0.
"""
    # Conversation state shared by every request handled by this app instance.
    chat_history = [
        {"role": "system", "content": base_content},
    ]
    # NOTE(review): static_folder looks like a placeholder value; the UI page
    # itself is served explicitly by index() below — confirm before changing.
    app = Flask(__name__, static_folder="path_to_your_ui_folder")

    @app.route("/")
    def index():
        # Serve the single-page chat UI.
        return send_from_directory("../chat_ui", "chat.html")

    @app.route("/chat", methods=["POST"])
    def chat():
        # Reject malformed requests with a 400 instead of failing with an
        # unhandled KeyError/TypeError (HTTP 500).
        payload = request.get_json(silent=True)
        user_input = payload.get("message") if isinstance(payload, dict) else None
        if not isinstance(user_input, str):
            return (
                jsonify({"error": "Request body must be JSON with a string 'message' field."}),
                400,
            )

        user_message = {"role": "user", "content": user_input}
        response = (
            self.chat_client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=chat_history + [user_message],
            )
            .choices[0]
            .message.content
        )
        # Record the exchange only after the model call succeeded, so a failed
        # call does not leave an unanswered user turn in the history.
        chat_history.extend(
            [user_message, {"role": "assistant", "content": response}]
        )
        return jsonify({"reply": response})

    def run():
        app.run(port=5001)

    # The server thread is started but not joined, so chat() returns while the
    # Flask app keeps serving.
    flask_thread = threading.Thread(target=run)
    flask_thread.start()

def extract_mwes(
self,
sort: bool = True,
top_n: Optional[int] = None,
) -> dict[str, dict[str, float]]:
"""Extract MWEs from the text corpus.
"""Extract MWEs from the text corpus and add them to self.mwes.
Args:
sort: If True, the MWEs will be sorted in descending order of association measure.
top_n: If provided, only the top n MWEs will be returned.
Returns:
A dictionary containing the MWEs and their association measures.
None.
"""
for sentence in tqdm(self.reader.get_sentences()):
try:
Expand Down
96 changes: 60 additions & 36 deletions wordview/mwes/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,56 +4,80 @@
class EnMWEPatterns:
    """Chunk-grammar rules for English multiword expression (MWE) types.

    After construction, ``patterns`` maps each selected MWE type name to a
    one-element list holding its rule, written as
    ``"<type name>: {<tag pattern>}"``.
    """

    # Class-level default kept for backward compatibility with code that reads
    # ``EnMWEPatterns.patterns``; __init__ gives every instance its own dict so
    # instances no longer leak patterns into each other.
    patterns: Dict[str, List[str]] = {}

    # Tag pattern for every supported MWE type, in emission order.
    _TAG_PATTERNS: Dict[str, str] = {
        "Light Verb Constructions": "{<VB*><DT><\\w+>}",
        "Noun Noun Compounds": "{<NN|NNS><NN|NNS>}",
        "Noun Noun Noun Compounds": "{<NN|NNS><NN|NNS><NN|NNS>}",
        "Adjective Noun Compounds": "{<JJ><NN|NNS>}",
        "Adjective Adjective Noun Compounds": "{<JJ><JJ><NN|NNS>}",
        "Verb Particle Constructions": "{<VB|VBP><RP>}",
    }

    def __init__(self, mwe_types=None):
        """Select the rules for the requested MWE types.

        Args:
            mwe_types: Collection of MWE type names to include. Names that are
                not among the supported types are ignored. ``None`` (the
                default) selects every supported type.
        """
        if mwe_types is None:
            # Avoid a mutable default argument: build the full list per call.
            mwe_types = list(self._TAG_PATTERNS)
        self.patterns = {
            name: [f"{name}: {tags}"]
            for name, tags in self._TAG_PATTERNS.items()
            if name in mwe_types
        }


class DeMWEPatterns:
    """Chunk-grammar rules for German multiword expression (MWE) types.

    After construction, ``patterns`` maps each selected MWE type name to a
    one-element list holding its rule, written as
    ``"<type name>: {<tag pattern>}"``.
    """

    # Class-level default kept for backward compatibility with code that reads
    # ``DeMWEPatterns.patterns``; __init__ gives every instance its own dict so
    # instances no longer leak patterns into each other.
    patterns: Dict[str, List[str]] = {}

    # Tag pattern for every supported MWE type, in emission order.
    _TAG_PATTERNS: Dict[str, str] = {
        "Light Verb Constructions": "{<VB*><DT><\\w+>}",
        # Patterns for 2 and 3-word noun compounds
        # (e.g., "Hausaufgaben", "Fußballplatz").
        "Noun Noun Compounds": "{<NN|NNS><NN|NNS>}",
        "Noun Noun Noun Compounds": "{<NN|NNS><NN|NNS><NN|NNS>}",
        "Adjective Noun Compounds": "{<JJ><NN|NNS>}",
        "Adjective Adjective Noun Compounds": "{<JJ><JJ><NN|NNS>}",
        # Patterns for verb particle constructions
        # (e.g., "aufstehen", "zurückkommen").
        "Verb Particle Constructions": "{<VB|VBP><RP>}",
    }

    def __init__(self, mwe_types=None):
        """Select the rules for the requested MWE types.

        Args:
            mwe_types: Collection of MWE type names to include. Names that are
                not among the supported types are ignored. ``None`` (the
                default) selects every supported type.
        """
        if mwe_types is None:
            # Avoid a mutable default argument: build the full list per call.
            mwe_types = list(self._TAG_PATTERNS)
        self.patterns = {
            name: [f"{name}: {tags}"]
            for name, tags in self._TAG_PATTERNS.items()
            if name in mwe_types
        }
2 changes: 1 addition & 1 deletion wordview/text_analysis/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def chat(self, api_key: str = ""):

@app.route("/")
def index():
return send_from_directory("chat", "chat.html")
return send_from_directory("../chat_ui", "chat.html")

@app.route("/chat", methods=["POST"])
def chat():
Expand Down

0 comments on commit a312e78

Please sign in to comment.