diff --git a/LICENSE.txt b/LICENSE.txt old mode 100755 new mode 100644 diff --git a/README.md b/README.md old mode 100755 new mode 100644 index 00bdd16..636ac7f --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ -# Text2Text: Crosslingual NLP/G toolkit -Transform texts in a hundred different [languages](https://github.com/artitw/text2text#languages-available)! +# Text2Text Language Modeling Toolkit
Overview * [Colab Notebooks](https://github.com/artitw/text2text#colab-notebooks) -* [Crosslingual Models](https://github.com/artitw/text2text#how-crosslingual-models-work-click-to-watch) * [Installation Requirements](https://github.com/artitw/text2text#installation-requirements) * [Quick Start Guide](https://github.com/artitw/text2text#api-quick-start-guide) * [Class Diagram](https://github.com/artitw/text2text#class-diagram) @@ -36,9 +34,6 @@ Transform texts in a hundred different [languages](https://github.com/artitw/tex * STF-IDF multilingual search [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1RaWj5SqWvyC2SsCTGg8IAVcl9G5hOB50?usp=sharing) * All examples [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1LE_ifTpOGO5QJCKNQYtZe6c_tjbwnulR) -## How Crosslingual Models Work (click to watch) -[![Crosslingual Models](http://img.youtube.com/vi/caZLVcJqsqo/0.jpg)](https://youtu.be/caZLVcJqsqo "Cross-Lingual Models") - ## Installation Requirements ``` pip install -qq -U text2text @@ -63,119 +58,6 @@ Module Importing | `import text2text as t2t` | Libraries imported [Identification](https://github.com/artitw/text2text#identification) | `t2t.Identifier().transform(["Aj keď sa Buzz Aldrin stal až „druhým človekom“..."])` | `['sk', 'Slovak']` [Web Server](https://github.com/artitw/text2text#serving) | `t2t.Serve(host='0.0.0.0', port=80)` | Web server started on host and port -## Languages Available -
- Show all - -``` -t2t.Transformer.LANGUAGES - -# Dict of languages supported -# code: language -{'af': 'Afrikaans', - 'am': 'Amharic', - 'ar': 'Arabic', - 'ast': 'Asturian', - 'az': 'Azerbaijani', - 'ba': 'Bashkir', - 'be': 'Belarusian', - 'bg': 'Bulgarian', - 'bn': 'Bengali', - 'br': 'Breton', - 'bs': 'Bosnian', - 'ca': 'Catalan_Valencian', - 'ceb': 'Cebuano', - 'cs': 'Czech', - 'cy': 'Welsh', - 'da': 'Danish', - 'de': 'German', - 'el': 'Greeek', - 'en': 'English', - 'es': 'Spanish', - 'et': 'Estonian', - 'fa': 'Persian', - 'ff': 'Fulah', - 'fi': 'Finnish', - 'fr': 'French', - 'fy': 'Western_Frisian', - 'ga': 'Irish', - 'gd': 'Gaelic_Scottish_Gaelic', - 'gl': 'Galician', - 'gu': 'Gujarati', - 'ha': 'Hausa', - 'he': 'Hebrew', - 'hi': 'Hindi', - 'hr': 'Croatian', - 'ht': 'Haitian_Haitian_Creole', - 'hu': 'Hungarian', - 'hy': 'Armenian', - 'id': 'Indonesian', - 'ig': 'Igbo', - 'ilo': 'Iloko', - 'is': 'Icelandic', - 'it': 'Italian', - 'ja': 'Japanese', - 'jv': 'Javanese', - 'ka': 'Georgian', - 'kk': 'Kazakh', - 'km': 'Central_Khmer', - 'kn': 'Kannada', - 'ko': 'Korean', - 'lb': 'Luxembourgish_Letzeburgesch', - 'lg': 'Ganda', - 'ln': 'Lingala', - 'lo': 'Lao', - 'lt': 'Lithuanian', - 'lv': 'Latvian', - 'mg': 'Malagasy', - 'mk': 'Macedonian', - 'ml': 'Malayalam', - 'mn': 'Mongolian', - 'mr': 'Marathi', - 'ms': 'Malay', - 'my': 'Burmese', - 'ne': 'Nepali', - 'nl': 'Dutch_Flemish', - 'no': 'Norwegian', - 'ns': 'Northern_Sotho', - 'oc': 'Occitan', - 'or': 'Oriya', - 'pa': 'Panjabi_Punjabi', - 'pl': 'Polish', - 'ps': 'Pushto_Pashto', - 'pt': 'Portuguese', - 'ro': 'Romanian_Moldavian_Moldovan', - 'ru': 'Russian', - 'sd': 'Sindhi', - 'si': 'Sinhala_Sinhalese', - 'sk': 'Slovak', - 'sl': 'Slovenian', - 'so': 'Somali', - 'sq': 'Albanian', - 'sr': 'Serbian', - 'ss': 'Swati', - 'su': 'Sundanese', - 'sv': 'Swedish', - 'sw': 'Swahili', - 'ta': 'Tamil', - 'th': 'Thai', - 'tl': 'Tagalog', - 'tn': 'Tswana', - 'tr': 'Turkish', - 'uk': 'Ukrainian', - 'ur': 'Urdu', - 'uz': 'Uzbek', - 'vi': 'Vietnamese', - 'wo': 'Wolof', - 'xh': 'Xhosa', - 'yi': 'Yiddish', - 'yo': 'Yoruba', - 'zh': 'Chinese', - 'zu': 'Zulu'} -``` - -
- ## Examples ### Assistant diff --git a/demos/Text2Text_Demos.ipynb b/demos/Text2Text_Demos.ipynb index 6bcd3a4..5dd9a10 100644 --- a/demos/Text2Text_Demos.ipynb +++ b/demos/Text2Text_Demos.ipynb @@ -6,10 +6,7 @@ "id": "zL8uiXTHIrHI" }, "source": [ - "# [Text2Text](https://github.com/artitw/text2text): Crosslingual NLP/G toolkit\n", - "\n", - "## How Crosslingual Models Work (click to watch)\n", - "[![Crosslingual Models](http://img.youtube.com/vi/caZLVcJqsqo/0.jpg)](https://youtu.be/caZLVcJqsqo \"Crosslingual Models\")" + "# [Text2Text](https://github.com/artitw/text2text): Language Modeling Toolkit" ] }, { @@ -857,132 +854,6 @@ }, "execution_count": null, "outputs": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "F0RWWWrM83ov", - "outputId": "400db902-d3d3-4922-d300-b6c0020094a1" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'af': 'Afrikaans',\n", - " 'am': 'Amharic',\n", - " 'ar': 'Arabic',\n", - " 'ast': 'Asturian',\n", - " 'az': 'Azerbaijani',\n", - " 'ba': 'Bashkir',\n", - " 'be': 'Belarusian',\n", - " 'bg': 'Bulgarian',\n", - " 'bn': 'Bengali',\n", - " 'br': 'Breton',\n", - " 'bs': 'Bosnian',\n", - " 'ca': 'Catalan_Valencian',\n", - " 'ceb': 'Cebuano',\n", - " 'cs': 'Czech',\n", - " 'cy': 'Welsh',\n", - " 'da': 'Danish',\n", - " 'de': 'German',\n", - " 'el': 'Greeek',\n", - " 'en': 'English',\n", - " 'es': 'Spanish',\n", - " 'et': 'Estonian',\n", - " 'fa': 'Persian',\n", - " 'ff': 'Fulah',\n", - " 'fi': 'Finnish',\n", - " 'fr': 'French',\n", - " 'fy': 'Western_Frisian',\n", - " 'ga': 'Irish',\n", - " 'gd': 'Gaelic_Scottish_Gaelic',\n", - " 'gl': 'Galician',\n", - " 'gu': 'Gujarati',\n", - " 'ha': 'Hausa',\n", - " 'he': 'Hebrew',\n", - " 'hi': 'Hindi',\n", - " 'hr': 'Croatian',\n", - " 'ht': 'Haitian_Haitian_Creole',\n", - " 'hu': 'Hungarian',\n", - " 'hy': 'Armenian',\n", - " 'id': 'Indonesian',\n", - " 'ig': 'Igbo',\n", - " 'ilo': 'Iloko',\n", - " 'is': 'Icelandic',\n", - " 'it': 'Italian',\n", - " 'ja': 'Japanese',\n", - " 'jv': 'Javanese',\n", - " 'ka': 'Georgian',\n", - " 'kk': 'Kazakh',\n", - " 'km': 'Central_Khmer',\n", - " 'kn': 'Kannada',\n", - " 'ko': 'Korean',\n", - " 'lb': 'Luxembourgish_Letzeburgesch',\n", - " 'lg': 'Ganda',\n", - " 'ln': 'Lingala',\n", - " 'lo': 'Lao',\n", - " 'lt': 'Lithuanian',\n", - " 'lv': 'Latvian',\n", - " 'mg': 'Malagasy',\n", - " 'mk': 'Macedonian',\n", - " 'ml': 'Malayalam',\n", - " 'mn': 'Mongolian',\n", - " 'mr': 'Marathi',\n", - " 'ms': 'Malay',\n", - " 'my': 'Burmese',\n", - " 'ne': 'Nepali',\n", - " 'nl': 'Dutch_Flemish',\n", - " 'no': 'Norwegian',\n", - " 'ns': 'Northern_Sotho',\n", - " 'oc': 'Occitan',\n", - " 'or': 'Oriya',\n", - " 'pa': 'Panjabi_Punjabi',\n", - " 'pl': 'Polish',\n", - " 'ps': 'Pushto_Pashto',\n", - " 'pt': 'Portuguese',\n", - " 'ro': 'Romanian_Moldavian_Moldovan',\n", - " 'ru': 'Russian',\n", - " 'sd': 'Sindhi',\n", - " 'si': 'Sinhala_Sinhalese',\n", - " 'sk': 'Slovak',\n", - " 'sl': 'Slovenian',\n", - " 'so': 'Somali',\n", - " 'sq': 'Albanian',\n", - " 'sr': 'Serbian',\n", - " 'ss': 'Swati',\n", - " 'su': 'Sundanese',\n", - " 'sv': 'Swedish',\n", - " 'sw': 'Swahili',\n", - " 'ta': 'Tamil',\n", - " 'th': 'Thai',\n", - " 'tl': 'Tagalog',\n", - " 'tn': 'Tswana',\n", - " 'tr': 'Turkish',\n", - " 'uk': 'Ukrainian',\n", - " 'ur': 'Urdu',\n", - " 'uz': 'Uzbek',\n", - " 'vi': 'Vietnamese',\n", - " 'wo': 'Wolof',\n", - " 'xh': 'Xhosa',\n", - " 'yi': 'Yiddish',\n", - " 'yo': 'Yoruba',\n", - " 'zh': 'Chinese',\n", - " 'zu': 'Zulu'}" - ] - }, - "metadata": {}, - "execution_count": 16 - } - ], - "source": [ - "### Languages Available\n", - "t2t.Transformer.LANGUAGES" - ] } ], "metadata": { diff --git a/demos/Text2Text_LLM.ipynb b/demos/Text2Text_LLM.ipynb index 1ccc63d..5a7f697 100644 --- a/demos/Text2Text_LLM.ipynb +++ b/demos/Text2Text_LLM.ipynb @@ -169,8 +169,8 @@ "from pydantic import BaseModel\n", "\n", "class Song(BaseModel):\n", - " name: str\n", - " artist: str\n", + " name: str = \"\"\n", + " artist: str = \"\"\n", "\n", "result = asst.chat_completion([\n", " {\"role\": \"user\", \"content\": \"What is Britney Spears's best song?\"}\n", diff --git a/setup.py b/setup.py old mode 100755 new mode 100644 index 74259b6..5ad6811 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="text2text", - version="1.7.5", + version="1.7.6", author="artitw", author_email="artitw@gmail.com", description="Text2Text: Crosslingual NLP/G toolkit", diff --git a/text2text/__init__.py b/text2text/__init__.py old mode 100755 new mode 100644 diff --git a/text2text/indexer.py b/text2text/indexer.py index afebd7b..d0db4f9 100644 --- a/text2text/indexer.py +++ b/text2text/indexer.py @@ -35,9 +35,11 @@ def add(self, input_lines, src_lang='en', faiss_index=None, **kwargs): if self.index.ntotal: starting_id = 1+np.amax(faiss.vector_to_array(self.index.id_map), initial=0) ids = list(range(starting_id, starting_id+len(input_lines))) - vectors = self.get_formatted_matrix(input_lines, src_lang=src_lang, **kwargs) - self.index.add_with_ids(vectors, np.array(ids)) - new_docs = pd.DataFrame({'document': input_lines}) + embeddings = kwargs.get("embeddings", None) + if embeddings is None: + embeddings = self.get_formatted_matrix(input_lines, src_lang=src_lang, **kwargs) + self.index.add_with_ids(embeddings, np.array(ids)) + new_docs = pd.DataFrame({'document': input_lines, 'embedding': embeddings.tolist()}) new_docs.index = ids self.corpus = pd.concat([self.corpus, new_docs]) return self @@ -67,7 +69,7 @@ def transform(self, input_lines, src_lang='en', encoders=[], **kwargs): self.src_lang = src_lang d = self.get_formatted_matrix(["DUMMY"], src_lang=src_lang, **kwargs).shape[-1] self.index = faiss.IndexIDMap2(faiss.IndexFlatL2(d)) - self.corpus = pd.DataFrame({"document": []}) + self.corpus = pd.DataFrame({"document": [], "embedding": []}) if not input_lines: return self return self.add(input_lines, src_lang=src_lang, **kwargs) diff --git a/text2text/rag_assistant.py b/text2text/rag_assistant.py index 0024e2b..74c9f89 100644 --- a/text2text/rag_assistant.py +++ b/text2text/rag_assistant.py @@ -1,9 +1,14 @@ import text2text as t2t +import pickle +import sqlite3 import requests import warnings import urllib.parse +import numpy as np +import pandas as pd + from tqdm.auto import tqdm from bs4 import BeautifulSoup @@ -41,9 +46,25 @@ def is_affirmative(response): return False +RAG_TABLE_NAME = "rag_corpus_embeddings" + class RagAssistant(t2t.Assistant): def __init__(self, **kwargs): super().__init__(**kwargs) + sqlite_path = kwargs.get("sqlite_path", None) + if sqlite_path: + conn = sqlite3.connect(sqlite_path) + query = f"SELECT document, embedding FROM {RAG_TABLE_NAME}" + self.records = pd.read_sql_query(query, conn) + conn.close() + self.records["embedding"] = self.records["embedding"].apply(lambda x: pickle.loads(x)) + self.index = t2t.Indexer().transform([], encoders=[t2t.Vectorizer()]) + self.index.add( + self.records["document"].tolist(), + embeddings=np.vstack(self.records["embedding"]) + ) + return + schema = kwargs.get("schema", None) texts = kwargs.get("texts", []) urls = kwargs.get("urls", []) @@ -56,18 +77,29 @@ def __init__(self, **kwargs): warnings.warn(f"Skipping URL with errors: {u}") else: warnings.warn(f"Skipping invalid URL: {u}") - + if schema: + column_names = schema.model_fields.keys() + self.records = pd.DataFrame(columns=column_names) for t in tqdm(texts, desc='Schema extraction'): - fields = ", ".join(schema.model_fields.keys()) + fields = ", ".join(column_names) prompt = f'Extract {fields} from the following text:\n\n{t}' res = t2t.Assistant.chat_completion(self, [{"role": "user", "content": prompt}], schema=schema) + new_row = pd.DataFrame([vars(res)]) + self.records = pd.concat([self.records, new_row], ignore_index=True) res = "\n".join(f'{k}: {v}' for k,v in vars(res).items()) input_lines.append(res) else: input_lines = texts + self.records = pd.DataFrame({"text": texts}) self.index = t2t.Indexer().transform(input_lines, encoders=[t2t.Vectorizer()]) + self.records = pd.concat([self.records, self.index.corpus], axis=1) + self.records["embedding"] = self.records["embedding"].apply(lambda x: pickle.dumps(x)) + conn = sqlite3.connect("text2text.db") + self.records.to_sql(RAG_TABLE_NAME, conn, if_exists='replace', index=False) + conn.close() + def chat_completion(self, messages=[{"role": "user", "content": "hello"}], stream=False, schema=None, **kwargs): k = kwargs.get("k", 3)