From e68e9f520d61bd393bec0dbceddfcee38aa1fc8d Mon Sep 17 00:00:00 2001 From: Ashish Mahendra Date: Mon, 31 Jul 2023 21:56:06 +0530 Subject: [PATCH] Summarizer merge (#1198) * Relocated unused modules to _inactive directory 1. cl_summer 2. ent_ext 3. zs_classifier 4. bi_ner 5. gpt3 * Merged BART_SUM and T5_SUM to summarization * reverting cl_summer * updating action config --- .../jac_nlp => _inactive}/t5_sum/README.md | 0 .../jac_nlp => _inactive}/t5_sum/__init__.py | 0 .../t5_sum/requirements.txt | 0 .../jac_nlp => _inactive}/t5_sum/t5_sum.py | 0 .../t5_sum/tests}/__init__.py | 0 .../t5_sum/tests/fixtures}/__init__.py | 0 .../t5_sum/tests/fixtures/t5_sum.jac | 0 .../t5_sum/tests/test_t5_sum.py | 0 jaseci_ai_kit/config.py | 2 +- jaseci_ai_kit/jac_nlp/README.md | 61 +++---------- ...nfig.py => summarization_action_config.py} | 8 +- .../action_configs/t5_sum_action_config.py | 86 ------------------- .../jac_nlp/jac_nlp/bart_sum/__init__.py | 1 - .../bart_sum/tests/fixtures/bart_sum.jac | 23 ----- jaseci_ai_kit/jac_nlp/jac_nlp/config.py | 4 +- .../{bart_sum => summarization}/.gitignore | 0 .../{bart_sum => summarization}/README.md | 36 ++++---- .../jac_nlp/jac_nlp/summarization/__init__.py | 1 + .../requirements.txt | 0 .../summarization.py} | 54 ++++++------ .../tests/fixtures/__init__.py | 0 .../tests/fixtures/summarization.jac | 23 +++++ .../tests/test_summarization.py} | 22 ++--- jaseci_ai_kit/jac_nlp/setup.py | 3 +- 24 files changed, 97 insertions(+), 227 deletions(-) rename jaseci_ai_kit/{jac_nlp/jac_nlp => _inactive}/t5_sum/README.md (100%) rename jaseci_ai_kit/{jac_nlp/jac_nlp => _inactive}/t5_sum/__init__.py (100%) rename jaseci_ai_kit/{jac_nlp/jac_nlp => _inactive}/t5_sum/requirements.txt (100%) rename jaseci_ai_kit/{jac_nlp/jac_nlp => _inactive}/t5_sum/t5_sum.py (100%) rename jaseci_ai_kit/{jac_nlp/jac_nlp/bart_sum/tests/fixtures => _inactive/t5_sum/tests}/__init__.py (100%) rename jaseci_ai_kit/{jac_nlp/jac_nlp/t5_sum/tests => _inactive/t5_sum/tests/fixtures}/__init__.py (100%) rename jaseci_ai_kit/{jac_nlp/jac_nlp => _inactive}/t5_sum/tests/fixtures/t5_sum.jac (100%) rename jaseci_ai_kit/{jac_nlp/jac_nlp => _inactive}/t5_sum/tests/test_t5_sum.py (100%) rename jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/{bart_sum_action_config.py => summarization_action_config.py} (94%) delete mode 100644 jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/t5_sum_action_config.py delete mode 100644 jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/__init__.py delete mode 100644 jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/fixtures/bart_sum.jac rename jaseci_ai_kit/jac_nlp/jac_nlp/{bart_sum => summarization}/.gitignore (100%) rename jaseci_ai_kit/jac_nlp/jac_nlp/{bart_sum => summarization}/README.md (66%) create mode 100644 jaseci_ai_kit/jac_nlp/jac_nlp/summarization/__init__.py rename jaseci_ai_kit/jac_nlp/jac_nlp/{bart_sum => summarization}/requirements.txt (100%) rename jaseci_ai_kit/jac_nlp/jac_nlp/{bart_sum/bart_sum.py => summarization/summarization.py} (56%) rename jaseci_ai_kit/jac_nlp/jac_nlp/{t5_sum => summarization}/tests/fixtures/__init__.py (100%) create mode 100644 jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/fixtures/summarization.jac rename jaseci_ai_kit/jac_nlp/jac_nlp/{bart_sum/tests/test_bart_sum.py => summarization/tests/test_summarization.py} (51%) diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/README.md b/jaseci_ai_kit/_inactive/t5_sum/README.md similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/README.md rename to jaseci_ai_kit/_inactive/t5_sum/README.md diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/__init__.py b/jaseci_ai_kit/_inactive/t5_sum/__init__.py similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/__init__.py rename to jaseci_ai_kit/_inactive/t5_sum/__init__.py diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/requirements.txt b/jaseci_ai_kit/_inactive/t5_sum/requirements.txt similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/requirements.txt rename to jaseci_ai_kit/_inactive/t5_sum/requirements.txt diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/t5_sum.py b/jaseci_ai_kit/_inactive/t5_sum/t5_sum.py similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/t5_sum.py rename to jaseci_ai_kit/_inactive/t5_sum/t5_sum.py diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/fixtures/__init__.py b/jaseci_ai_kit/_inactive/t5_sum/tests/__init__.py similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/fixtures/__init__.py rename to jaseci_ai_kit/_inactive/t5_sum/tests/__init__.py diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/__init__.py b/jaseci_ai_kit/_inactive/t5_sum/tests/fixtures/__init__.py similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/__init__.py rename to jaseci_ai_kit/_inactive/t5_sum/tests/fixtures/__init__.py diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/fixtures/t5_sum.jac b/jaseci_ai_kit/_inactive/t5_sum/tests/fixtures/t5_sum.jac similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/fixtures/t5_sum.jac rename to jaseci_ai_kit/_inactive/t5_sum/tests/fixtures/t5_sum.jac diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/test_t5_sum.py b/jaseci_ai_kit/_inactive/t5_sum/tests/test_t5_sum.py similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/test_t5_sum.py rename to jaseci_ai_kit/_inactive/t5_sum/tests/test_t5_sum.py diff --git a/jaseci_ai_kit/config.py b/jaseci_ai_kit/config.py index 961e440dfc..670574896d 100644 --- a/jaseci_ai_kit/config.py +++ b/jaseci_ai_kit/config.py @@ -7,7 +7,7 @@ from .jac_nlp.jac_nlp.tfm_ner.action_config import TFM_NER_ACTION_CONFIG from .jac_nlp.jac_nlp.use_enc.action_config import USE_ENC_ACTION_CONFIG from .jac_nlp.jac_nlp.use_qa.action_config import USE_QA_ACTION_CONFIG -from .jac_nlp.jac_nlp.bart_sum.action_config import BART_SUM_ACTION_CONFIG +from .jac_nlp.jac_nlp.summarization.action_config import BART_SUM_ACTION_CONFIG ACTION_CONFIGS = { "cl_summer": CL_SUMMER_ACTION_CONFIG, diff --git a/jaseci_ai_kit/jac_nlp/README.md b/jaseci_ai_kit/jac_nlp/README.md index 010388b093..701bd5221c 100644 --- a/jaseci_ai_kit/jac_nlp/README.md +++ b/jaseci_ai_kit/jac_nlp/README.md @@ -28,10 +28,7 @@ The `jac_nlp` package contains a collection of state-of-the-art NLP models that - [Summarizer (`cl_summer`)](#summarizer-cl_summer) - [Actions](#actions-8) - [Example Jac Usage](#example-jac-usage-8) - - [T5 Summarization (`t5_sum`)](#t5-summarization-t5_sum) - - [Actions](#actions-9) - - [Example Jac Usage:](#example-jac-usage-9) - - [Bart Summarization (`bart_sum`)](#bart-summarization-bart_sum) + - [Summarization (`summarization`)](#summarization-summarization) - [Actions](#actions-10) - [Example Jac Usage:](#example-jac-usage-10) - [Topic Modeling Modules](#topic-modeling-modules) @@ -662,51 +659,15 @@ walker cl_summer_example { ``` For a complete example visit [here](jac_nlp/cl_summer/README.md) -### T5 Summarization (`t5_sum`) -`t5_sum` uses the T5 transformer model to perform abstractive summary on a body of text. -#### Actions - -* `classify_text`: use the T5 model to summarize a body of text - * **Input**: - * `text` (string): text to summarize - * `min_length` (integer): the least amount of words you want returned from the model - * `max_length` (integer): the most amount of words you want returned from the model - * **Input datafile** - `**data.json**` - ``` - { - "text": "The US has passed the peak on new coronavirus cases, President Donald Trump said and predicted that some states would reopen this month. The US has over 637,000 confirmed Covid-19 cases and over 30,826 deaths, the highest for any country in the world. At the daily White House coronavirus briefing on Wednesday, Trump said new guidelines to reopen the country would be announced on Thursday after he speaks to governors. We'll be the comeback kids, all of us, he said. We want to get our country back. The Trump administration has previously fixed May 1 as a possible date to reopen the world's largest economy, but the president said some states may be able to return to normalcy earlier than that.", - "min_length": 30, - "max_length": 100 - } - ``` - -#### Example Jac Usage: -```jac -# Use the T5 model to summarize a given piece of text -walker summarization { - can t5_sum.classify_text; - has data = "data.json"; - data = file.load_json(data); - summarized_text = t5_sum.classify_text( - text = data["text"], - min_length = data["min_length"], - max_length = data["max_length"] - ); - report summarized_text; -} -``` - -For a complete example visit [here](jac_nlp/t5_sum/README.md) -### Bart Summarization (`bart_sum`) +### Summarization (`summarization`) -`bart_sum` uses the BART transformer model to perform abstractive summary on a body of text. +`summarization` uses the BART transformer model to perform abstractive summary on a body of text. #### Actions -There are 2 ways to use `bart_sum` module. +There are 2 ways to use `summarization` module. 1. Given a text, it will return the summary of the text. 2. Given a web page url, it will return the summary of the web page. @@ -724,15 +685,15 @@ Following example will return the summary of the a single text. ```jac walker test_summarize_single { - can bart_sum.summarize; - report bart_sum.summarize("There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", 10); + can summarization.summarize; + report summarization.summarize("There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", 10); } ``` You can also pass a list of texts to get the summary of all the texts. ```jac walker test_summarize_batch { - can bart_sum.summarize; - report bart_sum.summarize( + can summarization.summarize; + report summarization.summarize( ["There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude."], @@ -744,12 +705,12 @@ Following example will return the summary of the web page. ```jac walker test_summarize_url { - can bart_sum.summarize; - report bart_sum.summarize(null, "https://in.mashable.com/"); + can summarization.summarize; + report summarization.summarize(null, "https://in.mashable.com/"); } ``` -For a complete example visit [here](jac_nlp/bart_sum/README.md) +For a complete example visit [here](jac_nlp/summarization/README.md) ## Topic Modeling Modules diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/bart_sum_action_config.py b/jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/summarization_action_config.py similarity index 94% rename from jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/bart_sum_action_config.py rename to jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/summarization_action_config.py index 196abe33d1..b75bf8e4fa 100644 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/bart_sum_action_config.py +++ b/jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/summarization_action_config.py @@ -1,6 +1,6 @@ -BART_SUM_ACTION_CONFIG = { - "module": "jac_nlp.bart_sum", - "loaded_module": "jac_nlp.bart_sum.bart_sum", +SUMMARIZATION_ACTION_CONFIG = { + "module": "jac_nlp.summarization", + "loaded_module": "jac_nlp.summarization.summarization", "local_mem_requirement": 2100, "remote": { "Service": { @@ -26,7 +26,7 @@ "creationTimestamp": None, }, "data": { - "prod_up": "uvicorn jac_nlp.bart_sum:serv_actions --host 0.0.0.0 --port 80" + "prod_up": "uvicorn jac_nlp.summarization:serv_actions --host 0.0.0.0 --port 80" }, }, "Deployment": { diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/t5_sum_action_config.py b/jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/t5_sum_action_config.py deleted file mode 100644 index 231938a70c..0000000000 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/action_configs/t5_sum_action_config.py +++ /dev/null @@ -1,86 +0,0 @@ -T5_SUM_ACTION_CONFIG = { - "module": "jac_nlp.t5_sum", - "loaded_module": "jac_nlp.t5_sum.t5_sum", - "remote": { - "Service": { - "kind": "Service", - "apiVersion": "v1", - "metadata": {"name": "t5-sum", "creationTimestamp": None}, - "spec": { - "ports": [ - {"name": "http", "protocol": "TCP", "port": 80, "targetPort": 80} - ], - "selector": {"pod": "t5-sum"}, - "type": "ClusterIP", - "sessionAffinity": "None", - "internalTrafficPolicy": "Cluster", - }, - "status": {"loadBalancer": {}}, - }, - "ConfigMap": { - "kind": "ConfigMap", - "apiVersion": "v1", - "metadata": { - "name": "t5-sum-up", - "creationTimestamp": None, - }, - "data": { - "prod_up": "uvicorn jac_nlp.t5_sum:serv_actions --host 0.0.0.0 --port 80" - }, - }, - "Deployment": { - "kind": "Deployment", - "apiVersion": "apps/v1", - "metadata": {"name": "t5-sum", "creationTimestamp": None}, - "spec": { - "replicas": 1, - "selector": {"matchLabels": {"pod": "t5-sum"}}, - "template": { - "metadata": { - "name": "t5-sum", - "creationTimestamp": None, - "labels": {"pod": "t5-sum"}, - }, - "spec": { - "volumes": [ - { - "name": "prod-script", - "configMap": {"name": "t5-sum-up", "defaultMode": 420}, - } - ], - "containers": [ - { - "name": "t5-sum", - "image": "jaseci/jac-nlp:latest", - "command": ["bash", "-c", "source script/prod_up"], - "ports": [{"containerPort": 80, "protocol": "TCP"}], - "resources": { - "limits": {"memory": "3Gi"}, - "requests": {"memory": "3Gi"}, - }, - "volumeMounts": [ - {"name": "prod-script", "mountPath": "/script"} - ], - "terminationMessagePath": "/dev/termination-log", - "terminationMessagePolicy": "File", - "imagePullPolicy": "IfNotPresent", - } - ], - "restartPolicy": "Always", - "terminationGracePeriodSeconds": 30, - "dnsPolicy": "ClusterFirst", - "securityContext": {}, - "schedulerName": "default-scheduler", - }, - }, - "strategy": { - "type": "RollingUpdate", - "rollingUpdate": {"maxUnavailable": "25%", "maxSurge": "25%"}, - }, - "revisionHistoryLimit": 10, - "progressDeadlineSeconds": 600, - }, - "status": {}, - }, - }, -} diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/__init__.py b/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/__init__.py deleted file mode 100644 index 4b6e32867d..0000000000 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bart_sum import * # noqa diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/fixtures/bart_sum.jac b/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/fixtures/bart_sum.jac deleted file mode 100644 index fa62106783..0000000000 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/fixtures/bart_sum.jac +++ /dev/null @@ -1,23 +0,0 @@ -walker test_summarize_single { - can bart_sum.summarize; - report bart_sum.summarize(text="There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", min_length=10); -} - -walker test_summarize_single_percentage { - can bart_sum.summarize; - report bart_sum.summarize(text="There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", min_length=0.2, max_length=0.5); -} - -walker test_summarize_url { - can bart_sum.summarize; - report bart_sum.summarize(url="https://in.mashable.com/"); -} - -walker test_summarize_batch { - can bart_sum.summarize; - report bart_sum.summarize( - ["There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", - "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", - "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude."] - ); -} \ No newline at end of file diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/config.py b/jaseci_ai_kit/jac_nlp/jac_nlp/config.py index 90c0aca872..f13ca30250 100644 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/config.py +++ b/jaseci_ai_kit/jac_nlp/jac_nlp/config.py @@ -1,4 +1,4 @@ -from .action_configs.bart_sum_action_config import BART_SUM_ACTION_CONFIG +from .action_configs.summarization_action_config import SUMMARIZATION_ACTION_CONFIG from .action_configs.bi_enc_action_config import BI_ENC_ACTION_CONFIG from .action_configs.cl_summer_action_config import CL_SUMMER_ACTION_CONFIG from .action_configs.sbert_sim_action_config import SBERT_SIM_ACTION_CONFIG @@ -10,7 +10,7 @@ from .action_configs.sentiment_action_config import SENTIMENT_ACTION_CONFIG ACTION_CONFIGS = { - "bart_sum": BART_SUM_ACTION_CONFIG, + "summarization": SUMMARIZATION_ACTION_CONFIG, "bi_enc": BI_ENC_ACTION_CONFIG, "cl_summer": CL_SUMMER_ACTION_CONFIG, "sbert_sim": SBERT_SIM_ACTION_CONFIG, diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/.gitignore b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/.gitignore similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/.gitignore rename to jaseci_ai_kit/jac_nlp/jac_nlp/summarization/.gitignore diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/README.md b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/README.md similarity index 66% rename from jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/README.md rename to jaseci_ai_kit/jac_nlp/jac_nlp/summarization/README.md index 9d6baa0662..95655e2413 100644 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/README.md +++ b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/README.md @@ -1,29 +1,29 @@ --- -title: Text Summarization with BART +title: Text Summarization --- -# **Bart Summarizer (`bart_sum`)** +# **Summarization (`summarization`)** -Module `bart_sum` uses the `bart-large-cnn` to get the abstractive summary of a text. +Module `summarization` uses the `philschmid/bart-large-cnn-samsum` to get the abstractive summary of a text. -1. Import [`bart_sum`](#1-import-summarizer-bart_sum-module-in-jac) module in jac +1. Import [`summarization`](#1-import-summarizer-summarization-module-in-jac) module in jac 2. [Summarizer](#2-summarizer) # **Walk through** -## **1. Import Summarizer (`bart_sum`) module in jac** +## **1. Import Summarizer (`summarization`) module in jac** 1. For executing jaseci Open terminal and run follow command. ``` jsctl -m ``` -2. Load bart_sum module in jac +2. Load summarization module in jac ``` - actions load module jac_nlp.bart_sum + actions load module jac_nlp.summarization ``` ## **2. Summarizer** -There are 2 ways to use `bart_sum` module. +There are 2 ways to use `summarization` module. 1. Given a text, it will return the summary of the text. 2. Given a web page url, it will return the summary of the web page. @@ -40,15 +40,15 @@ Following example will return the summary of the a single text. ```jac walker test_summarize_single { - can bart_sum.summarize; - report bart_sum.summarize("There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", 10); + can summarization.summarize; + report summarization.summarize("There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", 10); } ``` You can also pass a list of texts to get the summary of all the texts. ```jac walker test_summarize_batch { - can bart_sum.summarize; - report bart_sum.summarize( + can summarization.summarize; + report summarization.summarize( ["There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude."], @@ -62,15 +62,15 @@ Following example will return the summary of the web page. ```jac walker test_summarize_url { - can bart_sum.summarize; - report bart_sum.summarize(null, "https://in.mashable.com/"); + can summarization.summarize; + report summarization.summarize(null, "https://in.mashable.com/"); } ``` ### Setup Parameters -* `tokenizer` - Tokenizer to be used for tokenizing the text. Type: `str` Default: `facebook/bart-large-cnn` -* `model` - Model to be used for summarizing the text. Type: `str` Default: `facebook/bart-large-cnn` + +* `model` - Model to be used for summarizing the text. Type: `str` Default: `philschmid/bart-large-cnn-samsum` # **References** -* [Bart Summarizer](https://huggingface.co/transformers/model_doc/bart.html) -* [Bart Summarizer Paper](https://arxiv.org/abs/1910.13461) \ No newline at end of file +* [Summarization](https://huggingface.co/transformers/model_doc/bart.html) +* [Summarization Paper](https://arxiv.org/abs/1910.13461) \ No newline at end of file diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/__init__.py b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/__init__.py new file mode 100644 index 0000000000..3fcacd3bd6 --- /dev/null +++ b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/__init__.py @@ -0,0 +1 @@ +from .summarization import * # noqa diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/requirements.txt b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/requirements.txt similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/requirements.txt rename to jaseci_ai_kit/jac_nlp/jac_nlp/summarization/requirements.txt diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/bart_sum.py b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/summarization.py similarity index 56% rename from jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/bart_sum.py rename to jaseci_ai_kit/jac_nlp/jac_nlp/summarization/summarization.py index 27f0d2e5e0..f7215d8e0b 100644 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/bart_sum.py +++ b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/summarization.py @@ -1,8 +1,7 @@ -from transformers import BartTokenizer, BartForConditionalGeneration -import torch from typing import List, Union +import torch +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer from jaseci.jsorc.live_actions import jaseci_action -import traceback from fastapi import HTTPException import requests from bs4 import BeautifulSoup @@ -10,40 +9,40 @@ import os device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -BART_SUM_ROOT = model_base_path("jac_nlp/bart_sum") +SUM_ROOT = str(model_base_path("jac_nlp/sum")) + +os.makedirs(SUM_ROOT, exist_ok=True) +global tokenizer, model -@jaseci_action(act_group=["bart_sum"], allow_remote=True) -def setup( - tokenizer: str = "facebook/bart-large-cnn", - model: str = "philschmid/bart-large-cnn-samsum", -): - global bart_tokenizer, bart_model - os.makedirs(BART_SUM_ROOT, exist_ok=True) + +@jaseci_action(act_group=["summarization"], allow_remote=True) +def setup(model_name: str = "philschmid/bart-large-cnn-samsum"): + global tokenizer, model if all( - os.path.isfile(os.path.join(BART_SUM_ROOT, f_name)) + os.path.isfile(os.path.join(SUM_ROOT, f_name)) for f_name in ["vocab.json", "pytorch_model.bin", "config.json"] ): - bart_tokenizer = BartTokenizer.from_pretrained(tokenizer, local_files_only=True) - bart_model = BartForConditionalGeneration.from_pretrained( - model, local_files_only=True + tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True) + model = AutoModelForSeq2SeqLM.from_pretrained( + model_name, local_files_only=True ).to(device) else: - bart_tokenizer = BartTokenizer.from_pretrained(tokenizer) - bart_model = BartForConditionalGeneration.from_pretrained(model).to(device) - bart_model.save_pretrained(BART_SUM_ROOT) - bart_tokenizer.save_vocabulary(BART_SUM_ROOT) + tokenizer = AutoTokenizer.from_pretrained(model_name) + model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device) + model.save_pretrained(SUM_ROOT) + tokenizer.save_vocabulary(SUM_ROOT) + print(f"Loaded model: {model_name}") -@jaseci_action(act_group=["bart_sum"], allow_remote=True) +@jaseci_action(act_group=["summarization"], allow_remote=True) def summarize( - text: Union[List[str], str] = None, # type: ignore - url: str = None, # type: ignore + text: Union[List[str], str] = None, + url: str = None, max_length: Union[int, float] = 1.0, min_length: Union[int, float] = 0.1, num_beams: int = 4, ) -> List[str]: - global bart_tokenizer, bart_model try: if text is not None: if isinstance(text, str): @@ -55,27 +54,24 @@ def summarize( else: raise HTTPException(status_code=400, detail="No text or url provided") - inputs = bart_tokenizer.batch_encode_plus( - text, max_length=1024, return_tensors="pt" - ) + inputs = tokenizer.batch_encode_plus(text, max_length=1024, return_tensors="pt") if not isinstance(max_length, int): max_length = int(inputs["input_ids"].shape[1] * max_length) if not isinstance(min_length, int): min_length = int(inputs["input_ids"].shape[1] * min_length) - summary_ids = bart_model.generate( + summary_ids = model.generate( inputs["input_ids"].to(device), num_beams=num_beams, max_length=max_length, min_length=min_length, early_stopping=True, ) - return bart_tokenizer.batch_decode( + return tokenizer.batch_decode( summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True ) except Exception as e: - traceback.print_exc() raise HTTPException(status_code=500, detail=str(e)) diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/fixtures/__init__.py b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/fixtures/__init__.py similarity index 100% rename from jaseci_ai_kit/jac_nlp/jac_nlp/t5_sum/tests/fixtures/__init__.py rename to jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/fixtures/__init__.py diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/fixtures/summarization.jac b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/fixtures/summarization.jac new file mode 100644 index 0000000000..f488332b53 --- /dev/null +++ b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/fixtures/summarization.jac @@ -0,0 +1,23 @@ +walker test_summarize_single { + can summarization.summarize; + report summarization.summarize(text="There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", min_length=10); +} + +walker test_summarize_single_percentage { + can summarization.summarize; + report summarization.summarize(text="There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", min_length=0.2, max_length=0.5); +} + +walker test_summarize_url { + can summarization.summarize; + report summarization.summarize(url="https://in.mashable.com/"); +} + +walker test_summarize_batch { + can summarization.summarize; + report summarization.summarize( + ["There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", + "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude.", + "There was once a king of Scotland whose name was Robert Bruce. He needed to be both brave and wise because the times in which he lived were wild and rude."] + ); +} \ No newline at end of file diff --git a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/test_bart_sum.py b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/test_summarization.py similarity index 51% rename from jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/test_bart_sum.py rename to jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/test_summarization.py index 794b988222..a6fb81502f 100644 --- a/jaseci_ai_kit/jac_nlp/jac_nlp/bart_sum/tests/test_bart_sum.py +++ b/jaseci_ai_kit/jac_nlp/jac_nlp/summarization/tests/test_summarization.py @@ -2,33 +2,33 @@ from jaseci.jsorc.live_actions import load_module_actions, unload_module -class BartSumTests(CoreTest): +class SummarizationTests(CoreTest): fixture_src = __file__ @classmethod def setUpClass(cls): - super(BartSumTests, cls).setUpClass() - ret = load_module_actions("jac_nlp.bart_sum") - assert ret == True + super(SummarizationTests, cls).setUpClass() + ret = load_module_actions("jac_nlp.summarization") + assert ret is True - @jac_testcase("bart_sum.jac", "test_summarize_single") + @jac_testcase("summarization.jac", "test_summarize_single") def test_summarize_single(self, ret): self.assertEqual(len(ret["report"][0]), 1) - @jac_testcase("bart_sum.jac", "test_summarize_single_percentage") + @jac_testcase("summarization.jac", "test_summarize_single_percentage") def test_summarize_single_percentage(self, ret): self.assertEqual(len(ret["report"][0]), 1) - @jac_testcase("bart_sum.jac", "test_summarize_url") + @jac_testcase("summarization.jac", "test_summarize_url") def test_summarize_url(self, ret): self.assertEqual(len(ret["report"][0]), 1) - @jac_testcase("bart_sum.jac", "test_summarize_batch") + @jac_testcase("summarization.jac", "test_summarize_batch") def test_summarize_batch(self, ret): self.assertEqual(len(ret["report"][0]), 3) @classmethod def tearDownClass(cls): - super(BartSumTests, cls).tearDownClass() - ret = unload_module("jac_nlp.bart_sum.bart_sum") - assert ret == True + super(SummarizationTests, cls).tearDownClass() + ret = unload_module("jac_nlp.summarization.summarization") + assert ret is True diff --git a/jaseci_ai_kit/jac_nlp/setup.py b/jaseci_ai_kit/jac_nlp/setup.py index 6e35340d55..d9208cc717 100644 --- a/jaseci_ai_kit/jac_nlp/setup.py +++ b/jaseci_ai_kit/jac_nlp/setup.py @@ -2,9 +2,8 @@ from os.path import join MODULES = [ - "bart_sum", + "summarization", "sbert_sim", - "t5_sum", "text_seg", "tfm_ner", "use_enc",