From 1a39e55c0379154bd9615b8c7c470dcdd118f049 Mon Sep 17 00:00:00 2001
From: Elena Merdjanovska <elenamerdzanovska@yahoo.com>
Date: Wed, 13 Dec 2023 11:05:40 +0100
Subject: [PATCH 1/4] Add AGNEWS corpus

---
 flair/datasets/__init__.py                |  1 +
 flair/datasets/document_classification.py | 70 +++++++++++++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index 2052a7998..eae3f217f 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -108,6 +108,7 @@
     GO_EMOTIONS,
     IMDB,
     NEWSGROUPS,
+    AGNEWS,
     SENTEVAL_CR,
     SENTEVAL_MPQA,
     SENTEVAL_MR,
diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py
index 2c0d6b341..8eccba49c 100644
--- a/flair/datasets/document_classification.py
+++ b/flair/datasets/document_classification.py
@@ -907,6 +907,76 @@ def __init__(
         super().__init__(data_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs)
 
 
+class AGNEWS(ClassificationCorpus):
+    """
+    The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics
+    (World, Sports, Business, Sci/Tech).
+    """
+
+    def __init__(
+        self,
+        base_path: Optional[Union[str, Path]] = None,
+        tokenizer: Union[bool, Tokenizer] = SpaceTokenizer(),
+        memory_mode="partial",
+        **corpusargs,
+    ):
+        """
+        Instantiates AGNews Classification Corpus with 4 classes.
+        :param base_path: Provide this only if you store the AGNEWS corpus in a specific folder, otherwise use default.
+        :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
+        :param memory_mode: Set to 'partial' by default. Can also be 'full' or 'none'.
+        :param corpusargs: Other args for ClassificationCorpus.
+        """
+        base_path = flair.cache_root / "datasets" if not base_path else Path(base_path)
+
+        dataset_name = self.__class__.__name__.lower()
+
+        data_folder = base_path / dataset_name
+
+        # download data from same source as in huggingface's implementations
+        agnews_path = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/"
+
+        original_filenames = ["train.csv", "test.csv",'classes.txt']
+        new_filenames = ["train.txt", "test.txt"]
+        
+        for original_filename in original_filenames:
+            cached_path(f"{agnews_path}{original_filename}", Path("datasets") / dataset_name / "original")
+
+        data_file = data_folder / new_filenames[0]
+        label_dict = []
+        label_path = original_filenames[-1]
+        
+        #read label order
+        with open(data_folder / "original" / label_path, "rt") as f:
+            for line in f:
+                line = line.rstrip()
+                label_dict.append(line)
+
+        original_filenames=original_filenames[:-1]
+        if not data_file.is_file():
+            for original_filename, new_filename in zip(original_filenames, new_filenames):
+                with open(data_folder / "original" / original_filename, "rt", encoding="utf-8") as open_fp:
+                    with open(data_folder / new_filename, "wt", encoding="utf-8") as write_fp:
+                        csv_reader = csv.reader(
+                            open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
+                        )
+                        for id_, row in enumerate(csv_reader):
+                            label, title, description = row
+                            # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech']
+                            # Re-map to [0, 1, 2, 3].
+                            label = int(label) - 1
+                            text = " ".join((title, description))
+
+
+                            new_label = "__label__"
+                            new_label += label_dict[label]
+
+                            write_fp.write(f"{new_label} {text}\n")
+
+        super(AGNEWS, self).__init__(
+            data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs
+        )
+
 class STACKOVERFLOW(ClassificationCorpus):
     """Stackoverflow corpus classifying questions into one of 20 labels.
 

From 6382c476295d3b68aa85525687c58817a179f6c7 Mon Sep 17 00:00:00 2001
From: Alan Akbik <alan.akbik@gmail.com>
Date: Sun, 17 Dec 2023 08:47:58 +0100
Subject: [PATCH 2/4] Ruff fixes

---
 flair/datasets/__init__.py                |  3 ++-
 flair/datasets/document_classification.py | 28 ++++++++++-------------
 2 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
index eae3f217f..d8edd4445 100644
--- a/flair/datasets/__init__.py
+++ b/flair/datasets/__init__.py
@@ -100,6 +100,7 @@
 
 # Expose all document classification datasets
 from .document_classification import (
+    AGNEWS,
     AMAZON_REVIEWS,
     COMMUNICATIVE_FUNCTIONS,
     GERMEVAL_2018_OFFENSIVE_LANGUAGE,
@@ -108,7 +109,6 @@
     GO_EMOTIONS,
     IMDB,
     NEWSGROUPS,
-    AGNEWS,
     SENTEVAL_CR,
     SENTEVAL_MPQA,
     SENTEVAL_MR,
@@ -315,6 +315,7 @@
     "SentenceDataset",
     "MongoDataset",
     "StringDataset",
+    "AGNEWS",
     "ANAT_EM",
     "AZDZ",
     "BC2GM",
diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py
index 8eccba49c..ad9291b43 100644
--- a/flair/datasets/document_classification.py
+++ b/flair/datasets/document_classification.py
@@ -908,8 +908,7 @@ def __init__(
 
 
 class AGNEWS(ClassificationCorpus):
-    """
-    The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics
+    """The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics
     (World, Sports, Business, Sci/Tech).
     """
 
@@ -920,8 +919,7 @@ def __init__(
         memory_mode="partial",
         **corpusargs,
     ):
-        """
-        Instantiates AGNews Classification Corpus with 4 classes.
+        """Instantiates AGNews Classification Corpus with 4 classes.
         :param base_path: Provide this only if you store the AGNEWS corpus in a specific folder, otherwise use default.
         :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
         :param memory_mode: Set to 'partial' by default. Can also be 'full' or 'none'.
@@ -936,27 +934,27 @@ def __init__(
         # download data from same source as in huggingface's implementations
         agnews_path = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/"
 
-        original_filenames = ["train.csv", "test.csv",'classes.txt']
+        original_filenames = ["train.csv", "test.csv", "classes.txt"]
         new_filenames = ["train.txt", "test.txt"]
-        
+
         for original_filename in original_filenames:
             cached_path(f"{agnews_path}{original_filename}", Path("datasets") / dataset_name / "original")
 
         data_file = data_folder / new_filenames[0]
         label_dict = []
         label_path = original_filenames[-1]
-        
-        #read label order
-        with open(data_folder / "original" / label_path, "rt") as f:
+
+        # read label order
+        with open(data_folder / "original" / label_path, encoding="utf-8") as f:
             for line in f:
                 line = line.rstrip()
                 label_dict.append(line)
 
-        original_filenames=original_filenames[:-1]
+        original_filenames = original_filenames[:-1]
         if not data_file.is_file():
             for original_filename, new_filename in zip(original_filenames, new_filenames):
-                with open(data_folder / "original" / original_filename, "rt", encoding="utf-8") as open_fp:
-                    with open(data_folder / new_filename, "wt", encoding="utf-8") as write_fp:
+                with open(data_folder / "original" / original_filename, encoding="utf-8") as open_fp:
+                    with open(data_folder / new_filename, "w", encoding="utf-8") as write_fp:
                         csv_reader = csv.reader(
                             open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
                         )
@@ -967,15 +965,13 @@ def __init__(
                             label = int(label) - 1
                             text = " ".join((title, description))
 
-
                             new_label = "__label__"
                             new_label += label_dict[label]
 
                             write_fp.write(f"{new_label} {text}\n")
 
-        super(AGNEWS, self).__init__(
-            data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs
-        )
+        super().__init__(data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs)
+
 
 class STACKOVERFLOW(ClassificationCorpus):
     """Stackoverflow corpus classifying questions into one of 20 labels.

From d0441d7d72e4cbef0300a8f1ddb564f7ced0e84c Mon Sep 17 00:00:00 2001
From: Alan Akbik <alan.akbik@gmail.com>
Date: Sun, 17 Dec 2023 09:24:47 +0100
Subject: [PATCH 3/4] Make ruff happy: docstring summary lines, merged with-statements

---
 flair/datasets/document_classification.py | 35 ++++++++++++-----------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py
index ad9291b43..97bb19432 100644
--- a/flair/datasets/document_classification.py
+++ b/flair/datasets/document_classification.py
@@ -908,8 +908,9 @@ def __init__(
 
 
 class AGNEWS(ClassificationCorpus):
-    """The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics
-    (World, Sports, Business, Sci/Tech).
+    """The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics.
+
+    Labels: World, Sports, Business, Sci/Tech.
     """
 
     def __init__(
@@ -920,6 +921,7 @@ def __init__(
         **corpusargs,
     ):
         """Instantiates AGNews Classification Corpus with 4 classes.
+
         :param base_path: Provide this only if you store the AGNEWS corpus in a specific folder, otherwise use default.
         :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer)
         :param memory_mode: Set to 'partial' by default. Can also be 'full' or 'none'.
@@ -953,22 +955,23 @@ def __init__(
         original_filenames = original_filenames[:-1]
         if not data_file.is_file():
             for original_filename, new_filename in zip(original_filenames, new_filenames):
-                with open(data_folder / "original" / original_filename, encoding="utf-8") as open_fp:
-                    with open(data_folder / new_filename, "w", encoding="utf-8") as write_fp:
-                        csv_reader = csv.reader(
-                            open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
-                        )
-                        for id_, row in enumerate(csv_reader):
-                            label, title, description = row
-                            # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech']
-                            # Re-map to [0, 1, 2, 3].
-                            label = int(label) - 1
-                            text = " ".join((title, description))
+                with open(data_folder / "original" / original_filename, encoding="utf-8") as open_fp, open(
+                    data_folder / new_filename, "w", encoding="utf-8"
+                ) as write_fp:
+                    csv_reader = csv.reader(
+                        open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True
+                    )
+                    for row in csv_reader:
+                        label, title, description = row
+                        # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech']
+                        # Re-map to [0, 1, 2, 3].
+                        label = int(label) - 1
+                        text = " ".join((title, description))
 
-                            new_label = "__label__"
-                            new_label += label_dict[label]
+                        new_label = "__label__"
+                        new_label += label_dict[label]
 
-                            write_fp.write(f"{new_label} {text}\n")
+                        write_fp.write(f"{new_label} {text}\n")
 
         super().__init__(data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs)
 

From f1807714635487c03e27546d8cc1421cdd9373a9 Mon Sep 17 00:00:00 2001
From: Alan Akbik <alan.akbik@gmail.com>
Date: Sun, 17 Dec 2023 10:20:18 +0100
Subject: [PATCH 4/4] Make mypy happy: do not rebind str label to int

---
 flair/datasets/document_classification.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py
index 97bb19432..0bbc47181 100644
--- a/flair/datasets/document_classification.py
+++ b/flair/datasets/document_classification.py
@@ -965,11 +965,10 @@ def __init__(
                         label, title, description = row
                         # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech']
                         # Re-map to [0, 1, 2, 3].
-                        label = int(label) - 1
                         text = " ".join((title, description))
 
                         new_label = "__label__"
-                        new_label += label_dict[label]
+                        new_label += label_dict[int(label) - 1]
 
                         write_fp.write(f"{new_label} {text}\n")