From 1a39e55c0379154bd9615b8c7c470dcdd118f049 Mon Sep 17 00:00:00 2001 From: Elena Merdjanovska Date: Wed, 13 Dec 2023 11:05:40 +0100 Subject: [PATCH 1/4] add agnews corpus --- flair/datasets/__init__.py | 1 + flair/datasets/document_classification.py | 70 +++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 2052a7998..eae3f217f 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -108,6 +108,7 @@ GO_EMOTIONS, IMDB, NEWSGROUPS, + AGNEWS, SENTEVAL_CR, SENTEVAL_MPQA, SENTEVAL_MR, diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 2c0d6b341..8eccba49c 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -907,6 +907,76 @@ def __init__( super().__init__(data_folder, tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs) +class AGNEWS(ClassificationCorpus): + """ + The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics + (World, Sports, Business, Sci/Tech). + """ + + def __init__( + self, + base_path: Optional[Union[str, Path]] = None, + tokenizer: Union[bool, Tokenizer] = SpaceTokenizer(), + memory_mode="partial", + **corpusargs, + ): + """ + Instantiates AGNews Classification Corpus with 4 classes. + :param base_path: Provide this only if you store the AGNEWS corpus in a specific folder, otherwise use default. + :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) + :param memory_mode: Set to 'partial' by default. Can also be 'full' or 'none'. + :param corpusargs: Other args for ClassificationCorpus. + """ + base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) + + dataset_name = self.__class__.__name__.lower() + + data_folder = base_path / dataset_name + + # download data from same source as in huggingface's implementations + agnews_path = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/" + + original_filenames = ["train.csv", "test.csv",'classes.txt'] + new_filenames = ["train.txt", "test.txt"] + + for original_filename in original_filenames: + cached_path(f"{agnews_path}{original_filename}", Path("datasets") / dataset_name / "original") + + data_file = data_folder / new_filenames[0] + label_dict = [] + label_path = original_filenames[-1] + + #read label order + with open(data_folder / "original" / label_path, "rt") as f: + for line in f: + line = line.rstrip() + label_dict.append(line) + + original_filenames=original_filenames[:-1] + if not data_file.is_file(): + for original_filename, new_filename in zip(original_filenames, new_filenames): + with open(data_folder / "original" / original_filename, "rt", encoding="utf-8") as open_fp: + with open(data_folder / new_filename, "wt", encoding="utf-8") as write_fp: + csv_reader = csv.reader( + open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True + ) + for id_, row in enumerate(csv_reader): + label, title, description = row + # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech'] + # Re-map to [0, 1, 2, 3]. + label = int(label) - 1 + text = " ".join((title, description)) + + + new_label = "__label__" + new_label += label_dict[label] + + write_fp.write(f"{new_label} {text}\n") + + super(AGNEWS, self).__init__( + data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs + ) + class STACKOVERFLOW(ClassificationCorpus): """Stackoverflow corpus classifying questions into one of 20 labels. From 6382c476295d3b68aa85525687c58817a179f6c7 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Sun, 17 Dec 2023 08:47:58 +0100 Subject: [PATCH 2/4] Ruff fixes --- flair/datasets/__init__.py | 3 ++- flair/datasets/document_classification.py | 28 ++++++++++------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index eae3f217f..d8edd4445 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -100,6 +100,7 @@ # Expose all document classification datasets from .document_classification import ( + AGNEWS, AMAZON_REVIEWS, COMMUNICATIVE_FUNCTIONS, GERMEVAL_2018_OFFENSIVE_LANGUAGE, @@ -108,7 +109,6 @@ GO_EMOTIONS, IMDB, NEWSGROUPS, - AGNEWS, SENTEVAL_CR, SENTEVAL_MPQA, SENTEVAL_MR, @@ -315,6 +315,7 @@ "SentenceDataset", "MongoDataset", "StringDataset", + "AGNEWS", "ANAT_EM", "AZDZ", "BC2GM", diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 8eccba49c..ad9291b43 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -908,8 +908,7 @@ def __init__( class AGNEWS(ClassificationCorpus): - """ - The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics + """The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics (World, Sports, Business, Sci/Tech). """ @@ -920,8 +919,7 @@ def __init__( memory_mode="partial", **corpusargs, ): - """ - Instantiates AGNews Classification Corpus with 4 classes. + """Instantiates AGNews Classification Corpus with 4 classes. :param base_path: Provide this only if you store the AGNEWS corpus in a specific folder, otherwise use default. :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) :param memory_mode: Set to 'partial' by default. Can also be 'full' or 'none'. @@ -936,27 +934,27 @@ def __init__( # download data from same source as in huggingface's implementations agnews_path = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/" - original_filenames = ["train.csv", "test.csv",'classes.txt'] + original_filenames = ["train.csv", "test.csv", "classes.txt"] new_filenames = ["train.txt", "test.txt"] - + for original_filename in original_filenames: cached_path(f"{agnews_path}{original_filename}", Path("datasets") / dataset_name / "original") data_file = data_folder / new_filenames[0] label_dict = [] label_path = original_filenames[-1] - - #read label order - with open(data_folder / "original" / label_path, "rt") as f: + + # read label order + with open(data_folder / "original" / label_path) as f: for line in f: line = line.rstrip() label_dict.append(line) - original_filenames=original_filenames[:-1] + original_filenames = original_filenames[:-1] if not data_file.is_file(): for original_filename, new_filename in zip(original_filenames, new_filenames): - with open(data_folder / "original" / original_filename, "rt", encoding="utf-8") as open_fp: - with open(data_folder / new_filename, "wt", encoding="utf-8") as write_fp: + with open(data_folder / "original" / original_filename, encoding="utf-8") as open_fp: + with open(data_folder / new_filename, "w", encoding="utf-8") as write_fp: csv_reader = csv.reader( open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True ) @@ -967,15 +965,13 @@ def __init__( label = int(label) - 1 text = " ".join((title, description)) - new_label = "__label__" new_label += label_dict[label] write_fp.write(f"{new_label} {text}\n") - super(AGNEWS, self).__init__( - data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs - ) + super().__init__(data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs) + class STACKOVERFLOW(ClassificationCorpus): """Stackoverflow corpus classifying questions into one of 20 labels. From d0441d7d72e4cbef0300a8f1ddb564f7ced0e84c Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Sun, 17 Dec 2023 09:24:47 +0100 Subject: [PATCH 3/4] Make ruff happy --- flair/datasets/document_classification.py | 35 ++++++++++++----------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index ad9291b43..97bb19432 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -908,8 +908,9 @@ def __init__( class AGNEWS(ClassificationCorpus): - """The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics - (World, Sports, Business, Sci/Tech). + """The AG's News Topic Classification Corpus, classifying news into 4 coarse-grained topics. + + Labels: World, Sports, Business, Sci/Tech. """ def __init__( @@ -920,6 +921,7 @@ def __init__( **corpusargs, ): """Instantiates AGNews Classification Corpus with 4 classes. + :param base_path: Provide this only if you store the AGNEWS corpus in a specific folder, otherwise use default. :param tokenizer: Custom tokenizer to use (default is SpaceTokenizer) :param memory_mode: Set to 'partial' by default. Can also be 'full' or 'none'. @@ -953,22 +955,23 @@ def __init__( original_filenames = original_filenames[:-1] if not data_file.is_file(): for original_filename, new_filename in zip(original_filenames, new_filenames): - with open(data_folder / "original" / original_filename, encoding="utf-8") as open_fp: - with open(data_folder / new_filename, "w", encoding="utf-8") as write_fp: - csv_reader = csv.reader( - open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True - ) - for id_, row in enumerate(csv_reader): - label, title, description = row - # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech'] - # Re-map to [0, 1, 2, 3]. - label = int(label) - 1 - text = " ".join((title, description)) + with open(data_folder / "original" / original_filename, encoding="utf-8") as open_fp, open( + data_folder / new_filename, "w", encoding="utf-8" + ) as write_fp: + csv_reader = csv.reader( + open_fp, quotechar='"', delimiter=",", quoting=csv.QUOTE_ALL, skipinitialspace=True + ) + for id_, row in enumerate(csv_reader): + label, title, description = row + # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech'] + # Re-map to [0, 1, 2, 3]. + label = int(label) - 1 + text = " ".join((title, description)) - new_label = "__label__" - new_label += label_dict[label] + new_label = "__label__" + new_label += label_dict[label] - write_fp.write(f"{new_label} {text}\n") + write_fp.write(f"{new_label} {text}\n") super().__init__(data_folder, label_type="topic", tokenizer=tokenizer, memory_mode=memory_mode, **corpusargs) From f1807714635487c03e27546d8cc1421cdd9373a9 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Sun, 17 Dec 2023 10:20:18 +0100 Subject: [PATCH 4/4] Make mypy happy --- flair/datasets/document_classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 97bb19432..0bbc47181 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -965,11 +965,10 @@ def __init__( label, title, description = row # Original labels are [1, 2, 3, 4] -> ['World', 'Sports', 'Business', 'Sci/Tech'] # Re-map to [0, 1, 2, 3]. - label = int(label) - 1 text = " ".join((title, description)) new_label = "__label__" - new_label += label_dict[label] + new_label += label_dict[int(label) - 1] write_fp.write(f"{new_label} {text}\n")