From af8f5abc74df3c478e287c17de5124c73ea0277b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 01/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 8f20886c1516c22bbb597e9c1a74b5eb79dac954 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 02/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3008 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1553 insertions(+), 1520 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from .sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from 
.sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7dc950dba..02e0a5800 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -267,6 +267,56 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. + http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -299,6 +349,36 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt" + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -449,21 +529,123 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
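A minimal sketch of the rewrite this helper performs, assuming a hypothetical whitespace-separated three-column file with the NER tag in the default column 1. Only the tag column changes and '0' is normalized to 'O'; this variant only ever writes I- prefixes, while the add_IOB2_tags helper defined next also emits B- at chunk starts:

    before add_IOB_tags:              after add_IOB_tags(ner_column=1):
        John     PER   NNP                John I-PER NNP
        visited  0     VBD                visited O VBD
        Berlin   LOC   NNP                Berlin I-LOC NNP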
-class WNUT_2020_NER(ColumnCorpus): +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') + + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -482,65 +664,40 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -551,45 +708,52 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) - + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil -class TWITTER_NER(ColumnCorpus): + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) + + +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -600,43 +764,61 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#" ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -647,125 +829,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll' ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -773,19 +855,18 @@ def __init__( in_memory: bool = True, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -795,41 +876,36 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -839,77 +915,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -918,112 +951,136 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll' + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
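Note that the raw .bio files put the label in the first column and the token in the second, which is why the column map defined just below is {0: "ner", 1: "text"} rather than the usual token-first order. An illustrative, hypothetical fragment of such a file:

    O          show
    O          me
    B-GENRE    science
    I-GENRE    fiction
    O          movies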
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme' - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1031,405 +1088,140 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, 
Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann' + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': 
'1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 
- 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': 
'1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': 
'1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1440,61 +1232,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, - in_memory=in_memory, comment_symbol="#" + super(NER_SWEDISH, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
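A minimal usage sketch for the NER_SWEDISH loader added above, assuming the default cache root under flair.cache_root and the IOB2 conversion performed at download time (the class name and the "ner" tag type come from the code above, everything else is illustrative):

from flair.datasets import NER_SWEDISH

# first call downloads train_corpus.txt / test_corpus.txt and rewrites them with IOB2 tags
corpus = NER_SWEDISH()
print(corpus)  # sentence counts per split; a dev split is typically sampled from train since none is shipped
ner_dictionary = corpus.make_tag_dictionary(tag_type="ner")
print(ner_dictionary)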
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1505,44 +1271,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll' + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1552,24 +1309,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1591,35 +1341,33 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1635,18 +1383,29 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll' + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1655,15 +1414,14 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
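A minimal usage sketch for the Finnish TURKU_NER corpus wired up above, assuming the default cache root (illustrative only):

from flair.datasets import TURKU_NER

corpus = TURKU_NER()  # downloads train.tsv, dev.tsv and test.tsv on the first call
print(corpus)
print(corpus.train[0].to_tagged_string("ner"))  # first training sentence with its NER tags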
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1671,7 +1429,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1682,32 +1440,41 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1718,44 +1485,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1766,48 +1534,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1818,35 +1583,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1856,29 +1631,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. 
+ :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1888,27 +1680,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1919,25 +1730,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1948,25 +1779,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,25 +1828,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the WEIBO_NER corpus . 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2006,192 +1879,449 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): + """ + WikiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreviations, e.g. "en" for English. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you don't want that, set it to None.
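A minimal usage sketch for the WIKIANN loader documented above, assuming the optional gdown package that the downloader imports is installed (illustrative only):

from flair.datasets import WIKIANN

# one tar.gz per language is fetched from Google Drive, unpacked and simplified to two columns
corpus = WIKIANN(languages=["en", "de"])
print(corpus)  # a MultiCorpus holding one ColumnCorpus per requested language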
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' + # download data if necessary + first = True + for language in languages: -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not 
language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WIKIANN, self).__init__( + corpora, name='wikiann' ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 
'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': 
'1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': 
'18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': 
'1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2200,12 +2330,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2213,7 +2342,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2224,117 +2353,32 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class BIOSCOPE(ColumnCorpus): - - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "tag"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt" - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil - - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", - ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") -class UP_CHINESE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Chinese dataset from the Universal 
Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2345,92 +2389,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("en", dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ENGLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 10: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - ) -class UP_FRENCH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2441,44 +2418,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("de", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2489,44 +2447,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2537,44 +2476,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2585,44 +2505,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2633,44 +2534,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2681,127 +2563,83 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERComplex(ColumnCorpus): + +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2812,22 +2650,19 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(SEC_FILLINGS, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class TURKU_NER(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2836,12 +2671,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
:param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2860,23 +2694,201 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(TURKU_NER, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) \ No newline at end of file + ) + + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): + def __init__( + self, + languages: Union[str, List[str]] = None, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, + ): + """ + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. 
+ "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = "xtreme" + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + + # download data if necessary + for language in languages: + + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme' + ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + 
f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. 
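+
+As a minimal, illustrative sketch (the constructor pattern mirrors the `UP_*` classes added in this patch), one of these corpora can be loaded by its ID, for instance the German Universal Proposition Bank:
+
+```python
+import flair.datasets
+
+# downloads the data on the first call, then loads the train/dev/test splits
+corpus = flair.datasets.UP_GERMAN()
+
+# print basic corpus statistics
+print(corpus)
+```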
+ +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From d256d947b55cb8f057a7606f0de057afd0bb4c1d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 03/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From 541e0a8fed226082d51df09f86ced3e8b6fd05d0 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 04/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. 
```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. -### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From d5f951a943ff905ae91a333f401f421b5442ddd9 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 05/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From dff58a08da42de9dfe3d95fdc3808fb250c0e664 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 06/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. 
Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From 8ce61612ab9ad02a99bbdc91339059b40bda0644 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 07/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 1a3bcce5d4c82a9f18ef11eb76024dfd3f931ea6 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 08/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3008 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1553 insertions(+), 1520 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from .sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import 
WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from .sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7dc950dba..02e0a5800 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -267,6 +267,56 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. + http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -299,6 +349,36 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt" + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -449,21 +529,123 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
-class WNUT_2020_NER(ColumnCorpus): +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') + + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -482,65 +664,40 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -551,45 +708,52 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) - + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil -class TWITTER_NER(ColumnCorpus): + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) + + +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -600,43 +764,61 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#" ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
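+ A minimal usage sketch (illustrative only; the 10% downsampling ratio is an arbitrary example to keep memory usage low):
+
+ >>> corpus = EUROPARL_NER_GERMAN()
+ >>> print(corpus)
+ >>> smaller_corpus = corpus.downsample(0.1)
+ >>> print(smaller_corpus)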
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -647,125 +829,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll' ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -773,19 +855,18 @@ def __init__( in_memory: bool = True, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -795,41 +876,36 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -839,77 +915,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -918,112 +951,136 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll' + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
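+ A minimal usage sketch (illustrative only):
+
+ >>> corpus = MIT_MOVIE_NER_SIMPLE()
+ >>> print(corpus)
+ >>> print(corpus.train[0].to_tagged_string("ner"))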
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme' - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1031,405 +1088,140 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, 
Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann' + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': 
'1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 
- 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': 
'1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': 
'1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
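+ A minimal usage sketch (illustrative only; the downloaded files are converted to IOB2 tags by this constructor, see add_IOB2_tags above):
+
+ >>> corpus = NER_SWEDISH()
+ >>> print(corpus)
+ >>> print(corpus.train[0].to_tagged_string("ner"))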
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1440,61 +1232,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, - in_memory=in_memory, comment_symbol="#" + super(NER_SWEDISH, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1505,44 +1271,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll' + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1552,24 +1309,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1591,35 +1341,33 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1635,18 +1383,29 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll' + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1655,15 +1414,14 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1671,7 +1429,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1682,32 +1440,41 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
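TWITTER_NER only downloads a single annotated file, ner.txt, and registers it as the train file; flair's Corpus base class then samples dev and test splits from it. A hedged sketch of that behaviour (the 10% downsample is purely illustrative):

from flair.datasets import TWITTER_NER

corpus = TWITTER_NER()
print(corpus)  # dev and test are split off from ner.txt automatically

# optional: shrink the corpus while experimenting
small_corpus = corpus.downsample(0.1)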
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1718,44 +1485,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1766,48 +1534,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1818,35 +1583,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1856,29 +1631,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. 
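The Universal Propositions loaders collected here all follow one pattern: column 1 of the CoNLL-U file is the token text, the frame label sits in column 9 (column 10 for the English EWT layout), and comment_symbol="#" skips the CoNLL-U metadata lines. A minimal sketch of loading one of them (standard flair usage, not part of the patch):

from flair.datasets import UP_FRENCH

corpus = UP_FRENCH()  # downloads fr-up-{train,dev,test}.conllu on first call
print(corpus.train[0].to_tagged_string("frame"))  # tokens with their frame labels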
+ :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1888,27 +1680,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
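Because the label column of UP_GERMAN (and its siblings) is registered under the tag type "frame", downstream code has to use that name when building label dictionaries or taggers. A short sketch (standard flair API, shown only as an assumed workflow):

from flair.datasets import UP_GERMAN

corpus = UP_GERMAN()
frame_dictionary = corpus.make_tag_dictionary(tag_type="frame")
print(len(frame_dictionary))  # number of distinct frame labels in the corpus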
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1919,25 +1730,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1948,25 +1779,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,25 +1828,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the WEIBO_NER corpus . 
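With both UP_SPANISH and UP_SPANISH_ANCORA available, the two Spanish propbanks can be combined through flair's existing MultiCorpus class. Combining them is just one possible use, not something the patch prescribes; a hedged sketch:

from flair.data import MultiCorpus
from flair.datasets import UP_SPANISH, UP_SPANISH_ANCORA

corpus = MultiCorpus([UP_SPANISH(), UP_SPANISH_ANCORA()])
print(corpus)  # aggregated train/dev/test counts over both corpora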
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2006,192 +1879,449 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): + """ + WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. 
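WEIBO_NER above reads the three weiboNER_2nd_conll_format files and, since tag_to_bioes defaults to "ner", converts the annotation to BIOES while loading. A minimal usage sketch (standard flair calls, not part of the patch):

from flair.datasets import WEIBO_NER

corpus = WEIBO_NER()
ner_dictionary = corpus.make_tag_dictionary(tag_type="ner")
print(ner_dictionary)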
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' + # download data if necessary + first = True + for language in languages: -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not 
language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WIKIANN, self).__init__( + corpora, name='wikiann' ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 
'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': 
'1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': 
'18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': 
'1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2200,12 +2330,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
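Taken together, the WIKIANN class, the silver_standard_to_simple_ner_annotation converter and the Google Drive ID table above let any subset of the WikiAnn languages be loaded into a single MultiCorpus. A hedged usage sketch (the gdown package is needed at download time, as in the code above; the language codes are the WikiAnn abbreviations):

from flair.datasets import WIKIANN

# first call downloads and converts wikiann-en.bio and wikiann-de.bio,
# then wraps the two resulting ColumnCorpus objects in one MultiCorpus
corpus = WIKIANN(languages=["en", "de"])
print(corpus)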
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2213,7 +2342,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2224,117 +2353,32 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class BIOSCOPE(ColumnCorpus): - - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "tag"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt" - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil - - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", - ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") -class UP_CHINESE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Chinese dataset from the Universal 
Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2345,92 +2389,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("en", dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ENGLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
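The WIKINER_* loaders being reordered here all share the three-column format (text, pos, ner) and default to in_memory=False because the silver-standard corpora are large. A short sketch for the English variant (standard flair usage; the existing _download_wikiner helper handles the bz2 download and unpacking):

from flair.datasets import WIKINER_ENGLISH

corpus = WIKINER_ENGLISH()  # kept on disk, since in_memory defaults to False
print(corpus.train[0].to_tagged_string("ner"))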
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 10: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - ) -class UP_FRENCH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2441,44 +2418,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("de", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2489,44 +2447,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2537,44 +2476,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2585,44 +2505,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2633,44 +2534,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2681,127 +2563,83 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERComplex(ColumnCorpus): + +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2812,22 +2650,19 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(SEC_FILLINGS, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class TURKU_NER(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2836,12 +2671,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
:param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2860,23 +2694,201 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(TURKU_NER, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) \ No newline at end of file + ) + + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): + def __init__( + self, + languages: Union[str, List[str]] = None, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, + ): + """ + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. 
+ "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = "xtreme" + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + + # download data if necessary + for language in languages: + + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme' + ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + 
f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. 
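+
+As a brief usage sketch (the constructor names exported from `flair.datasets` match the IDs listed
+in the table below), a proposition bank can be loaded like any other corpus; the data is
+auto-downloaded on the first call:
+
+```python
+from flair.datasets import UP_GERMAN
+
+# load the German Universal Propositions corpus (downloaded automatically on first use)
+corpus = UP_GERMAN()
+
+# print the corpus object to see the number of train, dev and test sentences
+print(corpus)
+```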
+ +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From 08e027cfd5f42ee9b82220ef769f8f814b6ff7fb Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 09/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From fa854426b7eb9c7d2285ab514048a5db8775de3d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 10/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. 
```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. -### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From d02ad73c5485b5bcada7cd0462737f4a6921d53a Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 11/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From 2ab3139d90d7bd9ce2ed36033077483b5b8c2459 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 12/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. 
Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From ce9904a09e796d186537f14b48af4f4e3179eee8 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 10:52:55 +0100 Subject: [PATCH 13/35] GH-1983: move distance classifier to diagnostics module --- flair/models/__init__.py | 1 - flair/models/text_classification_model.py | 486 +--------------------- 2 files changed, 1 insertion(+), 486 deletions(-) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index 16a09af1c..784b038a9 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -1,4 +1,3 @@ from .sequence_tagger_model import SequenceTagger, MultiTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier -from .text_classification_model import DistClassifier diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 00115d2aa..7e0dab976 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -7,7 +7,6 @@ from torch.utils.data.dataset import Dataset from tqdm import tqdm import numpy as np -from math import floor import sklearn.metrics as metrics from sklearn.metrics.pairwise import cosine_similarity @@ -17,12 +16,7 @@ from flair.data import Dictionary, Sentence, Label, DataPoint from flair.datasets import SentenceDataset, DataLoader from flair.file_utils import cached_path -from flair.training_utils import ( - MetricRegression, - convert_labels_to_one_hot, - Result, - store_embeddings, -) +from flair.training_utils import convert_labels_to_one_hot, Result, store_embeddings log = logging.getLogger("flair") @@ -947,481 +941,3 @@ def _fetch_model(model_name) -> str: model_name = cached_path(model_map[model_name], cache_dir=cache_dir) return model_name - - - -class DistClassifier(flair.nn.Model): - """ - DistClassifier - Model to predict distance between two words given their embeddings. Takes (contextual) word embedding as input. - The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. - Note: When used for training the batch size must be set to 1!!! - """ - - def __init__( - self, - word_embeddings: flair.embeddings.TokenEmbeddings, - max_distance: int = 20, - beta: float = 1.0, - loss_max_weight: float = 1, - regression = False, - regr_loss_step = 0 - ): - """ - Initializes a DistClassifier - :param word_embeddings: embeddings used to embed each sentence - .param max_distance: max dist between word pairs = number of predicted classes - 1 - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_max_weight: Only for classification: Since small distances between word pairs occur mor frequent it makes sense to give them less weight - in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >=1 - The other weights decrease with equidistant steps from high to low distance. - :param regression: if True the class does regression instead of classification - :param regr_loss_step: if > 0, the MSE-Loss in regression will be weighted. 
Word pairs with - distance 0 have weight 1. Then, as the distance increases, the weight in the loss function, - increases step by step with size regr_loss_step - """ - - super(DistClassifier, self).__init__() - - self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings - - self.beta = beta - - self.loss_max_weight = loss_max_weight - - self.regression = regression - - self.regr_loss_step = regr_loss_step - - if not regression: - self.max_distance = max_distance - - # weights for loss function - if self.loss_max_weight > 1: - step = (self.loss_max_weight - 1) / self.max_distance - - weight_list = [1. + i * step for i in range(self.max_distance + 1)] - - self.loss_weights = torch.FloatTensor(weight_list).to(flair.device) - - else: - self.loss_weights = None - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is max_distance + 1, i.e. we allow 0,1,...,max_distance words between pairs - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, self.max_distance + 1) - - self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights) - - # regression - else: - self.max_distance = float('inf') - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is 1 - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, 1) - - if regr_loss_step > 0: - self.loss_function = self.weighted_mse_loss - else: - self.loss_function = nn.MSELoss() - - nn.init.xavier_uniform_(self.decoder.weight) - - # auto-spawn on GPU if available - self.to(flair.device) - - - # all input should be tensors - def weighted_mse_loss(self,predictions, target): - - weight = 1 + self.regr_loss_step * target - - return (weight * ((predictions - target) ** 2)).mean() - - - # forward allows only a single sentcence!! 
- def forward(self, sentence: Sentence): - - # embed words of sentence - self.word_embeddings.embed(sentence) - - # go through all pairs of words with a maximum number of max_distance in between - numberOfWords = len(sentence) - text_embedding_list = [] - # go through all pairs - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) - - # 2-dim matrix whose rows are the embeddings of word pairs of the sentence - text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) - - label_scores = self.decoder(text_embedding_tensor) - - if self.regression: - return label_scores.squeeze(1) - - return label_scores - - def _get_state_dict(self): - model_state = { - "state_dict": self.state_dict(), - "word_embeddings": self.word_embeddings, - "max_distance": self.max_distance, - "beta": self.beta, - "loss_max_weight": self.loss_max_weight, - "regression": self.regression, - "regr_loss_step": self.regr_loss_step - } - return model_state - - @staticmethod - def _init_model_with_state_dict(state): - beta = 1.0 if "beta" not in state.keys() else state["beta"] - weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] - - model = DistClassifier( - word_embeddings=state["word_embeddings"], - max_distance=state["max_distance"], - beta=beta, - loss_max_weight=weight, - regression=state["regression"], - regr_loss_step=state["regr_loss_step"] - ) - - model.load_state_dict(state["state_dict"]) - return model - - # So far only one sentence allowed - # If list of sentences is handed the function works with the first sentence of the list - def forward_loss( - self, data_points: Union[List[Sentence], Sentence] - ) -> torch.tensor: - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - if len(data_points) < 2: - return torch.tensor([0.], requires_grad=True) - - scores = self.forward(data_points) - - return self._calculate_loss(scores, data_points) - - # Assume data_points is a single sentence!!! 
- # scores are the predictions for each word pair - def _calculate_loss(self, scores, data_points): - - indices = [] - numberOfWords = len(data_points) - - # classification needs labels to be integers, regression needs labels to be float - # this is due to the different loss functions - if not self.regression: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.LongTensor([j - i - 1])) # distance between words - else: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.Tensor([j - i - 1])) # distance between words - - labels = torch.cat(indices, 0).to(flair.device) - - return self.loss_function(scores, labels) - - # only single sentences as input - def _forward_scores_and_loss( - self, data_points: Union[List[Sentence], Sentence], return_loss=False): - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - scores = self.forward(data_points) - - loss = None - if return_loss: - loss = self._calculate_loss(scores, data_points) - - return scores, loss - - def evaluate( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter - num_workers: int = 8, - ) -> (Result, float): - - if self.regression: - return self.evaluate_regression( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - return self.evaluate_classification( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - def evaluate_regression( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - with torch.no_grad(): - - buckets = [0 for _ in range(11)] - - eval_loss = 0 - - metric = MetricRegression("Evaluation") - - lines: List[str] = [] - - max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 - - num_occurences = [0 for _ in range(max_dist_plus_one)] - - cumulated_values = [0 for _ in range(max_dist_plus_one)] - - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - predictions = scores.tolist() - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_dist = j - i - 1 - pred = predictions[numberOfPairs] - - true_values_for_sentence.append(true_dist) - - # for output text file - eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" - lines.append(eval_line) - - # for buckets - error = abs(true_dist - pred) - if error >= 10: - buckets[10] += 1 - else: - buckets[floor(error)] += 1 - - # for average prediction - num_occurences[true_dist] += 1 - cumulated_values[true_dist] += pred - - numberOfPairs += 1 - - eval_loss += loss/numberOfPairs - - metric.true.extend(true_values_for_sentence) - metric.pred.extend(predictions) - - store_embeddings(sentence, embedding_storage_mode) - - eval_loss /= len(sentences) # w.r.t self.loss - - # add some statistics to the output - eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 
3-4 | 4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" - lines.append(eval_line) - eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0],buckets[1],buckets[2],buckets[3], - buckets[4],buckets[5],buckets[6],buckets[7], - buckets[8],buckets[9],buckets[10]) - lines.append(eval_line) - lines.append("\nAverage predicted values per distance:\n") - eval_line = "" - for i in range(max_dist_plus_one): - eval_line += str(i) + ": " + f"{cumulated_values[i]/num_occurences[i]:.2f}" + " " - if i!=0 and i%15==0: - eval_line += "\n" - - lines.append(eval_line) - - - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" - log_header = "MSE\tSPEARMAN\tPEARSON" - - detailed_result = ( - f"AVG: mse: {metric.mean_squared_error():.4f} - " - f"mae: {metric.mean_absolute_error():.4f} - " - f"pearson: {metric.pearsonr():.4f} - " - f"spearman: {metric.spearmanr():.4f}" - ) - - result: Result = Result( - metric.pearsonr(), log_header, log_line, detailed_result - ) - - - return result, eval_loss - - def evaluate_classification( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - # use scikit-learn to evaluate - y_true = [] - y_pred = [] - - with torch.no_grad(): - eval_loss = 0 - - lines: List[str] = [] - # we iterate over each sentence, instead of batches - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - # get single labels from scores - predictions = [self._get_single_label(s) for s in scores] - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_values_for_sentence.append(j - i - 1) - - # for output text file - eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) - lines.append(eval_line) - - numberOfPairs += 1 - - eval_loss += loss / numberOfPairs # add average loss of word pairs - - for prediction_for_sentence, true_value_for_sentence in zip( - predictions, true_values_for_sentence - ): - # hot one vector of true value - y_true_instance = np.zeros(self.max_distance + 1, dtype=int) - y_true_instance[true_value_for_sentence] = 1 - y_true.append(y_true_instance.tolist()) - - # hot one vector of predicted value - y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) - y_pred_instance[prediction_for_sentence] = 1 - y_pred.append(y_pred_instance.tolist()) - - # speichert embeddings, falls embedding_storage!= 'None' - store_embeddings(sentence, embedding_storage_mode) - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - # make "classification report" - target_names = [] # liste aller labels, ins unserem Fall - for i in range(self.max_distance + 1): - target_names.append(str(i)) - classification_report = metrics.classification_report(y_true, y_pred, digits=4, - target_names=target_names, zero_division=0) - - # get scores - micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), - 4) - accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) - macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), - 4) - # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) - # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) - - detailed_result = ( - "\nResults:" - f"\n- F-score (micro) {micro_f_score}" - f"\n- F-score (macro) {macro_f_score}" - f"\n- Accuracy {accuracy_score}" - '\n\nBy class:\n' + classification_report - ) - - # line for log file - log_header = "ACCURACY" - log_line = f"\t{accuracy_score}" - - result = Result( - main_score=micro_f_score, - log_line=log_line, - log_header=log_header, - detailed_results=detailed_result, - ) - - eval_loss /= len(sentences) - - return result, eval_loss - - @staticmethod - def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: - filtered_sentences = [sentence for sentence in sentences if sentence.tokens] - if len(sentences) != len(filtered_sentences): - log.warning( - "Ignore {} sentence(s) with no tokens.".format( - len(sentences) - len(filtered_sentences) - ) - ) - return filtered_sentences - - def _obtain_labels( - self, scores: List[List[float]], predict_prob: bool = False - ) -> List[List[Label]]: - """ - Predicts the labels of sentences. - :param scores: the prediction scores from the model - :return: list of predicted labels - """ - - if predict_prob: - return [self._predict_label_prob(s) for s in scores] - - return [self._get_single_label(s) for s in scores] - - def _get_single_label(self, label_scores): # -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - conf, idx = torch.max(softmax, 0) - - return idx.item() - - def _predict_label_prob(self, label_scores) -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - label_probs = [] - for idx, conf in enumerate(softmax): - label_probs.append(Label(idx, conf.item())) - return label_probs - - def __str__(self): - return super(flair.nn.Model, self).__str__().rstrip(')') + \ - f' (beta): {self.beta}\n' + \ - f' (loss_max_weight): {self.loss_max_weight}\n' + \ - f' (max_distance) {self.max_distance}\n)' - From 80a675b596bbc268ac383175e97a32fa5247e6e6 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 14/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 04f227e9fbc4a356fd8ad5d72fecf0091f5487a8 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 15/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3008 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1553 insertions(+), 1520 deletions(-) diff --git 
a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from .sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from .sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7dc950dba..02e0a5800 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -267,6 +267,56 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. 
+ http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -299,6 +349,36 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt" + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -449,21 +529,123 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
-class WNUT_2020_NER(ColumnCorpus): +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') + + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -482,65 +664,40 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
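+        Example (a minimal usage sketch, assuming the default arguments shown above):
+
+            from flair.datasets import CONLL_2000
+            corpus = CONLL_2000()
+            # make_tag_dictionary is the generic flair Corpus helper
+            chunk_dictionary = corpus.make_tag_dictionary(tag_type="np")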
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -551,45 +708,52 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) - + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil -class TWITTER_NER(ColumnCorpus): + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) + + +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -600,43 +764,61 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#" ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
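+        Example (a minimal usage sketch, assuming the defaults above; the two Europarl
+        sessions downloaded below serve as training and test files):
+
+            from flair.datasets import EUROPARL_NER_GERMAN
+            corpus = EUROPARL_NER_GERMAN()
+            print(corpus)  # prints the train/dev/test sentence counts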
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -647,125 +829,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll' ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -773,19 +855,18 @@ def __init__( in_memory: bool = True, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -795,41 +876,36 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -839,77 +915,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -918,112 +951,136 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll' + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
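+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import MIT_MOVIE_NER_SIMPLE
+            corpus = MIT_MOVIE_NER_SIMPLE()
+            # show the first training sentence with its NER tags
+            print(corpus.train[0].to_tagged_string())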
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
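+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import MIT_MOVIE_NER_COMPLEX
+            corpus = MIT_MOVIE_NER_COMPLEX()
+            print(len(corpus.train), "training sentences")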
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme' - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
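+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import MIT_RESTAURANT_NER
+            corpus = MIT_RESTAURANT_NER()
+            print(corpus.test[0])  # first sentence of the downloaded test split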
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1031,405 +1088,140 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, 
Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann' + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': 
'1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 
- 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': 
'1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': 
'1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
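+        Example (a minimal usage sketch, assuming the defaults above; downsample is the
+        generic flair Corpus helper and is optional):
+
+            from flair.datasets import NER_SWEDISH
+            corpus = NER_SWEDISH()
+            smaller_corpus = corpus.downsample(0.1)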
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1440,61 +1232,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, - in_memory=in_memory, comment_symbol="#" + super(NER_SWEDISH, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1505,44 +1271,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll' + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1552,24 +1309,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1591,35 +1341,33 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
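+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import TURKU_NER
+            corpus = TURKU_NER()
+            tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")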
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1635,18 +1383,29 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll' + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1655,15 +1414,14 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1671,7 +1429,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1682,32 +1440,41 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1718,44 +1485,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1766,48 +1534,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1818,35 +1583,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1856,29 +1631,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. 
+ :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1888,27 +1680,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1919,25 +1730,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1948,25 +1779,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,25 +1828,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the WEIBO_NER corpus . 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2006,192 +1879,449 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): + """ + WikiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreviations, e.g. "en" for English. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de", ...) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/, some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you don't want that, set it to None.
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' + # download data if necessary + first = True + for language in languages: -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not 
language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WIKIANN, self).__init__( + corpora, name='wikiann' ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 
'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': 
'1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': 
'18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': 
'1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2200,12 +2330,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2213,7 +2342,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2224,117 +2353,32 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class BIOSCOPE(ColumnCorpus): - - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "tag"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt" - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil - - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", - ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") -class UP_CHINESE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Chinese dataset from the Universal 
Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2345,92 +2389,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("en", dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ENGLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 10: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - ) -class UP_FRENCH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2441,44 +2418,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("de", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2489,44 +2447,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2537,44 +2476,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2585,44 +2505,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2633,44 +2534,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2681,127 +2563,83 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERComplex(ColumnCorpus): + +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2812,22 +2650,19 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(SEC_FILLINGS, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class TURKU_NER(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2836,12 +2671,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
:param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2860,23 +2694,201 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(TURKU_NER, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) \ No newline at end of file + ) + + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): + def __init__( + self, + languages: Union[str, List[str]] = None, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, + ): + """ + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. 
+ "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = "xtreme" + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + + # download data if necessary + for language in languages: + + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme' + ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + 
f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. 
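For illustration, loading one of these corpora should work like any other Flair column corpus. The snippet below is a minimal sketch, not part of the original tutorial, based on the UP_GERMAN loader exposed in this patch series; it assumes the corpus auto-downloads on first use and that frame annotations are available under the tag type 'frame', as suggested by the loader's column format:

```python
import flair.datasets

# auto-downloads the German Universal Propositions data on first call and loads it as a Corpus
corpus = flair.datasets.UP_GERMAN()
print(corpus)

# frame labels are read from column 9 of the CoNLL-U files, so the tag type to use is 'frame'
frame_dictionary = corpus.make_tag_dictionary('frame')
print(frame_dictionary)
```

The same pattern should apply to the other identifiers listed below.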
+ +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From b5db7ed62c00f618fa5e1ed520bfd979a5cd362a Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 16/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From 6dbef308d17ba4578f013441a8b315eb1f95e498 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 17/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. 
```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. -### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From 06ec50c1129963b3db575348ce088c50935ff42a Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 18/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From 1d91f254e8de01362e4f72d4b4308edb697f520f Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 19/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. 
Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From 4c274dc5416a395c4f0e4824b4cb0c7f78749529 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 10:52:55 +0100 Subject: [PATCH 20/35] GH-1983: move distance classifier to diagnostics module --- flair/models/__init__.py | 1 - flair/models/text_classification_model.py | 486 +--------------------- 2 files changed, 1 insertion(+), 486 deletions(-) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index 15f2a326b..ebb6827d3 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -2,4 +2,3 @@ from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier -from .text_classification_model import DistClassifier diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 00115d2aa..7e0dab976 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -7,7 +7,6 @@ from torch.utils.data.dataset import Dataset from tqdm import tqdm import numpy as np -from math import floor import sklearn.metrics as metrics from sklearn.metrics.pairwise import cosine_similarity @@ -17,12 +16,7 @@ from flair.data import Dictionary, Sentence, Label, DataPoint from flair.datasets import SentenceDataset, DataLoader from flair.file_utils import cached_path -from flair.training_utils import ( - MetricRegression, - convert_labels_to_one_hot, - Result, - store_embeddings, -) +from flair.training_utils import convert_labels_to_one_hot, Result, store_embeddings log = logging.getLogger("flair") @@ -947,481 +941,3 @@ def _fetch_model(model_name) -> str: model_name = cached_path(model_map[model_name], cache_dir=cache_dir) return model_name - - - -class DistClassifier(flair.nn.Model): - """ - DistClassifier - Model to predict distance between two words given their embeddings. Takes (contextual) word embedding as input. - The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. - Note: When used for training the batch size must be set to 1!!! - """ - - def __init__( - self, - word_embeddings: flair.embeddings.TokenEmbeddings, - max_distance: int = 20, - beta: float = 1.0, - loss_max_weight: float = 1, - regression = False, - regr_loss_step = 0 - ): - """ - Initializes a DistClassifier - :param word_embeddings: embeddings used to embed each sentence - .param max_distance: max dist between word pairs = number of predicted classes - 1 - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_max_weight: Only for classification: Since small distances between word pairs occur mor frequent it makes sense to give them less weight - in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >=1 - The other weights decrease with equidistant steps from high to low distance. - :param regression: if True the class does regression instead of classification - :param regr_loss_step: if > 0, the MSE-Loss in regression will be weighted. 
Word pairs with - distance 0 have weight 1. Then, as the distance increases, the weight in the loss function, - increases step by step with size regr_loss_step - """ - - super(DistClassifier, self).__init__() - - self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings - - self.beta = beta - - self.loss_max_weight = loss_max_weight - - self.regression = regression - - self.regr_loss_step = regr_loss_step - - if not regression: - self.max_distance = max_distance - - # weights for loss function - if self.loss_max_weight > 1: - step = (self.loss_max_weight - 1) / self.max_distance - - weight_list = [1. + i * step for i in range(self.max_distance + 1)] - - self.loss_weights = torch.FloatTensor(weight_list).to(flair.device) - - else: - self.loss_weights = None - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is max_distance + 1, i.e. we allow 0,1,...,max_distance words between pairs - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, self.max_distance + 1) - - self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights) - - # regression - else: - self.max_distance = float('inf') - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is 1 - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, 1) - - if regr_loss_step > 0: - self.loss_function = self.weighted_mse_loss - else: - self.loss_function = nn.MSELoss() - - nn.init.xavier_uniform_(self.decoder.weight) - - # auto-spawn on GPU if available - self.to(flair.device) - - - # all input should be tensors - def weighted_mse_loss(self,predictions, target): - - weight = 1 + self.regr_loss_step * target - - return (weight * ((predictions - target) ** 2)).mean() - - - # forward allows only a single sentcence!! 
- def forward(self, sentence: Sentence): - - # embed words of sentence - self.word_embeddings.embed(sentence) - - # go through all pairs of words with a maximum number of max_distance in between - numberOfWords = len(sentence) - text_embedding_list = [] - # go through all pairs - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) - - # 2-dim matrix whose rows are the embeddings of word pairs of the sentence - text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) - - label_scores = self.decoder(text_embedding_tensor) - - if self.regression: - return label_scores.squeeze(1) - - return label_scores - - def _get_state_dict(self): - model_state = { - "state_dict": self.state_dict(), - "word_embeddings": self.word_embeddings, - "max_distance": self.max_distance, - "beta": self.beta, - "loss_max_weight": self.loss_max_weight, - "regression": self.regression, - "regr_loss_step": self.regr_loss_step - } - return model_state - - @staticmethod - def _init_model_with_state_dict(state): - beta = 1.0 if "beta" not in state.keys() else state["beta"] - weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] - - model = DistClassifier( - word_embeddings=state["word_embeddings"], - max_distance=state["max_distance"], - beta=beta, - loss_max_weight=weight, - regression=state["regression"], - regr_loss_step=state["regr_loss_step"] - ) - - model.load_state_dict(state["state_dict"]) - return model - - # So far only one sentence allowed - # If list of sentences is handed the function works with the first sentence of the list - def forward_loss( - self, data_points: Union[List[Sentence], Sentence] - ) -> torch.tensor: - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - if len(data_points) < 2: - return torch.tensor([0.], requires_grad=True) - - scores = self.forward(data_points) - - return self._calculate_loss(scores, data_points) - - # Assume data_points is a single sentence!!! 
- # scores are the predictions for each word pair - def _calculate_loss(self, scores, data_points): - - indices = [] - numberOfWords = len(data_points) - - # classification needs labels to be integers, regression needs labels to be float - # this is due to the different loss functions - if not self.regression: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.LongTensor([j - i - 1])) # distance between words - else: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.Tensor([j - i - 1])) # distance between words - - labels = torch.cat(indices, 0).to(flair.device) - - return self.loss_function(scores, labels) - - # only single sentences as input - def _forward_scores_and_loss( - self, data_points: Union[List[Sentence], Sentence], return_loss=False): - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - scores = self.forward(data_points) - - loss = None - if return_loss: - loss = self._calculate_loss(scores, data_points) - - return scores, loss - - def evaluate( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter - num_workers: int = 8, - ) -> (Result, float): - - if self.regression: - return self.evaluate_regression( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - return self.evaluate_classification( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - def evaluate_regression( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - with torch.no_grad(): - - buckets = [0 for _ in range(11)] - - eval_loss = 0 - - metric = MetricRegression("Evaluation") - - lines: List[str] = [] - - max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 - - num_occurences = [0 for _ in range(max_dist_plus_one)] - - cumulated_values = [0 for _ in range(max_dist_plus_one)] - - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - predictions = scores.tolist() - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_dist = j - i - 1 - pred = predictions[numberOfPairs] - - true_values_for_sentence.append(true_dist) - - # for output text file - eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" - lines.append(eval_line) - - # for buckets - error = abs(true_dist - pred) - if error >= 10: - buckets[10] += 1 - else: - buckets[floor(error)] += 1 - - # for average prediction - num_occurences[true_dist] += 1 - cumulated_values[true_dist] += pred - - numberOfPairs += 1 - - eval_loss += loss/numberOfPairs - - metric.true.extend(true_values_for_sentence) - metric.pred.extend(predictions) - - store_embeddings(sentence, embedding_storage_mode) - - eval_loss /= len(sentences) # w.r.t self.loss - - # add some statistics to the output - eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 
3-4 | 4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" - lines.append(eval_line) - eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0],buckets[1],buckets[2],buckets[3], - buckets[4],buckets[5],buckets[6],buckets[7], - buckets[8],buckets[9],buckets[10]) - lines.append(eval_line) - lines.append("\nAverage predicted values per distance:\n") - eval_line = "" - for i in range(max_dist_plus_one): - eval_line += str(i) + ": " + f"{cumulated_values[i]/num_occurences[i]:.2f}" + " " - if i!=0 and i%15==0: - eval_line += "\n" - - lines.append(eval_line) - - - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" - log_header = "MSE\tSPEARMAN\tPEARSON" - - detailed_result = ( - f"AVG: mse: {metric.mean_squared_error():.4f} - " - f"mae: {metric.mean_absolute_error():.4f} - " - f"pearson: {metric.pearsonr():.4f} - " - f"spearman: {metric.spearmanr():.4f}" - ) - - result: Result = Result( - metric.pearsonr(), log_header, log_line, detailed_result - ) - - - return result, eval_loss - - def evaluate_classification( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - # use scikit-learn to evaluate - y_true = [] - y_pred = [] - - with torch.no_grad(): - eval_loss = 0 - - lines: List[str] = [] - # we iterate over each sentence, instead of batches - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - # get single labels from scores - predictions = [self._get_single_label(s) for s in scores] - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_values_for_sentence.append(j - i - 1) - - # for output text file - eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) - lines.append(eval_line) - - numberOfPairs += 1 - - eval_loss += loss / numberOfPairs # add average loss of word pairs - - for prediction_for_sentence, true_value_for_sentence in zip( - predictions, true_values_for_sentence - ): - # hot one vector of true value - y_true_instance = np.zeros(self.max_distance + 1, dtype=int) - y_true_instance[true_value_for_sentence] = 1 - y_true.append(y_true_instance.tolist()) - - # hot one vector of predicted value - y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) - y_pred_instance[prediction_for_sentence] = 1 - y_pred.append(y_pred_instance.tolist()) - - # speichert embeddings, falls embedding_storage!= 'None' - store_embeddings(sentence, embedding_storage_mode) - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - # make "classification report" - target_names = [] # liste aller labels, ins unserem Fall - for i in range(self.max_distance + 1): - target_names.append(str(i)) - classification_report = metrics.classification_report(y_true, y_pred, digits=4, - target_names=target_names, zero_division=0) - - # get scores - micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), - 4) - accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) - macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), - 4) - # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) - # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) - - detailed_result = ( - "\nResults:" - f"\n- F-score (micro) {micro_f_score}" - f"\n- F-score (macro) {macro_f_score}" - f"\n- Accuracy {accuracy_score}" - '\n\nBy class:\n' + classification_report - ) - - # line for log file - log_header = "ACCURACY" - log_line = f"\t{accuracy_score}" - - result = Result( - main_score=micro_f_score, - log_line=log_line, - log_header=log_header, - detailed_results=detailed_result, - ) - - eval_loss /= len(sentences) - - return result, eval_loss - - @staticmethod - def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: - filtered_sentences = [sentence for sentence in sentences if sentence.tokens] - if len(sentences) != len(filtered_sentences): - log.warning( - "Ignore {} sentence(s) with no tokens.".format( - len(sentences) - len(filtered_sentences) - ) - ) - return filtered_sentences - - def _obtain_labels( - self, scores: List[List[float]], predict_prob: bool = False - ) -> List[List[Label]]: - """ - Predicts the labels of sentences. - :param scores: the prediction scores from the model - :return: list of predicted labels - """ - - if predict_prob: - return [self._predict_label_prob(s) for s in scores] - - return [self._get_single_label(s) for s in scores] - - def _get_single_label(self, label_scores): # -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - conf, idx = torch.max(softmax, 0) - - return idx.item() - - def _predict_label_prob(self, label_scores) -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - label_probs = [] - for idx, conf in enumerate(softmax): - label_probs.append(Label(idx, conf.item())) - return label_probs - - def __str__(self): - return super(flair.nn.Model, self).__str__().rstrip(')') + \ - f' (beta): {self.beta}\n' + \ - f' (loss_max_weight): {self.loss_max_weight}\n' + \ - f' (max_distance) {self.max_distance}\n)' - From 3020313ccf7dcc594409c7a530f0ced138e6a608 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 11:15:44 +0100 Subject: [PATCH 21/35] GH-1983: move simple tagger to sandbox module --- flair/models/__init__.py | 1 - .../simple_sequence_tagger_model.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) rename flair/models/{ => sandbox}/simple_sequence_tagger_model.py (97%) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index ebb6827d3..784b038a9 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -1,4 +1,3 @@ from .sequence_tagger_model import SequenceTagger, MultiTagger -from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier diff --git a/flair/models/simple_sequence_tagger_model.py b/flair/models/sandbox/simple_sequence_tagger_model.py similarity index 97% rename from flair/models/simple_sequence_tagger_model.py rename to flair/models/sandbox/simple_sequence_tagger_model.py index 298d887e0..211744643 100644 --- a/flair/models/simple_sequence_tagger_model.py +++ b/flair/models/sandbox/simple_sequence_tagger_model.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import List, Union, Optional -import 
numpy as np import torch import torch.nn import torch.nn.functional as F @@ -18,19 +17,20 @@ log = logging.getLogger("flair") -""" -This class is a simple version of the SequenceTagger class. -The purpose of this class is to demonstrate the basic hierarchy of a -sequence tagger (this could be helpful for new developers). -It only uses the given embeddings and maps them with a linear layer to -the tag_dictionary dimension. -Thus, this class misses following functionalities from the SequenceTagger: -- CRF, -- RNN, -- Reprojection. -As a result, only poor results can be expected. -""" + class SimpleSequenceTagger(flair.nn.Model): + """ + This class is a simple version of the SequenceTagger class. + The purpose of this class is to demonstrate the basic hierarchy of a + sequence tagger (this could be helpful for new developers). + It only uses the given embeddings and maps them with a linear layer to + the tag_dictionary dimension. + Thus, this class misses following functionalities from the SequenceTagger: + - CRF, + - RNN, + - Reprojection. + As a result, only poor results can be expected. + """ def __init__( self, embeddings: TokenEmbeddings, From d1b09ab85fc8352a7f508c4074e1f1a552f9a0fa Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 22/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From b5d08ccb9df3c5249586fa8fb55c4b3ec982ea13 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 23/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3010 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1551 insertions(+), 1524 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from 
.sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from .sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 0da2a1fd5..f9ee3ce0f 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -32,7 +32,6 @@ def __init__( ): """ Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. - :param data_folder: base folder with the task data :param column_format: a map specifying the column format :param train_file: the name of the train file @@ -118,7 +117,6 @@ def __init__( ): """ Instantiates a column dataset (typically used for sequence labeling or word-level prediction). - :param path_to_column_file: path to the file with the column-formatted data :param column_name_map: a map specifying the column format :param tag_to_bioes: whether to convert to BIOES tagging scheme @@ -219,7 +217,7 @@ def _parse_token(self, line: str) -> Token: if len(fields) > column: if column != self.text_column and self.column_name_map[column] != self.SPACE_AFTER_KEY: task = self.column_name_map[column] # for example 'pos' - tag = fields[column] + tag = fields[column] if tag.count("-") >= 1: # tag with prefix, for example tag='B-OBJ' split_at_first_hyphen = tag.split("-", 1) tagging_format_prefix = split_at_first_hyphen[0] @@ -284,6 +282,58 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + **corpusargs, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. 
+ http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + **corpusargs, + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -317,6 +367,37 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + **corpusargs, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs, + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -473,22 +554,124 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
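+
+Example (editor's illustrative sketch, not part of the original patch; "my_corpus.conll"
+is a hypothetical file): with ner_column=2, an input line "Paris NNP LOC" is rewritten
+as "Paris NNP I-LOC", and "London NNP 0" becomes "London NNP O":
+
+    add_IOB_tags(data_file="my_corpus.conll", encoding="utf8", ner_column=2)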
+ +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') -class WNUT_2020_NER(ColumnCorpus): + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -507,67 +690,42 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -578,47 +736,53 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - **corpusargs, + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class TWITTER_NER(ColumnCorpus): +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, **corpusargs, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -629,45 +793,63 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#", **corpusargs, ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, **corpusargs, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -678,126 +860,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll', **corpusargs, ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -806,19 +888,18 @@ def __init__( **corpusargs, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -828,43 +909,38 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, **corpusargs, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, **corpusargs, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -874,78 +950,35 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, **corpusargs, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -954,113 +987,142 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll', + **corpusargs, + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
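+
+        Example usage (editor's illustrative sketch, not part of the original patch;
+        it relies only on the constructor defaults above and on the import added to
+        flair/datasets/__init__.py in this patch):
+
+            from flair.datasets import MIT_MOVIE_NER_SIMPLE
+            corpus = MIT_MOVIE_NER_SIMPLE()  # downloads engtrain.bio / engtest.bio on first call
+            print(corpus)                    # corpus statistics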
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
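+
+        Example usage (editor's illustrative sketch, not part of the original patch;
+        it relies only on the constructor defaults above and on the import added to
+        flair/datasets/__init__.py in this patch):
+
+            from flair.datasets import MIT_MOVIE_NER_COMPLEX
+            corpus = MIT_MOVIE_NER_COMPLEX()  # downloads trivia10k13train.bio / trivia10k13test.bio on first call
+            print(corpus.train[0])           # first tagged sentence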
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme', **corpusargs, - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. 
+ :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1068,394 +1130,123 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + **corpusargs, + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + 
'.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann', **corpusargs, + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs, ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': 
'1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': 
'18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', - 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': 
'1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': 
'1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1463,11 +1254,20 @@ def __init__( in_memory: bool = True, **corpusargs, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1478,66 +1278,37 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( + super(NER_SWEDISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, - comment_symbol="#", **corpusargs, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, **corpusargs, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1548,46 +1319,37 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll', + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True, **corpusargs, ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, **corpusargs, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1597,25 +1359,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, - **corpusargs, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1638,36 +1392,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
+ :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1683,19 +1435,30 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll', + document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1705,15 +1468,14 @@ def __init__( **corpusargs, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
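# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): the TurkuNER corpus
# defined above is read from tab-separated train/dev/test files. Assumes flair
# 0.7 with this PR applied.
from flair.datasets import TURKU_NER

corpus = TURKU_NER()                                    # downloads the three .tsv files on first call
print(corpus)
print(corpus.train[0].to_tagged_string("ner"))          # first training sentence with its NER tags
# ---------------------------------------------------------------------------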
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1721,7 +1483,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1732,34 +1494,43 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1770,45 +1541,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1819,49 +1592,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
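# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): the Universal
# Propositions corpora (UP_CHINESE above, UP_ENGLISH below, and the other UP_*
# classes) expose predicate frame labels under the "frame" tag type; only the
# CoNLL-U column holding the frame differs between treebanks. Assumes flair 0.7
# with this PR applied.
from flair.datasets import UP_CHINESE

corpus = UP_CHINESE()                   # auto-downloads the zh-up-*.conllu files
frame_dictionary = corpus.make_tag_dictionary(tag_type="frame")
print(corpus)
print(frame_dictionary)
# ---------------------------------------------------------------------------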
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs, + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1872,37 +1643,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", **corpusargs, ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1912,30 +1693,48 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that 
webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1945,28 +1744,48 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,26 +1796,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2007,26 +1847,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2037,26 +1898,49 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2067,198 +1951,450 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + **corpusargs, ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, - **corpusargs, ): + """ + WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. 
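# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): WIKIANN builds one
# MultiCorpus over several languages. The Google Drive download requires the
# gdown package, and the dumps can be large, so the default in_memory=False is
# usually the safer choice. Assumes flair 0.7 with this PR applied.
from flair.datasets import WIKIANN

corpus = WIKIANN(languages=["en", "de"])    # downloads and preprocesses one dataset per language
print(corpus)                               # aggregated sentence counts of the MultiCorpus
# ---------------------------------------------------------------------------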
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) - - -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) + first = True + for language in languages: + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - 
base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(WIKIANN, self).__init__( + corpora, name='wikiann', ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': 
'1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': 
'1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': 
'1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': 
'1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2268,12 +2404,11 @@ def __init__( **corpusargs, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. 
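# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): wikigold ships as a
# single annotated file that is used as the training split; dev/test portions
# are then sampled from it, which is the usual flair behaviour when only a
# train file is given (an assumption worth verifying against the Corpus base
# class). Assumes flair 0.7 with this PR applied.
from flair.datasets import WIKIGOLD_NER

corpus = WIKIGOLD_NER()                 # downloads wikigold.conll.txt on first call
print(corpus)
# ---------------------------------------------------------------------------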
- :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2281,7 +2416,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2292,38 +2427,34 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class BIOSCOPE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "tag"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2334,128 +2465,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs, - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil + _download_wikiner("en", dataset_name) - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / 
f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") - -class UP_CHINESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - **corpusargs, - ): - """ - Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 9: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, - ) -class UP_ENGLISH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 10: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2466,46 +2495,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + _download_wikiner("de", dataset_name) + + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_FRENCH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2516,46 +2525,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2566,46 +2555,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2616,46 +2585,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2666,46 +2615,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2716,46 +2645,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2766,73 +2675,44 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, **corpusargs, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - **corpusargs, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class MITMovieNERComplex(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2840,59 +2720,56 @@ def __init__( in_memory: bool = True, **corpusargs, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - **corpusargs, + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): - + """ + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2903,41 +2780,125 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(SEC_FILLINGS, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class TURKU_NER(ColumnCorpus): + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): def __init__( self, + languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, **corpusargs, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. 
You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. + "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) @@ -2945,32 +2906,77 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "xtreme" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + for language in languages: - super(TURKU_NER, self).__init__( - data_folder, - columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - in_memory=in_memory, - 
document_separator_token=None if not document_as_sequence else "-DOCSTART-", - **corpusargs, + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme', ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. 
The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). | -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. 
The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. + +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From a75f13a8f5438a7d65d11faaa656f33a9295ab41 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 24/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy 
to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From 514ca76e062eb264f237a13df3c9e60a250b3ee6 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 25/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. ```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. 
-### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From 84f2f2f4ba10688876514c939550d17d3f1e3cc1 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 26/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From f3eab501a956b70abd0b1ba84a6608c3880aa43d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 27/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From 0e12b0a90f28ef0efe0b23a67a9567056a7c5e2b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 10:52:55 +0100 Subject: [PATCH 28/35] GH-1983: move distance classifier to diagnostics module --- flair/models/__init__.py | 1 - flair/models/text_classification_model.py | 486 +--------------------- 2 files changed, 1 insertion(+), 486 deletions(-) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index 15f2a326b..ebb6827d3 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -2,4 +2,3 @@ from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier -from .text_classification_model import DistClassifier diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 00115d2aa..7e0dab976 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -7,7 +7,6 @@ from torch.utils.data.dataset import Dataset from tqdm import tqdm import numpy as np -from math import floor import sklearn.metrics as metrics from sklearn.metrics.pairwise import cosine_similarity @@ -17,12 +16,7 @@ from flair.data import Dictionary, Sentence, Label, DataPoint from flair.datasets import SentenceDataset, DataLoader from flair.file_utils import cached_path -from 
flair.training_utils import ( - MetricRegression, - convert_labels_to_one_hot, - Result, - store_embeddings, -) +from flair.training_utils import convert_labels_to_one_hot, Result, store_embeddings log = logging.getLogger("flair") @@ -947,481 +941,3 @@ def _fetch_model(model_name) -> str: model_name = cached_path(model_map[model_name], cache_dir=cache_dir) return model_name - - - -class DistClassifier(flair.nn.Model): - """ - DistClassifier - Model to predict distance between two words given their embeddings. Takes (contextual) word embedding as input. - The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. - Note: When used for training the batch size must be set to 1!!! - """ - - def __init__( - self, - word_embeddings: flair.embeddings.TokenEmbeddings, - max_distance: int = 20, - beta: float = 1.0, - loss_max_weight: float = 1, - regression = False, - regr_loss_step = 0 - ): - """ - Initializes a DistClassifier - :param word_embeddings: embeddings used to embed each sentence - .param max_distance: max dist between word pairs = number of predicted classes - 1 - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_max_weight: Only for classification: Since small distances between word pairs occur mor frequent it makes sense to give them less weight - in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >=1 - The other weights decrease with equidistant steps from high to low distance. - :param regression: if True the class does regression instead of classification - :param regr_loss_step: if > 0, the MSE-Loss in regression will be weighted. Word pairs with - distance 0 have weight 1. Then, as the distance increases, the weight in the loss function, - increases step by step with size regr_loss_step - """ - - super(DistClassifier, self).__init__() - - self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings - - self.beta = beta - - self.loss_max_weight = loss_max_weight - - self.regression = regression - - self.regr_loss_step = regr_loss_step - - if not regression: - self.max_distance = max_distance - - # weights for loss function - if self.loss_max_weight > 1: - step = (self.loss_max_weight - 1) / self.max_distance - - weight_list = [1. + i * step for i in range(self.max_distance + 1)] - - self.loss_weights = torch.FloatTensor(weight_list).to(flair.device) - - else: - self.loss_weights = None - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is max_distance + 1, i.e. 
we allow 0,1,...,max_distance words between pairs - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, self.max_distance + 1) - - self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights) - - # regression - else: - self.max_distance = float('inf') - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is 1 - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, 1) - - if regr_loss_step > 0: - self.loss_function = self.weighted_mse_loss - else: - self.loss_function = nn.MSELoss() - - nn.init.xavier_uniform_(self.decoder.weight) - - # auto-spawn on GPU if available - self.to(flair.device) - - - # all input should be tensors - def weighted_mse_loss(self,predictions, target): - - weight = 1 + self.regr_loss_step * target - - return (weight * ((predictions - target) ** 2)).mean() - - - # forward allows only a single sentcence!! - def forward(self, sentence: Sentence): - - # embed words of sentence - self.word_embeddings.embed(sentence) - - # go through all pairs of words with a maximum number of max_distance in between - numberOfWords = len(sentence) - text_embedding_list = [] - # go through all pairs - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) - - # 2-dim matrix whose rows are the embeddings of word pairs of the sentence - text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) - - label_scores = self.decoder(text_embedding_tensor) - - if self.regression: - return label_scores.squeeze(1) - - return label_scores - - def _get_state_dict(self): - model_state = { - "state_dict": self.state_dict(), - "word_embeddings": self.word_embeddings, - "max_distance": self.max_distance, - "beta": self.beta, - "loss_max_weight": self.loss_max_weight, - "regression": self.regression, - "regr_loss_step": self.regr_loss_step - } - return model_state - - @staticmethod - def _init_model_with_state_dict(state): - beta = 1.0 if "beta" not in state.keys() else state["beta"] - weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] - - model = DistClassifier( - word_embeddings=state["word_embeddings"], - max_distance=state["max_distance"], - beta=beta, - loss_max_weight=weight, - regression=state["regression"], - regr_loss_step=state["regr_loss_step"] - ) - - model.load_state_dict(state["state_dict"]) - return model - - # So far only one sentence allowed - # If list of sentences is handed the function works with the first sentence of the list - def forward_loss( - self, data_points: Union[List[Sentence], Sentence] - ) -> torch.tensor: - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - if len(data_points) < 2: - return torch.tensor([0.], requires_grad=True) - - scores = self.forward(data_points) - - return self._calculate_loss(scores, data_points) - - # Assume data_points is a single sentence!!! 
- # scores are the predictions for each word pair - def _calculate_loss(self, scores, data_points): - - indices = [] - numberOfWords = len(data_points) - - # classification needs labels to be integers, regression needs labels to be float - # this is due to the different loss functions - if not self.regression: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.LongTensor([j - i - 1])) # distance between words - else: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.Tensor([j - i - 1])) # distance between words - - labels = torch.cat(indices, 0).to(flair.device) - - return self.loss_function(scores, labels) - - # only single sentences as input - def _forward_scores_and_loss( - self, data_points: Union[List[Sentence], Sentence], return_loss=False): - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - scores = self.forward(data_points) - - loss = None - if return_loss: - loss = self._calculate_loss(scores, data_points) - - return scores, loss - - def evaluate( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter - num_workers: int = 8, - ) -> (Result, float): - - if self.regression: - return self.evaluate_regression( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - return self.evaluate_classification( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - def evaluate_regression( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - with torch.no_grad(): - - buckets = [0 for _ in range(11)] - - eval_loss = 0 - - metric = MetricRegression("Evaluation") - - lines: List[str] = [] - - max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 - - num_occurences = [0 for _ in range(max_dist_plus_one)] - - cumulated_values = [0 for _ in range(max_dist_plus_one)] - - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - predictions = scores.tolist() - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_dist = j - i - 1 - pred = predictions[numberOfPairs] - - true_values_for_sentence.append(true_dist) - - # for output text file - eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" - lines.append(eval_line) - - # for buckets - error = abs(true_dist - pred) - if error >= 10: - buckets[10] += 1 - else: - buckets[floor(error)] += 1 - - # for average prediction - num_occurences[true_dist] += 1 - cumulated_values[true_dist] += pred - - numberOfPairs += 1 - - eval_loss += loss/numberOfPairs - - metric.true.extend(true_values_for_sentence) - metric.pred.extend(predictions) - - store_embeddings(sentence, embedding_storage_mode) - - eval_loss /= len(sentences) # w.r.t self.loss - - # add some statistics to the output - eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 
3-4 | 4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" - lines.append(eval_line) - eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0],buckets[1],buckets[2],buckets[3], - buckets[4],buckets[5],buckets[6],buckets[7], - buckets[8],buckets[9],buckets[10]) - lines.append(eval_line) - lines.append("\nAverage predicted values per distance:\n") - eval_line = "" - for i in range(max_dist_plus_one): - eval_line += str(i) + ": " + f"{cumulated_values[i]/num_occurences[i]:.2f}" + " " - if i!=0 and i%15==0: - eval_line += "\n" - - lines.append(eval_line) - - - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" - log_header = "MSE\tSPEARMAN\tPEARSON" - - detailed_result = ( - f"AVG: mse: {metric.mean_squared_error():.4f} - " - f"mae: {metric.mean_absolute_error():.4f} - " - f"pearson: {metric.pearsonr():.4f} - " - f"spearman: {metric.spearmanr():.4f}" - ) - - result: Result = Result( - metric.pearsonr(), log_header, log_line, detailed_result - ) - - - return result, eval_loss - - def evaluate_classification( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - # use scikit-learn to evaluate - y_true = [] - y_pred = [] - - with torch.no_grad(): - eval_loss = 0 - - lines: List[str] = [] - # we iterate over each sentence, instead of batches - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - # get single labels from scores - predictions = [self._get_single_label(s) for s in scores] - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_values_for_sentence.append(j - i - 1) - - # for output text file - eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) - lines.append(eval_line) - - numberOfPairs += 1 - - eval_loss += loss / numberOfPairs # add average loss of word pairs - - for prediction_for_sentence, true_value_for_sentence in zip( - predictions, true_values_for_sentence - ): - # hot one vector of true value - y_true_instance = np.zeros(self.max_distance + 1, dtype=int) - y_true_instance[true_value_for_sentence] = 1 - y_true.append(y_true_instance.tolist()) - - # hot one vector of predicted value - y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) - y_pred_instance[prediction_for_sentence] = 1 - y_pred.append(y_pred_instance.tolist()) - - # speichert embeddings, falls embedding_storage!= 'None' - store_embeddings(sentence, embedding_storage_mode) - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - # make "classification report" - target_names = [] # liste aller labels, ins unserem Fall - for i in range(self.max_distance + 1): - target_names.append(str(i)) - classification_report = metrics.classification_report(y_true, y_pred, digits=4, - target_names=target_names, zero_division=0) - - # get scores - micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), - 4) - accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) - macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), - 4) - # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) - # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) - - detailed_result = ( - "\nResults:" - f"\n- F-score (micro) {micro_f_score}" - f"\n- F-score (macro) {macro_f_score}" - f"\n- Accuracy {accuracy_score}" - '\n\nBy class:\n' + classification_report - ) - - # line for log file - log_header = "ACCURACY" - log_line = f"\t{accuracy_score}" - - result = Result( - main_score=micro_f_score, - log_line=log_line, - log_header=log_header, - detailed_results=detailed_result, - ) - - eval_loss /= len(sentences) - - return result, eval_loss - - @staticmethod - def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: - filtered_sentences = [sentence for sentence in sentences if sentence.tokens] - if len(sentences) != len(filtered_sentences): - log.warning( - "Ignore {} sentence(s) with no tokens.".format( - len(sentences) - len(filtered_sentences) - ) - ) - return filtered_sentences - - def _obtain_labels( - self, scores: List[List[float]], predict_prob: bool = False - ) -> List[List[Label]]: - """ - Predicts the labels of sentences. - :param scores: the prediction scores from the model - :return: list of predicted labels - """ - - if predict_prob: - return [self._predict_label_prob(s) for s in scores] - - return [self._get_single_label(s) for s in scores] - - def _get_single_label(self, label_scores): # -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - conf, idx = torch.max(softmax, 0) - - return idx.item() - - def _predict_label_prob(self, label_scores) -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - label_probs = [] - for idx, conf in enumerate(softmax): - label_probs.append(Label(idx, conf.item())) - return label_probs - - def __str__(self): - return super(flair.nn.Model, self).__str__().rstrip(')') + \ - f' (beta): {self.beta}\n' + \ - f' (loss_max_weight): {self.loss_max_weight}\n' + \ - f' (max_distance) {self.max_distance}\n)' - From da01d603731d1ae96b8900b40ae3c312f7a46cc6 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 11:15:44 +0100 Subject: [PATCH 29/35] GH-1983: move simple tagger to sandbox module --- flair/models/__init__.py | 1 - .../simple_sequence_tagger_model.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) rename flair/models/{ => sandbox}/simple_sequence_tagger_model.py (97%) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index ebb6827d3..784b038a9 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -1,4 +1,3 @@ from .sequence_tagger_model import SequenceTagger, MultiTagger -from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier diff --git a/flair/models/simple_sequence_tagger_model.py b/flair/models/sandbox/simple_sequence_tagger_model.py similarity index 97% rename from flair/models/simple_sequence_tagger_model.py rename to flair/models/sandbox/simple_sequence_tagger_model.py index 298d887e0..211744643 100644 --- a/flair/models/simple_sequence_tagger_model.py +++ b/flair/models/sandbox/simple_sequence_tagger_model.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import List, Union, Optional -import 
numpy as np import torch import torch.nn import torch.nn.functional as F @@ -18,19 +17,20 @@ log = logging.getLogger("flair") -""" -This class is a simple version of the SequenceTagger class. -The purpose of this class is to demonstrate the basic hierarchy of a -sequence tagger (this could be helpful for new developers). -It only uses the given embeddings and maps them with a linear layer to -the tag_dictionary dimension. -Thus, this class misses following functionalities from the SequenceTagger: -- CRF, -- RNN, -- Reprojection. -As a result, only poor results can be expected. -""" + class SimpleSequenceTagger(flair.nn.Model): + """ + This class is a simple version of the SequenceTagger class. + The purpose of this class is to demonstrate the basic hierarchy of a + sequence tagger (this could be helpful for new developers). + It only uses the given embeddings and maps them with a linear layer to + the tag_dictionary dimension. + Thus, this class misses following functionalities from the SequenceTagger: + - CRF, + - RNN, + - Reprojection. + As a result, only poor results can be expected. + """ def __init__( self, embeddings: TokenEmbeddings, From 49ce54b75ea11e9ef0b1153d239a8693ec42d487 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Tue, 1 Dec 2020 12:38:59 +0100 Subject: [PATCH 30/35] Remove travis tag --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index d82f2155d..f145a7196 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![GitHub Issues](https://img.shields.io/github/issues/flairNLP/flair.svg)](https://github.com/flairNLP/flair/issues) [![Contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg)](CONTRIBUTING.md) [![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT) -[![Travis](https://img.shields.io/travis/flairNLP/flair.svg)](https://travis-ci.org/flairNLP/flair) A very simple framework for **state-of-the-art NLP**. Developed by [Humboldt University of Berlin](https://www.informatik.hu-berlin.de/en/forschung-en/gebiete/ml-en/) and friends. From 8f748da712e7aadb0aa985c12348ee73eac2777b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 14:31:01 +0100 Subject: [PATCH 31/35] GH-1983: update tutorial --- resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md index 371aabe31..0066acdb8 100644 --- a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md +++ b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md @@ -246,11 +246,20 @@ This gives you a multilingual model. Try experimenting with more languages! ## Plotting Training Curves and Weights Flair includes a helper method to plot training curves and weights in the neural network. -The `ModelTrainer` automatically generates a `loss.tsv` and a `weights.txt` file in the result folder. +The `ModelTrainer` automatically generates a `loss.tsv` in the result folder. If you set +`write_weights=True` during training, it will also generate a `weights.txt` file. After training, simple point the plotter to these files: ```python +# set write_weights to True to write weights +trainer.train('resources/taggers/example-universal-pos', + ... + write_weights=True, + ... 
+ ) + +# visualize from flair.visual.training_curves import Plotter plotter = Plotter() plotter.plot_training_curves('loss.tsv') From 08463b3ba8dcf77595358c776199bd25e0d515dd Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:15:46 +0100 Subject: [PATCH 32/35] GH-1983: update tutorial --- flair/models/text_classification_model.py | 3 +-- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 +- resources/docs/TUTORIAL_2_TAGGING.md | 8 +++++++- resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md | 7 ++++++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 7e0dab976..368831475 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -883,8 +883,7 @@ def predict_zero_shot(self, Method to make zero shot predictions from the TARS model :param sentences: input sentence objects to classify :param candidate_label_set: set of candidate labels - :param multi_label: indicates whether multi-label or single class prediction. - Defaults to False + :param multi_label: indicates whether multi-label or single class prediction. Defaults to True. """ # check if candidate_label_set is empty diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 16f19b7ce..8df6e0a85 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -123,7 +123,7 @@ Done! Let's load the newly trained model and see if it does better: tars = TARSClassifier.load('resources/taggers/food_drink/final-model.pt') # 2. Prepare a test sentence -sentence = Sentence("I am so glad you like coffee") +sentence = Sentence("I am so glad you like burritos") # 3. Predict for food and drink tars.predict(sentence) diff --git a/resources/docs/TUTORIAL_2_TAGGING.md b/resources/docs/TUTORIAL_2_TAGGING.md index f0b1cde82..6b8c7986b 100644 --- a/resources/docs/TUTORIAL_2_TAGGING.md +++ b/resources/docs/TUTORIAL_2_TAGGING.md @@ -350,8 +350,14 @@ are provided: | 'communicative-functions' | English | detecting function of sentence in research paper (BETA) | scholarly papers | | | 'de-offensive-language' | German | detecting offensive language | [GermEval 2018 Task 1](https://projects.fzai.h-da.de/iggsa/projekt/) | **75.71** (Macro F1) | +## Tagging new classes without training data + +In case you need to label classes that are not included you can also try +our pre-trained zero-shot classifier TARS +(skip ahead to the [zero-shot tutorial](/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md)). +TARS can perform text classification for arbitrary classes. ## Next Now, let us look at how to use different [word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md) to embed your -text. +text. diff --git a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md index 0066acdb8..ea663c512 100644 --- a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md +++ b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md @@ -364,4 +364,9 @@ However, if the dataset fits into CUDA memory, this option is the fastest one. ## Next -You can now look into [training your own embeddings](/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md). +If you don't have training data (or only very little), our TARS approach might be best for you. 
+Check out the TARS tutorial on [few-shot and zero-shot classification](/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md)). + +Alternatively, you can + look into [training your own embeddings](/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md). + From b9df3a7606532dcbedd29875316dd176c23b2dce Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:17:28 +0100 Subject: [PATCH 33/35] GH-1983: update tutorial --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 8df6e0a85..39bbd194a 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -70,6 +70,7 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.data import Corpus from flair.datasets import SentenceDataset # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") From 8dc970bd2fd75e5ecc1836c5f728cfb8afd0233b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:22:55 +0100 Subject: [PATCH 34/35] GH-1983: update tutorial --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 39bbd194a..e05bf5185 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -99,6 +99,8 @@ whether a sentence mentions food or drink. Now, let's take the Corpus we created and do few-shot learning with our pre-trained TARS: ```python +from flair.trainers import ModelTrainer + # 1. load base TARS tars = TARSClassifier.load('tars-base') From 61d89c9128ec1c21417d2878d306b95956cd4979 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:34:38 +0100 Subject: [PATCH 35/35] GH-1983: add distance diagnostic predicor --- .../diagnosis/distance_prediction_model.py | 493 ++++++++++++++++++ 1 file changed, 493 insertions(+) create mode 100644 flair/models/diagnosis/distance_prediction_model.py diff --git a/flair/models/diagnosis/distance_prediction_model.py b/flair/models/diagnosis/distance_prediction_model.py new file mode 100644 index 000000000..0a0cba866 --- /dev/null +++ b/flair/models/diagnosis/distance_prediction_model.py @@ -0,0 +1,493 @@ +import logging +from pathlib import Path +from typing import List, Union + +import torch +import torch.nn as nn +from torch.utils.data.dataset import Dataset +import numpy as np +from math import floor + +import sklearn.metrics as metrics +import flair.nn +import flair.embeddings +from flair.data import Sentence, Label, DataPoint +from flair.training_utils import MetricRegression, Result, store_embeddings + +log = logging.getLogger("flair") + + +class DistancePredictor(flair.nn.Model): + """ + DistancePredictor + Model to predict distance between two words given their embeddings, modeled either as a classification or a + regression model. Takes (contextual) word embedding as input. + The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. + Note: When used for training the batch size must be set to 1!!! 
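+
+    A minimal construction sketch (illustrative only; any TokenEmbeddings instance could be
+    used in place of the classic word embeddings shown here):
+
+        from flair.embeddings import WordEmbeddings
+        predictor = DistancePredictor(word_embeddings=WordEmbeddings('glove'), max_distance=20)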
+    """
+
+    def __init__(
+        self,
+        word_embeddings: flair.embeddings.TokenEmbeddings,
+        max_distance: int = 20,
+        beta: float = 1.0,
+        loss_max_weight: float = 1,
+        regression=False,
+        regr_loss_step=0
+    ):
+        """
+        Initializes a DistancePredictor
+        :param word_embeddings: embeddings used to embed each sentence
+        :param max_distance: max dist between word pairs = number of predicted classes - 1
+        :param beta: Parameter for F-beta score for evaluation and training annealing
+        :param loss_max_weight: Only for classification: since small distances between word pairs occur more frequently, it makes sense to give them less weight
+        in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >= 1.
+        The other weights decrease with equidistant steps from high to low distance.
+        :param regression: if True the class does regression instead of classification
+        :param regr_loss_step: if > 0, the MSE loss in regression will be weighted. Word pairs with
+        distance 0 have weight 1. Then, as the distance increases, the weight in the loss function
+        increases step by step with step size regr_loss_step
+        """
+
+        super(DistancePredictor, self).__init__()
+
+        self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings
+
+        self.beta = beta
+
+        self.loss_max_weight = loss_max_weight
+
+        self.regression = regression
+
+        self.regr_loss_step = regr_loss_step
+
+        if not regression:
+            self.max_distance = max_distance
+
+            # weights for loss function
+            if self.loss_max_weight > 1:
+                step = (self.loss_max_weight - 1) / self.max_distance
+
+                weight_list = [1. + i * step for i in range(self.max_distance + 1)]
+
+                self.loss_weights = torch.FloatTensor(weight_list).to(flair.device)
+
+            else:
+                self.loss_weights = None
+
+            # input size is two times the word embedding size since we use a pair of words as input
+            # the output size is max_distance + 1, i.e. we allow 0,1,...,max_distance words between pairs
+            self.decoder = nn.Linear(
+                self.word_embeddings.embedding_length * 2, self.max_distance + 1)
+
+            self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights)
+
+        # regression
+        else:
+            self.max_distance = float('inf')
+
+            # input size is two times the word embedding size since we use a pair of words as input
+            # the output size is 1
+            self.decoder = nn.Linear(
+                self.word_embeddings.embedding_length * 2, 1)
+
+            if regr_loss_step > 0:
+                self.loss_function = self.weighted_mse_loss
+            else:
+                self.loss_function = nn.MSELoss()
+
+        nn.init.xavier_uniform_(self.decoder.weight)
+
+        # auto-spawn on GPU if available
+        self.to(flair.device)
+
+    # all inputs should be tensors
+    def weighted_mse_loss(self, predictions, target):
+
+        weight = 1 + self.regr_loss_step * target
+
+        return (weight * ((predictions - target) ** 2)).mean()
+
+    # forward allows only a single sentence!!
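+    # It embeds the sentence, concatenates the embeddings of every word pair with at most
+    # max_distance words between its members, and scores all pairs with the linear decoder.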
+ def forward(self, sentence: Sentence): + + # embed words of sentence + self.word_embeddings.embed(sentence) + + # go through all pairs of words with a maximum number of max_distance in between + numberOfWords = len(sentence) + text_embedding_list = [] + # go through all pairs + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) + + # 2-dim matrix whose rows are the embeddings of word pairs of the sentence + text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) + + label_scores = self.decoder(text_embedding_tensor) + + if self.regression: + return label_scores.squeeze(1) + + return label_scores + + def _get_state_dict(self): + model_state = { + "state_dict": self.state_dict(), + "word_embeddings": self.word_embeddings, + "max_distance": self.max_distance, + "beta": self.beta, + "loss_max_weight": self.loss_max_weight, + "regression": self.regression, + "regr_loss_step": self.regr_loss_step + } + return model_state + + @staticmethod + def _init_model_with_state_dict(state): + beta = 1.0 if "beta" not in state.keys() else state["beta"] + weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] + + model = DistancePredictor( + word_embeddings=state["word_embeddings"], + max_distance=state["max_distance"], + beta=beta, + loss_max_weight=weight, + regression=state["regression"], + regr_loss_step=state["regr_loss_step"] + ) + + model.load_state_dict(state["state_dict"]) + return model + + # So far only one sentence allowed + # If list of sentences is handed the function works with the first sentence of the list + def forward_loss( + self, data_points: Union[List[Sentence], Sentence] + ) -> torch.tensor: + + if isinstance(data_points, list): # first sentence + data_points = data_points[0] + + if len(data_points) < 2: + return torch.tensor([0.], requires_grad=True) + + scores = self.forward(data_points) + + return self._calculate_loss(scores, data_points) + + # Assume data_points is a single sentence!!! 
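+    # the gold label for a word pair (i, j) is the number of words between them, i.e. j - i - 1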
+ # scores are the predictions for each word pair + def _calculate_loss(self, scores, data_points): + + indices = [] + numberOfWords = len(data_points) + + # classification needs labels to be integers, regression needs labels to be float + # this is due to the different loss functions + if not self.regression: + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + indices.append(torch.LongTensor([j - i - 1])) # distance between words + else: + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + indices.append(torch.Tensor([j - i - 1])) # distance between words + + labels = torch.cat(indices, 0).to(flair.device) + + return self.loss_function(scores, labels) + + # only single sentences as input + def _forward_scores_and_loss( + self, data_points: Union[List[Sentence], Sentence], return_loss=False): + + if isinstance(data_points, list): # first sentence + data_points = data_points[0] + + scores = self.forward(data_points) + + loss = None + if return_loss: + loss = self._calculate_loss(scores, data_points) + + return scores, loss + + def evaluate( + self, + sentences: Union[List[DataPoint], Dataset], + out_path: Union[str, Path] = None, + embedding_storage_mode: str = "none", + mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter + num_workers: int = 8, + ) -> (Result, float): + + if self.regression: + return self.evaluate_regression( + sentences=sentences, + out_path=out_path, + embedding_storage_mode=embedding_storage_mode, + ) + + return self.evaluate_classification( + sentences=sentences, + out_path=out_path, + embedding_storage_mode=embedding_storage_mode, + ) + + def evaluate_regression( + self, + sentences: Union[List[DataPoint], Dataset], + out_path: Union[str, Path] = None, + embedding_storage_mode: str = "none", + ) -> (Result, float): + + with torch.no_grad(): + + buckets = [0 for _ in range(11)] + + eval_loss = 0 + + metric = MetricRegression("Evaluation") + + lines: List[str] = [] + + max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 + + num_occurences = [0 for _ in range(max_dist_plus_one)] + + cumulated_values = [0 for _ in range(max_dist_plus_one)] + + for sentence in sentences: + + if len(sentence) < 2: # we need at least 2 words per sentence + continue + + scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) + + predictions = scores.tolist() + + # gold labels + true_values_for_sentence = [] + numberOfPairs = 0 + numberOfWords = len(sentence) + lines.append(sentence.to_tokenized_string() + '\n') + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + true_dist = j - i - 1 + pred = predictions[numberOfPairs] + + true_values_for_sentence.append(true_dist) + + # for output text file + eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" + lines.append(eval_line) + + # for buckets + error = abs(true_dist - pred) + if error >= 10: + buckets[10] += 1 + else: + buckets[floor(error)] += 1 + + # for average prediction + num_occurences[true_dist] += 1 + cumulated_values[true_dist] += pred + + numberOfPairs += 1 + + eval_loss += loss / numberOfPairs + + metric.true.extend(true_values_for_sentence) + metric.pred.extend(predictions) + + store_embeddings(sentence, embedding_storage_mode) + + eval_loss /= len(sentences) # w.r.t self.loss + + # add some statistics to the output + eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 3-4 | 
4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" + lines.append(eval_line) + eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0], buckets[1], + buckets[2], buckets[3], + buckets[4], buckets[5], + buckets[6], buckets[7], + buckets[8], buckets[9], + buckets[10]) + lines.append(eval_line) + lines.append("\nAverage predicted values per distance:\n") + eval_line = "" + for i in range(max_dist_plus_one): + eval_line += str(i) + ": " + f"{cumulated_values[i] / num_occurences[i]:.2f}" + " " + if i != 0 and i % 15 == 0: + eval_line += "\n" + + lines.append(eval_line) + + if out_path is not None: + with open(out_path, "w", encoding="utf-8") as outfile: + outfile.write("".join(lines)) + + log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" + log_header = "MSE\tSPEARMAN\tPEARSON" + + detailed_result = ( + f"AVG: mse: {metric.mean_squared_error():.4f} - " + f"mae: {metric.mean_absolute_error():.4f} - " + f"pearson: {metric.pearsonr():.4f} - " + f"spearman: {metric.spearmanr():.4f}" + ) + + result: Result = Result( + metric.pearsonr(), log_header, log_line, detailed_result + ) + + return result, eval_loss + + def evaluate_classification( + self, + sentences: Union[List[DataPoint], Dataset], + out_path: Union[str, Path] = None, + embedding_storage_mode: str = "none", + ) -> (Result, float): + + # use scikit-learn to evaluate + y_true = [] + y_pred = [] + + with torch.no_grad(): + eval_loss = 0 + + lines: List[str] = [] + # we iterate over each sentence, instead of batches + for sentence in sentences: + + if len(sentence) < 2: # we need at least 2 words per sentence + continue + + scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) + + # get single labels from scores + predictions = [self._get_single_label(s) for s in scores] + + # gold labels + true_values_for_sentence = [] + numberOfPairs = 0 + numberOfWords = len(sentence) + lines.append(sentence.to_tokenized_string() + '\n') + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + true_values_for_sentence.append(j - i - 1) + + # for output text file + eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) + lines.append(eval_line) + + numberOfPairs += 1 + + eval_loss += loss / numberOfPairs # add average loss of word pairs + + for prediction_for_sentence, true_value_for_sentence in zip( + predictions, true_values_for_sentence + ): + # hot one vector of true value + y_true_instance = np.zeros(self.max_distance + 1, dtype=int) + y_true_instance[true_value_for_sentence] = 1 + y_true.append(y_true_instance.tolist()) + + # hot one vector of predicted value + y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) + y_pred_instance[prediction_for_sentence] = 1 + y_pred.append(y_pred_instance.tolist()) + + # speichert embeddings, falls embedding_storage!= 'None' + store_embeddings(sentence, embedding_storage_mode) + + if out_path is not None: + with open(out_path, "w", encoding="utf-8") as outfile: + outfile.write("".join(lines)) + + # make "classification report" + target_names = [] # liste aller labels, ins unserem Fall + for i in range(self.max_distance + 1): + target_names.append(str(i)) + classification_report = metrics.classification_report(y_true, y_pred, digits=4, + target_names=target_names, zero_division=0) + + # get scores + micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), + 4) + accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) + macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), + 4) + # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) + # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) + + detailed_result = ( + "\nResults:" + f"\n- F-score (micro) {micro_f_score}" + f"\n- F-score (macro) {macro_f_score}" + f"\n- Accuracy {accuracy_score}" + '\n\nBy class:\n' + classification_report + ) + + # line for log file + log_header = "ACCURACY" + log_line = f"\t{accuracy_score}" + + result = Result( + main_score=micro_f_score, + log_line=log_line, + log_header=log_header, + detailed_results=detailed_result, + ) + + eval_loss /= len(sentences) + + return result, eval_loss + + @staticmethod + def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: + filtered_sentences = [sentence for sentence in sentences if sentence.tokens] + if len(sentences) != len(filtered_sentences): + log.warning( + "Ignore {} sentence(s) with no tokens.".format( + len(sentences) - len(filtered_sentences) + ) + ) + return filtered_sentences + + def _obtain_labels( + self, scores: List[List[float]], predict_prob: bool = False + ) -> List[List[Label]]: + """ + Predicts the labels of sentences. + :param scores: the prediction scores from the model + :return: list of predicted labels + """ + + if predict_prob: + return [self._predict_label_prob(s) for s in scores] + + return [self._get_single_label(s) for s in scores] + + def _get_single_label(self, label_scores): # -> List[Label]: + softmax = torch.nn.functional.softmax(label_scores, dim=0) + conf, idx = torch.max(softmax, 0) + + return idx.item() + + def _predict_label_prob(self, label_scores) -> List[Label]: + softmax = torch.nn.functional.softmax(label_scores, dim=0) + label_probs = [] + for idx, conf in enumerate(softmax): + label_probs.append(Label(idx, conf.item())) + return label_probs + + def __str__(self): + return super(flair.nn.Model, self).__str__().rstrip(')') + \ + f' (beta): {self.beta}\n' + \ + f' (loss_max_weight): {self.loss_max_weight}\n' + \ + f' (max_distance) {self.max_distance}\n)' +
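Since the new diagnostic model is added as a standalone file and not exposed via `flair/models/__init__.py`, here is a rough training sketch for it. The corpus, embedding choice and output path are placeholders, and the import path simply mirrors the file location added above; note `mini_batch_size=1`, as required by the class docstring.

```python
from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings
from flair.models.diagnosis.distance_prediction_model import DistancePredictor
from flair.trainers import ModelTrainer

# any corpus works, since the gold distances are derived from token positions alone
corpus = UD_ENGLISH()

# classification variant: predicts 0..max_distance words between each word pair
predictor = DistancePredictor(word_embeddings=WordEmbeddings('glove'), max_distance=20)

# the model processes one sentence at a time, so the mini-batch size must be 1
trainer = ModelTrainer(predictor, corpus)
trainer.train('resources/diagnosis/distance-predictor', mini_batch_size=1, max_epochs=3)
```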