From af8f5abc74df3c478e287c17de5124c73ea0277b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 01/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 8f20886c1516c22bbb597e9c1a74b5eb79dac954 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 02/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3008 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1553 insertions(+), 1520 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from .sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from 
.sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7dc950dba..02e0a5800 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -267,6 +267,56 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. + http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -299,6 +349,36 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt" + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -449,21 +529,123 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
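A minimal sketch of the rewrite this helper performs, assuming a hypothetical whitespace-separated three-column file with the NER tag in the default column 1. Only the tag column changes and '0' is normalized to 'O'; this variant only ever writes I- prefixes, while the add_IOB2_tags helper defined next also emits B- at chunk starts:

    before add_IOB_tags:              after add_IOB_tags(ner_column=1):
        John     PER   NNP                John I-PER NNP
        visited  0     VBD                visited O VBD
        Berlin   LOC   NNP                Berlin I-LOC NNP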
-class WNUT_2020_NER(ColumnCorpus): +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') + + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -482,65 +664,40 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -551,45 +708,52 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) - + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil -class TWITTER_NER(ColumnCorpus): + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) + + +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -600,43 +764,61 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#" ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -647,125 +829,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll' ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -773,19 +855,18 @@ def __init__( in_memory: bool = True, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -795,41 +876,36 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -839,77 +915,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -918,112 +951,136 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll' + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
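Note that the raw .bio files put the label in the first column and the token in the second, which is why the column map defined just below is {0: "ner", 1: "text"} rather than the usual token-first order. An illustrative, hypothetical fragment of such a file:

    O          show
    O          me
    B-GENRE    science
    I-GENRE    fiction
    O          movies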
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme' - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1031,405 +1088,140 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, 
Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann' + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': 
'1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 
- 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': 
'1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': 
'1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1440,61 +1232,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, - in_memory=in_memory, comment_symbol="#" + super(NER_SWEDISH, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
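A minimal usage sketch for the NER_SWEDISH loader added above, assuming the default cache root under flair.cache_root and the IOB2 conversion performed at download time (the class name and the "ner" tag type come from the code above, everything else is illustrative):

from flair.datasets import NER_SWEDISH

# first call downloads train_corpus.txt / test_corpus.txt and rewrites them with IOB2 tags
corpus = NER_SWEDISH()
print(corpus)  # sentence counts per split; a dev split is typically sampled from train since none is shipped
ner_dictionary = corpus.make_tag_dictionary(tag_type="ner")
print(ner_dictionary)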
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1505,44 +1271,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll' + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1552,24 +1309,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1591,35 +1341,33 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1635,18 +1383,29 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll' + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1655,15 +1414,14 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
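A minimal usage sketch for the Finnish TURKU_NER corpus wired up above, assuming the default cache root (illustrative only):

from flair.datasets import TURKU_NER

corpus = TURKU_NER()  # downloads train.tsv, dev.tsv and test.tsv on the first call
print(corpus)
print(corpus.train[0].to_tagged_string("ner"))  # first training sentence with its NER tags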
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1671,7 +1429,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1682,32 +1440,41 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1718,44 +1485,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1766,48 +1534,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1818,35 +1583,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1856,29 +1631,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. 
+ :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1888,27 +1680,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1919,25 +1730,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1948,25 +1779,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,25 +1828,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the WEIBO_NER corpus . 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2006,192 +1879,449 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): + """ + WikiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreviations, e.g. "en" for English. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you don't want that, set it to None.
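A minimal usage sketch for the WIKIANN loader documented above, assuming the optional gdown package that the downloader imports is installed (illustrative only):

from flair.datasets import WIKIANN

# one tar.gz per language is fetched from Google Drive, unpacked and simplified to two columns
corpus = WIKIANN(languages=["en", "de"])
print(corpus)  # a MultiCorpus holding one ColumnCorpus per requested language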
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' + # download data if necessary + first = True + for language in languages: -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not 
language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WIKIANN, self).__init__( + corpora, name='wikiann' ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 
'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': 
'1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': 
'18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': 
'1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2200,12 +2330,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2213,7 +2342,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2224,117 +2353,32 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class BIOSCOPE(ColumnCorpus): - - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "tag"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt" - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil - - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", - ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") -class UP_CHINESE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Chinese dataset from the Universal 
Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2345,92 +2389,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("en", dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ENGLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 10: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - ) -class UP_FRENCH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2441,44 +2418,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("de", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2489,44 +2447,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2537,44 +2476,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2585,44 +2505,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2633,44 +2534,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2681,127 +2563,83 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERComplex(ColumnCorpus): + +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2812,22 +2650,19 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(SEC_FILLINGS, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class TURKU_NER(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2836,12 +2671,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
:param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2860,23 +2694,201 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(TURKU_NER, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) \ No newline at end of file + ) + + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): + def __init__( + self, + languages: Union[str, List[str]] = None, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, + ): + """ + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. 
+ "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = "xtreme" + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + + # download data if necessary + for language in languages: + + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme' + ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + 
f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. 
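+
+As a minimal, illustrative sketch (the constructor pattern mirrors the `UP_*` classes added in this patch), one of these corpora can be loaded by its ID, for instance the German Universal Proposition Bank:
+
+```python
+import flair.datasets
+
+# downloads the data on the first call, then loads the train/dev/test splits
+corpus = flair.datasets.UP_GERMAN()
+
+# print basic corpus statistics
+print(corpus)
+```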
+ +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From d256d947b55cb8f057a7606f0de057afd0bb4c1d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 03/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From 541e0a8fed226082d51df09f86ced3e8b6fd05d0 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 04/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. 
```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. -### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From d5f951a943ff905ae91a333f401f421b5442ddd9 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 05/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From dff58a08da42de9dfe3d95fdc3808fb250c0e664 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 06/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. 
Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From 8ce61612ab9ad02a99bbdc91339059b40bda0644 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 07/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 1a3bcce5d4c82a9f18ef11eb76024dfd3f931ea6 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 08/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3008 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1553 insertions(+), 1520 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from .sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import 
WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from .sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7dc950dba..02e0a5800 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -267,6 +267,56 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. + http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -299,6 +349,36 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt" + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -449,21 +529,123 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
-class WNUT_2020_NER(ColumnCorpus): +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') + + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -482,65 +664,40 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -551,45 +708,52 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) - + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil -class TWITTER_NER(ColumnCorpus): + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) + + +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -600,43 +764,61 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#" ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
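+ A minimal usage sketch (illustrative only; the 10% downsampling ratio is an arbitrary example to keep memory usage low):
+
+ >>> corpus = EUROPARL_NER_GERMAN()
+ >>> print(corpus)
+ >>> smaller_corpus = corpus.downsample(0.1)
+ >>> print(smaller_corpus)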
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -647,125 +829,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll' ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -773,19 +855,18 @@ def __init__( in_memory: bool = True, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -795,41 +876,36 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -839,77 +915,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -918,112 +951,136 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll' + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
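+ A minimal usage sketch (illustrative only):
+
+ >>> corpus = MIT_MOVIE_NER_SIMPLE()
+ >>> print(corpus)
+ >>> print(corpus.train[0].to_tagged_string("ner"))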
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme' - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1031,405 +1088,140 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, 
Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann' + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': 
'1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 
- 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': 
'1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': 
'1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
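+ A minimal usage sketch (illustrative only; the downloaded files are converted to IOB2 tags by this constructor, see add_IOB2_tags above):
+
+ >>> corpus = NER_SWEDISH()
+ >>> print(corpus)
+ >>> print(corpus.train[0].to_tagged_string("ner"))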
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1440,61 +1232,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, - in_memory=in_memory, comment_symbol="#" + super(NER_SWEDISH, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1505,44 +1271,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll' + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1552,24 +1309,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1591,35 +1341,33 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1635,18 +1383,29 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll' + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1655,15 +1414,14 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1671,7 +1429,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1682,32 +1440,41 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
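TWITTER_NER only downloads a single annotated file, ner.txt, and registers it as the train file; flair's Corpus base class then samples dev and test splits from it. A hedged sketch of that behaviour (the 10% downsample is purely illustrative):

from flair.datasets import TWITTER_NER

corpus = TWITTER_NER()
print(corpus)  # dev and test are split off from ner.txt automatically

# optional: shrink the corpus while experimenting
small_corpus = corpus.downsample(0.1)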
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1718,44 +1485,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1766,48 +1534,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1818,35 +1583,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1856,29 +1631,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. 
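The Universal Propositions loaders collected here all follow one pattern: column 1 of the CoNLL-U file is the token text, the frame label sits in column 9 (column 10 for the English EWT layout), and comment_symbol="#" skips the CoNLL-U metadata lines. A minimal sketch of loading one of them (standard flair usage, not part of the patch):

from flair.datasets import UP_FRENCH

corpus = UP_FRENCH()  # downloads fr-up-{train,dev,test}.conllu on first call
print(corpus.train[0].to_tagged_string("frame"))  # tokens with their frame labels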
+ :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1888,27 +1680,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
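Because the label column of UP_GERMAN (and its siblings) is registered under the tag type "frame", downstream code has to use that name when building label dictionaries or taggers. A short sketch (standard flair API, shown only as an assumed workflow):

from flair.datasets import UP_GERMAN

corpus = UP_GERMAN()
frame_dictionary = corpus.make_tag_dictionary(tag_type="frame")
print(len(frame_dictionary))  # number of distinct frame labels in the corpus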
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1919,25 +1730,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1948,25 +1779,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,25 +1828,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the WEIBO_NER corpus . 
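With both UP_SPANISH and UP_SPANISH_ANCORA available, the two Spanish propbanks can be combined through flair's existing MultiCorpus class. Combining them is just one possible use, not something the patch prescribes; a hedged sketch:

from flair.data import MultiCorpus
from flair.datasets import UP_SPANISH, UP_SPANISH_ANCORA

corpus = MultiCorpus([UP_SPANISH(), UP_SPANISH_ANCORA()])
print(corpus)  # aggregated train/dev/test counts over both corpora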
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2006,192 +1879,449 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): + """ + WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. 
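WEIBO_NER above reads the three weiboNER_2nd_conll_format files and, since tag_to_bioes defaults to "ner", converts the annotation to BIOES while loading. A minimal usage sketch (standard flair calls, not part of the patch):

from flair.datasets import WEIBO_NER

corpus = WEIBO_NER()
ner_dictionary = corpus.make_tag_dictionary(tag_type="ner")
print(ner_dictionary)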
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' + # download data if necessary + first = True + for language in languages: -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not 
language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WIKIANN, self).__init__( + corpora, name='wikiann' ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 
'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': 
'1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': 
'18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': 
'1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2200,12 +2330,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
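Taken together, the WIKIANN class, the silver_standard_to_simple_ner_annotation converter and the Google Drive ID table above let any subset of the WikiAnn languages be loaded into a single MultiCorpus. A hedged usage sketch (the gdown package is needed at download time, as in the code above; the language codes are the WikiAnn abbreviations):

from flair.datasets import WIKIANN

# first call downloads and converts wikiann-en.bio and wikiann-de.bio,
# then wraps the two resulting ColumnCorpus objects in one MultiCorpus
corpus = WIKIANN(languages=["en", "de"])
print(corpus)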
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2213,7 +2342,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2224,117 +2353,32 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class BIOSCOPE(ColumnCorpus): - - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "tag"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt" - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil - - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", - ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") -class UP_CHINESE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Chinese dataset from the Universal 
Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2345,92 +2389,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("en", dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ENGLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
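The WIKINER_* loaders being reordered here all share the three-column format (text, pos, ner) and default to in_memory=False because the silver-standard corpora are large. A short sketch for the English variant (standard flair usage; the existing _download_wikiner helper handles the bz2 download and unpacking):

from flair.datasets import WIKINER_ENGLISH

corpus = WIKINER_ENGLISH()  # kept on disk, since in_memory defaults to False
print(corpus.train[0].to_tagged_string("ner"))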
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 10: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - ) -class UP_FRENCH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2441,44 +2418,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("de", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2489,44 +2447,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2537,44 +2476,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2585,44 +2505,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2633,44 +2534,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2681,127 +2563,83 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERComplex(ColumnCorpus): + +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2812,22 +2650,19 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(SEC_FILLINGS, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class TURKU_NER(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2836,12 +2671,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
:param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2860,23 +2694,201 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(TURKU_NER, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) \ No newline at end of file + ) + + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): + def __init__( + self, + languages: Union[str, List[str]] = None, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, + ): + """ + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. 
+ "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = "xtreme" + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + + # download data if necessary + for language in languages: + + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme' + ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + 
f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. 
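+
+As a brief usage sketch (the constructor names exported from `flair.datasets` match the IDs listed
+in the table below), a proposition bank can be loaded like any other corpus; the data is
+auto-downloaded on the first call:
+
+```python
+from flair.datasets import UP_GERMAN
+
+# load the German Universal Propositions corpus (downloaded automatically on first use)
+corpus = UP_GERMAN()
+
+# print the corpus object to see the number of train, dev and test sentences
+print(corpus)
+```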
+ +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From 08e027cfd5f42ee9b82220ef769f8f814b6ff7fb Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 09/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From fa854426b7eb9c7d2285ab514048a5db8775de3d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 10/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. 
```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. -### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From d02ad73c5485b5bcada7cd0462737f4a6921d53a Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 11/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From 2ab3139d90d7bd9ce2ed36033077483b5b8c2459 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 12/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. 
Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From ce9904a09e796d186537f14b48af4f4e3179eee8 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 10:52:55 +0100 Subject: [PATCH 13/35] GH-1983: move distance classifier to diagnostics module --- flair/models/__init__.py | 1 - flair/models/text_classification_model.py | 486 +--------------------- 2 files changed, 1 insertion(+), 486 deletions(-) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index 16a09af1c..784b038a9 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -1,4 +1,3 @@ from .sequence_tagger_model import SequenceTagger, MultiTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier -from .text_classification_model import DistClassifier diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 00115d2aa..7e0dab976 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -7,7 +7,6 @@ from torch.utils.data.dataset import Dataset from tqdm import tqdm import numpy as np -from math import floor import sklearn.metrics as metrics from sklearn.metrics.pairwise import cosine_similarity @@ -17,12 +16,7 @@ from flair.data import Dictionary, Sentence, Label, DataPoint from flair.datasets import SentenceDataset, DataLoader from flair.file_utils import cached_path -from flair.training_utils import ( - MetricRegression, - convert_labels_to_one_hot, - Result, - store_embeddings, -) +from flair.training_utils import convert_labels_to_one_hot, Result, store_embeddings log = logging.getLogger("flair") @@ -947,481 +941,3 @@ def _fetch_model(model_name) -> str: model_name = cached_path(model_map[model_name], cache_dir=cache_dir) return model_name - - - -class DistClassifier(flair.nn.Model): - """ - DistClassifier - Model to predict distance between two words given their embeddings. Takes (contextual) word embedding as input. - The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. - Note: When used for training the batch size must be set to 1!!! - """ - - def __init__( - self, - word_embeddings: flair.embeddings.TokenEmbeddings, - max_distance: int = 20, - beta: float = 1.0, - loss_max_weight: float = 1, - regression = False, - regr_loss_step = 0 - ): - """ - Initializes a DistClassifier - :param word_embeddings: embeddings used to embed each sentence - .param max_distance: max dist between word pairs = number of predicted classes - 1 - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_max_weight: Only for classification: Since small distances between word pairs occur mor frequent it makes sense to give them less weight - in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >=1 - The other weights decrease with equidistant steps from high to low distance. - :param regression: if True the class does regression instead of classification - :param regr_loss_step: if > 0, the MSE-Loss in regression will be weighted. 
Word pairs with - distance 0 have weight 1. Then, as the distance increases, the weight in the loss function, - increases step by step with size regr_loss_step - """ - - super(DistClassifier, self).__init__() - - self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings - - self.beta = beta - - self.loss_max_weight = loss_max_weight - - self.regression = regression - - self.regr_loss_step = regr_loss_step - - if not regression: - self.max_distance = max_distance - - # weights for loss function - if self.loss_max_weight > 1: - step = (self.loss_max_weight - 1) / self.max_distance - - weight_list = [1. + i * step for i in range(self.max_distance + 1)] - - self.loss_weights = torch.FloatTensor(weight_list).to(flair.device) - - else: - self.loss_weights = None - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is max_distance + 1, i.e. we allow 0,1,...,max_distance words between pairs - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, self.max_distance + 1) - - self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights) - - # regression - else: - self.max_distance = float('inf') - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is 1 - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, 1) - - if regr_loss_step > 0: - self.loss_function = self.weighted_mse_loss - else: - self.loss_function = nn.MSELoss() - - nn.init.xavier_uniform_(self.decoder.weight) - - # auto-spawn on GPU if available - self.to(flair.device) - - - # all input should be tensors - def weighted_mse_loss(self,predictions, target): - - weight = 1 + self.regr_loss_step * target - - return (weight * ((predictions - target) ** 2)).mean() - - - # forward allows only a single sentcence!! 
- def forward(self, sentence: Sentence): - - # embed words of sentence - self.word_embeddings.embed(sentence) - - # go through all pairs of words with a maximum number of max_distance in between - numberOfWords = len(sentence) - text_embedding_list = [] - # go through all pairs - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) - - # 2-dim matrix whose rows are the embeddings of word pairs of the sentence - text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) - - label_scores = self.decoder(text_embedding_tensor) - - if self.regression: - return label_scores.squeeze(1) - - return label_scores - - def _get_state_dict(self): - model_state = { - "state_dict": self.state_dict(), - "word_embeddings": self.word_embeddings, - "max_distance": self.max_distance, - "beta": self.beta, - "loss_max_weight": self.loss_max_weight, - "regression": self.regression, - "regr_loss_step": self.regr_loss_step - } - return model_state - - @staticmethod - def _init_model_with_state_dict(state): - beta = 1.0 if "beta" not in state.keys() else state["beta"] - weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] - - model = DistClassifier( - word_embeddings=state["word_embeddings"], - max_distance=state["max_distance"], - beta=beta, - loss_max_weight=weight, - regression=state["regression"], - regr_loss_step=state["regr_loss_step"] - ) - - model.load_state_dict(state["state_dict"]) - return model - - # So far only one sentence allowed - # If list of sentences is handed the function works with the first sentence of the list - def forward_loss( - self, data_points: Union[List[Sentence], Sentence] - ) -> torch.tensor: - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - if len(data_points) < 2: - return torch.tensor([0.], requires_grad=True) - - scores = self.forward(data_points) - - return self._calculate_loss(scores, data_points) - - # Assume data_points is a single sentence!!! 
- # scores are the predictions for each word pair - def _calculate_loss(self, scores, data_points): - - indices = [] - numberOfWords = len(data_points) - - # classification needs labels to be integers, regression needs labels to be float - # this is due to the different loss functions - if not self.regression: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.LongTensor([j - i - 1])) # distance between words - else: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.Tensor([j - i - 1])) # distance between words - - labels = torch.cat(indices, 0).to(flair.device) - - return self.loss_function(scores, labels) - - # only single sentences as input - def _forward_scores_and_loss( - self, data_points: Union[List[Sentence], Sentence], return_loss=False): - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - scores = self.forward(data_points) - - loss = None - if return_loss: - loss = self._calculate_loss(scores, data_points) - - return scores, loss - - def evaluate( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter - num_workers: int = 8, - ) -> (Result, float): - - if self.regression: - return self.evaluate_regression( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - return self.evaluate_classification( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - def evaluate_regression( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - with torch.no_grad(): - - buckets = [0 for _ in range(11)] - - eval_loss = 0 - - metric = MetricRegression("Evaluation") - - lines: List[str] = [] - - max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 - - num_occurences = [0 for _ in range(max_dist_plus_one)] - - cumulated_values = [0 for _ in range(max_dist_plus_one)] - - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - predictions = scores.tolist() - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_dist = j - i - 1 - pred = predictions[numberOfPairs] - - true_values_for_sentence.append(true_dist) - - # for output text file - eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" - lines.append(eval_line) - - # for buckets - error = abs(true_dist - pred) - if error >= 10: - buckets[10] += 1 - else: - buckets[floor(error)] += 1 - - # for average prediction - num_occurences[true_dist] += 1 - cumulated_values[true_dist] += pred - - numberOfPairs += 1 - - eval_loss += loss/numberOfPairs - - metric.true.extend(true_values_for_sentence) - metric.pred.extend(predictions) - - store_embeddings(sentence, embedding_storage_mode) - - eval_loss /= len(sentences) # w.r.t self.loss - - # add some statistics to the output - eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 
3-4 | 4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" - lines.append(eval_line) - eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0],buckets[1],buckets[2],buckets[3], - buckets[4],buckets[5],buckets[6],buckets[7], - buckets[8],buckets[9],buckets[10]) - lines.append(eval_line) - lines.append("\nAverage predicted values per distance:\n") - eval_line = "" - for i in range(max_dist_plus_one): - eval_line += str(i) + ": " + f"{cumulated_values[i]/num_occurences[i]:.2f}" + " " - if i!=0 and i%15==0: - eval_line += "\n" - - lines.append(eval_line) - - - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" - log_header = "MSE\tSPEARMAN\tPEARSON" - - detailed_result = ( - f"AVG: mse: {metric.mean_squared_error():.4f} - " - f"mae: {metric.mean_absolute_error():.4f} - " - f"pearson: {metric.pearsonr():.4f} - " - f"spearman: {metric.spearmanr():.4f}" - ) - - result: Result = Result( - metric.pearsonr(), log_header, log_line, detailed_result - ) - - - return result, eval_loss - - def evaluate_classification( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - # use scikit-learn to evaluate - y_true = [] - y_pred = [] - - with torch.no_grad(): - eval_loss = 0 - - lines: List[str] = [] - # we iterate over each sentence, instead of batches - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - # get single labels from scores - predictions = [self._get_single_label(s) for s in scores] - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_values_for_sentence.append(j - i - 1) - - # for output text file - eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) - lines.append(eval_line) - - numberOfPairs += 1 - - eval_loss += loss / numberOfPairs # add average loss of word pairs - - for prediction_for_sentence, true_value_for_sentence in zip( - predictions, true_values_for_sentence - ): - # hot one vector of true value - y_true_instance = np.zeros(self.max_distance + 1, dtype=int) - y_true_instance[true_value_for_sentence] = 1 - y_true.append(y_true_instance.tolist()) - - # hot one vector of predicted value - y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) - y_pred_instance[prediction_for_sentence] = 1 - y_pred.append(y_pred_instance.tolist()) - - # speichert embeddings, falls embedding_storage!= 'None' - store_embeddings(sentence, embedding_storage_mode) - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - # make "classification report" - target_names = [] # liste aller labels, ins unserem Fall - for i in range(self.max_distance + 1): - target_names.append(str(i)) - classification_report = metrics.classification_report(y_true, y_pred, digits=4, - target_names=target_names, zero_division=0) - - # get scores - micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), - 4) - accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) - macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), - 4) - # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) - # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) - - detailed_result = ( - "\nResults:" - f"\n- F-score (micro) {micro_f_score}" - f"\n- F-score (macro) {macro_f_score}" - f"\n- Accuracy {accuracy_score}" - '\n\nBy class:\n' + classification_report - ) - - # line for log file - log_header = "ACCURACY" - log_line = f"\t{accuracy_score}" - - result = Result( - main_score=micro_f_score, - log_line=log_line, - log_header=log_header, - detailed_results=detailed_result, - ) - - eval_loss /= len(sentences) - - return result, eval_loss - - @staticmethod - def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: - filtered_sentences = [sentence for sentence in sentences if sentence.tokens] - if len(sentences) != len(filtered_sentences): - log.warning( - "Ignore {} sentence(s) with no tokens.".format( - len(sentences) - len(filtered_sentences) - ) - ) - return filtered_sentences - - def _obtain_labels( - self, scores: List[List[float]], predict_prob: bool = False - ) -> List[List[Label]]: - """ - Predicts the labels of sentences. - :param scores: the prediction scores from the model - :return: list of predicted labels - """ - - if predict_prob: - return [self._predict_label_prob(s) for s in scores] - - return [self._get_single_label(s) for s in scores] - - def _get_single_label(self, label_scores): # -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - conf, idx = torch.max(softmax, 0) - - return idx.item() - - def _predict_label_prob(self, label_scores) -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - label_probs = [] - for idx, conf in enumerate(softmax): - label_probs.append(Label(idx, conf.item())) - return label_probs - - def __str__(self): - return super(flair.nn.Model, self).__str__().rstrip(')') + \ - f' (beta): {self.beta}\n' + \ - f' (loss_max_weight): {self.loss_max_weight}\n' + \ - f' (max_distance) {self.max_distance}\n)' - From 80a675b596bbc268ac383175e97a32fa5247e6e6 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 14/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From 04f227e9fbc4a356fd8ad5d72fecf0091f5487a8 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 15/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3008 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1553 insertions(+), 1520 deletions(-) diff --git 
a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from .sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from .sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 7dc950dba..02e0a5800 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -267,6 +267,56 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. 
+ http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -299,6 +349,36 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt" + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -449,21 +529,123 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
-class WNUT_2020_NER(ColumnCorpus): +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') + + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -482,65 +664,40 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
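+        Example (a minimal usage sketch, assuming the default arguments shown above):
+
+            from flair.datasets import CONLL_2000
+            corpus = CONLL_2000()
+            # make_tag_dictionary is the generic flair Corpus helper
+            chunk_dictionary = corpus.make_tag_dictionary(tag_type="np")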
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -551,45 +708,52 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) - + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil -class TWITTER_NER(ColumnCorpus): + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) + + +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -600,43 +764,61 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#" ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
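+        Example (a minimal usage sketch, assuming the defaults above; the two Europarl
+        sessions downloaded below serve as training and test files):
+
+            from flair.datasets import EUROPARL_NER_GERMAN
+            corpus = EUROPARL_NER_GERMAN()
+            print(corpus)  # prints the train/dev/test sentence counts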
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -647,125 +829,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll' ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -773,19 +855,18 @@ def __init__( in_memory: bool = True, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -795,41 +876,36 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -839,77 +915,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -918,112 +951,136 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll' + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
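+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import MIT_MOVIE_NER_SIMPLE
+            corpus = MIT_MOVIE_NER_SIMPLE()
+            # show the first training sentence with its NER tags
+            print(corpus.train[0].to_tagged_string())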
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
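+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import MIT_MOVIE_NER_COMPLEX
+            corpus = MIT_MOVIE_NER_COMPLEX()
+            print(len(corpus.train), "training sentences")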
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme' - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
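+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import MIT_RESTAURANT_NER
+            corpus = MIT_RESTAURANT_NER()
+            print(corpus.test[0])  # first sentence of the downloaded test split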
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1031,405 +1088,140 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, 
Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann' + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': 
'1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', 
- 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': 
'1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': 
'1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
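+        Example (a minimal usage sketch, assuming the defaults above; downsample is the
+        generic flair Corpus helper and is optional):
+
+            from flair.datasets import NER_SWEDISH
+            corpus = NER_SWEDISH()
+            smaller_corpus = corpus.downsample(0.1)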
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1440,61 +1232,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, - in_memory=in_memory, comment_symbol="#" + super(NER_SWEDISH, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1505,44 +1271,35 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll' + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1552,24 +1309,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1591,35 +1341,33 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
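+        Example (a minimal usage sketch, assuming the defaults above):
+
+            from flair.datasets import TURKU_NER
+            corpus = TURKU_NER()
+            tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")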
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1635,18 +1383,29 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll' + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1655,15 +1414,14 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1671,7 +1429,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1682,32 +1440,41 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1718,44 +1485,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1766,48 +1534,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1818,35 +1583,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1856,29 +1631,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. 
+ :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1888,27 +1680,46 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1919,25 +1730,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1948,25 +1779,45 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,25 +1828,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, ): + """ + Initialize the WEIBO_NER corpus . 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2006,192 +1879,449 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, ): + """ + WikiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreviations, e.g. "en" for English. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de", ...) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/, some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you don't want that, set it to None.
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' + # download data if necessary + first = True + for language in languages: -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) - + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not 
language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory + super(WIKIANN, self).__init__( + corpora, name='wikiann' ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 
'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': 
'1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': 
'18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': 
'1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2200,12 +2330,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2213,7 +2342,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2224,117 +2353,32 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", ) -class BIOSCOPE(ColumnCorpus): - - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "tag"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt" - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil - - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", - ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") -class UP_CHINESE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Chinese dataset from the Universal 
Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2345,92 +2389,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("en", dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ENGLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 10: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - ) -class UP_FRENCH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2441,44 +2418,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("de", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2489,44 +2447,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2537,44 +2476,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2585,44 +2505,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2633,44 +2534,25 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2681,127 +2563,83 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class MITMovieNERComplex(ColumnCorpus): + +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, ): - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2812,22 +2650,19 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(SEC_FILLINGS, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class TURKU_NER(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2836,12 +2671,11 @@ def __init__( document_as_sequence: bool = False, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
:param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2860,23 +2694,201 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(TURKU_NER, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", - ) \ No newline at end of file + ) + + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): + def __init__( + self, + languages: Union[str, List[str]] = None, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = False, + ): + """ + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. 
+ "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = "xtreme" + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + + # download data if necessary + for language in languages: + + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme' + ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + 
f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. 
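For illustration, loading one of these corpora should work like any other Flair column corpus. The snippet below is a minimal sketch, not part of the original tutorial, based on the UP_GERMAN loader exposed in this patch series; it assumes the corpus auto-downloads on first use and that frame annotations are available under the tag type 'frame', as suggested by the loader's column format:

```python
import flair.datasets

# auto-downloads the German Universal Propositions data on first call and loads it as a Corpus
corpus = flair.datasets.UP_GERMAN()
print(corpus)

# frame labels are read from column 9 of the CoNLL-U files, so the tag type to use is 'frame'
frame_dictionary = corpus.make_tag_dictionary('frame')
print(frame_dictionary)
```

The same pattern should apply to the other identifiers listed below.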
+ +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From b5db7ed62c00f618fa5e1ed520bfd979a5cd362a Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 16/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From 6dbef308d17ba4578f013441a8b315eb1f95e498 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 17/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. 
```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. -### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From 06ec50c1129963b3db575348ce088c50935ff42a Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 18/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From 1d91f254e8de01362e4f72d4b4308edb697f520f Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 19/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. 
Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From 4c274dc5416a395c4f0e4824b4cb0c7f78749529 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 10:52:55 +0100 Subject: [PATCH 20/35] GH-1983: move distance classifier to diagnostics module --- flair/models/__init__.py | 1 - flair/models/text_classification_model.py | 486 +--------------------- 2 files changed, 1 insertion(+), 486 deletions(-) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index 15f2a326b..ebb6827d3 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -2,4 +2,3 @@ from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier -from .text_classification_model import DistClassifier diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 00115d2aa..7e0dab976 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -7,7 +7,6 @@ from torch.utils.data.dataset import Dataset from tqdm import tqdm import numpy as np -from math import floor import sklearn.metrics as metrics from sklearn.metrics.pairwise import cosine_similarity @@ -17,12 +16,7 @@ from flair.data import Dictionary, Sentence, Label, DataPoint from flair.datasets import SentenceDataset, DataLoader from flair.file_utils import cached_path -from flair.training_utils import ( - MetricRegression, - convert_labels_to_one_hot, - Result, - store_embeddings, -) +from flair.training_utils import convert_labels_to_one_hot, Result, store_embeddings log = logging.getLogger("flair") @@ -947,481 +941,3 @@ def _fetch_model(model_name) -> str: model_name = cached_path(model_map[model_name], cache_dir=cache_dir) return model_name - - - -class DistClassifier(flair.nn.Model): - """ - DistClassifier - Model to predict distance between two words given their embeddings. Takes (contextual) word embedding as input. - The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. - Note: When used for training the batch size must be set to 1!!! - """ - - def __init__( - self, - word_embeddings: flair.embeddings.TokenEmbeddings, - max_distance: int = 20, - beta: float = 1.0, - loss_max_weight: float = 1, - regression = False, - regr_loss_step = 0 - ): - """ - Initializes a DistClassifier - :param word_embeddings: embeddings used to embed each sentence - .param max_distance: max dist between word pairs = number of predicted classes - 1 - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_max_weight: Only for classification: Since small distances between word pairs occur mor frequent it makes sense to give them less weight - in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >=1 - The other weights decrease with equidistant steps from high to low distance. - :param regression: if True the class does regression instead of classification - :param regr_loss_step: if > 0, the MSE-Loss in regression will be weighted. 
Word pairs with - distance 0 have weight 1. Then, as the distance increases, the weight in the loss function, - increases step by step with size regr_loss_step - """ - - super(DistClassifier, self).__init__() - - self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings - - self.beta = beta - - self.loss_max_weight = loss_max_weight - - self.regression = regression - - self.regr_loss_step = regr_loss_step - - if not regression: - self.max_distance = max_distance - - # weights for loss function - if self.loss_max_weight > 1: - step = (self.loss_max_weight - 1) / self.max_distance - - weight_list = [1. + i * step for i in range(self.max_distance + 1)] - - self.loss_weights = torch.FloatTensor(weight_list).to(flair.device) - - else: - self.loss_weights = None - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is max_distance + 1, i.e. we allow 0,1,...,max_distance words between pairs - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, self.max_distance + 1) - - self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights) - - # regression - else: - self.max_distance = float('inf') - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is 1 - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, 1) - - if regr_loss_step > 0: - self.loss_function = self.weighted_mse_loss - else: - self.loss_function = nn.MSELoss() - - nn.init.xavier_uniform_(self.decoder.weight) - - # auto-spawn on GPU if available - self.to(flair.device) - - - # all input should be tensors - def weighted_mse_loss(self,predictions, target): - - weight = 1 + self.regr_loss_step * target - - return (weight * ((predictions - target) ** 2)).mean() - - - # forward allows only a single sentcence!! 
- def forward(self, sentence: Sentence): - - # embed words of sentence - self.word_embeddings.embed(sentence) - - # go through all pairs of words with a maximum number of max_distance in between - numberOfWords = len(sentence) - text_embedding_list = [] - # go through all pairs - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) - - # 2-dim matrix whose rows are the embeddings of word pairs of the sentence - text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) - - label_scores = self.decoder(text_embedding_tensor) - - if self.regression: - return label_scores.squeeze(1) - - return label_scores - - def _get_state_dict(self): - model_state = { - "state_dict": self.state_dict(), - "word_embeddings": self.word_embeddings, - "max_distance": self.max_distance, - "beta": self.beta, - "loss_max_weight": self.loss_max_weight, - "regression": self.regression, - "regr_loss_step": self.regr_loss_step - } - return model_state - - @staticmethod - def _init_model_with_state_dict(state): - beta = 1.0 if "beta" not in state.keys() else state["beta"] - weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] - - model = DistClassifier( - word_embeddings=state["word_embeddings"], - max_distance=state["max_distance"], - beta=beta, - loss_max_weight=weight, - regression=state["regression"], - regr_loss_step=state["regr_loss_step"] - ) - - model.load_state_dict(state["state_dict"]) - return model - - # So far only one sentence allowed - # If list of sentences is handed the function works with the first sentence of the list - def forward_loss( - self, data_points: Union[List[Sentence], Sentence] - ) -> torch.tensor: - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - if len(data_points) < 2: - return torch.tensor([0.], requires_grad=True) - - scores = self.forward(data_points) - - return self._calculate_loss(scores, data_points) - - # Assume data_points is a single sentence!!! 
- # scores are the predictions for each word pair - def _calculate_loss(self, scores, data_points): - - indices = [] - numberOfWords = len(data_points) - - # classification needs labels to be integers, regression needs labels to be float - # this is due to the different loss functions - if not self.regression: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.LongTensor([j - i - 1])) # distance between words - else: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.Tensor([j - i - 1])) # distance between words - - labels = torch.cat(indices, 0).to(flair.device) - - return self.loss_function(scores, labels) - - # only single sentences as input - def _forward_scores_and_loss( - self, data_points: Union[List[Sentence], Sentence], return_loss=False): - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - scores = self.forward(data_points) - - loss = None - if return_loss: - loss = self._calculate_loss(scores, data_points) - - return scores, loss - - def evaluate( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter - num_workers: int = 8, - ) -> (Result, float): - - if self.regression: - return self.evaluate_regression( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - return self.evaluate_classification( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - def evaluate_regression( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - with torch.no_grad(): - - buckets = [0 for _ in range(11)] - - eval_loss = 0 - - metric = MetricRegression("Evaluation") - - lines: List[str] = [] - - max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 - - num_occurences = [0 for _ in range(max_dist_plus_one)] - - cumulated_values = [0 for _ in range(max_dist_plus_one)] - - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - predictions = scores.tolist() - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_dist = j - i - 1 - pred = predictions[numberOfPairs] - - true_values_for_sentence.append(true_dist) - - # for output text file - eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" - lines.append(eval_line) - - # for buckets - error = abs(true_dist - pred) - if error >= 10: - buckets[10] += 1 - else: - buckets[floor(error)] += 1 - - # for average prediction - num_occurences[true_dist] += 1 - cumulated_values[true_dist] += pred - - numberOfPairs += 1 - - eval_loss += loss/numberOfPairs - - metric.true.extend(true_values_for_sentence) - metric.pred.extend(predictions) - - store_embeddings(sentence, embedding_storage_mode) - - eval_loss /= len(sentences) # w.r.t self.loss - - # add some statistics to the output - eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 
3-4 | 4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" - lines.append(eval_line) - eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0],buckets[1],buckets[2],buckets[3], - buckets[4],buckets[5],buckets[6],buckets[7], - buckets[8],buckets[9],buckets[10]) - lines.append(eval_line) - lines.append("\nAverage predicted values per distance:\n") - eval_line = "" - for i in range(max_dist_plus_one): - eval_line += str(i) + ": " + f"{cumulated_values[i]/num_occurences[i]:.2f}" + " " - if i!=0 and i%15==0: - eval_line += "\n" - - lines.append(eval_line) - - - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" - log_header = "MSE\tSPEARMAN\tPEARSON" - - detailed_result = ( - f"AVG: mse: {metric.mean_squared_error():.4f} - " - f"mae: {metric.mean_absolute_error():.4f} - " - f"pearson: {metric.pearsonr():.4f} - " - f"spearman: {metric.spearmanr():.4f}" - ) - - result: Result = Result( - metric.pearsonr(), log_header, log_line, detailed_result - ) - - - return result, eval_loss - - def evaluate_classification( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - # use scikit-learn to evaluate - y_true = [] - y_pred = [] - - with torch.no_grad(): - eval_loss = 0 - - lines: List[str] = [] - # we iterate over each sentence, instead of batches - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - # get single labels from scores - predictions = [self._get_single_label(s) for s in scores] - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_values_for_sentence.append(j - i - 1) - - # for output text file - eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) - lines.append(eval_line) - - numberOfPairs += 1 - - eval_loss += loss / numberOfPairs # add average loss of word pairs - - for prediction_for_sentence, true_value_for_sentence in zip( - predictions, true_values_for_sentence - ): - # hot one vector of true value - y_true_instance = np.zeros(self.max_distance + 1, dtype=int) - y_true_instance[true_value_for_sentence] = 1 - y_true.append(y_true_instance.tolist()) - - # hot one vector of predicted value - y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) - y_pred_instance[prediction_for_sentence] = 1 - y_pred.append(y_pred_instance.tolist()) - - # speichert embeddings, falls embedding_storage!= 'None' - store_embeddings(sentence, embedding_storage_mode) - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - # make "classification report" - target_names = [] # liste aller labels, ins unserem Fall - for i in range(self.max_distance + 1): - target_names.append(str(i)) - classification_report = metrics.classification_report(y_true, y_pred, digits=4, - target_names=target_names, zero_division=0) - - # get scores - micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), - 4) - accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) - macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), - 4) - # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) - # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) - - detailed_result = ( - "\nResults:" - f"\n- F-score (micro) {micro_f_score}" - f"\n- F-score (macro) {macro_f_score}" - f"\n- Accuracy {accuracy_score}" - '\n\nBy class:\n' + classification_report - ) - - # line for log file - log_header = "ACCURACY" - log_line = f"\t{accuracy_score}" - - result = Result( - main_score=micro_f_score, - log_line=log_line, - log_header=log_header, - detailed_results=detailed_result, - ) - - eval_loss /= len(sentences) - - return result, eval_loss - - @staticmethod - def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: - filtered_sentences = [sentence for sentence in sentences if sentence.tokens] - if len(sentences) != len(filtered_sentences): - log.warning( - "Ignore {} sentence(s) with no tokens.".format( - len(sentences) - len(filtered_sentences) - ) - ) - return filtered_sentences - - def _obtain_labels( - self, scores: List[List[float]], predict_prob: bool = False - ) -> List[List[Label]]: - """ - Predicts the labels of sentences. - :param scores: the prediction scores from the model - :return: list of predicted labels - """ - - if predict_prob: - return [self._predict_label_prob(s) for s in scores] - - return [self._get_single_label(s) for s in scores] - - def _get_single_label(self, label_scores): # -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - conf, idx = torch.max(softmax, 0) - - return idx.item() - - def _predict_label_prob(self, label_scores) -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - label_probs = [] - for idx, conf in enumerate(softmax): - label_probs.append(Label(idx, conf.item())) - return label_probs - - def __str__(self): - return super(flair.nn.Model, self).__str__().rstrip(')') + \ - f' (beta): {self.beta}\n' + \ - f' (loss_max_weight): {self.loss_max_weight}\n' + \ - f' (max_distance) {self.max_distance}\n)' - From 3020313ccf7dcc594409c7a530f0ced138e6a608 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 11:15:44 +0100 Subject: [PATCH 21/35] GH-1983: move simple tagger to sandbox module --- flair/models/__init__.py | 1 - .../simple_sequence_tagger_model.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) rename flair/models/{ => sandbox}/simple_sequence_tagger_model.py (97%) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index ebb6827d3..784b038a9 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -1,4 +1,3 @@ from .sequence_tagger_model import SequenceTagger, MultiTagger -from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier diff --git a/flair/models/simple_sequence_tagger_model.py b/flair/models/sandbox/simple_sequence_tagger_model.py similarity index 97% rename from flair/models/simple_sequence_tagger_model.py rename to flair/models/sandbox/simple_sequence_tagger_model.py index 298d887e0..211744643 100644 --- a/flair/models/simple_sequence_tagger_model.py +++ b/flair/models/sandbox/simple_sequence_tagger_model.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import List, Union, Optional -import 
numpy as np import torch import torch.nn import torch.nn.functional as F @@ -18,19 +17,20 @@ log = logging.getLogger("flair") -""" -This class is a simple version of the SequenceTagger class. -The purpose of this class is to demonstrate the basic hierarchy of a -sequence tagger (this could be helpful for new developers). -It only uses the given embeddings and maps them with a linear layer to -the tag_dictionary dimension. -Thus, this class misses following functionalities from the SequenceTagger: -- CRF, -- RNN, -- Reprojection. -As a result, only poor results can be expected. -""" + class SimpleSequenceTagger(flair.nn.Model): + """ + This class is a simple version of the SequenceTagger class. + The purpose of this class is to demonstrate the basic hierarchy of a + sequence tagger (this could be helpful for new developers). + It only uses the given embeddings and maps them with a linear layer to + the tag_dictionary dimension. + Thus, this class misses following functionalities from the SequenceTagger: + - CRF, + - RNN, + - Reprojection. + As a result, only poor results can be expected. + """ def __init__( self, embeddings: TokenEmbeddings, From d1b09ab85fc8352a7f508c4074e1f1a552f9a0fa Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:01:26 +0100 Subject: [PATCH 22/35] GH-1983: bump version numbers --- flair/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/__init__.py b/flair/__init__.py index 7d3e9a311..ecb28ec24 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -25,7 +25,7 @@ import logging.config -__version__ = "0.6.1.post1" +__version__ = "0.7" logging.config.dictConfig( { diff --git a/setup.py b/setup.py index 0ca078dc0..824626455 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="flair", - version="0.6.1.post1", + version="0.7", description="A very simple framework for state-of-the-art NLP", long_description=open("README.md", encoding="utf-8").read(), long_description_content_type="text/markdown", From b5d08ccb9df3c5249586fa8fb55c4b3ec982ea13 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Wed, 25 Nov 2020 12:40:42 +0100 Subject: [PATCH 23/35] GH-1983: update list of datasets --- flair/datasets/__init__.py | 32 +- flair/datasets/sequence_labeling.py | 3010 ++++++++++++++------------- resources/docs/TUTORIAL_6_CORPUS.md | 33 +- 3 files changed, 1551 insertions(+), 1524 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 5b611cd23..a59181506 100755 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -7,6 +7,7 @@ # Expose all sequence labeling datasets from .sequence_labeling import ColumnCorpus from .sequence_labeling import ColumnDataset +from .sequence_labeling import ANER_CORP from .sequence_labeling import BIOFID from .sequence_labeling import BIOSCOPE from .sequence_labeling import CONLL_03 @@ -14,19 +15,31 @@ from .sequence_labeling import CONLL_03_DUTCH from .sequence_labeling import CONLL_03_SPANISH from .sequence_labeling import CONLL_2000 -from .sequence_labeling import TWITTER_NER from .sequence_labeling import DANE from .sequence_labeling import EUROPARL_NER_GERMAN from .sequence_labeling import GERMEVAL_14 from .sequence_labeling import INSPEC from .sequence_labeling import LER_GERMAN +from .sequence_labeling import MIT_MOVIE_NER_SIMPLE +from .sequence_labeling import MIT_MOVIE_NER_COMPLEX +from .sequence_labeling import MIT_RESTAURANT_NER from .sequence_labeling import NER_BASQUE from .sequence_labeling import NER_FINNISH from 
.sequence_labeling import NER_SWEDISH from .sequence_labeling import SEMEVAL2010 from .sequence_labeling import SEMEVAL2017 +from .sequence_labeling import TURKU_NER +from .sequence_labeling import TWITTER_NER +from .sequence_labeling import UP_CHINESE +from .sequence_labeling import UP_ENGLISH +from .sequence_labeling import UP_FINNISH +from .sequence_labeling import UP_FRENCH +from .sequence_labeling import UP_GERMAN +from .sequence_labeling import UP_ITALIAN +from .sequence_labeling import UP_SPANISH +from .sequence_labeling import UP_SPANISH_ANCORA +from .sequence_labeling import WEIBO_NER from .sequence_labeling import WIKIANN -from .sequence_labeling import XTREME from .sequence_labeling import WIKIGOLD_NER from .sequence_labeling import WIKINER_ENGLISH from .sequence_labeling import WIKINER_GERMAN @@ -39,20 +52,7 @@ from .sequence_labeling import WIKINER_RUSSIAN from .sequence_labeling import WNUT_17 from .sequence_labeling import WNUT_2020_NER -from .sequence_labeling import WEIBO_NER -from .sequence_labeling import MIT_RESTAURANTS -from .sequence_labeling import UP_CHINESE -from .sequence_labeling import UP_ENGLISH -from .sequence_labeling import UP_FINNISH -from .sequence_labeling import UP_FRENCH -from .sequence_labeling import UP_GERMAN -from .sequence_labeling import UP_ITALIAN -from .sequence_labeling import UP_SPANISH -from .sequence_labeling import UP_SPANISH_ANCORA -from .sequence_labeling import ANER_CORP -from .sequence_labeling import MITMovieNERSimple -from .sequence_labeling import MITMovieNERComplex -from .sequence_labeling import TURKU_NER +from .sequence_labeling import XTREME # Expose all document classification datasets from .document_classification import ClassificationCorpus diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 0da2a1fd5..f9ee3ce0f 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -32,7 +32,6 @@ def __init__( ): """ Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. - :param data_folder: base folder with the task data :param column_format: a map specifying the column format :param train_file: the name of the train file @@ -118,7 +117,6 @@ def __init__( ): """ Instantiates a column dataset (typically used for sequence labeling or word-level prediction). - :param path_to_column_file: path to the file with the column-formatted data :param column_name_map: a map specifying the column format :param tag_to_bioes: whether to convert to BIOES tagging scheme @@ -219,7 +217,7 @@ def _parse_token(self, line: str) -> Token: if len(fields) > column: if column != self.text_column and self.column_name_map[column] != self.SPACE_AFTER_KEY: task = self.column_name_map[column] # for example 'pos' - tag = fields[column] + tag = fields[column] if tag.count("-") >= 1: # tag with prefix, for example tag='B-OBJ' split_at_first_hyphen = tag.split("-", 1) tagging_format_prefix = split_at_first_hyphen[0] @@ -284,6 +282,58 @@ def __getitem__(self, index: int = 0) -> Sentence: return sentence +class ANER_CORP(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + document_as_sequence: bool = False, + **corpusargs, + ): + """ + Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available + from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. 
+ http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp + Column order is swapped + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" + # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + + super(ANER_CORP, self).__init__( + data_folder, + columns, + # tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + **corpusargs, + ) + + class BIOFID(ColumnCorpus): def __init__( self, @@ -317,6 +367,37 @@ def __init__( ) +class BIOSCOPE(ColumnCorpus): + + def __init__( + self, + base_path: Union[str, Path] = None, + in_memory: bool = True, + **corpusargs, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "tag"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" + cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) + + super(BIOSCOPE, self).__init__( + data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs, + ) + + class CONLL_03(ColumnCorpus): def __init__( self, @@ -473,22 +554,124 @@ def __init__( ) +def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): + """ +Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". +ner_column : int, optional + Specifies the ner-tagged column. The default is 1 (the second column). 
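+
+Example (editor's illustrative sketch, not part of the original patch; "my_corpus.conll"
+is a hypothetical file): with ner_column=2, an input line "Paris NNP LOC" is rewritten
+as "Paris NNP I-LOC", and "London NNP 0" becomes "London NNP O":
+
+    add_IOB_tags(data_file="my_corpus.conll", encoding="utf8", ner_column=2)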
+ +""" + + def add_I_prefix(current_line: List[str], ner: int, tag: str): + for i in range(0, len(current_line)): + if i == 0: + f.write(line_list[i]) + elif i == ner: + f.write(' I-' + tag) + else: + f.write(' ' + current_line[i]) + f.write('\n') -class WNUT_2020_NER(ColumnCorpus): + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers ner tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) > 2: # word with tags + ner_tag = line_list[ner_column] + if ner_tag in ['0', 'O']: # no chunk + for i in range(0, len(line_list)): + if i == 0: + f.write(line_list[i]) + elif i == ner_column: + f.write(' O') + else: + f.write(' ' + line_list[i]) + f.write('\n') + pred = 'O' + elif '-' not in ner_tag: # no IOB tags + if pred == 'O': # found a new chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # found further part of chunk or new chunk directly after old chunk + add_I_prefix(line_list, ner_column, ner_tag) + pred = ner_tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = ner_tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): + """ +Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead +of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects +the letter 'O'. Additionally it removes lines with no tags in the data file and can also +be used if the data is only partially IOB tagged. +Parameters +---------- +data_file : Union[str, Path] + Path to the data file. +encoding : str, optional + Encoding used in open function. The default is "utf8". + +""" + with open(file=data_file, mode='r', encoding=encoding) as f: + lines = f.readlines() + with open(file=data_file, mode='w', encoding=encoding) as f: + pred = 'O' # remembers tag of predecessing line + for line in lines: + line_list = line.split() + if len(line_list) == 2: # word with tag + word = line_list[0] + tag = line_list[1] + if tag in ['0', 'O']: # no chunk + f.write(word + ' O\n') + pred = 'O' + elif '-' not in tag: # no IOB tags + if pred == 'O': # found a new chunk + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # found further part of chunk or new chunk directly after old chunk + if pred == tag: + f.write(word + ' I-' + tag + '\n') + else: + f.write(word + ' B-' + tag + '\n') + pred = tag + else: # line already has IOB tag (tag contains '-') + f.write(line) + pred = tag.split('-')[1] + elif len(line_list) == 0: # empty line + f.write('\n') + pred = 'O' + + +class CONLL_03_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. 
+ :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -507,67 +690,42 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - - for sample in ["train", "test", "dev"]: - - sample_file = data_folder / (sample + ".txt") - if not sample_file.is_file(): - - zip_path = cached_path( - f"{github_url}", Path("datasets") / dataset_name - ) - - # unzip the downloaded repo and merge the train, dev and test datasets - unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master - - if sample == "test": - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") - else: - file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") - filenames = os.listdir(file_path) - with open(data_folder / (sample + '.txt'), 'w') as outfile: - for fname in filenames: - with open(file_path / fname) as infile: - lines = infile.read() - outfile.write(lines) - - shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" + cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) + cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - super(WNUT_2020_NER, self).__init__( + super(CONLL_03_SPANISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="utf-8", + encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class WIKIGOLD_NER(ColumnCorpus): +class CONLL_2000(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "np", in_memory: bool = True, - document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the wikigold corpus. The first time you call this constructor it will automatically - download the dataset. + Initialize the CoNLL-2000 corpus for English chunking. + The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed + :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: "text", 1: "pos", 2: "np"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -578,47 +736,53 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" - cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) + conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" + data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" + if not data_file.is_file(): + cached_path( + f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name + ) + cached_path( + f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name + ) + import gzip, shutil - super(WIKIGOLD_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="utf-8", - in_memory=in_memory, - train_file='wikigold.conll.txt', - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - **corpusargs, + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", + "rb", + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + with gzip.open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" + ) as f_in: + with open( + Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", + "wb", + ) as f_out: + shutil.copyfileobj(f_in, f_out) + + super(CONLL_2000, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class TWITTER_NER(ColumnCorpus): +class DANE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, - document_as_sequence: bool = False, **corpusargs, ): - """ - Initialize a dataset called twitter_ner which can be found on the following page: - https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. - - The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {1: 'text', 3: 'pos', 9: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -629,45 +793,63 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" - cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) + data_path = Path(flair.cache_root) / "datasets" / dataset_name + train_data_file = data_path / "ddt.train.conllu" + if not train_data_file.is_file(): + temp_file = cached_path( + 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', + Path("datasets") / dataset_name + ) + from zipfile import ZipFile - super(TWITTER_NER, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - train_file="ner.txt", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + with ZipFile(temp_file, 'r') as zip_file: + zip_file.extractall(path=data_path) + + # Remove CoNLL-U meta information in the last column + for part in ['train', 'dev', 'test']: + lines = [] + data_file = "ddt.{}.conllu".format(part) + with open(data_path / data_file, 'r') as file: + for line in file: + if line.startswith("#") or line == "\n": + lines.append(line) + lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) + + with open(data_path / data_file, 'w') as file: + file.writelines(lines) + + print(data_path / data_file) + + super(DANE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, + in_memory=in_memory, comment_symbol="#", **corpusargs, ) -class MIT_RESTAURANTS(ColumnCorpus): +class EUROPARL_NER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, **corpusargs, ): """ - Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. - The first time you call this constructor it will automatically download the dataset. + Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -678,126 +860,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" - cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) + europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" + cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) + cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) + + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) + add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) - super(MIT_RESTAURANTS, self).__init__( + super(EUROPARL_NER_GERMAN, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="latin-1", in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", + train_file='ep-96-04-16.conll', + test_file='ep-96-04-15.conll', **corpusargs, ) -def add_IOB_tags(data_file: Union[str, Path], encoding: str = "utf8", ner_column: int = 1): - """ -Function that adds IOB tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". -ner_column : int, optional - Specifies the ner-tagged column. The default is 1 (the second column). 
- -""" - - def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): - if i == 0: - f.write(line_list[i]) - elif i == ner: - f.write(' I-' + tag) - else: - f.write(' ' + current_line[i]) - f.write('\n') - - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers ner tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) > 2: # word with tags - ner_tag = line_list[ner_column] - if ner_tag in ['0', 'O']: # no chunk - for i in range(0, len(line_list)): - if i == 0: - f.write(line_list[i]) - elif i == ner_column: - f.write(' O') - else: - f.write(' ' + line_list[i]) - f.write('\n') - pred = 'O' - elif '-' not in ner_tag: # no IOB tags - if pred == 'O': # found a new chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # found further part of chunk or new chunk directly after old chunk - add_I_prefix(line_list, ner_column, ner_tag) - pred = ner_tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = ner_tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -def add_IOB2_tags(data_file: Union[str, Path], encoding: str = "utf8"): - """ -Function that adds IOB2 tags if only chunk names are provided (e.g. words are tagged PER instead -of B-PER or I-PER). Replaces '0' with 'O' as the no-chunk tag since ColumnCorpus expects -the letter 'O'. Additionally it removes lines with no tags in the data file and can also -be used if the data is only partially IOB tagged. -Parameters ----------- -data_file : Union[str, Path] - Path to the data file. -encoding : str, optional - Encoding used in open function. The default is "utf8". - -""" - with open(file=data_file, mode='r', encoding=encoding) as f: - lines = f.readlines() - with open(file=data_file, mode='w', encoding=encoding) as f: - pred = 'O' # remembers tag of predecessing line - for line in lines: - line_list = line.split() - if len(line_list) == 2: # word with tag - word = line_list[0] - tag = line_list[1] - if tag in ['0', 'O']: # no chunk - f.write(word + ' O\n') - pred = 'O' - elif '-' not in tag: # no IOB tags - if pred == 'O': # found a new chunk - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # found further part of chunk or new chunk directly after old chunk - if pred == tag: - f.write(word + ' I-' + tag + '\n') - else: - f.write(word + ' B-' + tag + '\n') - pred = tag - else: # line already has IOB tag (tag contains '-') - f.write(line) - pred = tag.split('-')[1] - elif len(line_list) == 0: # empty line - f.write('\n') - pred = 'O' - - -class CONLL_03_SPANISH(ColumnCorpus): +class GERMEVAL_14(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -806,19 +888,18 @@ def __init__( **corpusargs, ): """ - Initialize the CoNLL-03 corpus for Spanish. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, should not be changed - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Initialize the GermEval NER corpus for German. 
This is only possible if you've manually downloaded it to your + machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. + Then point the base_path parameter in the constructor to this folder + :param base_path: Path to the GermEval corpus on your machine + :param tag_to_bioes: 'ner' by default, should not be changed. + :param in_memory:If True, keeps dataset in memory giving speedups in training. """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -828,43 +909,38 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/" - cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name) - cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name) - - super(CONLL_03_SPANISH, self).__init__( + # check if data there + if not data_folder.exists(): + log.warning("-" * 100) + log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') + log.warning( + 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' + ) + log.warning("-" * 100) + super(GERMEVAL_14, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + comment_symbol="#", in_memory=in_memory, **corpusargs, ) -class CONLL_2000(ColumnCorpus): +class INSPEC(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "np", + tag_to_bioes: str = "keyword", in_memory: bool = True, **corpusargs, ): - """ - Initialize the CoNLL-2000 corpus for English chunking. - The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "np"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -874,78 +950,35 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - conll_2000_path = "https://www.clips.uantwerpen.be/conll2000/chunking/" - data_file = Path(flair.cache_root) / "datasets" / dataset_name / "train.txt" - if not data_file.is_file(): - cached_path( - f"{conll_2000_path}train.txt.gz", Path("datasets") / dataset_name - ) - cached_path( - f"{conll_2000_path}test.txt.gz", Path("datasets") / dataset_name - ) - import gzip, shutil - - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt.gz", - "rb", - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "train.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) - with gzip.open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt.gz", "rb" - ) as f_in: - with open( - Path(flair.cache_root) / "datasets" / dataset_name / "test.txt", - "wb", - ) as f_out: - shutil.copyfileobj(f_in, f_out) + inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" + cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) + if not "dev.txt" in os.listdir(data_folder): + cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) + # rename according to train - test - dev - convention + os.rename(data_folder / "valid.txt", data_folder / "dev.txt") - super(CONLL_2000, self).__init__( + super(INSPEC, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class XTREME(MultiCorpus): +class LER_GERMAN(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, **corpusargs, ): """ - Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google - research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. - "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) - The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) - - Parameters - ---------- - languages : Union[str, List[str]], optional - Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings - consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the LER_GERMAN (Legal Entity Recognition) corpus. 
The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - # if no languages are given as argument all languages used in XTREME will be loaded - if not languages: - languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", - "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", - "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] - - # if only one language is given - if type(languages) == str: - languages = [languages] if type(base_path) == str: base_path: Path = Path(base_path) @@ -954,113 +987,142 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "xtreme" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # This list is handed to the multicorpus + # download data if necessary + ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" + cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(LER_GERMAN, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + train_file='ler.conll', + **corpusargs, + ) - hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" - # download data if necessary - for language in languages: +class MIT_MOVIE_NER_SIMPLE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
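+
+        Example usage (editor's illustrative sketch, not part of the original patch;
+        it relies only on the constructor defaults above and on the import added to
+        flair/datasets/__init__.py in this patch):
+
+            from flair.datasets import MIT_MOVIE_NER_SIMPLE
+            corpus = MIT_MOVIE_NER_SIMPLE()  # downloads engtrain.bio / engtest.bio on first call
+            print(corpus)                    # corpus statistics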
+ """ + # column format + columns = {0: "ner", 1: "text"} - language_folder = data_folder / language + # dataset name + dataset_name = self.__class__.__name__.lower() - # if language not downloaded yet, download it - if not language_folder.exists(): + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - file_name = language + '.tar.gz' - # create folder - os.makedirs(language_folder) + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "engtrain.bio" + test_file = "engtest.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) - # download from HU Server - temp_file = cached_path( - hu_path + "/" + file_name, - Path("datasets") / dataset_name / language - ) + super(MIT_MOVIE_NER_SIMPLE, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) - # unzip - print("Extract data...") - import tarfile - tar = tarfile.open(str(temp_file), "r:gz") - for part in ["train", "test", "dev"]: - tar.extract(part, str(language_folder)) - tar.close() - print('...done.') - # transform data into required format - print("Process dataset...") - for part in ["train", "test", "dev"]: - xtreme_to_simple_ner_annotation(str(language_folder / part)) - print('...done.') +class MIT_MOVIE_NER_COMPLEX(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + """ + Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) + in BIO format. The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
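+
+        Example usage (editor's illustrative sketch, not part of the original patch;
+        it relies only on the constructor defaults above and on the import added to
+        flair/datasets/__init__.py in this patch):
+
+            from flair.datasets import MIT_MOVIE_NER_COMPLEX
+            corpus = MIT_MOVIE_NER_COMPLEX()  # downloads trivia10k13train.bio / trivia10k13test.bio on first call
+            print(corpus.train[0])           # first tagged sentence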
+ """ + # column format + columns = {0: "ner", 1: "text"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # dataset name + dataset_name = self.__class__.__name__.lower() - super(XTREME, self).__init__( - corpora, name='xtreme', **corpusargs, - ) + # data folder: default dataset folder is the cache root + if type(base_path) == str: + base_path: Path = Path(base_path) + if not base_path: + base_path: Path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + # download data if necessary + mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" + train_file = "trivia10k13train.bio" + test_file = "trivia10k13test.bio" + cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) + cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) -def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): - with open(data_file, 'r', encoding='utf-8') as f: - lines = f.readlines() - with open(data_file, 'w', encoding='utf-8') as f: - for line in lines: - if line == '\n': - f.write(line) - else: - liste = line.split() - f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') + super(MIT_MOVIE_NER_COMPLEX, self).__init__( + data_folder, + columns, + train_file=train_file, + test_file=test_file, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + **corpusargs, + ) -class WIKIANN(MultiCorpus): +class MIT_RESTAURANT_NER(ColumnCorpus): def __init__( self, - languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): """ - WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist - in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their - respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) - Parameters - ---------- - languages : Union[str, List[str]] - Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. - The datasets of all passed languages will be saved in one MultiCorpus. - (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. - This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) - base_path : Union[str, Path], optional - Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - tag_to_bioes : str, optional - The data is in bio-format. It will by default (with the string "ner" as value) be transformed - into the bioes format. If you dont want that set it to None. - + Initialize the experimental MIT Restaurant corpus available on https://groups.csail.mit.edu/sls/downloads/restaurant/. + The first time you call this constructor it will automatically download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. 
+ :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(languages) == str: - languages = [languages] - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1068,394 +1130,123 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = "wikiann" + dataset_name = self.__class__.__name__.lower() # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # For each language in languages, the file is downloaded if not existent - # Then a comlumncorpus of that data is created and saved in a list - # this list is handed to the multicorpus + # download data if necessary + mit_restaurants_path = "https://megantosh.s3.eu-central-1.amazonaws.com/MITRestoCorpus/" + cached_path(f"{mit_restaurants_path}test.txt", Path("datasets") / dataset_name) + cached_path(f"{mit_restaurants_path}train.txt", Path("datasets") / dataset_name) - # list that contains the columncopora - corpora = [] + super(MIT_RESTAURANT_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + **corpusargs, + ) + + +class NER_BASQUE(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) + + # column format + columns = {0: "text", 1: "ner"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name - google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - first = True - for language in languages: + ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" + data_path = Path(flair.cache_root) / "datasets" / dataset_name + data_file = data_path / "named_ent_eu.train" + if not data_file.is_file(): + cached_path( + f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name + ) + import tarfile, shutil - language_folder = data_folder / language - file_name = 'wikiann-' + language + '.bio' + with tarfile.open( + Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", + "r:gz", + ) as f_in: + corpus_files = ( + "eiec_v1.0/named_ent_eu.train", + "eiec_v1.0/named_ent_eu.test", + ) + for corpus_file in corpus_files: + f_in.extract(corpus_file, data_path) + shutil.move(f"{data_path}/{corpus_file}", data_path) - # if language not downloaded yet, download it - if not language_folder.exists(): - if first == True: - import gdown - import tarfile - first = False - # create folder - os.makedirs(language_folder) - # get google drive id from list - google_id = google_drive_id_from_language_name(language) - url = google_drive_path + google_id + super(NER_BASQUE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + ) - # download from google drive - gdown.download(url, str(language_folder / language) + '.tar.gz') - # unzip - print("Extract data...") - tar = tarfile.open(str(language_folder / language) + 
'.tar.gz', "r:gz") - # tar.extractall(language_folder,members=[tar.getmember(file_name)]) - tar.extract(file_name, str(language_folder)) - tar.close() - print('...done.') +class NER_FINNISH(ColumnCorpus): + def __init__( + self, + base_path: Union[str, Path] = None, + tag_to_bioes: str = "ner", + in_memory: bool = True, + **corpusargs, + ): + if type(base_path) == str: + base_path: Path = Path(base_path) - # transform data into required format - # the processed dataset has the additional ending "_new" - print("Process dataset...") - silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) - # remove the unprocessed dataset - os.remove(str(language_folder / file_name)) - print('...done.') + # column format + columns = {0: "text", 1: "ner"} - # initialize comlumncorpus and add it to list - print("Read data into corpus...") - corp = ColumnCorpus(data_folder=language_folder, - column_format=columns, - train_file=file_name + '_new', - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - ) - corpora.append(corp) - print("...done.") + # this dataset name + dataset_name = self.__class__.__name__.lower() - super(WIKIANN, self).__init__( - corpora, name='wikiann', **corpusargs, + # default dataset folder is the cache root + if not base_path: + base_path = Path(flair.cache_root) / "datasets" + data_folder = base_path / dataset_name + + # download data if necessary + ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." + cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) + cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) + + _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + + super(NER_FINNISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs, ) -def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): - f_read = open(data_file, 'r', encoding='utf-8') - f_write = open(data_file + '_new', 'w+', encoding='utf-8') - while True: - line = f_read.readline() - if line: - if line == '\n': - f_write.write(line) - else: - liste = line.split() - f_write.write(liste[0] + ' ' + liste[-1] + '\n') - else: - break - f_read.close() - f_write.close() +def _remove_lines_without_annotations(data_file: Union[str, Path] = None): + with open(data_file, 'r') as f: + lines = f.readlines() + with open(data_file, 'w') as f: + for line in lines: + if len(line.split()) != 1: + f.write(line) -def google_drive_id_from_language_name(language): - languages_ids = { - 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer - 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', - 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', - 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', - 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', - 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', - 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', - 'am': '1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', - 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', - 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', - 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', - 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', - 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', - 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', - 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', - 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', - 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', - 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', - 'azb': 
'1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', - 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', - 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', - 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', - 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', - 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', - 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', - 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', - 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', - 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', - 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', - 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', - 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', - 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', - 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', - 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', - 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', - 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', - 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', - 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', - 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', - 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', - 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', - 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', - 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', - 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer - 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', - 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', - 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', - 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', - 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', - 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', - 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', - 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', - 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', - 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', - 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', - 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', - 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', - 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', - 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', - 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', - 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', - 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', - 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', - 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', - 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', - 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', - 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', - 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', - 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', - 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', - 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', - 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', - 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', - 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', - 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', - 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', - 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', - 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', - 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', - 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', - 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', - 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', - 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', - 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', - 'gd': '1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', - 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', - 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', - 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', - 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', - 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', - 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', - 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', - 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', - 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', - 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', - 'he': 
'18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', - 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', - 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', - 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer - 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', - 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', - 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', - 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', - 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', - 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer - 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', - 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', - 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', - 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', - 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer - 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', - 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', - 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', - 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', - 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', - 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', - 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', - 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer - 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', - 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', - 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', - 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', - 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', - 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', - 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', - 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', - 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer - 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', - 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', - 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', - 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', - 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', - 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', - 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer - 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', - 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', - 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', - 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', - 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', - 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', - 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', - 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', - 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', - 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', - 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', - 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', - 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', - 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', - 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', - 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', - 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', - 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', - 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', - 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', - 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', - 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', - 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', - 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', - 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', - 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', - 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', - 'mhr': '1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', - 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', - 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', - 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', - 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', - 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', - 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', - 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', - 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', - 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', - 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', - 'mus': 
'1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer - 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', - 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', - 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', - 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', - 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', - 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', - 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', - 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', - 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', - 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', - 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', - 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', - 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', - 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', - 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', - 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', - 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', - 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', - 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', - 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', - 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', - 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer - 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', - 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', - 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', - 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', - 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', - 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', - 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', - 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', - 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', - 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', - 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', - 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', - 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', - 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', - 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', - 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', - 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', - 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', - 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', - 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', - 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', - 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', - 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', - 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', - 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', - 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', - 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', - 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', - 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', - 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', - 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', - 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', - 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', - 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', - 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', - 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', - 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', - 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', - 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', - 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', - 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', - 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', - 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', - 'so': '1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', - 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', - 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', - 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', - 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', - 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', - 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', - 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', - 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', - 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', - 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', - 'ta': 
'1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', - 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer - 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', - 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', - 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', - 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', - 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', - 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', - 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', - 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', - 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', - 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', - 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', - 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', - 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', - 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', - 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', - 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', - 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', - 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', - 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', - 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', - 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', - 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', - 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', - 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', - 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', - 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', - 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', - 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', - 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', - 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', - 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', - 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', - 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', - 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', - 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', - 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', - 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', - 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', - 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', - 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', - 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', - 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', - 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', - 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' - } - return languages_ids[language] - - -class DANE(ColumnCorpus): +class NER_SWEDISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1463,11 +1254,20 @@ def __init__( in_memory: bool = True, **corpusargs, ): + """ + Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: 'text', 3: 'pos', 9: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1478,66 +1278,37 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - data_path = Path(flair.cache_root) / "datasets" / dataset_name - train_data_file = data_path / "ddt.train.conllu" - if not train_data_file.is_file(): - temp_file = cached_path( - 'https://danlp.alexandra.dk/304bd159d5de/datasets/ddt.zip', - Path("datasets") / dataset_name - ) - from zipfile import ZipFile - - with ZipFile(temp_file, 'r') as zip_file: - zip_file.extractall(path=data_path) - - # Remove CoNLL-U meta information in the last column - for part in ['train', 'dev', 'test']: - lines = [] - data_file = "ddt.{}.conllu".format(part) - with open(data_path / data_file, 'r') as file: - for line in file: - if line.startswith("#") or line == "\n": - lines.append(line) - lines.append(line.replace("name=", "").replace("|SpaceAfter=No", "")) - - with open(data_path / data_file, 'w') as file: - file.writelines(lines) + ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" + cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) + cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - print(data_path / data_file) + # data is not in IOB2 format. Thus we transform it to IOB2 + add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) + add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) - super(DANE, self).__init__( + super(NER_SWEDISH, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, - comment_symbol="#", **corpusargs, ) -class EUROPARL_NER_GERMAN(ColumnCorpus): +class SEC_FILLINGS(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, **corpusargs, ): - """ - Initialize the EUROPARL_NER_GERMAN corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'lemma', 2: 'pos', 3: 'np', 4: 'ner'} + columns = {0: "text", 1: "pos", 3: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1548,46 +1319,37 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - europarl_ner_german_path = "https://nlpado.de/~sebastian/software/ner/" - cached_path(f"{europarl_ner_german_path}ep-96-04-15.conll", Path("datasets") / dataset_name) - cached_path(f"{europarl_ner_german_path}ep-96-04-16.conll", Path("datasets") / dataset_name) - - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-15.conll"), encoding="latin-1", ner_column=4) - add_IOB_tags(data_file=Path(data_folder / "ep-96-04-16.conll"), encoding="latin-1", ner_column=4) + SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" + cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) + cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) - super(EUROPARL_NER_GERMAN, self).__init__( + super(SEC_FILLINGS, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, - encoding="latin-1", + encoding="utf-8", in_memory=in_memory, - train_file='ep-96-04-16.conll', - test_file='ep-96-04-15.conll', + train_file='FIN5.txt', + test_file="FIN3.txt", + skip_first_line=True, **corpusargs, ) -class GERMEVAL_14(ColumnCorpus): +class SEMEVAL2017(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", + tag_to_bioes: str = "keyword", in_memory: bool = True, **corpusargs, ): - """ - Initialize the GermEval NER corpus for German. This is only possible if you've manually downloaded it to your - machine. Obtain the corpus from https://sites.google.com/site/germeval2014ner/data and put it into some folder. - Then point the base_path parameter in the constructor to this folder - :param base_path: Path to the GermEval corpus on your machine - :param tag_to_bioes: 'ner' by default, should not be changed. - :param in_memory:If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 2: "ner"} + columns = {0: "text", 1: "keyword"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1597,25 +1359,17 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # check if data there - if not data_folder.exists(): - log.warning("-" * 100) - log.warning(f'WARNING: GermEval-14 dataset not found at "{data_folder}".') - log.warning( - 'Instructions for obtaining the data can be found here: https://sites.google.com/site/germeval2014ner/data"' - ) - log.warning("-" * 100) - super(GERMEVAL_14, self).__init__( - data_folder, - columns, - tag_to_bioes=tag_to_bioes, - comment_symbol="#", - in_memory=in_memory, - **corpusargs, + semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" + cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + + super(SEMEVAL2017, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class INSPEC(ColumnCorpus): +class SEMEVAL2010(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1638,36 +1392,34 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - inspec_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/Inspec" - cached_path(f"{inspec_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{inspec_path}/test.txt", Path("datasets") / dataset_name) - if not "dev.txt" in os.listdir(data_folder): - cached_path(f"{inspec_path}/valid.txt", Path("datasets") / dataset_name) - # rename according to train - test - dev - convention - os.rename(data_folder / "valid.txt", data_folder / "dev.txt") + semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" + cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) + cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) - super(INSPEC, self).__init__( + super(SEMEVAL2010, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class LER_GERMAN(ColumnCorpus): +class TURKU_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the LER_GERMAN (Legal Entity Recognition) corpus. The first time you call this constructor it will automatically + Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
+ :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) @@ -1683,19 +1435,30 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" + dev_file = "dev.tsv" + test_file = "test.tsv" + train_file = "train.tsv" + cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) + cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) - super(LER_GERMAN, self).__init__( + super(TURKU_NER, self).__init__( data_folder, columns, + dev_file=dev_file, + test_file=test_file, + train_file=train_file, + column_delimiter="\t", tag_to_bioes=tag_to_bioes, + encoding="latin-1", in_memory=in_memory, - train_file='ler.conll', + document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class ANER_CORP(ColumnCorpus): + +class TWITTER_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -1705,15 +1468,14 @@ def __init__( **corpusargs, ): """ - Initialize a preprocessed version of the Arabic Named Entity Recognition Corpus (ANERCorp) dataset available - from https://github.com/EmnamoR/Arabic-named-entity-recognition/blob/master/ANERCorp.rar. - http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp - Column order is swapped - The first time you call this constructor it will automatically download the dataset. + Initialize a dataset called twitter_ner which can be found on the following page: + https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. + + The first time you call this constructor it will automatically + download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, need not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. 
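# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): the TurkuNER corpus
# defined above is read from tab-separated train/dev/test files. Assumes flair
# 0.7 with this PR applied.
from flair.datasets import TURKU_NER

corpus = TURKU_NER()                                    # downloads the three .tsv files on first call
print(corpus)
print(corpus.train[0].to_tagged_string("ner"))          # first training sentence with its NER tags
# ---------------------------------------------------------------------------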
:param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -1721,7 +1483,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1732,34 +1494,43 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - anercorp_path = "https://megantosh.s3.eu-central-1.amazonaws.com/ANERcorp/" - # cached_path(f"{anercorp_path}test.txt", Path("datasets") / dataset_name) - cached_path(f"{anercorp_path}train.txt", Path("datasets") / dataset_name) + twitter_ner_path = "https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/" + cached_path(f"{twitter_ner_path}ner.txt", Path("datasets") / dataset_name) - super(ANER_CORP, self).__init__( + super(TWITTER_NER, self).__init__( data_folder, columns, - # tag_to_bioes=tag_to_bioes, - encoding="utf-8", + tag_to_bioes=tag_to_bioes, + encoding="latin-1", + train_file="ner.txt", in_memory=in_memory, document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class NER_BASQUE(ColumnCorpus): +class UP_CHINESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1770,45 +1541,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_basque_path = "http://ixa2.si.ehu.eus/eiec/" - data_path = Path(flair.cache_root) / "datasets" / dataset_name - data_file = data_path / "named_ent_eu.train" - if not data_file.is_file(): - cached_path( - f"{ner_basque_path}/eiec_v1.0.tgz", Path("datasets") / dataset_name - ) - import tarfile, shutil - - with tarfile.open( - Path(flair.cache_root) / "datasets" / dataset_name / "eiec_v1.0.tgz", - "r:gz", - ) as f_in: - corpus_files = ( - "eiec_v1.0/named_ent_eu.train", - "eiec_v1.0/named_ent_eu.test", - ) - for corpus_file in corpus_files: - f_in.extract(corpus_file, data_path) - shutil.move(f"{data_path}/{corpus_file}", data_path) + up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" + cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(NER_BASQUE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_CHINESE, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="zh-up-train.conllu", + test_file="zh-up-test.conllu", + dev_file="zh-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class NER_FINNISH(ColumnCorpus): +class UP_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 10: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1819,49 +1592,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_finnish_path = "https://raw.githubusercontent.com/mpsilfve/finer-data/master/data/digitoday." 
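# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): the Universal
# Propositions corpora (UP_CHINESE above, UP_ENGLISH below, and the other UP_*
# classes) expose predicate frame labels under the "frame" tag type; only the
# CoNLL-U column holding the frame differs between treebanks. Assumes flair 0.7
# with this PR applied.
from flair.datasets import UP_CHINESE

corpus = UP_CHINESE()                   # auto-downloads the zh-up-*.conllu files
frame_dictionary = corpus.make_tag_dictionary(tag_type="frame")
print(corpus)
print(frame_dictionary)
# ---------------------------------------------------------------------------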
- cached_path(f"{ner_finnish_path}2014.train.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2014.dev.csv", Path("datasets") / dataset_name) - cached_path(f"{ner_finnish_path}2015.test.csv", Path("datasets") / dataset_name) - - _remove_lines_without_annotations(data_file=Path(data_folder / "digitoday.2015.test.csv")) + up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" + cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - super(NER_FINNISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, skip_first_line=True, **corpusargs, + super(UP_ENGLISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="en_ewt-up-train.conllu", + test_file="en_ewt-up-test.conllu", + dev_file="en_ewt-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -def _remove_lines_without_annotations(data_file: Union[str, Path] = None): - with open(data_file, 'r') as f: - lines = f.readlines() - with open(data_file, 'w') as f: - for line in lines: - if len(line.split()) != 1: - f.write(line) - - -class NER_SWEDISH(ColumnCorpus): +class UP_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): """ - Initialize the NER_SWEDISH corpus for Swedish. The first time you call this constructor it will automatically - download the dataset. + Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ - if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1872,37 +1643,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - ner_spraakbanken_path = "https://raw.githubusercontent.com/klintan/swedish-ner-corpus/master/" - cached_path(f"{ner_spraakbanken_path}test_corpus.txt", Path("datasets") / dataset_name) - cached_path(f"{ner_spraakbanken_path}train_corpus.txt", Path("datasets") / dataset_name) - - # data is not in IOB2 format. 
Thus we transform it to IOB2 - add_IOB2_tags(data_file=Path(data_folder / "test_corpus.txt")) - add_IOB2_tags(data_file=Path(data_folder / "train_corpus.txt")) + up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" + cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) - super(NER_SWEDISH, self).__init__( + super(UP_FRENCH, self).__init__( data_folder, columns, - tag_to_bioes=tag_to_bioes, + encoding="utf-8", + train_file="fr-up-train.conllu", + test_file="fr-up-test.conllu", + dev_file="fr-up-dev.conllu", in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", **corpusargs, ) -class SEMEVAL2017(ColumnCorpus): +class UP_FINNISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1912,30 +1693,48 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2017_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/SemEval-2017" - cached_path(f"{semeval2017_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/test.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2017_path}/dev.txt", Path("datasets") / dataset_name) + # download data if necessary + up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" + cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2017, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_FINNISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="fi-up-train.conllu", + test_file="fi-up-test.conllu", + dev_file="fi-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class SEMEVAL2010(ColumnCorpus): +class UP_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "keyword", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the German dataset from the Universal Propositions Bank, comming from that 
webpage: + https://github.com/System-T/UniversalPropositions. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "keyword"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1945,28 +1744,48 @@ def __init__( base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - semeval2010_path = "https://raw.githubusercontent.com/midas-research/keyphrase-extraction-as-sequence-labeling-data/master/processed_semeval-2010" - cached_path(f"{semeval2010_path}/train.txt", Path("datasets") / dataset_name) - cached_path(f"{semeval2010_path}/test.txt", Path("datasets") / dataset_name) + # download data if necessary + up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" + cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) - super(SEMEVAL2010, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_GERMAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="de-up-train.conllu", + test_file="de-up-test.conllu", + dev_file="de-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_ENGLISH(ColumnCorpus): +class UP_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -1977,26 +1796,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("en", dataset_name) + up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" + cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_ENGLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_ITALIAN, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="it-up-train.conllu", + test_file="it-up-test.conllu", + dev_file="it-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_GERMAN(ColumnCorpus): +class UP_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2007,26 +1847,47 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("de", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" + cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_GERMAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_SPANISH, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es-up-train.conllu", + test_file="es-up-test.conllu", + dev_file="es-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_DUTCH(ColumnCorpus): +class UP_SPANISH_ANCORA(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: + https://github.com/System-T/UniversalPropositions + + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {1: "text", 9: "frame"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2037,26 +1898,49 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("nl", dataset_name) + up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" + cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) + cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) - super(WIKINER_DUTCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(UP_SPANISH_ANCORA, self).__init__( + data_folder, + columns, + encoding="utf-8", + train_file="es_ancora-up-train.conllu", + test_file="es_ancora-up-test.conllu", + dev_file="es_ancora-up-dev.conllu", + in_memory=in_memory, + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + comment_symbol="#", + **corpusargs, ) -class WIKINER_FRENCH(ColumnCorpus): +class WEIBO_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = False, + in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): + """ + Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict + POS tags instead + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: 'text', 1: 'ner'} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2067,198 +1951,450 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - _download_wikiner("fr", dataset_name) + weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) + cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - super(WIKINER_FRENCH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(WEIBO_NER, self).__init__( + data_folder, + columns, + tag_to_bioes=tag_to_bioes, + encoding="utf-8", + in_memory=in_memory, + train_file="weiboNER_2nd_conll_format.train", + test_file="weiboNER_2nd_conll_format.test", + dev_file="weiboNER_2nd_conll_format.dev", + document_separator_token=None if not document_as_sequence else "-DOCSTART-", + **corpusargs, ) -class WIKINER_ITALIAN(ColumnCorpus): +class WIKIANN(MultiCorpus): def __init__( self, + languages: Union[str, List[str]], base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = False, - **corpusargs, ): + """ + WkiAnn corpus for cross-lingual NER consisting of datasets from 282 languages that exist + in Wikipedia. See https://elisa-ie.github.io/wikiann/ for details and for the languages and their + respective abbreveations, i.e. "en" for english. (license: https://opendatacommons.org/licenses/by/) + Parameters + ---------- + languages : Union[str, List[str]] + Should be an abbreviation of a language ("en", "de",..) or a list of abbreviations. + The datasets of all passed languages will be saved in one MultiCorpus. + (Note that, even though listed on https://elisa-ie.github.io/wikiann/ some datasets are empty. + This includes "aa", "cho", "ho", "hz", "ii", "jam", "kj", "kr", "mus", "olo" and "tcy".) + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. 
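# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): WIKIANN builds one
# MultiCorpus over several languages. The Google Drive download requires the
# gdown package, and the dumps can be large, so the default in_memory=False is
# usually the safer choice. Assumes flair 0.7 with this PR applied.
from flair.datasets import WIKIANN

corpus = WIKIANN(languages=["en", "de"])    # downloads and preprocesses one dataset per language
print(corpus)                               # aggregated sentence counts of the MultiCorpus
# ---------------------------------------------------------------------------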
+ + """ + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 2: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "wikiann" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name - # download data if necessary - _download_wikiner("it", dataset_name) + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # this list is handed to the multicorpus - super(WIKINER_ITALIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) - - -class WIKINER_SPANISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("es", dataset_name) - - super(WIKINER_SPANISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) - - -class WIKINER_PORTUGUESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - _download_wikiner("pt", dataset_name) - - super(WIKINER_PORTUGUESE, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) - - -class WIKINER_POLISH(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "pos", 2: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # list that contains the columncopora + corpora = [] + google_drive_path = 'https://drive.google.com/uc?id=' # download data if necessary - _download_wikiner("pl", dataset_name) - - super(WIKINER_POLISH, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) + first = True + for language in languages: + language_folder = data_folder / language + file_name = 'wikiann-' + language + '.bio' -class WIKINER_RUSSIAN(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = False, - **corpusargs, - ): - if type(base_path) == str: - 
base_path: Path = Path(base_path) + # if language not downloaded yet, download it + if not language_folder.exists(): + if first == True: + import gdown + import tarfile + first = False + # create folder + os.makedirs(language_folder) + # get google drive id from list + google_id = google_drive_id_from_language_name(language) + url = google_drive_path + google_id - # column format - columns = {0: "text", 1: "pos", 2: "ner"} + # download from google drive + gdown.download(url, str(language_folder / language) + '.tar.gz') - # this dataset name - dataset_name = self.__class__.__name__.lower() + # unzip + print("Extract data...") + tar = tarfile.open(str(language_folder / language) + '.tar.gz', "r:gz") + # tar.extractall(language_folder,members=[tar.getmember(file_name)]) + tar.extract(file_name, str(language_folder)) + tar.close() + print('...done.') - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name + # transform data into required format + # the processed dataset has the additional ending "_new" + print("Process dataset...") + silver_standard_to_simple_ner_annotation(str(language_folder / file_name)) + # remove the unprocessed dataset + os.remove(str(language_folder / file_name)) + print('...done.') - # download data if necessary - _download_wikiner("ru", dataset_name) + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + train_file=file_name + '_new', + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") - super(WIKINER_RUSSIAN, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, + super(WIKIANN, self).__init__( + corpora, name='wikiann', ) -class WNUT_17(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - tag_to_bioes: str = "ner", - in_memory: bool = True, - **corpusargs, - ): - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {0: "text", 1: "ner"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() +def silver_standard_to_simple_ner_annotation(data_file: Union[str, Path]): + f_read = open(data_file, 'r', encoding='utf-8') + f_write = open(data_file + '_new', 'w+', encoding='utf-8') + while True: + line = f_read.readline() + if line: + if line == '\n': + f_write.write(line) + else: + liste = line.split() + f_write.write(liste[0] + ' ' + liste[-1] + '\n') + else: + break + f_read.close() + f_write.close() - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - # download data if necessary - wnut_path = "https://noisy-text.github.io/2017/files/" - cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) - cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) - cached_path( - f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name - ) +def google_drive_id_from_language_name(language): + languages_ids = { + 'aa': '1tDDlydKq7KQQ3_23Ysbtke4HJOe4snIk', # leer + 'ab': '1hB8REj2XA_0DjI9hdQvNvSDpuBIb8qRf', + 'ace': '1WENJS2ppHcZqaBEXRZyk2zY-PqXkTkgG', + 'ady': '1n6On8WWDHxEoybj7F9K15d_fkGPy6KgO', + 'af': '1CPB-0BD2tg3zIT60D3hmJT0i5O_SKja0', + 'ak': '1l2vlGHnQwvm9XhW5S-403fetwUXhBlZm', + 'als': '196xyYjhbie7sYLHLZHWkkurOwQLi8wK-', + 'am': 
'1ug1IEoExKD3xWpvfZprAPSQi82YF9Cet', + 'an': '1DNLgPOAOsGZBYd6rC5ddhzvc9_DtWnk2', + 'ang': '1W_0ti7Tl8AkqM91lRCMPWEuUnPOAZroV', + 'ar': '1tyvd32udEQG_cNeVpaD5I2fxvCc6XKIS', + 'arc': '1hSOByStqPmP3b9HfQ39EclUZGo8IKCMb', + 'arz': '1CKW5ZhxTpIHmc8Jt5JLz_5O6Cr8Icsan', + 'as': '12opBoIweBLM8XciMHT4B6-MAaKdYdvpE', + 'ast': '1rp64PxGZBDfcw-tpFBjLg_ddLDElG1II', + 'av': '1hncGUrkG1vwAAQgLtwOf41BWkHkEvdss', + 'ay': '1VmIsWpMTz442b4Mx798ZOgtB9vquKQtf', + 'az': '1FXDXsvBSdqc7GGIDZv0hqBOaaw12Ip2-', + 'azb': '1amVqOuHLEkhjn8rkGUl-mXdZlaACWyNT', + 'ba': '1aLx1d8GagI11VZVYOGQy0BEePeqoT0x3', + 'bar': '1JZ8-k8ZmnpWYI_Yl_cBBgjVdxoM9Daci', + 'bat-smg': '1trxKXDFSeKsygTMKi-ZqXSJs7F90k5a8', + 'bcl': '1Hs0k7KVZ2DPsqroZ4cUKcwZG4HdPV794', + 'be-x-old': '1gaK-spj1m6eGYQ-SsngLxxLUvP1VRk08', + 'be': '1_ttfOSy9BzCRkIT_p3mImT82XRPpEiuH', + 'bg': '1Iug6gYKemb0OrLTUrKDc_c66YGypTfCF', + 'bh': '12OcSFLu940A8tVQLxI8pnxKBpTeZHmrh', + 'bi': '1rftVziS_pqARx4mvLJC0sKLY-OL5ZIjE', + 'bjn': '1n17mkRjPUAOWQk5LQs2C3Tz3ShxK0enZ', + 'bm': '1284dwO_sfdsWE7FR06HhfBRUb8ePesKR', + 'bn': '1K2DM1mT4hkr6NlAIBTj95BeVXcgvpgDm', + 'bo': '1SzGHDVK-OguKdjZ4DXWiOJVrie1iHeWm', + 'bpy': '1m-e5EoruJufvwBEgJLmJtx6jzx64pYN2', + 'br': '1xdaBoJ1DnwI0iEq7gQN1dWcABAs_bM9H', + 'bs': '167dsB01trMYFQl8FshtIdfhjw7IfVKbk', + 'bug': '1yCnevM9_KJzFk27Vxsva_20OacLo4Uam', + 'bxr': '1DlByAX3zB-9UyEAVD4wtX-R7mXC-8xum', + 'ca': '1LuUgbd9sGa-5Ahcsy31EK89a3WOowftY', + 'cbk-zam': '1kgF8xoD-kIOWZET_9kp_4yNX6AAXn6PI', + 'cdo': '14x1y6611G-UAEGq92QEHRpreVkYnoUCw', + 'ce': '1QUUCVKA-fkiCHd3KT3zUWefaWnxzlZLu', + 'ceb': '1DJZE9RfaMoPNXHI73KBXAm4YSe-_YCUk', + 'ch': '1YzAfhmatkmTpkZbAcD6X83epCgzD5S2_', + 'cho': '1ciY0vF3c5a2mTOo_k32A2wMs0klK98Kb', # leer + 'chr': '1EHaxz1UZHn7v2bbRzCLAhPsNtRzrG3Ae', + 'chy': '1nNWwMAJr1KNdz3bHf6uIn-thZCknlTeB', + 'ckb': '1llpaftcUSiXCZQZMdAqaJSrhwMdcf9IV', + 'co': '1ZP-8oWgMYfW7a6w6ygEFkKDGbN39QnDn', + 'cr': '1ST0xRicLAG4JdCZwGdaY-0pEXooQh7e6', + 'crh': '1Jmpq2XVYUR_XaXU5XNhtOMnz-qkpsgpE', + 'cs': '1Vydyze-jBkK_S1uV5ewV_Y6dbwhXr7lk', + 'csb': '1naUyF74lZPnnopXdOqf5Xor2kT4WoHfS', + 'cu': '1EN5dVTU6jc7YOYPCHq8EYUF31HlMUKs7', + 'cv': '1gEUAlqYSSDI4TrWCqP1LUq2n0X1XEjN3', + 'cy': '1q5g6NJE5GXf65Vc_P4BnUMHQ49Prz-J1', + 'da': '11onAGOLkkqrIwM784siWlg-cewa5WKm8', + 'de': '1f9nWvNkCCy6XWhd9uf4Dq-2--GzSaYAb', + 'diq': '1IkpJaVbEOuOs9qay_KG9rkxRghWZhWPm', + 'dsb': '1hlExWaMth-2eVIQ3i3siJSG-MN_7Z6MY', + 'dv': '1WpCrslO4I7TMb2uaKVQw4U2U8qMs5szi', + 'dz': '10WX52ePq2KfyGliwPvY_54hIjpzW6klV', + 'ee': '1tYEt3oN2KPzBSWrk9jpCqnW3J1KXdhjz', + 'el': '1cxq4NUYmHwWsEn5waYXfFSanlINXWLfM', + 'eml': '17FgGhPZqZNtzbxpTJOf-6nxEuI5oU4Vd', + 'en': '1mqxeCPjxqmO7e8utj1MQv1CICLFVvKa-', + 'eo': '1YeknLymGcqj44ug2yd4P7xQVpSK27HkK', + 'es': '1Dnx3MVR9r5cuoOgeew2gT8bDvWpOKxkU', + 'et': '1Qhb3kYlQnLefWmNimdN_Vykm4mWzbcWy', + 'eu': '1f613wH88UeITYyBSEMZByK-nRNMwLHTs', + 'ext': '1D0nLOZ3aolCM8TShIRyCgF3-_MhWXccN', + 'fa': '1QOG15HU8VfZvJUNKos024xI-OGm0zhEX', + 'ff': '1h5pVjxDYcq70bSus30oqi9KzDmezVNry', + 'fi': '1y3Kf6qYsSvL8_nSEwE1Y6Bf6ninaPvqa', + 'fiu-vro': '1oKUiqG19WgPd3CCl4FGudk5ATmtNfToR', + 'fj': '10xDMuqtoTJlJFp5ghbhKfNWRpLDK3W4d', + 'fo': '1RhjYqgtri1276Be1N9RrNitdBNkpzh0J', + 'fr': '1sK_T_-wzVPJYrnziNqWTriU52rEsXGjn', + 'frp': '1NUm8B2zClBcEa8dHLBb-ZgzEr8phcQyZ', + 'frr': '1FjNqbIUlOW1deJdB8WCuWjaZfUzKqujV', + 'fur': '1oqHZMK7WAV8oHoZLjGR0PfmO38wmR6XY', + 'fy': '1DvnU6iaTJc9bWedmDklHyx8nzKD1s3Ge', + 'ga': '1Ql6rh7absdYQ8l-3hj_MVKcEC3tHKeFB', + 'gag': '1zli-hOl2abuQ2wsDJU45qbb0xuvYwA3a', + 'gan': '1u2dOwy58y-GaS-tCPJS_i9VRDQIPXwCr', + 'gd': 
'1umsUpngJiwkLdGQbRqYpkgxZju9dWlRz', + 'gl': '141K2IbLjJfXwFTIf-kthmmG0YWdi8liE', + 'glk': '1ZDaxQ6ilXaoivo4_KllagabbvfOuiZ0c', + 'gn': '1hM4MuCaVnZqnL-w-0N-WcWag22ikVLtZ', + 'gom': '1BNOSw75tzPC0wEgLOCKbwu9wg9gcLOzs', + 'got': '1YSHYBtXc1WvUvMIHPz6HHgJvaXKulJUj', + 'gu': '1VdK-B2drqFwKg8KD23c3dKXY-cZgCMgd', + 'gv': '1XZFohYNbKszEFR-V-yDXxx40V41PV9Zm', + 'ha': '18ZG4tUU0owRtQA8Ey3Dl72ALjryEJWMC', + 'hak': '1QQe3WgrCWbvnVH42QXD7KX4kihHURB0Z', + 'haw': '1FLqlK-wpz4jy768XbQAtxd9PhC-9ciP7', + 'he': '18K-Erc2VOgtIdskaQq4D5A3XkVstDmfX', + 'hi': '1lBRapb5tjBqT176gD36K5yb_qsaFeu-k', + 'hif': '153MQ9Ga4NQ-CkK8UiJM3DjKOk09fhCOV', + 'ho': '1c1AoS7yq15iVkTEE-0f3x25NT4F202B8', # leer + 'hr': '1wS-UtB3sGHuXJQQGR0F5lDegogsgoyif', + 'hsb': '1_3mMLzAE5OmXn2z64rW3OwWbo85Mirbd', + 'ht': '1BwCaF0nfdgkM7Yt7A7d7KyVk0BcuwPGk', + 'hu': '10AkDmTxUWNbOXuYLYZ-ZPbLAdGAGZZ8J', + 'hy': '1Mi2k2alJJquT1ybd3GC3QYDstSagaWdo', + 'hz': '1c1m_-Q92v0Di7Nez6VuaccrN19i8icKV', # leer + 'ia': '1jPyqTmDuVhEhj89N606Cja5heJEbcMoM', + 'id': '1JWIvIh8fQoMQqk1rPvUThaskxnTs8tsf', + 'ie': '1TaKRlTtB8-Wqu4sfvx6JQKIugAlg0pV-', + 'ig': '15NFAf2Qx6BXSjv_Oun9_3QRBWNn49g86', + 'ii': '1qldGJkMOMKwY13DpcgbxQCbff0K982f9', # leer + 'ik': '1VoSTou2ZlwVhply26ujowDz6gjwtxmny', + 'ilo': '1-xMuIT6GaM_YeHqgm1OamGkxYfBREiv3', + 'io': '19Zla0wsAcrZm2c0Pw5ghpp4rHjYs26Pp', + 'is': '11i-NCyqS6HbldIbYulsCgQGZFXR8hwoB', + 'it': '1HmjlOaQunHqL2Te7pIkuBWrnjlmdfYo_', + 'iu': '18jKm1S7Ls3l0_pHqQH8MycG3LhoC2pdX', + 'ja': '10dz8UxyK4RIacXE2HcGdrharmp5rwc3r', + 'jam': '1v99CXf9RnbF6aJo669YeTR6mQRTOLZ74', # leer + 'jbo': '1_LmH9hc6FDGE3F7pyGB1fUEbSwuTYQdD', + 'jv': '1qiSu1uECCLl4IBZS27FBdJIBivkJ7GwE', + 'ka': '172UFuFRBX2V1aWeXlPSpu9TjS-3cxNaD', + 'kaa': '1kh6hMPUdqO-FIxRY6qaIBZothBURXxbY', + 'kab': '1oKjbZI6ZrrALCqnPCYgIjKNrKDA7ehcs', + 'kbd': '1jNbfrboPOwJmlXQBIv053d7n5WXpMRv7', + 'kg': '1iiu5z-sdJ2JLC4Ja9IgDxpRZklIb6nDx', + 'ki': '1GUtt0QI84c5McyLGGxoi5uwjHOq1d6G8', + 'kj': '1nSxXUSGDlXVCIPGlVpcakRc537MwuKZR', # leer + 'kk': '1ryC3UN0myckc1awrWhhb6RIi17C0LCuS', + 'kl': '1gXtGtX9gcTXms1IExICnqZUHefrlcIFf', + 'km': '1DS5ATxvxyfn1iWvq2G6qmjZv9pv0T6hD', + 'kn': '1ZGLYMxbb5-29MNmuUfg2xFhYUbkJFMJJ', + 'ko': '12r8tIkTnwKhLJxy71qpIcoLrT6NNhQYm', + 'koi': '1EdG_wZ_Qk124EPAZw-w6rdEhYLsgcvIj', + 'kr': '19VNQtnBA-YL_avWuVeHQHxJZ9MZ04WPF', # leer + 'krc': '1nReV4Mb7Wdj96czpO5regFbdBPu0zZ_y', + 'ks': '1kzh0Pgrv27WRMstR9MpU8mu7p60TcT-X', + 'ksh': '1iHJvrl2HeRaCumlrx3N7CPrHQ2KuLUkt', + 'ku': '1YqJog7Bkk0fHBCSTxJ9heeE-bfbkbkye', + 'kv': '1s91HI4eq8lQYlZwfrJAgaGlCyAtIhvIJ', + 'kw': '16TaIX2nRfqDp8n7zudd4bqf5abN49dvW', + 'ky': '17HPUKFdKWhUjuR1NOp5f3PQYfMlMCxCT', + 'la': '1NiQuBaUIFEERvVXo6CQLwosPraGyiRYw', + 'lad': '1PEmXCWLCqnjLBomMAYHeObM1AmVHtD08', + 'lb': '1nE4g10xoTU23idmDtOQ0w2QCuizZ6QH_', + 'lbe': '1KOm-AdRcCHfSc1-uYBxBA4GjxXjnIlE-', + 'lez': '1cJAXshrLlF1TZlPHJTpDwEvurIOsz4yR', + 'lg': '1Ur0y7iiEpWBgHECrIrT1OyIC8um_y4th', + 'li': '1TikIqfqcZlSDWhOae1JnjJiDko4nj4Dj', + 'lij': '1ro5ItUcF49iP3JdV82lhCQ07MtZn_VjW', + 'lmo': '1W4rhBy2Pi5SuYWyWbNotOVkVY3kYWS_O', + 'ln': '1bLSV6bWx0CgFm7ByKppZLpYCFL8EIAoD', + 'lo': '1C6SSLeKF3QirjZbAZAcpVX_AXYg_TJG3', + 'lrc': '1GUcS28MlJe_OjeQfS2AJ8uczpD8ut60e', + 'lt': '1gAG6TcMTmC128wWK0rCXRlCTsJY9wFQY', + 'ltg': '12ziP8t_fAAS9JqOCEC0kuJObEyuoiOjD', + 'lv': '1MPuAM04u-AtfybXdpHwCqUpFWbe-zD0_', + 'mai': '1d_nUewBkka2QGEmxCc9v3dTfvo7lPATH', + 'map-bms': '1wrNIE-mqp2xb3lrNdwADe6pb7f35NP6V', + 'mdf': '1BmMGUJy7afuKfhfTBMiKxM3D7FY-JrQ2', + 'mg': '105WaMhcWa-46tCztoj8npUyg0aH18nFL', + 'mh': '1Ej7n6yA1cF1cpD5XneftHtL33iHJwntT', + 'mhr': 
'1CCPIUaFkEYXiHO0HF8_w07UzVyWchrjS', + 'mi': '1F6au9xQjnF-aNBupGJ1PwaMMM6T_PgdQ', + 'min': '1tVK5SHiCy_DaZSDm3nZBgT5bgWThbJt_', + 'mk': '18NpudytGhSWq_LbmycTDw10cSftlSBGS', + 'ml': '1V73UE-EvcE-vV3V1RTvU4sak6QFcP91y', + 'mn': '14jRXicA87oXZOZllWqUjKBMetNpQEUUp', + 'mo': '1YsLGNMsJ7VsekhdcITQeolzOSK4NzE6U', + 'mr': '1vOr1AIHbgkhTO9Ol9Jx5Wh98Qdyh1QKI', + 'mrj': '1dW-YmEW8a9D5KyXz8ojSdIXWGekNzGzN', + 'ms': '1bs-_5WNRiZBjO-DtcNtkcIle-98homf_', + 'mt': '1L7aU3iGjm6SmPIU74k990qRgHFV9hrL0', + 'mus': '1_b7DcRqiKJFEFwp87cUecqf8A5BDbTIJ', # leer + 'mwl': '1MfP0jba2jQfGVeJOLq26MjI6fYY7xTPu', + 'my': '16wsIGBhNVd2lC2p6n1X8rdMbiaemeiUM', + 'myv': '1KEqHmfx2pfU-a1tdI_7ZxMQAk5NJzJjB', + 'mzn': '1CflvmYEXZnWwpsBmIs2OvG-zDDvLEMDJ', + 'na': '1r0AVjee5wNnrcgJxQmVGPVKg5YWz1irz', + 'nah': '1fx6eu91NegyueZ1i0XaB07CKjUwjHN7H', + 'nap': '1bhT4sXCJvaTchCIV9mwLBtf3a7OprbVB', + 'nds-nl': '1UIFi8eOCuFYJXSAXZ9pCWwkQMlHaY4ye', + 'nds': '1FLgZIXUWa_vekDt4ndY0B5XL7FNLiulr', + 'ne': '1gEoCjSJmzjIH4kdHsbDZzD6ID4_78ekS', + 'new': '1_-p45Ny4w9UvGuhD8uRNSPPeaARYvESH', + 'ng': '11yxPdkmpmnijQUcnFHZ3xcOmLTYJmN_R', + 'nl': '1dqYXg3ilzVOSQ_tz_dF47elSIvSIhgqd', + 'nn': '1pDrtRhQ001z2WUNMWCZQU3RV_M0BqOmv', + 'no': '1zuT8MI96Ivpiu9mEVFNjwbiM8gJlSzY2', + 'nov': '1l38388Rln0NXsSARMZHmTmyfo5C0wYTd', + 'nrm': '10vxPq1Nci7Wpq4XOvx3dtqODskzjdxJQ', + 'nso': '1iaIV8qlT0RDnbeQlnxJ3RehsG3gU5ePK', + 'nv': '1oN31jT0w3wP9aGwAPz91pSdUytnd9B0g', + 'ny': '1eEKH_rUPC560bfEg11kp3kbe8qWm35IG', + 'oc': '1C01cW8G_j8US-DTrsmeal_ENHTtNWn-H', + 'olo': '1vbDwKZKqFq84dusr1SvDx5JbBcPanx9L', # leer + 'om': '1q3h22VMbWg2kgVFm-OArR-E4y1yBQ1JX', + 'or': '1k8LwCE8nC7lq6neXDaS3zRn0KOrd9RnS', + 'os': '1u81KAB34aEQfet00dLMRIBJsfRwbDTij', + 'pa': '1JDEHL1VcLHBamgTPBom_Ryi8hk6PBpsu', + 'pag': '1k905VUWnRgY8kFb2P2431Kr4dZuolYGF', + 'pam': '1ssugGyJb8ipispC60B3I6kzMsri1WcvC', + 'pap': '1Za0wfwatxYoD7jGclmTtRoBP0uV_qImQ', + 'pcd': '1csJlKgtG04pdIYCUWhsCCZARKIGlEYPx', + 'pdc': '1Xnms4RXZKZ1BBQmQJEPokmkiweTpouUw', + 'pfl': '1tPQfHX7E0uKMdDSlwNw5aGmaS5bUK0rn', + 'pi': '16b-KxNxzbEuyoNSlI3bfe2YXmdSEsPFu', + 'pih': '1vwyihTnS8_PE5BNK7cTISmIBqGWvsVnF', + 'pl': '1fijjS0LbfpKcoPB5V8c8fH08T8AkXRp9', + 'pms': '12ySc7X9ajWWqMlBjyrPiEdc-qVBuIkbA', + 'pnb': '1RB3-wjluhTKbdTGCsk3nag1bM3m4wENb', + 'pnt': '1ZCUzms6fY4on_fW8uVgO7cEs9KHydHY_', + 'ps': '1WKl9Av6Sqz6aHKyUM5kIh90mzFzyVWH9', + 'pt': '13BX-_4_hcTUp59HDyczFDI32qUB94vUY', + 'qu': '1CB_C4ygtRoegkqgcqfXNHr8oQd-UcvDE', + 'rm': '1YRSGgWoxEqSojHXuBHJnY8vAHr1VgLu-', + 'rmy': '1uFcCyvOWBJWKFQxbkYSp373xUXVl4IgF', + 'rn': '1ekyyb2MvupYGY_E8_BhKvV664sLvW4aE', + 'ro': '1YfeNTSoxU-zJMnyQotLk5X8B_6nHryBu', + 'roa-rup': '150s4H4TdQ5nNYVC6j0E416TUAjBE85yy', + 'roa-tara': '1H6emfQsD_a5yohK4RMPQ-GrnHXqqVgr3', + 'ru': '11gP2s-SYcfS3j9MjPp5C3_nFeQB-8x86', + 'rue': '1OuSglZAndja1J5D5IUmdbt_niTTyEgYK', + 'rw': '1NuhHfi0-B-Xlr_BApijnxCw0WMEltttP', + 'sa': '1P2S3gL_zvKgXLKJJxg-Fb4z8XdlVpQik', + 'sah': '1qz0MpKckzUref2FX_FYiNzI2p4BDc5oR', + 'sc': '1oAYj_Fty4FUwjAOBEBaiZt_cY8dtpDfA', + 'scn': '1sDN9zHkXWYoHYx-DUu-GPvsUgB_IRa8S', + 'sco': '1i8W7KQPj6YZQLop89vZBSybJNgNsvXWR', + 'sd': '1vaNqfv3S8Gl5pQmig3vwWQ3cqRTsXmMR', + 'se': '1RT9xhn0Vl90zjWYDTw5V1L_u1Oh16tpP', + 'sg': '1iIh2oXD2Szz_AygUvTt3_ZK8a3RYEGZ_', + 'sh': '1qPwLiAm6t4__G-zVEOrBgYx6VRmgDgiS', + 'si': '1G5ryceID0TP6SAO42e-HAbIlCvYmnUN7', + 'simple': '1FVV49o_RlK6M5Iw_7zeJOEDQoTa5zSbq', + 'sk': '11mkYvbmAWKTInj6t4Ma8BUPxoR5o6irL', + 'sl': '1fsIZS5LgMzMzZ6T7ogStyj-ILEZIBRvO', + 'sm': '1yefECpKX_Y4R7G2tggIxvc_BvJfOAz-t', + 'sn': '1fYeCjMPvRAv94kvZjiKI-ktIDLkbv0Ve', + 'so': 
'1Uc-eSZnJb36SgeTvRU3GirXZOlGD_NB6', + 'sq': '11u-53n71O_yjpwRiCQSwgL7N2w72ZptX', + 'sr': '1PGLGlQi8Q0Eac6dib-uuCJAAHK6SF5Pz', + 'srn': '1JKiL3TSXqK1-KhPfAwMK0uqw90WEzg7M', + 'ss': '1e0quNEsA1dn57-IbincF4D82dRWgzQlp', + 'st': '1ny-FBzpBqIDgv6jMcsoFev3Ih65FNZFO', + 'stq': '15Fx32ROy2IM6lSqAPUykkr3CITR6Xd7v', + 'su': '1C0FJum7bYZpnyptBvfAgwJb0TX2hggtO', + 'sv': '1YyqzOSXzK5yrAou9zeTDWH_7s569mDcz', + 'sw': '1_bNTj6T8eXlNAIuHaveleWlHB_22alJs', + 'szl': '1_dXEip1snK4CPVGqH8x7lF5O-6FdCNFW', + 'ta': '1ZFTONsxGtSnC9QB6RpWSvgD_MbZwIhHH', + 'tcy': '15R6u7KQs1vmDSm_aSDrQMJ3Q6q3Be0r7', # leer + 'te': '11Sx-pBAPeZOXGyv48UNSVMD0AH7uf4YN', + 'tet': '11mr2MYLcv9pz7mHhGGNi5iNCOVErYeOt', + 'tg': '16ttF7HWqM9Cnj4qmgf3ZfNniiOJfZ52w', + 'th': '14xhIt-xr5n9nMuvcwayCGM1-zBCFZquW', + 'ti': '123q5e9MStMShp8eESGtHdSBGLDrCKfJU', + 'tk': '1X-JNInt34BNGhg8A8Peyjw2WjsALdXsD', + 'tl': '1WkQHbWd9cqtTnSHAv0DpUThaBnzeSPTJ', + 'tn': '1fHfQHetZn8-fLuRZEu-cvs-kQYwPvjyL', + 'to': '1cHOLaczYJ8h-OqQgxeoH9vMG3izg6muT', + 'tpi': '1YsRjxVu6NYOrXRb8oqMO9FPaicelFEcu', + 'tr': '1J1Zy02IxvtCK0d1Ba2h_Ulit1mVb9UIX', + 'ts': '1pIcfAt3KmtmDkyhOl-SMSeoM8aP8bOpl', + 'tt': '1vsfzCjj-_bMOn5jBai41TF5GjKJM_Ius', + 'tum': '1NWcg65daI2Bt0awyEgU6apUDbBmiqCus', + 'tw': '1WCYKZIqS7AagS76QFSfbteiOgFNBvNne', + 'ty': '1DIqaP1l-N9VXTNokrlr6EuPMGE765o4h', + 'tyv': '1F3qa05OYLBcjT1lXMurAJFDXP_EesCvM', + 'udm': '1T0YMTAPLOk768sstnewy5Jxgx2RPu3Rb', + 'ug': '1fjezvqlysyZhiQMZdazqLGgk72PqtXAw', + 'uk': '1UMJCHtzxkfLDBJE7NtfN5FeMrnnUVwoh', + 'ur': '1WNaD2TuHvdsF-z0k_emQYchwoQQDFmRk', + 'uz': '11wrG2FSTpRJc2jb5MhgvxjkVDYhT8M-l', + 've': '1PucJ7pJ4CXGEXZ5p_WleZDs2usNz74to', + 'vec': '1cAVjm_y3ehNteDQIYz9yyoq1EKkqOXZ0', + 'vep': '1K_eqV7O6C7KPJWZtmIuzFMKAagj-0O85', + 'vi': '1yQ6nhm1BmG9lD4_NaG1hE5VV6biEaV5f', + 'vls': '1bpQQW6pKHruKJJaKtuggH5rReMXyeVXp', + 'vo': '1D80QRdTpe7H4mHFKpfugscsjX71kiMJN', + 'wa': '1m4B81QYbf74htpInDU5p7d0n0ot8WLPZ', + 'war': '1EC3jsHtu22tHBv6jX_I4rupC5RwV3OYd', + 'wo': '1vChyqNNLu5xYHdyHpACwwpw4l3ptiKlo', + 'wuu': '1_EIn02xCUBcwLOwYnA-lScjS2Lh2ECw6', + 'xal': '19bKXsL1D2UesbB50JPyc9TpG1lNc2POt', + 'xh': '1pPVcxBG3xsCzEnUzlohc_p89gQ9dSJB3', + 'xmf': '1SM9llku6I_ZuZz05mOBuL2lx-KQXvehr', + 'yi': '1WNWr1oV-Nl7c1Jv8x_MiAj2vxRtyQawu', + 'yo': '1yNVOwMOWeglbOcRoZzgd4uwlN5JMynnY', + 'za': '1i7pg162cD_iU9h8dgtI2An8QCcbzUAjB', + 'zea': '1EWSkiSkPBfbyjWjZK0VuKdpqFnFOpXXQ', + 'zh-classical': '1uUKZamNp08KA7s7794sKPOqPALvo_btl', + 'zh-min-nan': '1oSgz3YBXLGUgI7kl-uMOC_ww6L0FNFmp', + 'zh-yue': '1zhwlUeeiyOAU1QqwqZ8n91yXIRPFA7UE', + 'zh': '1LZ96GUhkVHQU-aj2C3WOrtffOp0U3Z7f', + 'zu': '1FyXl_UK1737XB3drqQFhGXiJrJckiB1W' + } + return languages_ids[language] - super(WNUT_17, self).__init__( - data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, - ) -class WEIBO_NER(ColumnCorpus): +class WIKIGOLD_NER(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2268,12 +2404,11 @@ def __init__( **corpusargs, ): """ - Initialize the WEIBO_NER corpus . The first time you call this constructor it will automatically + Initialize the wikigold corpus. The first time you call this constructor it will automatically download the dataset. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. 
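# ---------------------------------------------------------------------------
# Usage sketch (editorial example, not part of this patch): wikigold ships as a
# single annotated file that is used as the training split; dev/test portions
# are then sampled from it, which is the usual flair behaviour when only a
# train file is given (an assumption worth verifying against the Corpus base
# class). Assumes flair 0.7 with this PR applied.
from flair.datasets import WIKIGOLD_NER

corpus = WIKIGOLD_NER()                 # downloads wikigold.conll.txt on first call
print(corpus)
# ---------------------------------------------------------------------------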
- :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead + :param tag_to_bioes: NER by default, should not be changed :param in_memory: If True, keeps dataset in memory giving speedups in training. :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ @@ -2281,7 +2416,7 @@ def __init__( base_path: Path = Path(base_path) # column format - columns = {0: 'text', 1: 'ner'} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2292,38 +2427,34 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - weiboNER_conll_path = "https://raw.githubusercontent.com/87302380/WEIBO_NER/main/data/" - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.train", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.test", Path("datasets") / dataset_name) - cached_path(f"{weiboNER_conll_path}weiboNER_2nd_conll_format.dev", Path("datasets") / dataset_name) - + wikigold_ner_path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/wikigold/CONLL-format/data/" + cached_path(f"{wikigold_ner_path}wikigold.conll.txt", Path("datasets") / dataset_name) - super(WEIBO_NER, self).__init__( + super(WIKIGOLD_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file="weiboNER_2nd_conll_format.train", - test_file="weiboNER_2nd_conll_format.test", - dev_file="weiboNER_2nd_conll_format.dev", + train_file='wikigold.conll.txt', document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class BIOSCOPE(ColumnCorpus): +class WIKINER_ENGLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "tag"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2334,128 +2465,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - bioscope_path = "https://raw.githubusercontent.com/whoisjones/BioScopeSequenceLabelingData/master/sequence_labeled/" - cached_path(f"{bioscope_path}output.txt", Path("datasets") / dataset_name) - - super(BIOSCOPE, self).__init__( - data_folder, columns, in_memory=in_memory, train_file="output.txt", **corpusargs, - ) - - -def _download_wikiner(language_code: str, dataset_name: str): - # download data if necessary - wikiner_path = ( - "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" - ) - lc = language_code - - data_file = ( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.train" - ) - if not data_file.is_file(): - - cached_path( - f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name - ) - import bz2, shutil + _download_wikiner("en", dataset_name) - # unpack and write out in CoNLL column-like format - bz_file = bz2.BZ2File( - Path(flair.cache_root) - / "datasets" - / dataset_name - / f"aij-wikiner-{lc}-wp3.bz2", - "rb", + super(WIKINER_ENGLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) - with bz_file as f, open( - Path(flair.cache_root) - / "datasets" - / dataset_name - / 
f"aij-wikiner-{lc}-wp3.train", - "w", - encoding="utf-8" - ) as out: - for line in f: - line = line.decode("utf-8") - words = line.split(" ") - for word in words: - out.write("\t".join(word.split("|")) + "\n") - -class UP_CHINESE(ColumnCorpus): - def __init__( - self, - base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, - **corpusargs, - ): - """ - Initialize the Chinese dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - if type(base_path) == str: - base_path: Path = Path(base_path) - - # column format - columns = {1: "text", 9: "frame"} - - # this dataset name - dataset_name = self.__class__.__name__.lower() - - # default dataset folder is the cache root - if not base_path: - base_path = Path(flair.cache_root) / "datasets" - data_folder = base_path / dataset_name - - # download data if necessary - up_zh_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Chinese/" - cached_path(f"{up_zh_path}zh-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_zh_path}zh-up-test.conllu", Path("datasets") / dataset_name) - super(UP_CHINESE, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="zh-up-train.conllu", - test_file="zh-up-test.conllu", - dev_file="zh-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, - ) -class UP_ENGLISH(ColumnCorpus): +class WIKINER_GERMAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the English dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 10: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2466,46 +2495,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_en_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_English-EWT/" - cached_path(f"{up_en_path}en_ewt-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_en_path}en_ewt-up-test.conllu", Path("datasets") / dataset_name) - - super(UP_ENGLISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="en_ewt-up-train.conllu", - test_file="en_ewt-up-test.conllu", - dev_file="en_ewt-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + _download_wikiner("de", dataset_name) + + super(WIKINER_GERMAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_FRENCH(ColumnCorpus): + +class WIKINER_DUTCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the French dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2516,46 +2525,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fr_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_French/" - cached_path(f"{up_fr_path}fr-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fr_path}fr-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("nl", dataset_name) - super(UP_FRENCH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fr-up-train.conllu", - test_file="fr-up-test.conllu", - dev_file="fr-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_DUTCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_FINNISH(ColumnCorpus): + +class WIKINER_FRENCH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Finnish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2566,46 +2555,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_fi_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Finnish/" - cached_path(f"{up_fi_path}fi-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_fi_path}fi-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("fr", dataset_name) - super(UP_FINNISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="fi-up-train.conllu", - test_file="fi-up-test.conllu", - dev_file="fi-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_FRENCH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_GERMAN(ColumnCorpus): + +class WIKINER_ITALIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the German dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions. - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2616,46 +2585,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_de_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_German/" - cached_path(f"{up_de_path}de-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_de_path}de-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("it", dataset_name) - super(UP_GERMAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="de-up-train.conllu", - test_file="de-up-test.conllu", - dev_file="de-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_ITALIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory ) -class UP_ITALIAN(ColumnCorpus): + +class WIKINER_SPANISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Italian dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2666,46 +2615,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_it_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Italian/" - cached_path(f"{up_it_path}it-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_it_path}it-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("es", dataset_name) - super(UP_ITALIAN, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="it-up-train.conllu", - test_file="it-up-test.conllu", - dev_file="it-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_SPANISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_SPANISH(ColumnCorpus): + +class WIKINER_PORTUGUESE(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Spanish dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2716,46 +2645,26 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish/" - cached_path(f"{up_es_path}es-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pt", dataset_name) - super(UP_SPANISH, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es-up-train.conllu", - test_file="es-up-test.conllu", - dev_file="es-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_PORTUGUESE, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class UP_SPANISH_ANCORA(ColumnCorpus): + +class WIKINER_POLISH(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, - in_memory: bool = True, - document_as_sequence: bool = False, + tag_to_bioes: str = "ner", + in_memory: bool = False, **corpusargs, ): - """ - Initialize the Spanish AnCora dataset from the Universal Propositions Bank, comming from that webpage: - https://github.com/System-T/UniversalPropositions - - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {1: "text", 9: "frame"} + columns = {0: "text", 1: "pos", 2: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2766,73 +2675,44 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - up_es_path = "https://raw.githubusercontent.com/System-T/UniversalPropositions/master/UP_Spanish-AnCora/" - cached_path(f"{up_es_path}es_ancora-up-train.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-dev.conllu", Path("datasets") / dataset_name) - cached_path(f"{up_es_path}es_ancora-up-test.conllu", Path("datasets") / dataset_name) + _download_wikiner("pl", dataset_name) - super(UP_SPANISH_ANCORA, self).__init__( - data_folder, - columns, - encoding="utf-8", - train_file="es_ancora-up-train.conllu", - test_file="es_ancora-up-test.conllu", - dev_file="es_ancora-up-dev.conllu", - in_memory=in_memory, - document_separator_token=None if not document_as_sequence else "-DOCSTART-", - comment_symbol="#", - **corpusargs, + super(WIKINER_POLISH, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class MITMovieNERSimple(ColumnCorpus): +class WIKINER_RUSSIAN(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, + in_memory: bool = False, **corpusargs, ): - """ - Initialize the eng corpus of the MIT Movie Corpus (it has simpler queries compared to the trivia10k13 corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "pos", 2: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "engtrain.bio" - test_file = "engtest.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + _download_wikiner("ru", dataset_name) - super(MITMovieNERSimple, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - **corpusargs, + super(WIKINER_RUSSIAN, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class MITMovieNERComplex(ColumnCorpus): + +class WNUT_17(ColumnCorpus): def __init__( self, base_path: Union[str, Path] = None, @@ -2840,59 +2720,56 @@ def __init__( in_memory: bool = True, **corpusargs, ): - """ - Initialize the trivia10k13 corpus of the MIT Movie Corpus (it has more complex queries compared to the eng corpus) - in BIO format. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. 
- """ + if type(base_path) == str: + base_path: Path = Path(base_path) + # column format - columns = {0: "ner", 1: "text"} + columns = {0: "text", 1: "ner"} - # dataset name + # this dataset name dataset_name = self.__class__.__name__.lower() - # data folder: default dataset folder is the cache root - if type(base_path) == str: - base_path: Path = Path(base_path) + # default dataset folder is the cache root if not base_path: - base_path: Path = Path(flair.cache_root) / "datasets" + base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name # download data if necessary - mit_movie_path = "https://groups.csail.mit.edu/sls/downloads/movie/" - train_file = "trivia10k13train.bio" - test_file = "trivia10k13test.bio" - cached_path(f"{mit_movie_path}{train_file}", Path("datasets") / dataset_name) - cached_path(f"{mit_movie_path}{test_file}", Path("datasets") / dataset_name) + wnut_path = "https://noisy-text.github.io/2017/files/" + cached_path(f"{wnut_path}wnut17train.conll", Path("datasets") / dataset_name) + cached_path(f"{wnut_path}emerging.dev.conll", Path("datasets") / dataset_name) + cached_path( + f"{wnut_path}emerging.test.annotated", Path("datasets") / dataset_name + ) - super(MITMovieNERComplex, self).__init__( - data_folder, - columns, - train_file=train_file, - test_file=test_file, - tag_to_bioes=tag_to_bioes, - in_memory=in_memory, - **corpusargs, + super(WNUT_17, self).__init__( + data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory, **corpusargs, ) -class SEC_FILLINGS(ColumnCorpus): + +class WNUT_2020_NER(ColumnCorpus): def __init__( self, - base_path: Union[str, Path] = None, + base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", in_memory: bool = True, + document_as_sequence: bool = False, **corpusargs, ): - + """ + Initialize the WNUT_2020_NER corpus. The first time you call this constructor it will automatically + download the dataset. + :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param tag_to_bioes: NER by default, since it is the only option of the WNUT corpus. + :param in_memory: If True, keeps dataset in memory giving speedups in training. 
+ :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + """ if type(base_path) == str: base_path: Path = Path(base_path) # column format - columns = {0: "text", 1: "pos", 3: "ner"} + columns = {0: "text", 1: "ner"} # this dataset name dataset_name = self.__class__.__name__.lower() @@ -2903,41 +2780,125 @@ def __init__( data_folder = base_path / dataset_name # download data if necessary - SEC_FILLINGS_Path = "https://raw.githubusercontent.com/juand-r/entity-recognition-datasets/master/data/SEC-filings/CONLL-format/data/" - cached_path(f"{SEC_FILLINGS_Path}test/FIN3.txt", Path("datasets") / dataset_name) - cached_path(f"{SEC_FILLINGS_Path}train/FIN5.txt", Path("datasets") / dataset_name) + github_url = "https://github.com/jeniyat/WNUT_2020_NER/archive/master.zip" - super(SEC_FILLINGS, self).__init__( + for sample in ["train", "test", "dev"]: + + sample_file = data_folder / (sample + ".txt") + if not sample_file.is_file(): + + zip_path = cached_path( + f"{github_url}", Path("datasets") / dataset_name + ) + + # unzip the downloaded repo and merge the train, dev and test datasets + unpack_file(zip_path, data_folder, "zip", False) # unzipped folder name: WNUT_2020_NER-master + + if sample == "test": + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data_2020/Conll_Format/") + else: + file_path = data_folder / Path("WNUT_2020_NER-master/data/" + sample + "_data/Conll_Format/") + filenames = os.listdir(file_path) + with open(data_folder / (sample + '.txt'), 'w') as outfile: + for fname in filenames: + with open(file_path / fname) as infile: + lines = infile.read() + outfile.write(lines) + + shutil.rmtree(str(data_folder / "WNUT_2020_NER-master")) # clean up when done + + super(WNUT_2020_NER, self).__init__( data_folder, columns, tag_to_bioes=tag_to_bioes, encoding="utf-8", in_memory=in_memory, - train_file='FIN5.txt', - test_file="FIN3.txt", - skip_first_line=True + document_separator_token=None if not document_as_sequence else "-DOCSTART-", **corpusargs, ) -class TURKU_NER(ColumnCorpus): + +def _download_wikiner(language_code: str, dataset_name: str): + # download data if necessary + wikiner_path = ( + "https://raw.githubusercontent.com/dice-group/FOX/master/input/Wikiner/" + ) + lc = language_code + + data_file = ( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train" + ) + if not data_file.is_file(): + + cached_path( + f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2", Path("datasets") / dataset_name + ) + import bz2, shutil + + # unpack and write out in CoNLL column-like format + bz_file = bz2.BZ2File( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.bz2", + "rb", + ) + with bz_file as f, open( + Path(flair.cache_root) + / "datasets" + / dataset_name + / f"aij-wikiner-{lc}-wp3.train", + "w", + encoding="utf-8" + ) as out: + for line in f: + line = line.decode("utf-8") + words = line.split(" ") + for word in words: + out.write("\t".join(word.split("|")) + "\n") + + +class XTREME(MultiCorpus): def __init__( self, + languages: Union[str, List[str]] = None, base_path: Union[str, Path] = None, tag_to_bioes: str = "ner", - in_memory: bool = True, - document_as_sequence: bool = False, + in_memory: bool = False, **corpusargs, ): """ - Initialize the Finnish TurkuNER corpus. The first time you call this constructor it will automatically - download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. 
You can override this - to point to a different folder but typically this should not be necessary. - :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' to predict - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Xtreme corpus for cross-lingual NER consisting of datasets of a total of 176 languages. The data comes from the google + research work XTREME https://github.com/google-research/xtreme. All datasets for NER and respective language abbreviations (e.g. + "en" for english can be found here https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN/folder/C43gs51bSIaq5sFTQkWNCQ?_encoding=UTF8&*Version*=1&*entries*=0&mgh=1 ) + The data is derived from the wikiann dataset https://elisa-ie.github.io/wikiann/ (license: https://opendatacommons.org/licenses/by/) + + Parameters + ---------- + languages : Union[str, List[str]], optional + Default the 40 languages that are used in XTREME are loaded. Otherwise on can hand over a strings or a list of strings + consisiting of abbreviations for languages. All datasets will be loaded in a MultiCorpus object. + base_path : Union[str, Path], optional + Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + tag_to_bioes : str, optional + The data is in bio-format. It will by default (with the string "ner" as value) be transformed + into the bioes format. If you dont want that set it to None. + """ + # if no languages are given as argument all languages used in XTREME will be loaded + if not languages: + languages = ["af", "ar", "bg", "bn", "de", "el", "en", "es", "et", "eu", "fa", "fi", "fr", "he", "hi", "hu", + "id", "it", "ja", "jv", "ka", "kk", "ko", "ml", "mr", "ms", "my", "nl", "pt", "ru", "sw", "ta", + "te", "th", "tl", "tr", "ur", "vi", "yo", "zh"] + + # if only one language is given + if type(languages) == str: + languages = [languages] + if type(base_path) == str: base_path: Path = Path(base_path) @@ -2945,32 +2906,77 @@ def __init__( columns = {0: "text", 1: "ner"} # this dataset name - dataset_name = self.__class__.__name__.lower() + dataset_name = "xtreme" # default dataset folder is the cache root if not base_path: base_path = Path(flair.cache_root) / "datasets" data_folder = base_path / dataset_name + # For each language in languages, the file is downloaded if not existent + # Then a comlumncorpus of that data is created and saved in a list + # This list is handed to the multicorpus + + # list that contains the columncopora + corpora = [] + + hu_path = "https://nlp.informatik.hu-berlin.de/resources/datasets/panx_dataset" + # download data if necessary - conll_path = "https://raw.githubusercontent.com/TurkuNLP/turku-ner-corpus/master/data/conll" - dev_file = "dev.tsv" - test_file = "test.tsv" - train_file = "train.tsv" - cached_path(f"{conll_path}/{dev_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{test_file}", Path("datasets") / dataset_name) - cached_path(f"{conll_path}/{train_file}", Path("datasets") / dataset_name) + for language in languages: - super(TURKU_NER, self).__init__( - data_folder, - columns, - dev_file=dev_file, - test_file=test_file, - train_file=train_file, - column_delimiter="\t", - tag_to_bioes=tag_to_bioes, - encoding="latin-1", - in_memory=in_memory, - 
document_separator_token=None if not document_as_sequence else "-DOCSTART-", - **corpusargs, + language_folder = data_folder / language + + # if language not downloaded yet, download it + if not language_folder.exists(): + + file_name = language + '.tar.gz' + # create folder + os.makedirs(language_folder) + + # download from HU Server + temp_file = cached_path( + hu_path + "/" + file_name, + Path("datasets") / dataset_name / language + ) + + # unzip + print("Extract data...") + import tarfile + tar = tarfile.open(str(temp_file), "r:gz") + for part in ["train", "test", "dev"]: + tar.extract(part, str(language_folder)) + tar.close() + print('...done.') + + # transform data into required format + print("Process dataset...") + for part in ["train", "test", "dev"]: + xtreme_to_simple_ner_annotation(str(language_folder / part)) + print('...done.') + + # initialize comlumncorpus and add it to list + print("Read data into corpus...") + corp = ColumnCorpus(data_folder=language_folder, + column_format=columns, + tag_to_bioes=tag_to_bioes, + in_memory=in_memory, + ) + corpora.append(corp) + print("...done.") + + super(XTREME, self).__init__( + corpora, name='xtreme', ) + + +def xtreme_to_simple_ner_annotation(data_file: Union[str, Path]): + with open(data_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + with open(data_file, 'w', encoding='utf-8') as f: + for line in lines: + if line == '\n': + f.write(line) + else: + liste = line.split() + f.write(liste[0].split(':', 1)[1] + ' ' + liste[1] + '\n') diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md index f981bf715..0c7419abe 100644 --- a/resources/docs/TUTORIAL_6_CORPUS.md +++ b/resources/docs/TUTORIAL_6_CORPUS.md @@ -162,20 +162,24 @@ data the first time you call the corresponding constructor ID. 
The following dat | ID(s) | Languages | Description | | ------------- | ------------- |------------- +| 'ANER_CORP' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp/) 4-class NER | | 'BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'BIOSCOPE' | English | [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | | 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | | 'EUROPARL_NER_GERMAN' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | | 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'MIT_RESTAURANTS' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'MIT_MOVIE_NER_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'MIT_MOVIE_NER_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'MIT_RESTAURANT_NER' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | | 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | | 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | | 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | | 'TWITTER_NER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'WEIBO_NER' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | | 'WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). | -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | -| 'WNUT_20' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'WIKIGOLD_NER' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | | 'WIKINER_ENGLISH' | English | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_GERMAN' | German | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | @@ -185,16 +189,33 @@ data the first time you call the corresponding constructor ID. 
The following dat | 'WIKINER_PORTUGUESE' | Portuguese | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_POLISH' | Polish | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | | 'WIKINER_RUSSIAN' | Russian | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | +| 'WNUT_2020_NER' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | | 'XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'MITMovieNERSimple' | English | [eng](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (simpler queries) | -| 'MITMovieNERComplex' | English | [trivia10k13](https://groups.csail.mit.edu/sls/downloads/movie) NER movie corpus collected by MIT (more complex queries) | -| 'TURKU_NER' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | #### Biomedical Named Entity Recognition We support 31 biomedical NER datasets, listed [here](HUNFLAIR_CORPORA.md). + +#### Universal Proposition Banks + +We now also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. + +| ID(s) | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + + #### Universal Dependency Treebanks | ID(s) | Languages | Description | From a75f13a8f5438a7d65d11faaa656f33a9295ab41 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:44:58 +0100 Subject: [PATCH 24/35] GH-1983: bump version number --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa33a27cc..d82f2155d 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ document embeddings, including our proposed **[Flair embeddings](https://www.acl * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy 
to train your own models and experiment with new approaches using Flair embeddings and classes. -Now at [version 0.6.1](https://github.com/flairNLP/flair/releases)! +Now at [version 0.7](https://github.com/flairNLP/flair/releases)! ## Comparison with State-of-the-Art From 514ca76e062eb264f237a13df3c9e60a250b3ee6 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 25 Nov 2020 14:51:25 +0100 Subject: [PATCH 25/35] Update TUTORIAL_1_BASICS.md --- resources/docs/TUTORIAL_1_BASICS.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/resources/docs/TUTORIAL_1_BASICS.md b/resources/docs/TUTORIAL_1_BASICS.md index 655ef375e..61828d0d0 100644 --- a/resources/docs/TUTORIAL_1_BASICS.md +++ b/resources/docs/TUTORIAL_1_BASICS.md @@ -80,7 +80,7 @@ print(untokenized_sentence) In this case, no tokenization is performed and the text is split on whitespaces, thus resulting in only 4 tokens here. -### Using a Different Tokenizer +### Using a different tokenizer You can also pass custom tokenizers to the initialization method. For instance, if you want to tokenize a Japanese sentence you can use the 'janome' tokenizer instead, like this: @@ -110,12 +110,12 @@ You can write your own tokenization routine. Check the code of `flair.data.Token your own tokenization method. ### Using pretokenized sequences -You can pass pass a pretokenized sequence as list of words, e.g. +You can alternatively pass a pretokenized sequence as list of words, e.g. ```python from flair.data import Sentence -my_sent = Sentence(['The', 'grass', 'is', 'green', '.']) -print(my_sent) +sentence = Sentence(['The', 'grass', 'is', 'green', '.']) +print(sentence) ``` This should print: @@ -129,7 +129,7 @@ Sentence: "The grass is green ." [− Tokens: 5] In Flair, any data point can be labeled. For instance, you can label a word or label a sentence: -### Adding Labels to Tokens +### Adding labels to tokens A `Token` has fields for linguistic annotation, such as lemmas, part-of-speech tags or named entity tags. You can add a tag by specifying the tag type and the tag value. In this example, we're adding an NER tag of type 'color' to @@ -171,7 +171,7 @@ This should print: Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. -### Adding Labels to Sentences +### Adding labels to sentences You can also add a `Label` to a whole `Sentence`. For instance, the example below shows how we add the label 'sports' to a sentence, thereby labeling it @@ -199,7 +199,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence belongs to the topic 'sports' with confidence 1.0. -### Multiple Labels +### Multiple labels Any data point can be labeled multiple times. A sentence for instance might belong to two topics. In this case, add two labels with the same label name: @@ -234,7 +234,7 @@ Sentence: "France is the current world cup winner." [− Tokens: 7 − Senten Indicating that this sentence has two "topic" labels and one "language" label. 
-### Accessing a Sentence's Labels +### Accessing a sentence's labels You can access these labels like this: From 84f2f2f4ba10688876514c939550d17d3f1e3cc1 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:50:11 +0100 Subject: [PATCH 26/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index eba2594df..50bbfc633 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -18,6 +18,8 @@ For instance, say you want to predict whether text is "happy" or "sad" but you h Just use TARS with this snippet: ```python +from flair.models.text_classification_model import TARSClassifier + # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') From f3eab501a956b70abd0b1ba84a6608c3880aa43d Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 27 Nov 2020 12:51:28 +0100 Subject: [PATCH 27/35] Update TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 50bbfc633..16f19b7ce 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -19,6 +19,7 @@ Just use TARS with this snippet: ```python from flair.models.text_classification_model import TARSClassifier +from flair.data import Sentence # 1. Load our pre-trained TARS model for English tars = TARSClassifier.load('tars-base') @@ -69,6 +70,8 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.datasets import SentenceDataset + # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") train = SentenceDataset( [ From 0e12b0a90f28ef0efe0b23a67a9567056a7c5e2b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 10:52:55 +0100 Subject: [PATCH 28/35] GH-1983: move distance classifier to diagnostics module --- flair/models/__init__.py | 1 - flair/models/text_classification_model.py | 486 +--------------------- 2 files changed, 1 insertion(+), 486 deletions(-) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index 15f2a326b..ebb6827d3 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -2,4 +2,3 @@ from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier -from .text_classification_model import DistClassifier diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 00115d2aa..7e0dab976 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -7,7 +7,6 @@ from torch.utils.data.dataset import Dataset from tqdm import tqdm import numpy as np -from math import floor import sklearn.metrics as metrics from sklearn.metrics.pairwise import cosine_similarity @@ -17,12 +16,7 @@ from flair.data import Dictionary, Sentence, Label, DataPoint from flair.datasets import SentenceDataset, DataLoader from flair.file_utils import cached_path -from 
flair.training_utils import ( - MetricRegression, - convert_labels_to_one_hot, - Result, - store_embeddings, -) +from flair.training_utils import convert_labels_to_one_hot, Result, store_embeddings log = logging.getLogger("flair") @@ -947,481 +941,3 @@ def _fetch_model(model_name) -> str: model_name = cached_path(model_map[model_name], cache_dir=cache_dir) return model_name - - - -class DistClassifier(flair.nn.Model): - """ - DistClassifier - Model to predict distance between two words given their embeddings. Takes (contextual) word embedding as input. - The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. - Note: When used for training the batch size must be set to 1!!! - """ - - def __init__( - self, - word_embeddings: flair.embeddings.TokenEmbeddings, - max_distance: int = 20, - beta: float = 1.0, - loss_max_weight: float = 1, - regression = False, - regr_loss_step = 0 - ): - """ - Initializes a DistClassifier - :param word_embeddings: embeddings used to embed each sentence - .param max_distance: max dist between word pairs = number of predicted classes - 1 - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_max_weight: Only for classification: Since small distances between word pairs occur mor frequent it makes sense to give them less weight - in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >=1 - The other weights decrease with equidistant steps from high to low distance. - :param regression: if True the class does regression instead of classification - :param regr_loss_step: if > 0, the MSE-Loss in regression will be weighted. Word pairs with - distance 0 have weight 1. Then, as the distance increases, the weight in the loss function, - increases step by step with size regr_loss_step - """ - - super(DistClassifier, self).__init__() - - self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings - - self.beta = beta - - self.loss_max_weight = loss_max_weight - - self.regression = regression - - self.regr_loss_step = regr_loss_step - - if not regression: - self.max_distance = max_distance - - # weights for loss function - if self.loss_max_weight > 1: - step = (self.loss_max_weight - 1) / self.max_distance - - weight_list = [1. + i * step for i in range(self.max_distance + 1)] - - self.loss_weights = torch.FloatTensor(weight_list).to(flair.device) - - else: - self.loss_weights = None - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is max_distance + 1, i.e. 
we allow 0,1,...,max_distance words between pairs - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, self.max_distance + 1) - - self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights) - - # regression - else: - self.max_distance = float('inf') - - # iput size is two times wordembedding size since we use pair of words as input - # the output size is 1 - self.decoder = nn.Linear( - self.word_embeddings.embedding_length * 2, 1) - - if regr_loss_step > 0: - self.loss_function = self.weighted_mse_loss - else: - self.loss_function = nn.MSELoss() - - nn.init.xavier_uniform_(self.decoder.weight) - - # auto-spawn on GPU if available - self.to(flair.device) - - - # all input should be tensors - def weighted_mse_loss(self,predictions, target): - - weight = 1 + self.regr_loss_step * target - - return (weight * ((predictions - target) ** 2)).mean() - - - # forward allows only a single sentcence!! - def forward(self, sentence: Sentence): - - # embed words of sentence - self.word_embeddings.embed(sentence) - - # go through all pairs of words with a maximum number of max_distance in between - numberOfWords = len(sentence) - text_embedding_list = [] - # go through all pairs - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) - - # 2-dim matrix whose rows are the embeddings of word pairs of the sentence - text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) - - label_scores = self.decoder(text_embedding_tensor) - - if self.regression: - return label_scores.squeeze(1) - - return label_scores - - def _get_state_dict(self): - model_state = { - "state_dict": self.state_dict(), - "word_embeddings": self.word_embeddings, - "max_distance": self.max_distance, - "beta": self.beta, - "loss_max_weight": self.loss_max_weight, - "regression": self.regression, - "regr_loss_step": self.regr_loss_step - } - return model_state - - @staticmethod - def _init_model_with_state_dict(state): - beta = 1.0 if "beta" not in state.keys() else state["beta"] - weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] - - model = DistClassifier( - word_embeddings=state["word_embeddings"], - max_distance=state["max_distance"], - beta=beta, - loss_max_weight=weight, - regression=state["regression"], - regr_loss_step=state["regr_loss_step"] - ) - - model.load_state_dict(state["state_dict"]) - return model - - # So far only one sentence allowed - # If list of sentences is handed the function works with the first sentence of the list - def forward_loss( - self, data_points: Union[List[Sentence], Sentence] - ) -> torch.tensor: - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - if len(data_points) < 2: - return torch.tensor([0.], requires_grad=True) - - scores = self.forward(data_points) - - return self._calculate_loss(scores, data_points) - - # Assume data_points is a single sentence!!! 
- # scores are the predictions for each word pair - def _calculate_loss(self, scores, data_points): - - indices = [] - numberOfWords = len(data_points) - - # classification needs labels to be integers, regression needs labels to be float - # this is due to the different loss functions - if not self.regression: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.LongTensor([j - i - 1])) # distance between words - else: - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - indices.append(torch.Tensor([j - i - 1])) # distance between words - - labels = torch.cat(indices, 0).to(flair.device) - - return self.loss_function(scores, labels) - - # only single sentences as input - def _forward_scores_and_loss( - self, data_points: Union[List[Sentence], Sentence], return_loss=False): - - if isinstance(data_points, list): # first sentence - data_points = data_points[0] - - scores = self.forward(data_points) - - loss = None - if return_loss: - loss = self._calculate_loss(scores, data_points) - - return scores, loss - - def evaluate( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter - num_workers: int = 8, - ) -> (Result, float): - - if self.regression: - return self.evaluate_regression( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - return self.evaluate_classification( - sentences = sentences, - out_path = out_path, - embedding_storage_mode=embedding_storage_mode, - ) - - def evaluate_regression( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - with torch.no_grad(): - - buckets = [0 for _ in range(11)] - - eval_loss = 0 - - metric = MetricRegression("Evaluation") - - lines: List[str] = [] - - max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 - - num_occurences = [0 for _ in range(max_dist_plus_one)] - - cumulated_values = [0 for _ in range(max_dist_plus_one)] - - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - predictions = scores.tolist() - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_dist = j - i - 1 - pred = predictions[numberOfPairs] - - true_values_for_sentence.append(true_dist) - - # for output text file - eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" - lines.append(eval_line) - - # for buckets - error = abs(true_dist - pred) - if error >= 10: - buckets[10] += 1 - else: - buckets[floor(error)] += 1 - - # for average prediction - num_occurences[true_dist] += 1 - cumulated_values[true_dist] += pred - - numberOfPairs += 1 - - eval_loss += loss/numberOfPairs - - metric.true.extend(true_values_for_sentence) - metric.pred.extend(predictions) - - store_embeddings(sentence, embedding_storage_mode) - - eval_loss /= len(sentences) # w.r.t self.loss - - # add some statistics to the output - eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 
3-4 | 4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" - lines.append(eval_line) - eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0],buckets[1],buckets[2],buckets[3], - buckets[4],buckets[5],buckets[6],buckets[7], - buckets[8],buckets[9],buckets[10]) - lines.append(eval_line) - lines.append("\nAverage predicted values per distance:\n") - eval_line = "" - for i in range(max_dist_plus_one): - eval_line += str(i) + ": " + f"{cumulated_values[i]/num_occurences[i]:.2f}" + " " - if i!=0 and i%15==0: - eval_line += "\n" - - lines.append(eval_line) - - - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" - log_header = "MSE\tSPEARMAN\tPEARSON" - - detailed_result = ( - f"AVG: mse: {metric.mean_squared_error():.4f} - " - f"mae: {metric.mean_absolute_error():.4f} - " - f"pearson: {metric.pearsonr():.4f} - " - f"spearman: {metric.spearmanr():.4f}" - ) - - result: Result = Result( - metric.pearsonr(), log_header, log_line, detailed_result - ) - - - return result, eval_loss - - def evaluate_classification( - self, - sentences: Union[List[DataPoint], Dataset], - out_path: Union[str, Path] = None, - embedding_storage_mode: str = "none", - ) -> (Result, float): - - # use scikit-learn to evaluate - y_true = [] - y_pred = [] - - with torch.no_grad(): - eval_loss = 0 - - lines: List[str] = [] - # we iterate over each sentence, instead of batches - for sentence in sentences: - - if len(sentence) < 2: # we need at least 2 words per sentence - continue - - scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) - - # get single labels from scores - predictions = [self._get_single_label(s) for s in scores] - - # gold labels - true_values_for_sentence = [] - numberOfPairs = 0 - numberOfWords = len(sentence) - lines.append(sentence.to_tokenized_string() + '\n') - for i in range(numberOfWords): - for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): - true_values_for_sentence.append(j - i - 1) - - # for output text file - eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) - lines.append(eval_line) - - numberOfPairs += 1 - - eval_loss += loss / numberOfPairs # add average loss of word pairs - - for prediction_for_sentence, true_value_for_sentence in zip( - predictions, true_values_for_sentence - ): - # hot one vector of true value - y_true_instance = np.zeros(self.max_distance + 1, dtype=int) - y_true_instance[true_value_for_sentence] = 1 - y_true.append(y_true_instance.tolist()) - - # hot one vector of predicted value - y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) - y_pred_instance[prediction_for_sentence] = 1 - y_pred.append(y_pred_instance.tolist()) - - # speichert embeddings, falls embedding_storage!= 'None' - store_embeddings(sentence, embedding_storage_mode) - - if out_path is not None: - with open(out_path, "w", encoding="utf-8") as outfile: - outfile.write("".join(lines)) - - # make "classification report" - target_names = [] # liste aller labels, ins unserem Fall - for i in range(self.max_distance + 1): - target_names.append(str(i)) - classification_report = metrics.classification_report(y_true, y_pred, digits=4, - target_names=target_names, zero_division=0) - - # get scores - micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), - 4) - accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) - macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), - 4) - # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) - # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) - - detailed_result = ( - "\nResults:" - f"\n- F-score (micro) {micro_f_score}" - f"\n- F-score (macro) {macro_f_score}" - f"\n- Accuracy {accuracy_score}" - '\n\nBy class:\n' + classification_report - ) - - # line for log file - log_header = "ACCURACY" - log_line = f"\t{accuracy_score}" - - result = Result( - main_score=micro_f_score, - log_line=log_line, - log_header=log_header, - detailed_results=detailed_result, - ) - - eval_loss /= len(sentences) - - return result, eval_loss - - @staticmethod - def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: - filtered_sentences = [sentence for sentence in sentences if sentence.tokens] - if len(sentences) != len(filtered_sentences): - log.warning( - "Ignore {} sentence(s) with no tokens.".format( - len(sentences) - len(filtered_sentences) - ) - ) - return filtered_sentences - - def _obtain_labels( - self, scores: List[List[float]], predict_prob: bool = False - ) -> List[List[Label]]: - """ - Predicts the labels of sentences. - :param scores: the prediction scores from the model - :return: list of predicted labels - """ - - if predict_prob: - return [self._predict_label_prob(s) for s in scores] - - return [self._get_single_label(s) for s in scores] - - def _get_single_label(self, label_scores): # -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - conf, idx = torch.max(softmax, 0) - - return idx.item() - - def _predict_label_prob(self, label_scores) -> List[Label]: - softmax = torch.nn.functional.softmax(label_scores, dim=0) - label_probs = [] - for idx, conf in enumerate(softmax): - label_probs.append(Label(idx, conf.item())) - return label_probs - - def __str__(self): - return super(flair.nn.Model, self).__str__().rstrip(')') + \ - f' (beta): {self.beta}\n' + \ - f' (loss_max_weight): {self.loss_max_weight}\n' + \ - f' (max_distance) {self.max_distance}\n)' - From da01d603731d1ae96b8900b40ae3c312f7a46cc6 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 11:15:44 +0100 Subject: [PATCH 29/35] GH-1983: move simple tagger to sandbox module --- flair/models/__init__.py | 1 - .../simple_sequence_tagger_model.py | 26 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) rename flair/models/{ => sandbox}/simple_sequence_tagger_model.py (97%) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index ebb6827d3..784b038a9 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -1,4 +1,3 @@ from .sequence_tagger_model import SequenceTagger, MultiTagger -from .simple_sequence_tagger_model import SimpleSequenceTagger from .language_model import LanguageModel from .text_classification_model import TextClassifier diff --git a/flair/models/simple_sequence_tagger_model.py b/flair/models/sandbox/simple_sequence_tagger_model.py similarity index 97% rename from flair/models/simple_sequence_tagger_model.py rename to flair/models/sandbox/simple_sequence_tagger_model.py index 298d887e0..211744643 100644 --- a/flair/models/simple_sequence_tagger_model.py +++ b/flair/models/sandbox/simple_sequence_tagger_model.py @@ -3,7 +3,6 @@ from pathlib import Path from typing import List, Union, Optional -import 
numpy as np import torch import torch.nn import torch.nn.functional as F @@ -18,19 +17,20 @@ log = logging.getLogger("flair") -""" -This class is a simple version of the SequenceTagger class. -The purpose of this class is to demonstrate the basic hierarchy of a -sequence tagger (this could be helpful for new developers). -It only uses the given embeddings and maps them with a linear layer to -the tag_dictionary dimension. -Thus, this class misses following functionalities from the SequenceTagger: -- CRF, -- RNN, -- Reprojection. -As a result, only poor results can be expected. -""" + class SimpleSequenceTagger(flair.nn.Model): + """ + This class is a simple version of the SequenceTagger class. + The purpose of this class is to demonstrate the basic hierarchy of a + sequence tagger (this could be helpful for new developers). + It only uses the given embeddings and maps them with a linear layer to + the tag_dictionary dimension. + Thus, this class misses following functionalities from the SequenceTagger: + - CRF, + - RNN, + - Reprojection. + As a result, only poor results can be expected. + """ def __init__( self, embeddings: TokenEmbeddings, From 49ce54b75ea11e9ef0b1153d239a8693ec42d487 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Tue, 1 Dec 2020 12:38:59 +0100 Subject: [PATCH 30/35] Remove travis tag --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index d82f2155d..f145a7196 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![GitHub Issues](https://img.shields.io/github/issues/flairNLP/flair.svg)](https://github.com/flairNLP/flair/issues) [![Contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg)](CONTRIBUTING.md) [![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT) -[![Travis](https://img.shields.io/travis/flairNLP/flair.svg)](https://travis-ci.org/flairNLP/flair) A very simple framework for **state-of-the-art NLP**. Developed by [Humboldt University of Berlin](https://www.informatik.hu-berlin.de/en/forschung-en/gebiete/ml-en/) and friends. From 8f748da712e7aadb0aa985c12348ee73eac2777b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 14:31:01 +0100 Subject: [PATCH 31/35] GH-1983: update tutorial --- resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md index 371aabe31..0066acdb8 100644 --- a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md +++ b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md @@ -246,11 +246,20 @@ This gives you a multilingual model. Try experimenting with more languages! ## Plotting Training Curves and Weights Flair includes a helper method to plot training curves and weights in the neural network. -The `ModelTrainer` automatically generates a `loss.tsv` and a `weights.txt` file in the result folder. +The `ModelTrainer` automatically generates a `loss.tsv` in the result folder. If you set +`write_weights=True` during training, it will also generate a `weights.txt` file. After training, simple point the plotter to these files: ```python +# set write_weights to True to write weights +trainer.train('resources/taggers/example-universal-pos', + ... + write_weights=True, + ... 
+ ) + +# visualize from flair.visual.training_curves import Plotter plotter = Plotter() plotter.plot_training_curves('loss.tsv') From 08463b3ba8dcf77595358c776199bd25e0d515dd Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:15:46 +0100 Subject: [PATCH 32/35] GH-1983: update tutorial --- flair/models/text_classification_model.py | 3 +-- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 +- resources/docs/TUTORIAL_2_TAGGING.md | 8 +++++++- resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md | 7 ++++++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 7e0dab976..368831475 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -883,8 +883,7 @@ def predict_zero_shot(self, Method to make zero shot predictions from the TARS model :param sentences: input sentence objects to classify :param candidate_label_set: set of candidate labels - :param multi_label: indicates whether multi-label or single class prediction. - Defaults to False + :param multi_label: indicates whether multi-label or single class prediction. Defaults to True. """ # check if candidate_label_set is empty diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 16f19b7ce..8df6e0a85 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -123,7 +123,7 @@ Done! Let's load the newly trained model and see if it does better: tars = TARSClassifier.load('resources/taggers/food_drink/final-model.pt') # 2. Prepare a test sentence -sentence = Sentence("I am so glad you like coffee") +sentence = Sentence("I am so glad you like burritos") # 3. Predict for food and drink tars.predict(sentence) diff --git a/resources/docs/TUTORIAL_2_TAGGING.md b/resources/docs/TUTORIAL_2_TAGGING.md index f0b1cde82..6b8c7986b 100644 --- a/resources/docs/TUTORIAL_2_TAGGING.md +++ b/resources/docs/TUTORIAL_2_TAGGING.md @@ -350,8 +350,14 @@ are provided: | 'communicative-functions' | English | detecting function of sentence in research paper (BETA) | scholarly papers | | | 'de-offensive-language' | German | detecting offensive language | [GermEval 2018 Task 1](https://projects.fzai.h-da.de/iggsa/projekt/) | **75.71** (Macro F1) | +## Tagging new classes without training data + +In case you need to label classes that are not included you can also try +our pre-trained zero-shot classifier TARS +(skip ahead to the [zero-shot tutorial](/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md)). +TARS can perform text classification for arbitrary classes. ## Next Now, let us look at how to use different [word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md) to embed your -text. +text. diff --git a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md index 0066acdb8..ea663c512 100644 --- a/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md +++ b/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md @@ -364,4 +364,9 @@ However, if the dataset fits into CUDA memory, this option is the fastest one. ## Next -You can now look into [training your own embeddings](/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md). +If you don't have training data (or only very little), our TARS approach might be best for you. 
+Check out the TARS tutorial on [few-shot and zero-shot classification](/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md)). + +Alternatively, you can + look into [training your own embeddings](/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md). + From b9df3a7606532dcbedd29875316dd176c23b2dce Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:17:28 +0100 Subject: [PATCH 33/35] GH-1983: update tutorial --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 8df6e0a85..39bbd194a 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -70,6 +70,7 @@ In this case, zero-shot prediction falsely predicts "drink" for the sentence "I To improve this, let's first create a small corpus of 4 training and 2 testing examples: ```python +from flair.data import Corpus from flair.datasets import SentenceDataset # training dataset consisting of four sentences (2 labeled as "food" and 2 labeled as "drink") From 8dc970bd2fd75e5ecc1836c5f728cfb8afd0233b Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:22:55 +0100 Subject: [PATCH 34/35] GH-1983: update tutorial --- resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md index 39bbd194a..e05bf5185 100644 --- a/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md +++ b/resources/docs/TUTORIAL_10_TRAINING_ZERO_SHOT_MODEL.md @@ -99,6 +99,8 @@ whether a sentence mentions food or drink. Now, let's take the Corpus we created and do few-shot learning with our pre-trained TARS: ```python +from flair.trainers import ModelTrainer + # 1. load base TARS tars = TARSClassifier.load('tars-base') From 61d89c9128ec1c21417d2878d306b95956cd4979 Mon Sep 17 00:00:00 2001 From: alanakbik Date: Tue, 1 Dec 2020 19:34:38 +0100 Subject: [PATCH 35/35] GH-1983: add distance diagnostic predicor --- .../diagnosis/distance_prediction_model.py | 493 ++++++++++++++++++ 1 file changed, 493 insertions(+) create mode 100644 flair/models/diagnosis/distance_prediction_model.py diff --git a/flair/models/diagnosis/distance_prediction_model.py b/flair/models/diagnosis/distance_prediction_model.py new file mode 100644 index 000000000..0a0cba866 --- /dev/null +++ b/flair/models/diagnosis/distance_prediction_model.py @@ -0,0 +1,493 @@ +import logging +from pathlib import Path +from typing import List, Union + +import torch +import torch.nn as nn +from torch.utils.data.dataset import Dataset +import numpy as np +from math import floor + +import sklearn.metrics as metrics +import flair.nn +import flair.embeddings +from flair.data import Sentence, Label, DataPoint +from flair.training_utils import MetricRegression, Result, store_embeddings + +log = logging.getLogger("flair") + + +class DistancePredictor(flair.nn.Model): + """ + DistancePredictor + Model to predict distance between two words given their embeddings, modeled either as a classification or a + regression model. Takes (contextual) word embedding as input. + The pair of word embeddings is passed through a linear layer that predicts their distance in a sentence. + Note: When used for training the batch size must be set to 1!!! 
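+
+    A minimal construction sketch (illustrative only; any TokenEmbeddings instance could be
+    used in place of the classic word embeddings shown here):
+
+        from flair.embeddings import WordEmbeddings
+        predictor = DistancePredictor(word_embeddings=WordEmbeddings('glove'), max_distance=20)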
+    """
+
+    def __init__(
+        self,
+        word_embeddings: flair.embeddings.TokenEmbeddings,
+        max_distance: int = 20,
+        beta: float = 1.0,
+        loss_max_weight: float = 1,
+        regression=False,
+        regr_loss_step=0
+    ):
+        """
+        Initializes a DistancePredictor
+        :param word_embeddings: embeddings used to embed each sentence
+        :param max_distance: max dist between word pairs = number of predicted classes - 1
+        :param beta: Parameter for F-beta score for evaluation and training annealing
+        :param loss_max_weight: Only for classification: since small distances between word pairs occur more frequently, it makes sense to give them less weight
+        in the loss function. loss_max_weight will be used as the weight for the maximum distance and should be a number >= 1.
+        The other weights decrease with equidistant steps from high to low distance.
+        :param regression: if True the class does regression instead of classification
+        :param regr_loss_step: if > 0, the MSE loss in regression will be weighted. Word pairs with
+        distance 0 have weight 1. Then, as the distance increases, the weight in the loss function
+        increases step by step with step size regr_loss_step
+        """
+
+        super(DistancePredictor, self).__init__()
+
+        self.word_embeddings: flair.embeddings.TokenEmbeddings = word_embeddings
+
+        self.beta = beta
+
+        self.loss_max_weight = loss_max_weight
+
+        self.regression = regression
+
+        self.regr_loss_step = regr_loss_step
+
+        if not regression:
+            self.max_distance = max_distance
+
+            # weights for loss function
+            if self.loss_max_weight > 1:
+                step = (self.loss_max_weight - 1) / self.max_distance
+
+                weight_list = [1. + i * step for i in range(self.max_distance + 1)]
+
+                self.loss_weights = torch.FloatTensor(weight_list).to(flair.device)
+
+            else:
+                self.loss_weights = None
+
+            # input size is two times the word embedding size since we use a pair of words as input
+            # the output size is max_distance + 1, i.e. we allow 0,1,...,max_distance words between pairs
+            self.decoder = nn.Linear(
+                self.word_embeddings.embedding_length * 2, self.max_distance + 1)
+
+            self.loss_function = nn.CrossEntropyLoss(weight=self.loss_weights)
+
+        # regression
+        else:
+            self.max_distance = float('inf')
+
+            # input size is two times the word embedding size since we use a pair of words as input
+            # the output size is 1
+            self.decoder = nn.Linear(
+                self.word_embeddings.embedding_length * 2, 1)
+
+            if regr_loss_step > 0:
+                self.loss_function = self.weighted_mse_loss
+            else:
+                self.loss_function = nn.MSELoss()
+
+        nn.init.xavier_uniform_(self.decoder.weight)
+
+        # auto-spawn on GPU if available
+        self.to(flair.device)
+
+    # all inputs should be tensors
+    def weighted_mse_loss(self, predictions, target):
+
+        weight = 1 + self.regr_loss_step * target
+
+        return (weight * ((predictions - target) ** 2)).mean()
+
+    # forward allows only a single sentence!!
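+    # It embeds the sentence, concatenates the embeddings of every word pair with at most
+    # max_distance words between its members, and scores all pairs with the linear decoder.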
+ def forward(self, sentence: Sentence): + + # embed words of sentence + self.word_embeddings.embed(sentence) + + # go through all pairs of words with a maximum number of max_distance in between + numberOfWords = len(sentence) + text_embedding_list = [] + # go through all pairs + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + text_embedding_list.append(torch.cat((sentence[i].embedding, sentence[j].embedding)).unsqueeze(0)) + + # 2-dim matrix whose rows are the embeddings of word pairs of the sentence + text_embedding_tensor = torch.cat(text_embedding_list, 0).to(flair.device) + + label_scores = self.decoder(text_embedding_tensor) + + if self.regression: + return label_scores.squeeze(1) + + return label_scores + + def _get_state_dict(self): + model_state = { + "state_dict": self.state_dict(), + "word_embeddings": self.word_embeddings, + "max_distance": self.max_distance, + "beta": self.beta, + "loss_max_weight": self.loss_max_weight, + "regression": self.regression, + "regr_loss_step": self.regr_loss_step + } + return model_state + + @staticmethod + def _init_model_with_state_dict(state): + beta = 1.0 if "beta" not in state.keys() else state["beta"] + weight = 1 if "loss_max_weight" not in state.keys() else state["loss_max_weight"] + + model = DistancePredictor( + word_embeddings=state["word_embeddings"], + max_distance=state["max_distance"], + beta=beta, + loss_max_weight=weight, + regression=state["regression"], + regr_loss_step=state["regr_loss_step"] + ) + + model.load_state_dict(state["state_dict"]) + return model + + # So far only one sentence allowed + # If list of sentences is handed the function works with the first sentence of the list + def forward_loss( + self, data_points: Union[List[Sentence], Sentence] + ) -> torch.tensor: + + if isinstance(data_points, list): # first sentence + data_points = data_points[0] + + if len(data_points) < 2: + return torch.tensor([0.], requires_grad=True) + + scores = self.forward(data_points) + + return self._calculate_loss(scores, data_points) + + # Assume data_points is a single sentence!!! 
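+    # the gold label for a word pair (i, j) is the number of words between them, i.e. j - i - 1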
+ # scores are the predictions for each word pair + def _calculate_loss(self, scores, data_points): + + indices = [] + numberOfWords = len(data_points) + + # classification needs labels to be integers, regression needs labels to be float + # this is due to the different loss functions + if not self.regression: + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + indices.append(torch.LongTensor([j - i - 1])) # distance between words + else: + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + indices.append(torch.Tensor([j - i - 1])) # distance between words + + labels = torch.cat(indices, 0).to(flair.device) + + return self.loss_function(scores, labels) + + # only single sentences as input + def _forward_scores_and_loss( + self, data_points: Union[List[Sentence], Sentence], return_loss=False): + + if isinstance(data_points, list): # first sentence + data_points = data_points[0] + + scores = self.forward(data_points) + + loss = None + if return_loss: + loss = self._calculate_loss(scores, data_points) + + return scores, loss + + def evaluate( + self, + sentences: Union[List[DataPoint], Dataset], + out_path: Union[str, Path] = None, + embedding_storage_mode: str = "none", + mini_batch_size: int = 1, # unnecessary, but trainer.train calls evaluate with this parameter + num_workers: int = 8, + ) -> (Result, float): + + if self.regression: + return self.evaluate_regression( + sentences=sentences, + out_path=out_path, + embedding_storage_mode=embedding_storage_mode, + ) + + return self.evaluate_classification( + sentences=sentences, + out_path=out_path, + embedding_storage_mode=embedding_storage_mode, + ) + + def evaluate_regression( + self, + sentences: Union[List[DataPoint], Dataset], + out_path: Union[str, Path] = None, + embedding_storage_mode: str = "none", + ) -> (Result, float): + + with torch.no_grad(): + + buckets = [0 for _ in range(11)] + + eval_loss = 0 + + metric = MetricRegression("Evaluation") + + lines: List[str] = [] + + max_dist_plus_one = max([len(sent) for sent in sentences]) - 1 + + num_occurences = [0 for _ in range(max_dist_plus_one)] + + cumulated_values = [0 for _ in range(max_dist_plus_one)] + + for sentence in sentences: + + if len(sentence) < 2: # we need at least 2 words per sentence + continue + + scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) + + predictions = scores.tolist() + + # gold labels + true_values_for_sentence = [] + numberOfPairs = 0 + numberOfWords = len(sentence) + lines.append(sentence.to_tokenized_string() + '\n') + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + true_dist = j - i - 1 + pred = predictions[numberOfPairs] + + true_values_for_sentence.append(true_dist) + + # for output text file + eval_line = f"({i},{j})\t{true_dist}\t{pred:.2f}\n" + lines.append(eval_line) + + # for buckets + error = abs(true_dist - pred) + if error >= 10: + buckets[10] += 1 + else: + buckets[floor(error)] += 1 + + # for average prediction + num_occurences[true_dist] += 1 + cumulated_values[true_dist] += pred + + numberOfPairs += 1 + + eval_loss += loss / numberOfPairs + + metric.true.extend(true_values_for_sentence) + metric.pred.extend(predictions) + + store_embeddings(sentence, embedding_storage_mode) + + eval_loss /= len(sentences) # w.r.t self.loss + + # add some statistics to the output + eval_line = f"Number of Sentences: {len(sentences)}\nBuckets:\n | 0-1 | 1-2 | 2-3 | 3-4 | 
4-5 | 5-6 | 6-7 | 7-8 | 8-9 | 9-10 | >10 |\n" + lines.append(eval_line) + eval_line = "| {} | {} | {} | {} | {} | {} | {} | {} | {} | {} | {} |".format(buckets[0], buckets[1], + buckets[2], buckets[3], + buckets[4], buckets[5], + buckets[6], buckets[7], + buckets[8], buckets[9], + buckets[10]) + lines.append(eval_line) + lines.append("\nAverage predicted values per distance:\n") + eval_line = "" + for i in range(max_dist_plus_one): + eval_line += str(i) + ": " + f"{cumulated_values[i] / num_occurences[i]:.2f}" + " " + if i != 0 and i % 15 == 0: + eval_line += "\n" + + lines.append(eval_line) + + if out_path is not None: + with open(out_path, "w", encoding="utf-8") as outfile: + outfile.write("".join(lines)) + + log_line = f"{metric.mean_squared_error()}\t{metric.spearmanr()}\t{metric.pearsonr()}" + log_header = "MSE\tSPEARMAN\tPEARSON" + + detailed_result = ( + f"AVG: mse: {metric.mean_squared_error():.4f} - " + f"mae: {metric.mean_absolute_error():.4f} - " + f"pearson: {metric.pearsonr():.4f} - " + f"spearman: {metric.spearmanr():.4f}" + ) + + result: Result = Result( + metric.pearsonr(), log_header, log_line, detailed_result + ) + + return result, eval_loss + + def evaluate_classification( + self, + sentences: Union[List[DataPoint], Dataset], + out_path: Union[str, Path] = None, + embedding_storage_mode: str = "none", + ) -> (Result, float): + + # use scikit-learn to evaluate + y_true = [] + y_pred = [] + + with torch.no_grad(): + eval_loss = 0 + + lines: List[str] = [] + # we iterate over each sentence, instead of batches + for sentence in sentences: + + if len(sentence) < 2: # we need at least 2 words per sentence + continue + + scores, loss = self._forward_scores_and_loss(sentence, return_loss=True) + + # get single labels from scores + predictions = [self._get_single_label(s) for s in scores] + + # gold labels + true_values_for_sentence = [] + numberOfPairs = 0 + numberOfWords = len(sentence) + lines.append(sentence.to_tokenized_string() + '\n') + for i in range(numberOfWords): + for j in range(i + 1, min(i + self.max_distance + 2, numberOfWords)): + true_values_for_sentence.append(j - i - 1) + + # for output text file + eval_line = "({},{})\t{}\t{}\n".format(i, j, j - i - 1, predictions[numberOfPairs]) + lines.append(eval_line) + + numberOfPairs += 1 + + eval_loss += loss / numberOfPairs # add average loss of word pairs + + for prediction_for_sentence, true_value_for_sentence in zip( + predictions, true_values_for_sentence + ): + # hot one vector of true value + y_true_instance = np.zeros(self.max_distance + 1, dtype=int) + y_true_instance[true_value_for_sentence] = 1 + y_true.append(y_true_instance.tolist()) + + # hot one vector of predicted value + y_pred_instance = np.zeros(self.max_distance + 1, dtype=int) + y_pred_instance[prediction_for_sentence] = 1 + y_pred.append(y_pred_instance.tolist()) + + # speichert embeddings, falls embedding_storage!= 'None' + store_embeddings(sentence, embedding_storage_mode) + + if out_path is not None: + with open(out_path, "w", encoding="utf-8") as outfile: + outfile.write("".join(lines)) + + # make "classification report" + target_names = [] # liste aller labels, ins unserem Fall + for i in range(self.max_distance + 1): + target_names.append(str(i)) + classification_report = metrics.classification_report(y_true, y_pred, digits=4, + target_names=target_names, zero_division=0) + + # get scores + micro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), + 4) + accuracy_score = 
round(metrics.accuracy_score(y_true, y_pred), 4) + macro_f_score = round(metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), + 4) + # precision_score = round(metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) + # recall_score = round(metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) + + detailed_result = ( + "\nResults:" + f"\n- F-score (micro) {micro_f_score}" + f"\n- F-score (macro) {macro_f_score}" + f"\n- Accuracy {accuracy_score}" + '\n\nBy class:\n' + classification_report + ) + + # line for log file + log_header = "ACCURACY" + log_line = f"\t{accuracy_score}" + + result = Result( + main_score=micro_f_score, + log_line=log_line, + log_header=log_header, + detailed_results=detailed_result, + ) + + eval_loss /= len(sentences) + + return result, eval_loss + + @staticmethod + def _filter_empty_sentences(sentences: List[Sentence]) -> List[Sentence]: + filtered_sentences = [sentence for sentence in sentences if sentence.tokens] + if len(sentences) != len(filtered_sentences): + log.warning( + "Ignore {} sentence(s) with no tokens.".format( + len(sentences) - len(filtered_sentences) + ) + ) + return filtered_sentences + + def _obtain_labels( + self, scores: List[List[float]], predict_prob: bool = False + ) -> List[List[Label]]: + """ + Predicts the labels of sentences. + :param scores: the prediction scores from the model + :return: list of predicted labels + """ + + if predict_prob: + return [self._predict_label_prob(s) for s in scores] + + return [self._get_single_label(s) for s in scores] + + def _get_single_label(self, label_scores): # -> List[Label]: + softmax = torch.nn.functional.softmax(label_scores, dim=0) + conf, idx = torch.max(softmax, 0) + + return idx.item() + + def _predict_label_prob(self, label_scores) -> List[Label]: + softmax = torch.nn.functional.softmax(label_scores, dim=0) + label_probs = [] + for idx, conf in enumerate(softmax): + label_probs.append(Label(idx, conf.item())) + return label_probs + + def __str__(self): + return super(flair.nn.Model, self).__str__().rstrip(')') + \ + f' (beta): {self.beta}\n' + \ + f' (loss_max_weight): {self.loss_max_weight}\n' + \ + f' (max_distance) {self.max_distance}\n)' +
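Since the new diagnostic model is added as a standalone file and not exposed via `flair/models/__init__.py`, here is a rough training sketch for it. The corpus, embedding choice and output path are placeholders, and the import path simply mirrors the file location added above; note `mini_batch_size=1`, as required by the class docstring.

```python
from flair.datasets import UD_ENGLISH
from flair.embeddings import WordEmbeddings
from flair.models.diagnosis.distance_prediction_model import DistancePredictor
from flair.trainers import ModelTrainer

# any corpus works, since the gold distances are derived from token positions alone
corpus = UD_ENGLISH()

# classification variant: predicts 0..max_distance words between each word pair
predictor = DistancePredictor(word_embeddings=WordEmbeddings('glove'), max_distance=20)

# the model processes one sentence at a time, so the mini-batch size must be 1
trainer = ModelTrainer(predictor, corpus)
trainer.train('resources/diagnosis/distance-predictor', mini_batch_size=1, max_epochs=3)
```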