Merge pull request #1740 from flairNLP/GH-1739-bext-release

GH-1739: prepare release 0.5.1
flairNLP · Jul 5, 2020 · 13f5e8d · 13f5e8d
2 parents ddd7a16 + dbfea9c
commit 13f5e8d
Show file tree

Hide file tree

Showing 10 changed files with 36 additions and 58 deletions.
diff --git a/README.md b/README.md
@@ -25,7 +25,7 @@ document embeddings, including our proposed **[Flair embeddings](https://drive.g
 * **A PyTorch NLP framework.** Our framework builds directly on [PyTorch](https://pytorch.org/), making it easy to 
 train your own models and experiment with new approaches using Flair embeddings and classes.
 
-Now at [version 0.5](https://github.com/flairNLP/flair/releases)!
+Now at [version 0.5.1](https://github.com/flairNLP/flair/releases)!
 
 ## Comparison with State-of-the-Art
 
@@ -196,18 +196,6 @@ To also run slow tests, such as loading and using the embeddings provided by fla
 pytest --runslow tests/
 ```
 
-### Code Style
-
-To ensure a standardized code style we use the formatter [black](https://github.com/ambv/black).
-If your code is not formatted properly, travis will fail to build.
-
-If you want to automatically format your code on every commit, you can use [pre-commit](https://pre-commit.com/).
-Just install it via `pip install pre-commit` and execute `pre-commit install` in the root folder.
-This will add a hook to the repository, which reformats files on every commit.
-
-If you want to set it up manually, install black via `pip install black`.
-To reformat files execute `black .`.
-
 ## [License](/LICENSE)
 
 The MIT License (MIT)

diff --git a/flair/__init__.py b/flair/__init__.py
@@ -24,7 +24,7 @@
 
 import logging.config
 
-__version__ = "0.5"
+__version__ = "0.5.1"
 
 logging.config.dictConfig(
     {

diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py
@@ -113,7 +113,9 @@ def _add_embeddings_to_sentences(self, sentences: List[Sentence]):
                 # tokenize and truncate to 512 subtokens (TODO: check better truncation strategies)
                 subtokenized_sentence = self.tokenizer.encode(sentence.to_tokenized_string(),
                                                               add_special_tokens=True,
-                                                              max_length=512)
+                                                              max_length=512,
+                                                              truncation=True,
+                                                              )
                 subtokenized_sentences.append(
                     torch.tensor(subtokenized_sentence, dtype=torch.long, device=flair.device))
 

diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py
@@ -962,7 +962,9 @@ def _add_embeddings_to_sentences(self, sentences: List[Sentence]):
                 encoded_inputs = self.tokenizer.encode_plus(subtoken_ids_sentence,
                                                             max_length=self.max_subtokens_sequence_length,
                                                             stride=self.stride,
-                                                            return_overflowing_tokens=self.allow_long_sentences)
+                                                            return_overflowing_tokens=self.allow_long_sentences,
+                                                            truncation=True,
+                                                            )
 
                 subtoken_ids_split_sentence = encoded_inputs['input_ids']
                 subtokenized_sentences.append(torch.tensor(subtoken_ids_split_sentence, dtype=torch.long))

diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py
@@ -1128,7 +1128,7 @@ def _fetch_model(model_name) -> str:
             [aws_resource_path_v04, "release-fr-ner-0", "fr-ner-wikiner-0.4.pt"]
         )
         model_map["nl-ner"] = "/".join(
-            [hu_path, "dutch-ner_0", "nl-ner-bert-conll02-v0.5.pt"]
+            [hu_path, "dutch-ner_0", "nl-ner-bert-conll02-v0.5b.pt"]
         )
         model_map["nl-ner-rnn"] = "/".join(
             [hu_path, "dutch-ner-flair-0", "nl-ner-conll02-v0.5.pt"]

diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py
@@ -476,7 +476,7 @@ def _fetch_model(model_name) -> str:
 
         #Communicative Functions Model
         model_map["communicative-functions"] = "/".join(
-            [hu_path, "communicative-functions", "communicative-functions.pt"]
+            [hu_path, "comfunc", "communicative-functions-v0.5b.pt"]
         )
 
         cache_dir = Path("models")

diff --git a/resources/docs/TUTORIAL_2_TAGGING.md b/resources/docs/TUTORIAL_2_TAGGING.md
@@ -84,6 +84,7 @@ are provided:
 | 'chunk' |  Syntactic Chunking   |  Conll-2000     |  **96.47** (F1) |
 | 'pos' |  Part-of-Speech Tagging (fine-grained) |  Ontonotes     |  **98.19** (Accuracy) |
 | 'upos' |  Part-of-Speech Tagging (universal) |  Ontonotes     |  **98.6** (Accuracy) |
+| 'keyphrase' |  Methods and materials in science papers (BETA) |  Semeval2017   |   |
 | 'frame'  |   Semantic Frame Detection |  Propbank 3.0     |  **97.54** (F1) |
 
 
@@ -143,7 +144,8 @@ Thanks to our contributors we are also able to distribute a couple of models for
 | ID | Task | Training Dataset | Accuracy | Contributor |
 | -------------    | ------------- |------------- |------------- |------------- |
 | 'fr-ner' | Named Entity Recognition |  [WikiNER (aij-wikiner-fr-wp3)](https://github.com/dice-group/FOX/tree/master/input/Wikiner)  |  **95.57** (F1) | [mhham](https://github.com/mhham) |
-| 'nl-ner' | Named Entity Recognition |  [CoNLL 2002](https://www.clips.uantwerpen.be/conll2002/ner/)  |  **89.56** (F1) | [stefan-it](https://github.com/stefan-it/flair-experiments/tree/master/conll2002-ner-dutch) |
+| 'nl-ner' | Named Entity Recognition |  [CoNLL 2002](https://www.clips.uantwerpen.be/conll2002/ner/)  |  **92.58** (F1) |  |
+| 'nl-ner-rnn' | Named Entity Recognition |  [CoNLL 2002](https://www.clips.uantwerpen.be/conll2002/ner/)  |  **90.79** (F1) | |
 | 'da-ner' | Named Entity Recognition |  [Danish NER dataset](https://github.com/alexandrainst/danlp)  |   | [AmaliePauli](https://github.com/AmaliePauli) |
 | 'da-pos' | Part-of-Speech Tagging |  [Danish Dependency Treebank](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md)  |  | [AmaliePauli](https://github.com/AmaliePauli) |
 | 'ml-pos' | Part-of-Speech Tagging (fine-grained) |  30000 Malayalam sentences  | **83** | [sabiqueqb](https://github.com/sabiqueqb) |
@@ -273,7 +275,7 @@ tagger. Depending on your resources, you might want to play around with this par
 ## Tagging with Pre-Trained Text Classification Models
 
 Let's use a pre-trained model for detecting positive or negative comments.
-This model was trained over the [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/) dataset and can recognize positive
+This model was trained over a mix of product and movie review datasets and can recognize positive
 and negative sentiment in English text.
 
 ```python
@@ -315,6 +317,7 @@ are provided:
 | ------------- | ---- | ------------- |------------- |------------- |
 | 'sentiment' | English | detecting positive and negative sentiment (transformer-based) | movie and product reviews |  **98.87** |
 | 'sentiment-fast' | English | detecting positive and negative sentiment (RNN-based) | movie and product reviews |  **96.83**|
+| 'communicative-functions' | English | detecting function of sentence in research paper (BETA) | scholarly papers |  |
 | 'de-offensive-language' | German | detecting offensive language | [GermEval 2018 Task 1](https://projects.fzai.h-da.de/iggsa/projekt/) |  **75.71** (Macro F1) |
 
 

diff --git a/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md b/resources/docs/TUTORIAL_5_DOCUMENT_EMBEDDINGS.md
@@ -10,39 +10,15 @@ library and how [word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md)
 
 All document embedding classes inherit from the `DocumentEmbeddings` class and implement the `embed()` method which you
 need to call to embed your text. This means that for most users of Flair, the complexity of different embeddings remains
-hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text.
+hidden behind this interface. 
 
-All embeddings produced with our methods are PyTorch vectors, so they can be immediately used for training and
-fine-tuning. There are four main document embeddings in Flair: (1) `DocumentPoolEmbeddings` that simply do an average over all word embeddings in the sentence, (2) `DocumentRNNEmbeddings` that train an RNN over all word embeddings in a sentence, and (3) `TransformerDocumentEmbeddings` / (4) `SentenceTransformerDocumentEmbeddings` that both use a pre-trained transformer: 
+There are four main document embeddings in Flair:
+ 1. `DocumentPoolEmbeddings` that simply do an average over all word embeddings in the sentence,
+ 2. `DocumentRNNEmbeddings` that train an RNN over all word embeddings in a sentence,
+ 3. `TransformerDocumentEmbeddings` that use pre-trained transformers and are **recommended** for most text classification tasks
+ 4. `SentenceTransformerDocumentEmbeddings` that use pre-trained transformers and are **recommended** if you need a good vector representation of a sentence
 
-```python
-from flair.embeddings import TransformerDocumentEmbeddings, DocumentPoolEmbeddings, DocumentRNNEmbeddings, SentenceTransformerDocumentEmbeddings
-
-# document embedding is a mean over GloVe word embeddings
-pooled_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')], pooling='mean')
-
-# document embedding is an LSTM over GloVe word embeddings
-lstm_embeddings = DocumentRNNEmbeddings([WordEmbeddings('glove')], rnn_type='lstm')
-
-# document embedding is a pre-trained transformer
-transformer_embeddings = TransformerDocumentEmbeddings('bert-base-uncased')
-
-# document embedding is a pre-trained transformer
-sent_transformer_embeddings = SentenceTransformerDocumentEmbeddings('bert-base-nli-mean-tokens')
-```
-
-Simply call embed() to embed your sentence with one of these three options: 
-
-```python
-# example sentence
-sentence = Sentence('The grass is green.')
-
-# embed with pooled embeddings
-pooled_embeddings.embed(sentence)
-
-# print embedding for whole sentence
-print(sentence.embedding)
-```
+Initialize one of these four options and call `embed()` to embed your sentence. 
 
 We give details on all four document embeddings in the following:
 
@@ -79,7 +55,7 @@ print(sentence.embedding)
 
 This prints out the embedding of the document. Since the document embedding is derived from word embeddings, its dimensionality depends on the dimensionality of word embeddings you are using. For more details on these embeddings, check [here](/resources/docs/embeddings/DOCUMENT_POOL_EMBEDDINGS.md). 
 
-One advantage of DocumentPoolEmbeddings is that they do not need to be trained, you can immediately use them to embed your documents. 
+One advantage of `DocumentPoolEmbeddings` is that they do not need to be trained: you can immediately use them to embed your documents. 
 
 ## Document RNN Embeddings
 
@@ -147,7 +123,9 @@ embedding.embed(sentence)
 
 ## SentenceTransformerDocumentEmbeddings
 
-You can also get several embeddings from the [`sentence-transformer`](https://github.com/UKPLab/sentence-transformers) library. The procedure is similar to the TransformerDocumentEmbeddings class:  
+You can also get several embeddings from 
+the [`sentence-transformer`](https://github.com/UKPLab/sentence-transformers) library. 
+These models are pre-trained to give good general-purpose vector representations for sentences. 
 
 ```python
 from flair.data import Sentence
@@ -165,6 +143,10 @@ embedding.embed(sentence)
 
 You can find a full list of their pretrained models [here](https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0).
 
+**Note**: To use this embedding, you need to install `sentence-transformers` 
+with `pip install sentence-transformers`. This library currently requires an older version of `transformers`, 
+so installing it will uninstall the latest `transformers`, causing other transformer embeddings to break.
+
 ## Next
 
 You can now either look into the tutorial about [loading your corpus](/resources/docs/TUTORIAL_6_CORPUS.md), which

diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md
@@ -161,6 +161,10 @@ data the first time you call the corresponding constructor ID. The following dat
 | 'CONLL_03_DUTCH' | Dutch  |  [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER |
 | 'CONLL_03_SPANISH' | Spanish  |  [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER |
 | 'DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | 
+| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | 
+| 'NER_BASQUE' | Basque  |  [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) |
+| 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | 
+| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER |
 | 'WNUT_17' | English  |  [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection |
 | 'WIKINER_ENGLISH' | English  |  [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
 | 'WIKINER_GERMAN'  | German  |  [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
@@ -170,10 +174,6 @@ data the first time you call the corresponding constructor ID. The following dat
 | 'WIKINER_PORTUGUESE' | Portuguese  |  [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
 | 'WIKINER_POLISH' | Polish  |  [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
 | 'WIKINER_RUSSIAN'  | Russian  |  [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia |
-| 'NER_BASQUE' | Basque  |  [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) |
-| 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | 
-| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER] (https://github.com/klintan/swedish-ner-corpus/) 4-class NER |
-| 'LER_GERMAN' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | 
 
 #### Universal Dependency Treebanks
 
@@ -223,6 +223,7 @@ data the first time you call the corresponding constructor ID. The following dat
 | ID(s) | Languages | Description |
 | -------------    | ------------- |------------- |
 | 'AMAZON_REVIEWS' | English |  [Amazon product reviews](https://nijianmo.github.io/amazon/index.html/) dataset with sentiment annotation |
+| 'COMMUNICATIVE_FUNCTIONS' | English |  [Communicative functions](https://github.com/Alab-NII/FECFevalDataset) of sentences in scholarly papers |
 | 'IMDB' | English |  [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/) dataset of movie reviews with sentiment annotation  |
 | 'NEWSGROUPS' | English | The popular [20 newsgroups](http://qwone.com/~jason/20Newsgroups/) classification dataset |
 | 'SENTIMENT_140' | English | [Tweets dataset](http://help.sentiment140.com/for-students/) with sentiment annotation |

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="flair",
-    version="0.5",
+    version="0.5.1",
     description="A very simple framework for state-of-the-art NLP",
     long_description=open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",