Skip to content

Commit

Permalink
Added WMT20 newstest (#109)
Browse files Browse the repository at this point in the history
* Added WMT20 newstest (#103)

* updated CHANGELOG and README

Co-authored-by: Ozan Caglayan <ozancag@gmail.com>
  • Loading branch information
mjpost and ozancaglayan authored Jul 30, 2020
1 parent b4864c3 commit abfbf38
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 1 deletion.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# VERSION HISTORY

- 1.4.13 (2020-07-30)
- Added WMT20 newstest test sets (#103)
- Make mecab-python3 an extra dependency and adapt the code to the new mecab-python3.
This fixes the recent Windows installation issues as well (#104).
Japanese support should now be installed explicitly through the sacrebleu[ja] package.
- Fix return type annotation of corpus_bleu()
- Improve sentence_score's documentation, do not allow single ref string (#98)

- 1.4.12 (2020-07-03)
- Fix a deployment bug (#96)

Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ Install the Python module (Python 3 only)

pip3 install sacrebleu

To install Japanese tokenizer support through `mecab-python3`, run the
following command instead, which performs a full installation with its dependencies:

pip3 install sacrebleu[ja]

Alternatively, you can install from source:

python3 setup.py install
Expand Down
2 changes: 1 addition & 1 deletion sacrebleu/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

__version__ = '1.4.12'
__version__ = '1.4.13'
__description__ = 'Hassle-free computation of shareable, comparable, and reproducible BLEU scores'


Expand Down
62 changes: 62 additions & 0 deletions sacrebleu/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,43 @@
# Many of these are *.sgm files, which are processed to produce plain text that can be used by this script.
# The canonical location of unpacked, processed data is $SACREBLEU_DIR/$TEST/$SOURCE-$TARGET.{$SOURCE,$TARGET}
DATASETS = {
"wmt20/tworefs": {
'data': ['http://data.statmt.org/wmt20/translation-task/test.tgz'],
'description': 'WMT20 news test sets with two references',
'md5': ['3b1f777cfd2fb15ccf66e9bfdb2b1699'],
'de-en': ['sgm/newstest2020-deen-src.de.sgm', 'sgm/newstest2020-deen-ref.en.sgm', 'sgm/newstestB2020-deen-ref.en.sgm'],
'en-de': ['sgm/newstest2020-ende-src.en.sgm', 'sgm/newstest2020-ende-ref.de.sgm', 'sgm/newstestB2020-ende-ref.de.sgm'],
'en-zh': ['sgm/newstest2020-enzh-src.en.sgm', 'sgm/newstest2020-enzh-ref.zh.sgm', 'sgm/newstestB2020-enzh-ref.zh.sgm'],
'ru-en': ['sgm/newstest2020-ruen-src.ru.sgm', 'sgm/newstest2020-ruen-ref.en.sgm', 'sgm/newstestB2020-ruen-ref.en.sgm'],
'zh-en': ['sgm/newstest2020-zhen-src.zh.sgm', 'sgm/newstest2020-zhen-ref.en.sgm', 'sgm/newstestB2020-zhen-ref.en.sgm'],
},
"wmt20": {
'data': ['http://data.statmt.org/wmt20/translation-task/test.tgz'],
'description': 'Official evaluation data for WMT20',
'md5': ['3b1f777cfd2fb15ccf66e9bfdb2b1699'],
'cs-en': ['sgm/newstest2020-csen-src.cs.sgm', 'sgm/newstest2020-csen-ref.en.sgm'],
'de-en': ['sgm/newstest2020-deen-src.de.sgm', 'sgm/newstest2020-deen-ref.en.sgm'],
'de-fr': ['sgm/newstest2020-defr-src.de.sgm', 'sgm/newstest2020-defr-ref.fr.sgm'],
'en-cs': ['sgm/newstest2020-encs-src.en.sgm', 'sgm/newstest2020-encs-ref.cs.sgm'],
'en-de': ['sgm/newstest2020-ende-src.en.sgm', 'sgm/newstest2020-ende-ref.de.sgm'],
'en-iu': ['sgm/newstest2020-eniu-src.en.sgm', 'sgm/newstest2020-eniu-ref.iu.sgm'],
'en-ja': ['sgm/newstest2020-enja-src.en.sgm', 'sgm/newstest2020-enja-ref.ja.sgm'],
'en-km': ['sgm/newstest2020-enkm-src.en.sgm', 'sgm/newstest2020-enkm-ref.km.sgm'],
'en-pl': ['sgm/newstest2020-enpl-src.en.sgm', 'sgm/newstest2020-enpl-ref.pl.sgm'],
'en-ps': ['sgm/newstest2020-enps-src.en.sgm', 'sgm/newstest2020-enps-ref.ps.sgm'],
'en-ru': ['sgm/newstest2020-enru-src.en.sgm', 'sgm/newstest2020-enru-ref.ru.sgm'],
'en-ta': ['sgm/newstest2020-enta-src.en.sgm', 'sgm/newstest2020-enta-ref.ta.sgm'],
'en-zh': ['sgm/newstest2020-enzh-src.en.sgm', 'sgm/newstest2020-enzh-ref.zh.sgm'],
'fr-de': ['sgm/newstest2020-frde-src.fr.sgm', 'sgm/newstest2020-frde-ref.de.sgm'],
'iu-en': ['sgm/newstest2020-iuen-src.iu.sgm', 'sgm/newstest2020-iuen-ref.en.sgm'],
'ja-en': ['sgm/newstest2020-jaen-src.ja.sgm', 'sgm/newstest2020-jaen-ref.en.sgm'],
'km-en': ['sgm/newstest2020-kmen-src.km.sgm', 'sgm/newstest2020-kmen-ref.en.sgm'],
'pl-en': ['sgm/newstest2020-plen-src.pl.sgm', 'sgm/newstest2020-plen-ref.en.sgm'],
'ps-en': ['sgm/newstest2020-psen-src.ps.sgm', 'sgm/newstest2020-psen-ref.en.sgm'],
'ru-en': ['sgm/newstest2020-ruen-src.ru.sgm', 'sgm/newstest2020-ruen-ref.en.sgm'],
'ta-en': ['sgm/newstest2020-taen-src.ta.sgm', 'sgm/newstest2020-taen-ref.en.sgm'],
'zh-en': ['sgm/newstest2020-zhen-src.zh.sgm', 'sgm/newstest2020-zhen-ref.en.sgm'],
},
'mtnt2019': {
'data': ['http://www.cs.cmu.edu/~pmichel1/hosting/MTNT2019.tar.gz'],
'description': 'Test set for the WMT 19 robustness shared task',
Expand Down Expand Up @@ -78,6 +115,31 @@
'data': ['http://data.statmt.org/wmt19/translation-task/test.tgz'],
'description': 'Official evaluation data.',
'md5': ['84de7162d158e28403103b01aeefc39a'],
'citation': r"""@proceedings{ws-2019-machine,
title = "Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)",
editor = "Bojar, Ond{\v{r}}ej and
Chatterjee, Rajen and
Federmann, Christian and
Fishel, Mark and
Graham, Yvette and
Haddow, Barry and
Huck, Matthias and
Yepes, Antonio Jimeno and
Koehn, Philipp and
Martins, Andr{\'e} and
Monz, Christof and
Negri, Matteo and
N{\'e}v{\'e}ol, Aur{\'e}lie and
Neves, Mariana and
Post, Matt and
Turchi, Marco and
Verspoor, Karin",
month = aug,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/W19-5200",
}""",
'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm'],
'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm'],
'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm'],
Expand Down

0 comments on commit abfbf38

Please sign in to comment.