From 73d7b8503f1651b3e882abc9053c7c7caaa4229f Mon Sep 17 00:00:00 2001 From: morganmcg1 Date: Mon, 27 Jul 2020 21:28:00 +0100 Subject: [PATCH] adds best experiment from v0.2, exp4 --- ...F_paracrawl_pytorch_simplified_en-ga.ipynb | 3904 +++++++++++++++++ 1 file changed, 3904 insertions(+) create mode 100644 v0.2_exp4_HF_paracrawl_pytorch_simplified_en-ga.ipynb diff --git a/v0.2_exp4_HF_paracrawl_pytorch_simplified_en-ga.ipynb b/v0.2_exp4_HF_paracrawl_pytorch_simplified_en-ga.ipynb new file mode 100644 index 0000000..6d8322c --- /dev/null +++ b/v0.2_exp4_HF_paracrawl_pytorch_simplified_en-ga.ipynb @@ -0,0 +1,3904 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# v0.2 - Exp4\n", + "Continued 5e from Exp3" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from fastai2.text.all import *\n", + "from fastai2.callback.all import *\n", + "from fastai2.basics import *\n", + "import seaborn as sns\n", + "\n", + "from nlp import load_dataset\n", + "\n", + "from einops import rearrange\n", + "import gc\n", + "import csv" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "path = Path('data/irish/parallel_corpora/paracrawl')\n", + "fn = 'para_crawl_huggingface_clean_v02_20200723.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Experiment Number" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "exp = '4'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load saved dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "357399\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
engaclean
0Among the French PIM , in 2013, it is only 9 islands that have been chiroptérologiques inventories .I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail.True
1Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories.I measc na PIM Fraince , i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail .True
2Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories.I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail.True
3As you can see, so get to show off the spacious shapes in 3D (red and blue).Mar is féidir leat a fheiceáil, a fháil mar sin a thaispeáint as na cruthanna mhór i 3D (dearg agus gorm).True
4Equation Solving – Traditional, simpleLigningsløsning – Traidisiúnta, simplíTrue
\n", + "
" + ], + "text/plain": [ + " en \\\n", + "0 Among the French PIM , in 2013, it is only 9 islands that have been chiroptérologiques inventories . \n", + "1 Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories. \n", + "2 Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories. \n", + "3 As you can see, so get to show off the spacious shapes in 3D (red and blue). \n", + "4 Equation Solving – Traditional, simple \n", + "\n", + " ga \\\n", + "0 I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail. \n", + "1 I measc na PIM Fraince , i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail . \n", + "2 I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail. \n", + "3 Mar is féidir leat a fheiceáil, a fháil mar sin a thaispeáint as na cruthanna mhór i 3D (dearg agus gorm). \n", + "4 Ligningsløsning – Traidisiúnta, simplí \n", + "\n", + " clean \n", + "0 True \n", + "1 True \n", + "2 True \n", + "3 True \n", + "4 True " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df=pd.read_csv(path/fn)\n", + "print(len(df))\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAISE BUG" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "path = Path('data/irish/parallel_corpora/paracrawl')\n", + "fn = 'para_crawl_huggingface_clean_v02_20200723.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using custom data configuration default\n" + ] + } + ], + "source": [ + "# ds_dict = load_dataset('csv', data_files=str(path/fn),\n", + "# description='en-ga Paracrawl data from HuggingFace, clean for suspect translations',\n", + "# 
download_mode='force_redownload', version='0.0.2')\n", + "ds_dict = load_dataset('csv', data_files=str(path/fn),\n", + " description='en-ga Paracrawl data from HuggingFace, clean for suspect translations',\n", + " version='0.0.2')\n", + "ds=ds_dict['train']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Remove rows that were identified as noisy" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "357399\n", + "355837\n" + ] + } + ], + "source": [ + "def is_clean(example): return example['clean']\n", + "print(len(ds))\n", + "ds = ds.filter(is_clean)\n", + "print(len(ds))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pre-processing\n", + "\n", + "**Remove long texts to make things easier**" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# df['ga_len'] = df['ga'].str.split().str.len()\n", + "# df['en_len'] = df['en'].str.split().str.len()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get sample lengths" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(schema: {'en': 'string', 'ga': 'string', 'clean': 'bool', 'ga_len': 'int64', 'en_len': 'int64'}, num_rows: 355837)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def get_lens(example, lang):\n", + " example[f'{lang}_len'] = len(example[lang].split())\n", + " return example\n", + "\n", + "ds = ds.map(partial(get_lens, lang='ga'))\n", + "ds = ds.map(partial(get_lens, lang='en'))\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(49.0, 47.0)" + ] + }, + "execution_count": 10, 
+ "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Word count 90th percentile\n", + "np.percentile([o for o in ds['ga_len']], 90), np.percentile([o for o in ds['en_len']], 90)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(, 18.0)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.distplot(ds['ga_len']), np.median(ds['ga_len'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(, 17.0)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.distplot(ds['en_len']), np.median(ds['en_len'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "355837\n", + "334244\n" + ] + } + ], + "source": [ + "def len_filter(example,col,l): \n", + " return example[col] <= l\n", + "\n", + "print(len(ds))\n", + "ds = ds.filter(partial(len_filter, col='ga_len', l=60))\n", + "ds = ds.filter(partial(len_filter, col='en_len', l=60))\n", + "print(len(ds))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# print(f'Removing {len(df.query(\"en_len > 60\"))} EN samples where len was > 60')\n", + "# print(len(df))\n", + "# df=df[~df.index.isin(df.query(\"en_len > 60\").index)]\n", + "# print(len(df))\n", + " \n", + "# print(f'Removing {len(df.query(\"ga_len > 60\"))} FR samples where len was > 60')\n", + "# print(len(df))\n", + "# df=df[~df.index.isin(df.query(\"ga_len > 60\").index)]\n", + "# print(len(df))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(, 17.0)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.distplot(ds['ga_len']), np.median(ds['ga_len'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Lowercase everything**\n", + "\n", + "NOT done, as it is applied in the rules below" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# df['en'] = df['en'].apply(lambda x:x.lower())\n", + "# df['ga'] = df['ga'].apply(lambda x:x.lower())" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# def lowercase_all(example, lang):\n", + "# example[lang] = example[lang].lower()\n", + "# return example\n", + "\n", + "# ds = ds.map(partial(lowercase_all, lang='ga'))\n", + "# ds = ds.map(partial(lowercase_all, lang='en'))\n", + "# ds['ga'][400:420]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Rules used as part of tokenization" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " functools.partial(, add_eos=True)]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "proc_rules=defaults.text_proc_rules[:-1] + [partial(lowercase, add_eos=True)]\n", + "proc_rules" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get Dataloaders" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load vocab to speed up data loading" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
engacleanga_lenen_len
0Among the French PIM , in 2013, it is only 9 islands that have been chiroptérologiques inventories .I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail.True1618
1Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories.I measc na PIM Fraince , i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail .True1816
2Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories.I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail.True1616
3As you can see, so get to show off the spacious shapes in 3D (red and blue).Mar is féidir leat a fheiceáil, a fháil mar sin a thaispeáint as na cruthanna mhór i 3D (dearg agus gorm).True2117
4Equation Solving – Traditional, simpleLigningsløsning – Traidisiúnta, simplíTrue45
\n", + "
" + ], + "text/plain": [ + " en \\\n", + "0 Among the French PIM , in 2013, it is only 9 islands that have been chiroptérologiques inventories . \n", + "1 Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories. \n", + "2 Among the French PIM, in 2013, it is only 9 islands that have been chiroptérologiques inventories. \n", + "3 As you can see, so get to show off the spacious shapes in 3D (red and blue). \n", + "4 Equation Solving – Traditional, simple \n", + "\n", + " ga \\\n", + "0 I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail. \n", + "1 I measc na PIM Fraince , i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail . \n", + "2 I measc na PIM Fraince, i 2013, tá sé ach 9 oileáin a bhí chiroptérologiques fardail. \n", + "3 Mar is féidir leat a fheiceáil, a fháil mar sin a thaispeáint as na cruthanna mhór i 3D (dearg agus gorm). \n", + "4 Ligningsløsning – Traidisiúnta, simplí \n", + "\n", + " clean ga_len en_len \n", + "0 True 16 18 \n", + "1 True 18 16 \n", + "2 True 16 16 \n", + "3 True 21 17 \n", + "4 True 4 5 " + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df=ds.data.to_pandas()\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "#df = df.iloc[:1000]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hi\n" + ] + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hi\n" + ] + } + ], + "source": [ + "# at 30k tokens per vocab sometimes this works, 
sometimes it doesn't\n", + "\n", + "# Couldnt process 30k tokens until I added the 'hi' below, it was getting stuck at 94.87%, no idea why\n", + "@Numericalize\n", + "def encodes(self, o): \n", + " print('hi')\n", + " return TensorText(tensor([self.o2i [o_] for o_ in o]))\n", + "\n", + "class floatify_tfm(Transform):\n", + " def encodes(self,o): return o.float()\n", + " def decodes(self,o): return o.long()\n", + "\n", + "max_vocab=30000\n", + "#splits = ColSplitter()(df) \n", + "splits = RandomSplitter(valid_pct=0.2, seed=42)(df)\n", + "\n", + "tfms = [[Tokenizer.from_df(text_cols='en' , rules=proc_rules), attrgetter(\"text\"), Numericalize(max_vocab=max_vocab)], \n", + " [Tokenizer.from_df(text_cols='ga', lang='ga', rules=proc_rules), attrgetter(\"text\"), Numericalize(max_vocab=max_vocab)]]\n", + "\n", + "dl = partial(SortedDL, shuffle=True, res=df.ga_len.values)\n", + "\n", + "dsets = Datasets(df, tfms, splits=splits, dl_type=dl)\n", + "\n", + "# remove the print from Numericalize\n", + "@Numericalize\n", + "def encodes(self, o): return TensorText(tensor([self.o2i [o_] for o_ in o]))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# en_vocab=[]\n", + "# ga_vocab=[]\n", + "# with open(f'paracrawl_vocab_en_exp{exp}.csv', newline='') as csvfile:\n", + "# v_reader = csv.reader(csvfile, delimiter=',')\n", + "# for row in v_reader:\n", + "# en_vocab.append(row[0])\n", + " \n", + "# with open(f'paracrawl_vocab_ga_exp{exp}.csv', newline='') as csvfile:\n", + "# v_reader = csv.reader(csvfile, delimiter=',')\n", + "# for row in v_reader:\n", + "# ga_vocab.append(row[0])\n", + " \n", + "# len(en_vocab), len(ga_vocab), en_vocab[:10], ga_vocab[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(334244,\n", + " ((#267396) [142788,143864,71475,232172,136743,17163,145516,163557,296155,19037...],\n", + " (#66848) 
[255414,148963,186606,40380,28590,6799,259930,188378,286014,4807...]),\n", + " 24,\n", + " 23,\n", + " (TensorText([ 2, 8, 1018, 9, 8, 2104, 7, 7176, 11, 16,\n", + " 821, 11, 50, 23, 189, 165, 3062, 32, 60, 92,\n", + " 19526, 8919, 14, 3]),\n", + " TensorText([ 2, 17, 757, 18, 7, 9046, 8, 2425, 10, 17,\n", + " 832, 10, 58, 43, 82, 194, 4016, 9, 135, 29410,\n", + " 7653, 14, 3])))" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dsets), splits, len(dsets[2][0]), len(dsets[2][1]), dsets[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
texttext_
0xxbos ( i ) the application of any goods of a kind specified in the xxmaj fourth xxmaj schedule by a person for the purposes of his business and treated as delivered in accordance with section 3 ( 1 ) ( e ) , xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( i ) le duine do bhaint úsáid chun críocha a ghnó as aon earraí de chineál a shonraítear sa xxmaj cheathrú xxmaj sceideal agus a áirítear mar earraí arna seachadadh de réir alt 3 ( 1 ) ( e ) , xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
1xxbos ( xxrep 3 i ) which is under the control of not more than 5 persons . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( xxrep 3 i ) ar líon daoine nach mó ná 5 dhuine a rialaíonn í . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
2xxbos xxmaj welcome to xxmaj holm new owners ! xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos xxmaj fáilte go dtí xxmaj holm úinéirí nua ! xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
3xxbos ( 5 ) xxmaj this section has effect notwithstanding — xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( 5 ) xxup tá éifeacht leis an alt seo d’ainneoin — xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
4xxbos xxmaj the height of risers 108 mm xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos xxup ní rachaidh airde risers 108 mm xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
5xxbos xxmaj state ownership of certain inland waters . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos xxmaj uiscí intire áirithe ar úinéireacht ag an xxmaj stát . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
6xxbos xxmaj entry year property levy . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos xxmaj tobhach maoine bliana iontrála . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
7xxbos ( e ) no councillor shall be a member of more than one group . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( d ) ní bheidh comhairleoir ina chomhalta de chuid is mó ná grúpa amháin . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
8xxbos xxmaj appointments by local authorities to certain committees and other bodies . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos xxmaj ceapacháin ag údaráis áitiúla chun coistí áirithe agus chun comhlachtaí eile . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bs,sl = 32, 512\n", + "dls = dsets.dataloaders(bs=bs, seq_len=sl, before_batch=partial(pad_input, pad_fields=[0,1]))\n", + "dls.show_batch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save vocab to speed up data loading" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# with open(f'models/paracrawl_vocab_en_v0.2_exp{exp}.csv', 'w', newline='') as csvfile:\n", + "# v_writer = csv.writer(csvfile, delimiter=',')\n", + "# for l in dls.vocab[0]:\n", + "# v_writer.writerow([l])\n", + " \n", + "# with open(f'models/paracrawl_vocab_ga_v0.2_exp{exp}.csv', 'w', newline='') as csvfile:\n", + "# v_writer = csv.writer(csvfile, delimiter=',')\n", + "# for l in dls.vocab[1]:\n", + "# v_writer.writerow([l])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(334244, 8356, 2089)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dls.train_ds)+len(dls.valid_ds), len(dls.train), len(dls.valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vocab lengths are : (22952, 30008)\n" + ] + } + ], + "source": [ + "print(f'Vocab lengths are : {len(dls.vocab[0]), len(dls.vocab[1])}')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(torch.Size([32, 58]),\n", + " torch.Size([32, 64]),\n", + " (TensorText([[ 2, 13, 57, ..., 1, 1, 1],\n", + " [ 2, 13, 18, ..., 1, 1, 1],\n", + " [ 2, 13, 57, ..., 1, 1, 1],\n", + " ...,\n", + " [ 2, 13, 595, ..., 1, 1, 1],\n", + " [ 2, 8, 9, ..., 1, 1, 1],\n", + " [ 2, 8, 
726, ..., 1, 1, 1]], device='cuda:0'),\n", + " TensorText([[ 2, 12, 17, ..., 1, 1, 1],\n", + " [ 2, 12, 9, ..., 1, 1, 1],\n", + " [ 2, 12, 17, ..., 1, 1, 1],\n", + " ...,\n", + " [ 2, 12, 602, ..., 1, 1, 1],\n", + " [ 2, 8, 2416, ..., 1, 1, 1],\n", + " [ 2, 8, 969, ..., 1, 1, 1]], device='cuda:0')))" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "o=dls.one_batch(); o[0].size(), o[1].size(), o" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Transformer model" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "class PositionalEncoding(nn.Module):\n", + " \"Encode the position with a sinusoid.\"\n", + " def __init__(self, d):\n", + " super().__init__()\n", + " self.register_buffer('freq', 1 / (10000 ** (torch.arange(0., d, 2.)/d)))\n", + " \n", + " def forward(self, pos):\n", + " #inp = torch.ger(pos, self.freq)\n", + " inp = torch.ger(pos, self.freq.float())\n", + " enc = torch.cat([inp.sin(), inp.cos()], dim=-1)\n", + " return enc" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# tst_encoding = PositionalEncoding(20)\n", + "# res = tst_encoding(torch.arange(0,100).float())\n", + "# _, ax = plt.subplots(1,1)\n", + "# for i in range(1,5): ax.plot(res[:,i])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "class TransformerEmbedding(nn.Module):\n", + " \"Embedding + positional encoding + dropout\"\n", + " def __init__(self, vocab_sz, emb_sz, inp_p=0.):\n", + " super().__init__()\n", + " self.emb_sz = emb_sz\n", + " self.embed = Embedding(vocab_sz, emb_sz)\n", + " self.pos_enc = PositionalEncoding(emb_sz)\n", + " self.drop = nn.Dropout(inp_p)\n", + " \n", + " def forward(self, inp): \n", + " #pos = torch.arange(0, inp.size(1), device=inp.device).float() \n", + " pos = torch.arange(0, 
inp.size(1), device=inp.device).float() \n", + " return self.drop(self.embed(inp) * math.sqrt(self.emb_sz) + self.pos_enc(pos))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## PyTorch Transformer Simple" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: [src/tgt/memory]_mask should be filled with float(‘-inf’) for the masked positions and float(0.0) else. These masks ensure that predictions for position i depend only on the unmasked positions j and are applied identically for each sequence in a batch. \n", + "\n", + "[src/tgt/memory]_key_padding_mask should be a ByteTensor where True values are positions that should be masked with float(‘-inf’) and False values will be unchanged. This mask ensures that no information will be taken from position i if it is masked, and has a separate mask for each sequence in a batch.\n", + "\n", + "attn mask with -inf\n", + "key_padding mask with True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### pt_Transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "def to_half(b):\n", + " \"Recursively map lists of tensors in `b ` to FP16.\"\n", + " return apply(lambda x: x.half() if torch.is_floating_point(x) else x, b)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "class pt_Transformer(Module):\n", + " def __init__(self, src_vcbsz, trg_vcbsz, n_enc_layers=6, n_dec_layers=6, n_heads=8, d_model=256, d_head=32, \n", + " d_inner=1024, p=0.1, bias=True, scale=True, double_drop=True, pad_idx=1, fp16=True):\n", + " self.pad_idx = pad_idx\n", + " self.enc_tfmr_emb = TransformerEmbedding(src_vcbsz, d_model, p)\n", + " self.dec_tfmr_emb = TransformerEmbedding(trg_vcbsz, d_model, 0.) 
\n", + " self.final = nn.Linear(d_model, trg_vcbsz)\n", + " self.fp16 = fp16\n", + " # !!!\n", + " #self.final.weight = self.dec_tfmr_emb.embed.weight # !! What does this do?\n", + " \n", + " self.transformer_model=torch.nn.Transformer(d_model=d_model, nhead=n_heads, num_encoder_layers=n_enc_layers, \n", + " num_decoder_layers=n_dec_layers, dim_feedforward=d_inner, dropout=p, \n", + " activation='relu', custom_encoder=None, custom_decoder=None)\n", + " \n", + " \n", + " def forward(self, src, trg, src_mask=None, tgt_mask=None, memory_mask=None, \n", + " src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):\n", + " \n", + " enc_emb, dec_emb = self.enc_tfmr_emb(src), self.dec_tfmr_emb(trg)\n", + " \n", + " # Test whether fp16 is being used or not\n", + " if not isinstance(model.transformer_model.encoder.layers[0].self_attn.out_proj.weight,\n", + " torch.cuda.FloatTensor):\n", + " enc_emb=to_half(enc_emb)\n", + " dec_emb=to_half(dec_emb)\n", + " \n", + " src_mask=self.transformer_model.generate_square_subsequent_mask(src.size(1)).cuda()\n", + " trg_mask=self.transformer_model.generate_square_subsequent_mask(trg.size(1)).cuda()\n", + " \n", + " dec_out = self.transformer_model(enc_emb.permute(1,0,2), dec_emb.permute(1,0,2),\n", + " src_mask=src_mask, tgt_mask=trg_mask, memory_mask=None, \n", + " src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None)\n", + " \n", + " out=self.final(dec_out)\n", + " \n", + " return out.permute(1,0,2)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# for name, param in model.transformer_model.encoder.layers[0].self_attn.out_proj.named_parameters():\n", + "# print (name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Metric" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "class 
CorpusBLEUMetric(Metric):\n", + " def __init__(self, vocab_sz=5000, axis=-1):\n", + " \"\"\"BLEU Metric calculated over the validation corpus\"\"\"\n", + " self.pred_len, self.targ_len, self.corrects, self.counts = 0,0,[0]*4,[0]*4\n", + " self.axis, self.vocab_sz = axis, vocab_sz\n", + " \n", + " def reset(self):\n", + " self.pred_len,self.targ_len,self.corrects,self.counts = 0,0,[0]*4,[0]*4\n", + " \n", + " class NGram():\n", + " def __init__(self, ngram, max_n=5000): self.ngram,self.max_n = ngram,max_n\n", + " def __eq__(self, other):\n", + " if len(self.ngram) != len(other.ngram): return False\n", + " return np.all(np.array(self.ngram) == np.array(other.ngram))\n", + " def __hash__(self): return int(sum([o * self.max_n**i for i,o in enumerate(self.ngram)]))\n", + " \n", + " def get_grams(self, x, n, max_n=5000):\n", + " return x if n==1 else [self.NGram(x[i:i+n], max_n=max_n) for i in range(len(x)-n+1)]\n", + " \n", + " def get_correct_ngrams(self, pred, targ, n, max_n=5000):\n", + " pred_grams,targ_grams = self.get_grams(pred, n, max_n=max_n),self.get_grams(targ, n, max_n=max_n)\n", + " pred_cnt,targ_cnt = Counter(pred_grams),Counter(targ_grams)\n", + " return sum([min(c, targ_cnt[g]) for g,c in pred_cnt.items()]),len(pred_grams)\n", + " \n", + " def accumulate(self, learn):\n", + " last_output = learn.pred.argmax(dim=self.axis)\n", + " last_target = learn.y\n", + " for pred,targ in zip(last_output.cpu().numpy(),last_target.cpu().numpy()):\n", + " self.pred_len += len(pred)\n", + " self.targ_len += len(targ)\n", + " smooth_mteval = 1\n", + " for i in range(4):\n", + " c,t = self.get_correct_ngrams(pred, targ, i+1, max_n=self.vocab_sz)\n", + "\n", + " if c == 0:\n", + " smooth_mteval *= 2\n", + " c = 1 / smooth_mteval # exp smoothing, method 3 from http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf\n", + " \n", + " self.corrects[i] += c\n", + " self.counts[i] += t\n", + " \n", + " @property\n", + " def value(self): \n", + " if self.counts == 0: return None\n", + 
" elif max(self.corrects) == 0: return 0.0\n", + " else:\n", + " precs = [c/t for c,t in zip(self.corrects,self.counts)]\n", + " len_penalty = math.exp(1 - self.targ_len/self.pred_len) if self.pred_len < self.targ_len else 1\n", + " return len_penalty * ((precs[0]*precs[1]*precs[2]*precs[3]) ** 0.25)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Callbacks\n", + "\n", + "#### Present Input and Target in a single tuple" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "class CombineInputOutputCallback(Callback):\n", + " '''Callback to combine the input and target text into self.xb'''\n", + " def __init__(self): pass\n", + " def begin_batch(self): \n", + " self.learn.xb = (self.xb[0], self.yb[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Shifting and masking of y, from [Annotated Transformer](http://nlp.seas.harvard.edu/2018/04/03/attention.html#training):\n", + "\n", + "> We also modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions. This masking, combined with fact that the output embeddings are offset by one position, ensures that the predictions for position i can depend only on the known outputs at positions less than i." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Shifting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Target shift/offset explained\n", + "\n", + "**Taken from [@bentrevett's brilliant github repo \"pytorch-seq2seq\" tutorials](https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb):**\n", + "\n", + "As we want our model to predict the token but not have it be an input into our model we simply slice the token off the end of the sequence. 
Thus:\n", + "\n", + "$$\\begin{align*}\\text{trg} &= [sos, x_1, x_2, x_3, eos]\\\\\\text{trg[:-1]} &= [sos, x_1, x_2, x_3]\\end{align*}$$\n", + "\n", + "$x_i$ denotes **actual** target sequence element. We then feed this into the model to get a predicted sequence that should hopefully predict the `<eos>` token:\n", + "\n", + "$$\\begin{align*}\n", + "\\text{output} &= [y_1, y_2, y_3, eos]\n", + "\\end{align*}$$\n", + "\n", + "$y_i$ denotes **predicted** target sequence element. We then calculate our loss using the original trg tensor with the `<sos>` token sliced off the front, leaving the `<eos>` token:\n", + "\n", + "$$\\begin{align*} \\text{output} &= [y_1, y_2, y_3, eos]\\\\ \\text{trg[1:]} &= [x_1, x_2, x_3, eos] \\end{align*}$$\n", + "\n", + "We then calculate our losses and update our parameters as is standard.\n", + " \n", + " \n", + "We don't want to punish the model for not translating the 'sos' token, but we do need it to predict/define the end of the sentence" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**RemoveEOSCallback** \n", + "\n", + "Cut the *EOS* token from the **output_x** presented to the model as we are trying to predict the next word. Therefore we don't want the model to try to predict anything after the *EOS* token. So the last token given to the model will be the token before *EOS*. 
This callback modifies the second element of our learn.xb, (which is the *copied* yb)\n", + "\n", + "But this should also ignore padding, as otherwise we'll be just cutting the last padding token and not the EOS" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "class RemoveEOSCallback(Callback):\n", + " '''\n", + " Shift the target presented to the model during training to remove the \"eos\" token as \n", + " we don't want the model to learn to translate EOS when it sees EOS.\n", + " \n", + " In practice we actually mask the EOS token as due to batching the last token will often be a token,\n", + " not EOS\n", + " '''\n", + " def __init__(self, eos_idx): self.eos_idx=eos_idx\n", + " def begin_batch(self): \n", + " eos_mask=(self.learn.xb[1]!=self.eos_idx)\n", + " sz=torch.tensor(self.learn.xb[1].size())\n", + " sz[1]=sz[1]-1\n", + " self.learn.xb = (self.learn.xb[0], self.learn.xb[1][eos_mask].view((sz[0],sz[1])))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**LossTargetShiftCallback:** Shift the target shown to the loss to exclude the \"bos\" token, as translating \"bos\" is not part of our language translation objective" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "class LossTargetShiftCallback(Callback):\n", + " '''\n", + " Shift the target shown to the loss to exclude the \"bos\" token as the first token we want predicted\n", + " should be an actual word, not the \"bos\" token (as we have already given the model \"bos\" )\n", + " '''\n", + " def __init__(self): pass\n", + " def after_pred(self): \n", + " self.learn.yb = (self.learn.yb[0][:,1:],)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transformer size from Annotated Transformer:\n", + "\n", + "N=6, d_model=512, d_ff=2048, h=8" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "pad_idx=1\n", + "assert dls.vocab[1][pad_idx] == 'xxpad' \n", + "n_x_vocab, n_y_vocab = len(dls.vocab[0]), len(dls.vocab[1])\n", + "d_model=512\n", + "n_heads=8 #12\n", + "d_inner=2048 #1024\n", + "\n", + "#model = Transformer(n_x_vocab, n_y_vocab, d_model=d_model, n_heads=n_heads, pad_idx=pad_idx)\n", + "\n", + "model=pt_Transformer(src_vcbsz=n_x_vocab, trg_vcbsz=n_y_vocab, d_model=d_model, d_inner=d_inner)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "#model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Kaiming_Normal works terrribly, at least if you apply it to everything except LayerNorm...\n", + "\n", + "DistilBERT works ok\n", + "\n", + "Could try xavier:\n", + "\n", + "```\n", + "def initialize_weights(m):\n", + " if hasattr(m, 'weight') and m.weight.dim() > 1:\n", + " nn.init.xavier_uniform_(m.weight.data)\n", + "\n", + "model.apply(initialize_weights);\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**DistilBERT initialisation**" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "# DistilERT HF init weights https://github.com/huggingface/transformers/blob/31e67dd19f1b3fe2bc9a13f86d814f3f7bba48e4/src/transformers/modeling_distilbert.py\n", + "\n", + "def distil_apply_leaf(m, f):\n", + " \"Apply `f` to children of `m`.\"\n", + " c = m.children()\n", + " if isinstance(m, nn.Module): f(m)\n", + " for l in c: apply_leaf(l,f)\n", + "\n", + "\n", + "def _distilbert_init_weights(module):\n", + " \"\"\" Initialize the weights.\n", + " \"\"\"\n", + " if isinstance(module, nn.Embedding):\n", + " if module.weight.requires_grad:\n", + " module.weight.data.normal_(mean=0.0, std=0.02) #std=self.config.initializer_range)\n", + " if 
isinstance(module, nn.Linear):\n", + " module.weight.data.normal_(mean=0.0, std=0.02) #self.config.initializer_range)\n", + " elif isinstance(module, nn.LayerNorm):\n", + " module.bias.data.zero_()\n", + " module.weight.data.fill_(1.0)\n", + " if isinstance(module, nn.Linear) and module.bias is not None:\n", + " module.bias.data.zero_()\n", + "\n", + "distil_apply_leaf(model, _distilbert_init_weights)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The model has 86,650,168 trainable parameters\n" + ] + } + ], + "source": [ + "def count_parameters(model):\n", + " return sum(p.numel() for p in model.parameters() if p.requires_grad)\n", + "\n", + "print(f'The model has {count_parameters(model):,} trainable parameters')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Learner" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "cbs = [CombineInputOutputCallback, RemoveEOSCallback(eos_idx=3), LossTargetShiftCallback]\n", + "\n", + "pad_idx=1\n", + "assert dls.vocab[1][pad_idx] == 'xxpad' \n", + "loss_func = CrossEntropyLossFlat(ignore_index=pad_idx)\n", + "\n", + "learn = Learner(dls, model, metrics=[accuracy, Perplexity(), CorpusBLEUMetric(vocab_sz=n_y_vocab)], \n", + " cbs=cbs, loss_func=loss_func).to_fp16()\n", + "\n", + "# learn = Learner(dls, model, metrics=[accuracy, Perplexity()],\n", + "# cbs=cbs, loss_func=loss_func).to_fp16()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "learn.load('paracrawl_en_ga_5e_5e-4_5e_1e-5_v0.2_exp4')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### First 5e" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "SuggestedLRs(lr_min=0.09120108485221863, lr_steep=0.7585775852203369)" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "learn.lr_find()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracyperplexitycorpus_bleutime
01.5548501.5466010.5220554.6954830.35692115:13
11.1389241.2231970.5573453.3980330.41025515:13
20.8475481.0018770.5874292.7233880.45707915:20
30.6234730.8592330.6063022.3613480.48930015:22
40.5493040.8246820.6112782.2811560.49906915:13
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn.fit_one_cycle(20, 5e-4, div=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "learn.recorder.plot_loss()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "#learn.save('paracrawl_en_ga_5e_5e-4_v0.2_exp3')" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "learn.load('paracrawl_en_ga_5e_5e-4_v0.2_exp3')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Second 5e" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "SuggestedLRs(lr_min=5.754399353463669e-07, lr_steep=2.2908675418875646e-06)" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "learn.lr_find()" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
epochtrain_lossvalid_lossaccuracyperplexitycorpus_bleutime
00.5461470.8194120.6120082.2691660.50056114:13
10.5478590.8174790.6124142.2647840.50171714:50
20.5297730.8154510.6127942.2601950.50266414:32
30.5337700.8138850.6130002.2566580.50324614:59
40.5286610.8137340.6130742.2563170.50338614:46
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "learn.fit_one_cycle(5, 1e-5, div=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "learn.recorder.plot_loss()" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "learn.save('paracrawl_en_ga_5e_5e-4_5e_1e-5_v0.2_exp4')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Generate and Process Translations funcs" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "def generate(model, sentence, vocab): \n", + " model=model.eval()\n", + " \n", + " sentence=learn.dls.tokenizer[0][1].encodes(sentence)\n", + " sentence=learn.dls.numericalize[0].encodes(sentence)\n", + " \n", + " translated_sentence = [2] \n", + " i = 0\n", + " while int(translated_sentence[-1]) != 3 and i < 75: \n", + " output = forward_model(model, sentence, translated_sentence).cuda()\n", + " values, indices = torch.topk(output, 5)\n", + " translated_sentence.append(int(indices[-1][0]))\n", + " i+=1\n", + "\n", + " detok_translated_sentence=detokenize(translated_sentence, vocab)\n", + " #print(' '.join(detok_translated_sentence))\n", + " return ' '.join(detok_translated_sentence)\n", + " \n", + "\n", + "def forward_model(model, src, tgt):\n", + " src = torch.as_tensor(src).unsqueeze(0).long().cuda()\n", + " tgt = torch.as_tensor(tgt).unsqueeze(0).cuda()\n", + " tgt_mask = gen_nopeek_mask(tgt.shape[1]).cuda()\n", + " src = to_half(src)\n", + " tgt = to_half(tgt)\n", + " output = model.forward(src, tgt, tgt_mask=tgt_mask, src_key_padding_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None)\n", + "\n", + " #return output.squeeze(0).to('cpu')\n", + " return output.squeeze(0).detach()\n", + "\n", + "\n", + "# def tokenize(sentence, freq_list, lang_model):\n", + "# punctuation = ['(', ')', ':', '\"', ' ']\n", + "\n", + "# sentence = sentence.lower()\n", + "# sentence = [tok.text for tok in 
lang_model.tokenizer(sentence) if tok.text not in punctuation]\n", + "# return [freq_list[word] if word in freq_list else freq_list['[OOV]'] for word in sentence]\n", + "\n", + "\n", + "def detokenize(sentence, vocab):\n", + " #freq_list = {v: k for k, v in freq_list.items()}\n", + " return [vocab[token] for token in sentence]\n", + " #return [freq_list[token] for token in sentence]\n", + "# def detokenize(sentence, freq_list):\n", + "# freq_list = {v: k for k, v in freq_list.items()}\n", + "# return [freq_list[token] for token in sentence]\n", + "\n", + "\n", + "def gen_nopeek_mask(length):\n", + " mask = rearrange(torch.triu(torch.ones(length, length)) == 1, 'h w -> w h')\n", + " mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))\n", + " return mask" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "def process_trans(trans):\n", + " trans_ls=[]\n", + " for s in trans: \n", + " #print(s)\n", + " tmp = s.replace('xxbos','')\n", + " tmp = tmp.replace('xxeos','')\n", + " tmp = tmp.replace(' .','.')\n", + " tmp = tmp.replace(' ,',',')\n", + " tmp = tmp.replace(' ?','?')\n", + " tmp = tmp.replace(' !','!')\n", + " #print(tmp[0])\n", + " if tmp.endswith('. '): tmp=tmp[:-1]\n", + " if tmp.endswith('? '): tmp=tmp[:-1]\n", + " if tmp.endswith('! 
'): tmp=tmp[:-1]\n", + " \n", + " for spec in ['xxmaj ', 'xxup ']:\n", + " found=[]\n", + " for m in re.finditer(spec, tmp):\n", + " found.append(m.start())\n", + "\n", + " for f in found:\n", + " m = tmp.find(spec)\n", + " if m != -1: \n", + " ml = m+len(spec)\n", + " if m != 0:\n", + " tmp = tmp[:ml] + tmp[ml].upper() + tmp[ml+1:]\n", + " tmp = tmp[:m] + tmp[ml:]\n", + " else: \n", + " tmp = tmp[ml].upper() + tmp[ml+1:]\n", + " tmp = tmp[ml:]\n", + " \n", + " # Remove space at start\n", + " if tmp[0] == ' ': tmp = tmp[1:]\n", + " \n", + " # Uppercase start of sentence\n", + " #tmp = tmp[0].upper() + tmp[1:]\n", + " \n", + " trans_ls.append(tmp)\n", + " return trans_ls" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Testing" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ga_idgaen_idenga_len
0557291Cá bhfuil críochfort na mbus?35406Where is the bus terminal?5
1557299Nuair a dhúisigh mé, bhí brón orm.1361When I woke up, I was sad.7
2557533Tosaíonn an t-oideachas sa bhaile.19122Education starts at home.5
3557579Táim i ngrá leat.1434I love you.4
4934942Tá grá agam duit.1434I love you.4
\n", + "
" + ], + "text/plain": [ + " ga_id ga en_id \\\n", + "0 557291 Cá bhfuil críochfort na mbus? 35406 \n", + "1 557299 Nuair a dhúisigh mé, bhí brón orm. 1361 \n", + "2 557533 Tosaíonn an t-oideachas sa bhaile. 19122 \n", + "3 557579 Táim i ngrá leat. 1434 \n", + "4 934942 Tá grá agam duit. 1434 \n", + "\n", + " en ga_len \n", + "0 Where is the bus terminal? 5 \n", + "1 When I woke up, I was sad. 7 \n", + "2 Education starts at home. 5 \n", + "3 I love you. 4 \n", + "4 I love you. 4 " + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_path = Path('data/irish/parallel_corpora/tatoeba')\n", + "t_fn = 'tatoeba_en-ga.csv'\n", + "t_df = pd.read_csv(t_path/t_fn)\n", + "\n", + "t_df['ga_len'] = t_df.ga.str.split().str.len()\n", + "t_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# learn.dls.test_dl = t_dls.valid\n", + "# def act_fn(x): return L(F.softmax(o, dim=-1) for o in x)\n", + "# o = learn.get_preds(dl=t_dls.valid, act=act_fn)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate translations" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxup tá mé réidh don deireadh seachtaine ! xxeos'" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"I'm ready for the weekend!\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['xxbos xxmaj an bhfuil an críochfort bus 2005 ? xxeos',\n", + " 'xxbos xxmaj nuair a xxunk mé suas , bhí mé brónach . xxeos',\n", + " 'xxbos xxmaj tosaíonn an t - oideachas sa bhaile . xxeos',\n", + " 'xxbos xxmaj grá i tú . xxeos',\n", + " 'xxbos xxmaj grá i tú . 
xxeos']" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trans_ls=[]\n", + "for e in t_df.en.values:\n", + " trans_ls.append(generate(learn.model, e, dls.vocab[1])) \n", + "trans_ls[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "f=open('tatoeba_raw_translations_exp4_20200724.txt','w')\n", + "for ele in trans_ls:\n", + " f.write(ele+'\\n')\n", + "f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Process translations" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['An bhfuil an críochfort bus 2005?',\n", + " 'Nuair a xxunk mé suas, bhí mé brónach.',\n", + " 'Tosaíonn an t - oideachas sa bhaile.',\n", + " 'Grá i tú.',\n", + " 'Grá i tú.',\n", + " 'Tá mé i ngrá le tú.',\n", + " 'Glan ár seomra ranga tar éis na scoile.',\n", + " 'Nach bhfuil muid ag comhlíonadh roimh?']" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_final = process_trans(trans_ls)\n", + "t_final[:8]" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "f=open('tatoeba_processed_translations_exp4_20200724.txt','w')\n", + "for ele in t_final:\n", + " f.write(ele+'\\n')\n", + "f.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "t_df['translation'] = t_final\n", + "t_df.to_csv('tatoeba_with_translation_exp4_20200724.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
gatranslation
1796Tá mé ábalta é sin a dhéanamh anoisTá mé in ann a dhéanamh go anois
1261Is maith liom ceoldráma.Cosúil liom opera.
849Tá siad ag labhairt Gaeilge.Tá siad ag labhairt na Gaeilge.
14Ná déan dhá rud ag an am céanna.Ná déan dhá rud ag am.
323Nach ndearnadh rud ar bith faoi sin?Ná raibh aon rud déanta faoin sin?
854Tá na húlla díolta amach.Díoladh an xxunk amach.
836Labhraím duit.Labhair mé leat.
1971Cá bhfuil an t-arán?Cén áit a bhfuil an t - arán?
858Cheannaigh mé carbhat dearg.Cheannaigh mé xxunk dearg.
1321Táim ag ól an caife.Tá mé ag ól an caife.
1782Tá grá againn ar an gcailín céanna.Grá againn an cailín céanna cailín.
102Tá an leac oighir an-tiubh.Is é an oighear an - tiubh.
1611Ní mian liom ceapairí ar bith.Níl mé ag iarraidh aon xxunk.
1933Ní theastaíonn buachaill uait.Ní gá duit a xxunk.
656Táimid ag léamh.Táimid ag léamh.
1636Tá cúpla daoine uaim chun cabhair a tabhairt dom.Ní mór dom a roinnt daoine chun cabhrú liom.
1218Oibrím ag barra.Obair mé ag barra.
1120Is liomsa é sin.Is mianach é sin.
1394Cad atá sibh ag déanamh staidéir?Cad atá tú ag déanamh staidéir?
4Tá grá agam duit.Grá i tú.
\n", + "
" + ], + "text/plain": [ + " ga \\\n", + "1796 Tá mé ábalta é sin a dhéanamh anois \n", + "1261 Is maith liom ceoldráma. \n", + "849 Tá siad ag labhairt Gaeilge. \n", + "14 Ná déan dhá rud ag an am céanna. \n", + "323 Nach ndearnadh rud ar bith faoi sin? \n", + "854 Tá na húlla díolta amach. \n", + "836 Labhraím duit. \n", + "1971 Cá bhfuil an t-arán? \n", + "858 Cheannaigh mé carbhat dearg. \n", + "1321 Táim ag ól an caife. \n", + "1782 Tá grá againn ar an gcailín céanna. \n", + "102 Tá an leac oighir an-tiubh. \n", + "1611 Ní mian liom ceapairí ar bith. \n", + "1933 Ní theastaíonn buachaill uait. \n", + "656 Táimid ag léamh. \n", + "1636 Tá cúpla daoine uaim chun cabhair a tabhairt dom. \n", + "1218 Oibrím ag barra. \n", + "1120 Is liomsa é sin. \n", + "1394 Cad atá sibh ag déanamh staidéir? \n", + "4 Tá grá agam duit. \n", + "\n", + " translation \n", + "1796 Tá mé in ann a dhéanamh go anois \n", + "1261 Cosúil liom opera. \n", + "849 Tá siad ag labhairt na Gaeilge. \n", + "14 Ná déan dhá rud ag am. \n", + "323 Ná raibh aon rud déanta faoin sin? \n", + "854 Díoladh an xxunk amach. \n", + "836 Labhair mé leat. \n", + "1971 Cén áit a bhfuil an t - arán? \n", + "858 Cheannaigh mé xxunk dearg. \n", + "1321 Tá mé ag ól an caife. \n", + "1782 Grá againn an cailín céanna cailín. \n", + "102 Is é an oighear an - tiubh. \n", + "1611 Níl mé ag iarraidh aon xxunk. \n", + "1933 Ní gá duit a xxunk. \n", + "656 Táimid ag léamh. \n", + "1636 Ní mór dom a roinnt daoine chun cabhrú liom. \n", + "1218 Obair mé ag barra. \n", + "1120 Is mianach é sin. \n", + "1394 Cad atá tú ag déanamh staidéir? \n", + "4 Grá i tú. 
" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_df[['ga','translation']].sample(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SacreBLEU" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "25.141619186580616\n" + ] + } + ], + "source": [ + "import sacrebleu\n", + "\n", + "refs = [t_df.ga.values.tolist()]\n", + "sys = t_df.translation.values.tolist()\n", + "\n", + "bleu = sacrebleu.corpus_bleu(sys, refs)\n", + "print(bleu.score)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspect top losses" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hi\n" + ] + }, + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hi\n" + ] + } + ], + "source": [ + "# at 30k tokens per vocab sometimes this works, sometimes it doesn't\n", + "\n", + "# Couldnt process 30k tokens until I added the 'hi' below, it was getting stuck at 94.87%, no idea why\n", + "@Numericalize\n", + "def encodes(self, o): \n", + " print('hi')\n", + " return TensorText(tensor([self.o2i [o_] for o_ in o]))\n", + "\n", + "class floatify_tfm(Transform):\n", + " def encodes(self,o): return o.float()\n", + " def decodes(self,o): return o.long()\n", + "\n", + "max_vocab=30000\n", + "#splits = ColSplitter()(df) \n", + "splits = RandomSplitter(valid_pct=0.2, seed=42)(df)\n", + "splits = (splits[0], splits[1][:2000])\n", + "\n", + "tfms = [[Tokenizer.from_df(text_cols='en' , 
rules=proc_rules), attrgetter(\"text\"), Numericalize(max_vocab=max_vocab)], \n", + " [Tokenizer.from_df(text_cols='ga', lang='ga', rules=proc_rules), attrgetter(\"text\"), Numericalize(max_vocab=max_vocab)]]\n", + "\n", + "dl = partial(SortedDL, shuffle=True, res=df.ga_len.values)\n", + "\n", + "dsets = Datasets(df, tfms, splits=splits, dl_type=dl)\n", + "\n", + "# remove the print from Numericalize\n", + "@Numericalize\n", + "def encodes(self, o): return TensorText(tensor([self.o2i [o_] for o_ in o]))" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
texttext_
0xxbos ( i ) the application of any goods of a kind specified in the xxmaj fourth xxmaj schedule by a person for the purposes of his business and treated as delivered in accordance with section 3 ( 1 ) ( e ) , xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( i ) le duine do bhaint úsáid chun críocha a ghnó as aon earraí de chineál a shonraítear sa xxmaj cheathrú xxmaj sceideal agus a áirítear mar earraí arna seachadadh de réir alt 3 ( 1 ) ( e ) , xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
1xxbos xxup xxunk . xxmaj population aged one year and over , usually resident and present in the xxmaj state , whose usual residence one year previously was outside the xxmaj state , classified by former country of usual residence , sex and distinguishing those with xxmaj irish nationality or other nationality xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos › xxup xxunk . xxmaj daonra xxmaj gnáthchónaitheoirí xxmaj aon xxmaj bliain d'aois agus xxmaj níos xxmaj sine a bhí i xxmaj láthair sa xxmaj stát de réir xxmaj náisiúntacht , xxmaj inscne , xxmaj gnátháit xxmaj chónaithe xxmaj bliain xxmaj roimhe agus bliaindaonáirimh › xxup xxunk . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
2xxbos ( a ) the constituencies for the election of members to the xxmaj dáil , and xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( a ) na dáilcheantair chun comhaltaí a thoghadh chun na xxmaj dála , agus xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
3xxbos “ but as climate change increases the frequency and severity of droughts and floods and makes food more difficult to produce , we need innovative solutions to support communities on the frontline . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos \" ach mar a ardaíonn an t - athrú aeráide minicíocht agus déine na xxunk agus na dtuilte agus dhéanann sé bia níos deacra le táirgeadh , is gá réitigh nuálacha chun tacú le pobail ar an líne thosaigh . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
4xxbos xxmaj member xxmaj states : xxmaj germany , xxmaj france , xxmaj italy , the xxmaj netherlands , xxmaj belgium and xxmaj luxembourg . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos xxmaj ballstáit : xxmaj an xxmaj ghearmáin , an xxmaj fhrainc , an xxmaj iodáil , an ísiltír , an xxmaj bheilg agus xxmaj lucsamburg . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
5xxbos ( b ) contributions in respect of that service have been returned to him , xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( b ) go mbeifear tar éis ranníoca i leith na seirbhíse sin a thabhairt ar ais dó , xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
6xxbos xxmaj this website is a national shared service for the collection of the charge for xxmaj non xxmaj principal xxmaj private xxmaj residences on behalf of the local authorities . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos xxmaj is seirbhís náisiúnta chomhroinnte an láithreán gréasáin seo chun an muirear d’áiteanna xxmaj cónaithe xxmaj príobháideacha neamhphríomha ar son na n - údarás áitiúil a bhailiú . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
7xxbos ( 2 ) xxmaj the xxmaj minister may order that a person committed under section 14 be released from custody if the xxmaj minister is of the opinion that a request for the person 's surrender is not being proceeded with . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos ( 2 ) xxmaj féadfaidh an taire a ordú duine a cimíodh faoi alt 14 a scaoileadh saor ó choimeád más é tuairim an xxmaj aire nach bhfuiltear ag dul ar aghaidh le hiarraidh chun an duine a thabhairt suas . xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
8xxbos xxmaj posted on xxmaj august 28 , 2013 by xxmaj kaia xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpadxxbos by xxmaj kaia xxmaj postáilte ar 28 xxmaj lúnasa , 2013 ag xxmaj kaia xxeos xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad xxpad
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bs,sl = 32, 512\n", + "dls = dsets.dataloaders(bs=bs, seq_len=sl, before_batch=partial(pad_input, pad_fields=[0,1]))\n", + "dls.show_batch()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "manually calculate" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "eos_idx=3\n", + "dls.valid.bs = 1\n", + "\n", + "loss_ls = []\n", + "en_ls, ga_trg_ls, ga_pred_ls = [], [], []\n", + "\n", + "for xb, yb in dls.valid:\n", + " yy = yb\n", + " eos_mask=(yb!=eos_idx)\n", + " sz=torch.tensor(yb.size())\n", + " sz[1]=sz[1]-1\n", + " yb = yb[eos_mask].view((sz[0],sz[1])) # drop the last token (\"eos\") for training \n", + " \n", + " with torch.no_grad():\n", + " y_pred_logits = model.forward(src=xb, trg=yb)\n", + " \n", + " yb_loss = yy[:,1:] # shift target to exclude xxbos\n", + " loss = learn.loss_func(y_pred_logits, yb_loss)\n", + " loss_ls.append(loss)\n", + "\n", + " y_pred_act = F.softmax(y_pred_logits, dim=-1)\n", + " \n", + " preds_ls = []\n", + " for p in y_pred_act: \n", + " preds_ls.append(p.argmax(dim=-1))\n", + " \n", + " tmp_ls = []\n", + " for i in xb[0]:\n", + " if (dls.vocab[0][i] != 'xxpad') and (dls.vocab[1][i] != 'xxbos'):\n", + " tmp_ls.append(dls.vocab[0][i])\n", + " en_ls.append(' '.join(tmp_ls))\n", + "\n", + " tmp_ls = []\n", + " for i in yb[0]:\n", + " if (dls.vocab[1][i] != 'xxpad') and (dls.vocab[1][i] != 'xxbos'):\n", + " tmp_ls.append(dls.vocab[1][i])\n", + " ga_trg_ls.append(' '.join(tmp_ls))\n", + "\n", + " tmp_ls = []\n", + " for i in preds_ls[0]:\n", + " if dls.vocab[1][i] != 'xxpad':\n", + " tmp_ls.append(dls.vocab[1][i])\n", + " ga_pred_ls.append(' '.join(tmp_ls))" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(tensor(4.8346, 
device='cuda:0'), tensor(0.0034, device='cuda:0'))" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_loss_idx = np.argmax(loss_ls)\n", + "min_loss_idx = np.argmin(loss_ls)\n", + "loss_ls[max_loss_idx], loss_ls[min_loss_idx]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sorted indices by loss, (highest loss to lowest)" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "loss_sorted_idxs = np.argsort(-np.array(loss_ls))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Show `n` top losses" + ] + }, + { + "cell_type": "code", + "execution_count": 163, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LOSS: 4.834611415863037\n", + "xxmaj we are currently experiencing technical difficulties with our subtitles on some browsers . xxeos\n", + "\n", + "xxup tá deacrachtaí le fotheidil ar roinnt ‘ xxunk ’ faoi láthair .\n", + "\n", + "xxmaj tá muid teicniúla deacrachtaí teicniúla ár brabhsálaithe brabhsálaithe faoi faoi láthair atá xxeos\n", + "\n", + "\n", + "LOSS: 4.4953813552856445\n", + "frosted green cosmetic glass bottle with silver … xxeos\n", + "\n", + "frosted buidéal dropper gloine glas do cosmaideacha …\n", + "\n", + "xxmaj cosmaideacha gloine cosmaideacha cosmaideacha cosmaideacha airgid le xxeos\n", + "\n", + "\n", + "LOSS: 4.347179412841797\n", + "xxmaj not even all the end of compulsory schooling despite compulsory schooling . 
xxeos\n", + "\n", + "xxup ní fiú go léir a chuaigh amach bhunscoil ainneoin scolaíocht éigeantach .\n", + "\n", + "xxmaj ní fiú amháin léir deireadh deireadh deireadh dheireadh éigeantach in éigeantach in xxeos\n", + "\n", + "\n", + "LOSS: 4.304743766784668\n", + "p - xxmaj depth of xxmaj foundation xxeos\n", + "\n", + "p - xxmaj fondúireacht doimhneacht\n", + "\n", + "p - xxmaj doimhneacht na na\n", + "\n", + "\n", + "LOSS: 4.188599586486816\n", + "xxmaj open 7 days including bank holidays xxeos\n", + "\n", + "xxmaj samhradh : xxmaj oscailte 7 lá , laethanta saoire bainc san áireamh\n", + "\n", + "7 laethanta 7 laethanta laethanta 7 lá lena laethanta saoire bainc lena áireamh laethanta\n", + "\n", + "\n", + "LOSS: 4.172364234924316\n", + "xxmaj ok . xxmaj then i reboot and cross the border . xxeos\n", + "\n", + "xxmaj ansin leithroinnte agus dtrasnaíonn an teorainn .\n", + "\n", + "xxup ok i mé tras an teorainn . xxeos\n", + "\n", + "\n", + "LOSS: 4.111687660217285\n", + "xxmaj this was equivalent to a 4.5 % unemployment rate , up slightly from 4.4 % in 2002 but still well below the xxup eu average of 8.0 % . xxeos\n", + "\n", + "xxmaj b’ionann sin agus 4.5 % ráta dífhostaíochta ardú beagán ó 4.4 % in 2002 ach go maith faoi leibhéal an mheán xxup ae ag 8.0 % . xxmaj d’ardaigh an líon daoine ar dhífhostaíocht fhadtéarmach 1,200 .\n", + "\n", + "xxmaj bhí seo agus ráta % xxunk dífhostaíochta , , ón 4.4 % i 2002 ach fós maith faoi bhun an xxup an ae de brath % . xxeos bhí % ráta tí is an % . 
% xxeos\n", + "\n", + "\n", + "LOSS: 4.084066390991211\n", + "( i ) substitute “ the actuary to or trustees of ” for “ the trustees of ” , xxeos\n", + "\n", + "( i ) “ déanfaidh achtúire nó iontaobhaithe scéime nó iontaobhais xxup cbs ” a chur in ionad “ déanfaidh iontaobhaithe scéime nó iontaobhais xxup cbs ” ,\n", + "\n", + "( i ) “ an achtúire do iontaobhaithe ” ” iontaobhaithe ” xxunk iontaobhais a chur in ionad “ iontaobhaithe iontaobhaithe ” ” , ” na ” , xxeos\n", + "\n", + "\n", + "LOSS: 3.9471380710601807\n", + "xxmaj ensure national mail services packages are posted either without a cover or in a cover which can be easily removed for the purpose of examination . xxeos\n", + "\n", + "xxmaj déan cinnte de go seoltar pacáistí gan chlúdach nó i gclúdach atá furasta a bhaint le haghaidh scrúdúcháin sa chóras náisiúnta poist .\n", + "\n", + "xxmaj cuirtear pacáistí go pacáistí gcuirfear pacáistí seirbhísí chlúdach nó i xxunk ar féidir a aistriú go go an a phost seirbhísí a a xxeos\n", + "\n", + "\n", + "LOSS: 3.885239362716675\n", + "a sea change in xxmaj europe 's innovation performance is the only way to create lasting and well - paid jobs that withstand the pressures of globalisation . \" xxeos\n", + "\n", + "xxmaj beidh gá le feabhas ó bhonn a dhéanamh ar fheidhmíocht nuálaíochta na heorpa má táthar chun poist bhuana ar phá maith a chruthú , poist a bheidh in ann brú an domhandaithe a sheasamh . \"\n", + "\n", + "xxmaj is athrú le hathrú farraige thaobh na xxunk ar fheidhmíocht na san heorpa ar tá chun poist buan buan aghaidh agus agus chruthú a go atá sheasamh sheasamh aice a an domhandú . chur . \" xxeos\n", + "\n", + "\n", + "LOSS: 3.8690526485443115\n", + "xxmaj heat loss or gains from pipes , xxunk and vessels is to be limited and energy efficient lighting systems provided . 
xxeos\n", + "\n", + "xxup ní mór caillteanas nó gnóthachain teasa trí phíopaí , xxunk agus soithigh a mhaolú agus córas éifeachtach soilse a fheistiú .\n", + "\n", + "xxup tá mór córas teasa gnóchain ó ó phíopaí , xxunk agus soithí a bhfuil a a fuinnimh fuinnimh fuinnimh sholáthar ar xxeos\n", + "\n", + "\n", + "LOSS: 3.851304292678833\n", + "xxmaj provisions in relation to reserve values , transfer values and transfers of insurance . xxeos\n", + "\n", + "xxmaj forálacha maidir le cúl - luacha , luacha aistriúcháin agus aistrithe árachais .\n", + "\n", + "xxmaj forálacha maidir le luachanna - luachanna , luachanna aistrithe d'aistriú aistrithe luachanna d'aistriú xxeos\n", + "\n", + "\n", + "LOSS: 3.835312604904175\n", + "xxmaj article xxunk in attack xxeos\n", + "\n", + "xxmaj airteagal xxunk i gcás ionsaithe\n", + "\n", + "xxmaj airteagal xxunk ionsaí ionsaí ionsaí ionsaí\n", + "\n", + "\n", + "LOSS: 3.74434494972229\n", + "xxmaj xxunk and licences - construction - road safety - traffic xxmaj management and xxmaj parking xxeos\n", + "\n", + "xxmaj xxunk xxmaj tógáil bóthair - sábháilteacht bóthair - bainistíocht xxmaj tráchta\n", + "\n", + "xxmaj xxunk agus xxunk agus agus sábháilteacht agus agus agus agus tráchta agus\n", + "\n", + "\n", + "LOSS: 3.710318088531494\n", + "xxmaj standard link building sources xxeos\n", + "\n", + "xxmaj foinsí xxmaj standard tógáil nasc\n", + "\n", + "xxmaj foinsí nasc caighdeánach nasc xxmaj tógála\n", + "\n", + "\n", + "LOSS: 3.7060744762420654\n", + "xxmaj after this the delegates late xxmaj louis xxup xvi to the death , by a majority vote of one vote . xxeos\n", + "\n", + "xxmaj tar éis seo an xxmaj toscairí déanach xxmaj louis xxup xvi xxmaj chun an xxmaj bás , xxmaj de a xxmaj formhór na xxmaj vóta haon .\n", + "\n", + "xxmaj tar éis na na xxmaj louis xxmaj xxmaj louis xxup xvi le louis an bháis bháis , trí vóta vóta vóta vóta vóta vóta vóta amháin vóta xxeos\n", + "\n", + "\n", + "LOSS: 3.6944961547851562\n", + "17 . 
a sensible housing policy in xxmaj xxunk . xxeos\n", + "\n", + "17 . xxmaj polasaí ceart tithíochta i mbearna .\n", + "\n", + "17 . xxmaj polasaí tithíochta tithíochta ciallmhar xxmaj xxmaj xxeos\n", + "\n", + "\n", + "LOSS: 3.6918556690216064\n", + "xxup eu budget 2014 by financial framework heading xxeos\n", + "\n", + "xxmaj buiséad 2014 de réir na gceannteideal atá sa chreat airgeadais\n", + "\n", + "xxmaj buiséad an ag réir réigiúin creat airgeadais ann xxup airgeadais xxup\n", + "\n", + "\n", + "LOSS: 3.656128406524658\n", + "3.1 xxmaj the purpose of this chapter is to give a brief review of the more important or interesting decisions and developments in the area of criminal law in 2007 . xxeos\n", + "\n", + "3.1 xxmaj is é cuspóir atá leis an gcaibidil seo cuntas gairid a thabhairt ar chinntí agus ar chora xxunk eile den tábhacht maidir le réimse dhlí na coireachta i rith na bliana 2007 .\n", + "\n", + "3.1 xxmaj is é is na leis an gcaibidil seo athbhreithniú gearr ar thabhairt ar na níos forbairtí na is nó nó dlí níos le forbairtí an coiriúil coiriúil in 2007 2007 bliana 2007 . xxeos\n", + "\n", + "\n", + "LOSS: 3.5991711616516113\n", + "( 5 ) fire or water hose fittings of the following descriptions : xxeos\n", + "\n", + "( 5 ) feistisí feadán tóiteáin no feadán uisce de sna xxunk so leanas :\n", + "\n", + "( 5 ) xxup xxunk dóiteáin nó uisce uisce de sna saghsanna seo a : xxeos\n", + "\n", + "\n", + "LOSS: 3.5880415439605713\n", + "xxmaj this play was first performed in 1979 and is a xxunk look at xxmaj irish society , still relevant today . xxeos\n", + "\n", + "xxmaj tugann an dráma seo , a léiríodh ar dtús i 1979 , léargas xxunk ar shochaí na héireann , léargas atá fós suntasach inniu .\n", + "\n", + "xxmaj an an spraoi seo an ar bhí i dtús i 1979 , agus xxunk ar chumann na héireann , agus ábhartha ábhartha ar sa . xxeos\n", + "\n", + "\n", + "LOSS: 3.5831832885742188\n", + "xxmaj first elections of members of certain public assistance authorities . 
xxeos\n", + "\n", + "xxmaj céad - toghcháin chomhaltaí údarásanna conganta xxunk áirithe .\n", + "\n", + "xxmaj na toghcháin toghcháin do údarás cúnaimh phuiblí áirithe . xxeos\n", + "\n", + "\n", + "LOSS: 3.514817714691162\n", + "xxmaj withdrawal xxmaj date of v 1 xxmaj assessments xxeos\n", + "\n", + "xxmaj dáta xxmaj xxunk xxmaj measúnaithe l 1\n", + "\n", + "xxmaj dáta xxmaj siar v measúnachtaí xxup xxeos xxmaj\n", + "\n", + "\n", + "LOSS: 3.4614880084991455\n", + "( ii ) certifies that the person , although the person ’s condition is not such as to require the person ’s hospitalisation , is unfit for any questioning for the purpose of the investigation for a specified period , xxeos\n", + "\n", + "( ii ) go ndeimhníonn sé nach xxunk an duine chun críche an imscrúdaithe go ceann tréimhse sonraithe , d’ainneoin nach gá an duine a chur isteach in ospidéal de dheasca na baile atá air ,\n", + "\n", + "( ii ) go ndeimhneoidh an nó bhfuil an duine , a aon imscrúdaithe ar bhfuil tréimhse sonraithe , cé nach amhlaidh don duine chun cheangal i ar imthosca an shórt xxunk xxunk a sheachaint , xxeos\n", + "\n", + "\n", + "LOSS: 3.3972277641296387\n", + "xxmaj shoot 2 : xxmaj cruise xxmaj control xxeos\n", + "\n", + "xxmaj shoot 2 : cúrsála rialaithe\n", + "\n", + "xxmaj shoot 2 : xxmaj xxmaj xxmaj\n", + "\n", + "\n", + "LOSS: 3.344511032104492\n", + "xxmaj the blue flag campaign today flies in 13 different countries having started life in xxmaj france in 1987 . xxeos\n", + "\n", + "xxmaj san xxmaj fhrainc a thosaigh an feachtas seo i 1987 agus anois tá an brat gorm ar foluain i 13 thír éagsúla .\n", + "\n", + "xxmaj déanann fheachtas eoraip i cuireadh an feachtas bratach caite 1987 , cuileoga i na feachtas gorm i xxmaj i 13 thír éagsúla a xxmaj\n", + "\n", + "\n", + "LOSS: 3.3257603645324707\n", + "xxmaj contingency allowances given to compensate for the time required by the workers to perform all necessary additional and periodic activities , e.g. 
reading drawings , cleaning machinery etc . xxeos\n", + "\n", + "liúntais teagmhasacha a tugadh mar chúiteamh ar an am is gá ag na hoibrithe chun gach gníomhaíocht breise agus tréimhsiúla gá , m.sh. líníochtaí léamh , glanadh innealra srl\n", + "\n", + "xxmaj teagmhais na thabhairt mar chúiteamh don an am a gá de na hoibrithe chun gach gníomhaíocht bhreise agus thréimhsiúil a a m.sh. ag , , innealra etc etc .\n", + "\n", + "\n", + "LOSS: 3.311084747314453\n", + "relationship , and supply chain processes . xxeos\n", + "\n", + "caidreamh , agus slabhra soláthair próisis .\n", + "\n", + "le , próisis próisis soláthair próisis slabhra xxeos\n", + "\n", + "\n", + "LOSS: 3.306642532348633\n", + "xxmaj in 1963 introduced the first helicopters into service in the xxmaj state and within one year provided a daytime xxmaj search and xxmaj rescue service and within a further year established an inter - hospital air ambulance service , the first of its kind in xxmaj europe . xxeos\n", + "\n", + "xxmaj na chéad héileacaptair a thionscnamh i mbun seirbhíse sa xxmaj stát sa bhliain 1963 : soláthraíodh seirbhís xxmaj chuardaigh agus xxmaj tarrthála lae faoi cheann bliana agus bunaíodh seirbhís aerárthaigh othar idir - ospidéil faoi cheann bliana eile , an chéad seirbhís dá leithéid san xxmaj eoraip .\n", + "\n", + "i thug chéad uair a tugadh i seirbhís seirbhíse sa xxmaj stát agus bhliain 1963 agus xxmaj an xxmaj cuardaigh agus xxmaj tarrthála agus agus cheann bliana agus laistigh seirbhís ospidéil idir idir - ospidéil bhreise láthair dá breise , an chéad cheann dá chineál san xxmaj eoraip . xxeos\n", + "\n", + "\n", + "LOSS: 3.2318711280822754\n", + "“ in the 25 years since our previous logo was developed , the world has moved from the printed page to an open fluid digital environment where more and more people have access to data . 
xxeos\n", + "\n", + "“ le linn an 25 bliana ó forbraíodh ár sean - lógó , tá an saol mór tar éis bogadh ar aghaidh ó cháipéisí clóite go timpeallacht oscailte solúbtha digiteach ina bhfuil rochtain ar sonraí ag líon méadaithe daoine .\n", + "\n", + "“ sa linn an 25 bliana ó rinneadh ár lógó - lógó , tá an domhan bhog ar éis a ón an ar leathanach clóite go dtí digiteach digiteach i i bhfuil rochtain níos níos agus níos daoine agus agus xxeos\n", + "\n", + "\n", + "LOSS: 3.222780704498291\n", + "xxmaj it takes into account the extent to which individual students ’ characteristics such as gender , age , socio - economic background and prior educational attainment , have an impact on progression . xxeos\n", + "\n", + "xxmaj cuireann sé san áireamh an tionchar atá ag éagsúlacht thréithe na mac léinn ar an xxunk , leithéidí inscne , aois , cúlra socheacnamaíoch agus xxunk oideachasúil .\n", + "\n", + "xxmaj cuirtear sí san áireamh a méid ar ag mic na na mic léinn aonair leith xxunk , ar , , aois , cúlra eacnamaíoch agus roimh oideachais roimh sula\n", + "\n", + "\n", + "LOSS: 3.1986210346221924\n", + "xxmaj look at some other items of cutlery . xxmaj can you make drawings of them ? xxeos\n", + "\n", + "xxmaj an féidir leat cur síos a dhéanamh air ? xxmaj déan líníocht de . xxmaj amharc ar roinnt míreanna eile sceanra .\n", + "\n", + "xxmaj féach féidir leat líníochtaí isteach ar dhéanamh ar ? xxmaj féachaint tú ar roinnt xxmaj is ar roinnt míreanna eile de ? xxeos\n", + "\n", + "\n", + "LOSS: 3.1934149265289307\n", + "but does not include — xxeos\n", + "\n", + "ach ní fholuíonn sí —\n", + "\n", + "ach ní fholaíonn sé — ní\n", + "\n", + "\n", + "LOSS: 3.170163154602051\n", + "xxmaj in all places that were affected by lightning discharges electricity . 
xxeos\n", + "\n", + "i ngach áit xxmaj bhí tionchar ag go tintreach leictreachas sceitheadh .\n", + "\n", + "i ngach áiteanna go go tionchar ag scaoileadh raibh scaoileadh scaoilte leictreachais xxeos\n", + "\n", + "\n", + "LOSS: 3.164721727371216\n", + "xxmaj since 2004 the works programme has focused on preserving the extensive structural remains on the xxmaj south xxmaj peak . xxeos\n", + "\n", + "á “ 2004 , xxunk clár na n - oibreacha ar xxunk fairsinge na struchtúr ar an xxunk xxmaj theas a chaomhnú .\n", + "\n", + "ó xxup tá tá tá an oibreacha n - oibreacha ar an an atá n mór an xxmaj xxmaj theas a chaomhnú . xxeos\n", + "\n", + "\n", + "LOSS: 3.1545069217681885\n", + "xxmaj please ensure that you provide complete and accurate information ( e.g. on dates of birth and xxup pps numbers ) as failure to do this will delay processing of your application . xxeos\n", + "\n", + "xxmaj cinntigh , le do thoil , go soláthraíonn tú faisnéis shoiléir agus chruinn ( dátaí breithe , uimhreacha xxup psp agus sonraí bainc go háirithe ) . xxup má theipeann ort é seo a dhéanamh , beidh moill ar phróiseáil d’iarratais .\n", + "\n", + "xxmaj déan go go do thoil go go xxunk tú faisnéis iomlán ( cruinn ( e.g breithe agus agus xxup psp ) uimhreacha xxup xxup mbeidh ) mar xxmaj má dhéantar ar an seo a dhéanamh tuilleadh xxunk tú ar d’iarratas d . xxeos\n", + "\n", + "\n", + "LOSS: 3.147977828979492\n", + "xxmaj fan circulation xxunk optimized for higher efficient heat transfer in the low - temperature application . xxeos\n", + "\n", + "xxunk le haghaidh níos airde aistriú teasa éifeachtach i gcur i bhfeidhm íseal\n", + "\n", + "xxmaj xxmaj scaipeadh aistriú airde aistriú teasa níos ó bhfeidhm i bhfeidhm teocht –\n", + "\n", + "\n", + "LOSS: 3.143120765686035\n", + "xxmaj but xxmaj plague knocked people from all those different stalls , but not as many suffered by those of them poor . 
xxeos\n", + "\n", + "xxmaj ach leag plague daoine ó siúd go léir stallaí éagsúla , ach bhí tionchar nach mar go leor acu siúd acu bochta .\n", + "\n", + "xxmaj ach tá xxmaj daoine ó gach go léir stallaí éagsúla , ach ní nach ag bhfuil chuid leor ag siúd iad bochta . xxeos\n", + "\n", + "\n", + "LOSS: 3.1297659873962402\n", + "xxmaj poor standards of communication and documentation xxeos\n", + "\n", + "droch - chaighdeáin cumarsáide agus doiciméadú\n", + "\n", + "xxmaj - caighdeáin na agus doiciméid cumarsáide\n", + "\n", + "\n", + "LOSS: 3.117694139480591\n", + "xxmaj those who look to achieve the physique of their dreams can look no further than these xxunk , time - tested brands . xxeos\n", + "\n", + "xxmaj is féidir leo siúd a thugann aire a bhaint amach an physique a n - aisling breathnú níos faide ná seo xxunk , ama de réir tástála acmhainne brandaí .\n", + "\n", + "xxmaj iad féidir leo siúd a breathnú chun a bhaint amach ar dá a bhaint - aisling a a mó ná na xxunk , am - na na - . ama xxeos\n", + "\n", + "\n", + "LOSS: 3.1128244400024414\n", + "xxmaj this progression is part of the developmental process for the young sailor . xxeos\n", + "\n", + "xxmaj is cuid de phróiseas forbartha an mhairnéalaigh óig an dul chun cinn seo .\n", + "\n", + "xxmaj is dul den phróiseas forbartha an xxunk do do ea chun cinn seo . xxeos\n", + "\n", + "\n", + "LOSS: 3.0744216442108154\n", + "xxmaj are another generation of xxmaj xxunk children to miss out on the benefits of an all - irish education or will the xxmaj department of xxmaj education and xxmaj skills do the right thing and support xxmaj gaelscoil xxmaj ráth xxup xxunk by giving it official recognition ? 
” xxeos\n", + "\n", + "“ an bhfuil glúin eile de pháistí xxmaj ráth xxup tó chun an deis sin a chailliúint arís , nó bhfuil an xxmaj roinn xxmaj oideachais agus xxmaj scileanna chun an rud cheart a dhéanamh agus tacú le xxmaj gaelscoil xxmaj ráth xxup tó trí aitheantas a thabhairt di ?\n", + "\n", + "xxmaj tá bhfuil glúin eile de leanaí xxmaj xxunk xxmaj xxunk chun aitheantas t a a chailleann as ar ar go an xxmaj roinn xxmaj oideachais agus xxmaj scileanna ag an ceart ceart agus chur agus tacaíocht le xxmaj creidiúnaithe xxmaj ráth xxup xxunk trí aithint oifigiúil thabhairt dó oifigiúil ”\n", + "\n", + "\n", + "LOSS: 3.034301996231079\n", + "xxmaj the marathon cycle will take place on 28 xxmaj april , so there 's plenty of time to train for the event . xxeos\n", + "\n", + "xxmaj beidh an rothaíocht ar siúl ar 28 xxmaj aibreán ; mar sin tá xxunk ama agat le haghaidh traenála chuige .\n", + "\n", + "xxmaj beidh an timthriall xxunk siúl an an xxmaj aibreán , agus sin , neart am go chun haghaidh oiliúint ar . xxeos\n", + "\n", + "\n", + "LOSS: 3.0320944786071777\n", + "xxmaj of the farmers needed more days to erect large structures , such as temple pyramids . xxeos\n", + "\n", + "xxmaj as na feirmeoirí ag teastáil níos mó agus níos mó lá gnó le struchtúir mhóra , ar nós pirimidí teampall in airde .\n", + "\n", + "xxmaj as na feirmeoirí a teastáil níos mó lá struchtúir mó struchtúir a a struchtúir móra a mar nós na teampall . áirithe . xxeos\n", + "\n", + "\n", + "LOSS: 2.9994633197784424\n", + "xxmaj undergraduate , xxmaj master 's or phd students xxeos\n", + "\n", + "xxmaj mic léinn bunchéime , xxmaj máistreachta nó phd\n", + "\n", + "xxmaj mic léinn xxmaj , xxmaj máistir nó mic ,\n", + "\n", + "\n", + "LOSS: 2.971003532409668\n", + "xxmaj located in the heart of xxmaj europe , several other xxmaj european capitals are only a couple of hours away ( amsterdam , xxmaj paris , xxmaj london ) . 
xxeos\n", + "\n", + "xxmaj is i gcroílár na heorpa atá siad suite , agus níl roinnt príomhchathracha eile ( amstardam , xxmaj páras , xxmaj londain ) ach cúpla uair an chloig taistil uathu .\n", + "\n", + "xxmaj suite lánúin gcroílár na heorpa , roinnt , ach agus tá cúpla xxmaj xxmaj xxmaj xxmaj ) xxmaj amstardam , xxmaj londain ) . cúpla uair an chloig de ar . xxeos\n", + "\n", + "\n", + "LOSS: 2.9318175315856934\n", + "a key work - stream in this unit relates to assessing the xxmaj irish economic implications of xxmaj brexit . xxeos\n", + "\n", + "xxmaj ceann de xxunk oibre an aonaid seo ná na himpleachtaí eacnamaíocha a bheidh ag xxmaj imeacht na xxmaj breataine as an xxmaj aontas xxmaj eorpach do éirinn a mheas .\n", + "\n", + "xxmaj baineann de na oibre - t seo , baineann xxunk eacnamaíochta na bhaineann ag xxmaj xxunk xxmaj héireann gaeilge - xxmaj xxmaj aontas xxmaj eorpach a bhaint a mheasúnú . xxeos\n", + "\n", + "\n", + "LOSS: 2.9126083850860596\n", + "xxmaj home page » xxmaj games » xxmaj mutually xxmaj assured xxmaj destruction ( mad ) xxeos\n", + "\n", + "xxmaj leathanach xxmaj baile » xxmaj cluichí » xxmaj scrios roicéad\n", + "\n", + "xxmaj leathanach xxmaj baile » xxmaj cluichí » xxmaj cinnte xxmaj xxmaj\n", + "\n", + "\n", + "LOSS: 2.9069716930389404\n", + "xxmaj automatic recording of sickness and absence . xxeos\n", + "\n", + "xxmaj breoiteacht agus as láthair taifeadadh go huathoibríoch .\n", + "\n", + "xxmaj taifeadadh xxunk taifeadadh láthair a uathoibríoch huathoibríoch . xxeos\n", + "\n", + "\n", + "LOSS: 2.895787239074707\n", + "xxmaj it shall be embossed with the stamp of the military authority . xxeos\n", + "\n", + "xxmaj beidh stampa an údaráis mhíleata múnlaithe air .\n", + "\n", + "xxmaj beidh sé ar údaráis mhíleata uirthi . . 
xxeos\n", + "\n", + "\n" + ] + } + ], + "source": [ + "n_losses = 50\n", + "top_losses = loss_sorted_idxs[:n_losses]\n", + "\n", + "for i in top_losses:\n", + " print(f'LOSS: {float(loss_ls[i])}')\n", + " print(en_ls[i])\n", + " print()\n", + " print(ga_trg_ls[i])\n", + " print()\n", + " print(ga_pred_ls[i])\n", + " print()\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Show smallest `n` losses" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LOSS: 0.012944857589900494\n", + "( 2 ) xxmaj this xxmaj act shall come into operation on the 1st day of xxmaj january , 1937 . xxeos\n", + "\n", + "( 2 ) xxmaj tiocfaidh an tacht so i ngníomh an 1adh lá d'eanar , 1937 .\n", + "\n", + "( 2 ) xxmaj tiocfaidh an tacht so i ngníomh an 1adh lá d'eanar , 1937 . xxeos\n", + "\n", + "\n", + "LOSS: 0.012784866616129875\n", + "( f ) by substituting the following for paragraph 2 of xxmaj schedule 3 : xxeos\n", + "\n", + "( f ) tríd an méid seo a leanas a chur in ionad mhír 2 de xxmaj sceideal 3 :\n", + "\n", + "( f ) tríd an méid seo a leanas a chur in ionad mhír 2 de xxmaj sceideal 3 : xxeos\n", + "\n", + "\n", + "LOSS: 0.012429873459041119\n", + "xxmaj amendment of section 96 of xxmaj act of 2001 . xxeos\n", + "\n", + "xxmaj leasú ar alt 96 d’acht 2001 .\n", + "\n", + "xxmaj leasú ar alt 96 d’acht 2001 . xxeos\n", + "\n", + "\n", + "LOSS: 0.012094180099666119\n", + "11 . — this xxmaj act may be cited as the xxmaj transport ( miscellaneous xxmaj provisions ) xxmaj act , 1979 . xxeos\n", + "\n", + "11 . — féadfar an tacht xxmaj iompair ( forálacha xxmaj ilghnéitheacha ) , 1979 , a ghairm den xxmaj acht seo .\n", + "\n", + "11 . — féadfar an tacht xxmaj iompair ( forálacha xxmaj ilghnéitheacha ) , 1979 , a ghairm den xxmaj acht seo . xxeos\n", + "\n", + "\n", + "LOSS: 0.01209115982055664\n", + "93 . 
xxmaj amendment of section 1 ( interpretation ) of xxmaj principal xxmaj act . xxeos\n", + "\n", + "93 . xxmaj leasú ar alt 1 ( léiriú ) den phríomh - acht .\n", + "\n", + "93 . xxmaj leasú ar alt 1 ( léiriú ) den phríomh - acht . xxeos\n", + "\n", + "\n", + "LOSS: 0.011882972903549671\n", + "[ ga ] ( xxrep 3 i ) by inserting the following after paragraph 2 : xxeos\n", + "\n", + "( xxrep 3 i ) tríd an méid seo a leanas a chur isteach i ndiaidh mhír 2 :\n", + "\n", + "( xxrep 3 i ) tríd an méid seo a leanas a chur isteach i ndiaidh mhír 2 : xxeos\n", + "\n", + "\n", + "LOSS: 0.011682409793138504\n", + "xxmaj number 32 / 2009 : xxmaj criminal xxmaj justice ( amendment ) xxmaj act 2009 xxmaj home xxeos\n", + "\n", + "xxmaj uimhir 32 / 2009 : xxmaj an tacht um xxmaj cheartas xxmaj coiriúil ( leasú ) 2009\n", + "\n", + "xxmaj uimhir 32 / 2009 : xxmaj an tacht um xxmaj cheartas xxmaj coiriúil ( leasú ) 2009 xxeos\n", + "\n", + "\n", + "LOSS: 0.011207898147404194\n", + "xxmaj amendment to section 20 of xxmaj act of 2003 . xxeos\n", + "\n", + "xxmaj leasú ar alt 20 d’acht 2003 .\n", + "\n", + "xxmaj leasú ar alt 20 d’acht 2003 . xxeos\n", + "\n", + "\n", + "LOSS: 0.01114108320325613\n", + "xxmaj central xxmaj bank xxmaj act , 1971 . xxeos\n", + "\n", + "xxmaj acht an xxmaj bhainc xxmaj ceannais , 1971 .\n", + "\n", + "xxmaj acht an xxmaj bhainc xxmaj ceannais , 1971 . xxeos\n", + "\n", + "\n", + "LOSS: 0.01099395751953125\n", + "“ the xxmaj act of 1995 ” means the xxmaj industrial xxmaj development xxmaj act , 1995 ; xxeos\n", + "\n", + "ciallaíonn “ acht 1995 ” an tacht um xxmaj fhorbairt xxmaj tionscail , 1995 ;\n", + "\n", + "ciallaíonn “ acht 1995 ” an tacht um xxmaj fhorbairt xxmaj tionscail , 1995 ; xxeos\n", + "\n", + "\n", + "LOSS: 0.010816644877195358\n", + "45 . — ( 1 ) xxmaj the xxmaj principal xxmaj act is amended by inserting the following section after section 611 : xxeos\n", + "\n", + "45 . 
— ( 1 ) xxmaj leasaítear an príomh - acht tríd an alt seo a leanas a chur isteach i ndiaidh alt 611 :\n", + "\n", + "45 . — ( 1 ) xxmaj leasaítear an príomh - acht tríd an alt seo a leanas a chur isteach i ndiaidh alt 611 : xxeos\n", + "\n", + "\n", + "LOSS: 0.010740697383880615\n", + "( xxrep 3 i ) which is not a scheduled ( part xxup i ) pension , and xxeos\n", + "\n", + "( xxrep 3 i ) nach pinsean sceidealta ( cuid xxup i ) , agus\n", + "\n", + "( xxrep 3 i ) nach pinsean sceidealta ( cuid xxup i ) , agus xxeos\n", + "\n", + "\n", + "LOSS: 0.010490857996046543\n", + "( 2 ) xxmaj section 11 of the xxmaj act of 1997 is hereby repealed . xxeos\n", + "\n", + "( 2 ) xxmaj aisghairtear leis seo alt 11 d'acht 1997 .\n", + "\n", + "( 2 ) xxmaj aisghairtear leis seo alt 11 d'acht 1997 . xxeos\n", + "\n", + "\n", + "LOSS: 0.010299509391188622\n", + "22 . — xxmaj the xxmaj second xxmaj schedule to the xxmaj courts and xxmaj court xxmaj officers xxmaj act 1995 is amended by the addition of the following paragraph : xxeos\n", + "\n", + "22 . — xxmaj leasaítear an xxmaj dara xxmaj sceideal a ghabhann le hacht na gcúirteanna agus na noifigeach xxmaj cúirte 1995 tríd an mír seo a leanas a chur leis :\n", + "\n", + "22 . 
— xxmaj leasaítear an xxmaj dara xxmaj sceideal a ghabhann le hacht na gcúirteanna agus na noifigeach xxmaj cúirte 1995 tríd an mír seo a leanas a chur leis : xxeos\n", + "\n", + "\n", + "LOSS: 0.010137557983398438\n", + "xxmaj sex ( 3 ) xxmaj general xxmaj health ( 7 ) xxmaj field of xxmaj study ( 45 ) xxeos\n", + "\n", + "xxmaj inscne ( 3 ) xxmaj sláinte xxmaj ginearálta ( 7 ) xxmaj réimse xxmaj staidéir ( 45 )\n", + "\n", + "xxmaj inscne ( 3 ) xxmaj sláinte xxmaj ginearálta ( 7 ) xxmaj réimse xxmaj staidéir ( 45 ) xxeos\n", + "\n", + "\n", + "LOSS: 0.010010609403252602\n", + "( b ) by the insertion of the following subsection after subsection ( 1 ) of section 5 : xxeos\n", + "\n", + "( b ) tríd an bhfo - alt seo a leanas a chur isteach i ndiaidh fho - alt ( 1 ) d'alt 5 :\n", + "\n", + "( b ) tríd an bhfo - alt seo a leanas a chur isteach i ndiaidh fho - alt ( 1 ) d'alt 5 : xxeos\n", + "\n", + "\n", + "LOSS: 0.009908217005431652\n", + "( g ) in subsection ( 9 ) , by substituting for paragraph ( a ) the following : xxeos\n", + "\n", + "( g ) i bhfo - alt ( 9 ) , tríd an méid seo a leanas a chur in ionad mhír ( a ) :\n", + "\n", + "( g ) i bhfo - alt ( 9 ) , tríd an méid seo a leanas a chur in ionad mhír ( a ) : xxeos\n", + "\n", + "\n", + "LOSS: 0.009814311750233173\n", + "( a ) in paragraph 1 of xxmaj part xxup i , “ £ 158 10s . ” shall be substituted for “ £ 132 ” ( inserted by section 16 of the xxmaj increase xxmaj act of 1961 ) , xxeos\n", + "\n", + "( a ) i mír 1 de xxmaj chuid xxup i , cuirfear “ £ 158 10s . ” in ionad “ £ 132 ” ( a cuireadh isteach le halt 16 d'acht xxmaj mhéadú 1961 ) ,\n", + "\n", + "( a ) i mír 1 de xxmaj chuid xxup i , cuirfear “ £ 158 10s . ” in ionad “ £ 132 ” ( a cuireadh isteach le halt 16 d'acht xxmaj mhéadú 1961 ) , xxeos\n", + "\n", + "\n", + "LOSS: 0.00944268237799406\n", + "48 . 
— xxmaj the xxmaj taxes xxmaj consolidation xxmaj act 1997 is amended by the insertion of the following section after section xxunk : xxeos\n", + "\n", + "48 . — xxmaj leasaítear an tacht xxmaj comhdhlúite xxmaj cánacha 1997 tríd an alt seo a leanas a chur isteach i ndiaidh alt xxunk :\n", + "\n", + "48 . — xxmaj leasaítear an tacht xxmaj comhdhlúite xxmaj cánacha 1997 tríd an alt seo a leanas a chur isteach i ndiaidh alt xxunk : xxeos\n", + "\n", + "\n", + "LOSS: 0.009387349709868431\n", + "( ii ) on conviction on indictment , to a fine not exceeding £ 100 , xxrep 3 0 or to imprisonment for a term not exceeding five years , or both . xxeos\n", + "\n", + "( ii ) ar é a chiontú ar díotáil , fíneáil nach mó ná £ 100 , xxrep 3 0 nó príosúnacht ar feadh téarma nach faide ná cúig bliana , nó iad araon , a chur air .\n", + "\n", + "( ii ) ar é a chiontú ar díotáil , fíneáil nach mó ná £ 100 , xxrep 3 0 nó príosúnacht ar feadh téarma nach faide ná cúig bliana , nó iad araon , a chur air . xxeos\n", + "\n", + "\n", + "LOSS: 0.009340242482721806\n", + "( xxrep 3 i ) by inserting the following after paragraph ( c ) : xxeos\n", + "\n", + "( xxrep 3 i ) tríd an méid seo a leanas a chur isteach i ndiaidh mhír ( c ) :\n", + "\n", + "( xxrep 3 i ) tríd an méid seo a leanas a chur isteach i ndiaidh mhír ( c ) : xxeos\n", + "\n", + "\n", + "LOSS: 0.009309934452176094\n", + "( e ) the substitution of the following subsection for subsection ( 12 ) : xxeos\n", + "\n", + "( e ) tríd an bhfo - alt seo a leanas a chur in ionad fho - alt ( 12 ) :\n", + "\n", + "( e ) tríd an bhfo - alt seo a leanas a chur in ionad fho - alt ( 12 ) : xxeos\n", + "\n", + "\n", + "LOSS: 0.009225739166140556\n", + "( a ) in subsection ( 1 ) , by substituting for paragraph ( a ) the following : xxeos\n", + "\n", + "( a ) i bhfo - alt ( 1 ) , tríd an méid seo a leanas a chur in ionad mhír ( a ) :\n", + "\n", + "( a ) i bhfo - alt ( 1 ) , tríd an méid seo a leanas a chur in ionad mhír ( a ) : xxeos\n", + "\n", + 
"\n", + "LOSS: 0.00919419713318348\n", + "1 . — this xxmaj act may be cited as the xxmaj petroleum and xxmaj other xxmaj minerals xxmaj development xxmaj act , 1960 . xxeos\n", + "\n", + "1 . — féadfar an tacht um xxmaj fhorbairt xxmaj pheitriliaim agus xxmaj mianraí xxmaj eile , 1960 , a ghairm den xxmaj acht seo .\n", + "\n", + "1 . — féadfar an tacht um xxmaj fhorbairt xxmaj pheitriliaim agus xxmaj mianraí xxmaj eile , 1960 , a ghairm den xxmaj acht seo . xxeos\n", + "\n", + "\n", + "LOSS: 0.00910924281924963\n", + "“ contractor ” has the meaning assigned to it by section 5 ; xxeos\n", + "\n", + "tá le “ conraitheoir ” an bhrí a shanntar dó le halt 5 ;\n", + "\n", + "tá le “ conraitheoir ” an bhrí a shanntar dó le halt 5 ; xxeos\n", + "\n", + "\n", + "LOSS: 0.008987490087747574\n", + "xxmaj number 30 / 2007 : xxup part 1 xxmaj preliminary and xxmaj general xxmaj home xxeos\n", + "\n", + "xxmaj uimhir 30 / 2007 : xxup cuid 1 xxmaj réamhráiteach agus xxmaj ginearálta\n", + "\n", + "xxmaj uimhir 30 / 2007 : xxup cuid 1 xxmaj réamhráiteach agus xxmaj ginearálta xxeos\n", + "\n", + "\n", + "LOSS: 0.008874460123479366\n", + "“ the xxmaj act of 1952 ” means the xxmaj housing ( amendment ) xxmaj act , 1952 ( no . 16 of 1952 ) ; xxeos\n", + "\n", + "ciallaíonn “ acht 1952 ” xxmaj acht na dtithe ( leasú ) , 1952 ( uimh. 16 de 1952 ) ;\n", + "\n", + "ciallaíonn “ acht 1952 ” xxmaj acht na dtithe ( leasú ) , 1952 ( uimh. 16 de 1952 ) ; xxeos\n", + "\n", + "\n", + "LOSS: 0.008825826458632946\n", + "( 2 ) xxmaj this section shall come into operation on the 6th day of xxmaj april , 1983 . xxeos\n", + "\n", + "( 2 ) xxmaj tiocfaidh an t - alt seo i ngníomh an 6ú lá d'aibreán , 1983 .\n", + "\n", + "( 2 ) xxmaj tiocfaidh an t - alt seo i ngníomh an 6ú lá d'aibreán , 1983 . xxeos\n", + "\n", + "\n", + "LOSS: 0.008588433265686035\n", + "66 . — ( 1 ) xxmaj section 766 of the xxmaj principal xxmaj act is amended — xxeos\n", + "\n", + "66 . 
— ( 1 ) xxmaj leasaítear alt 766 den phríomh - acht —\n", + "\n", + "66 . — ( 1 ) xxmaj leasaítear alt 766 den phríomh - acht — xxeos\n", + "\n", + "\n", + "LOSS: 0.008027076721191406\n", + "“ the xxmaj act of 1976 ” means the xxmaj finance xxmaj act , 1976 ; xxeos\n", + "\n", + "ciallaíonn “ acht 1976 ” an tacht xxmaj airgeadais , 1976 ;\n", + "\n", + "ciallaíonn “ acht 1976 ” an tacht xxmaj airgeadais , 1976 ; xxeos\n", + "\n", + "\n", + "LOSS: 0.007852774113416672\n", + "xxmaj type of xxmaj household ( 8) xxmaj religion ( 11 ) censusyear xxeos\n", + "\n", + "xxmaj cineál xxmaj teaghlaigh ( 8) xxmaj creideamh ( 11 ) bliaindaonáirimh\n", + "\n", + "xxmaj cineál xxmaj teaghlaigh ( 8) xxmaj creideamh ( 11 ) bliaindaonáirimh xxeos\n", + "\n", + "\n", + "LOSS: 0.007078517694026232\n", + "“ ( xxrep 3 i ) in the case of a person — xxeos\n", + "\n", + "“ ( xxrep 3 i ) i gcás duine —\n", + "\n", + "“ ( xxrep 3 i ) i gcás duine — xxeos\n", + "\n", + "\n", + "LOSS: 0.007061743643134832\n", + "28 . — ( 1 ) xxmaj section 224 of the xxmaj principal xxmaj act is hereby amended by the substitution for subsections ( 1 ) and ( 2 ) of the following subsections : xxeos\n", + "\n", + "28 . — ( 1 ) xxmaj leasaítear leis seo alt 224 den phríomh - acht trí na fo - ailt seo a leanas a chur in ionad fho - ailt ( 1 ) agus ( 2 ) :\n", + "\n", + "28 . — ( 1 ) xxmaj leasaítear leis seo alt 224 den phríomh - acht trí na fo - ailt seo a leanas a chur in ionad fho - ailt ( 1 ) agus ( 2 ) : xxeos\n", + "\n", + "\n", + "LOSS: 0.007009965367615223\n", + "xxmaj the xxmaj finance ( customs xxmaj duties ) ( no . 4 ) xxmaj act , 1932 ( no . 34 of 1932 ) , section 3 . xxeos\n", + "\n", + "xxmaj an tacht xxmaj airgid ( diúitéthe xxmaj custum ) ( uimh. 4 ) , 1932 ( uimh. 34 de 1932 ) , alt 3 .\n", + "\n", + "xxmaj an tacht xxmaj airgid ( diúitéthe xxmaj custum ) ( uimh. 4 ) , 1932 ( uimh. 34 de 1932 ) , alt 3 . 
xxeos\n", + "\n", + "\n", + "LOSS: 0.006748199462890625\n", + "( xxrep 3 i ) in subsection ( 5 ) — xxeos\n", + "\n", + "( xxrep 3 i ) i bhfo - alt ( 5 ) —\n", + "\n", + "( xxrep 3 i ) i bhfo - alt ( 5 ) — xxeos\n", + "\n", + "\n", + "LOSS: 0.006730107590556145\n", + "2 . — this xxmaj act may be cited as the xxmaj restrictive xxmaj trade xxmaj practices ( confirmation of xxmaj order ) ( no . 3 ) xxmaj act , 1956 . xxeos\n", + "\n", + "2 . — féadfar an tacht um xxmaj chleachtais xxmaj srianta xxmaj trádála ( ordú a xxmaj dhaingniú ) ( uimh. 3 ) , 1956 , a ghairm den xxmaj acht seo .\n", + "\n", + "2 . — féadfar an tacht um xxmaj chleachtais xxmaj srianta xxmaj trádála ( ordú a xxmaj dhaingniú ) ( uimh. 3 ) , 1956 , a ghairm den xxmaj acht seo . xxeos\n", + "\n", + "\n", + "LOSS: 0.006541644688695669\n", + "xxmaj nature of xxmaj occupancy ( 8) xxmaj towns by xxmaj size ( 205 ) censusyear xxeos\n", + "\n", + "xxmaj cineál xxmaj seilbhe ( 8) xxmaj bailte de réir xxmaj méide ( 205 ) bliaindaonáirimh\n", + "\n", + "xxmaj cineál xxmaj seilbhe ( 8) xxmaj bailte de réir xxmaj méide ( 205 ) bliaindaonáirimh xxeos\n", + "\n", + "\n", + "LOSS: 0.006427327636629343\n", + "( a ) by the substitution of “ £ 150 ” for “ twenty pounds ” in paragraph ( a ) , and xxeos\n", + "\n", + "( a ) trí “ £ 150 ” a chur in ionad “ fiche punt ” i mír ( a ) , agus\n", + "\n", + "( a ) trí “ £ 150 ” a chur in ionad “ fiche punt ” i mír ( a ) , agus xxeos\n", + "\n", + "\n", + "LOSS: 0.005883284844458103\n", + "35 . — xxmaj section 90 of the xxmaj corporation xxmaj tax xxmaj act , 1976 , is hereby amended by the addition to subsection ( 4 ) ( as amended by the xxmaj act of 1978 ) of the following proviso : xxeos\n", + "\n", + "35 . — leasaítear leis seo alt 90 den xxmaj acht xxmaj cánach xxmaj corparáide , 1976 , tríd an gcoinníoll seo a leanas a chur le fo - alt ( 4 ) ( arna leasú le hacht 1978 ) :\n", + "\n", + "35 . 
— leasaítear leis seo alt 90 den xxmaj acht xxmaj cánach xxmaj corparáide , 1976 , tríd an gcoinníoll seo a leanas a chur le fo - alt ( 4 ) ( arna leasú le hacht 1978 ) : xxeos\n", + "\n", + "\n", + "LOSS: 0.005392891820520163\n", + "81 . — section 12 of the xxmaj principal xxmaj act is hereby amended — xxeos\n", + "\n", + "81 . — leasaítear leis seo alt 12 den phríomh - acht —\n", + "\n", + "81 . — leasaítear leis seo alt 12 den phríomh - acht — xxeos\n", + "\n", + "\n", + "LOSS: 0.005264571402221918\n", + "2 . — ( 1 ) xxmaj this xxmaj act may be cited as the xxmaj rent xxmaj restrictions ( temporary xxmaj provisions ) ( continuance ) xxmaj act , 1981 . xxeos\n", + "\n", + "2 . — ( 1 ) xxmaj féadfar an tacht xxmaj srianta xxmaj cíosa ( forálacha xxmaj sealadacha ) ( buanú ) , 1981 , a ghairm den xxmaj acht seo .\n", + "\n", + "2 . — ( 1 ) xxmaj féadfar an tacht xxmaj srianta xxmaj cíosa ( forálacha xxmaj sealadacha ) ( buanú ) , 1981 , a ghairm den xxmaj acht seo . xxeos\n", + "\n", + "\n", + "LOSS: 0.004902585409581661\n", + "xxmaj sex ( 3 ) xxmaj disability xxmaj type ( 14 ) censusyear xxeos\n", + "\n", + "xxmaj inscne ( 3 ) xxmaj cineál an xxmaj mhàchumais ( 14 ) bliaindaonáirimh\n", + "\n", + "xxmaj inscne ( 3 ) xxmaj cineál an xxmaj mhàchumais ( 14 ) bliaindaonáirimh xxeos\n", + "\n", + "\n", + "LOSS: 0.004811659920960665\n", + "4 . — the xxmaj act of 1976 is hereby amended by the substitution of the following section for section 29 : xxeos\n", + "\n", + "4 . — leasaítear leis seo xxmaj acht 1976 tríd an alt seo a leanas a chur in ionad alt 29 :\n", + "\n", + "4 . — leasaítear leis seo xxmaj acht 1976 tríd an alt seo a leanas a chur in ionad alt 29 : xxeos\n", + "\n", + "\n", + "LOSS: 0.004624366760253906\n", + "7 . — ( 1 ) xxmaj this xxmaj act may be cited as the xxmaj postal and xxmaj telecommunications xxmaj services ( amendment ) xxmaj act , 1984 . xxeos\n", + "\n", + "7 . 
— ( 1 ) xxmaj féadfar an tacht xxmaj seirbhísí xxmaj poist agus xxmaj teileachumarsáide ( leasú ) , 1984 , a ghairm den xxmaj acht seo .\n", + "\n", + "7 . — ( 1 ) xxmaj féadfar an tacht xxmaj seirbhísí xxmaj poist agus xxmaj teileachumarsáide ( leasú ) , 1984 , a ghairm den xxmaj acht seo . xxeos\n", + "\n", + "\n", + "LOSS: 0.00429901946336031\n", + "xxmaj sex ( 3 ) xxmaj aggregate xxmaj town or xxmaj rural xxmaj area ( 3 ) xxmaj province xxmaj county or xxmaj city ( 44 ) xxeos\n", + "\n", + "xxmaj inscne ( 3 ) xxmaj ceantar xxmaj iomlán xxmaj bhaile nó xxmaj ceantar xxmaj iomlán xxmaj tuaithe ( 3 ) xxmaj cúige , xxmaj contae nó xxmaj cathair ( 44 )\n", + "\n", + "xxmaj inscne ( 3 ) xxmaj ceantar xxmaj iomlán xxmaj bhaile nó xxmaj ceantar xxmaj iomlán xxmaj tuaithe ( 3 ) xxmaj cúige , xxmaj contae nó xxmaj cathair ( 44 ) xxeos\n", + "\n", + "\n", + "LOSS: 0.0040727341547608376\n", + "101 . — ( 1 ) xxmaj the xxmaj principal xxmaj act is amended — xxeos\n", + "\n", + "101 . — ( 1 ) xxmaj leasaítear an príomh - acht —\n", + "\n", + "101 . — ( 1 ) xxmaj leasaítear an príomh - acht — xxeos\n", + "\n", + "\n", + "LOSS: 0.0039478447288274765\n", + "56 . — xxmaj section 208 of the xxmaj principal xxmaj act is amended — xxeos\n", + "\n", + "56 . — xxmaj leasaítear alt 208 den phríomh - acht —\n", + "\n", + "56 . — xxmaj leasaítear alt 208 den phríomh - acht — xxeos\n", + "\n", + "\n", + "LOSS: 0.003833770751953125\n", + "xxmaj number 5 / 2005 : xxup part 5 xxmaj capital xxmaj acquisitions xxmaj tax xxmaj home xxeos\n", + "\n", + "xxmaj uimhir 5 / 2005 : xxup cuid 5 xxmaj cáin xxmaj fháltas xxmaj caipitiúil\n", + "\n", + "xxmaj uimhir 5 / 2005 : xxup cuid 5 xxmaj cáin xxmaj fháltas xxmaj caipitiúil xxeos\n", + "\n", + "\n", + "LOSS: 0.0036608653608709574\n", + "12 . — ( 1 ) xxmaj this xxmaj act may be cited as the xxmaj adoption xxmaj act , 1964 . xxeos\n", + "\n", + "12 . 
— ( 1 ) xxmaj féadfar an tacht xxmaj uchtála , 1964 , a ghairm den xxmaj acht seo .\n", + "\n", + "12 . — ( 1 ) xxmaj féadfar an tacht xxmaj uchtála , 1964 , a ghairm den xxmaj acht seo . xxeos\n", + "\n", + "\n", + "LOSS: 0.0034489380195736885\n", + "xxmaj number 2 / 1 xxrep 3 9 : xxup part 5 xxmaj residential xxmaj property xxmaj tax xxmaj home xxeos\n", + "\n", + "xxmaj uimhir 2 / 1 xxrep 3 9 : xxup cuid 5 xxmaj cáin xxmaj mhaoine xxmaj cónaithe\n", + "\n", + "xxmaj uimhir 2 / 1 xxrep 3 9 : xxup cuid 5 xxmaj cáin xxmaj mhaoine xxmaj cónaithe xxeos\n", + "\n", + "\n" + ] + } + ], + "source": [ + "n_losses = 50\n", + "top_losses = loss_sorted_idxs[-n_losses:]\n", + "\n", + "for i in top_losses:\n", + " print(f'LOSS: {float(loss_ls[i])}')\n", + " print(en_ls[i])\n", + " print()\n", + " print(ga_trg_ls[i])\n", + " print()\n", + " print(ga_pred_ls[i])\n", + " print()\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Manual results" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos dia dia duit , conas atá tú ? xxeos'" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"hello, how are you?\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj an féidir leat a insint dúinn i gcás go bhfuil an stáisiún bus le do thoil ? 
xxeos'" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"Can you tell we where the bus station is please?\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj inné xxunk sé , ach amárach beidh amárach a bheith an - xxunk xxeos'" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"Yesterday it rained, but tomorrow will be very sunny\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj bhí mé lá mór , xxmaj is é mo aistritheoir ag obair xxeos'" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"I had a great day, my translator is working\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj mar sin , tá sé seo scéal ar fad faoi conas mo shaol fuair smeach xxunk síos , mar sin mhaith liom a ghlacadh nóiméad díreach suí ceart go bhfuil , beidh mé go léir faoi conas a tháinig mé an xxunk úr xxeos'" + ] + }, + "execution_count": 166, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"So this is a story all about how my life got flip turned \\\n", + "upside down, so I'd like to take a minute just sit right there, I'll you all about how I became the fresh prince\\\n", + "of belair\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos madra xxeos'" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "generate(learn.model, \"dog\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos cat cat cat xxeos'" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"cat\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos crann crann xxeos'" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"tree\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos foirgneamh tógála xxeos'" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"building\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos xxmaj cathair cathair cathair xxeos'" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"city\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos bean xxeos'" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"woman\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos fear xxeos'" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"man\", dls.vocab[1])" + ] + }, + { + 
"cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos seacláide xxeos'" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"chocolate\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'xxbos spásárthach spásárthach xxeos'" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "generate(learn.model, \"spaceship\", dls.vocab[1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# https://forums.fast.ai/t/fastai-v2-text/53529/334\n", + "from fastai2.text.all import *\n", + "\n", + "defaults.device = torch.device('cpu')\n", + "path = Path('.')\n", + "learner = load_learner(\"./export.pkl\")\n", + "\n", + "f = open(\"/tmp/test.txt\", \"r\")\n", + "test_file_contents = f.read()\n", + "\n", + "_, _, losses = learner.predict(test_file_contents)\n", + "cats = [learner.dls.categorize.decode(i) for i in range(len(losses))]\n", + "\n", + "predictions = sorted(\n", + " zip(cats, map(float, losses)),\n", + " key=lambda p: p[1],\n", + " reverse=True\n", + ")\n", + "print(predictions)\n", + "\n", + "# OR\n", + "\n", + "items = pd.read_csv(\"/tmp/test.txt\", sep = '\\t')\n", + "test_dl = learner.dls.test_dl(items.values)\n", + "\n", + "learner.get_preds(dl=test_dl, with_decoded=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + 
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}