diff --git a/examples/examples.ipynb b/examples/examples.ipynb index ca19b2c..63d35b1 100644 --- a/examples/examples.ipynb +++ b/examples/examples.ipynb @@ -1,15 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "sys.path.append(\"../src/\")" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -138,21 +128,21 @@ { "data": { "text/plain": [ - "[0,\n", - " 102,\n", - " 59,\n", - " 65,\n", - " 43,\n", - " 25,\n", - " 27,\n", - " 65,\n", - " 43,\n", - " 12,\n", - " 40,\n", - " 65,\n", - " 98,\n", + "[32,\n", + " 17,\n", + " 105,\n", + " 15,\n", + " 82,\n", " 24,\n", - " 65,\n", + " 31,\n", + " 15,\n", + " 82,\n", + " 52,\n", + " 7,\n", + " 15,\n", + " 116,\n", + " 110,\n", + " 15,\n", " 126,\n", " 126,\n", " 127,\n", @@ -326,6 +316,26 @@ "assert loaded_tokenizer(text, truncate_and_pad=True)==tokenizer(text, truncate_and_pad=True)" ] }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Tokenizer(max_length=90).load_from_pretrained('asKDS')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -335,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -344,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -361,7 +371,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -370,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -379,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -388,7 +398,7 @@ "'me athara poson poho dina paniwudayak nikuth karamin janadhipathiwaraya penwa denne mihidu maharahathan wahanse visi.... In linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)'" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -406,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -415,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -424,7 +434,7 @@ "271" ] }, - "execution_count": 24, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -435,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -444,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -463,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -472,7 +482,7 @@ "' ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී \\n'" ] }, - "execution_count": 27, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -483,7 +493,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -492,7 +502,7 @@ "'මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... , ...., ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'" ] }, - "execution_count": 28, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -503,7 +513,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -512,7 +522,7 @@ "' ^^*#(&#&$^)ශ්රී ලංකා ප්රජාතාන්ත්රික සමාජවාදී '" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -523,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -532,7 +542,7 @@ "0.610738255033557" ] }, - "execution_count": 30, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -543,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -552,7 +562,7 @@ "1.0" ] }, - "execution_count": 31, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } diff --git a/pyproject.toml b/pyproject.toml index 31c545a..4cb6cf3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "sinlib" -version = "0.0.9" +version = "0.1.0" description = "Sinhala NLP Toolkit" authors = [ { name = "Ransaka", email = "ransaka.ravihara@gmail.com" } diff --git a/src/sinlib/utils/preprocessing.py b/src/sinlib/utils/preprocessing.py index 442cca6..4c88817 100644 --- a/src/sinlib/utils/preprocessing.py +++ b/src/sinlib/utils/preprocessing.py @@ -28,7 +28,7 @@ def load_char_mapper(char_mapper_fp): def load_default_vocab_map(): - with open(DEFAULT_VOCAB_MAP_FP, "r") as f: + with open(Path(DEFAULT_VOCAB_MAP_FP) / "vocab.json", "r") as f: vocab_map = json.load(f) return vocab_map