Skip to content

Commit

Permalink
Releasing V0.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
ranzaka committed Jun 25, 2024
1 parent 93c21d4 commit 4766852
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 47 deletions.
100 changes: 55 additions & 45 deletions examples/examples.ipynb
Original file line number Diff line number Diff line change
@@ -1,15 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"../src/\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
Expand Down Expand Up @@ -138,21 +128,21 @@
{
"data": {
"text/plain": [
"[0,\n",
" 102,\n",
" 59,\n",
" 65,\n",
" 43,\n",
" 25,\n",
" 27,\n",
" 65,\n",
" 43,\n",
" 12,\n",
" 40,\n",
" 65,\n",
" 98,\n",
"[32,\n",
" 17,\n",
" 105,\n",
" 15,\n",
" 82,\n",
" 24,\n",
" 65,\n",
" 31,\n",
" 15,\n",
" 82,\n",
" 52,\n",
" 7,\n",
" 15,\n",
" 116,\n",
" 110,\n",
" 15,\n",
" 126,\n",
" 126,\n",
" 127,\n",
Expand Down Expand Up @@ -326,6 +316,26 @@
"assert loaded_tokenizer(text, truncate_and_pad=True)==tokenizer(text, truncate_and_pad=True)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<sinlib.tokenizer.Tokenizer at 0x10814b790>"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Tokenizer(max_length=90).load_from_pretrained('asKDS')"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -335,7 +345,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -344,7 +354,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"outputs": [
{
Expand All @@ -361,7 +371,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -370,7 +380,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -379,7 +389,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"metadata": {},
"outputs": [
{
Expand All @@ -388,7 +398,7 @@
"'me athara poson poho dina paniwudayak nikuth karamin janadhipathiwaraya penwa denne mihidu maharahathan wahanse visi.... In linguistics, romanization is the conversion...., adding special chars ^^*#(&#&$^)'"
]
},
"execution_count": 22,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -406,7 +416,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -415,7 +425,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 25,
"metadata": {},
"outputs": [
{
Expand All @@ -424,7 +434,7 @@
"271"
]
},
"execution_count": 24,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -435,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -444,7 +454,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 27,
"metadata": {},
"outputs": [
{
Expand All @@ -463,7 +473,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 28,
"metadata": {},
"outputs": [
{
Expand All @@ -472,7 +482,7 @@
"' ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී \\n'"
]
},
"execution_count": 27,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -483,7 +493,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 29,
"metadata": {},
"outputs": [
{
Expand All @@ -492,7 +502,7 @@
"'මේ අතර පොසොන් පොහෝ දින පණිවුඩයක් නිකුත් කරමින් ජනාධිපතිවරයා පෙන්වා දෙන්නේ මිහිඳු මහරහතන් වහන්සේ විසි.... , ...., ^^*#(&#&$^)ශ්\\u200dරී ලංකා ප්\\u200dරජාතාන්ත්\\u200dරික සමාජවාදී'"
]
},
"execution_count": 28,
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -503,7 +513,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 30,
"metadata": {},
"outputs": [
{
Expand All @@ -512,7 +522,7 @@
"' ^^*#(&#&$^)ශ්රී ලංකා ප්රජාතාන්ත්රික සමාජවාදී '"
]
},
"execution_count": 29,
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -523,7 +533,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 31,
"metadata": {},
"outputs": [
{
Expand All @@ -532,7 +542,7 @@
"0.610738255033557"
]
},
"execution_count": 30,
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -543,7 +553,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 32,
"metadata": {},
"outputs": [
{
Expand All @@ -552,7 +562,7 @@
"1.0"
]
},
"execution_count": 31,
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "sinlib"
version = "0.0.9"
version = "0.1.0"
description = "Sinhala NLP Toolkit"
authors = [
{ name = "Ransaka", email = "ransaka.ravihara@gmail.com" }
Expand Down
2 changes: 1 addition & 1 deletion src/sinlib/utils/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def load_char_mapper(char_mapper_fp):


def load_default_vocab_map():
with open(DEFAULT_VOCAB_MAP_FP, "r") as f:
with open(Path(DEFAULT_VOCAB_MAP_FP) / "vocab.json", "r") as f:
vocab_map = json.load(f)
return vocab_map

Expand Down

0 comments on commit 4766852

Please sign in to comment.