forked from karpathy/nanoGPT
-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #161 from klei22/add_korean_parallel_corpora
Add scripts compatible with the korean parallel corpora
- Loading branch information
Showing
7 changed files
with
260 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
# Korean-Parallel-Corpora | ||
|
||
This folder contains scripts and resources for working with the | ||
korean-parallel-corpora dataset. | ||
|
||
The Moo/korean-parallel-corpora is designed for tasks related to language | ||
translation, specifically translating between Korean and English. | ||
|
||
## Dataset Description: | ||
- **URL:** [Moo/korean-parallel-corpora on Huggingface](https://huggingface.co/datasets/Moo/korean-parallel-corpora/discussions/1) | ||
- **Tasks:** Translation | ||
- **Languages:** Korean, English | ||
- **License:** [cc-by-sa-3.0](https://creativecommons.org/licenses/by-sa/3.0/) | ||
|
||
## Script Overview: | ||
Scripts specific to this dataset are described below. | ||
|
||
### Obtaining dataset with `get_dataset.py` | ||
To obtain the dataset locally: | ||
|
||
```bash | ||
python3 get_dataset.py | ||
``` | ||
|
||
This will create the following files: | ||
- `data.json` - json containing Korean, English, and jamo language entries | ||
- `input.txt` - text format with prefixed entries according to args sent to the `get_dataset.py` file. | ||
|
||
### Testing Korean <-> Jamo Conversion | ||
|
||
This script is an example of how to convert to jamo and back: | ||
```bash | ||
python3 korean_jamo_conversion_test.py | ||
``` | ||
This will allow the model to be optionally trained on jamo but still recover | ||
the original Korean text. | ||
|
||
## License: | ||
|
||
The dataset is listed at Huggingface under the Creative Commons Attribution-ShareAlike 3.0 License. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
import pandas as pd | ||
import requests | ||
import os | ||
import argparse | ||
import json | ||
from tqdm import tqdm | ||
from jamo import h2j, j2hcj, is_hangul_char | ||
|
||
def download_file(url, filename):
    """
    Download a file from a given URL with a progress bar, only if it is not already present.

    The body is streamed into ``filename + ".part"`` and moved to ``filename``
    only after the transfer loop has finished. An interrupted or failed run
    therefore never leaves a truncated ``filename`` behind that a later run
    would report as "already downloaded".

    Args:
        url: Address to fetch with an HTTP GET request.
        filename: Destination path. If it already exists, nothing is fetched.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if os.path.exists(filename):
        print(f"{filename} already downloaded.")
        return

    block_size = 1024  # 1 Kibibyte
    partial_path = f"{filename}.part"
    received = 0
    try:
        # The timeout keeps a stalled server from hanging the script forever;
        # the context manager releases the connection when we are done.
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()  # Ensure the download was successful.
            total_size = int(response.headers.get("content-length", 0))
            with open(partial_path, "wb") as f, tqdm(
                total=total_size, unit="iB", unit_scale=True, desc="Downloading file"
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    progress_bar.update(len(data))
                    f.write(data)
                    received += len(data)
    except BaseException:
        # Also covers Ctrl-C: discard the partial file, then re-raise unchanged.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    # The transfer loop ended without an exception, so promote the file.
    os.replace(partial_path, filename)

    # A size mismatch is only reported, not treated as fatal: the file is kept.
    # NOTE(review): content-length may legitimately differ from the number of
    # decoded bytes when the server compresses the response - confirm.
    if total_size != 0 and received != total_size:
        print("Error: Failed to download the file completely.")
    else:
        print(f"Downloaded {filename}")
|
||
def format_hangul_text(text):
    """Pad every Hangul character in ``text`` with one space on each side.

    Characters for which ``is_hangul_char`` is false are copied through
    unchanged. Because each Hangul character receives its own leading and
    trailing space, two neighbouring Hangul characters end up separated by
    two spaces.
    """
    return ''.join(
        f' {char} ' if is_hangul_char(char) else char
        for char in text
    )
|
||
def korean_to_phonetic(text):
    """Converts Korean text to its phonetic representation.

    Original spaces are first replaced with the '▁' marker so they remain
    distinguishable from the padding spaces that ``format_hangul_text`` adds
    around each Hangul character. The padded text is then passed through
    ``h2j`` followed by ``j2hcj`` and the result is returned.
    """
    marked = text.replace(' ', '▁')
    padded = format_hangul_text(marked)
    return j2hcj(h2j(padded))
|
||
def process_csv(csv_path, json_path, txt_path, order, prefixes):
    """
    Process CSV file and save data as JSON and text with progress bars.

    Args:
        csv_path: Path of the input CSV; it must provide 'ko' and 'en' columns.
        json_path: Where to write the JSON list of {'ko', 'en', 'ph'} records.
        txt_path: Where to write the prefixed plain-text training file.
        order: Iterable of field names ('ko', 'en', 'ph') giving the order in
            which fields are written per record; unknown names are ignored.
        prefixes: Mapping with 'ko', 'en' and 'ph' keys holding the prefix
            string placed in front of each field in the text file.

    Both output files are written as UTF-8 regardless of the platform locale,
    so Korean text does not depend on the system default encoding. The JSON
    records hold the raw values for all three fields; the newline and prefix
    decoration is applied to the text file only.
    """
    # dtype=str keeps every cell a string, so the prefix concatenation below
    # never meets a value that pandas inferred as a number.
    df = pd.read_csv(csv_path, keep_default_na=False, dtype=str)
    data = []
    with open(txt_path, 'w', encoding='utf-8') as txt_file, tqdm(
        total=len(df), desc="Processing records", unit="records"
    ) as progress_bar:
        # Iterating the two columns directly avoids building a Series per row.
        for ko_text, en_text in zip(df['ko'], df['en']):
            ph_text = korean_to_phonetic(ko_text)
            entries = {
                'ko': '\n' + prefixes['ko'] + ko_text,
                'en': '\n' + prefixes['en'] + en_text,
                'ph': '\n' + prefixes['ph'] + ph_text,
            }
            line_content = [entries[item] for item in order if item in entries]
            txt_file.write(' '.join(line_content) + "\n")
            data.append({'ko': ko_text, 'en': en_text, 'ph': ph_text})
            progress_bar.update(1)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Converted {csv_path} to JSON at {json_path} and to text at {txt_path}")
|
||
def main(url, json_path, txt_path, order, ko_prefix, en_prefix, ph_prefix):
    """Download the CSV at ``url`` and convert it to JSON and text outputs.

    Args:
        url: Location of the CSV file to download.
        json_path: Output path handed to ``process_csv`` for the JSON file.
        txt_path: Output path handed to ``process_csv`` for the text file.
        order: Comma-separated field names ('ko', 'en', 'ph'); whitespace
            around each name is ignored, so "ko, en" behaves like "ko,en".
        ko_prefix: Prefix for Korean text.
        en_prefix: Prefix for English text.
        ph_prefix: Prefix for phonetic text.
    """
    from urllib.parse import urlsplit

    download_dir = "./downloaded_files"
    os.makedirs(download_dir, exist_ok=True)

    # Take the name from the URL path only, so query strings or fragments
    # (even ones containing '/') never leak into the local file name. Fall
    # back to a fixed name when the path ends with a slash.
    file_name = os.path.basename(urlsplit(url).path) or "dataset.csv"
    csv_path = os.path.join(download_dir, file_name)
    download_file(url, csv_path)

    # Strip each token so a space after a comma does not silently drop a field.
    fields = [item.strip() for item in order.split(',')]
    prefixes = {'ko': ko_prefix, 'en': en_prefix, 'ph': ph_prefix}
    process_csv(csv_path, json_path, txt_path, fields, prefixes)
|
||
if __name__ == "__main__":
    # Command-line entry point: collect the options and hand them to main().
    cli = argparse.ArgumentParser(
        description="Convert CSV file to JSON and text file with phonetic representation of Korean text.",
    )
    cli.add_argument(
        "--url",
        type=str,
        default="https://huggingface.co/datasets/Moo/korean-parallel-corpora/resolve/main/train.csv?download=true",
        help="URL to download the CSV file from.",
    )
    cli.add_argument(
        "-o",
        "--output_json_file",
        type=str,
        default="data.json",
        help="Path to the output JSON file.",
    )
    cli.add_argument(
        "-t",
        "--output_txt_file",
        type=str,
        default="input.txt",
        help="Path to the output text file.",
    )
    cli.add_argument(
        "--order",
        type=str,
        default="ko,en,ph",
        help="Comma-separated order of fields to write to the text file: ko, en, ph.",
    )
    cli.add_argument("--ko_prefix", type=str, default="ko:", help="Prefix for Korean text.")
    cli.add_argument("--en_prefix", type=str, default="en:", help="Prefix for English text.")
    cli.add_argument("--ph_prefix", type=str, default="ph:", help="Prefix for Phonetic text.")
    opts = cli.parse_args()
    main(
        url=opts.url,
        json_path=opts.output_json_file,
        txt_path=opts.output_txt_file,
        order=opts.order,
        ko_prefix=opts.ko_prefix,
        en_prefix=opts.en_prefix,
        ph_prefix=opts.ph_prefix,
    )
|
36 changes: 36 additions & 0 deletions
36
data/korean-parallel-corpora/korean_jamo_conversion_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
from jamo import h2j, j2hcj, j2h, is_jamo | ||
|
||
def korean_to_phonetic(text):
    """Converts Korean text to its phonetic representation.

    The text is first decomposed from Hangul into individual jamos with
    ``h2j``; ``j2hcj`` then maps those jamos to Hangul compatibility jamos,
    which are easier to read.
    """
    return j2hcj(h2j(text))
|
||
# Example usage: show the forward conversion of a space-separated phrase.
korean_text = "안 녕 하 세 요"
phonetic_text = korean_to_phonetic(korean_text)
print("Original:", korean_text)
print("Phonetic:", phonetic_text.split(" "))

# Test string: a fixed jamo sequence used for the reverse conversion below.
phonetic_text = "ㅇㅏㄴ ㄴㅕㅇ ㅎㅏ ㅅㅔ ㅇㅛ"

reconstructed_list = []
for pho in phonetic_text.split(" "):
    if not pho:
        # Empty pieces come from consecutive spaces; nothing to rebuild.
        continue
    if is_jamo(pho[0]):
        # A jamo group is recombined into a single Hangul syllable.
        syllable = j2h(*pho)
    else:
        # Anything else is kept, turning the '▁' marker back into a space.
        syllable = pho.replace('▁', ' ')
    reconstructed_list.append(syllable)

# Reconstructed phrase
print(''.join(reconstructed_list))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../template/prepare.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../template/utils/meta_util.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
i: | ||
I | ||
iI | ||
eI | ||
a | ||
A: | ||
Q | ||
0 | ||
' | ||
O: | ||
U | ||
u: | ||
V | ||
@ | ||
eI | ||
aI | ||
OI | ||
aU | ||
oU | ||
p | ||
b | ||
t | ||
d | ||
k | ||
g | ||
f | ||
v | ||
T | ||
D | ||
s | ||
z | ||
S | ||
Z | ||
h | ||
m | ||
n | ||
N | ||
l | ||
r | ||
w | ||
j | ||
iu | ||
i | ||
e | ||
o | ||
u | ||
W | ||
A | ||
y | ||
E | ||
ME | ||
O | ||
oo | ||
ou | ||
ye | ||
|
||
\n | ||
\r | ||
: | ||
, | ||
F | ||
C | ||
Y | ||
? | ||
. | ||
B | ||
c | ||
R | ||
M | ||
L | ||
c | ||
; | ||
! | ||
H | ||
P | ||
q | ||
|
||
G | ||
- | ||
x | ||
$ | ||
& | ||
3 | ||
J | ||
K | ||
X | ||
_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../template/utils/txt_to_phonemes.sh |