Merge pull request #161 from klei22/add_korean_parallel_corpora
Add scripts compatible with the Korean parallel corpora
gkielian authored Apr 29, 2024
2 parents a8a138b + 0caf61f commit c42743a
Showing 7 changed files with 260 additions and 0 deletions.
41 changes: 41 additions & 0 deletions data/korean-parallel-corpora/README.md
@@ -0,0 +1,41 @@
# Korean-Parallel-Corpora

This folder contains scripts and resources for working with the
korean-parallel-corpora dataset.

The Moo/korean-parallel-corpora dataset is designed for language translation
tasks, specifically translating between Korean and English.

## Dataset Description:
- **URL:** [Moo/korean-parallel-corpora on Hugging Face](https://huggingface.co/datasets/Moo/korean-parallel-corpora/discussions/1)
- **Tasks:** Translation
- **Languages:** Korean, English
- **License:** [cc-by-sa-3.0](https://creativecommons.org/licenses/by-sa/3.0/)

## Script Overview:
Scripts specific to this dataset are described below.

### Obtaining the dataset with `get_dataset.py`
To obtain the dataset locally:

```bash
python3 get_dataset.py
```

This will create the following files:
- `data.json` - JSON file containing the Korean, English, and jamo entries
- `input.txt` - text file with prefixed entries, ordered according to the arguments passed to `get_dataset.py` (see the example below)
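
The field order and prefixes can be adjusted with command-line flags. A hedged illustration (the flag values below are arbitrary examples):

```bash
# Write English before Korean, omit the jamo field, and use custom prefixes
python3 get_dataset.py --order en,ko --en_prefix "english: " --ko_prefix "korean: "
```

With the default settings, each CSV record is written to `input.txt` as newline-prefixed fields joined in the requested order, roughly `ko:<Korean sentence>`, `en:<English sentence>`, and `ph:<jamo form>` on consecutive lines.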

### Testing Korean <-> Jamo Conversion

This script demonstrates how to convert Korean text to jamo and back:
```bash
python3 korean_jamo_conversion_test.py
```
This allows a model to optionally be trained on jamo while still recovering
the original Korean text.
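
For reference, the round trip the script exercises can be sketched in a few lines with the `jamo` package (a minimal sketch; the syllable used here is just an example):

```python
from jamo import h2j, j2hcj, j2h

word = "한"                  # one Hangul syllable
jamos = j2hcj(h2j(word))     # decompose into compatibility jamo: "ㅎㅏㄴ"
restored = j2h(*jamos)       # recompose the syllable from its jamo
assert restored == word
```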

## License:

The dataset is listed on Hugging Face under the Creative Commons Attribution-ShareAlike 3.0 (CC BY-SA 3.0) license.

93 changes: 93 additions & 0 deletions data/korean-parallel-corpora/get_dataset.py
@@ -0,0 +1,93 @@
import pandas as pd
import requests
import os
import argparse
import json
from tqdm import tqdm
from jamo import h2j, j2hcj, is_hangul_char

def download_file(url, filename):
    """
    Download a file from a given URL with a progress bar, only if it is not already present.
    """
    if os.path.exists(filename):
        print(f"{filename} already downloaded.")
        return
    response = requests.get(url, stream=True)
    response.raise_for_status()  # Ensure the download was successful.
    total_size = int(response.headers.get("content-length", 0))
    block_size = 1024  # 1 Kibibyte
    progress_bar = tqdm(total=total_size, unit="iB", unit_scale=True, desc="Downloading file")
    with open(filename, "wb") as f:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()
    if total_size != 0 and progress_bar.n != total_size:
        print("Error: Failed to download the file completely.")
    else:
        print(f"Downloaded {filename}")

def format_hangul_text(text):
    """Format the text so that each Hangul character is surrounded by exactly one space."""
    formatted_text = []
    for char in text:
        if is_hangul_char(char):
            formatted_text.append(' ' + char + ' ')
        else:
            formatted_text.append(char)
    return ''.join(formatted_text)
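
# Illustrative example: format_hangul_text("ab안녕") -> "ab 안  녕 "
# (non-Hangul characters pass through unchanged).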

def korean_to_phonetic(text):
    """Converts Korean text to its phonetic representation."""
    text = text.replace(' ', '▁')
    text = format_hangul_text(text)
    decomposed_text = h2j(text)
    phonetic_text = j2hcj(decomposed_text)
    return phonetic_text
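
# Illustrative example (exact whitespace may vary):
#   korean_to_phonetic("안녕 하세요") yields compatibility jamo such as
#   "ㅇㅏㄴ ㄴㅕㅇ ▁ ㅎㅏ ㅅㅔ ㅇㅛ", with the original space preserved as '▁'.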

def process_csv(csv_path, json_path, txt_path, order, prefixes):
    """
    Process CSV file and save data as JSON and text with progress bars.
    """
    df = pd.read_csv(csv_path, keep_default_na=False)
    data = []
    progress_bar = tqdm(total=len(df), desc="Processing records", unit="records")
    # Write as UTF-8 so the Korean text round-trips on every platform.
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        for _, row in df.iterrows():
            # Compute the jamo form once and reuse it for both outputs.
            ph_text = korean_to_phonetic(row['ko'])
            entries = {
                'ko': '\n' + prefixes['ko'] + row['ko'],
                'en': '\n' + prefixes['en'] + row['en'],
                'ph': '\n' + prefixes['ph'] + ph_text
            }
            line_content = [entries[item] for item in order if item in entries]
            txt_file.write(' '.join(line_content) + "\n")
            # The JSON output stores the raw strings, without prefixes or newlines.
            data.append({'ko': row['ko'], 'en': row['en'], 'ph': ph_text})
            progress_bar.update(1)
    progress_bar.close()
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Converted {csv_path} to JSON at {json_path} and to text at {txt_path}")

def main(url, json_path, txt_path, order, ko_prefix, en_prefix, ph_prefix):
    download_dir = "./downloaded_files"
    os.makedirs(download_dir, exist_ok=True)
    file_name = url.split("/")[-1].split("?")[0]
    csv_path = os.path.join(download_dir, file_name)
    download_file(url, csv_path)
    process_csv(csv_path, json_path, txt_path, order.split(','), {'ko': ko_prefix, 'en': en_prefix, 'ph': ph_prefix})

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Convert CSV file to JSON and text file with phonetic representation of Korean text."
    )
    parser.add_argument("--url", type=str, default="https://huggingface.co/datasets/Moo/korean-parallel-corpora/resolve/main/train.csv?download=true", help="URL to download the CSV file from.")
    parser.add_argument("-o", "--output_json_file", type=str, default="data.json", help="Path to the output JSON file.")
    parser.add_argument("-t", "--output_txt_file", type=str, default="input.txt", help="Path to the output text file.")
    parser.add_argument("--order", type=str, default="ko,en,ph", help="Comma-separated order of fields to write to the text file: ko, en, ph.")
    parser.add_argument("--ko_prefix", type=str, default="ko:", help="Prefix for Korean text.")
    parser.add_argument("--en_prefix", type=str, default="en:", help="Prefix for English text.")
    parser.add_argument("--ph_prefix", type=str, default="ph:", help="Prefix for Phonetic text.")
    args = parser.parse_args()
    main(args.url, args.output_json_file, args.output_txt_file, args.order, args.ko_prefix, args.en_prefix, args.ph_prefix)

36 changes: 36 additions & 0 deletions data/korean-parallel-corpora/korean_jamo_conversion_test.py
@@ -0,0 +1,36 @@
from jamo import h2j, j2hcj, j2h, is_jamo

def korean_to_phonetic(text):
    """Converts Korean text to its phonetic representation."""
    # Convert Hangul to individual jamos
    decomposed_text = h2j(text)
    # Convert jamos back to Hangul compatibility jamos (for readability)
    phonetic_text = j2hcj(decomposed_text)
    return phonetic_text

# Example usage
korean_text = "안 녕 하 세 요"
phonetic_text = korean_to_phonetic(korean_text)
print("Original:", korean_text)
print("Phonetic:", phonetic_text.split(" "))

# Test string
phonetic_text="ㅇㅏㄴ ㄴㅕㅇ ㅎㅏ ㅅㅔ ㅇㅛ"

reconstructed_list = []
for pho in phonetic_text.split(" "):
    if len(pho) == 0:
        # skip empty strings produced by consecutive spaces
        continue
    elif is_jamo(pho[0]):
        # compatibility jamo: recompose into a Hangul syllable
        reconstructed_list.append(j2h(*pho))
    else:
        # special space marker: restore it to a regular space
        reconstructed_list.append(pho.replace('▁', ' '))

# Reconstructed phrase
print(''.join(reconstructed_list))
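# Expected reconstruction from the test string above (illustrative): 안녕하세요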

1 change: 1 addition & 0 deletions data/korean-parallel-corpora/prepare.py
1 change: 1 addition & 0 deletions data/korean-parallel-corpora/utils/meta_util.py
87 changes: 87 additions & 0 deletions data/korean-parallel-corpora/utils/phoneme_list.txt
@@ -0,0 +1,87 @@
i:
I
iI
eI
a
A:
Q
0
'
O:
U
u:
V
@
eI
aI
OI
aU
oU
p
b
t
d
k
g
f
v
T
D
s
z
S
Z
h
m
n
N
l
r
w
j
iu
i
e
o
u
W
A
y
E
ME
O
oo
ou
ye

\n
\r
:
,
F
C
Y
?
.
B
c
R
M
L
c
;
!
H
P
q

G
-
x
$
&
3
J
K
X
_
1 change: 1 addition & 0 deletions data/korean-parallel-corpora/utils/txt_to_phonemes.sh
