-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.py
28 lines (18 loc) · 783 Bytes
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import argparse
from pathlib import Path
from eurlex_ds import EurLexDataset
def main(args: argparse.Namespace):
    """Load the EUR-Lex dataset and write its two dump files.

    Prints the number of documents, then writes one file per dump mode
    into ``args.output_dir``.

    Args:
        args: Parsed command-line arguments. ``args.data_root`` is passed
            to ``EurLexDataset`` and ``args.output_dir`` is the directory
            the output file names are joined onto.
    """
    corpus = EurLexDataset(data_root=args.data_root)
    print(f'Number of documents: {len(corpus)}')

    # (file name, dump mode) pairs, written in this order:
    #   all_txt.txt -> all concatenated docs in human readable text
    #   stats.csv   -> all document headers
    dump_targets = (
        ('all_txt.txt', 'text'),
        ('stats.csv', 'headers'),
    )
    for file_name, dump_mode in dump_targets:
        corpus.dump_to_txt(args.output_dir / file_name, mode=dump_mode)
if __name__ == '__main__':
    # Script entry point: parse the command line, make sure the output
    # directory exists, then run the dump.
    parser = argparse.ArgumentParser()
    parser.add_argument('data_root', type=Path)
    # Give the default as a Path so the type is explicit rather than relying
    # on argparse converting a string default through ``type``.
    parser.add_argument('--output_dir', type=Path, default=Path('./output'))
    args = parser.parse_args()

    # mkdir(exist_ok=True) is already a no-op when the directory exists, so
    # no separate is_dir() check is needed. It still raises FileExistsError
    # if the path exists but is not a directory.
    args.output_dir.mkdir(exist_ok=True, parents=True)

    main(args)