-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgenerate_corpus.php
executable file
·78 lines (57 loc) · 1.57 KB
/
generate_corpus.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
<?php
// Build the phonetic corpus (flat-file version).
//
// Usage: php generate_corpus.php V|NV
//   V  -> writes index/fonetik_vokal.txt and index/mapping_posisi_vokal.txt
//   NV -> writes index/fonetik.txt       and index/mapping_posisi.txt
//
// Input is data/quran_teks.txt, one record per line with "|"-separated
// fields: [0] surah number, [1] surah name, [2] ayah number, [3] ayah text.
// For every record one line "<id>|<phonetic code>" goes to the target file
// and one comma-joined position mapping line goes to the mapping file, so
// line N of both output files belongs to record id N.

// Profiling: remember when the run started.
$time_start = microtime(true);

include 'lib/fonetik.php';

$usage = 'Tambahkan argumen "V" atau "NV"' . "\n";

// Exactly the literal arguments "V" and "NV" are accepted.
if ($argc < 2) {
    echo $usage;
    exit(1);
}

if ($argv[1] == "V") {
    $bervokal = true;
} else if ($argv[1] == "NV") {
    $bervokal = false;
} else {
    echo $usage;
    exit(1);
}

// Read the source file; every line becomes one array element.
// NOTE(review): file() keeps the trailing newline on each element, so the
// ayah text handed to the helpers below still ends with "\n" -- presumably
// lib/fonetik.php discards it; confirm before trimming here.
$source_file = 'data/quran_teks.txt';
$docs = file($source_file);
if ($docs === false) {
    // Without this check the loop below would iterate over `false`.
    fwrite(STDERR, "Gagal membaca file $source_file\n");
    exit(1);
}

if ($bervokal) {
    $target_file = "index/fonetik_vokal.txt";
    $mapping_file = "index/mapping_posisi_vokal.txt";
} else {
    $target_file = "index/fonetik.txt";
    $mapping_file = "index/mapping_posisi.txt";
}

// Open both outputs up front and stop early when either cannot be written,
// instead of passing `false` to fwrite()/fclose() later on.
$f = fopen($target_file, "w");
if ($f === false) {
    fwrite(STDERR, "Gagal membuka file $target_file untuk ditulis\n");
    exit(1);
}

$fm = fopen($mapping_file, "w");
if ($fm === false) {
    fclose($f);
    fwrite(STDERR, "Gagal membuka file $mapping_file untuk ditulis\n");
    exit(1);
}

// Upper bound on the number of records processed in one run.
$limit = 8000;

$count = 0; // records processed so far
$id = 1;    // 1-based record id written to the target file

foreach ($docs as $doc) {
    // Split on the "|" character:
    // [0] = surah number
    // [1] = surah name
    // [2] = ayah number
    // [3] = ayah text
    $data = explode('|', $doc);

    if (count($data) < 4) {
        // Malformed line: keep the record (so ids and output lines stay
        // aligned with the input) but fill the missing fields with empty
        // strings and report it instead of raising undefined-offset notices.
        fwrite(STDERR, "Peringatan: baris $id tidak memiliki 4 kolom\n");
        $data = array_pad($data, 4, '');
    }

    // Both helpers come from lib/fonetik.php and receive the negated flag;
    // presumably the second argument means "drop vowels" -- confirm there.
    $fonetik = ar_fonetik($data[3], !$bervokal);
    $mapping_posisi = map_reduksi_ke_asli($data[3], !$bervokal);

    fwrite($f, $id . "|" . $fonetik . "\n");
    fwrite($fm, implode(",", $mapping_posisi) . "\n");

    echo $id . ". Diproses surah {$data[0]} ayat {$data[2]}\n";

    $count++;
    $id++;

    // Stop once the cap is reached.
    if ($count >= $limit) {
        break;
    }
}

fclose($f);
fclose($fm);

echo 'Total : ' . $count;
echo "\n\n";

// Profiling result: total execution time.
$time_end = microtime(true);
$time = $time_end - $time_start;

echo "\nDiproses dalam $time detik\n";
echo "File disimpan di:\n";
echo "- $target_file\n";
echo "- $mapping_file\n\n";