-
Notifications
You must be signed in to change notification settings - Fork 1
/
APS-C_to_Unicode.html
463 lines (376 loc) · 24.4 KB
/
APS-C_to_Unicode.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
<!DOCTYPE html">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
<head>
<meta http-equiv="content-type" charset="UTF-8"/>
<meta name="author" content="Department of Culture, UP"/>
<meta name="author" content="http://www.rkmveda.org/wp-content/uploads/2015/05/APS-C-DV-Prakash-to-Unicode-Converter.html"/>
<meta name="author" content="Sannyasins of BSY"/>
<meta name="description" content="The purpose of this tool is to convert text written with APS-C-DV-* fonts to Unicode encoding."/>
<link rel="shortcut icon" href="favicon.ico" type="image/x-icon"/>
<title>APS-C-DV-X to Unicode Converter. Updated for YPT conversion.</title><br/>
<style type="text/css">
textarea {
width: 100%;
height: auto;
/* background-color: yellow; */
font-size: 2em;
font-weight: normal;
font-family: Verdana, Arial, Helvetica, sans-serif;
border: 1px solid black;
}
textarea.aps-text {
font-family: "APS-C-DV-Priyanka", "APS-C-DV-Prakash", Verdana, Arial, Helvetica, sans-serif;
}
textarea.unicode-text {
font-family: "Kokila", Verdana, Arial, Helvetica, sans-serif;
}
</style>
<script type="text/javascript">
// initial code taken from Department of Culture, UP site and then tweaked for non-commercial purpose
// new code taken from: http://www.rkmveda.org/wp-content/uploads/2015/05/APS-C-DV-Prakash-to-Unicode-Converter.html
// References: https://i.imgur.com/ICUC6Wk.png
/* Additions:
1. Added replacement: "‚ããò" , "औं" ,
2. Added replacement: "æ" , "ॐ" ,
3. Added replacement: "¹Ç", "प्र्",
4. Added replacement: ". " , "। ", // TODO: regular expression to possibly add space between last letter in sentence and adjancent full stop.
" . " , " । "
5. Added replacement: "éè","é", // catch and correct possible double long I typing mistake
6. Added replacement: "èé","é", // catch and correct possible double long I typing mistake
7. Added replacement: "èâ","é", // replace long I + bindu with single character for that combo
TO-DO:
1. Check for other consanant + 'r' combos like "pr", e.g. "phr"
1.1. Update replacement logic as it fails for Roman script mixed with APS. Need to skip over areas of text that are in standard [A-Z] block.
2. Add option for converting numerals
3. Add option for choosing input box font (e.g. APS if available)
4. Add option for choosing output box fonts
5. Add default test data (e.g. from Wikipedia's Devanagari page)
6. Add option for setting/changing chunk_size from default (6000 chars)
7. Keep totals for number of replacements made to identify which code blocks aren't being used
8. Replace 2x single danda with single double danda character
9. Add option for scroll-sync
10. Add option for top/bottom vs side/side textarea positioning
*/
function initiate_conversion( input_field_id = "aps_c_dv_text_input_field", output_field_id = "unicode_devanagari_text_output_field" )
{
// Set default values for parameters
//if ( typeof( input_field_id ) === 'undefined' ) input_field_id = "aps_c_dv_text_input_field";
//if ( typeof( output_field_id ) === 'undefined' ) output_field_id = "unicode_devanagari_text_output_field";
// Get input
var input_field = document.getElementById( input_field_id );
var input_text = input_field.value;
var input_text_length = input_text.length;
// Display progress message in Unicode output textarea
var output_field = document.getElementById( output_field_id );
output_field.value = "Starting conversion...";
//****************************************************
// Break the long text into small bunches of chunk_size characters each.
//****************************************************
// processed_text: keeps running concatenation of converted input, starting with empty string
// chunk_size: (max) portion/length of input to be processed at a time
var chunk_size = 6000;
var processed_text = '';
var start_pos = 0; // start position of text to be processed
var end_pos = 0; // end position of text to be processed
var keep_going = 1; // text processing continues until set to 0
// keep updating start and ending cursor positions as text gets process chunk_size at a time
while (keep_going == 1)
{
// set start cursor position to (previous) ending cursor position
start_pos = end_pos;
// check if a whole chunk can be taken
if (end_pos < (input_text_length - chunk_size))
{
// advance ending cursor position by chunk_size
end_pos += chunk_size;
// This was making problem if there is no 'space' in the whole document.
// while (input_text.charAt ( end_pos ) != ' ') {end_pos--;}
} else // less than whole chunk left to process
{
// set ending cursor position to end of input
end_pos = input_text_length;
// set flag for ending processing (after this final run)
keep_going = 0
}
var modified_substring = input_text.substring(start_pos, end_pos);
modified_substring = convert_APS_to_Unicode( modified_substring );
var processed_text = processed_text + modified_substring;
output_field.value = "Conversion in progress.." + '\n\n' + 'Conversion of ' + end_pos + ' characters out of ' + input_text_length + ' completed.';
}
output_field.value = processed_text;
} // end of initiate_conversion function
function convert_APS_to_Unicode( aps_input = "" )
{
var APS2Unicode_table = new Array(
/* Possible input errors */
"ññ", "ñ", // "े"
"éè", "é", // "é" // double long I typing mistake for I + bindu
"èé", "é", // "é" // double long I typing mistake for I + bindu
"ââ", "â", // "ं"
"ãèâ", "ãé", // "ीं" // replace long I + bindu with single character for that combo
"ãâè", "ãé", // "ीं" // replace bindu + long I with single character for that combo
"ýý", "॥", // "॥" // replace 2x single danda with single double danda
/* Numerals
"1" , "१" , // "१"
"2" , "२" , // "२"
"3" , "३" , // "३"
"4" , "४" , // "४"
"5" , "५" , // "५"
"6" , "६" , // "६"
"7" , "७" , // "७"
"8" , "८" , // "८"
"9" , "९" , // "९"
"0" , "०" , // "०"
*/
/* Symbols */
"æ", "ॐ", // "ॐ"
/* */
"ü¡", "¡ü", // "ड़"
"ü¤", "¤ü", // "ढ़"
"ü", "़", // "़"
"òŠ", "Šò", //"Šें"
"îŠ", "Šî", // "Šू"
"ñŠ", "Šñ", // "Šे"
"õŠ", "Šõ", // "Šै"
"âŠ", "Šâ", // "Šं" // duplicate
"ìŠ", "Šì", // "Šु"
"ðŠ", "Šð", // "Šृ"
"ÆŠ", "ŠÆ", // "Š्र"
"áŠ", "Šá", // "Š्"
"ƒÃ", "ई", // "ई"
"¹Ç", "प्र्", // "प्र्"
// "ãä" , "ि" , // "ि" // test
"âãä", "MMMIII", // "ंि" // TODO: find a non-standard char as temp replacement holder
"úãä", "úIII", // "ँि" // TODO: find a non-standard char as temp replacement holder
"ãä", "ि", // "ि" // ORIG
"â", "ं", // "ं"
"¶ã", "न", // "न"
"ãå", "å", // "å"
"ðð", "ॄ", // "ॄ"
"ó", "ेê", // "ेर्"
"Ä", "RRRâ", // "र्ं" // TODO: find a non-standard char as temp replacement holder
"ãê", "ीRRR", // "ीर्" // TODO: find a non-standard char as temp replacement holder
"ê", "RRR", // "र्" // TODO: find a non-standard char as temp replacement holder
"Ê", "ŠÃ", // "Šर्"
"Ã", "RRR", // "र्" // TODO: find a non-standard char as temp replacement holder
"ãó", "ोRRR", // "ोर्" // TODO: find a non-standard char as temp replacement holder
//"âŠ" , "Šâ" , // "Šं" // duplicate
"È", "्र", // "्र"
"¨ãÉ", "ऋ", // "ऋ"
"í", "ु", // "ु"
"Ôã", "स", // "स"
"Êãð", "ऌ", // "ऌ"
"Ë", "ल", // "ल"
"É", "â", // "ं"
"¾ã", "य", // "य"
"‰ã‹", "क्र्", // "क्र्"
"¨ãŠ", "‰ãŠ", // "क्र"
"ãå‡ãŠ", "किं", // "किं"
"‡ã‹", "क्", // "क्"
"§ãŠ", "क्त", // "क्त"
"ãé", "ीं", // "ीं"
"‡ã‹Ìã", "‡ãŠáÌ", // "क्व्"
"‰ãŠ", "‡ãŠáÀ", // "क्र"
"‡ãŠ", "क", // "क"
"¹ãŠ", "फ", // "फ"
"ˆ", "व्न्", // "व्न्"
"OE", "ख्", // "ख्" // added
"Œ", "ख्", // "ख्"
"", "ज्", // "ज्"
"", "ज्ञ्", // "ज्ञ्"
"•", "ज्", // "ज्"
"–", "ज्र्", // "ज्र्"
"—", "ज्ञ्", // "ज्ञ्"
"™", "झ्र्", // "झ्र्"
"š", "ञ्", // "ञ्"
"›", "ट", // "ट"
"oe", "छ", // "छ" // added
"œ", "छ", // "छ"
"Ÿ", "ठ", // "ठ"
"¡", "ड", // "ड"
"¢", "झ्", // "झ्"
"£", "ध्", // "ध्"
"¤", "ढ", // "ढ"
"¥", "ण्", // "ण्"
"¦ã", "त", // "त"
"¦", "त्", // "त्"
"§", "त्त्", // "त्त्"
"©", "थ्", // "थ्"
"ª", "द", // "द"
"¬", "द्ग", // "द्ग"
"®", "द्ध", // "द्ध"
"¯", "द्ब", // "द्ब"
"°", "द्भ", // "द्भ"
"±", "द्म्", // "द्म्"
"²", "द्य्", // "द्य्"
"³", "द्र", // "द्र"
"´", "द्व", // "द्व"
"¶", "न्", // "न्"
"¸", "न्न्", // "न्न्"
"¹", "प्", // "प्"
"º", "ब्", // "ब्"
"¼ã", "भ", // "भ"
"¼", "भ", // "भ"
"½", "म्", // "म्"
"¾", "य्", // "य्"
"À", "र", // "र"
"Á", "रु", // "रु"
"Â", "रू", // "रू"
"Ê", "ल्", // "ल्"
"Ë", "ळ", // "ळ"
"Í", "श्", // "श्"
"Î", "श्", // "श्"
"Ï", "श्", // "श्"
"Ð", "ख्र्", // "ख्र्"
"Ñ", "श्र्", // "श्र्"
"Ò", "दृ", // "दृ"
"Ó", "ष्", // "ष्"
"Ô", "स्", // "स्"
"Õ", "स्र्", // "स्र्"
"Ö", "ह", // "ह"
"×", "हृ", // "हृ"
"Ø", "ग्", // "ग्"
"Ù", "ह्", // "ह्"
"Ú", "ह्म्", // "ह्म्"
"Ü", "घ्", // "घ्"
"Ý", "ङ", // "ङ"
"Þ", "च्", // "च्"
"ß", "ळ", // "ळ"
"àã", "क्ष", // "क्ष"
"à", "क्ष्", // "क्ष्"
"¨", "त्र्", // "त्र्"
"®", "द्ध", // "द्ध"
"Æ", "्र", // "्र"
"‡ã", "व", // "व"
"Ì", "व्", // "व्"
"‡", "व्", // "व्"
"‚ããñ", "ओ", // "ओ"
"‚ããõ", "औ", // "औ"
"‚ããò", "औं", // "औं"
"‚ãã", "आ", // "आ"
"‚ã", "अ", // "अ"
"¿ã", "्य", // "्य"
"ãÄ", "RRRâ", // "र्ं" // TODO: find a non-standard char as temp replacement holder
"ƒ", "इ", // "इ"
"…", "ऊ", // "ऊ"
"„", "उ", // "उ"
"†ñ", "ऐ", // "ऐ"
"†", "ए", // "ए"
"्ã", "", // ""
"ãõ", "ौ", // "ौ"
"ãñ", "ो", // "ो"
"ñ", "े", // "े"
"ãè", "ी", // "ी"
"ì", "ु", // "ु"
"î", "ू", // "ू"
"ò", "ें", // "ें"
"ö", "ैं", // "ैं"
"ãे", "ो", // "ो"
"ãै", "ौ", // "ौ"
"â", "ं", // "ं"
"ç", "ऽ", // "ऽ"
"á", "्", // "्"
"ý", "।", // "।"
"ð", "ृ", // "ृ"
"‡", "व", // "व"
"õ", "ै", // "ै"
"Ûã", "ह्य", // "ह्य"
"Û", "ह्", // "ह्"
"त्रÉ", "ऋ", // "ऋ"
"ÿ", "द्द", // "द्द"
"ह्ो", "ह्ये", // "ह्ये"
"ã÷", "ौRRR", // "ौर्" // TODO: find a non-standard char as temp replacement holder
"÷", "ैRRR", // "ैर्" // TODO: find a non-standard char as temp replacement holder
"ú", "ँ", // "ँ"
"`", "'", // "'"
"ǔ", "ॄ", // "ॄ"
"ãã", "ा", // "ा"
" :", " CccOooLllOooNnn", // " :" // mark colon punctuation for later replacement
":", "ः", // "ः"
"ã", "ा", // "ा"
". ", "। ", // "। " // TODO: regular expression to possibly add space between last letter in sentence and adjancent full stop.
); // end Array::APS2Unicode_table
var APS2Unicode_table_length = APS2Unicode_table.length;
//substitute array_two elements in place of corresponding APS2Unicode_table elements
if (aps_input !== "") // if string to be converted is non-blank then no need of any processing.
{
// traverse through conversion table by APS and Unicode replacement pairs
for (input_symbol_idx = 0; input_symbol_idx < APS2Unicode_table_length - 1; input_symbol_idx = input_symbol_idx + 2)
{
//******************************************************
idx = 0; // index of the symbol being searched for replacement
// keep making replacements to current APS char as long as it is found in current chunk
while (idx !== -1) //while-00
{
aps_find_token = APS2Unicode_table[ input_symbol_idx ];
uni_replace_token = APS2Unicode_table[ input_symbol_idx + 1 ];
// TODO: check if this string function only makes first/single replacement or all/mass replacements in given string, in which case while loop may not be needed
aps_input = aps_input.replace(aps_find_token, uni_replace_token);
idx = aps_input.indexOf(aps_find_token);
} // end of while-00 loop
} // end of for loop
aps_input = aps_input.replace(/([ंँ])([ािीुूृेैोौ])/g, "$2$1");
aps_input = aps_input.replace(/([ंँ])्र/g, "्र$1");
//rem changing व to क and प to फ etc
aps_input = aps_input.replace(/‡([ुूृॄेैंँर्]*)Š/g, "क$1");
//aps_input = aps_input.replace( /‡/g , "क" );
/*
aps_input = aps_input.replace( /प([ुूृॄेैंँर्]*)§/g , "फ$1" );
//aps_input = aps_input.replace( /प्र([ुूृॄेैंँ]*)§/g , "फ्र$1" );
aps_input = aps_input.replace( /रं§/g , "रुं" );
aps_input = aps_input.replace( /§/g , "" );
aps_input = aps_input.replace( /([ुूृॄेैंँर्]*)Π/g , "़$1" );
*/
// following statements for adjusting postion of i maatraas.
// aps_input = aps_input.replace( /ä/g , "ि" ) ; // moved up to top
aps_input = aps_input.replace(/([å])([कखगघङचछजझञटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])/g, "$2िं");
aps_input = aps_input.replace(/([ं])([्])([कखगघङचछजझञटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])/g, "$2$3ं");
aps_input = aps_input.replace(/([ि])([कखगघङचछजझञटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])/g, "$2$1");
aps_input = aps_input.replace(/([ि])([्])([कखगघङचछजझञटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])/g, "$2$3$1");
aps_input = aps_input.replace(/ä/g, "ि");
// TODO: use temp replacement characters that are unlikely to be in the document so that intermixed Roman characters A/b/c don't get converted inadvertently
// following three statement for adjusting position of reph ie, half r
aps_input = aps_input.replace(/([कखगघचछजझटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])([्])([कखगघचछजझटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])RRR/g, "र्$1$2$3");
aps_input = aps_input.replace(/([कखगघचछजझटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])RRR/g, "र्$1");
aps_input = aps_input.replace(/([कखगघचछजझटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])([ािीुूृेैोौंँः])RRR/g, "RRR$1$2");
aps_input = aps_input.replace(/([कखगघचछजझटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])([्])RRR/g, "RRR$1$2");
aps_input = aps_input.replace(/RRR/g, "र्");
// aps_input = aps_input.replace( /(III)([कखगघचछजझटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])([्])/g, "$2$3$1" ) ;
// aps_input = aps_input.replace( /(III)([कखगघचछजझटठडड़ढढ़णतथदधनपफबभमयरलळवशषसहक्षज्ञ])/g, "$2III" ) ;
aps_input = aps_input.replace(/III/g, "ि");
aps_input = aps_input.replace(/MMM/g, "ं");
aps_input = aps_input.replace(/CccOooLllOooNnn/g, ":");
} // end of IF statement meant to supress processing of blank string.
return aps_input;
} // end of the function convert_APS_to_Unicode
</script>
</head>
<body>
<main>
<header>
<strong>Hari Om. This tool is for converting APC-C-DV font input into Unicode. <br/>
To use:<br/>
1. Copy text APS text from original document <br/>
2. Paste into top field ("APS-C-DV-X font text box")<br/>
3. Press "Convert to Unicode" button. This should produce Unicode text in field below the button ("Unicode Devanagari text-box")<br/>
4. Copy converted Unicode text and place in your desired document using any Unicode font (e.g. Kokila, Mangal, Arial, ...)<br/>
<br/>
If you see any conversion errors, bugs or issues, please keep copy of sample APS text and inform Satchidananda or Sw. Shivadhyanam.<br/>
<br/>
Om Tat Sat.<br/>
<br/>
v20180321
</strong><br/><br/>
</header>
<form name="form1"><br/>
<b>APS-C-DV-X font text-box </b><br/>
<textarea class="aps-text devanagari-text" id="aps_c_dv_text_input_field" name="TextToConvert" placeholder="Paste here your text in any APS-C-DV font" cols="92" rows="12"></textarea>
<br/><br/>
<input id="converter" name="converter" value=" Convert to Unicode >> " onclick="initiate_conversion();" accesskey="c" title="शॉर्टकट alt+c" type="button">
<br/><br/>
<b>Unicode Devanagari text-box</b> <br/>
<textarea class="unicode-text devanagari-text" id="unicode_devanagari_text_output_field" name="ConvertedText" placeholder="Conversion out put will be shown here" cols="92" rows="12"></textarea>
</form>
</main>
</body>
</html>