-
Notifications
You must be signed in to change notification settings - Fork 39
/
Copy pathdict-split.lisp
1767 lines (1541 loc) · 70.7 KB
/
dict-split.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
(in-package :ichiran/dict)
;; SPLITS (words that should be scored as two or more other words)
(defparameter *split-map* (make-hash-table)) ;; seq -> split function
(defmacro defsplit (name seq (reading-var) &body body)
`(progn
(defun ,name (,reading-var) ;; reading -> (values parts score-bonus)
,@body)
(setf (gethash ,seq *split-map*) ',name)))
(defmacro def-simple-split (name seq score (&optional length-var text-var reading-var) &body parts-def)
"each part is (seq length-form)"
(alexandria:with-gensyms (offset parts pseq part-length part-txt score-var)
(unless name (setf name (intern (format nil "~a~a" :split- seq))))
(unless reading-var (setf reading-var (gensym "RV")))
(unless length-var (setf length-var (gensym "LV")))
(unless text-var (setf text-var (gensym "TV")))
`(defsplit ,name ,seq (,reading-var)
(prog* ((,text-var (true-text ,reading-var))
(,length-var (length ,text-var))
(,offset 0)
(,parts nil)
(,score-var ,score))
(declare (ignorable ,text-var ,length-var ,offset))
,@(loop for (part-seq part-length-form conj-p modify) in parts-def
if (eql part-seq :test) collect
;; ends splitting if test fails
;; part-length-form is an expression to test
;; conj-p is new score if test fails
;; if modify is :score or :pscore, add modify to parts if test fails
`(unless ,part-length-form
,@(when conj-p `((setf ,score-var ,conj-p)))
,@(when modify `((push ,modify ,parts)))
(go :end))
else if (find part-seq '(:score :pscore)) collect
`(push ,part-seq ,parts)
else collect
`(let* ((,pseq ,(if (listp part-seq)
(if (and part-seq (stringp (car part-seq)))
`(list (seq (car (find-word-conj-of ,@part-seq))))
`',part-seq)
`',(list part-seq)))
(,part-length ,part-length-form)
(,part-txt (safe-subseq ,text-var ,offset
(and ,part-length (+ ,offset ,part-length)))))
(push
(when ,part-txt
(car (apply
,(if conj-p
''find-word-conj-of
''find-word-seq)
,(case modify
((t) `(unrendaku ,part-txt))
((nil) part-txt)
(t `(funcall ,modify ,part-txt)))
,pseq)))
,parts)
(when ,part-length
(incf ,offset ,part-length))))
:end
(return (values (nreverse ,parts) ,score-var))))))
(defun get-split* (reading &optional conj-of)
(let ((split-fn (gethash (seq reading) *split-map*)))
(if split-fn
(funcall split-fn reading)
(loop for seq in conj-of
for split-fn = (gethash seq *split-map*)
when split-fn do (return (funcall split-fn reading))))))
(defun get-split (reading &optional conj-of)
"Includes safety check if one of split words is missing"
(multiple-value-bind (split score) (get-split* reading conj-of)
(when (and split (every 'identity split))
(values split score))))
;; split definitions
;; -de expressions (need to be split otherwise -desune parses as -de sune)
#|
(:select 'kt.seq 'kt.text
:from (:as 'kanji-text 'kt) (:as 'sense-prop 'sp)
:where (:and (:like 'kt.text "%で")
(:= 'sp.seq 'kt.seq) (:= 'sp.tag "pos")
(:not (:in 'sp.seq (:set (alexandria:hash-table-keys *split-map*))))
(:= 'sp.text "exp")))
|#
(defmacro def-de-split (seq seq-a &key (score 20))
(let ((name (intern (format nil "~a~a" :split-de- seq))))
`(def-simple-split ,name ,seq ,score (len)
(,seq-a (- len 1))
(2028980 1))))
(def-de-split 1163700 1576150) ;; 一人で
(def-de-split 1611020 1577100) ;; 何で
(def-de-split 1004800 1628530) ;; これで
(def-de-split 2810720 1004820) ;; 此れまでで
(def-de-split 1006840 1006880) ;; その上で
(def-de-split 1530610 1530600) ;; 無断で
(def-de-split 1245390 1245290) ;; 空で
(def-de-split 2719270 1445430) ;; 土足で
(def-de-split 1189420 2416780) ;; 何用で
(def-de-split 1272220 1592990) ;; 交代で
(def-de-split 1311360 1311350) ;; 私費で
(def-de-split 1368500 1368490) ;; 人前で
(def-de-split 1395670 1395660) ;; 全体で
(def-de-split 1417790 1417780) ;; 単独で
(def-de-split 1454270 1454260) ;; 道理で
(def-de-split 1479100 1679020) ;; 半眼で
(def-de-split 1510140 1680900) ;; 別封で
(def-de-split 1518550 1529560) ;; 無しで
(def-de-split 1531420 1531410) ;; 名義で
(def-de-split 1597400 1585205) ;; 力尽くで
(def-de-split 1679990 2582460) ;; 抜き足で
(def-de-split 1682060 2085340) ;; 金ずくで
(def-de-split 1736650 1611710) ;; 水入らずで
(def-de-split 1865020 1590150) ;; 陰で
(def-de-split 1878880 2423450) ;; 差しで
(def-de-split 2126220 1802920) ;; 捩じり鉢巻きで
(def-de-split 2136520 2005870) ;; もう少しで
(def-de-split 2513590 2513650) ;; 詰め開きで
(def-de-split 2771850 2563780) ;; 気にしないで
(def-de-split 2810800 1587590) ;; 今までで
(def-de-split 1343110 1343100) ;; ところで
(def-de-split 1270210 1001640) ;; お陰で
(def-simple-split split-degozaimasu 2253080 20 () ;; でございます
(2028980 1)
(1612690 nil t))
(defmacro def-toori-split (seq seq-a &key (score 50) (seq-b 1432930))
(let ((name (intern (format nil "~a~a" :split-toori- seq))))
`(def-simple-split ,name ,seq ,score (len txt r)
(:test (eql (word-type r) :kanji))
(,seq-a (- len 2))
(,seq-b 2))))
(def-toori-split 1260990 1260670) ;; 元通り
(def-toori-split 1414570 2082450) ;; 大通り
(def-toori-split 1424950 1620400) ;; 中通り [ちゅう通り]
(def-toori-split 1424960 1423310) ;; 中通り [なか通り]
(def-toori-split 1820790 1250090) ;; 型通り
(def-toori-split 1489800 1489340) ;; 表通り
(def-toori-split 1523010 1522150) ;; 本通り
(def-toori-split 1808080 1604890) ;; 目通り
(def-toori-split 1368820 1580640) ;; 人通り
(def-toori-split 1550490 1550190) ;; 裏通り
(def-toori-split 1619440 2069220) ;; 素通り
(def-toori-split 1164910 2821500 :seq-b 1432920) ;; 一通り
(def-toori-split 1462720 1461140 :seq-b 1432920) ;; 二通り
(defmacro def-do-split (seq seq-b &key (score 30) (seq-a 2252690))
(let ((name (intern (format nil "~a~a" :split-do- seq))))
`(def-simple-split ,name ,seq ,score (len txt r)
(,seq-a 1)
(,seq-b))))
(def-do-split 2142710 1185200) ;; ど下手
(def-do-split 2803190 1595630) ;; どすけべ
(def-do-split 2142680 1290210) ;; ど根性
(def-do-split 2523480 1442750) ;; ど田舎
#|
(query (:select 'kt.seq 'kt.text
:from (:as 'kanji-text 'kt) (:as 'sense-prop 'sp)
:where (:and (:like 'kt.text "し%")
(:= 'sp.seq 'kt.seq) (:= 'sp.tag "pos")
(:not (:in 'sp.seq (:set (alexandria:hash-table-keys *split-map*))))
(:in 'sp.text (:set *pos-with-conj-rules*)))))
|#
(defmacro def-shi-split (seq seq-b &key (score 30) (seq-a '("し" 1157170)))
(let ((name (intern (format nil "~a~a" :split-shi- seq))))
`(def-simple-split ,name ,seq ,score (len txt r)
(,seq-a 1)
(,seq-b nil t))))
(def-shi-split 1005700 1156990) ;; し易い
(def-shi-split 1005830 1370760) ;; し吹く
(def-shi-split 1157200 2772730) ;; し難い
(def-shi-split 1157220 1195970) ;; し過ぎる
(def-shi-split 1157230 1284430) ;; し合う
(def-shi-split 1157280 1370090) ;; し尽す
(def-shi-split 1157310 1405800) ;; し続ける
(def-shi-split 1304890 1256520) ;; し兼ねる
(def-shi-split 1304960 1307550) ;; し始める
(def-shi-split 1305110 1338180) ;; し出す
(def-shi-split 1305280 1599390) ;; し直す
(def-shi-split 1305290 1212670) ;; し慣れる
(def-shi-split 1594300 1596510) ;; し損なう
(def-shi-split 1594310 1406680) ;; し損じる
(def-shi-split 1594460 1372620) ;; し遂げる
(def-shi-split 1594580 1277100) ;; し向ける
(def-shi-split 2518250 1332760) ;; し終える
(def-shi-split 1157240 1600260) ;; し残す
(def-shi-split 1304820 1207610) ;; し掛ける
(def-shi-split 2858937 1406690) ;; し損ねる
;; nakunaru split: because naku often attaches to previous word
(def-simple-split split-nakunaru 1529550 30 (len) ;; 無くなる
(("無く" 1529520) 2)
(1375610 nil t))
(def-simple-split split-nakunaru2 1518540 10 (len txt r) ;; 亡くなる
(:test (eql (word-type r) :kana))
(("亡く" 1518450) 2)
(1375610 nil t))
;; tegakakaru split (kana form might conflict with other uses of kakaru verb)
(def-simple-split split-tegakakaru 2089710 10 (len) ;; 手が掛かる
(1327190 1) ;; 手
(2028930 1) ;; が
(1207590 nil t))
(def-simple-split split-kawaribae 1411570 10 (len txt) ;; 代わり映え
((1590770 1510720) (1+ (position #\り txt)))
(("映え" 1600620) 2))
(def-simple-split split-hayaimonode 2815260 100 (len txt) ;; 早いもので
(1404975 (1+ (position #\い txt)))
(1502390 (if (find #\物 txt) 1 2))
(2028980 1))
(def-simple-split split-dogatsukeru 2800540 30 (len) ;; ドが付ける
(2252690 1)
(2028930 1)
(1495740 nil t))
(def-simple-split split-janaika 2819990 20 (len) ;; じゃないか
(("じゃない" 2089020) 4)
(2028970 1))
(def-simple-split split-kaasan 1609470 50 (len txt r) ;; 母さん
(:test (eql (word-type r) :kanji))
(1514990 1)
(1005340 2))
(def-simple-split split-souda 1006650 5 ()
(2137720 2)
(2089020))
(def-simple-split split-soudesu 2837492 5 ()
(2137720 2)
(1628500))
(def-simple-split split-kinosei 1221750 100 ()
(1221520 1)
(1469800 1)
(1610040 2))
(def-simple-split split-kigatsuku 1591050 100 ()
(1221520 1)
(2028930 1)
(1495740 nil t))
(def-simple-split split-nanimokamo 1599590 20 (len) ;; なにもかも
(1188490 (- len 2))
(2143350 2))
(def-simple-split split-katawonaraberu 2102910 20 (len txt) ;; 肩を並べる
(1258950 (position #\を txt))
(2029010 1)
(1508390 nil t))
(def-simple-split split-moushiwakenasasou 2057340 300 (len txt) ;; 申し訳なさそう
(1363050 (position #\な txt))
(2246510))
(def-simple-split split-kimatte 1951150 50 () ;; 決まって
(("決まって" 1591420)))
(def-simple-split split-osoreiru 1236680 100 (len txt) ;; 恐れ入る
(1236660 (1+ (position #\れ txt)))
(1465580 nil t))
(def-simple-split split-nantokanaru 2104540 20 (len txt) ;; なんとかなる
(1188420 (1+ (position #\か txt)))
(1375610 nil t))
(def-simple-split split-hajiketobu 2610760 50 (len txt) ;; 弾け飛ぶ
(("弾け" 1419380) (1+ (position #\け txt)))
(1429700 nil t))
(def-simple-split split-motteiku 1315700 50 (len txt) ;; 持って行く
(("持って" 1315720) (1+ (position #\て txt)))
(1578850 nil t))
(def-simple-split split-hairikomeru 1465460 100 (len txt r) ;; 入り込める
(:test (eql (word-type r) :kanji))
(("入り" 1465590) (1+ (position #\り txt)))
(1288790 nil t))
(def-simple-split split-shinikakaru 1881080 30 () ;;死に掛かる
(1310720 1)
(2028990 1)
(1207590 nil t))
(def-simple-split split-hisshininatte 1903910 50 (len txt) ;;必死になって
(1601890 (position #\に txt))
(2028990 1)
(("なって" 1375610) nil))
(def-simple-split split-nitotte 1009600 50 (len txt) ;; にとって
(2028990 1)
(("取って" 1326980)))
(def-simple-split split-kotonisuru 2215340 100 (len txt) ;; 事にする
(1313580 (position #\に txt))
(2028990 1)
(1157170 nil t))
(def-simple-split split-hajikidasu 1419350 100 (len txt) ;; 弾き出す
(1901710 (1+ (position #\き txt)))
(1338180 nil t))
(def-simple-split split-hitotachi 1368740 100 (len txt) ;; 人たち
(1580640 (if (position #\人 txt) 1 2))
(1416220 (if (position #\達 txt) 1 2)))
(def-simple-split split-desura 2034520 30 (len txt) ;; でさえ ですら
(2028980 1)
((2827091))) ;; 1005120
(def-simple-split split-gotoni 1524660 50 (len txt) ;; ごとに
(1524640 (position #\に txt))
(2028990 1))
(def-simple-split split-osagari 1693800 50 () ;; お下がり
(2826528 1)
(1609810))
(def-simple-split split-kaisasae 1752860 50 () ;; 買い支え
(1636070 2)
(("支え" 1310090)))
(def-simple-split split-toiukotoda 2612990 30 (len) ;; ということだ
(1922760 3)
(1313580 (- len 4))
(2089020))
(def-simple-split split-tonattara 2100770 50 (len) ;; となったら
(1008490 1)
(("なったら" 1375610)))
(def-simple-split split-tonaru 2100900 10 (len) ;; となる
(1008490 1)
(1375610 nil t))
(def-simple-split nil 1327220 50 (len) ;; 手に入る
(1327190 1)
(2028990 1)
(1465590 nil t))
(def-simple-split nil 1327230 50 (len) ;; 手に入れる
(1327190 1)
(2028990 1)
(1465610 nil t))
(def-simple-split nil 2433760 50 (len) ;; そうなんです
(1006610 2)
(2683060))
(def-simple-split nil 2088480 20 (len) ;; 良さげ
(1634130 2)
(2006580 1))
(def-simple-split nil 2724560 30 (len) ;; のせいで
(1469800 1)
(1610040 (- len 2))
(2028980 1))
(def-simple-split nil 2666360 30 () ;; 少なくない
(("少なくない" 1348910)))
(def-simple-split split-janai 2755350 10 (len) ;; じゃない
(2089020 2)
(1529520 nil t))
(def-simple-split split-jan 2135280 10 () ;; じゃん
(2089020 2)
(2139720 1))
(def-simple-split nil 2771940 -5 (len txt) ;; はないか
(:test (equal txt "はないか"))
(2028920 1)
(1529520 2)
(2028970 1))
(def-simple-split split-nara 1009470 1 () ;; なら
(("なら" 2089020)))
(def-simple-split nil 2083990 20 (len txt) ;; ならん
(:test (equal txt "ならん"))
(1009470 2)
(2139720 1))
(def-simple-split nil 2762260 0 (len txt) ;; ならんで
(("ならんで" 1508380)))
(def-simple-split nil 1508380 10 (len txt r) ;; ならんで
(:test (eql (word-type r) :kana))
(2083990 3)
(2028980 1))
(def-simple-split nil 2009290 100 (len txt) ;; 中でも
(1423310 (- len 2))
(1008460))
(def-simple-split nil 1502500 100 (len txt) ;; 物好き
(1502390 (- len 2))
(1277450 2 nil t))
(def-simple-split nil 1002970 600 (len txt r) ;; かもしれない
(:test (eql (word-type r) :kanji))
(2143350 2)
(("知れない" 1420490)))
(def-simple-split nil 1005600 -10 () ;; しまった
(("しまった" 1305380)))
(def-simple-split nil 2016840 -5 () ;; やった
(("やった" 1012980)))
(def-simple-split nil 1000430 -5 () ;; あの
(1000420))
(def-simple-split nil 1612640 5 () ;; あのね
(1000420 2)
((2029080 2029120 1005110)))
(def-simple-split nil 1314600 -5 () ;; に+ない
(2028990 1)
(1529520 nil t))
(def-simple-split nil 1322540 -5 () ;; に+ない
(2028990 1)
(1529520 nil t))
(def-simple-split nil 1221680 50 () ;; 気にします
(1221520 1)
(2028990 1)
(1157170 nil t))
(def-simple-split nil 1538340 50 (len txt) ;; わけがわからない
(1538330 (position #\が txt))
(2028930 1)
(1606560 nil t))
(def-simple-split nil 2757500 50 (len txt) ;; わけのわからない
(1538330 (position #\の txt))
(1469800 1)
(1606560 nil t))
;; (def-simple-split nil 1715710 10 (len txt) ;; 見たところ
;; (("見た" 1259290) 2)
;; (1343100))
(def-simple-split nil 1315860 20 (len) ;; 時には
(1315840 (- len 2))
(2215430 2))
(def-simple-split nil 1474200 -10 (len txt r) ;; 這います/います
(:test (eql (word-type r) :kana))
(2028920 1)
(1577980 nil t))
(def-simple-split nil 2276360 10 (len) ;; 尽くし
(2436480 (- len 1))
(2086640 1))
(def-simple-split nil 1579130 -1 (len txt) ;; ことし
(:test (equal txt "ことし"))
(1313580 2)
(2086640 1))
(def-simple-split nil 2668400 50 (len txt) ;; 汗を流す
(1213060 (position #\を txt))
(2029010 1)
(1552120 nil t))
(def-simple-split nil 1591050 100 () ;; 気がつく
(1221520 1)
(2028930 1)
(1495740 nil t))
(def-simple-split nil 2835890 50 () ;; 折りたたみ式
(1385860 5)
(1319060 1))
(defun optprefix (prefix)
(lambda (txt)
(if (alexandria:starts-with-subseq prefix txt)
txt
(concatenate 'string prefix txt))))
(def-simple-split nil 1894260 50 (len txt) ;; ついてる
(:test (> len 3))
;; refers to itself because 1495740 kana is not strong enough...
(("付いて" 1894260) 3)
(1577980 nil t (optprefix "い")))
(def-simple-split nil 1854750 20 ()
(("付いて" 1495740)))
(def-simple-split nil 2526850 10 () ;; にしろ
(2028990 1)
(("しろ" 1157170)))
(def-simple-split nil 2026650 10 () ;; にせよ
(2028990 1)
(("せよ" 1157170)))
(def-simple-split nil 1602740 50 (len) ;; 普段着
(1497180 (1- len))
(2093780))
(def-simple-split nil 1349300 5 () ;; なお
(2029110 1)
(2826528))
(def-simple-split nil 1221530 50 () ;; 気がある
(1221520 1)
(2028930 1)
(1296400 nil t))
(def-simple-split nil 2272780 50 () ;; 気がない
(1221520 1)
(2028930 1)
(1529520 nil t))
(def-simple-split nil 2846470 50 () ;; 気はない
(1221520 1)
(2028920 1)
(1529520 nil t))
(def-simple-split nil 1591980 50 () ;; 気を使う 気を遣う
(1221520 1)
(2029010 1)
(1305990 nil t)
)
(def-simple-split nil 1551500 50 () ;; 立ちすくむ
(("立ち" 1597040) 2)
(1570220 nil t))
(def-simple-split nil 2002270 50 (len txt) ;; 零れ落ちる
(("零れ" 1557650) (1+ (position #\れ txt)))
(1548550 nil t))
(def-simple-split nil 1314770 -10 (len txt r) ;; につく
(:test (eql (word-type r) :kana))
(2028990 1)
(1495740 nil t))
(def-simple-split nil 1008030 -10 () ;; つい
(:score))
(def-simple-split nil 1597740 5 (len txt r) ;; ついたて
(:test (eql (word-type r) :kana))
(1008030 2)
(2081610))
(def-simple-split nil 1581550 10 (len txt) ;; 雪がない
(:test (alexandria:starts-with-subseq "雪" txt))
(1386500 1)
(2028930 1)
(:test (> len 2) -2 :pscore)
(1529520 nil t))
(def-simple-split nil 1601080 -5 (len txt) ;; はやめる
(2028920 1)
(1310680 nil t))
(def-simple-split nil 2529050 30 (len txt) ;; 者ども
(1322990 (if (alexandria:starts-with-subseq "もの" txt) 2 1))
(1234250))
(def-simple-split nil 1006280 30 (len txt) ;; すると
(1157170 2)
(1008490 1))
(def-simple-split nil 2757540 90 () ;; 出しな
(1896380 1)
(2728200))
(def-simple-split nil 1606530 100 (len txt) ;; わかりきる
(("分かり" 1606560) 3)
(1384830 nil t))
(def-simple-split nil 2007500 100 (len txt) ;; 落ちこぼれる
(("落ち" 1548550) 2)
(1557650 nil t))
(def-simple-split nil 1532270 100 (len txt) ;; あけましておめでとうございます
(("あけまして" 1202450) 5)
(1001540))
(def-simple-split nil 2133750 100 (len txt) ;; よろしくおねがいします
(1224890 (1+ (position #\く txt)))
(1001720))
(def-simple-split nil 1863230 15 (len txt r) ;; 俺たち
(:test (eql (word-type r) :kana))
(1576870 2)
(1416220))
(def-simple-split nil 2834051 15 (len txt r) ;; お前たち
(:test (eql (word-type r) :kana))
(1002290 3)
(1416220))
(def-simple-split nil 1207840 50 (len) ;; 割り切れる
(("割り" 1208000) 2)
(1384860 nil t))
(def-simple-split nil 2109610 50 (len) ;; あり得ない
(("有り" 1296400) 2)
(1588760 nil t))
;; (def-simple-split nil 2827864 100 (len) ;; なので
;; (2029110 1)
;; (1009970 2))
(def-simple-split nil 1322560 -10 (len txt r) ;; につまる
(:test (eql (word-type r) :kana))
(2028990 1)
(1226480 nil t))
(def-simple-split nil 1006880 50 (len) ;; その上
(1006830 2)
(1352130))
(def-simple-split nil 1601010 50 (len) ;; はね上がる
(("跳ね" 1429620) 2)
(1352290 nil t))
;; SEGMENT SPLITS (allows to expand one segment into several, e.g. "ところが" "ところ+が")
(defparameter *segsplit-map* (make-hash-table)) ;; seq -> split function
(let ((*split-map* *segsplit-map*))
(def-simple-split split-tokoroga 1008570 '(-10) (len) ;; ところが
(1343100 (- len 1))
(2028930 1))
(def-simple-split split-tokorode 1343110 '(-10 :root (1)) (len) ;; ところで
(1343100 (- len 1))
(2028980 1))
(def-simple-split split-dokoroka 2009220 '(-10) (len) ;; 所か
(1343100 (- len 1))
(2028970 1))
(def-simple-split split-tokoroe 2097010 '(-10) (len) ;; ところへ
(1343100 (- len 1))
(2029000 1))
(def-simple-split split-tokorowo 2136660 '(-10) (len) ;; ところを
(1343100 (- len 1))
(2029010 1))
(def-simple-split split-tokorodewa 1897510 '(-10) (len) ;; ところでは
(1343100 (- len 2))
(2028980 1)
(2028920 1))
(def-simple-split split-omise 2409240 '(20 :primary 1 :connector "") (len) ;; お店
(2826528 1)
(1582120))
(def-simple-split split-hitorashii 1366490 '(-10 :connector "") (len) ;; 人らしい
(1580640 (- len 3))
(1013240))
(def-simple-split split-toha 2028950 '(-5) (len) ;; とは
(1008490 1)
(2028920 1))
(def-simple-split split-deha 1008450 '(-5) (len) ;; では
(2028980 1)
(2028920 1))
(def-simple-split split-naito 2394710 '(-5) (len) ;; ないと
(1529520 2)
(1008490 1))
(def-simple-split split-honno 1011740 '(-5) (len) ;; ほんの
(1522150 (- len 1))
(1469800 1))
(def-simple-split split-kanatte 1208870 '(5) (len txt) ;; かなって
(:test (equal txt "かなって"))
(1002940 2)
(2086960 2))
(def-simple-split split-dakara 1007310 '(-5) () ;; だから
(2089020 1)
(1002980))
(def-simple-split nil 1675330 '(10 :primary 1) () ;; から元気
(1002980 2)
(1260720))
(def-simple-split nil 2841254 '(5) () ;; からって
(1002980 2)
(2086960 2))
(def-simple-split nil 1567610 '(5) (len txt) ;; もんだ
(:test (equal txt "もんだ"))
(1502390 2)
(2089020))
(def-simple-split nil 1010105 '(5) (len txt) ;; はぐったり
(:test (equal txt "はぐったり"))
(2028920 1)
(1004070))
)
(defun get-segsplit (segment &aux (word (segment-word segment)))
(when (typep word 'simple-text)
(let ((*split-map* *segsplit-map*))
(multiple-value-bind (split attrs)
(get-split word (cdr (getf (segment-info segment) :seq-set)))
(when split
(destructuring-bind (score &key (primary 0) (connector " ") root) attrs
(let* ((word
(make-instance 'compound-text
:text (join "" (mapcar 'get-text split))
:kana (join connector (mapcar 'get-kana split))
:primary (elt split primary)
:words split
:score-mod score))
(new-seg (copy-segment segment)))
(when root
(loop for i from 0 for word in split
if (find i root) do (setf (word-conjugations word) :root)))
(setf (segment-word new-seg) word
(segment-text new-seg) (get-text word)
(segment-score new-seg) (+ (segment-score segment) score)
(segment-info new-seg) (nth-value 1 (calc-score (primary word)))
(getf (segment-info new-seg) :conj) (word-conj-data word)
)
new-seg)))))))
;; KANA HINTS (indicate when to romanize は as わ etc.)
(defparameter *kana-hint-mod* #\u200c)
(defparameter *kana-hint-space* #\u200b)
(defparameter *hint-char-map* `(:space ,*kana-hint-space* :mod ,*kana-hint-mod*))
(defparameter *hint-simplify-map*
(list (string *kana-hint-space*) " "
(coerce (list *kana-hint-mod* #\は) 'string) "わ"
(coerce (list *kana-hint-mod* #\ハ) 'string) "ワ"
(coerce (list *kana-hint-mod* #\へ) 'string) "え"
(coerce (list *kana-hint-mod* #\ヘ) 'string) "エ"
(string *kana-hint-mod*) ""))
(defun process-hints (word)
(simplify-ngrams word *hint-simplify-map*))
(defun strip-hints (word)
(remove-if (lambda (c) (find c *hint-char-map*)) word))
(defparameter *kana-hint-map* (make-hash-table)) ;; seq -> split function
(defun insert-hints (str hints &aux (len (length str)))
;; hints are ((character-kw position) ...)
(unless hints
(return-from insert-hints str))
(let ((positions (make-array (1+ len) :initial-element nil)))
(loop for (character-kw position) in hints
for char = (getf *hint-char-map* character-kw)
when (<= 0 position len)
do (push char (aref positions position)))
(with-output-to-string (s)
(loop for i from 0 upto len
do (loop for char in (reverse (aref positions i))
do (write-char char s))
when (< i len)
do (write-char (char str i) s)))))
(defparameter *hint-map* (make-hash-table)) ;; seq -> hint function
(defmacro defhint (seqs (reading-var) &body body)
(unless (listp seqs)
(setf seqs (list seqs)))
(alexandria:with-gensyms (fn)
`(let ((,fn (lambda (,reading-var) ,@body)))
,@(loop for seq in seqs
collect `(setf (gethash ,seq *hint-map*) ,fn)))))
(defmacro def-simple-hint (seqs (&optional length-var kana-var reading-var) &body hints-def
&aux test-var test-var-used)
(unless reading-var (setf reading-var (gensym "RV")))
(unless length-var (setf length-var (gensym "LV")))
(unless kana-var (setf kana-var (gensym "KV")))
(setf test-var (gensym "TV"))
`(defhint ,seqs (,reading-var)
(block hint
(let* ((,kana-var (true-kana ,reading-var))
(,length-var (length ,kana-var))
,@(loop for (var value) in hints-def
for tvar = (cond ((eql var :test) (setf test-var-used t) test-var)
((keywordp var) nil)
(t var))
when tvar collect `(,tvar (or ,value (return-from hint nil)))))
(declare (ignorable ,length-var ,@(when test-var-used (list test-var))))
(insert-hints (get-kana ,reading-var)
(list
,@(loop for pair in hints-def
when (and (keywordp (car pair)) (not (eql (car pair) :test)))
collect `(list ,@pair))))))))
(defun translate-hint-position (match position)
(loop with off = 0 and rem = position
for part in match
do (if (atom part)
(let ((len (length part)))
(cond
((<= rem len) (return (+ off rem)))
(t (decf rem len) (incf off len))))
(let ((len (length (first part)))
(clen (length (second part))))
(cond
((< rem len)
(return (+ off (min 1 (max clen rem)))))
((= rem len)
(return (+ off clen)))
(t (decf rem len) (incf off clen)))))))
(defun translate-hints (match hints)
(loop for (hint pos) in hints
for new-pos = (translate-hint-position match pos)
if new-pos collect (list hint new-pos)))
(defparameter *easy-hints-seqs* nil "Only used for testing")
(defun check-easy-hints ()
(with-db nil
(let ((readings (select-dao 'kana-text (:in 'seq (:set *easy-hints-seqs*))))
(*disable-hints* t))
(loop for reading in readings
for kanji = (true-kanji reading)
for kana = (true-kana reading)
for match = (ichiran/kanji:match-readings kanji kana)
unless match collect (list reading kanji kana)))))
(defmacro def-easy-hint (seq kanji-split)
(let* ((parts (split-sequence #\Space kanji-split))
(text (remove #\Space kanji-split))
(hints (loop with pos = 0
for part in parts
unless (zerop pos)
collect (list :space pos)
and if (find part '("は" "へ" "には" "とは") :test 'equal)
collect (list :mod (+ pos (length part) -1))
do (incf pos (length part))))
(reading-var (gensym "RV")))
(alexandria:with-gensyms (match kr rtext)
`(progn
(push ,seq *easy-hints-seqs*)
(defhint (,seq) (,reading-var)
(when (typep ,reading-var 'simple-text)
(let* ((,rtext (true-kanji ,reading-var))
(,match (match-diff ,text ,rtext))
(,kr (ichiran/kanji:match-readings ,rtext (true-kana ,reading-var))))
(when (and ,match ,kr)
(insert-hints (get-kana ,reading-var) (translate-hints ,kr (translate-hints ,match ',hints)))))))))))
(defun get-hint (reading)
(let ((hint-fn (gethash (seq reading) *hint-map*))
(conj-of (mapcar #'conj-data-from (word-conj-data reading))))
(if hint-fn
(funcall hint-fn reading)
(loop for seq in conj-of
for hint-fn = (gethash seq *hint-map*)
when hint-fn do (return (funcall hint-fn reading))))))
(defparameter *hints-checked*
(mapcar 'car
'(
;; は
(1186700 "化けの皮をはぐ") (1236510 "強盗にはいる") (1252080 "はかり知れない")
(1259320 "見えをはる") (1259320 "見栄をはる") (1324680 "弱音をはく") (1327220 "手にはいる")
(1348240 "小耳にはさむ") (1370020 "はなはだ以て") (1483810 "皮をはぐ") (1531720 "名前をはせる")
(1535270 "目をはなす") (1540770 "憂さをはらしに") (1632820 "意地をはる") (1636580 "型にはめる")
(1641640 "耳がはやい") (1671190 "あぶはち取らず") (1856780 "当てがはずれる") (1872190 "口をはさむ")
(1872750 "車にはねられる") (1899360 "並はずれて") (1901660 "はしごを掛ける")
(1917360 "枠にはまる") (2006850 "思う壷にはまる") (2006850 "思う壺にはまる") (2006850 "思うつぼにはまる")
(2020910 "目をみはる") (2029360 "胸をはる") (2067580 "羽目をはずす") (2067580 "はめを外す")
(2095060 "上前をはねる") (2099720 "色は順") (2099720 "いろは順") (2099770 "思いをはせる")
(2101090 "公言してはばからない") (2114550 "はっと息を呑む") (2115810 "生まれてはじめて")
(2121160 "薄紙をはぐように") (2125840 "手がはなせない") (2140480 "物のあはれ") (2183830 "命をはる")
(2183840 "命はる") (2207940 "はぶりが良い") (2215370 "はずが無い") (2223210 "はらわたが煮えくり返る")
(2263410 "横から口をはさむ") (2276210 "身包みはがれる") (2276210 "身ぐるみはがれる") (2399360 "意地はる")
(2399890 "耳にはいる") (2401870 "敬意をはらう") (2402670 "気をはる") (2407860 "めりはりの利いた")
(2557390 "感情にはしる") (2560100 "薄皮をはぐように")
(2568460 "名をはせる") (2603950 "型にはまる") (2627910 "はるの雲") (2655400 "心臓にけがはえている")
(2655420 "心臓にけがはえた") (2657160 "融和をはかる") (2678440 "其れも其のはず")
(2684060 "出はなをくじく") (2684060 "出はなを挫く") (2709160 "宇宙の距離はしご") (2717360 "えも言はず")
(2727860 "壺にはまる") (2729830 "物のはずみ") (2755410 "はるか遠く") (2759720 "はっきり言う")
(2776170 "はるの川") (2777440 "めりはりを付ける") (2793750 "嘴をはさむ") (2795820 "良きにはからえ")
(2799570 "大枚をはたく") (2817370 "はかが行く") (2826563 "予防線をはる") (2827090 "はさみを入れる")
(2829589 "分母をはらう") (2831138 "はずの無い") (2832146 "はだかの王さま") (2832146 "はだかの王様")
(2833092 "梅雨のはしり") (2833874 "耳にはさむ") (2834024 "はなも引っかけない")
(2835778 "今はやり") (2836685 "はたから見る") (2836884 "下駄をはかせる") (2837561 "二足のわらじをはく")
(2837561 "二足の草鞋をはく") (2837752 "肩肘をはる") (2837752 "肩ひじをはる") (2839604 "音程がはずれる")
(2841916 "はたきを掛ける") (1002340 "おはよう御座います") (2159030 "はしごを外される") (2131510 "根をはる")
(2131510 "根をはる") (2131510 "根をはる") (2849623 "少しでもはやく") (2238150 "はぶりの良い")
(2832275 "算盤をはじく") (2832275 "算盤をはじく") (2850988 "無念をはらす") (1344300 "もろはの剣")
(2102270 "鬱憤をはらす") (2708470 "百舌のはやにえ") (2770500 "はしが進む") (2770500 "はしが進む") (2788150 "本題にはいる")
(2859213 "頬をはる") (2858410 "筆をはしらせる")
;; へ
(1185210 "へたの横好き") (1381650 "青天のへきれき") (1381650 "晴天のへきれき") (1919400 "へそを曲げる")
(2217150 "お腹がへる") (2222890 "構へん") (2399430 "口がへらない") (2399440 "口のへらない") (2761770 "へそで茶をわかす")
(2761770 "へそで茶を沸かす") (2794610 "へそ出しルック") (2796060 "へどが出る") (2803060 "明へん")
(2803060 "明けへん") (2803060 "明かへん") (2830220 "へそが茶を沸かす") (2830220 "へそが茶をわかす")
(2830575 "鼻っ柱をへし折る") (2418770 "憎まれっ子世にはばかる") (2844727 "陰裏の豆もはじけ時") (2844727 "陰うらの豆もはじけ時")
(2847931 "はず無い") (2848855 "暴言をはく") (1626200 "腹がへる") (2126810 "平気のへいざ")
)))
#|
(query (:select 'kt.seq 'kt.text :from (:as 'kanji-text 'kt) (:as 'sense-prop 'sp)
:where (:and (:= 'kt.seq 'sp.seq)
(:= 'sp.tag "pos")
(:or (:= 'sp.text "exp") (:= 'sp.text "int"))
(:like 'kt.text "%は%")
(:not (:in 'kt.seq (:set (union (alexandria:hash-table-keys *hint-map*)
*hints-checked*)))))))
|#
;; TODO pos=adv
;; expressions ending with は
(def-simple-hint
(2028920 ;; は
2029000 ;; へ
)
(l)
(:mod (- l 1)))
(def-simple-hint ;; no space
(1289480 ;; こんばんは
1289400 ;; こんにちは
1008450 ;; では
2215430 ;; には
2028950 ;; とは
)
(l k)
(:test (alexandria:ends-with #\は k))
(:mod (- l 1)))
(def-simple-hint ;; with space
(1006660 ;; そうでないばあいは
1008500 ;; というのは
1307530 ;; はじめは
1320830 ;; じつは
1324320 ;; もしくは
1524990 ;; または
1586850 ;; あるいは
1586850 ;; あるは
1877880 ;; ごきぼうのむきは
1897510 ;; ところでは
1907300 ;; へいそは
1912570 ;; もとは
2034440 ;; にかけては
2098160 ;; なくては
2105820 ;; にしては
2134680 ;; それは
2136300 ;; ということは