This repository has been archived by the owner on Nov 10, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
index.html
1155 lines (939 loc) · 68.2 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<!-- todo: validate, e.g., using https://validator.w3.org/ -->
<head>
<title>The Ontolex Module for Frequency, Attestation and Corpus Information</title>
<meta charset='utf-8'>
<script src='https://www.w3.org/Tools/respec/respec-w3c-common'
async class='remove'></script>
<link rel="stylesheet" href="stylesheets/codemirror.css">
<!-- style taken from the ontolex report -->
<style>
/* Heading links: render anchors inside h1/h2 in dark grey, without underline,
   regardless of their visited state. */
h2 a {
  color:#444;
  text-decoration:none;
}
h2 :link {
  color:#444;
}
h2 :visited {
  color:#444;
}
h1 a {
  color:#444;
  text-decoration:none;
}
h1 :link {
  color:#444;
}
h1 :visited {
  color:#444;
}

/* Boxed definition of a vocabulary element (class or property).
   `max-width` was previously misspelled as `maxwidth` (an unknown property
   that browsers silently drop); the duplicated margin-left/right
   declarations have been collapsed. */
.entity { border:1px solid #000080; width:80%; max-width:80%; margin-left:auto; margin-right:auto; margin-bottom:30px; margin-top:30px; padding: 10px; }
.entity h3 { margin-top:3px;padding-bottom:5px;border-bottom:1px solid #000080; }
/* Formal characteristics (domain, range, restrictions) inside an entity box. */
.description { border-top:1px dashed #808080; border-bottom:1px dashed #808080; margin-top:5px; padding-bottom:5px; }

/* Example images and example boxes ("beispiel" is German for "example"). */
img.example { max-width:100%; margin-left:auto;margin-right:auto;display:block;}
.beispiel { border: 1px dashed #808080; width:80%; margin-left:auto; margin-right:auto; margin-bottom:30px; margin-top:30px; overflow:hidden;}
.beispiel img { text-align:center; margin: 0px; border: 1px solid #000080;}
.beispiel pre { border:none; font-size:130%;}
.beispiel a { display:block; margin: 20px; padding:0px;}

/* Elements with class "caption" are hidden. */
.caption {display:none;}
/* Images inside ".tn" containers never exceed the container width. */
.tn img {max-width:100%;}
</style>
<script src="javascripts/codemirror-compressed.js"></script>
<script src="https://codemirror.net/mode/sparql/sparql.js"></script>
<script src="https://codemirror.net/addon/runmode/runmode.js"></script>
<script src="https://codemirror.net/addon/runmode/colorize.js"></script>
<script class='remove'>
// Configuration object for the ReSpec script loaded in <head>; it is exposed
// as the global `respecConfig`.
var respecConfig = {
// Document status, short name and publication date of this draft.
specStatus: "CG-DRAFT",
doRDFa: "1.1",
shortName: "ontolex-frac",
publishDate: "2018-11-09",
// NOTE(review): only the first editor entry is filled in; "editor2" and
// "editor3" are placeholders to be replaced before publication.
editors: [
{ name: "Christian Chiarcos",
url: "http://acoli.informatik.uni-frankfurt.de/",
company: "Applied Computational Linguistics, Goethe Universität Frankfurt, Germany",
companyURL: "http://informatik.uni-frankfurt.de" },
{ name: "editor2"},
{ name: "editor3"}
],
// NOTE(review): all author entries are placeholders ("author1", "author2", "...").
authors: [
{ name: "author1"},
{ name: "author2"},
{ name: "..."},
],
// NOTE(review): previousMaturity is set but previousPublishDate is empty —
// confirm whether a previous draft exists and either fill in its date or
// drop both fields.
previousMaturity: "CG-DRAFT",
previousPublishDate: "",
// Community group name, home page and public mailing-list archive.
wg: "Ontology Lexica",
wgURI: "http://www.w3.org/community/ontolex/",
wgPublicList: "http://lists.w3.org/Archives/Public/public-ontolex/",
// wgPatentURI: "http://www.w3.org/2004/01/pp-impl/424242/status",
};
</script>
<link rel="stylesheet" href="stylesheets/codemirror.css">
<script src="javascripts/codemirror.js"></script>
</head>
<body>
<section id='abstract'>
<p>
This document describes the <em>module for frequency, attestation and corpus information</em> of the Lexicon Model for Ontologies (<em>lemon</em>) as a result of the work of the Ontology Lexica community group (OntoLex). The module is targeted at complementing dictionaries and other linguistic resources containing lexicographic data with a vocabulary to express</p>
<ul>
<li> corpus-derived statistics (frequency and cooccurrence information, collocations),</li>
<li> pointers from lexical resources to corpora and other collections of text (attestations),</li>
<li> the annotation of corpora and other language resources with lexical information (lemmatization against a dictionary), and</li>
<li> distributional semantics (collocation vectors, word embeddings, sense embeddings, concept embeddings).</li>
</ul>
<p>
The module tackles use cases in corpus-based lexicography, corpus linguistics and natural language processing, and operates in combination with the <em>lemon</em> core module, referred to as <em>OntoLex</em>, as well as with other <em>lemon</em> modules.
</p>
</section>
<section id='sotd'>
<p>This document is a working draft for a module for frequency, attestation and corpus data of the OntoLex specifications.
It is not a W3C Standard nor is it on the W3C Standards Track.</p>
<p>There are a number of ways that one may participate in the development of this report:</p>
<ul>
<li>Mailing list: <a href="http://lists.w3.org/Archives/Public/public-ontolex/">public-ontolex@w3.org</a>
<li>Wiki: <a href="https://www.w3.org/community/ontolex/wiki/Main_Page">Main page</a>
<li>More information about meetings of the ONTOLEX group can be obtained
<a href="https://www.w3.org/community/ontolex/wiki/Main_Page#Meetings">here</a></li>
<li><a href="https://github.com/acoli-repo/ontolex-frac">Source code</a>
for this document can be found on Github.</li>
</ul>
<p>Disclaimer: This draft follows closely the structure and design of <a href="https://jogracia.github.io/ontolex-lexicog/">The Ontolex Lexicography Module. Draft Community Group Report 28 October 2018</a>, edited by Julia Bosque-Gil and Jorge Gracia. In particular, motivational and introductory text are partially adapted without being marked as quotes. This is to be replaced by original text before publication.
</p>
</section>
<section>
<h2>Introduction</h2>
<section>
<h3> Background and Motivation </h3>
<p> The <a href="https://www.w3.org/2016/05/ontolex/"><em>lemon</em> model</a> provides a <a href="https://www.w3.org/2016/05/ontolex/#core">core</a> vocabulary (OntoLex) to represent <em>linguistic information</em> associated to ontology and vocabulary elements. The model follows the principle of <em>semantics by reference</em> in the sense that the semantics of a <a href="https://www.w3.org/2016/05/ontolex/#LexicalEntry">lexical entry</a> is expressed by reference to an individual, class or property defined in an ontology. </p>
<p> The current version of <em>lemon</em> (as an outcome of the OntoLex group, sometimes referred to as OntoLex-lemon in the literature) as well as its previous version (<a href="https://lemon-model.net/">lemon</a> [<cite><a href="#bib-lemon_paper">1</a></cite>]) have been increasingly used in the context of dictionaries and lexicographical data to convert existing lexicographic information into the standards and formats of the Semantic Web. In consequence, a designated <em>lemon</em> <a href="https://jogracia.github.io/ontolex-lexicog/">module for lexicography</a> (<em>lexicog</em>) has been designed, with applications in monolingual [<cite><a href="#bib-klimek-kdict">2</a></cite>], bilingual [<cite><a href="#bib-gracia-apertium">3</a></cite>], and multilingual [<cite><a href="#bib-bosque-kdict">4</a></cite>] dictionaries, as well as diachronic [<cite><a href="#bib-kahn-diachronic">5</a></cite>], dialectal [<cite><a href="#bib-declerck-dialectal">6</a></cite>], and etymological ones [<cite><a href="#bib-abromeit-etymological">7</a></cite>], among others.
This module is partially motivated by requirements of corpus-based lexicography (frequency and collocation information) and digital philology (linking lexical resources with corpus data).</p>
<p> A second motivation for a <em>lemon</em> model for corpus-based information comes from natural language processing. With the rise of distributional semantics since the early 1990s, lexical semantics have been complemented by corpus-based co-occurrence statistics (KEYNESS-REFERENCE???), collocation vectors (Schütze 1993), word embeddings (Collobert et al. 2012) and sense embeddings (??? and Schütze, 2017). With the proposed module, <em>lemon</em> can serve as a community standard to encode, store and exchange vector representations (embeddings) along with the lexical concepts, senses, lemmas or words that they represent. The processing of word embeddings is beyond the scope of this module. Embeddings are thus represented as literals ("BLOB").</p>
<p> The added value of using linked data technologies to represent such information is an increased level of interoperability and integration between different types of lexical resources, the textual data they pertain to, as well as distributional representations of words, lexical senses and lexical concepts. Creating a <em>lemon</em> module in the OntoLex CG is a suitable means for establishing a vocabulary on a broad consensus that takes into account all use cases identified above in an adequate fashion.
<!-- From lexicog:
<p> After analysing the literature, the proposers of this module perceived an obvious need for reaching some agreement that allows for a better and more inter-operable migration of existing dictionaries into linked data [<cite><a href="#bib-bosque-module">10</a></cite>]. For illustration, the OGL ontology [<cite><a href="#bib-parvizi-oxford">11</a></cite>] has its own notion of dictionary entry materialised in the <tt>ogl:Entry</tt> class, while in [<cite><a href="#bib-bosque-kdict">4</a></cite>] the ad-hoc <tt>kd:dictionaryEntry</tt> relation was introduced in the conversion of the KD Multilingual Global Series dictionaries, i.e, different researchers introduced their own modelling solutions to account for similar notions. Being interoperability a key issue in linked data technologies, building a common space in which these concepts can be agreed on and commonly defined comes as a logical step.-->
The OntoLex community is the natural forum to accomplish this for several reasons: </p>
<ol>
<li> The extended use of <em>lemon</em> to support digital lexicography,
<li> the improved application and applicability of <em>lemon</em> in natural language processing,
<li> the coming together of the lexicography, AI and human language technology communities, resp. resources, and
<li> the possibility of reusing already available mechanisms in <em>lemon</em>, preventing researchers from "re-inventing the wheel".
</ol>
</section>
<section>
<h3> Aim and Scope </h3>
<p>
The goal of this module is to complement <em>lemon</em> core elements with a vocabulary layer to represent lexicographical and semantic information derived from or defined with reference to corpora and external resources in a way that (a) <i>generalizes</i> over use cases from digital lexicography, natural language processing, artificial intelligence, computational philology and corpus linguistics, that (b) facilitates <i>exchange, storage and re-usability</i> of such data along with lexical information,
and that (c) <i>minimizes information loss</i>.
</p>
<p> The scope of the model is three-fold:
<ol>
<li> extending the <em>OntoLex-lexicog</em> model with corpus information to support existing challenges in corpus-driven lexicography,</li>
<li> modelling <em>existing</em> lexical and distributional-semantic resources (corpus-based dictionaries, collocation dictionaries, embeddings) as linked data, to allow their conjoint publication and inter-operation by Semantic Web standards, and
<li> providing a conceptual / abstract model of relevant concepts in <em>distributional semantics</em> that facilitates building linked data-based applications that consume and combine both lexical and distributional information.
</ol>
<div class="note"><p>
<em>Corpus</em> as used throughout this document is understood in its traditional, broader sense as a structured data collection -- or material suitable for being included into such a collection, such as manuscripts or other works.
We do not intend to limit the use of the term to corpora in a linguistic or NLP sense. Language resources of any kind (web documents, dictionaries, plain text, unannotated corpora, etc.) are considered "corpus data" and a collection of such information as a "corpus" in this sense. Any information drawn from or pertaining to such information is considered "corpus-based".
</p>
</div>
<!--
lexicog:
<div class="note"><p>
In terms of applying the module, we propose the following best practice or "rule of thumb" ... :
<ol>
<li> As long as the entities in OntoLex and the other <em>lemon</em> modules, together with those of catalogues of linguistic categories (e.g. LexInfo), suffice to represent the information encoded in the lexicographic resource (e.g., lexical entry, part of speech, translation, ...), the OntoLex lexicography module should not be instantiated.
<li> In case that there is some lexicographic information that cannot be modelled by using either OntoLex or any of the other <em>lemon</em> modules (e.g., to denote sense ordering), then the model should be instantiated but avoiding duplicities and keeping extra information to the minimum.
</ol>
The reason behind this is that this module adds some complexity by providing additional description capabilities to the purely lexical description accounted by OntoLex. If this information is not needed for a specific conversion, i.e, if the lexicographical view is not key, reusing <em>lemon</em> would allow to keep the representation simpler but still sufficient.
</p></div-->
</section>
<section>
<h3> Namespaces </h3>
This is a list of relevant namespaces that will be used in the rest of this document:
<p> OntoLex module for frequency, attestation and corpus information
<pre><code class="cm">
@prefix frac: &lt;http://www.w3.org/ns/lemon/frac#&gt; .
</code>
</pre>
</p>
<p> OntoLex (core) model and other <em>lemon</em> modules:
<pre><code class="cm">
@prefix ontolex: &lt;http://www.w3.org/ns/lemon/ontolex#&gt; .
@prefix synsem: &lt;http://www.w3.org/ns/lemon/synsem#&gt; .
@prefix decomp: &lt;http://www.w3.org/ns/lemon/decomp#&gt; .
@prefix vartrans: &lt;http://www.w3.org/ns/lemon/vartrans#&gt; .
@prefix lime: &lt;http://www.w3.org/ns/lemon/lime#&gt; .
@prefix lexicog: &lt;http://www.w3.org/ns/lemon/lexicog#&gt; .
</code>
</pre>
</p>
<p> Other models [TO REVIEW]:
<pre><code class="cm">
@prefix rdf: &lt;http://www.w3.org/1999/02/22-rdf-syntax-ns#&gt;.
@prefix owl: &lt;http://www.w3.org/2002/07/owl#&gt;.
@prefix xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt;.
@prefix skos: &lt;http://www.w3.org/2004/02/skos/core#&gt;.
@prefix dbr: &lt;http://dbpedia.org/resource/&gt;.
@prefix dbo: &lt;http://dbpedia.org/ontology/&gt;.
@prefix void: &lt;http://rdfs.org/ns/void#&gt;.
@prefix lexinfo: &lt;http://www.lexinfo.net/ontology/2.0/lexinfo#&gt;.
@prefix dct: &lt;http://purl.org/dc/terms/&gt;.
@prefix provo: &lt;http://www.w3.org/ns/prov#&gt;.
@prefix rdfs: &lt;http://www.w3.org/2000/01/rdf-schema#&gt;.
@prefix oa: &lt;http://www.w3.org/ns/oa#&gt;.
@prefix aat: &lt;http://vocab.getty.edu/aat/&gt;.</code>
</pre>
</p>
</section>
<section>
<h3> ontolex:Element </h3>
<p>
We consider all <em>lemon</em> core concepts as being countable, annotatable/attestable and suitable for a numerical representation by means of a vector (embedding). For this reason, we define the rdfs:domain of all properties that link lexical and corpus information by means of ontolex:Element, an abstract superclass of
ontolex:Form (for word frequency and plain word/phrase embeddings),
ontolex:LexicalEntry (for lemma frequency and lemma-based word/phrase embeddings),
ontolex:LexicalSense (for sense frequency and sense embeddings), and
ontolex:LexicalConcept (for concept frequency and concept embeddings).
<figure>
<img src="img/ontolex-element.png"
title="ontolex:Element"
alt="ontolex-element.png" width="80%"><figcaption>ontolex:Element as a superclass of ontolex:LexicalEntry, ontolex:Form, ontolex:LexicalSense and ontolex:LexicalConcept</figcaption>
</figure>
<div class="note"><p>
Such a top-level concept used to exist in <em>Monnet-lemon</em>, but has been abandoned in the 2016 edition of <em>lemon</em>.
If this concept is not provided by a future revision of the <em>lemon</em> core vocabulary, it will be introduced by this module.
Note that the introduction of ontolex:Element has no effect on <em>lemon</em> core other than facilitating vocabulary organization, as ontolex:Element is not to be used for data modeling.</p></div>
</p>
</section>
</section>
<section>
<h2>Overview</h2>
The following diagram depicts the OntoLex module for frequency, attestation and corpus information (<i>frac</i>). Boxes represent classes of the model. Arrows with filled heads represent object properties. Arrows with empty heads represent rdfs:subClassOf.
Vocabulary elements introduced by this module are shaded grey (classes) or set in <i>italics</i>.
<figure>
<!--img src="img/ontolex-frac-2018-11.png" title="ontolex-frac-2018-11.png" alt="ontolex-frac-2018-11.png" width="80%"><figcaption>Module for Frequency, Attestation and Corpus Information (<i>frac</i>), overview</figcaption-->
<img src="img/ontolex-frac-2019-03.png" title="ontolex-frac-2019-03.png" alt="ontolex-frac-2019-03.png" width="80%"><figcaption>Module for Frequency, Attestation and Corpus Information (<i>frac</i>), overview</figcaption>
</figure>
<div class="note"><p>DISCUSSION:
Looks more complicated than it is. Shall we drop inferrable information ? (rdf:rest, rdf:first are available vocabulary elements because ContextualRelation is a subclass of rdf:List, subclasses of ontolex:Element should be dropped once ontolex:Element is introduced.)
Keep rdf:List elements only if preserved in other ontolex modules.
</p>
</div>
</section>
<section>
<h2>Definitions</h2>
<section>
<h3>Frequency</h3>
<p> Frequency information is a crucial component in human language technology. Corpus-based lexicography originates with Francis and Kucera (1958), and subsequently, the analysis of frequency distributions of word forms, lemmas and other linguistic elements has become a standard technique in lexicography and philology, and given rise to the field of corpus linguistics.
At its core, this means that lexicographers use corpus frequency and distribution information while compiling lexical entries (also see the section on collocations and similarity below).
As a qualitative assessment, frequency can be expressed with <a href="http://www.lexinfo.net/ontology/2.0/lexinfo#frequency">lexinfo:frequency</a>, "[t]he relative commonness with which a term occurs". However, this is an object property with possible values lexinfo:commonlyUsed, lexinfo:infrequentlyUsed, lexinfo:rarelyUsed, while absolute counts over a particular resource (corpus) require novel vocabulary elements.
</p>
<p>
Absolute frequencies are used in computational lexicography (e.g., the <a href="http://oracc.museum.upenn.edu/epsd2/">Electronic Penn Sumerian Dictionary</a>), and they are an essential piece of information for NLP and corpus linguistics.
In order to avoid confusion with lexinfo:frequency, the frequency introduced here is defined with reference to a particular dataset, a corpus.
</p>
<p><div class='entity'>
<h3>frequency (ObjectProperty)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/ns/lemon/frac#frequency" class="uri">http://www.w3.org/ns/lemon/frac#frequency</a></p>
</div>
<div class='comment'>
<p>The property <strong>frequency</strong> assigns a particular ontolex:Element a frac:CorpusFrequency.</p>
</div>
<div class='description'>
<p><strong>rdfs:domain</strong> ontolex:Element</p>
<p><strong>rdfs:range</strong> frac:CorpusFrequency</p>
</div>
</div></p>
<p><div class='entity'>
<h3>CorpusFrequency (Class)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/ns/lemon/frac#CorpusFrequency" class="uri">http://www.w3.org/ns/lemon/frac#CorpusFrequency</a></p>
</div>
<div class='comment'>
<p><strong>Corpus frequency</strong> provides the absolute number of attestations (rdf:value) of a particular ontolex:Element (see frac:frequency) in a particular language resource (dct:source).</p>
</div>
<div class='description'>
<p><strong>SubClassOf:</strong> rdf:value exactly 1 xsd:int, dct:source min 1</p>
</div>
</div></p>
<div class="note">
<p>If information from multiple language resources is aggregated (also cf. the section on embeddings below), multiple <tt>dct:source</tt> statements should be provided, to each resource individually. The cardinality of <tt>dct:source</tt> is thus 1 or higher.
</p>
</div>
<div class="note">
<p>QUESTION: better alternative to dct:source?</p>
</div>
<p>
The following example illustrates word and form frequencies for the Sumerian word <i>a</i> (n.) "water" from the <a href="http://oracc.museum.upenn.edu/epsd2/sux">Electronic Penn Sumerian Dictionary</a> and the frequencies of the underlying corpus.
</p>
<div class='beispiel'>
<!--p><a href='Examples/example1.png' class='tn'/><img src='Examples/example1.png'/></a></p-->
<div>
<pre><code>
# word frequency, over all form variants
epsd:a_water_n a ontolex:LexicalEntry;
frac:frequency [
a frac:CorpusFrequency;
rdf:value "4683"^^xsd:int;
dct:source &lt;http://oracc.museum.upenn.edu/epsd2/pager&gt; ] .
# form frequency for individual orthographical variants
epsd:a_water_n ontolex:canonicalForm [
ontolex:writtenRep "𒀀"@sux-Xsux, "a"@sux-Latn;
frac:frequency [
a frac:CorpusFrequency;
rdf:value "4656"^^xsd:int;
dct:source &lt;http://oracc.museum.upenn.edu/epsd2/pager&gt; ] ] .
epsd:a_water_n ontolex:otherForm [
ontolex:writtenRep "𒀉"@sux-Xsux, "a2"@sux-Latn;
frac:frequency [
a frac:CorpusFrequency;
rdf:value "1"^^xsd:int;
dct:source &lt;http://oracc.museum.upenn.edu/epsd2/pager&gt; ] ] .
epsd:a_water_n ontolex:otherForm [
ontolex:writtenRep "𒂊"@sux-Xsux, "e"@sux-Latn;
frac:frequency [
a frac:CorpusFrequency;
rdf:value "24"^^xsd:int;
dct:source &lt;http://oracc.museum.upenn.edu/epsd2/pager&gt; ] ].</code></pre>
</div>
</div>
<p>
The example shows orthographic variation (in the original writing system, Sumerian Cuneiform sux-Xsux, and its Latin transcription sux-Latn). It is slightly simplified insofar as the ePSD2 provides individual counts for different periods and that only three of six orthographical variants are given. Note that these are orthographical variants, not morphological variants (which are not given in the dictionary).
</p>
<div class="note">
<p>It is necessary to provide the link to the underlying corpus <em>for every frequency assessment</em> because the same element may receive different counts over different corpora. For data modelling, it is recommended to define a corpus- or collection-specific subclass of frac:CorpusFrequency with a fixed dct:source value. This leads to more compact data and avoids potential difficulties with the Open World Assumption (interpretability of incomplete data).
<div class='beispiel'>
<div>
<pre>
<code>
# Corpus Frequency in the EPSD corpus
:EPSDFrequency rdfs:subClassOf frac:CorpusFrequency.
:EPSDFrequency rdfs:subClassOf
[ a owl:Restriction ;
owl:onProperty dct:source ;
owl:hasValue &lt;http://oracc.museum.upenn.edu/epsd2/pager&gt; ] .
# frequency assessment
epsd:a_water_n frac:frequency [
a :EPSDFrequency;
rdf:value "4683"^^xsd:int ].</code>
</pre>
</div>
</div>
</div>
<div class="note">
<p>frac:CorpusFrequency can be extended with additional filter conditions to define sub-corpora. For example, we can restrict the subcorpus to a particular time period, e.g., the Neo-Sumerian Ur III period:
<div class="beispiel">
<div>
<pre>
<code>
# EPSD frequency for the Ur-III period (aat:300019910)
:EPSDFrequency_UrIII
rdfs:subClassOf :EPSDFrequency;
rdfs:subClassOf
[ a owl:Restriction ;
owl:onProperty dct:temporal ;
owl:hasValue aat:300019910 ] .
# frequency assessment for sub-corpus
epsd:a_water_n frac:frequency [
a :EPSDFrequency_UrIII;
rdf:value "2299"^^xsd:int ].
</code></pre></div></div></div>
</section>
<section>
<h3>Attestation</h3>
<div class="note">
<p>This is an attempt at a consensus model based on Depuydt and de Does (2018) and Khan and Boschetti (2018). We focus on data structures; the following aspects are not covered: datatype properties regarding confidence (assumed to be in lexinfo), bibliographical details (subject to other vocabularies), and details of resource linking (subject to other vocabularies).</p>
<figure>
<img src="img/attestations-lexcit.png"
title="Depuydt and de Does (2018)"
alt="img/attestations-lexcit.png" width="80%"/><figcaption>Attestation module following Depuydt and de Does (2018)</figcaption>
</figure>
<figure>
<img src="img/attestations-khan-boschetti.png"
title="Khan and Boschetti (2018)"
alt="img/attestations-khan-boschetti.png" width="80%"/><figcaption>Attestation module following Khan and Boschetti (2018)</figcaption>
</figure>
</div>
<p>"Lexicographers use examples to support their analysis of the headword. The examples can either be
authentic (exact quotations), adapted (modified versions of authentic examples) or invented examples.
Authentic examples are attributed quotations (citations), which not only elucidate
meaning and illustrate features of the headword (spelling, syntax, collocation, register etc.), but also
function as attestations and are used provide evidence of the existence of a headword.
We therefore call these examples “attestations”." (Depuydt and de Does 2018)
</p>
<p><div class='entity'>
<h3>attestation (ObjectProperty)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#attestation" class="uri">http://www.w3.org/nl/lemon/frac#attestation</a></p>
</div>
<div class='comment'>
<p>The property <strong>attestation</strong> assigns a particular ontolex:Element a frac:Attestation.</p>
</div>
<div class='description'>
<p><strong>rdfs:domain</strong> ontolex:Element</p>
<p><strong>rdfs:range</strong> frac:Attestation</p>
</div>
</div></p>
<p><div class='entity'>
<h3>Attestation (Class)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#Attestation" class="uri">http://www.w3.org/nl/lemon/frac#Attestation</a></p>
</div>
<div class='comment'>
<p>An <strong>Attestation</strong> is normally an exact or normalized quotation or excerpt from a source document that illustrates a particular form, sense or lexeme in authentic data.
Attestations should be accompanied by a <tt>Citation</tt> or the URI of a digital edition of the respective locus (<tt>dct:source</tt>). This URI can be externally defined (e.g., as an <tt>oa:Annotation</tt> or as a NIF URI), and can refer either to the entire work or to the exact location of the attestation within this source.</p>
</div>
<div class='description'>
<p><strong>SubClassOf:</strong> rdf:quotation exactly 1 xsd:string</p>
</div>
</div></p>
<p><div class='entity'>
<h3>citation (ObjectProperty)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#citation" class="uri">http://www.w3.org/nl/lemon/frac#citation</a></p>
</div>
<div class='comment'>
<p>The property <strong>citation</strong> assigns a particular ontolex:Element a frac:Citation.</p>
</div>
<div class='description'>
<p><strong>rdfs:domain</strong> ontolex:Element</p>
<p><strong>rdfs:range</strong> frac:Citation</p>
</div>
</div></p>
<p><div class='entity'>
<h3>Citation (Class)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#Citation" class="uri">http://www.w3.org/nl/lemon/frac#Citation</a></p>
</div>
<div class='comment'>
<p>A <strong>Citation</strong> is a bibliographical reference to a source for the definition or illustration of a particular sense, form or lexeme. A citation <i>can</i> provide an attestation, but can also stand on its own.</p>
</div>
</div></p>
<div class="note"><p>Details of bibliographical references are beyond the scope of the current proposal. Several designated vocabularies exist, e.g., FaBiO and CiTO,
<!-- Peroni, S., & Shotton, D. (2012). FaBiO and CiTO: Ontologies for describing bibliographic resources and citations. Web Semantics: Science, Services and Agents on the World Wide Web, 17: 33–43. DOI: 10.1016/j.websem.2012.08.001 -->
Bibo,
<!-- D'Arcus, B., & Giasson, F. (2009). Bibliographic Ontology Specification. Specification Document, 4 November 2009. Retrieved April 9, 2014, from http://bibliontology.com/ -->
the Open Citation Corpus,
<!-- Shotton, D. (2013). Publishing: Open citations. Nature, 502(7471): 295–297. DOI:
10.1038/502295a -->
SpringerNature SciGraph,
<!-- https://scigraph.springernature.com/explorer -->
BiRO or C4O.
<!-- Di Iorio, A., Nuzzolese, A. G., Peroni, S., Shotton, D. M., & Vitali, F. (2014, May). Describing bibliographic references in RDF. In SePublica. -->
</p>
</div>
<p><div class='entity'>
<h3>makeAttestation (ObjectProperty)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#makeAttestation" class="uri">http://www.w3.org/nl/lemon/frac#makeAttestation</a></p>
</div>
<div class='comment'>
<p>The property <strong>makeAttestation</strong> assigns a particular Citation a frac:Attestation.</p>
</div>
<div class='description'>
<p><strong>rdfs:domain</strong> frac:Citation</p>
<p><strong>rdfs:range</strong> frac:Attestation</p>
</div>
</div></p>
<div class="note"><p>CC: Naming follows K and B, I'm not too happy with the name, though, because it's too close to <tt>attestation</tt>, it will likely be confused.</p></div>
</section>
<section>
<h3>Embeddings</h3>
<p>In distributional semantics, the contexts in which a word is attested are taken to define its meaning. Contextual similarity is thus a correlate of semantic similarity. Different representations of context are possible; the most prominent model to date is the vector.
A word vector can be created, for example, by means of a reference list of vocabulary items, where every reference word is associated with a fixed position, e.g., <i>ship</i> with position 1, <i>ocean</i> with 2, <i>sky</i> with 3, etc.
Given a corpus (and a selection criterion for collocates, e.g., within the same sentence), every word in the corpus can be described by the frequencies with which the reference words occur as its collocates in the corpus.
Assume we want to define the meaning of <i>frak</i>, with (exactly) the following attestations in our sample corpus (random samples from <a href="https://en.wikiquote.org/wiki/Battlestar_Galactica_(2003)">wikiquote</a>):
<ul>
<li><i>It's in the frakking ship!</i></li>
<li><i>Have you lost your frakkin' mind?</i></li>
<li><i>Oh, for frak's sake, let me see if I can make heads or tails of it.</i></li>
<li><i>It's a frakking Cylon.</i></li>
<li><i>Our job isn't to be careful, it's to shoot Cylons out of the frakking sky!</i></li>
</ul>
With the following list of reference words: <tt>(ship, ocean, lose, find, brain, mind, head, sky, Cylon, ...)</tt>, we obtain the vector <tt>(1,0,1,0,0,1,1,1,2,...)</tt> for the lemma (lexical entry) <i>frak</i>. For practical applications, these vectors are projected into lower-dimensional spaces, e.g., by means of statistical (Schütze 1993) or neural methods (Socher et al. 2011).
<!-- Socher, R., Huang, E. H., Pennington, J., Manning, C. D., & Ng, A. Y. (2011). Dynamic pooling and unfolding recursive autoencoders for paraphrase detection. In Advances in neural information processing systems (pp. 801-809). -->
The process of mapping a word to a numerical vector and its result are referred to as "word embedding". Aside from collocation counts, other methods for creating word embeddings do exist, but they are always defined relative to a corpus.
</p>
<p>Embeddings have become a dominating paradigm in natural language processing and machine learning, but, if compiled from large corpora, they require long training periods and thus tend to be re-used.
However, embedding distributions often use tool-specific binary formats (cf. <a href="https://radimrehurek.com/gensim/models/word2vec.html">Gensim</a>), and thus a portability problem arises.
CSV and related formats (cf. <a href="https://github.com/baojie/senna/tree/master/embeddings">SENNA embeddings</a>) are a better alternative, but their application to sense and concept embeddings (as provided, for example, by Rothe and Schütze 2017)
<!-- Rothe, S., & Schütze, H. (2017). Autoextend: Combining word embeddings with semantic resources. Computational Linguistics, 43(3), 593-617. -->
is problematic if their distribution is detached from the underlying sense and concept definitions.
With frac, Ontolex-lemon provides a vocabulary for the conjoint publication and sharing of embeddings and lexical information at all levels: non-lemmatized words (ontolex:Form), lemmatized words (ontolex:LexicalEntry), phrases (ontolex:MultiWordExpression), lexical senses (ontolex:LexicalSense) and lexical concepts (ontolex:LexicalConcept).</p>
<div class="note">
<p>We focus on <em>publishing and sharing</em> embeddings, not on their processing by means of Semantic Web formalisms, and thus, embeddings are represented as untyped or string literals with whitespace-separated numbers. If necessary, more elaborate representations, e.g., using rdf:List, may subsequently be generated from these literals.</p>
</div>
<p>Lexicalized embeddings provide their data via <tt>rdf:value</tt>, and should be published together with their metadata, most importantly
<ul>
<li>procedure/method (<tt>dct:description</tt> with free text, e.g., "CBOW", "SKIP-GRAM", "collocation counts")</li>
<li>corpus (<tt>dct:source</tt>)</li>
<li>dimensionality (<tt>dct:extent</tt>)</li>
</ul>
</p>
<p><div class='entity'>
<h3>embedding (ObjectProperty)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#embedding" class="uri">http://www.w3.org/nl/lemon/frac#embedding</a></p>
</div>
<div class='comment'>
<p>The property <strong>embedding</strong> assigns a particular ontolex:Element a frac:Embedding.</p>
</div>
<div class='description'>
<p><strong>rdfs:domain</strong> ontolex:Element</p>
<p><strong>rdfs:range</strong> frac:Embedding</p>
</div>
</div></p>
<p><div class='entity'>
<h3>Embedding (Class)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#Embedding" class="uri">http://www.w3.org/nl/lemon/frac#Embedding</a></p>
</div>
<div class='comment'>
<p>An <strong>Embedding</strong> provides a numerical vector (the string of <tt>rdf:value</tt>) for a given ontolex:Element (see <tt>frac:embedding</tt>). It is defined by the methodology used for creating it (<tt>dct:description</tt>), the URI of the corpus or language resource from which it was created (<tt>dct:source</tt>), and its dimensionality (length of the vector, <tt>dct:extent</tt>).</p>
</div>
<div class='description'>
<p><strong>SubClassOf:</strong> rdf:value exactly 1 xsd:string, dct:source min 1, dct:description min 1</p>
</div>
</div></p>
<div class="note">
<p>Question: Rename "Embedding" (the concept, not the property) to "Vector" ?</p>
</div>
<div class="note">
<p>For embeddings, we recommend using whitespace-separated numbers as their <tt>rdf:value</tt>. In particular, commas as separators are discouraged because they might be confused with the decimal point, depending on the locale of the user. We recommend the following regular expression for parsing embedding values (example in Perl):</p>
<p>
<code>split(/[^0-9\.,\-]+/, $value)</code></p>
<p>This means that doubles should be provided in the conventional format, not using the exponent notation.
</p>
</div>
<p>
The 50-dimensional
<a href="https://nlp.stanford.edu/projects/glove/">GloVe</a> 6B (Wikipedia 2014+Gigaword 5) embedding for <i>frak</i> is given below:
</p>
<p><tt>frak 0.015246 -0.30472 0.68107 -0.59727 -0.95368 -1.0931 0.58783 -0.19128 0.49108 0.61215 -0.14967 0.68197 0.22723 0.38514 -0.54721 -0.71187 0.21832 0.59857 0.1076 -0.23619 -0.86604 -0.91168 0.26087 -0.42067 0.60649 0.80644 -1.0477 0.67461 0.34154 -0.072511 -1.01 0.35331 -0.35636 0.9764 -0.62665 -0.29075 0.50797 -1.3538 0.18744 0.27852 -0.22557 -1.187 -0.11523 -0.078265 0.29849 0.22993 -0.12354 0.2829 1.0697 0.015366</tt></p>
<p>
As a lemma (LexicalEntry) embedding, this can be represented as follows:
</p>
<div class='beispiel'>
<div>
<pre>
<code>
:frak a ontolex:LexicalEntry;
ontolex:canonicalForm/ontolex:writtenRep "frak"@en;
frac:embedding [
a frac:Embedding;
rdf:value "0.015246 -0.30472 0.68107 ...";
dct:source
&lt;http://dumps.wikimedia.org/enwiki/20140102/&gt;,
&lt;https://catalog.ldc.upenn.edu/LDC2011T07&gt;;
dct:extent "50"^^xsd:int;
dct:description "GloVe v.1.1, documented in Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation, see https://nlp.stanford.edu/projects/glove/; uncased"@en ].</code>
</pre>
</div>
</div>
<div class="note">
<p>As with <tt>frac:Frequency</tt>, we recommend defining resource-specific subclasses of <tt>frac:Embedding</tt> in order to reduce redundancy in the data:</p>
<div class="beispiel">
<div>
<pre>
<code>
# resource-specific embedding class
:GloVe6BEmbedding_50d rdfs:subClassOf frac:Embedding;
rdfs:subClassOf
[ a owl:Restriction;
owl:onProperty dct:source;
owl:hasValue
&lt;http://dumps.wikimedia.org/enwiki/20140102/&gt;,
&lt;https://catalog.ldc.upenn.edu/LDC2011T07&gt; ],
[ a owl:Restriction;
owl:onProperty dct:extent;
owl:hasValue "50"^^xsd:int ],
[ a owl:Restriction;
owl:onProperty dct:description;
owl:hasValue "GloVe v.1.1, documented in Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation, see https://nlp.stanford.edu/projects/glove/; uncased"@en ].
# embedding assignment
:frak a ontolex:LexicalEntry;
ontolex:canonicalForm/ontolex:writtenRep "frak"@en;
frac:embedding [
a :GloVe6BEmbedding_50d;
rdf:value "0.015246 -0.30472 0.68107 ..." ].</code></pre>
</div></div></div>
<div class="note">
<p>Examples for non-word embeddings:
<ul>
<li> <a href="http://www.cis.lmu.de/~sascha/AutoExtend/">AutoExtend</a>: (a method to build) synset and lexeme embeddings, data <a href="http://www.cis.lmu.de/~sascha/AutoExtend/embeddings.zip">here</a></li>
<li> <a href="https://github.com/uhh-lt/sensegram">SenseGram</a>: sense embeddings, data <a href="http://ltdata1.informatik.uni-hamburg.de/sensegram/">here</a></li>
<li> <a href="http://tudarmstadt-lt.github.io/vec2synset/">Vec2Synset</a>: (a method to build) WordNet synset (= LexicalConcept) embeddings</li>
<li> <a href="https://minimaxir.com/2017/04/char-embeddings/">Character embeddings</a> are probably beyond the scope of OntoLex, unless characters are regarded as LexicalEntries. (Which they could be, for languages such as Chinese or Sumerian certainly, but also for Western languages -- given the fact that character-level pseudo entries are sometimes used in dictionaries to describe the phonology and orthography of a language. This is the case, for example, for Grimm's <a href="http://woerterbuchnetz.de/cgi-bin/WBNetz/wbgui_py?sigle=DWB">Deutsches Wörterbuch</a>.)</li>
</ul>
</p>
</div>
</section>
<section><h2>Collocations</h2>
<div class="note"><p>CC: this is a part I am less certain about, mostly because of the rdf:List modelling (which is inspired by lexicog). Alternative suggestions welcome.</p></div>
<p>Collocation analysis is an important tool for lexicographical research and instrumental for modern NLP techniques. It has been the mainstay of 1990s corpus linguistics and continues to be an area of active research in computational philology. ... (MORE MOTIVATION AND EXAMPLES)</p>
<p>Collocations are usually defined on surface-oriented criteria, i.e., as a relation between forms or lemmas (lexical entries), not between senses, but they can be analyzed on the level of word senses (the sense that gave rise to the idiom or collocation). Indeed, collocations often contain a variable part, which can be represented by a <tt>ontolex:LexicalConcept</tt>.</p>
<p>Collocations can involve two or more words, they are thus modelled as an <tt>rdf:List</tt> of <tt>ontolex:Element</tt>s.
Collocations may have a fixed or a variable word order. By default, we assume variable word order; where a fixed word order is required, the collocation must be assigned <tt>lexinfo:termType lexinfo:idiom</tt>.</p>
<p>Collocations obtained by quantitative methods are characterized by their method of creation (<tt>dct:description</tt>), their collocation strength (<tt>rdf:value</tt>), and the corpus used to create them (<tt>dct:source</tt>). Collocations share these characteristics with other types of contextual relations (see below), and thus, these are inherited from the abstract <tt>frac:ContextualRelation</tt> class.</p>
<p><div class='entity'>
<h3>ContextualRelation (Class)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#ContextualRelation" class="uri">http://www.w3.org/nl/lemon/frac#ContextualRelation</a></p>
</div>
<div class='comment'>
<p><strong>ContextualRelation</strong> provides a relation between two or more lexical elements, characterized by a <tt>dct:description</tt> of the nature of relation, a corpus (<tt>dct:source</tt>) from which this relation was inferred, and a weight or probability assessment (<tt>rdf:value</tt>).</p>
</div>
<div class='description'>
<p><strong>SubClassOf:</strong> rdf:List; rdf:value exactly 1 xsd:double, dct:source min 1, dct:description min 1 xsd:string</p>
</div>
</div></p>
<p>We distinguish two primary contextual relations: syntagmatic (between co-occurring elements) and paradigmatic (between elements that can be substituted for each other). Syntagmatic contextual relations are formalized with <tt>frac:Collocation</tt>.</p>
<p><div class="entity">
<h3>Collocation (Class)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#Collocation" class="uri">http://www.w3.org/nl/lemon/frac#Collocation</a></p>
</div>
<div class='comment'>
<p>A <strong>Collocation</strong> is a <tt>frac:ContextualRelation</tt> that holds between two or more <tt>ontolex:Element</tt>s based on their co-occurrence within the same utterance and characterized by their collocation weight (<tt>rdf:value</tt>) in one or multiple source corpora (<tt>dct:source</tt>).</p>
</div>
<div class='description'>
<p><strong>SubClassOf:</strong> <tt>frac:ContextualRelation</tt></p>
<p><strong>rdf:first:</strong> only <tt>ontolex:Element</tt></p>
<p><strong>rdf:rest*/rdf:first:</strong> only <tt>ontolex:Element</tt></p>
</div>
</div>
</p>
<p>
Collocations are lists of ontolex:Elements, and formalized as <tt>rdf:List</tt>. Collocation elements can thus be directly accessed by <tt>rdf:first</tt>, <tt>rdf:_1</tt>, <tt>rdf:_2</tt>, etc. The property <tt>rdf:rest</tt> returns a <tt>rdf:List</tt> of <tt>ontolex:Element</tt>s, but not a <tt>frac:Collocation</tt>.</p>
<p>By default, <tt>frac:Collocation</tt> is insensitive to word order. If a collocation is word order sensitive, it should be characterized by an appropriate <tt>dct:description</tt>, as well as by having <tt>lexinfo:termType lexinfo:idiom</tt>.</p>
<div class="note">
<p><tt>lexinfo:idiom</tt> is “[a] group of words in a fixed order that have a particular meaning that is different from the meanings of each word understood on its own.” In application to automatically generated collocations, the criterion of having ‘a particular meaning’ is necessarily replaced by ‘a particular distribution pattern’, as reflected by the collocation weight (<tt>rdf:value</tt>). <i>Idioms</i> in the narrower sense of lexicalized multi-word expressions should not be modelled as <tt>frac:Collocation</tt>s, but as <tt>ontolex:MultiWordExpression</tt>s.
[TO BE DISCUSSED]
</p>
</div>
<p>The most elementary level of a collocation is an n-gram, as provided, for example, by <a href="http://storage.googleapis.com/books/ngrams/books/datasetsv2.html">Google Books</a>, which provides n-gram frequencies per publication year as tab-separated values. For 2008, the 2012 edition provides the following statistics for the bigram <i>kill</i> + <i>switch</i>.
</p>
<div class='beispiel'>
<div>
<pre>
<code>
# form-form bigrams
kill switch 2008 199 121
# form-lexeme bigrams
kill switch_NOUN 2008 187 115
kill switch_VERB 2008 8 8
# lexeme-form bigrams
kill_ADJ switch 2008 70 48
kill_NOUN switch 2008 89 64
kill_VERB switch 2008 40 30
# lexeme-lexeme bigrams
kill_VERB switch_VERB 2008 2 2
kill_NOUN switch_NOUN 2008 83 61
kill_VERB switch_NOUN 2008 35 26
kill_ADJ switch_NOUN 2008 69 48
kill_NOUN switch_VERB 2008 6 6
</code></pre></div></div>
<p>In this example, forms are string values (cf. <tt>ontolex:Form</tt>), lexemes are string values with parts-of-speech (cf. <tt>ontolex:LexicalEntry</tt>). A partial ontolex-frac representation is given below:
</p>
<div class='beispiel'>
<div>
<pre>
<code>
# kill (verb)
:kill_v a ontolex:LexicalEntry;
lexinfo:partOfSpeech lexinfo:verb;
ontolex:canonicalForm :kill_cf.
# kill (canonical form)
:kill_cf ontolex:writtenRep "kill"@en.
# switch (noun)
:switch_n a ontolex:LexicalEntry;
lexinfo:partOfSpeech lexinfo:noun;
ontolex:canonicalForm :switch_cf.
# switch (canonical form)
:switch_cf ontolex:writtenRep "switch"@en.
# form-form bigrams
(:kill_cf :switch_cf) a frac:Collocation;
rdf:value "199";
dct:description "2-grams, English Version 20120701, word frequency";
dct:source &lt;https://books.google.com/ngrams&gt;;
dct:temporal "2008"^^xsd:gYear;
lexinfo:termType lexinfo:idiom.
(:kill_cf :switch_cf) a frac:Collocation;
rdf:value "121";
dct:description "2-grams, English Version 20120701, document frequency";
dct:source &lt;https://books.google.com/ngrams&gt;;
dct:temporal "2008"^^xsd:gYear;
lexinfo:termType lexinfo:idiom.
# form-lexeme bigrams
(:kill_cf :switch_n) a frac:Collocation;
rdf:value "187";
dct:description "2-grams, English Version 20120701, word frequency";
dct:source &lt;https://books.google.com/ngrams&gt;;
dct:temporal "2008"^^xsd:gYear;
lexinfo:termType lexinfo:idiom.
(:kill_cf :switch_n) a frac:Collocation;
rdf:value "115";
dct:description "2-grams, English Version 20120701, document frequency";
dct:source &lt;https://books.google.com/ngrams&gt;;
dct:temporal "2008"^^xsd:gYear;
lexinfo:termType lexinfo:idiom.
</code></pre></div></div>
<div class="note"><p>Question: can canonical forms be shared across different lexical entries? For the case of plain word n-grams, this is presupposed here.</p></div>
<p>The second example illustrates more complex types of collocation as provided by the <a href="http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012">Wortschatz</a> portal (scores and definitions as provided for <a href="http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012&amp;word=beans">beans</a>, <a href="http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012&amp;word=spill+the+beans">spill the beans</a>, etc.).
</p>
<div class='beispiel'>
<div>
<pre>
<code>
@prefix wsen: &lt;http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012&amp;word=&gt; .
# selected lexical entries
# (we assume that every Wortschatz word is an independent lexical entry)
wsen:beans a ontolex:LexicalEntry;
ontolex:canonicalForm/ontolex:writtenRep "beans"@en.
wsen:spill a ontolex:LexicalEntry;
ontolex:canonicalForm/ontolex:writtenRep "spill"@en.
wsen:green a ontolex:LexicalEntry;
ontolex:canonicalForm/ontolex:writtenRep "green"@en.
wsen:about a ontolex:LexicalEntry;
ontolex:canonicalForm/ontolex:writtenRep "about"@en.
# collocations, non-lexicalized
(wsen:spill wsen:beans) a frac:Collocation;
rdf:value "182";
dct:description "cooccurrences in the same sentence, unordered";
dct:source &lt;http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012&gt;.
(wsen:green wsen:beans) a frac:Collocation;
rdf:value "778";
dct:description "left neighbor cooccurrence";
dct:source &lt;http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012&gt;;
lexinfo:termType lexinfo:idiom.
(wsen:beans wsen:about) a frac:Collocation;
rdf:value "35";
dct:description "right neighbor cooccurrence";
dct:source &lt;http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012&gt;;
lexinfo:termType lexinfo:idiom.
# multi-word expression, lexicalized (!)
wsen:spill+the+beans a ontolex:MultiWordExpression;
ontolex:canonicalForm/ontolex:writtenRep "spill the beans"@en.
(wsen:beans wsen:spill+the+beans) a frac:Collocation;
rdf:value "401";
dct:description "cooccurrences in the same sentence, unordered";
dct:source &lt;http://corpora.uni-leipzig.de/en/res?corpusId=eng_news_2012&gt;.
</code></pre></div></div>
<!--p>More examples https://www.sketchengine.eu/documentation/statistics-used-in-sketch-engine/</p-->
<div class="note"><p>Again, it is recommended to define resource-specific subclasses of <tt>frac:Collocation</tt> with default values for <tt>dct:description</tt>, <tt>dct:source</tt>, and (where applicable) <tt>lexinfo:termType</tt>.
</p>
</div>
</section>
<section>
<h2>Similarity</h2>
<p>Similarity is a paradigmatic contextual relation between elements that can replace each other in the same context. In distributional semantics, a quantitative assessment of the similarity of two forms, lexemes, phrases, word senses or concepts is thus grounded in numerical representations of their respective contexts, i.e., their embeddings.
In a broader sense of ‘embedding’, bags of words also fall under the scope of <tt>frac:Embedding</tt>, see the usage note below.
</p>
<p>Similarity is characterized by a similarity score (<tt>rdf:value</tt>), e.g., the number of shared dimensions/collocates (in a bag-of-word model) or the cosine distance between two word vectors (for fixed-size embeddings), the corpora which we used to generate this score (<tt>dct:source</tt>), and the method used for calculating the score (<tt>dct:description</tt>).</p>
<p>Similarity is symmetric. The order of similes is irrelevant.</p>
<p>Like <tt>frac:Collocation</tt>, quantitative similarity relations are modelled as a subclass of <tt>frac:ContextualRelation</tt> (and thus, as an <tt>rdf:List</tt>).</p>
<p><div class="entity">
<h3>Similarity (Class)</h3>
<div>
<p><strong>URI:</strong> <a href="http://www.w3.org/nl/lemon/frac#Similarity" class="uri">http://www.w3.org/nl/lemon/frac#Similarity</a></p>
</div>
<div class='comment'>
<p><strong>Similarity</strong> is a <tt>frac:ContextualRelation</tt> that holds between two or more <tt>frac:Embedding</tt>s, and is characterized by a similarity score (<tt>rdf:value</tt>) in one or multiple source corpora (<tt>dct:source</tt>) and a <tt>dct:description</tt> that explains the method of comparison.</p>
</div>
<div class='description'>
<p><strong>SubClassOf:</strong> <tt>frac:ContextualRelation</tt></p>
<p><strong>rdf:first:</strong> only <tt>frac:Embedding</tt></p>
<p><strong>rdf:rest*/rdf:first:</strong> only <tt>frac:Embedding</tt></p>
</div>
</div>
</p>
<p>
<tt>frac:Similarity</tt> applies to two different use cases: The specific similarity between (exactly) two words, and similarity clusters (synonym groups obtained from clustering quantitatively obtained synonym candidates according to their distributional semantics in a particular corpus) that can contain an arbitrary number of words.
Both differ in the semantics of <tt>rdf:value</tt>:
Quantitatively obtained similarity <i>relations</i> normally provide a different score for every pair of similes.
Within a similarity <i>cluster</i>, a generalization over these pair-wise scores must be provided.
This could be the minimal similarity between all cluster members or a score produced by the clustering algorithm (e.g., depth or size of cluster).
This must be explained in <tt>dct:description</tt>.
</p>
<div class="note">
<p>
Similarity clusters are typical outcomes of <a href="https://www.cs.york.ac.uk/semeval2010_WSI/datasets.html">Word Sense Induction</a> techniques or <a href="http://www.aclweb.org/anthology/D10-1056">unsupervised POS tagging</a>. Classical sample data are Brown clusters, e.g., <a href="https://github.com/Derekkk/Brown-Word-Clustering-and-word-similarity/blob/master/results-brown.txt">here</a> or <a href="https://s3-eu-west-1.amazonaws.com/downloads.gate.ac.uk/resources/derczynski-chester-boegh-brownpaths.tar.bz2">here</a>.
</p>
</div>
<div class="note">
<p><tt>Similarity</tt> is defined as a property of embeddings, not between <tt>ontolex:Element</tt>s.
This excludes at least two important use cases: </p>
<ul>
<li>manual similarity assessments as used for evaluating similarity assessments, and as created, for example, as part of psycholinguistic association or priming experiments (also cf. WordNet synsets, which provide, however, detailed lexicographic information in addition to similarity, and which are thus to be represented as <tt>ontolex:LexicalConcept</tt>),</li>
<li>similarity assessments obtained by other means than embeddings, e.g., by means of a traditional bag of words.
</li>
</ul>
<p>
In both (and similar) cases, the recommendation is to make use of (a resource-specific subclass of) <tt>frac:Embedding</tt>, nevertheless, and to document the specifics of the similarity relation and/or the embeddings in the <tt>dct:description</tt> of these embeddings. For the first use case, this approach can be justified by assuming that embeddings are correlated with a psycholinguistically ‘real’ phenomenon. For the second use case, any bag of words can be interpreted as an infinite-size binary vector for which an embedding provides a fixed-size approximation.
</p>
</div>
<div class="note">
<p>
As with frequency and embeddings, a resource-specific similarity type can be defined analogously. In particular, this is required if directed (asymmetric) similarity assessments are to be provided.
</p>
</div>
</section>
</section>
<section>
<h2>Corpus Annotation (non-normative)</h2>
<div class="note"><p>The Ontolex Module for Frequency, Attestation and Corpus Information does not specify a vocabulary for annotating corpora or other data with lexical information, as this is being provided by the <a href="https://www.w3.org/TR/annotation-vocab/">Web Annotation Vocabulary</a>. The following description is non-normative as Web Annotation is defined in a separate W3C recommendation. The definitions below are reproduced, and refined only insofar as domain and range declarations have been refined to our use case.</p>
</div>
<p>In Web Annotation terminology, the annotated element is the ‘target’, the content of the annotation is the ‘body’, and the process and provenance of the annotation is expressed by properties of <tt>oa:Annotation</tt>.</p>
<div class="entity">
<h3>oa:Annotation (Class)</h3>
<div>
<p><strong>IRI:</strong> <a href="http://www.w3.org/ns/oa#Annotation" class="uri">http://www.w3.org/ns/oa#Annotation</a></p>
</div>
<div class='description'>
<p><strong>Required Predicates:</strong> <a href="#hastarget">oa:hasTarget</a>, <a href="#rdf-type">rdf:type</a>, <a href="#hasbody">oa:hasBody</a></p>
<p><strong>Recommended Predicates:</strong> <a href="#motivatedby">oa:motivatedBy</a>, <a href="#dcterms-creator">dcterms:creator</a>, <a href="#dcterms-created">dcterms:created</a></p>
<p><strong>Other Predicates:</strong> <a href="#styledby">oa:styledBy</a>, <a href="#dcterms-issued">dcterms:issued</a>, <a href="#as-generator">as:generator</a> </p>
</div>
</div>
<div class="diagram_img">
<img src="https://www.w3.org/TR/annotation-vocab/images/examples/annotation.png" alt="oa:Annotation with properties" longdesc="#example_anno">
</div>
<div class="entity">
<h3>oa:hasBody (Object Property)</h3>
<div>
<p><strong>IRI:</strong> <a href="http://www.w3.org/ns/oa#hasBody" class="uri">http://www.w3.org/ns/oa#hasBody</a></p>
</div>
<div class="comment">The object of the relationship is a resource that is a body of the Annotation. In the context of <em>lemon</em>, the body is an <tt>ontolex:Element</tt>.</div>
<div class="description">
<p><strong>Domain:</strong> oa:Annotation</p>
<p><strong>Range:</strong> ontolex:Element</p>
</div>
<div class="diagram_img">
<img src="https://www.w3.org/TR/annotation-vocab/images/examples/hasBody.png" alt="oa:hasBody"/>
</div>
</div>
<div class="entity">
<h3>oa:hasTarget (Object Property)</h3>
<div>
<p><strong>IRI:</strong> <a href="http://www.w3.org/ns/oa#hasTarget" class="uri">http://www.w3.org/ns/oa#hasTarget</a></p>
</div>
<div class="comment">The relationship between an Annotation and its Target.</div>
<div class="description">
<p><strong>Domain:</strong> oa:Annotation</p>
</div>
</div>