From 2a5df9415f705705e451de2b21d05e1e49cffff7 Mon Sep 17 00:00:00 2001 From: Simon Gray Date: Wed, 12 Jun 2024 11:56:29 +0200 Subject: [PATCH] release 2024-06-12 --- pages/about-da.md | 20 +++++++++++++++++--- pages/about-en.md | 20 +++++++++++++++++--- pages/releases-da.md | 5 +++-- pages/releases-en.md | 15 ++++++++------- resources/schemas/internal/dannet-schema.ttl | 10 +++++----- src/main/dk/cst/dannet/db/bootstrap.clj | 4 ++-- src/main/dk/cst/dannet/db/export/wn_lmf.clj | 2 +- src/main/dk/cst/dannet/prefix.cljc | 9 +++++---- 8 files changed, 58 insertions(+), 27 deletions(-) diff --git a/pages/about-da.md b/pages/about-da.md index c4b93d7..a87b94c 100644 --- a/pages/about-da.md +++ b/pages/about-da.md @@ -7,11 +7,23 @@ Et [WordNet][WordNet] er en lexico-semantisk netværksgraf, der viser hvordan be Til forskel fra en almindelig ordbog er det ikke definitionen af ordet, der står i centrum, men i højere grad ordets relationer til andre ord. I DanNet kan man f.eks. se at en [dværgpil][dværgpil] er en slags [busk][busk], at et [lysthus][lysthus] findes i en [have][have], at [fiberdrys][fiberdrys] bruges til at [spise][spise], og at [kager][kage] typisk fremstilles ved [bagning][bage] og typisk er lavet af [mel][mel] og [sukker][sukker]. ## Hent vores data -DanNet er baseret på [Ontolex][Ontolex]-standarden med [tilføjelser][GWA RDF] fra [Global WordNet Association][GWA]. Du kan udforske DanNet direkte her på wordnet.dk, men du kan også downloade vores data som et [RDF-datasæt][DanNet RDF] eller i en lidt mere begrænset [CSV-udgave][DanNet CSV]. Alle vores datasæt udgives under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)-licensen. +DanNet er baseret på [Ontolex][Ontolex]-standarden med [tilføjelser][GWA RDF] fra [Global WordNet Association][GWA]. Du kan udforske DanNet direkte her på wordnet.dk, men du kan også downloade vores data som et `RDF`-datasæt eller i en lidt mere begrænset `CSV`-udgave. 
Vi tilbyder også DanNet som `WN-LMF`, klar til at blive brugt i software der understøtter dette format. Alle vores datasæt udgives under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/)-licensen: -DanNet er også integreret med [COR][COR] og [DDS][DDS], samt det [engelske WordNet][OEWN] (du kan hente deres datasæt fra deres egen side) som vi har udvidet med RDF-etiketter, der minder om dem brugt i DanNet. Disse alternative RDF-datasæt kan ligeledes downloades her på siden ([COR-integration][COR-integration], [DDS-integration][DDS-integration], [OEWN-udvidelse][OEWN-extension]). Vi har også inkluderet [CILI][CILI]-data i vores database, da dette bruges som et fælles integrationspunkt for forskellige WordNets. +- [RDF-udgave][DanNet RDF] - fuldt datasæt (minus udledt data) +- [CSV-udgave][DanNet CSV] - alternativt datasæt +- [WN-LMF-udgave][DanNet WN-LMF] - alternativt datasæt, begrænset til WN-LMF relations -I tidligere versioner af DanNet, kunne du derudover også hente en komplet kopi af al data der kunne tilgås på wordnet.dk/dannet, inklusiv logisk udledt data og tilknyttede RDF-skemaer. Desværre har dette vist sig at være for at ressourcekrævende at generere som en del af en almindelig DanNet-udgivelse. Vi vil forsøge at ændre på dette i fremtiden. +DanNet er også integreret med [COR][COR] og [DDS][DDS], samt det [engelske WordNet][OEWN] (du kan hente deres datasæt fra deres egen side) som vi har udvidet med RDF-etiketter, der minder om dem brugt i DanNet. Disse alternative RDF-datasæt kan ligeledes downloades her på siden: + +- [COR-integration][COR-integration] +- [DDS-integration][DDS-integration] +- [OEWN-udvidelse][OEWN-extension] + +Vi inkluderer også [CILI][CILI]-data i vores database, da dette bruges som et fælles integrationspunkt for forskellige WordNets. + +Alle udgaver af denne iteration af DanNet kan derudover downloades fra vores [releases][releases]-side på Github. 
+ +> NOTE: I tidligere versioner af DanNet, kunne du derudover hente en komplet kopi af al data der kunne tilgås på wordnet.dk/dannet, inklusiv logisk udledt data og tilknyttede RDF-skemaer. Desværre har dette vist sig at være for ressourcekrævende at generere som en del af en almindelig DanNet-udgivelse. Vi vil forsøge at ændre på dette i fremtiden. ## Dokumentation Følgende dokumenter er kun tilgængelige på engelsk og primært tiltænkt udviklere: @@ -34,6 +46,7 @@ Følgende dokumenter er kun tilgængelige på engelsk og primært tiltænkt udvi [CILI]: https://github.com/globalwordnet/cili "Collaborative Interlingual Index" [DanNet RDF]: /export/rdf/dn "DanNet (RDF)" [DanNet CSV]: /export/csv/dn "DanNet (CSV)" +[DanNet WN-LMF]: /export/wn-lmf/dn "DanNet (WN-LMF)" [COR-integration]: /export/rdf/cor "COR-integration (RDF)" [DDS-integration]: /export/rdf/dds "DDS-integration (RDF)" [OEWN-extension]: /export/rdf/oewn-extension "OEWN-udvidelse (RDF)" @@ -52,3 +65,4 @@ Følgende dokumenter er kun tilgængelige på engelsk og primært tiltænkt udvi [rationale]: /dannet/page/rationale "Rationale" [queries]: /dannet/page/queries "Queries" [Github]: https://github.com/kuhumcst/DanNet "Github-projektet" +[releases]: https://github.com/kuhumcst/DanNet/releases "Tidligere releases" diff --git a/pages/about-en.md b/pages/about-en.md index bcad224..aa12099 100644 --- a/pages/about-en.md +++ b/pages/about-en.md @@ -7,11 +7,23 @@ A [WordNet][WordNet] is a lexico-semantic network graph that shows how senses of Unlike a normal dictionary, the definitions of words aren't central; instead, relations to other words are the important part. For example, in DanNet you can see that a [Swiss willow][dværgpil] is a kind of [bush][busk], that a [gazebo][lysthus] is located in a [garden][have], that "[fiberdrys][fiberdrys]" is for [eating][spise], and that [cakes][kage] are typically produced by [baking][bage] and usually made from [flour][mel] and [sugar][sukker]. 
## Download our data -DanNet is based on the [Ontolex][Ontolex] standard with [additions][GWA RDF] from the [Global WordNet Association][GWA]. You can explore DanNet directly on wordnet.dk, but you may also download our data as an [RDF data set][DanNet RDF] or in a slightly more limited [CSV edition][DanNet CSV]. All our data sets are published under the [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) license. +DanNet is based on the [Ontolex][Ontolex] standard with [additions][GWA RDF] from the [Global WordNet Association][GWA]. You can explore DanNet directly on wordnet.dk, but you may also download our data as an `RDF` dataset or in a slightly more limited `CSV` edition. We also offer DanNet as `WN-LMF` ready to be used in supporting software. All our datasets are published under the [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/) license: -DanNet is integrated with [COR][COR] and [DDS][DDS] too, as well as the [English WordNet][OEWN] (you may download their data set from that page) which we have extended with RDF labels resembling those used in DanNet. These alternative RDF data sets can also be downloaded on this page ([COR integration][COR-integration], [DDS integration][DDS-integration], [OEWN extension][OEWN-extension]). We have also included the [CILI][CILI] data in our database, as it is used as a common integration point for different WordNets. +- [RDF edition][DanNet RDF] - full dataset (sans inferences) +- [CSV edition][DanNet CSV] - alternative dataset +- [WN-LMF edition][DanNet WN-LMF] - alternative dataset, limited to WN-LMF relations -In earlier releases, you could also download a complete copy of all the data that can be found on wordnet.dk/dannet, including logically inferred data and associated RDF schemas. However, this has proven too resource-intensive to generate as part of a regular DanNet release. We will try to remedy this in the future. 
+DanNet is integrated with [COR][COR] and [DDS][DDS] too, as well as the [English WordNet][OEWN] (you may download their dataset from that page) which we have extended with RDF labels resembling those used in DanNet. These alternative RDF datasets can also be downloaded on this page: + +- [COR integration][COR-integration] +- [DDS integration][DDS-integration] +- [OEWN extension][OEWN-extension] + +We also include the [CILI][CILI] data in our database, as it is used as a common integration point for different WordNets. + +All releases of this iteration of DanNet may also be downloaded on our [releases][releases] page on Github. + +> NOTE: In earlier releases, you could download a complete copy of all the data that can be found on wordnet.dk/dannet, including logically inferred data and associated RDF schemas. However, this has proven too resource-intensive to generate as part of a regular DanNet release. We will try to remedy this in the future. ## Documentation The following documents are only available in English and mostly for developers: @@ -34,6 +46,7 @@ The following documents are only available in English and mostly for developers: [CILI]: https://github.com/globalwordnet/cili "Collaborative Interlingual Index" [DanNet RDF]: /export/rdf/dn "DanNet (RDF)" [DanNet CSV]: /export/csv/dn "DanNet (CSV)" +[DanNet WN-LMF]: /export/wn-lmf/dn "DanNet (WN-LMF)" [COR-integration]: /export/rdf/cor "COR-integration (RDF)" [DDS-integration]: /export/rdf/dds "DDS-integration (RDF)" [OEWN-extension]: /export/rdf/oewn-extension "OEWN extension (RDF)" @@ -52,3 +65,4 @@ The following documents are only available in English and mostly for developers: [rationale]: /dannet/page/rationale "Rationale" [queries]: /dannet/page/queries "Queries" [Github]: https://github.com/kuhumcst/DanNet "The Github project page" +[releases]: https://github.com/kuhumcst/DanNet/releases "Past releases" diff --git a/pages/releases-da.md b/pages/releases-da.md index 44f7160..403bffb 100644 --- 
a/pages/releases-da.md +++ b/pages/releases-da.md @@ -1,10 +1,11 @@ # Versioner De nye DanNet-versioner bruger udgivelsesdatoen som versionsnummer, formateret som `YYYY-MM-DD`. -## **SNAPSHOT**: WN-LMF som alternativt format -* På baggrund af en forespørgsel via Github er WN-LMF nu blevet tilføjet som et alternativt format. Den nye fil, `dannet-wn-lmf.gz`, kan bruges direkte i software som [goodmami/wn](https://github.com/goodmami/wn) (se også: [eksempel på Github](https://github.com/kuhumcst/DanNet/blob/master/examples/wn_lmf_query.py)). Desværre understøtter WN-LMF ikke det fulde sortiment af data som findes i DanNet; eksempelvis er vores ontologiske typer ikke at finde i dette format og det samme gælder DanNet-specifikke relationer som `brugt til`. +## **2024-06-12**: WN-LMF som alternativt format +* På baggrund af en forespørgsel via Github er WN-LMF nu blevet tilføjet som et alternativt format. Den nye fil, `dannet-wn-lmf.xml.gz`, kan bruges direkte i software som [goodmami/wn](https://github.com/goodmami/wn) (se også: [eksempel på Github](https://github.com/kuhumcst/DanNet/blob/master/examples/wn_lmf_query.py)). Desværre understøtter WN-LMF ikke det fulde sortiment af data som findes i DanNet; eksempelvis er vores ontologiske typer ikke at finde i dette format og det samme gælder DanNet-specifikke relationer som `bruges til`. * I alt 1906 dårlige kildehenvisninger til DDO er blevet fjernet fra datasættet. Disse `dns:source`-relationer var blevet oprettet automatisk baseret på ID'er der udelukkende eksister i DanNet, så derfor kan der ikke kildehenvises til DDO. * 88 Synset-definitioner er rettet således at opdelingen mellem titler og embeder er korrekt. +* Synset indegrees er blevet genberegnet. ## **2024-04-30**: Forbedret CSV-eksport + andre små rettelser * CSV-eksporten er blevet forbedret ved... 
diff --git a/pages/releases-en.md b/pages/releases-en.md index 329069f..f9a1730 100644 --- a/pages/releases-en.md +++ b/pages/releases-en.md @@ -1,10 +1,11 @@ # Releases The newer DanNet releases use the release date as the version number, formatted as `YYYY-MM-DD`. -## **SNAPSHOT**: WN-LMF as an alternative format +## **2024-06-12**: WN-LMF as an alternative format * WN-LMF has been added as an alternative format following a request on Github. The new file, `dannet-wn-lmf.gz`, can even be used directly in software such as [goodmami/wn](https://github.com/goodmami/wn) (see also: [example on Github](https://github.com/kuhumcst/DanNet/blob/master/examples/wn_lmf_query.py)). Unfortunately, WN-LMF currently does not support the full set of data found in DanNet; for instance, our ontological types are not present in this format and the same applies to DanNet-specific relations such as `used for`. * A total of 1906 bad source references to DDO have been removed from the dataset. These `dns:source`-relations had been created automatically based on IDs that exclusively exist within DanNet and for this reason they couldn't reference DDO. * 88 Synset definitions have been fixed such that the split between titles and occupations is correct. +* Synset indegrees have been recalculated. ## **2024-04-30**: Improved CSV export + other small fixes * The CSV export has been improved by... @@ -31,26 +32,26 @@ The newer DanNet releases use the release date as the version number, formatted ## **2023-06-01**: ~5000 links to the Open English WordNet * The schema translations have been updated. * Around 5000 links have been added which link DanNet to the [Open English WordNet](https://github.com/globalwordnet/english-wordnet) or indirectly via the [CILI](https://github.com/globalwordnet/cili). -* The OEWN data set has received a companion data set containing generated labels for synsets, senses, and words. 
+* The OEWN dataset has received a companion dataset containing generated labels for synsets, senses, and words. * `dns:dslSense` and `dns:source` have been removed from the DanNet schema (`dns:source` has been replaced by `dc:source`) ## **2023-05-23**: DDS/COR improvements & links to DDO -The following changes to our data sets will be available in the next version: +The following changes to our datasets will be available in the next version: * Many DanNet words and senses have been linked to [DDO](https://ordnet.dk/ddo) via the new `dns:source` relation. -* Unofficial conjugations present in the COR companion data set have been marked as such in their `rdfs:label`. +* Unofficial conjugations present in the COR companion dataset have been marked as such in their `rdfs:label`. * Various other smaller tweaks to the COR dataset. -* The DDS data set now uses 32-bit `float` as opposed to `double`, which results in a smaller RDF export as this data type doesn't require any special encoding in .ttl-files. +* The DDS dataset now uses 32-bit `float` as opposed to `double`, which results in a smaller RDF export as this data type doesn't require any special encoding in .ttl-files. ## **2023-05-11**: The new DanNet There are too many changes in this initial release to list all of them in a succinct way: * Around 5000 new senses have been added, mostly adjectives. -* Many data set inconsistencies and other undesirable properties have been cleaned up. +* Many dataset inconsistencies and other undesirable properties have been cleaned up. * The entirety of DanNet has been converted to the Ontolex standard and uses the relations from the Global WordDet Association. * DanNet is now RDF-native; RDF schemas are also available covering e.g. the ontological types. * The DSL-derived DanNet IDs all resolve to actual RDF resources which can be viewed in a browser. -* Several companion data sets are available for download and are also merged with the data on wordnet.dk. 
+* Several companion datasets are available for download and are also merged with the data on wordnet.dk. * Additional data points have also been inferred from the bootstrap data, e.g. inverse relations. * The CSV download is now CSVW and includes metadata files describing contents of the columns. * The DanNet data is now licensed as CC BY-SA 4.0 and the source code of the project is available under the MIT licence. diff --git a/resources/schemas/internal/dannet-schema.ttl b/resources/schemas/internal/dannet-schema.ttl index 195ebcc..28d0043 100644 --- a/resources/schemas/internal/dannet-schema.ttl +++ b/resources/schemas/internal/dannet-schema.ttl @@ -39,21 +39,21 @@ # NOTE: the format of :eq_hypernym and :eq_hyponym is a copy of wn:eq_synonym :eqHypernym a wn:SynsetRelType ; rdfs:label "eq hypernym"@en ; - rdfs:comment "A relation between two concepts in separate data sets where the object is a hypernym of the subject."@en ; + rdfs:comment "A relation between two concepts in separate datasets where the object is a hypernym of the subject."@en ; rdfs:comment "En relation mellem to begreber i separate datasæt, hvor objektet er et hypernym af subjektet."@da ; owl:inverseOf :eqHyponym ; rdfs:isDefinedBy . :eqHyponym a wn:SynsetRelType ; rdfs:label "eq hyponym"@en ; - rdfs:comment "A relation between two concepts in separate data sets where the object is a hyponym of the subject."@en ; + rdfs:comment "A relation between two concepts in separate datasets where the object is a hyponym of the subject."@en ; rdfs:comment "En relation mellem to begreber i separate datasæt, hvor objektet er et hyponym af subjektet."@da ; owl:inverseOf :eqHypernym ; rdfs:isDefinedBy . 
:eqSimilar a wn:SynsetRelType ; rdfs:label "eq similar"@en ; - rdfs:comment "A relation between two concepts in separate data sets where the object is similar to the subject, though not synonymous."@en ; + rdfs:comment "A relation between two concepts in separate datasets where the object is similar to the subject, though not synonymous."@en ; rdfs:comment "En relation mellem to begreber i separate datasæt, hvor objektet minder om subjektet men dog ikke er et synonym."@da ; owl:inverseOf :eqSimilar ; rdfs:isDefinedBy . @@ -136,7 +136,7 @@ rdfs:comment "A lexical concept which the subject concept may be used for; see .https://cst.ku.dk/projekter/dannet/#heading-1612789460891>"@en ; rdfs:comment "Et leksikalsk begreb som subjekt-begrebet kan bruges til; se .https://cst.ku.dk/projekter/dannet/#heading-1612789460891>"@da ; rdfs:label "used for"@en ; - rdfs:label "brugt til"@da ; + rdfs:label "bruges til"@da ; rdfs:isDefinedBy . :usedForObject a wn:SynsetRelType ; @@ -145,7 +145,7 @@ rdfs:comment "TODO; see .https://cst.ku.dk/projekter/dannet/#heading-1612789460891>"@en ; rdfs:comment "TODO; se .https://cst.ku.dk/projekter/dannet/#heading-1612789460891>"@da ; rdfs:label "used for object"@en ; - rdfs:label "brugt til objekt"@da ; + rdfs:label "bruges til objekt"@da ; rdfs:isDefinedBy . :nearAntonym a wn:SynsetRelType ; diff --git a/src/main/dk/cst/dannet/db/bootstrap.clj b/src/main/dk/cst/dannet/db/bootstrap.clj index a338de9..4ce3fcb 100644 --- a/src/main/dk/cst/dannet/db/bootstrap.clj +++ b/src/main/dk/cst/dannet/db/bootstrap.clj @@ -90,7 +90,7 @@ "2024-04-30") (def current-release - (str "2024-04-30-SNAPSHOT")) + (str "2024-06-12")) (defn assert-expected-dannet-release! "Assert that the DanNet `model` is the expected release to boostrap from." @@ -378,7 +378,7 @@ This function survives between releases, but the functions it calls are all considered temporary and should be deleted when the release comes." 
[dataset] - (let [expected-release "2024-04-30-SNAPSHOT"] + (let [expected-release "2024-06-12"] (assert (= current-release expected-release)) ; another check (println "Applying release changes for" expected-release "...") diff --git a/src/main/dk/cst/dannet/db/export/wn_lmf.clj b/src/main/dk/cst/dannet/db/export/wn_lmf.clj index 74c2d80..dd680b7 100644 --- a/src/main/dk/cst/dannet/db/export/wn_lmf.clj +++ b/src/main/dk/cst/dannet/db/export/wn_lmf.clj @@ -314,7 +314,7 @@ (println "Beginning WN-LMF export of DanNet into" dir) (println "----") (let [f (str dir "dannet-wn-lmf.xml") - gz (str dir "dannet-wn-lmf.gz")] + gz (str dir "dannet-wn-lmf.xml.gz")] (export-xml! f) (gzip (io/file f) (io/file gz))) (println "----") diff --git a/src/main/dk/cst/dannet/prefix.cljc b/src/main/dk/cst/dannet/prefix.cljc index 59889af..c0a40fb 100644 --- a/src/main/dk/cst/dannet/prefix.cljc +++ b/src/main/dk/cst/dannet/prefix.cljc @@ -91,10 +91,11 @@ 'wn 'ontolex 'skos 'lexinfo 'dcat 'vann 'foaf 'dc 'ili 'en} - :download {"rdf" {:default "dannet.zip" - "merged" "dannet-dds-cor.zip" - "complete" "dannet-complete.zip"} - "csv" {:default "dannet-csv.zip"}}} + :download {"rdf" {:default "dannet.zip" + "merged" "dannet-dds-cor.zip" + "complete" "dannet-complete.zip"} + "csv" {:default "dannet-csv.zip"} + "wn-lmf" {:default "dannet-wn-lmf.xml.gz"}}} 'dnc {:uri (str dannet-root "concepts/") :resource (str "<" dannet-root "concepts>") :alt "schemas/internal/dannet-concepts.ttl"}