From 0db26a75862e3e1f487e1247f1f32b36c574285f Mon Sep 17 00:00:00 2001 From: Sven Hertling Date: Thu, 28 Mar 2019 12:50:12 +0100 Subject: [PATCH] updated language for inter wiki link extraction --- .../dbpedia/extraction/util/Language.scala | 43 +++++++++++++------ .../extraction/wikiparser/WikiTitle.scala | 12 +++++- .../impl/wikipedia/Namespaces.scala | 4 +- dump/extraction.dbkwik.properties | 2 +- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala index 72e8a1d1e6..64751aef70 100644 --- a/core/src/main/scala/org/dbpedia/extraction/util/Language.scala +++ b/core/src/main/scala/org/dbpedia/extraction/util/Language.scala @@ -124,7 +124,7 @@ object Language extends (String => Language) for(lang <- wikiLanguageCodes) { try { - languages(lang) = makeDbkwikLanguage(lang, "default") + languages(lang) = makeDbkwikLanguage(lang, "default", true) } catch{ case mre : MissingResourceException => logger.log(Level.WARNING, "Could not create the language: " + lang) @@ -134,24 +134,38 @@ object Language extends (String => Language) } - def updateOneLanguage(wikiprefix : String, wiki: String):Unit={ - map(wikiprefix) = makeDbkwikLanguage(wikiprefix, wiki) - English = map("en") + def preprocessWikiBase(wikiBase: String): String ={ + //return new java.net.URI(wikiBase).getScheme() + return wikiBase.stripPrefix("http://").stripPrefix("https://").stripSuffix("/$1").stripSuffix("/wiki") + } + + def getLanguageFreeWikiBase(wikiBase: String): String ={ + var base = preprocessWikiBase(wikiBase) + var splits = base.split("\\.") + if(splits.length > 1){ + if(wikiLanguageCodes.contains(splits(0))){ + var test = splits.slice(1, splits.length).mkString(".") + return test + } + } + return base } def updateInterwikis(interwikis : scala.collection.Map[String, String]):Unit={ for ((prefix, url) <- interwikis) { - Language.updateOneLanguage(prefix, url.stripPrefix("http://").stripSuffix("/$1").stripSuffix("/wiki")) + if(url.contains("fandom") || url.contains("wikia")) + map(prefix) = makeDbkwikLanguage(prefix, preprocessWikiBase(url), false) } - print("test") + English = map("en") } - def updateAllLanguages(wikiBase: String): Unit ={ + def updateAllLanguages(base: String): Unit ={ //map.clear()//do not clear because we want to keep "mappings", "wikidata" etc. + var wikiBase = getLanguageFreeWikiBase(base) for(lang <- wikiLanguageCodes) { try { - map(lang) = makeDbkwikLanguage(lang, wikiBase) + map(lang) = makeDbkwikLanguage(lang, wikiBase, true) } catch{ case mre : MissingResourceException => logger.log(Level.WARNING, "Could not create the language: " + lang) @@ -167,9 +181,14 @@ object Language extends (String => Language) - def makeDbkwikLanguage(language : String, wikiBase: String): Language = { + def makeDbkwikLanguage(language : String, wikiBase: String, modifyBase: Boolean): Language = { + var base = wikiBase + if(language.equals("en") == false && modifyBase){ + base = language + "." + base + } + - val baseDomain = "dbkwik.webdatacommons.org/" + wikiBase + val baseDomain = "dbkwik.webdatacommons.org/" + base val loc = Locale.forLanguageTag(language) @@ -187,8 +206,8 @@ object Language extends (String => Language) "http://" + baseDomain, //val dbpediaUri: String, new DBpediaNamespace("http://" + baseDomain + "/resource/"), //val resourceUri: RdfNamespace, new DBpediaNamespace("http://" + baseDomain + "/property/"), //val propertyUri: RdfNamespace, - "http://"+wikiBase.stripPrefix("http://"), //val baseUri: String, - "https://"+wikiBase.stripPrefix("http://")+"/api.php", //val apiUri: String, + "http://"+ base, //val baseUri: String, + "https://"+ base + "/api.php", //val apiUri: String, 0 //val pages: Int ) } diff --git a/core/src/main/scala/org/dbpedia/extraction/wikiparser/WikiTitle.scala b/core/src/main/scala/org/dbpedia/extraction/wikiparser/WikiTitle.scala index 60bac292ba..909d627281 100644 --- a/core/src/main/scala/org/dbpedia/extraction/wikiparser/WikiTitle.scala +++ b/core/src/main/scala/org/dbpedia/extraction/wikiparser/WikiTitle.scala @@ -1,7 +1,9 @@ package org.dbpedia.extraction.wikiparser +import org.dbpedia.extraction.ontology.RdfNamespace import org.dbpedia.extraction.util.RichString.wrapString -import org.dbpedia.extraction.util.{Language, WikiUtil} +import org.dbpedia.extraction.util.StringUtils.replacements +import org.dbpedia.extraction.util.{Language, StringUtils, WikiUtil} import org.dbpedia.iri.UriDecoder import org.dbpedia.util.text.ParseExceptionIgnorer import org.dbpedia.util.text.html.{HtmlCoder, XmlCodes} @@ -41,7 +43,7 @@ class WikiTitle ( val encodedWithNamespace = withNamespace(true) /** page IRI for this page title */ - val pageIri = language.baseUri+"/wiki/"+encodedWithNamespace + val pageIri = language.baseUri+"/wiki/"+StringUtils.escape(encodedWithNamespace, WikiTitle.iriEscapes) /** resource IRI for this page title */ val resourceIri = language.resourceUri.append(encodedWithNamespace) @@ -92,6 +94,12 @@ class WikiTitle ( object WikiTitle { + // for this list of characters, see RFC 3987 and https://sourceforge.net/mailarchive/message.php?msg_id=28982391 + private val iriEscapes = { + val chars = ('\u0000' to '\u0020').mkString + "\"#%<>?[\\]^`{|}" + ('\u007F' to '\u009F').mkString + replacements('%', chars) + } + /** * Parses a MediaWiki link or title. * diff --git a/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/wikipedia/Namespaces.scala b/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/wikipedia/Namespaces.scala index 8fa02a1c9f..70bab61179 100644 --- a/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/wikipedia/Namespaces.scala +++ b/core/src/main/scala/org/dbpedia/extraction/wikiparser/impl/wikipedia/Namespaces.scala @@ -324,6 +324,6 @@ object Namespaces (names.toMap, codes.toMap) } - def names(lang : Language) : Map[Int, String] = names.getOrElse(lang.wikiCode, throw new IllegalArgumentException("no namespace names found for language '"+lang.wikiCode+"'")) - def codes(lang : Language) : Map[String, Int] = codes.getOrElse(lang.wikiCode, throw new IllegalArgumentException("no namespace codes found for language '"+lang.wikiCode+"'")) + def names(lang : Language) : Map[Int, String] = names.getOrElse(lang.wikiCode, names("en")) + def codes(lang : Language) : Map[String, Int] = codes.getOrElse(lang.wikiCode, codes("en")) } diff --git a/dump/extraction.dbkwik.properties b/dump/extraction.dbkwik.properties index 53e0503034..fd7ec62f33 100644 --- a/dump/extraction.dbkwik.properties +++ b/dump/extraction.dbkwik.properties @@ -31,7 +31,7 @@ copyrightCheck=false #parameter for xml-safe properties: -uri-policy.iri=xml-safe-predicates:*;xml-safe-subjects:* +uri-policy.iri=xml-safe:* format.ttl=turtle-triples;uri-policy.iri