Skip to content

Commit

Permalink
updated language for inter wiki link extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
sven-h committed Mar 28, 2019
1 parent e8a83a0 commit 0db26a7
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 17 deletions.
43 changes: 31 additions & 12 deletions core/src/main/scala/org/dbpedia/extraction/util/Language.scala
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ object Language extends (String => Language)
for(lang <- wikiLanguageCodes)
{
try {
languages(lang) = makeDbkwikLanguage(lang, "default")
languages(lang) = makeDbkwikLanguage(lang, "default", true)
}
catch{
case mre : MissingResourceException => logger.log(Level.WARNING, "Could not create the language: " + lang)
Expand All @@ -134,24 +134,38 @@ object Language extends (String => Language)
}


def updateOneLanguage(wikiprefix : String, wiki: String):Unit={
map(wikiprefix) = makeDbkwikLanguage(wikiprefix, wiki)
English = map("en")
/**
 * Reduces a wiki base URL to its scheme-less form.
 *
 * A leading "http://" is removed first, then a leading "https://"; afterwards a trailing
 * "/$1" placeholder is removed, then a trailing "/wiki". Parts that are not present are
 * simply left untouched.
 *
 * @param wikiBase the wiki base URL, e.g. "http://foo.wikia.com/wiki/$1"
 * @return the input without the prefixes/suffixes listed above, e.g. "foo.wikia.com"
 */
def preprocessWikiBase(wikiBase: String): String = {
  val withoutScheme = wikiBase.stripPrefix("http://").stripPrefix("https://")
  val withoutArticlePlaceholder = withoutScheme.stripSuffix("/$1")
  withoutArticlePlaceholder.stripSuffix("/wiki")
}

/**
 * Normalizes a wiki base with preprocessWikiBase and drops a leading language label.
 *
 * The normalized base is split on "."; when there is more than one part and the first
 * part is contained in wikiLanguageCodes, the remaining parts are re-joined with "." and
 * returned. In every other case the normalized base is returned unchanged.
 *
 * @param wikiBase the wiki base URL or host
 * @return the normalized base without a leading language code label
 */
def getLanguageFreeWikiBase(wikiBase: String): String = {
  val normalized = preprocessWikiBase(wikiBase)
  val labels = normalized.split("\\.")
  // Only a multi-part name whose first label is a known language code is shortened.
  val startsWithLanguageCode = labels.length > 1 && wikiLanguageCodes.contains(labels(0))
  if (startsWithLanguageCode) labels.drop(1).mkString(".") else normalized
}

/**
 * Registers languages for interwiki prefixes that point to Fandom/Wikia wikis.
 *
 * For every (prefix, url) entry whose url contains "fandom" or "wikia", a language is
 * created with makeDbkwikLanguage from the prefix and the preprocessed url and stored in
 * `map` under the prefix. The base is passed with modifyBase = false, so the prefix is
 * not prepended to it. Entries with other urls are ignored.
 * Afterwards `English` is re-read from `map("en")`.
 *
 * @param interwikis mapping from interwiki prefix to the target wiki URL
 */
def updateInterwikis(interwikis : scala.collection.Map[String, String]): Unit = {
  for ((prefix, url) <- interwikis if url.contains("fandom") || url.contains("wikia")) {
    map(prefix) = makeDbkwikLanguage(prefix, preprocessWikiBase(url), false)
  }
  // Refresh the English reference, since the "en" entry may have been replaced above.
  English = map("en")
}

def updateAllLanguages(wikiBase: String): Unit ={
def updateAllLanguages(base: String): Unit ={
//map.clear()//do not clear because we want to keep "mappings", "wikidata" etc.
var wikiBase = getLanguageFreeWikiBase(base)
for(lang <- wikiLanguageCodes)
{
try {
map(lang) = makeDbkwikLanguage(lang, wikiBase)
map(lang) = makeDbkwikLanguage(lang, wikiBase, true)
}
catch{
case mre : MissingResourceException => logger.log(Level.WARNING, "Could not create the language: " + lang)
Expand All @@ -167,9 +181,14 @@ object Language extends (String => Language)



def makeDbkwikLanguage(language : String, wikiBase: String): Language = {
def makeDbkwikLanguage(language : String, wikiBase: String, modifyBase: Boolean): Language = {
var base = wikiBase
if(language.equals("en") == false && modifyBase){
base = language + "." + base
}


val baseDomain = "dbkwik.webdatacommons.org/" + wikiBase
val baseDomain = "dbkwik.webdatacommons.org/" + base

val loc = Locale.forLanguageTag(language)

Expand All @@ -187,8 +206,8 @@ object Language extends (String => Language)
"http://" + baseDomain, //val dbpediaUri: String,
new DBpediaNamespace("http://" + baseDomain + "/resource/"), //val resourceUri: RdfNamespace,
new DBpediaNamespace("http://" + baseDomain + "/property/"), //val propertyUri: RdfNamespace,
"http://"+wikiBase.stripPrefix("http://"), //val baseUri: String,
"https://"+wikiBase.stripPrefix("http://")+"/api.php", //val apiUri: String,
"http://"+ base, //val baseUri: String,
"https://"+ base + "/api.php", //val apiUri: String,
0 //val pages: Int
)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package org.dbpedia.extraction.wikiparser

import org.dbpedia.extraction.ontology.RdfNamespace
import org.dbpedia.extraction.util.RichString.wrapString
import org.dbpedia.extraction.util.{Language, WikiUtil}
import org.dbpedia.extraction.util.StringUtils.replacements
import org.dbpedia.extraction.util.{Language, StringUtils, WikiUtil}
import org.dbpedia.iri.UriDecoder
import org.dbpedia.util.text.ParseExceptionIgnorer
import org.dbpedia.util.text.html.{HtmlCoder, XmlCodes}
Expand Down Expand Up @@ -41,7 +43,7 @@ class WikiTitle (
val encodedWithNamespace = withNamespace(true)

/** page IRI for this page title */
val pageIri = language.baseUri+"/wiki/"+encodedWithNamespace
val pageIri = language.baseUri+"/wiki/"+StringUtils.escape(encodedWithNamespace, WikiTitle.iriEscapes)

/** resource IRI for this page title */
val resourceIri = language.resourceUri.append(encodedWithNamespace)
Expand Down Expand Up @@ -92,6 +94,12 @@ class WikiTitle (

object WikiTitle
{
// Characters that are escaped in page IRIs; for this list of characters,
// see RFC 3987 and https://sourceforge.net/mailarchive/message.php?msg_id=28982391
// The value is whatever StringUtils.replacements builds for the escape character '%'
// and the characters below (presumably a replacement table - confirm in StringUtils).
private val iriEscapes = {
  // U+0000 .. U+0020
  val lowRange = ('\u0000' to '\u0020').mkString
  val punctuation = "\"#%<>?[\\]^`{|}"
  // U+007F .. U+009F
  val highRange = ('\u007F' to '\u009F').mkString
  replacements('%', lowRange + punctuation + highRange)
}

/**
* Parses a MediaWiki link or title.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,6 @@ object Namespaces
(names.toMap, codes.toMap)
}

def names(lang : Language) : Map[Int, String] = names.getOrElse(lang.wikiCode, throw new IllegalArgumentException("no namespace names found for language '"+lang.wikiCode+"'"))
def codes(lang : Language) : Map[String, Int] = codes.getOrElse(lang.wikiCode, throw new IllegalArgumentException("no namespace codes found for language '"+lang.wikiCode+"'"))
def names(lang : Language) : Map[Int, String] = names.getOrElse(lang.wikiCode, names("en"))
def codes(lang : Language) : Map[String, Int] = codes.getOrElse(lang.wikiCode, codes("en"))
}
2 changes: 1 addition & 1 deletion dump/extraction.dbkwik.properties
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ copyrightCheck=false

#parameter for xml-safe properties:

uri-policy.iri=xml-safe-predicates:*;xml-safe-subjects:*
uri-policy.iri=xml-safe:*

format.ttl=turtle-triples;uri-policy.iri

Expand Down

0 comments on commit 0db26a7

Please sign in to comment.