diff --git a/.github/workflows/snapshot_deploy.yml b/.github/workflows/snapshot_deploy.yml
index f75c29486c..e61a1fd7dc 100644
--- a/.github/workflows/snapshot_deploy.yml
+++ b/.github/workflows/snapshot_deploy.yml
@@ -77,4 +77,5 @@ jobs:
run: |
echo "Deploying to https://maven.pkg.github.com/${REPO} with revision ${REVISION}"
mvn deploy -DskipTests \
- -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}"
+ -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}" \
+ -DskipNexusStagingDeployMojo=true
diff --git a/core/src/main/java/org/dbpedia/extraction/nif/Link.java b/core/src/main/java/org/dbpedia/extraction/nif/Link.java
index d6b6ffb806..ddfadf5820 100644
--- a/core/src/main/java/org/dbpedia/extraction/nif/Link.java
+++ b/core/src/main/java/org/dbpedia/extraction/nif/Link.java
@@ -11,11 +11,29 @@ public class Link implements Comparable<Link> {
private boolean topicLink = false;
private boolean topicPartLink = false;
private boolean surfaceFormLink = false;
-
+ private boolean citation = false;
+ private String citationId = "";
+
public Link() {
-
+
+ }
+
+ public boolean isCitation() {
+ return citation;
+ }
+
+ public void setCitation(boolean citation) {
+ this.citation = citation;
}
-
+
+ public String getCitationId() {
+ return citationId;
+ }
+
+ public void setCitationId(String citationId) {
+ this.citationId = citationId;
+ }
+
public boolean isSurfaceFormLink() {
return surfaceFormLink;
}
@@ -91,12 +109,12 @@ public void setExternal(boolean external) {
@Override
public int compareTo(Link link) {
// TODO Auto-generated method stub
- if(this.wordStart==link.getWordStart())
+ if (this.wordStart == link.getWordStart())
return 0;
- else if(this.wordStart<link.getWordStart())
+ else if (this.wordStart < link.getWordStart())
return -1;
else
return 1;
diff --git a/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java b/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
--- a/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
+++ b/core/src/main/java/org/dbpedia/extraction/nif/LinkExtractor.java
@@ ... @@
private List<Paragraph> paragraphs = null;
private Paragraph paragraph = null;
- private Link tempLink;
+ private Link tempLink;
private boolean inSup = false;
private boolean invisible = false;
- private NifExtractorContext context;
+ private NifExtractorContext context;
private ArrayList<String> errors = new ArrayList<>();
-
+
public LinkExtractor(NifExtractorContext context) {
- paragraphs = new ArrayList<Paragraph>();
+ paragraphs = new ArrayList<>();
this.context = context;
}
-
+
/**
* Gets called when entering an element
- * -handle text cleanup and remove Wikipedia specific stuff like reference numbers
+ * -handle text cleanup and remove Wikipedia specific stuff like reference
+ * numbers
* -if we encounter a link, we make a new nif:Word
- * -we get the text out of a whitelist of elements.
- * If we encounter a non-whitelisted element, we set this.skipLevel to the current depth
- * of the dom tree and skip everything until we are back to that depth
+ * -we get the text out of a whitelist of elements.
+ * If we encounter a non-whitelisted element, we set this.skipLevel to the
+ * current depth
+ * of the dom tree and skip everything until we are back to that depth
* -this thing badly needs refactoring
*/
-
+
public void head(Node node, int depth) {
- if(skipLevel>=0){
+ if (skipLevel >= 0) {
return;
}
- if(paragraph == null) {
+ if (paragraph == null) {
paragraph = new Paragraph(0, "", "p");
}
- //ignore all content inside invisible tags
- if(invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) {
+ // ignore all content inside invisible tags
+ if (invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) {
invisible = true;
return;
}
- if(node.nodeName().equals("#text")) {
- String tempText = node.toString();
-
- //replace no-break spaces because unescape doesn't deal with them
- tempText = StringEscapeUtils.unescapeHtml4(tempText);
- tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars());
- tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "");
-
- //this text node is the content of an element: make a new nif:Word
- if(inLink) {
- if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) //not!
- {
- tempLink.setLinkText(tempText);
- tempLink.setWordStart(paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0));
- paragraph.addText(tempText);
- tempLink.setWordEnd(paragraph.getLength());
- }
- else{ // -> filter out hidden links to the underlying template
- errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
- return;
- }
- }
- else
- paragraph.addText(tempText);
+ if (node.nodeName().equals("#text")) {
+ String tempText = node.toString();
+
+ // replace no-break spaces because unescape doesn't deal with them
+ tempText = StringEscapeUtils.unescapeHtml4(tempText);
+ tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars());
+ tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", "");
+
+ // this text node is the content of an element: make a new nif:Word
+ if (inLink) {
+ if (!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) // not!
+ {
+ tempLink.setLinkText(tempText);
+ tempLink.setWordStart(
+ paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0));
+ paragraph.addText(tempText);
+ tempLink.setWordEnd(paragraph.getLength());
+ } else { // -> filter out hidden links to the underlying template
+ errors.add("found Template in resource: " + this.context.resource + ": " + tempText);
+ return;
+ }
+ } else
+ paragraph.addText(tempText);
}
- else if(node.nodeName().equals("a")) {
+ else if (node.nodeName().equals("a")) {
- String link = node.attr("href");
- //TODO central string management
+ String link = node.attr("href");
+ // TODO central string management
/**
- * remove internal links linking to mediawiki meta pages. Also removes links that contain ":".
+ * remove internal links linking to mediawiki meta pages. Also removes links
+ * that contain ":".
* Wikipedia api standard link looks like (allowed):
* philosopher
- * see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
+ * see Schopenhauer:
+ * https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400
*/
- String linkPrefix = "/wiki/";
+ String linkPrefix = "/wiki/";
// SPECIAL CASE FOR RESTAPI PARSING https://en.wikipedia.org/api/rest_v1/
- if(node.hasAttr("rel")) {
+ if (node.hasAttr("rel")) {
String relType = node.attr("rel");
- if(relType.equals("mw:WikiLink")){
+ if (relType.equals("mw:WikiLink")) {
tempLink = new Link();
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
@@ -109,13 +112,20 @@ else if(node.nodeName().equals("a")) {
String uri = cleanLink(node.attr("href"), false);
setUri(uri);
- //simple example of Help:IPA
- // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]
+ // simple example of Help:IPA
+ // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ]
} else if (link.contains(linkPrefix) && link.contains(":")) {
/**
* TODO buggy
* Cleans up child nodes: difficult example
- * /ˈʃoʊpənhaʊ.ər/
+ * /ˈʃoʊpənhaʊ.ər/
*/
if (!node.childNodes().isEmpty()) {
if (node.childNode(0).nodeName().equals("#text") &&
@@ -128,93 +138,98 @@ else if(node.nodeName().equals("a")) {
} else {
skipLevel = depth;
}
- //TODO add example
+ // TODO add example
} else if (node.attr("class").equals("external text")) {
- //don't skip external links
+ // don't skip external links
tempLink = new Link();
String uri = cleanLink(node.attr("href"), true);
setUri(uri);
+ } else if (link.startsWith("#cite_note-")) {
+ tempLink = new Link();
+ tempLink.setCitation(true);
+ tempLink.setCitationId(link.substring(1));
+ inLink = true;
} else {
skipLevel = depth;
}
}
- } else if(node.nodeName().equals("p")) {
- if(paragraph != null) {
- addParagraph("p");
- }
- else
- paragraph = new Paragraph(0, "", "p");
- } else if(node.nodeName().equals("sup")) {
+ } else if (node.nodeName().equals("p")) {
+ if (paragraph != null) {
+ addParagraph("p");
+ } else
+ paragraph = new Paragraph(0, "", "p");
+ } else if (node.nodeName().equals("sup")) {
inSup = true;
- } else if(node.nodeName().matches("h\\d")) {
- addParagraph(node.nodeName());
- } else if(node.nodeName().equals("table")) {
- addParagraph("table");
- paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"), node.attr("id"));
- addParagraph("p");
- skipLevel = depth;
- } else if(node.nodeName().equals("span")) {
- //denote notes
-
- if(node.attr("class").contains("notebegin"))
- addParagraph("note");
-
- } else if(node.nodeName().equals("math")) {
- addParagraph("math");
- paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null);
- addParagraph("p");
- skipLevel = depth;
- }
+ } else if (node.nodeName().matches("h\\d")) {
+ addParagraph(node.nodeName());
+ } else if (node.nodeName().equals("table")) {
+ addParagraph("table");
+ paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"),
+ node.attr("id"));
+ addParagraph("p");
+ skipLevel = depth;
+ } else if (node.nodeName().equals("span")) {
+ // denote notes
+
+ if (node.attr("class").contains("notebegin"))
+ addParagraph("note");
+
+ } else if (node.nodeName().equals("math")) {
+ addParagraph("math");
+ paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null);
+ addParagraph("p");
+ skipLevel = depth;
+ }
}
private void setUri(String uri) {
- if(uri!=null) {
- tempLink.setUri(uri);
- tempLink.setExternal(true);
- inLink = true;
- } else {
- tempLink = new Link();
- }
+ if (uri != null) {
+ tempLink.setUri(uri);
+ tempLink.setExternal(true);
+ inLink = true;
+ } else {
+ tempLink = new Link();
+ }
}
-
+
private String cleanLink(String uri, boolean external) {
- if(!external) {
+ if (!external) {
String linkPrefix = "/wiki/";
- String linkPrefix2= "./";
- if(uri.contains(linkPrefix)){
- uri=uri.substring(uri.indexOf("?title=")+7);
+ String linkPrefix2 = "./";
+ if (uri.contains(linkPrefix)) {
+ uri = uri.substring(uri.indexOf("?title=") + 7);
} else if (uri.contains(linkPrefix2)) {
- uri=uri.substring(uri.indexOf("?title=")+3);
+ uri = uri.substring(uri.indexOf("?title=") + 3);
}
- //TODO central string management
- if(!this.context.language.equals("en")) {
- uri="http://"+this.context.language+".dbpedia.org/resource/"+uri;
+ // TODO central string management
+ if (!this.context.language.equals("en")) {
+ uri = "http://" + this.context.language + ".dbpedia.org/resource/" + uri;
} else {
- uri="http://dbpedia.org/resource/"+uri;
+ uri = "http://dbpedia.org/resource/" + uri;
}
uri = uri.replace("&action=edit&redlink=1", "");
-
+
} else {
- //there are links that contain illegal hostnames
+ // there are links that contain illegal hostnames
try {
- if(uri.startsWith("//"))
- uri = "http:"+uri;
- uri = URLEncoder.encode(uri,"UTF-8");
+ if (uri.startsWith("//"))
+ uri = "http:" + uri;
+ uri = URLEncoder.encode(uri, "UTF-8");
uri = uri.replace("%3A", ":").replace("%2F", "/").replace("%2E", ".");
-
- } catch(UnsupportedEncodingException e) {
- //this doesn't happen
+
+ } catch (UnsupportedEncodingException e) {
+ // this doesn't happen
e.printStackTrace();
}
}
return UriUtils.uriToDbpediaIri(uri).toString();
}
-
+
public void tail(Node node, int depth) {
- if(skipLevel>0) {
- if(skipLevel==depth) {
+ if (skipLevel > 0) {
+ if (skipLevel == depth) {
skipLevel = -1;
return;
} else {
@@ -222,74 +237,68 @@ public void tail(Node node, int depth) {
}
}
- if(node.nodeName().equals("a") && inLink) {
+ if (node.nodeName().equals("a") && inLink) {
inLink = false;
paragraph.addLink(tempLink);
tempLink = new Link();
- }
- else if(invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) {
- invisible = false;
- }
- else if(node.nodeName().equals("p") && paragraph != null) {
- addParagraph("p");
- }
- else if(node.nodeName().equals("sup") && inSup) {
+ } else if (invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) {
+ invisible = false;
+ } else if (node.nodeName().equals("p") && paragraph != null) {
+ addParagraph("p");
+ } else if (node.nodeName().equals("sup") && inSup) {
inSup = false;
+ } else if (node.nodeName().matches("h\\d")) {
+ addParagraph("p");
+ } else if (node.nodeName().equals("span")) {
+ if (node.attr("class").contains("noteend"))
+ addParagraph("p");
}
- else if(node.nodeName().matches("h\\d")) {
- addParagraph("p");
- }
- else if(node.nodeName().equals("span")) {
- if(node.attr("class").contains("noteend"))
- addParagraph("p");
- }
}
-
+
public List<Paragraph> getParagraphs() {
- if(paragraph != null && paragraph.getLength() > 0)
- {
- paragraphs.add(paragraph);
- paragraph = null;
- }
- return paragraphs;
+ if (paragraph != null && paragraph.getLength() > 0) {
+ paragraphs.add(paragraph);
+ paragraph = null;
+ }
+ return paragraphs;
}
- private void addParagraph(String newTag){
- if(paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0)
- paragraphs.add(paragraph);
+ private void addParagraph(String newTag) {
+ if (paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0)
+ paragraphs.add(paragraph);
- paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag));
- }
+ paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag));
+ }
- public int getTableCount(){
- int count =0;
- for(Paragraph p : this.getParagraphs()){
+ public int getTableCount() {
+ int count = 0;
+ for (Paragraph p : this.getParagraphs()) {
count += paragraph.getHtmlStrings().size();
}
return count;
}
- public ArrayList<String> getErrors(){
+ public ArrayList<String> getErrors() {
return errors;
}
- private String[] replaceChars() {
- String[] rep = new String[256];
- rep['\n'] = "";
- rep['\u00A0'] = " ";
- return rep;
- }
-
- public static class NifExtractorContext {
- private String language;
- private String resource;
- private String wikipediaTemplateString;
-
- public NifExtractorContext(String language, String resource, String templateString){
- this.language = language;
- this.resource = resource;
- this.wikipediaTemplateString = templateString;
- }
- }
+ private String[] replaceChars() {
+ String[] rep = new String[256];
+ rep['\n'] = "";
+ rep['\u00A0'] = " ";
+ return rep;
+ }
+
+ public static class NifExtractorContext {
+ private String language;
+ private String resource;
+ private String wikipediaTemplateString;
+
+ public NifExtractorContext(String language, String resource, String templateString) {
+ this.language = language;
+ this.resource = resource;
+ this.wikipediaTemplateString = templateString;
+ }
+ }
}
diff --git a/core/src/main/resources/nifextractionconfig.json b/core/src/main/resources/nifextractionconfig.json
index c4548f431b..d66d3ceb55 100644
--- a/core/src/main/resources/nifextractionconfig.json
+++ b/core/src/main/resources/nifextractionconfig.json
@@ -14,7 +14,6 @@
"nif-remove-elements":[
".noprint",
".haudio",
- "sup.reference",
"span.mw-editsection",
".error",
"#coordinates",
diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala
index e9beac4911..3269244113 100644
--- a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala
@@ -22,7 +22,7 @@ extends PageNodeExtractor
{
private val language = context.language
- private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap(language.wikiCode)
+ private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap.getOrElse(language.wikiCode, " (disambiguation)")
val wikiPageDisambiguatesProperty = context.ontology.properties("wikiPageDisambiguates")
diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala
index 779975518a..c10f249795 100644
--- a/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala
@@ -58,6 +58,7 @@ extends PageNodeExtractor
private val ignoreProperties = InfoboxExtractorConfig.ignoreProperties
private val labelProperty = ontology.properties("rdfs:label")
+ private val commentProperty = ontology.properties("rdfs:comment")
private val typeProperty = ontology.properties("rdf:type")
private val propertyClass = ontology.classes("rdf:Property")
private val rdfLangStrDt = ontology.datatypes("rdf:langString")
@@ -165,6 +166,8 @@ extends PageNodeExtractor
seenProperties += propertyUri
quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, typeProperty, propertyClass.uri, splitNode.sourceIri)
quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, labelProperty, propertyLabel, splitNode.sourceIri, rdfLangStrDt)
+ val propertyComment = "Raw property extracted from Wikipedia infobox: " + property.key
+ quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, commentProperty, propertyComment, splitNode.sourceIri, rdfLangStrDt)
}
}
}
diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
index 130d2c7e92..d2d8f47c5f 100755
--- a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala
@@ -39,6 +39,7 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
protected val templateString = "Template"
private val sectionMap = new mutable.HashMap[PageSection, ExtractedSection]()
+ private val citationMap = new mutable.HashMap[String, String]()
/**
* Extract the relevant html page divided in sections and paragraphs
@@ -285,9 +286,17 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
words += nifLinks(word, RdfNamespace.NIF.append("beginIndex"), (offset + link.getWordStart).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger"))
words += nifLinks(word, RdfNamespace.NIF.append("endIndex"), (offset + link.getWordEnd).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger"))
words += nifLinks(word, RdfNamespace.NIF.append("superString"), paragraphUri, sourceUrl, null)
- UriUtils.createURI(link.getUri) match{
- case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this
- case Failure(f) =>
+ if (link.isCitation) {
+ words += nifLinks(word, RdfNamespace.RDF.append("type"), "http://dbpedia.org/ontology/Citation", sourceUrl, null)
+ citationMap.get(link.getCitationId) match {
+ case Some(url) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", url, sourceUrl, null)
+ case None =>
+ }
+ } else {
+ UriUtils.createURI(link.getUri) match{
+ case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this
+ case Failure(f) =>
+ }
}
if(writeLinkAnchors)
words += nifLinks(word, RdfNamespace.NIF.append("anchorOf"), link.getLinkText, sourceUrl, RdfNamespace.XSD.append("string"))
@@ -346,6 +355,15 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara
protected def getJsoupDoc(html: String): Document = {
val doc = Jsoup.parse(cleanHtml(html))
+ //extract citations
+ for(note <- doc.select("li[id^=cite_note-]").asScala){
+ val id = note.id()
+ val extLink = note.select("a.external.text").first()
+ if (extLink != null) {
+ citationMap.put(id, extLink.attr("href"))
+ }
+ }
+
//delete queries
for(query <- cssSelectorConfigMap.removeElements)
for(item <- doc.select(query).asScala)
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala b/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala
index d3b00d29f3..ef3e77cae1 100644
--- a/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala
@@ -18,19 +18,23 @@ class DateFinder[T](val finder: Finder[T]){
_date
else throw new IllegalStateException("date not set")
- def byName(name: String, auto: Boolean = false): Option[T] = {
+ def byName(name: String, auto: Boolean = false, required: Boolean = true): Option[T] = {
if (_date == null) {
if (! auto)
throw new IllegalStateException("date not set")
- _date = finder.dates(name).last
+ val dates = finder.dates(name, required)
+ if (dates.isEmpty) return None
+ _date = dates.last
}
finder.file(_date, name)
}
- def byPattern (pattern: String, auto: Boolean = false): Seq[T] = {
+ def byPattern (pattern: String, auto: Boolean = false, required: Boolean = true): Seq[T] = {
if (_date == null) {
if (! auto) throw new IllegalStateException("date not set")
- _date = finder.dates(pattern, true, true).last
+ val dates = finder.dates(pattern, required, isSuffixRegex = true)
+ if (dates.isEmpty) return Seq.empty
+ _date = dates.last
}
finder.matchFiles(_date, pattern).toSeq
}
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala b/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala
index 715647bf8a..fa6bd579c3 100644
--- a/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala
@@ -63,7 +63,7 @@ class Finder[T](val baseDir: T, val language: Language, val wikiNameSuffix: Stri
case None => false
}}
- val dates = wikiDir.names.filter(dateFilter).filter(suffixFilter).sortBy(_.toInt)
+ val dates = wikiDir.names.filter(dateFilter).filter(suffixFilter).distinct.sortBy(_.toInt)
if (required && dates.isEmpty) {
var msg = "found no directory "+wikiDir+"/[YYYYMMDD]"
diff --git a/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties b/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties
index a0a85face1..bd35aa12a6 100755
--- a/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties
+++ b/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties
@@ -112,3 +112,4 @@ nif-isTestRun=false
nif-write-anchor=true
# write only the anchor text for link instances
nif-write-link-anchor=true
+compare-dataset-ids=false
diff --git a/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties b/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties
index e246433ce7..e407712cb9 100755
--- a/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties
+++ b/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties
@@ -105,3 +105,4 @@ nif-isTestRun=false
nif-write-anchor=true
# write only the anchor text for link instances
nif-write-link-anchor=true
+compare-dataset-ids=false
diff --git a/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties b/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties
index df7bbf26e5..ac494341ce 100644
--- a/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties
+++ b/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties
@@ -126,3 +126,6 @@ extractors.zh=
# If we need to Exclude Non-Free Images in this Extraction, set this to true
copyrightCheck=false
+# Disable remote DataID comparison for tests to avoid external dependencies
+compare-dataset-ids=false
+
diff --git a/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties b/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties
index 0b0a0e4879..710f2e6f28 100644
--- a/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties
+++ b/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties
@@ -28,3 +28,7 @@ extractors=.MappingExtractor
# If we need to Exclude Non-Free Images in this Extraction, set this to true
copyrightCheck=false
+# Disable remote DataID comparison for tests to avoid external dependencies
+compare-dataset-ids=false
+
+
diff --git a/dump/src/test/resources/extraction-configs/wikidata.extraction.properties b/dump/src/test/resources/extraction-configs/wikidata.extraction.properties
index 4f539985df..eb9b8d8319 100644
--- a/dump/src/test/resources/extraction-configs/wikidata.extraction.properties
+++ b/dump/src/test/resources/extraction-configs/wikidata.extraction.properties
@@ -30,3 +30,4 @@ extractors=
extractors.wikidata=.WikidataLexemeExtractor,.WikidataRawExtractor,.WikidataSameAsExtractor,.WikidataR2RExtractor,.WikidataLLExtractor,.WikidataReferenceExtractor,.WikidataAliasExtractor,.WikidataLabelExtractor,.WikidataNameSpaceSameAsExtractor,.WikidataPropertyExtractor,.WikidataDescriptionExtractor
+compare-dataset-ids=false
diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala
index c408373a3c..312216b9db 100644
--- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala
+++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala
@@ -5,7 +5,7 @@ import java.io.File
import org.apache.jena.ext.com.google.common.collect.{Multimaps, TreeMultimap}
import org.dbpedia.extraction.config.ConfigUtils.parseLanguages
import org.dbpedia.extraction.util.RichFile.wrapFile
-import org.dbpedia.extraction.util.{DateFinder, Language, SimpleWorkers, Workers}
+import org.dbpedia.extraction.util.{DateFinder, Language, SimpleWorkers, Workers, RichFile, FileLike}
import scala.Console.err
import scala.collection.convert.decorateAsScala._
@@ -125,46 +125,46 @@ object MapObjectUris {
Workers.work(SimpleWorkers(1.5, 1.0) { mapping: String =>
var count = 0
- new QuadMapper().readQuads(finder, mapping + mappingSuffix, auto = true) { quad =>
+ new QuadMapper().readQuads(finder, mapping + mappingSuffix, auto = true, required = false) { quad =>
if (quad.datatype != null) throw new IllegalArgumentException(mapping + ": expected object uri, found object literal: " + quad)
- // TODO: this wastes a lot of space. Storing the part after ...dbpedia.org/resource/ would
- // be enough. Also, the fields of the Quad are derived by calling substring() on the whole
- // line, which means that the character array for the whole line is kept in memory, which
- // basically means that the whole redirects file is kept in memory. We should
- // - only store the resource title in the map
- // - use new String(quad.subject), new String(quad.value) to cut the link to the whole line
- // - maybe use an index of titles as in ProcessInterLanguageLinks to avoid storing duplicate titles
map.put(quad.subject, quad.value)
count += 1
}
- err.println(mapping + ": found " + count + " mappings")
+ if (count > 0) err.println(mapping + ": found " + count + " mappings")
+ else err.println(mapping + ": mapping file not found or empty, skipping for " + language.wikiCode)
}, mappings.toList)
Workers.work(SimpleWorkers(1.5, 1.0) { input: (String, String) =>
- var count = 0
- val inputFile = if(isExternal) new File(secondary, input._1 + input._2) else finder.byName(input._1 + input._2, auto = true).get
- val outputFile = if(isExternal) new File(secondary, input._1 + extension + input._2) else finder.byName(input._1 + extension + input._2, auto = true).get
- new QuadMapper().mapQuads(language, inputFile, outputFile) { quad =>
-
- if (quad.datatype != null) {
- // just copy quad with literal values. TODO: make this configurable
- List(quad)
- }
- else {
- val uris = map.get(quad.value).asScala
- count = count + 1
- val ret = for (uri <- uris)
- yield quad.copy(
- value = uri, // change object URI
- context = if (quad.context == null) quad.context else quad.context + "&objectMappedFrom=" + quad.value) // add change provenance
- // none found
- if(ret.isEmpty)
+ var changeCount = 0
+ val inputFileOption: Option[FileLike[_]] = if(isExternal) Some(new RichFile(new File(secondary, input._1 + input._2))) else finder.byName(input._1 + input._2, auto = true, required = false).map(x => x: FileLike[_])
+ val outputFileOption: Option[FileLike[_]] = if(isExternal) Some(new RichFile(new File(secondary, input._1 + extension + input._2))) else finder.byName(input._1 + extension + input._2, auto = true, required = false).map(x => x: FileLike[_])
+
+ if (inputFileOption.isDefined && outputFileOption.isDefined && inputFileOption.get.exists) {
+ val inputFile = inputFileOption.get
+ val outputFile = outputFileOption.get
+ new QuadMapper().mapQuads(language, inputFile, outputFile) { quad =>
+ if (quad.datatype != null) {
+ // just copy quad with literal values. TODO: make this configurable
List(quad)
- else
- ret
+ }
+ else {
+ val uris = map.get(quad.value).asScala
+ changeCount = changeCount + 1
+ val ret = for (uri <- uris)
+ yield quad.copy(
+ value = uri, // change object URI
+ context = if (quad.context == null) quad.context else quad.context + "&objectMappedFrom=" + quad.value) // add change provenance
+ // none found
+ if(ret.isEmpty)
+ List(quad)
+ else
+ ret
+ }
}
+ err.println(input._1 + ": changed " + changeCount + " quads.")
+ } else {
+ err.println(input._1 + ": input file not found or empty, skipping for " + language.wikiCode)
}
- err.println(input._1 + ": changed " + count + " quads.")
}, inputs.flatMap(x => suffixes.map(y => (x, y))).toList)
}
}
diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala
index 08ffad9cde..e6c5b4bdc1 100644
--- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala
+++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala
@@ -40,16 +40,21 @@ class QuadReader(log: FileLike[File] = null, preamble: String = null) {
* @param input file name, e.g. interlanguage-links-same-as.nt.gz
* @param proc process quad
*/
- def readQuads[T <% FileLike[T]](finder: DateFinder[T], input: String, auto: Boolean = false)(proc: Quad => Unit): Unit = {
- readQuads(finder.language, finder.byName(input, auto).get)(proc)
+ def readQuads[T <% FileLike[T]](finder: DateFinder[T], input: String, auto: Boolean = false, required: Boolean = true)(proc: Quad => Unit): Unit = {
+ finder.byName(input, auto, required) match {
+ case Some(file) if file.exists => readQuads(finder.language, file)(proc)
+ case _ => if (required) throw new IllegalArgumentException("file " + input + " not found")
+ }
}
/**
* @param pattern regex of filenemes
* @param proc process quad
*/
- def readQuadsOfMultipleFiles[T <% FileLike[T]](finder: DateFinder[T], pattern: String, auto: Boolean = false)(proc: Quad => Unit): Unit = {
- for(file <- finder.byPattern(pattern, auto))
+ def readQuadsOfMultipleFiles[T <% FileLike[T]](finder: DateFinder[T], pattern: String, auto: Boolean = false, required: Boolean = true)(proc: Quad => Unit): Unit = {
+ val files = finder.byPattern(pattern, auto, required)
+ if (required && files.isEmpty) throw new IllegalArgumentException("no files found for pattern " + pattern)
+ for(file <- files)
readQuads(finder.language, file)(proc)
}
diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala
index 78fbc7d6dd..6b9db230f9 100644
--- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala
+++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala
@@ -67,7 +67,11 @@ object ResolveTransitiveLinks {
Workers.work(SimpleWorkers(1.5, 1.0) { language: Language =>
val finder = new DateFinder(baseDir, language)
- finder.byName("redirects" + suffix, auto = true)
+ val inputFile = finder.byName(input + suffix, auto = true, required = false)
+ if (inputFile.isEmpty) {
+ err.println(language.wikiCode + ": input dataset " + input + " not found, skipping.")
+ return
+ }
// use LinkedHashMap to preserve order
val map = new LinkedHashMap[String, String]()
@@ -96,7 +100,7 @@ object ResolveTransitiveLinks {
var count = 0
var predicate: String = null
- new QuadMapper(logfile).readQuads(finder, input + suffix) { quad =>
+ new QuadMapper(logfile).readQuads(finder, input + suffix, auto = true, required = false) { quad =>
wikidatamap.get(quad.subject) match {
case Some(s) =>
count = count +1 //do nothing since we dont want to redirect if a wikidata uri exists
@@ -108,6 +112,11 @@ object ResolveTransitiveLinks {
}
}
}
+
+ if (predicate == null) {
+ err.println(language.wikiCode + ": input dataset " + input + " not found or empty, skipping.")
+ return
+ }
err.println(language.wikiCode + ": " + count + " redirects were suppressed since they have a wikidata uri")
val buildQuad = QuadBuilder.stringPredicate(language, DBpediaDatasets.RedirectsTransitive, predicate) _
diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala
index edf76eb2e1..0771746874 100644
--- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala
+++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala
@@ -98,12 +98,23 @@ object TypeConsistencyCheck {
// create destination for this language
val finder = new Finder[File](baseDir, lang, "wiki")
- val date = finder.dates().last
- val destination = createDestination(finder, date, formats)
-
- val typeDatasetFile: File = finder.file(date, typesDataset).get
- val mappedTripleDatasetFile: File = finder.file(date, mappedTripleDataset).get
- checkTypeConsistency(ontology, typeDatasetFile, mappedTripleDatasetFile, destination, lang)
+ val dates = finder.dates(required = false)
+ if (dates.isEmpty) {
+ Console.err.println(lang.wikiCode + ": no date directory found, skipping.")
+ } else {
+ val date = dates.last
+ val destination = createDestination(finder, date, formats)
+
+ val typeDatasetFileOption = finder.file(date, typesDataset)
+ val mappedTripleDatasetFileOption = finder.file(date, mappedTripleDataset)
+
+ (typeDatasetFileOption, mappedTripleDatasetFileOption) match {
+ case (Some(typeDatasetFile), Some(mappedTripleDatasetFile)) if typeDatasetFile.exists && mappedTripleDatasetFile.exists =>
+ checkTypeConsistency(ontology, typeDatasetFile, mappedTripleDatasetFile, destination, lang)
+ case _ =>
+ Console.err.println(lang.wikiCode + ": required datasets missing, skipping.")
+ }
+ }
}
}