From 230c39faedc0cee6300e61134dd1d947e45e4c7d Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Tue, 10 Mar 2026 23:01:26 +0530 Subject: [PATCH 1/8] Fix NoSuchElementException in DisambiguationExtractor by using getOrElse for language config --- .../dbpedia/extraction/mappings/DisambiguationExtractor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala index e9beac4911..3269244113 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/DisambiguationExtractor.scala @@ -22,7 +22,7 @@ extends PageNodeExtractor { private val language = context.language - private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap(language.wikiCode) + private val replaceString = DisambiguationExtractorConfig.disambiguationTitlePartMap.getOrElse(language.wikiCode, " (disambiguation)") val wikiPageDisambiguatesProperty = context.ontology.properties("wikiPageDisambiguates") From f55682b09afb86a6db97ed846ee1bb1f231de1d3 Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Sat, 14 Mar 2026 19:16:50 +0530 Subject: [PATCH 2/8] Skip Nexus staging during snapshot deployment to GitHub Packages --- .github/workflows/snapshot_deploy.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/snapshot_deploy.yml b/.github/workflows/snapshot_deploy.yml index f75c29486c..e61a1fd7dc 100644 --- a/.github/workflows/snapshot_deploy.yml +++ b/.github/workflows/snapshot_deploy.yml @@ -77,4 +77,5 @@ jobs: run: | echo "Deploying to https://maven.pkg.github.com/${REPO} with revision ${REVISION}" mvn deploy -DskipTests \ - -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}" + -Dgithub.repo.url="https://maven.pkg.github.com/${REPO}" \ + -DskipNexusStagingDeployMojo=true From 78794006039ffafc9963b3f6a0ececbd8df30076 Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Mon, 16 Mar 2026 16:51:05 +0530 Subject: [PATCH 3/8] Implement NIF-based citation extraction with precise character offsets --- .../java/org/dbpedia/extraction/nif/Link.java | 30 +- .../dbpedia/extraction/nif/LinkExtractor.java | 319 +++++++++--------- .../main/resources/nifextractionconfig.json | 1 - .../extraction/nif/HtmlNifExtractor.scala | 24 +- 4 files changed, 209 insertions(+), 165 deletions(-) diff --git a/core/src/main/java/org/dbpedia/extraction/nif/Link.java b/core/src/main/java/org/dbpedia/extraction/nif/Link.java index d6b6ffb806..ddfadf5820 100644 --- a/core/src/main/java/org/dbpedia/extraction/nif/Link.java +++ b/core/src/main/java/org/dbpedia/extraction/nif/Link.java @@ -11,11 +11,29 @@ public class Link implements Comparable { private boolean topicLink = false; private boolean topicPartLink = false; private boolean surfaceFormLink = false; - + private boolean citation = false; + private String citationId = ""; + public Link() { - + + } + + public boolean isCitation() { + return citation; + } + + public void setCitation(boolean citation) { + this.citation = citation; } - + + public String getCitationId() { + return citationId; + } + + public void setCitationId(String citationId) { + this.citationId = citationId; + } + public boolean isSurfaceFormLink() { return surfaceFormLink; } @@ -91,12 +109,12 @@ public void setExternal(boolean external) { @Override public int compareTo(Link link) { // TODO Auto-generated method stub - if(this.wordStart==link.getWordStart()) + if (this.wordStart == link.getWordStart()) return 0; - else if(this.wordStart paragraphs = null; private Paragraph paragraph = null; - private Link tempLink; + private Link tempLink; private boolean inSup = false; private boolean invisible = false; - private NifExtractorContext context; + private NifExtractorContext context; private ArrayList errors = new ArrayList<>(); - + public LinkExtractor(NifExtractorContext context) { - paragraphs = new ArrayList(); + paragraphs = new ArrayList(); this.context = context; } - + /** * Gets called when entering an element - * -handle text cleanup and remove Wikipedia specific stuff like reference numbers + * -handle text cleanup and remove Wikipedia specific stuff like reference + * numbers * -if we encounter a link, we make a new nif:Word - * -we get the text out of a whitelist of elements. - * If we encounter a non-whitelisted element, we set this.skipLevel to the current depth - * of the dom tree and skip everything until we are back to that depth + * -we get the text out of a whitelist of elements. + * If we encounter a non-whitelisted element, we set this.skipLevel to the + * current depth + * of the dom tree and skip everything until we are back to that depth * -this thing badly needs refactoring */ - + public void head(Node node, int depth) { - if(skipLevel>=0){ + if (skipLevel >= 0) { return; } - if(paragraph == null) { + if (paragraph == null) { paragraph = new Paragraph(0, "", "p"); } - //ignore all content inside invisible tags - if(invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) { + // ignore all content inside invisible tags + if (invisible || node.attr("style").matches(".*display\\s*:\\s*none.*")) { invisible = true; return; } - if(node.nodeName().equals("#text")) { - String tempText = node.toString(); - - //replace no-break spaces because unescape doesn't deal with them - tempText = StringEscapeUtils.unescapeHtml4(tempText); - tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars()); - tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", ""); - - //this text node is the content of an element: make a new nif:Word - if(inLink) { - if(!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) //not! - { - tempLink.setLinkText(tempText); - tempLink.setWordStart(paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0)); - paragraph.addText(tempText); - tempLink.setWordEnd(paragraph.getLength()); - } - else{ // -> filter out hidden links to the underlying template - errors.add("found Template in resource: " + this.context.resource + ": " + tempText); - return; - } - } - else - paragraph.addText(tempText); + if (node.nodeName().equals("#text")) { + String tempText = node.toString(); + + // replace no-break spaces because unescape doesn't deal with them + tempText = StringEscapeUtils.unescapeHtml4(tempText); + tempText = org.dbpedia.extraction.util.StringUtils.escape(tempText, replaceChars()); + tempText = tempText.replace("\\n", "\n").replace("\\t", "\t").replace("\\r", ""); + + // this text node is the content of an element: make a new nif:Word + if (inLink) { + if (!tempText.trim().startsWith(this.context.wikipediaTemplateString + ":")) // not! + { + tempLink.setLinkText(tempText); + tempLink.setWordStart( + paragraph.getLength() + (Paragraph.FollowedByWhiteSpace(paragraph.getText()) ? 1 : 0)); + paragraph.addText(tempText); + tempLink.setWordEnd(paragraph.getLength()); + } else { // -> filter out hidden links to the underlying template + errors.add("found Template in resource: " + this.context.resource + ": " + tempText); + return; + } + } else + paragraph.addText(tempText); } - else if(node.nodeName().equals("a")) { + else if (node.nodeName().equals("a")) { - String link = node.attr("href"); - //TODO central string management + String link = node.attr("href"); + // TODO central string management /** - * remove internal links linking to mediawiki meta pages. Also removes links that contain ":". + * remove internal links linking to mediawiki meta pages. Also removes links + * that contain ":". * Wikipedia api standard link looks like (allowed): * philosopher - * see Schopenhauer: https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400 + * see Schopenhauer: + * https://en.wikipedia.org/w/api.php?uselang=en&format=xml&action=parse&prop=text&pageid=17340400 */ - String linkPrefix = "/wiki/"; + String linkPrefix = "/wiki/"; // SPECIAL CASE FOR RESTAPI PARSING https://en.wikipedia.org/api/rest_v1/ - if(node.hasAttr("rel")) { + if (node.hasAttr("rel")) { String relType = node.attr("rel"); - if(relType.equals("mw:WikiLink")){ + if (relType.equals("mw:WikiLink")) { tempLink = new Link(); String uri = cleanLink(node.attr("href"), false); setUri(uri); @@ -109,13 +112,20 @@ else if(node.nodeName().equals("a")) { String uri = cleanLink(node.attr("href"), false); setUri(uri); - //simple example of Help:IPA - // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ] + // simple example of Help:IPA + // [ˈaɐ̯tʊɐ̯ ˈʃoːpn̩haʊ̯ɐ] } else if (link.contains(linkPrefix) && link.contains(":")) { /** * TODO buggy * Cleans up child nodes: difficult example - * /ˈʃpənh.ər/ + * /ˈʃpənh.ər/ */ if (!node.childNodes().isEmpty()) { if (node.childNode(0).nodeName().equals("#text") && @@ -128,93 +138,98 @@ else if(node.nodeName().equals("a")) { } else { skipLevel = depth; } - //TODO add example + // TODO add example } else if (node.attr("class").equals("external text")) { - //don't skip external links + // don't skip external links tempLink = new Link(); String uri = cleanLink(node.attr("href"), true); setUri(uri); + } else if (link.startsWith("#cite_note-")) { + tempLink = new Link(); + tempLink.setCitation(true); + tempLink.setCitationId(link.substring(1)); + inLink = true; } else { skipLevel = depth; } } - } else if(node.nodeName().equals("p")) { - if(paragraph != null) { - addParagraph("p"); - } - else - paragraph = new Paragraph(0, "", "p"); - } else if(node.nodeName().equals("sup")) { + } else if (node.nodeName().equals("p")) { + if (paragraph != null) { + addParagraph("p"); + } else + paragraph = new Paragraph(0, "", "p"); + } else if (node.nodeName().equals("sup")) { inSup = true; - } else if(node.nodeName().matches("h\\d")) { - addParagraph(node.nodeName()); - } else if(node.nodeName().equals("table")) { - addParagraph("table"); - paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"), node.attr("id")); - addParagraph("p"); - skipLevel = depth; - } else if(node.nodeName().equals("span")) { - //denote notes - - if(node.attr("class").contains("notebegin")) - addParagraph("note"); - - } else if(node.nodeName().equals("math")) { - addParagraph("math"); - paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null); - addParagraph("p"); - skipLevel = depth; - } + } else if (node.nodeName().matches("h\\d")) { + addParagraph(node.nodeName()); + } else if (node.nodeName().equals("table")) { + addParagraph("table"); + paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "table", node.attr("class"), + node.attr("id")); + addParagraph("p"); + skipLevel = depth; + } else if (node.nodeName().equals("span")) { + // denote notes + + if (node.attr("class").contains("notebegin")) + addParagraph("note"); + + } else if (node.nodeName().equals("math")) { + addParagraph("math"); + paragraph.addStructure(paragraph.getLength(), node.outerHtml(), "math", "tex", null); + addParagraph("p"); + skipLevel = depth; + } } private void setUri(String uri) { - if(uri!=null) { - tempLink.setUri(uri); - tempLink.setExternal(true); - inLink = true; - } else { - tempLink = new Link(); - } + if (uri != null) { + tempLink.setUri(uri); + tempLink.setExternal(true); + inLink = true; + } else { + tempLink = new Link(); + } } - + private String cleanLink(String uri, boolean external) { - if(!external) { + if (!external) { String linkPrefix = "/wiki/"; - String linkPrefix2= "./"; - if(uri.contains(linkPrefix)){ - uri=uri.substring(uri.indexOf("?title=")+7); + String linkPrefix2 = "./"; + if (uri.contains(linkPrefix)) { + uri = uri.substring(uri.indexOf("?title=") + 7); } else if (uri.contains(linkPrefix2)) { - uri=uri.substring(uri.indexOf("?title=")+3); + uri = uri.substring(uri.indexOf("?title=") + 3); } - //TODO central string management - if(!this.context.language.equals("en")) { - uri="http://"+this.context.language+".dbpedia.org/resource/"+uri; + // TODO central string management + if (!this.context.language.equals("en")) { + uri = "http://" + this.context.language + ".dbpedia.org/resource/" + uri; } else { - uri="http://dbpedia.org/resource/"+uri; + uri = "http://dbpedia.org/resource/" + uri; } uri = uri.replace("&action=edit&redlink=1", ""); - + } else { - //there are links that contain illegal hostnames + // there are links that contain illegal hostnames try { - if(uri.startsWith("//")) - uri = "http:"+uri; - uri = URLEncoder.encode(uri,"UTF-8"); + if (uri.startsWith("//")) + uri = "http:" + uri; + uri = URLEncoder.encode(uri, "UTF-8"); uri = uri.replace("%3A", ":").replace("%2F", "/").replace("%2E", "."); - - } catch(UnsupportedEncodingException e) { - //this doesn't happen + + } catch (UnsupportedEncodingException e) { + // this doesn't happen e.printStackTrace(); } } return UriUtils.uriToDbpediaIri(uri).toString(); } - + public void tail(Node node, int depth) { - if(skipLevel>0) { - if(skipLevel==depth) { + if (skipLevel > 0) { + if (skipLevel == depth) { skipLevel = -1; return; } else { @@ -222,74 +237,68 @@ public void tail(Node node, int depth) { } } - if(node.nodeName().equals("a") && inLink) { + if (node.nodeName().equals("a") && inLink) { inLink = false; paragraph.addLink(tempLink); tempLink = new Link(); - } - else if(invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) { - invisible = false; - } - else if(node.nodeName().equals("p") && paragraph != null) { - addParagraph("p"); - } - else if(node.nodeName().equals("sup") && inSup) { + } else if (invisible && node.attr("style").matches(".*display\\s*:\\s*none.*")) { + invisible = false; + } else if (node.nodeName().equals("p") && paragraph != null) { + addParagraph("p"); + } else if (node.nodeName().equals("sup") && inSup) { inSup = false; + } else if (node.nodeName().matches("h\\d")) { + addParagraph("p"); + } else if (node.nodeName().equals("span")) { + if (node.attr("class").contains("noteend")) + addParagraph("p"); } - else if(node.nodeName().matches("h\\d")) { - addParagraph("p"); - } - else if(node.nodeName().equals("span")) { - if(node.attr("class").contains("noteend")) - addParagraph("p"); - } } - + public List getParagraphs() { - if(paragraph != null && paragraph.getLength() > 0) - { - paragraphs.add(paragraph); - paragraph = null; - } - return paragraphs; + if (paragraph != null && paragraph.getLength() > 0) { + paragraphs.add(paragraph); + paragraph = null; + } + return paragraphs; } - private void addParagraph(String newTag){ - if(paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0) - paragraphs.add(paragraph); + private void addParagraph(String newTag) { + if (paragraph.getLength() != 0 || paragraph.getHtmlStrings().size() > 0) + paragraphs.add(paragraph); - paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag)); - } + paragraph = new Paragraph(0, "", (newTag == null ? "p" : newTag)); + } - public int getTableCount(){ - int count =0; - for(Paragraph p : this.getParagraphs()){ + public int getTableCount() { + int count = 0; + for (Paragraph p : this.getParagraphs()) { count += paragraph.getHtmlStrings().size(); } return count; } - public ArrayList getErrors(){ + public ArrayList getErrors() { return errors; } - private String[] replaceChars() { - String[] rep = new String[256]; - rep['\n'] = ""; - rep['\u00A0'] = " "; - return rep; - } - - public static class NifExtractorContext { - private String language; - private String resource; - private String wikipediaTemplateString; - - public NifExtractorContext(String language, String resource, String templateString){ - this.language = language; - this.resource = resource; - this.wikipediaTemplateString = templateString; - } - } + private String[] replaceChars() { + String[] rep = new String[256]; + rep['\n'] = ""; + rep['\u00A0'] = " "; + return rep; + } + + public static class NifExtractorContext { + private String language; + private String resource; + private String wikipediaTemplateString; + + public NifExtractorContext(String language, String resource, String templateString) { + this.language = language; + this.resource = resource; + this.wikipediaTemplateString = templateString; + } + } } diff --git a/core/src/main/resources/nifextractionconfig.json b/core/src/main/resources/nifextractionconfig.json index c4548f431b..d66d3ceb55 100644 --- a/core/src/main/resources/nifextractionconfig.json +++ b/core/src/main/resources/nifextractionconfig.json @@ -14,7 +14,6 @@ "nif-remove-elements":[ ".noprint", ".haudio", - "sup.reference", "span.mw-editsection", ".error", "#coordinates", diff --git a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala index 130d2c7e92..d2d8f47c5f 100755 --- a/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/nif/HtmlNifExtractor.scala @@ -39,6 +39,7 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara protected val templateString = "Template" private val sectionMap = new mutable.HashMap[PageSection, ExtractedSection]() + private val citationMap = new mutable.HashMap[String, String]() /** * Extract the relevant html page divided in sections and paragraphs @@ -285,9 +286,17 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara words += nifLinks(word, RdfNamespace.NIF.append("beginIndex"), (offset + link.getWordStart).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) words += nifLinks(word, RdfNamespace.NIF.append("endIndex"), (offset + link.getWordEnd).toString, sourceUrl, RdfNamespace.XSD.append("nonNegativeInteger")) words += nifLinks(word, RdfNamespace.NIF.append("superString"), paragraphUri, sourceUrl, null) - UriUtils.createURI(link.getUri) match{ - case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this - case Failure(f) => + if (link.isCitation) { + words += nifLinks(word, RdfNamespace.RDF.append("type"), "http://dbpedia.org/ontology/Citation", sourceUrl, null) + citationMap.get(link.getCitationId) match { + case Some(url) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", url, sourceUrl, null) + case None => + } + } else { + UriUtils.createURI(link.getUri) match{ + case Success(s) => words += nifLinks(word, "http://www.w3.org/2005/11/its/rdf#taIdentRef", s.toString, sourceUrl, null) //TODO IRI's might throw exception in org.dbpedia.extraction.destinations.formatters please check this + case Failure(f) => + } } if(writeLinkAnchors) words += nifLinks(word, RdfNamespace.NIF.append("anchorOf"), link.getLinkText, sourceUrl, RdfNamespace.XSD.append("string")) @@ -346,6 +355,15 @@ abstract class HtmlNifExtractor(nifContextIri: String, language: String, nifPara protected def getJsoupDoc(html: String): Document = { val doc = Jsoup.parse(cleanHtml(html)) + //extract citations + for(note <- doc.select("li[id^=cite_note-]").asScala){ + val id = note.id() + val extLink = note.select("a.external.text").first() + if (extLink != null) { + citationMap.put(id, extLink.attr("href")) + } + } + //delete queries for(query <- cssSelectorConfigMap.removeElements) for(item <- doc.select(query).asScala) From 38d62937be27c6089cc651c4a1d3996dfef9c7b1 Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Thu, 16 Apr 2026 20:45:45 +0530 Subject: [PATCH 4/8] feat(ontology): add rdfs:comment to infobox property definitions Added rdfs:comment to the properties extracted from Wikipedia infoboxes in the InfoboxExtractor. This provides descriptive metadata about the original raw property. --- .../org/dbpedia/extraction/mappings/InfoboxExtractor.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala index 779975518a..c10f249795 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/InfoboxExtractor.scala @@ -58,6 +58,7 @@ extends PageNodeExtractor private val ignoreProperties = InfoboxExtractorConfig.ignoreProperties private val labelProperty = ontology.properties("rdfs:label") + private val commentProperty = ontology.properties("rdfs:comment") private val typeProperty = ontology.properties("rdf:type") private val propertyClass = ontology.classes("rdf:Property") private val rdfLangStrDt = ontology.datatypes("rdf:langString") @@ -165,6 +166,8 @@ extends PageNodeExtractor seenProperties += propertyUri quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, typeProperty, propertyClass.uri, splitNode.sourceIri) quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, labelProperty, propertyLabel, splitNode.sourceIri, rdfLangStrDt) + val propertyComment = "Raw property extracted from Wikipedia infobox: " + property.key + quads += new Quad(language, DBpediaDatasets.InfoboxPropertyDefinitions, propertyUri, commentProperty, propertyComment, splitNode.sourceIri, rdfLangStrDt) } } } From ac97387306f6b2fec9e9c221ec2039be6c36b201 Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Fri, 17 Apr 2026 23:15:55 +0530 Subject: [PATCH 5/8] Robustify post-processing scripts to handle missing datasets in minidump tests --- .../dbpedia/extraction/util/DateFinder.scala | 12 ++-- .../org/dbpedia/extraction/util/Finder.scala | 2 +- .../extraction/scripts/MapObjectUris.scala | 62 +++++++++---------- .../extraction/scripts/QuadReader.scala | 13 ++-- .../scripts/ResolveTransitiveLinks.scala | 13 +++- .../scripts/TypeConsistencyCheck.scala | 23 +++++-- 6 files changed, 77 insertions(+), 48 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala b/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala index d3b00d29f3..ef3e77cae1 100644 --- a/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala +++ b/core/src/main/scala/org/dbpedia/extraction/util/DateFinder.scala @@ -18,19 +18,23 @@ class DateFinder[T](val finder: Finder[T]){ _date else throw new IllegalStateException("date not set") - def byName(name: String, auto: Boolean = false): Option[T] = { + def byName(name: String, auto: Boolean = false, required: Boolean = true): Option[T] = { if (_date == null) { if (! auto) throw new IllegalStateException("date not set") - _date = finder.dates(name).last + val dates = finder.dates(name, required) + if (dates.isEmpty) return None + _date = dates.last } finder.file(_date, name) } - def byPattern (pattern: String, auto: Boolean = false): Seq[T] = { + def byPattern (pattern: String, auto: Boolean = false, required: Boolean = true): Seq[T] = { if (_date == null) { if (! auto) throw new IllegalStateException("date not set") - _date = finder.dates(pattern, true, true).last + val dates = finder.dates(pattern, required, isSuffixRegex = true) + if (dates.isEmpty) return Seq.empty + _date = dates.last } finder.matchFiles(_date, pattern).toSeq } diff --git a/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala b/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala index 715647bf8a..fa6bd579c3 100644 --- a/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala +++ b/core/src/main/scala/org/dbpedia/extraction/util/Finder.scala @@ -63,7 +63,7 @@ class Finder[T](val baseDir: T, val language: Language, val wikiNameSuffix: Stri case None => false }} - val dates = wikiDir.names.filter(dateFilter).filter(suffixFilter).sortBy(_.toInt) + val dates = wikiDir.names.filter(dateFilter).filter(suffixFilter).distinct.sortBy(_.toInt) if (required && dates.isEmpty) { var msg = "found no directory "+wikiDir+"/[YYYYMMDD]" diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala index c408373a3c..d87a46de03 100644 --- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala +++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala @@ -125,46 +125,46 @@ object MapObjectUris { Workers.work(SimpleWorkers(1.5, 1.0) { mapping: String => var count = 0 - new QuadMapper().readQuads(finder, mapping + mappingSuffix, auto = true) { quad => + new QuadMapper().readQuads(finder, mapping + mappingSuffix, auto = true, required = false) { quad => if (quad.datatype != null) throw new IllegalArgumentException(mapping + ": expected object uri, found object literal: " + quad) - // TODO: this wastes a lot of space. Storing the part after ...dbpedia.org/resource/ would - // be enough. Also, the fields of the Quad are derived by calling substring() on the whole - // line, which means that the character array for the whole line is kept in memory, which - // basically means that the whole redirects file is kept in memory. We should - // - only store the resource title in the map - // - use new String(quad.subject), new String(quad.value) to cut the link to the whole line - // - maybe use an index of titles as in ProcessInterLanguageLinks to avoid storing duplicate titles map.put(quad.subject, quad.value) count += 1 } - err.println(mapping + ": found " + count + " mappings") + if (count > 0) err.println(mapping + ": found " + count + " mappings") + else err.println(mapping + ": mapping file not found or empty, skipping for " + language.wikiCode) }, mappings.toList) Workers.work(SimpleWorkers(1.5, 1.0) { input: (String, String) => var count = 0 - val inputFile = if(isExternal) new File(secondary, input._1 + input._2) else finder.byName(input._1 + input._2, auto = true).get - val outputFile = if(isExternal) new File(secondary, input._1 + extension + input._2) else finder.byName(input._1 + extension + input._2, auto = true).get - new QuadMapper().mapQuads(language, inputFile, outputFile) { quad => - - if (quad.datatype != null) { - // just copy quad with literal values. TODO: make this configurable - List(quad) - } - else { - val uris = map.get(quad.value).asScala - count = count + 1 - val ret = for (uri <- uris) - yield quad.copy( - value = uri, // change object URI - context = if (quad.context == null) quad.context else quad.context + "&objectMappedFrom=" + quad.value) // add change provenance - // none found - if(ret.isEmpty) - List(quad) - else - ret - } + var count = 0 + val inputFileOption = if(isExternal) Some(new File(secondary, input._1 + input._2)) else finder.byName(input._1 + input._2, auto = true, required = false) + val outputFileOption = if(isExternal) Some(new File(secondary, input._1 + extension + input._2)) else finder.byName(input._1 + extension + input._2, auto = true, required = false) + + (inputFileOption, outputFileOption) match { + case (Some(inputFile), Some(outputFile)) if inputFile.exists => + new QuadMapper().mapQuads(language, inputFile, outputFile) { quad => + if (quad.datatype != null) { + // just copy quad with literal values. TODO: make this configurable + List(quad) + } + else { + val uris = map.get(quad.value).asScala + count = count + 1 + val ret = for (uri <- uris) + yield quad.copy( + value = uri, // change object URI + context = if (quad.context == null) quad.context else quad.context + "&objectMappedFrom=" + quad.value) // add change provenance + // none found + if(ret.isEmpty) + List(quad) + else + ret + } + } + err.println(input._1 + ": changed " + count + " quads.") + case _ => + err.println(input._1 + ": input file not found or empty, skipping for " + language.wikiCode) } - err.println(input._1 + ": changed " + count + " quads.") }, inputs.flatMap(x => suffixes.map(y => (x, y))).toList) } } diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala index 08ffad9cde..e6c5b4bdc1 100644 --- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala +++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/QuadReader.scala @@ -40,16 +40,21 @@ class QuadReader(log: FileLike[File] = null, preamble: String = null) { * @param input file name, e.g. interlanguage-links-same-as.nt.gz * @param proc process quad */ - def readQuads[T <% FileLike[T]](finder: DateFinder[T], input: String, auto: Boolean = false)(proc: Quad => Unit): Unit = { - readQuads(finder.language, finder.byName(input, auto).get)(proc) + def readQuads[T <% FileLike[T]](finder: DateFinder[T], input: String, auto: Boolean = false, required: Boolean = true)(proc: Quad => Unit): Unit = { + finder.byName(input, auto, required) match { + case Some(file) if file.exists => readQuads(finder.language, file)(proc) + case _ => if (required) throw new IllegalArgumentException("file " + input + " not found") + } } /** * @param pattern regex of filenemes * @param proc process quad */ - def readQuadsOfMultipleFiles[T <% FileLike[T]](finder: DateFinder[T], pattern: String, auto: Boolean = false)(proc: Quad => Unit): Unit = { - for(file <- finder.byPattern(pattern, auto)) + def readQuadsOfMultipleFiles[T <% FileLike[T]](finder: DateFinder[T], pattern: String, auto: Boolean = false, required: Boolean = true)(proc: Quad => Unit): Unit = { + val files = finder.byPattern(pattern, auto, required) + if (required && files.isEmpty) throw new IllegalArgumentException("no files found for pattern " + pattern) + for(file <- files) readQuads(finder.language, file)(proc) } diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala index 78fbc7d6dd..6b9db230f9 100644 --- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala +++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/ResolveTransitiveLinks.scala @@ -67,7 +67,11 @@ object ResolveTransitiveLinks { Workers.work(SimpleWorkers(1.5, 1.0) { language: Language => val finder = new DateFinder(baseDir, language) - finder.byName("redirects" + suffix, auto = true) + val inputFile = finder.byName(input + suffix, auto = true, required = false) + if (inputFile.isEmpty) { + err.println(language.wikiCode + ": input dataset " + input + " not found, skipping.") + return + } // use LinkedHashMap to preserve order val map = new LinkedHashMap[String, String]() @@ -96,7 +100,7 @@ object ResolveTransitiveLinks { var count = 0 var predicate: String = null - new QuadMapper(logfile).readQuads(finder, input + suffix) { quad => + new QuadMapper(logfile).readQuads(finder, input + suffix, auto = true, required = false) { quad => wikidatamap.get(quad.subject) match { case Some(s) => count = count +1 //do nothing since we dont want to redirect if a wikidata uri exists @@ -108,6 +112,11 @@ object ResolveTransitiveLinks { } } } + + if (predicate == null) { + err.println(language.wikiCode + ": input dataset " + input + " not found or empty, skipping.") + return + } err.println(language.wikiCode + ": " + count + " redirects were suppressed since they have a wikidata uri") val buildQuad = QuadBuilder.stringPredicate(language, DBpediaDatasets.RedirectsTransitive, predicate) _ diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala index edf76eb2e1..0771746874 100644 --- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala +++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/TypeConsistencyCheck.scala @@ -98,12 +98,23 @@ object TypeConsistencyCheck { // create destination for this language val finder = new Finder[File](baseDir, lang, "wiki") - val date = finder.dates().last - val destination = createDestination(finder, date, formats) - - val typeDatasetFile: File = finder.file(date, typesDataset).get - val mappedTripleDatasetFile: File = finder.file(date, mappedTripleDataset).get - checkTypeConsistency(ontology, typeDatasetFile, mappedTripleDatasetFile, destination, lang) + val dates = finder.dates(required = false) + if (dates.isEmpty) { + Console.err.println(lang.wikiCode + ": no date directory found, skipping.") + } else { + val date = dates.last + val destination = createDestination(finder, date, formats) + + val typeDatasetFileOption = finder.file(date, typesDataset) + val mappedTripleDatasetFileOption = finder.file(date, mappedTripleDataset) + + (typeDatasetFileOption, mappedTripleDatasetFileOption) match { + case (Some(typeDatasetFile), Some(mappedTripleDatasetFile)) if typeDatasetFile.exists && mappedTripleDatasetFile.exists => + checkTypeConsistency(ontology, typeDatasetFile, mappedTripleDatasetFile, destination, lang) + case _ => + Console.err.println(lang.wikiCode + ": required datasets missing, skipping.") + } + } } } From f49805a3c6cbdeaeae3d9bca10fdf02e7eb0edc7 Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Fri, 17 Apr 2026 23:41:38 +0530 Subject: [PATCH 6/8] Fix compilation errors in MapObjectUris.scala --- .../extraction/scripts/MapObjectUris.scala | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala b/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala index d87a46de03..312216b9db 100644 --- a/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala +++ b/scripts/src/main/scala/org/dbpedia/extraction/scripts/MapObjectUris.scala @@ -5,7 +5,7 @@ import java.io.File import org.apache.jena.ext.com.google.common.collect.{Multimaps, TreeMultimap} import org.dbpedia.extraction.config.ConfigUtils.parseLanguages import org.dbpedia.extraction.util.RichFile.wrapFile -import org.dbpedia.extraction.util.{DateFinder, Language, SimpleWorkers, Workers} +import org.dbpedia.extraction.util.{DateFinder, Language, SimpleWorkers, Workers, RichFile, FileLike} import scala.Console.err import scala.collection.convert.decorateAsScala._ @@ -135,35 +135,35 @@ object MapObjectUris { }, mappings.toList) Workers.work(SimpleWorkers(1.5, 1.0) { input: (String, String) => - var count = 0 - var count = 0 - val inputFileOption = if(isExternal) Some(new File(secondary, input._1 + input._2)) else finder.byName(input._1 + input._2, auto = true, required = false) - val outputFileOption = if(isExternal) Some(new File(secondary, input._1 + extension + input._2)) else finder.byName(input._1 + extension + input._2, auto = true, required = false) - - (inputFileOption, outputFileOption) match { - case (Some(inputFile), Some(outputFile)) if inputFile.exists => - new QuadMapper().mapQuads(language, inputFile, outputFile) { quad => - if (quad.datatype != null) { - // just copy quad with literal values. TODO: make this configurable + var changeCount = 0 + val inputFileOption: Option[FileLike[_]] = if(isExternal) Some(new RichFile(new File(secondary, input._1 + input._2))) else finder.byName(input._1 + input._2, auto = true, required = false).map(x => x: FileLike[_]) + val outputFileOption: Option[FileLike[_]] = if(isExternal) Some(new RichFile(new File(secondary, input._1 + extension + input._2))) else finder.byName(input._1 + extension + input._2, auto = true, required = false).map(x => x: FileLike[_]) + + if (inputFileOption.isDefined && outputFileOption.isDefined && inputFileOption.get.exists) { + val inputFile = inputFileOption.get + val outputFile = outputFileOption.get + new QuadMapper().mapQuads(language, inputFile, outputFile) { quad => + if (quad.datatype != null) { + // just copy quad with literal values. TODO: make this configurable + List(quad) + } + else { + val uris = map.get(quad.value).asScala + changeCount = changeCount + 1 + val ret = for (uri <- uris) + yield quad.copy( + value = uri, // change object URI + context = if (quad.context == null) quad.context else quad.context + "&objectMappedFrom=" + quad.value) // add change provenance + // none found + if(ret.isEmpty) List(quad) - } - else { - val uris = map.get(quad.value).asScala - count = count + 1 - val ret = for (uri <- uris) - yield quad.copy( - value = uri, // change object URI - context = if (quad.context == null) quad.context else quad.context + "&objectMappedFrom=" + quad.value) // add change provenance - // none found - if(ret.isEmpty) - List(quad) - else - ret - } + else + ret } - err.println(input._1 + ": changed " + count + " quads.") - case _ => - err.println(input._1 + ": input file not found or empty, skipping for " + language.wikiCode) + } + err.println(input._1 + ": changed " + changeCount + " quads.") + } else { + err.println(input._1 + ": input file not found or empty, skipping for " + language.wikiCode) } }, inputs.flatMap(x => suffixes.map(y => (x, y))).toList) } From 8194f1d8710032368d8d4ceb528afe97e25e3a6d Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Sat, 18 Apr 2026 00:13:37 +0530 Subject: [PATCH 7/8] Disable remote DataID comparison in minidump tests to avoid RiotException --- .../mappings.extraction.minidump.properties | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties b/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties index 0b0a0e4879..710f2e6f28 100644 --- a/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties +++ b/dump/src/test/resources/extraction-configs/mappings.extraction.minidump.properties @@ -28,3 +28,7 @@ extractors=.MappingExtractor # If we need to Exclude Non-Free Images in this Extraction, set this to true copyrightCheck=false +# Disable remote DataID comparison for tests to avoid external dependencies +compare-dataset-ids=false + + From 9fc7dfdd8b47671eeda160b1b478b8c75c29eca8 Mon Sep 17 00:00:00 2001 From: Anshuman Dutta Date: Sat, 18 Apr 2026 00:14:08 +0530 Subject: [PATCH 8/8] Disable remote DataID comparison in all test configs --- .../extraction-configs/extraction.nif.abstracts.properties | 1 + .../extraction-configs/extraction.plain.abstracts.properties | 1 + .../generic-spark.extraction.minidump.properties | 3 +++ .../extraction-configs/wikidata.extraction.properties | 1 + 4 files changed, 6 insertions(+) diff --git a/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties b/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties index a0a85face1..bd35aa12a6 100755 --- a/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties +++ b/dump/src/test/resources/extraction-configs/extraction.nif.abstracts.properties @@ -112,3 +112,4 @@ nif-isTestRun=false nif-write-anchor=true # write only the anchor text for link instances nif-write-link-anchor=true +compare-dataset-ids=false diff --git a/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties b/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties index e246433ce7..e407712cb9 100755 --- a/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties +++ b/dump/src/test/resources/extraction-configs/extraction.plain.abstracts.properties @@ -105,3 +105,4 @@ nif-isTestRun=false nif-write-anchor=true # write only the anchor text for link instances nif-write-link-anchor=true +compare-dataset-ids=false diff --git a/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties b/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties index df7bbf26e5..ac494341ce 100644 --- a/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties +++ b/dump/src/test/resources/extraction-configs/generic-spark.extraction.minidump.properties @@ -126,3 +126,6 @@ extractors.zh= # If we need to Exclude Non-Free Images in this Extraction, set this to true copyrightCheck=false +# Disable remote DataID comparison for tests to avoid external dependencies +compare-dataset-ids=false + diff --git a/dump/src/test/resources/extraction-configs/wikidata.extraction.properties b/dump/src/test/resources/extraction-configs/wikidata.extraction.properties index 4f539985df..eb9b8d8319 100644 --- a/dump/src/test/resources/extraction-configs/wikidata.extraction.properties +++ b/dump/src/test/resources/extraction-configs/wikidata.extraction.properties @@ -30,3 +30,4 @@ extractors= extractors.wikidata=.WikidataLexemeExtractor,.WikidataRawExtractor,.WikidataSameAsExtractor,.WikidataR2RExtractor,.WikidataLLExtractor,.WikidataReferenceExtractor,.WikidataAliasExtractor,.WikidataLabelExtractor,.WikidataNameSpaceSameAsExtractor,.WikidataPropertyExtractor,.WikidataDescriptionExtractor +compare-dataset-ids=false