Use namespace-aware XML parsing for workbook. by aquasync · Pull Request #632 · tafia/calamine

aquasync · 2026-03-25T03:26:19Z

The test spreadsheet comes from the R readxl package test suite. Without this change, calamine returns a parse error when reading it.

The workbook parser currently relies on a couple of common hard-coded namespace prefixes (r: and relationships:) when parsing relationships. This change makes it match on the namespace URL instead.

Rather than change all XML parsing to be namespace-aware (which would likely have performance implications?), this only touches the workbook parsing.

aquasync · 2026-03-25T03:28:35Z

There's a bit of churn due to templating xml_reader to allow optional namespaced readers. If you've got a better/cleaner idea on how to tackle this, let me know.

jmcnamara · 2026-03-25T19:13:13Z

Thanks for the fix. It is good, as a rule, to make interfaces generic, but in this particular case there will only be two Readers, so separate functions and a small amount of code duplication are probably okay. That would give a less invasive fix like this:

diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index 879b170..2a6056e 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -16,7 +16,7 @@ use log::warn;
 use quick_xml::events::attributes::{AttrError, Attribute, Attributes};
 use quick_xml::events::BytesStart;
 use quick_xml::events::Event;
-use quick_xml::name::QName;
+use quick_xml::name::{Namespace, QName, ResolveResult};
 use quick_xml::Decoder;
 use quick_xml::Reader as XmlReader;
 use zip::read::{ZipArchive, ZipFile};
@@ -35,6 +35,7 @@ use crate::{
 pub use cells_reader::XlsxCellReader;
 
 pub(crate) type XlReader<'a, RS> = XmlReader<BufReader<ZipFile<'a, RS>>>;
+pub(crate) type XlNsReader<'a, RS> = quick_xml::NsReader<BufReader<ZipFile<'a, RS>>>;
 
 /// Maximum number of rows allowed in an XLSX file.
 pub const MAX_ROWS: u32 = 1_048_576;
@@ -398,10 +399,14 @@ impl<RS: Read + Seek> Xlsx<RS> {
     }
 
     fn read_workbook(&mut self, relationships: &HashMap<Vec<u8>, String>) -> Result<(), XlsxError> {
-        let mut xml = match xml_reader(&mut self.zip, "xl/workbook.xml", &self.zip_path_cache) {
+        const NS_RELATIONSHIPS: Namespace =
+            Namespace(b"http://schemas.openxmlformats.org/officeDocument/2006/relationships");
+
+        let mut xml = match xml_ns_reader(&mut self.zip, "xl/workbook.xml", &self.zip_path_cache) {
             None => return Ok(()),
             Some(x) => x?,
         };
+
         let mut defined_names = Vec::new();
         let mut buf = Vec::with_capacity(1024);
         let mut val_buf = Vec::with_capacity(1024);
@@ -414,17 +419,12 @@ impl<RS: Read + Seek> Xlsx<RS> {
                     let mut visible = SheetVisible::Visible;
                     for a in e.attributes() {
                         let a = a?;
-                        match a {
-                            Attribute {
-                                key: QName(b"name"),
-                                ..
-                            } => {
+                        let (ns, key) = xml.resolver().resolve_attribute(a.key);
+                        match (ns, key.as_ref()) {
+                            (ResolveResult::Unbound, b"name") => {
                                 name = a.decode_and_unescape_value(xml.decoder())?.to_string();
                             }
-                            Attribute {
-                                key: QName(b"state"),
-                                ..
-                            } => {
+                            (ResolveResult::Unbound, b"state") => {
                                 visible = match a.decode_and_unescape_value(xml.decoder())?.as_ref()
                                 {
                                     "visible" => SheetVisible::Visible,
@@ -438,12 +438,9 @@ impl<RS: Read + Seek> Xlsx<RS> {
                                     }
                                 }
                             }
-                            Attribute {
-                                key: QName(b"r:id" | b"relationships:id"),
-                                value: v,
-                            } => {
+                            (ResolveResult::Bound(NS_RELATIONSHIPS), b"id") => {
                                 let r = &relationships
-                                    .get(&*v)
+                                    .get(&*a.value)
                                     .ok_or(XlsxError::RelationshipNotFound)?[..];
                                 // target may have prepended "/xl/" or "xl/" path;
                                 // strip if present
@@ -1757,6 +1754,7 @@ impl<RS: Read + Seek> ReaderRef<RS> for Xlsx<RS> {
     }
 }
 
+// Default XML reader. Not namespace-aware.
 fn xml_reader<'a, RS: Read + Seek>(
     zip: &'a mut ZipArchive<RS>,
     path: &str,
@@ -1767,11 +1765,26 @@ fn xml_reader<'a, RS: Read + Seek>(
     match zip.by_name(zip_path) {
         Ok(f) => {
             let mut r = XmlReader::from_reader(BufReader::new(f));
-            let config = r.config_mut();
-            config.check_end_names = false;
-            config.trim_text(false);
-            config.check_comments = false;
-            config.expand_empty_elements = true;
+            configure_reader(r.config_mut());
+            Some(Ok(r))
+        }
+        Err(ZipError::FileNotFound) => None,
+        Err(e) => Some(Err(e.into())),
+    }
+}
+
+// Namespace-aware XML reader. Used for attributes with namespaces.
+fn xml_ns_reader<'a, RS: Read + Seek>(
+    zip: &'a mut ZipArchive<RS>,
+    path: &str,
+    cache: &HashMap<String, String>,
+) -> Option<Result<XlNsReader<'a, RS>, XlsxError>> {
+    let zip_path = cached_zip_path(cache, path);
+
+    match zip.by_name(zip_path) {
+        Ok(f) => {
+            let mut r = quick_xml::NsReader::from_reader(BufReader::new(f));
+            configure_reader(r.config_mut());
             Some(Ok(r))
         }
         Err(ZipError::FileNotFound) => None,
@@ -1779,6 +1792,14 @@ fn xml_reader<'a, RS: Read + Seek>(
     }
 }
 
+// Configure the XML reader. Used for the default and namespace-aware readers.
+fn configure_reader(config: &mut quick_xml::reader::Config) {
+    config.check_end_names = false;
+    config.trim_text(false);
+    config.check_comments = false;
+    config.expand_empty_elements = true;
+}
+
 /// search through an Element's attributes for the named one
 pub(crate) fn get_attribute<'a>(
     atts: Attributes<'a>,

Alternatively, we could ignore the namespace and just handle the id. That would give a very simple fix:

diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs
index 879b170..dd7e68a 100644
--- a/src/xlsx/mod.rs
+++ b/src/xlsx/mod.rs
@@ -438,10 +438,9 @@ impl<RS: Read + Seek> Xlsx<RS> {
                                     }
                                 }
                             }
-                            Attribute {
-                                key: QName(b"r:id" | b"relationships:id"),
-                                value: v,
-                            } => {
+                            // Ignore the "r:id" attribute namespace and match on the "id" name.
+                            a if a.key.local_name().as_ref() == b"id" => {
+                                let v = a.value;
                                 let r = &relationships
                                     .get(&*v)
                                     .ok_or(XlsxError::RelationshipNotFound)?[..];

@jqnatividad Any thoughts on this issue and the potential fixes?

aquasync · 2026-03-25T22:48:32Z

Yes, I had something similar initially but was trying to remove the duplication. Using the shared reader config stops it being too bad though. As to whether we could just get away with ignoring namespaces entirely - I don't have a good feel for that. I can't see instances in the readxl test suite at least where that wouldn't work (e.g. use of a bare id attr for something else, or a different namespaced id attr).

Happy to go with the simplest thing that works. OTOH I have another test case from the same suite that has namespaced nodes:

<?xml version="1.0" encoding="utf-8"?>
<x:workbook xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
    <x:sheets>
        <x:sheet name="СПАРК - Список" sheetId="1" r:id="R6082ddd3e995440f"
            xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" />
    </x:sheets>
</x:workbook>

Not just in the workbook, but even in the shared strings:

<?xml version="1.0" encoding="utf-8"?>
<x:sst xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="2191" uniqueCount="165">
  <x:si>
    <x:t/>
  </x:si>
  <x:si>
...
</x:sst>

And the actual sheet data:

<?xml version="1.0" encoding="utf-8"?>
<x:worksheet xmlns:x="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
>
  <x:cols>
...
  </x:cols>
  <x:sheetData>
    <x:row r="2">
      <x:c r="A2" s="1" t="str">
        <x:v>СПАРК - Список </x:v>
      </x:c>
      <x:c r="K2" s="1"/>
    </x:row>
...
</x:worksheet>

File uploaded here: Ekaterinburg_IP_9.xlsx.

Note that it opens fine in LibreOffice and Excel.

Not sure if the preference would be pervasive namespace-stripping, or properly resolving. I'd go with the latter, but I'm wary of performance implications given it'd be needed for the cell data also.

jmcnamara · 2026-03-26T00:31:30Z

> Happy to go with the simplest thing that works. OTOH I have another test case from the same suite that has namespaced nodes:

Namespaced element names are already handled using the same local_name().as_ref() throughout the code. So they should be okay. I ran a quick test using the Ekaterinburg_IP_9.xlsx and it loaded fine.

So, since that seems to be the preferred method in the existing codebase let's go with that.

Thanks for the diligence.

Previously accepted only "r:" and "relationships:" prefixes. Closes tafia#634

aquasync · 2026-03-26T04:14:37Z

Ah that's great, I had missed that. Ok all the more reason to go with namespace stripping then. I've updated based on that suggestion. Thanks!

jmcnamara · 2026-03-26T09:38:41Z

Thanks for that. That looks good. You managed to maintain the Attribute{} format like the other arms, which is better than my suggestion.

I will leave this open until the weekend to see if there are any other comments and I will merge it then.

jqnatividad · 2026-03-26T13:57:10Z

I use calamine in my project - qsv - where performance is the top goal.

The last release saw a big performance improvement - from 11.251 seconds to 8.725 seconds to export a million row sample of NYC's 311 data.

See https://qsv.dathere.com/benchmarks and filter for excel

Hopefully, we don't get a perf regression with this change, but it LGTM after a quick pass.

On a related matter, it'd be useful to have a test suite for calamine with real-world Excel and ODS files.

In that way, we can track both accuracy and performance over releases.

jmcnamara · 2026-03-26T19:28:35Z

@jqnatividad

> Hopefully, we don't get a perf regression with this change, but it LGTM after a quick pass.

This change won't affect performance. It is only on the rel_id lookup in Workbook.

> The last release saw a big performance improvement - from 11.251 seconds to 8.725 seconds to export a million row sample of NYC's 311 data.

That is almost all down to @alexander-beedie's work.

Could you take a look at #621, which offers even more performance gains but contains a more fundamental change to the internal processing?

aquasync force-pushed the xlsx-nonstandard-ns-prefix branch 2 times, most recently from 28ee3c4 to 55871a8 Compare March 25, 2026 08:05

jmcnamara mentioned this pull request Mar 25, 2026

perf: custom zero-overhead attribute extraction #621

Open

xlsx: ignore namespace prefix when reading relation ids

7fa6766

Previously accepted only "r:" and "relationships:" prefixes. Closes tafia#634

aquasync force-pushed the xlsx-nonstandard-ns-prefix branch from 55871a8 to 7fa6766 Compare March 26, 2026 04:10

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Use namespace-aware XML parsing for workbook.#632

Use namespace-aware XML parsing for workbook.#632
aquasync wants to merge 1 commit into tafia:master from
aquasync:xlsx-nonstandard-ns-prefix

aquasync commented Mar 25, 2026

Uh oh!

aquasync commented Mar 25, 2026

Uh oh!

jmcnamara commented Mar 25, 2026 •

edited

Loading

Uh oh!

aquasync commented Mar 25, 2026

Uh oh!

jmcnamara commented Mar 26, 2026

Uh oh!

aquasync commented Mar 26, 2026

Uh oh!

jmcnamara commented Mar 26, 2026

Uh oh!

jqnatividad commented Mar 26, 2026 •

edited

Loading

Uh oh!

jmcnamara commented Mar 26, 2026 •

edited

Loading

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

Conversation

aquasync commented Mar 25, 2026

Uh oh!

aquasync commented Mar 25, 2026

Uh oh!

jmcnamara commented Mar 25, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

aquasync commented Mar 25, 2026

Uh oh!

jmcnamara commented Mar 26, 2026

Uh oh!

aquasync commented Mar 26, 2026

Uh oh!

jmcnamara commented Mar 26, 2026

Uh oh!

jqnatividad commented Mar 26, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jmcnamara commented Mar 26, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

jmcnamara commented Mar 25, 2026 •

edited

Loading

jqnatividad commented Mar 26, 2026 •

edited

Loading

jmcnamara commented Mar 26, 2026 •

edited

Loading