From ccc5c8ca197c90407e4fbc26a48bee486e02f28d Mon Sep 17 00:00:00 2001
From: woutdenolf <woutdenolf@users.sf.net>
Date: Wed, 29 Apr 2026 18:33:43 +0200
Subject: [PATCH 1/2] centralize some XML helper functions

---
 dev_tools/docs/nxdl.py        | 38 +++++++++---------
 dev_tools/docs/xsd.py         |  5 +--
 dev_tools/utils/nxdl_utils.py | 74 ++++++++++++-----------------------
 dev_tools/utils/xml_utils.py  | 40 +++++++++++++++++++
 4 files changed, 86 insertions(+), 71 deletions(-)
 create mode 100644 dev_tools/utils/xml_utils.py

diff --git a/dev_tools/docs/nxdl.py b/dev_tools/docs/nxdl.py
index bd8e98ccf2..ea80cca733 100644
--- a/dev_tools/docs/nxdl.py
+++ b/dev_tools/docs/nxdl.py
@@ -12,9 +12,9 @@
 from ..globals.errors import NXDLParseError
 from ..globals.nxdl import NXDL_NAMESPACE
 from ..globals.urls import REPO_URL
+from ..utils import nxdl_utils
+from ..utils import xml_utils
 from ..utils.github import get_file_contributors_via_api
-from ..utils.nxdl_utils import get_inherited_nodes
-from ..utils.nxdl_utils import get_rst_formatted_name
 from ..utils.types import PathLike
 from .anchor_list import AnchorRegistry
 
@@ -363,8 +363,8 @@ def _get_required_or_optional_text(self, node):
         :param obj node: instance of lxml.etree._Element
         :returns: formatted text
         """
-        tag = node.tag.split("}")[-1]
-        if tag in ("field", "group", "choice"):
+        nxdl_element_type = nxdl_utils.get_nxdl_element_type(node)
+        if nxdl_element_type in ("field", "group", "choice"):
             optional_default = not self._use_application_defaults
             optional = node.get("optional", optional_default) in (True, "true", "1", 1)
             recommended = node.get("recommended", None) in (True, "true", "1", 1)
@@ -379,7 +379,7 @@ def _get_required_or_optional_text(self, node):
                 # this is unexpected and remarkable
                 # TODO: add a remark to the log
                 optional_text = f"(``minOccurs={str(minOccurs)}``) "
-        elif tag in ("attribute",):
+        elif nxdl_element_type in ("attribute",):
             optional_default = not self._use_application_defaults
             optional = node.get("optional", optional_default) in (True, "true", "1", 1)
             recommended = node.get("recommended", None) in (True, "true", "1", 1)
@@ -387,7 +387,7 @@ def _get_required_or_optional_text(self, node):
             if recommended:
                 optional_text = "(recommended) "
         else:
-            optional_text = "(unknown tag: " + str(tag) + ") "
+            optional_text = "(unknown tag: " + str(nxdl_element_type) + ") "
         return optional_text
 
     def _analyze_dimensions(self, ns, parent) -> str:
@@ -596,7 +596,7 @@ def _print_doc_enum(self, indent, ns, node, required=False):
 
     def _print_attribute(self, ns, kind, node, optional, indent, parent_path):
         name = node.get("name")
-        formatted_name = get_rst_formatted_name(node)
+        formatted_name = nxdl_utils.get_rst_formatted_name(node)
         index_name = name
         self._print(
             f"{indent}" f"{self._hyperlink_target(parent_path, name, 'attribute')}"
@@ -626,11 +626,11 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path):
         """
         # Process children in document order to preserve XML ordering.
         for node in parent.xpath("nx:field|nx:group|nx:choice|nx:link", namespaces=ns):
-            tag = node.tag.split("}")[-1]
+            nxdl_element_type = xml_utils.get_local_name(node)  # keep "link" distinct
 
-            if tag == "field":
+            if nxdl_element_type == "field":
                 name = node.get("name")
-                formatted_name = get_rst_formatted_name(node)
+                formatted_name = nxdl_utils.get_rst_formatted_name(node)
                 index_name = name
                 dims = self._analyze_dimensions(ns, node)
 
@@ -663,9 +663,9 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path):
                         parent_path + "/" + name,
                     )
 
-            elif tag == "group":
+            elif nxdl_element_type == "group":
                 name = node.get("name", "")
-                formatted_name = get_rst_formatted_name(node)
+                formatted_name = nxdl_utils.get_rst_formatted_name(node)
                 typ = node.get("type", "untyped (this is an error; please report)")
 
                 optional_text = self._get_required_or_optional_text(node)
@@ -705,7 +705,7 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path):
                     parent_path + "/" + name,
                 )
 
-            elif tag == "choice":
+            elif nxdl_element_type == "choice":
                 name = node.get("name", "")
                 hTarget = self._hyperlink_target(parent_path, name, "choice")
                 self._print(f"{indent}{hTarget}")
@@ -746,9 +746,9 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path):
                         parent_path + "/" + name + "/" + subname,
                     )
 
-            elif tag == "link":
+            elif nxdl_element_type == "link":
                 name = node.get("name")
-                formatted_name = get_rst_formatted_name(node)
+                formatted_name = nxdl_utils.get_rst_formatted_name(node)
                 self._print(
                     f"{indent}{self._hyperlink_target(parent_path, name, 'link')}"
                 )
@@ -761,7 +761,7 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path):
                 self._print_doc_enum(indent, ns, node)
 
             else:
-                raise ValueError(f"Unknown node type: {tag}")
+                raise ValueError(f"Unknown node type: {nxdl_element_type}")
 
     def _print(self, *args, end="\n"):
         # TODO: change instances of \t to proper indentation
@@ -772,15 +772,13 @@ def get_first_parent_ref(self, path, tag):
         path = path[path.find("/", 1) :]
 
         try:
-            parents = get_inherited_nodes(path, nx_name)[2]
+            parents = nxdl_utils.get_inherited_nodes(path, nx_name)[2]
         except FileNotFoundError:
             return ""
         if len(parents) > 1:
             for parent in parents:
                 # iterate back and check tag matches
-                if not parent.tag.endswith(tag) and not parent.tag.endswith(
-                    "definition"
-                ):
+                if xml_utils.get_local_name(parent) not in (tag, "definition"):
                     print(
                         f"Warning: {path} has a mismatching inherited node - {parent.tag} cf {tag}"
                     )
diff --git a/dev_tools/docs/xsd.py b/dev_tools/docs/xsd.py
index 88c2e0fe98..2b07375101 100644
--- a/dev_tools/docs/xsd.py
+++ b/dev_tools/docs/xsd.py
@@ -7,6 +7,7 @@
 from ..globals import directories
 from ..globals.errors import NXDLParseError
 from ..globals.nxdl import XSD_NAMESPACE
+from ..utils import xml_utils
 from ..utils.types import PathLike
 
 
@@ -116,9 +117,7 @@ def general_handler(self, parent=None, indentLevel=0):
         if parent_name is None:
             return
 
-        simple_tag = parent.tag[
-            parent.tag.find("}") + 1 :
-        ]  # cut off the namespace identifier
+        simple_tag = xml_utils.get_local_name(parent)
 
         # <varlistentry> ...
         name = parent_name  # + ' data type'
diff --git a/dev_tools/utils/nxdl_utils.py b/dev_tools/utils/nxdl_utils.py
index d0256fbe27..429fa73c64 100644
--- a/dev_tools/utils/nxdl_utils.py
+++ b/dev_tools/utils/nxdl_utils.py
@@ -13,6 +13,8 @@
 import lxml.etree as ET
 from lxml.etree import ParseError as xmlER
 
+from . import xml_utils
+
 
 def decode_or_not(elem, encoding: str = "utf-8", decode: bool = True):
     """
@@ -52,10 +54,9 @@ def decode_or_not(elem, encoding: str = "utf-8", decode: bool = True):
     return elem
 
 
-def remove_namespace_from_tag(tag):
-    """Helper function to remove the namespace from an XML tag."""
-
-    return tag.split("}")[-1]
+def get_nxdl_element_type(element):
+    local_name = xml_utils.get_local_name(element)  # local tag, no namespace
+    return "field" if local_name == "link" else local_name  # links count as fields
 
 
 class NxdlAttributeNotFoundError(Exception):
@@ -87,20 +88,13 @@ def get_app_defs_names():
 
     files = sorted(glob(str(app_def_path_glob)))
     for nexus_file in sorted(glob(str(contrib_def_path_glob))):
-        root = get_xml_root(nexus_file)
+        root = xml_utils.read_xml_file(nexus_file)
         if root.attrib["category"] == "application":
             files.append(nexus_file)
 
     return [Path(file).name[:-9] for file in files] + ["NXroot"]
 
 
-@lru_cache(maxsize=None)
-def get_xml_root(file_path):
-    """Reducing I/O time by caching technique"""
-
-    return ET.parse(file_path).getroot()
-
-
 def get_hdf_root(hdf_node):
     """Get the root HDF5 node"""
     node = hdf_node
@@ -243,7 +237,7 @@ def get_nx_classes():
     nx_class = []
     for nexus_file in base_classes + applications + contributed:
         try:
-            root = get_xml_root(nexus_file)
+            root = xml_utils.read_xml_file(nexus_file)
         except xmlER as e:
             raise ValueError(f"Getting an issue while parsing file {nexus_file}") from e
         if root.attrib["category"] == "base":
@@ -254,7 +248,7 @@ def get_nx_classes():
 def get_nx_units():
     """Read unit kinds from the NeXus definition/nxdlTypes.xsd file"""
     filepath = nexus_def_path / "nxdlTypes.xsd"
-    root = get_xml_root(filepath)
+    root = xml_utils.read_xml_file(filepath)
     units_and_type_list = []
     for child in root:
         units_and_type_list.extend(child.attrib.values())
@@ -275,7 +269,7 @@ def get_nx_attribute_type():
     """Read attribute types from the NeXus definition/nxdlTypes.xsd file"""
     filepath = nexus_def_path / "nxdlTypes.xsd"
 
-    root = get_xml_root(filepath)
+    root = xml_utils.read_xml_file(filepath)
     units_and_type_list = []
     for child in root:
         units_and_type_list.extend(child.attrib.values())
@@ -321,7 +315,7 @@ def is_name_type(child, name_type_value: str) -> bool:
         return True
 
     if name_type_value == "any" and (
-        get_local_name_from_xml(child) == "group"
+        get_nxdl_element_type(child) == "group"
         and "nameType" not in child.attrib
         and "name" not in child.attrib
     ):
@@ -355,7 +349,7 @@ def belongs_to(nxdl_elem, child, name, class_type=None, hdf_name=None):
             if not isinstance(child2.tag, str):
                 continue
             if (
-                get_local_name_from_xml(child) != get_local_name_from_xml(child2)
+                get_nxdl_element_type(child) != get_nxdl_element_type(child2)
                 or get_node_name(child2) == act_htmlname
             ):
                 continue
@@ -376,35 +370,29 @@ def belongs_to(nxdl_elem, child, name, class_type=None, hdf_name=None):
     return False
 
 
-def get_local_name_from_xml(element):
-    """Helper function to extract the element tag without the namespace."""
-    type = remove_namespace_from_tag(element.tag)
-    return "field" if type == "link" else type
-
-
 def get_own_nxdl_child_reserved_elements(child, name, nxdl_elem):
     """checking reserved elements, like doc, enumeration"""
-    local_name = get_local_name_from_xml(child)
-    if local_name == "doc" and name == "doc":
+    local_type = get_nxdl_element_type(child)
+    if local_type == "doc" and name == "doc":
         return set_nxdlpath(child, nxdl_elem, tag_name=name)
 
-    if local_name == "enumeration" and name == "enumeration":
+    if local_type == "enumeration" and name == "enumeration":
         return set_nxdlpath(child, nxdl_elem, tag_name=name)
     return False
 
 
 def get_own_nxdl_child_base_types(child, class_type, nxdl_elem, name, hdf_name):
     """checking base types of group, field, attribute"""
-    if get_local_name_from_xml(child) == "group":
+    if get_nxdl_element_type(child) == "group":
         if (
             class_type is None or (class_type and get_nx_class(child) == class_type)
         ) and belongs_to(nxdl_elem, child, name, class_type, hdf_name):
             return set_nxdlpath(child, nxdl_elem)
-    if get_local_name_from_xml(child) == "field" and belongs_to(
+    if get_nxdl_element_type(child) == "field" and belongs_to(
         nxdl_elem, child, name, None, hdf_name
     ):
         return set_nxdlpath(child, nxdl_elem)
-    if get_local_name_from_xml(child) == "attribute" and belongs_to(
+    if get_nxdl_element_type(child) == "attribute" and belongs_to(
         nxdl_elem, child, name, None, hdf_name
     ):
         return set_nxdlpath(child, nxdl_elem)
@@ -424,7 +412,7 @@ def get_own_nxdl_child(
         result = get_own_nxdl_child_reserved_elements(child, name, nxdl_elem)
         if result is not False:
             return result
-        if nexus_type and get_local_name_from_xml(child) != nexus_type:
+        if nexus_type and get_nxdl_element_type(child) != nexus_type:
             continue
         result = get_own_nxdl_child_base_types(
             child, class_type, nxdl_elem, name, hdf_name
@@ -470,7 +458,7 @@ def get_nxdl_child(
     bc_filename = find_definition_file(bc_name)
     if not bc_filename:
         raise ValueError("nxdl file not found in definitions folder!")
-    bc_obj = ET.parse(bc_filename).getroot()
+    bc_obj = xml_utils.read_xml_file(bc_filename)
     bc_obj.set("nxdlbase", bc_filename)
     if "category" in bc_obj.attrib:
         bc_obj.set("nxdlbase_class", bc_obj.attrib["category"])
@@ -692,11 +680,6 @@ def print_doc(node, ntype, level, nxhtml, nxpath):
             print(wrapper.fill(par))
 
 
-def get_namespace(element):
-    """Extracts the namespace for elements in the NXDL"""
-    return element.tag[element.tag.index("{") : element.tag.rindex("}") + 1]
-
-
 def get_enums(node: ET._Element) -> Optional[List[str]]:
     """
     Makes list of enumerations, if node contains any.
@@ -709,7 +692,7 @@ def get_enums(node: ET._Element) -> Optional[List[str]]:
             Returns a list of the enumeration values if an enumeration was found.
             If no enumeration was found it returns None.
     """
-    namespace = get_namespace(node)
+    namespace = "{" + xml_utils.get_namespace(node) + "}"  # Clark notation: {uri}tag
     enums = []
     for enumeration in node.findall(f"{namespace}enumeration"):
         for item in enumeration.findall(f"{namespace}item"):
@@ -736,12 +719,7 @@ def add_base_classes(elist, nx_name=None, elem: ET.Element = None):
         if nxdl_file_path is None:
             nxdl_file_path = f"{nx_name}.nxdl.xml"
 
-        try:
-            elem = ET.parse(os.path.abspath(nxdl_file_path)).getroot()
-            # elem = ET.parse(nxdl_file_path).getroot()
-        except OSError:
-            with open(nxdl_file_path, "r") as f:
-                elem = ET.parse(f).getroot()
+        elem = xml_utils.read_xml_file(nxdl_file_path)
 
         if not isinstance(nxdl_file_path, str):
             nxdl_file_path = str(nxdl_file_path)
@@ -781,7 +759,7 @@ def get_direct_child(nxdl_elem, html_name):
     for child in nxdl_elem:
         if not isinstance(child.tag, str):
             continue
-        if get_local_name_from_xml(child) in (
+        if get_nxdl_element_type(child) in (
             "group",
             "field",
             "attribute",
@@ -798,7 +776,7 @@ def get_field_child(nxdl_elem, html_name):
     for child in nxdl_elem:
         if not isinstance(child.tag, str):
             continue
-        if get_local_name_from_xml(child) != "field":
+        if get_nxdl_element_type(child) != "field":
             continue
         if get_node_name(child) == html_name:
             data_child = set_nxdlpath(child, nxdl_elem)
@@ -853,7 +831,7 @@ def get_best_child(nxdl_elem, hdf_node, hdf_name, hdf_class_name, nexus_type):
         if not isinstance(child.tag, str):
             continue
         fit = -2
-        if get_local_name_from_xml(child) == nexus_type and (
+        if get_nxdl_element_type(child) == nexus_type and (
             nexus_type != "group" or get_nx_class(child) == hdf_class_name
         ):
             name_any = is_name_type(child, "any")
@@ -885,7 +863,7 @@ def walk_elist(elist, html_name):
                         None,
                         html_name,
                         get_nx_class(main_child),
-                        get_local_name_from_xml(main_child),
+                        get_nxdl_element_type(main_child),
                     )
                     if fitting_child is not None:
                         child = fitting_child
@@ -973,7 +951,7 @@ def get_rst_formatted_name(node):
     name = node.get("name", "")
     nameType = node.get("nameType", "")
 
-    node_type = get_local_name_from_xml(node)
+    node_type = get_nxdl_element_type(node)
 
     if not name and node_type == "group":
         # Derive the name from the type without the NX prefix
diff --git a/dev_tools/utils/xml_utils.py b/dev_tools/utils/xml_utils.py
new file mode 100644
index 0000000000..4bb29709b1
--- /dev/null
+++ b/dev_tools/utils/xml_utils.py
@@ -0,0 +1,40 @@
+"""NXDL agnostic XML helper functions."""
+
+from functools import lru_cache
+from pathlib import Path
+
+import lxml.etree as ET
+
+
+def read_xml_file(file_path: Path | str) -> ET.Element:
+    """Return the root of an XML file; cached, so callers share one element."""
+    normalized_path = Path(file_path).resolve()
+    return _read_xml_file(normalized_path)
+
+
+@lru_cache(maxsize=None)
+def _read_xml_file(normalized_path: Path) -> ET.Element:
+    try:
+        return ET.parse(normalized_path).getroot()
+    except OSError:
+        # Not sure this is still necessary
+        with open(normalized_path, "r") as f:
+            return ET.parse(f).getroot()
+
+
+def get_local_name(element: ET.Element) -> str:
+    """
+    Return the local XML tag name of an element (without its namespace).
+
+    '{http://example.org/ns}field' -> 'field'
+    """
+    return ET.QName(element).localname
+
+
+def get_namespace(element: ET.Element) -> str | None:
+    """
+    Return the namespace URI of an XML element, without braces (None if absent).
+
+    '{http://example.org/ns}field' -> 'http://example.org/ns'
+    """
+    return ET.QName(element).namespace

From a168a5ddb1a8553027dcb14f4f9e5cfb36dc5698 Mon Sep 17 00:00:00 2001
From: woutdenolf <woutdenolf@users.sf.net>
Date: Thu, 21 May 2026 21:39:44 +0200
Subject: [PATCH 2/2] clarify OSError handling when reading XML files

---
 dev_tools/utils/xml_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev_tools/utils/xml_utils.py b/dev_tools/utils/xml_utils.py
index 4bb29709b1..0a4f00a3a5 100644
--- a/dev_tools/utils/xml_utils.py
+++ b/dev_tools/utils/xml_utils.py
@@ -17,8 +17,8 @@ def _read_xml_file(normalized_path: Path) -> ET.Element:
     try:
         return ET.parse(normalized_path).getroot()
     except OSError:
-        # Not sure this is still necessary
-        with open(normalized_path, "r") as f:
+        # libxml2 failed to open the file directly; retry with Python's open().
+        with open(normalized_path, "rb") as f:
             return ET.parse(f).getroot()