From ccc5c8ca197c90407e4fbc26a48bee486e02f28d Mon Sep 17 00:00:00 2001 From: woutdenolf Date: Wed, 29 Apr 2026 18:33:43 +0200 Subject: [PATCH 1/2] centralize some XML helper functions --- dev_tools/docs/nxdl.py | 38 +++++++++--------- dev_tools/docs/xsd.py | 5 +-- dev_tools/utils/nxdl_utils.py | 74 ++++++++++++----------------------- dev_tools/utils/xml_utils.py | 40 +++++++++++++++++++ 4 files changed, 86 insertions(+), 71 deletions(-) create mode 100644 dev_tools/utils/xml_utils.py diff --git a/dev_tools/docs/nxdl.py b/dev_tools/docs/nxdl.py index bd8e98ccf2..ea80cca733 100644 --- a/dev_tools/docs/nxdl.py +++ b/dev_tools/docs/nxdl.py @@ -12,9 +12,9 @@ from ..globals.errors import NXDLParseError from ..globals.nxdl import NXDL_NAMESPACE from ..globals.urls import REPO_URL +from ..utils import nxdl_utils +from ..utils import xml_utils from ..utils.github import get_file_contributors_via_api -from ..utils.nxdl_utils import get_inherited_nodes -from ..utils.nxdl_utils import get_rst_formatted_name from ..utils.types import PathLike from .anchor_list import AnchorRegistry @@ -363,8 +363,8 @@ def _get_required_or_optional_text(self, node): :param obj node: instance of lxml.etree._Element :returns: formatted text """ - tag = node.tag.split("}")[-1] - if tag in ("field", "group", "choice"): + nxdl_element_type = nxdl_utils.get_nxdl_element_type(node) + if nxdl_element_type in ("field", "group", "choice"): optional_default = not self._use_application_defaults optional = node.get("optional", optional_default) in (True, "true", "1", 1) recommended = node.get("recommended", None) in (True, "true", "1", 1) @@ -379,7 +379,7 @@ def _get_required_or_optional_text(self, node): # this is unexpected and remarkable # TODO: add a remark to the log optional_text = f"(``minOccurs={str(minOccurs)}``) " - elif tag in ("attribute",): + elif nxdl_element_type in ("attribute",): optional_default = not self._use_application_defaults optional = node.get("optional", optional_default) in (True, 
"true", "1", 1) recommended = node.get("recommended", None) in (True, "true", "1", 1) @@ -387,7 +387,7 @@ def _get_required_or_optional_text(self, node): if recommended: optional_text = "(recommended) " else: - optional_text = "(unknown tag: " + str(tag) + ") " + optional_text = "(unknown tag: " + str(nxdl_element_type) + ") " return optional_text def _analyze_dimensions(self, ns, parent) -> str: @@ -596,7 +596,7 @@ def _print_doc_enum(self, indent, ns, node, required=False): def _print_attribute(self, ns, kind, node, optional, indent, parent_path): name = node.get("name") - formatted_name = get_rst_formatted_name(node) + formatted_name = nxdl_utils.get_rst_formatted_name(node) index_name = name self._print( f"{indent}" f"{self._hyperlink_target(parent_path, name, 'attribute')}" @@ -626,11 +626,11 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path): """ # Process children in document order to preserve XML ordering. for node in parent.xpath("nx:field|nx:group|nx:choice|nx:link", namespaces=ns): - tag = node.tag.split("}")[-1] + nxdl_element_type = nxdl_utils.get_nxdl_element_type(node) - if tag == "field": + if nxdl_element_type == "field": name = node.get("name") - formatted_name = get_rst_formatted_name(node) + formatted_name = nxdl_utils.get_rst_formatted_name(node) index_name = name dims = self._analyze_dimensions(ns, node) @@ -663,9 +663,9 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path): parent_path + "/" + name, ) - elif tag == "group": + elif nxdl_element_type == "group": name = node.get("name", "") - formatted_name = get_rst_formatted_name(node) + formatted_name = nxdl_utils.get_rst_formatted_name(node) typ = node.get("type", "untyped (this is an error; please report)") optional_text = self._get_required_or_optional_text(node) @@ -705,7 +705,7 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path): parent_path + "/" + name, ) - elif tag == "choice": + elif nxdl_element_type == "choice": name = 
node.get("name", "") hTarget = self._hyperlink_target(parent_path, name, "choice") self._print(f"{indent}{hTarget}") @@ -746,9 +746,9 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path): parent_path + "/" + name + "/" + subname, ) - elif tag == "link": + elif nxdl_element_type == "link": name = node.get("name") - formatted_name = get_rst_formatted_name(node) + formatted_name = nxdl_utils.get_rst_formatted_name(node) self._print( f"{indent}{self._hyperlink_target(parent_path, name, 'link')}" ) @@ -761,7 +761,7 @@ def _print_full_tree(self, ns, parent, name, indent, parent_path): self._print_doc_enum(indent, ns, node) else: - raise ValueError(f"Unknown node type: {tag}") + raise ValueError(f"Unknown node type: {nxdl_element_type}") def _print(self, *args, end="\n"): # TODO: change instances of \t to proper indentation @@ -772,15 +772,13 @@ def get_first_parent_ref(self, path, tag): path = path[path.find("/", 1) :] try: - parents = get_inherited_nodes(path, nx_name)[2] + parents = nxdl_utils.get_inherited_nodes(path, nx_name)[2] except FileNotFoundError: return "" if len(parents) > 1: for parent in parents: # iterate back and check tag matches - if not parent.tag.endswith(tag) and not parent.tag.endswith( - "definition" - ): + if xml_utils.get_local_name(parent) not in (tag, "definition"): print( f"Warning: {path} has a mismatching inherited node - {parent.tag} cf {tag}" ) diff --git a/dev_tools/docs/xsd.py b/dev_tools/docs/xsd.py index 88c2e0fe98..2b07375101 100644 --- a/dev_tools/docs/xsd.py +++ b/dev_tools/docs/xsd.py @@ -7,6 +7,7 @@ from ..globals import directories from ..globals.errors import NXDLParseError from ..globals.nxdl import XSD_NAMESPACE +from ..utils import xml_utils from ..utils.types import PathLike @@ -116,9 +117,7 @@ def general_handler(self, parent=None, indentLevel=0): if parent_name is None: return - simple_tag = parent.tag[ - parent.tag.find("}") + 1 : - ] # cut off the namespace identifier + simple_tag = 
xml_utils.get_local_name(parent) # ... name = parent_name # + ' data type' diff --git a/dev_tools/utils/nxdl_utils.py b/dev_tools/utils/nxdl_utils.py index d0256fbe27..429fa73c64 100644 --- a/dev_tools/utils/nxdl_utils.py +++ b/dev_tools/utils/nxdl_utils.py @@ -13,6 +13,8 @@ import lxml.etree as ET from lxml.etree import ParseError as xmlER +from . import xml_utils + def decode_or_not(elem, encoding: str = "utf-8", decode: bool = True): """ @@ -52,10 +54,9 @@ def decode_or_not(elem, encoding: str = "utf-8", decode: bool = True): return elem -def remove_namespace_from_tag(tag): - """Helper function to remove the namespace from an XML tag.""" - - return tag.split("}")[-1] +def get_nxdl_element_type(element): + type = xml_utils.get_local_name(element) + return "field" if type == "link" else type class NxdlAttributeNotFoundError(Exception): @@ -87,20 +88,13 @@ def get_app_defs_names(): files = sorted(glob(str(app_def_path_glob))) for nexus_file in sorted(glob(str(contrib_def_path_glob))): - root = get_xml_root(nexus_file) + root = xml_utils.read_xml_file(nexus_file) if root.attrib["category"] == "application": files.append(nexus_file) return [Path(file).name[:-9] for file in files] + ["NXroot"] -@lru_cache(maxsize=None) -def get_xml_root(file_path): - """Reducing I/O time by caching technique""" - - return ET.parse(file_path).getroot() - - def get_hdf_root(hdf_node): """Get the root HDF5 node""" node = hdf_node @@ -243,7 +237,7 @@ def get_nx_classes(): nx_class = [] for nexus_file in base_classes + applications + contributed: try: - root = get_xml_root(nexus_file) + root = xml_utils.read_xml_file(nexus_file) except xmlER as e: raise ValueError(f"Getting an issue while parsing file {nexus_file}") from e if root.attrib["category"] == "base": @@ -254,7 +248,7 @@ def get_nx_classes(): def get_nx_units(): """Read unit kinds from the NeXus definition/nxdlTypes.xsd file""" filepath = nexus_def_path / "nxdlTypes.xsd" - root = get_xml_root(filepath) + root = 
xml_utils.read_xml_file(filepath) units_and_type_list = [] for child in root: units_and_type_list.extend(child.attrib.values()) @@ -275,7 +269,7 @@ def get_nx_attribute_type(): """Read attribute types from the NeXus definition/nxdlTypes.xsd file""" filepath = nexus_def_path / "nxdlTypes.xsd" - root = get_xml_root(filepath) + root = xml_utils.read_xml_file(filepath) units_and_type_list = [] for child in root: units_and_type_list.extend(child.attrib.values()) @@ -321,7 +315,7 @@ def is_name_type(child, name_type_value: str) -> bool: return True if name_type_value == "any" and ( - get_local_name_from_xml(child) == "group" + get_nxdl_element_type(child) == "group" and "nameType" not in child.attrib and "name" not in child.attrib ): @@ -355,7 +349,7 @@ def belongs_to(nxdl_elem, child, name, class_type=None, hdf_name=None): if not isinstance(child2.tag, str): continue if ( - get_local_name_from_xml(child) != get_local_name_from_xml(child2) + get_nxdl_element_type(child) != get_nxdl_element_type(child2) or get_node_name(child2) == act_htmlname ): continue @@ -376,35 +370,29 @@ def belongs_to(nxdl_elem, child, name, class_type=None, hdf_name=None): return False -def get_local_name_from_xml(element): - """Helper function to extract the element tag without the namespace.""" - type = remove_namespace_from_tag(element.tag) - return "field" if type == "link" else type - - def get_own_nxdl_child_reserved_elements(child, name, nxdl_elem): """checking reserved elements, like doc, enumeration""" - local_name = get_local_name_from_xml(child) - if local_name == "doc" and name == "doc": + local_type = get_nxdl_element_type(child) + if local_type == "doc" and name == "doc": return set_nxdlpath(child, nxdl_elem, tag_name=name) - if local_name == "enumeration" and name == "enumeration": + if local_type == "enumeration" and name == "enumeration": return set_nxdlpath(child, nxdl_elem, tag_name=name) return False def get_own_nxdl_child_base_types(child, class_type, nxdl_elem, name, 
hdf_name): """checking base types of group, field, attribute""" - if get_local_name_from_xml(child) == "group": + if get_nxdl_element_type(child) == "group": if ( class_type is None or (class_type and get_nx_class(child) == class_type) ) and belongs_to(nxdl_elem, child, name, class_type, hdf_name): return set_nxdlpath(child, nxdl_elem) - if get_local_name_from_xml(child) == "field" and belongs_to( + if get_nxdl_element_type(child) == "field" and belongs_to( nxdl_elem, child, name, None, hdf_name ): return set_nxdlpath(child, nxdl_elem) - if get_local_name_from_xml(child) == "attribute" and belongs_to( + if get_nxdl_element_type(child) == "attribute" and belongs_to( nxdl_elem, child, name, None, hdf_name ): return set_nxdlpath(child, nxdl_elem) @@ -424,7 +412,7 @@ def get_own_nxdl_child( result = get_own_nxdl_child_reserved_elements(child, name, nxdl_elem) if result is not False: return result - if nexus_type and get_local_name_from_xml(child) != nexus_type: + if nexus_type and get_nxdl_element_type(child) != nexus_type: continue result = get_own_nxdl_child_base_types( child, class_type, nxdl_elem, name, hdf_name @@ -470,7 +458,7 @@ def get_nxdl_child( bc_filename = find_definition_file(bc_name) if not bc_filename: raise ValueError("nxdl file not found in definitions folder!") - bc_obj = ET.parse(bc_filename).getroot() + bc_obj = xml_utils.read_xml_file(bc_filename) bc_obj.set("nxdlbase", bc_filename) if "category" in bc_obj.attrib: bc_obj.set("nxdlbase_class", bc_obj.attrib["category"]) @@ -692,11 +680,6 @@ def print_doc(node, ntype, level, nxhtml, nxpath): print(wrapper.fill(par)) -def get_namespace(element): - """Extracts the namespace for elements in the NXDL""" - return element.tag[element.tag.index("{") : element.tag.rindex("}") + 1] - - def get_enums(node: ET._Element) -> Optional[List[str]]: """ Makes list of enumerations, if node contains any. 
@@ -709,7 +692,7 @@ def get_enums(node: ET._Element) -> Optional[List[str]]: Returns a list of the enumeration values if an enumeration was found. If no enumeration was found it returns None. """ - namespace = get_namespace(node) + namespace = f"{{{xml_utils.get_namespace(node)}}}"  # xml_utils returns the bare URI; findall() needs Clark notation "{uri}tag" enums = [] for enumeration in node.findall(f"{namespace}enumeration"): for item in enumeration.findall(f"{namespace}item"): @@ -736,12 +719,7 @@ def add_base_classes(elist, nx_name=None, elem: ET.Element = None): if nxdl_file_path is None: nxdl_file_path = f"{nx_name}.nxdl.xml" - try: - elem = ET.parse(os.path.abspath(nxdl_file_path)).getroot() - # elem = ET.parse(nxdl_file_path).getroot() - except OSError: - with open(nxdl_file_path, "r") as f: - elem = ET.parse(f).getroot() + elem = xml_utils.read_xml_file(nxdl_file_path) if not isinstance(nxdl_file_path, str): nxdl_file_path = str(nxdl_file_path) @@ -781,7 +759,7 @@ def get_direct_child(nxdl_elem, html_name): for child in nxdl_elem: if not isinstance(child.tag, str): continue - if get_local_name_from_xml(child) in ( + if get_nxdl_element_type(child) in ( "group", "field", "attribute", @@ -798,7 +776,7 @@ def get_field_child(nxdl_elem, html_name): for child in nxdl_elem: if not isinstance(child.tag, str): continue - if get_local_name_from_xml(child) != "field": + if get_nxdl_element_type(child) != "field": continue if get_node_name(child) == html_name: data_child = set_nxdlpath(child, nxdl_elem) @@ -853,7 +831,7 @@ def get_best_child(nxdl_elem, hdf_node, hdf_name, hdf_class_name, nexus_type): if not isinstance(child.tag, str): continue fit = -2 - if get_local_name_from_xml(child) == nexus_type and ( + if get_nxdl_element_type(child) == nexus_type and ( nexus_type != "group" or get_nx_class(child) == hdf_class_name ): name_any = is_name_type(child, "any") @@ -885,7 +863,7 @@ def walk_elist(elist, html_name): None, html_name, get_nx_class(main_child), - get_local_name_from_xml(main_child), + get_nxdl_element_type(main_child), ) if fitting_child 
is not None: child = fitting_child @@ -973,7 +951,7 @@ def get_rst_formatted_name(node): name = node.get("name", "") nameType = node.get("nameType", "") - node_type = get_local_name_from_xml(node) + node_type = get_nxdl_element_type(node) if not name and node_type == "group": # Derive the name from the type without the NX prefix diff --git a/dev_tools/utils/xml_utils.py b/dev_tools/utils/xml_utils.py new file mode 100644 index 0000000000..4bb29709b1 --- /dev/null +++ b/dev_tools/utils/xml_utils.py @@ -0,0 +1,40 @@ +"""NXDL agnostic XML helper functions.""" + +from functools import lru_cache +from pathlib import Path + +import lxml.etree as ET + + +def read_xml_file(file_path: Path | str) -> ET.Element: + """Read XML file with caching.""" + normalized_path = Path(file_path).resolve() + return _read_xml_file(normalized_path) + + +@lru_cache(maxsize=None) +def _read_xml_file(normalized_path: Path) -> ET.Element: + try: + return ET.parse(normalized_path).getroot() + except OSError: + # Not sure this is still necessary + with open(normalized_path, "r") as f: + return ET.parse(f).getroot() + + +def get_local_name(element: ET.Element) -> str: + """ + Return the local XML tag name of an element (without its namespace). + + '{http://example.org/ns}field' -> 'field' + """ + return ET.QName(element).localname + + +def get_namespace(element: ET.Element) -> str: + """ + Return the namespace URI of an XML element. 
+ + '{http://example.org/ns}field' -> 'http://example.org/ns' + """ + return ET.QName(element).namespace From a168a5ddb1a8553027dcb14f4f9e5cfb36dc5698 Mon Sep 17 00:00:00 2001 From: woutdenolf Date: Thu, 21 May 2026 21:39:44 +0200 Subject: [PATCH 2/2] clarify OSError handling when reading XML files --- dev_tools/utils/xml_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev_tools/utils/xml_utils.py b/dev_tools/utils/xml_utils.py index 4bb29709b1..0a4f00a3a5 100644 --- a/dev_tools/utils/xml_utils.py +++ b/dev_tools/utils/xml_utils.py @@ -17,8 +17,8 @@ def _read_xml_file(normalized_path: Path) -> ET.Element: try: return ET.parse(normalized_path).getroot() except OSError: - # Not sure this is still necessary - with open(normalized_path, "r") as f: + # libxml2 failed to open the file directly; retry with Python's open(). + with open(normalized_path, "rb") as f: return ET.parse(f).getroot()