From faf81ee0d1e814d448a02844f5a15c43448e1966 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Thu, 15 Jan 2026 23:28:03 +0100 Subject: [PATCH 01/17] Work around ckan/ckanapi#218 --- transport_data/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/transport_data/__init__.py b/transport_data/__init__.py index ffad20d..34853c1 100644 --- a/transport_data/__init__.py +++ b/transport_data/__init__.py @@ -1,10 +1,13 @@ import logging import sys +from warnings import filterwarnings from .config import Config from .store import UnionStore from .util.pluggy import register_internal +filterwarnings("ignore", "pkg_resources is deprecated", UserWarning, "ckanapi.version") + log = logging.getLogger(__name__) log.setLevel(logging.INFO) log.addHandler(logging.StreamHandler(sys.stdout)) From 4dddc60ceb41bd0232661f5566269b6c97ce693b Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Thu, 15 Jan 2026 23:29:42 +0100 Subject: [PATCH 02/17] Improve .util.ckan.Package - Convert "resources" collection to instances of Resource. - Adjust ckan_package_to_mdr() to match. - Add portal_url() method. - Add type hints for known members/attributes. --- transport_data/org/ckan.py | 17 +++++++++++++---- transport_data/util/ckan.py | 25 ++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/transport_data/org/ckan.py b/transport_data/org/ckan.py index a636171..8b06fc8 100644 --- a/transport_data/org/ckan.py +++ b/transport_data/org/ckan.py @@ -121,7 +121,7 @@ def get_msd() -> "v21.MetadataStructureDefinition": return msd -def ckan_package_to_mdr(p) -> "v21.MetadataReport": +def ckan_package_to_mdr(package: Package) -> "v21.MetadataReport": """Convert a :class:`.Package` instance to a MetadataReport.""" from sdmx.model import v21 @@ -134,10 +134,19 @@ def ckan_package_to_mdr(p) -> "v21.MetadataReport": for mda in msd.report_structure["ALL"]: av = ONEAV(value_for=mda) if mda.id == "JSON": - av.value = repr(p.asdict()) + # All JSON data + av.value = repr(package.asdict()) else: - value = getattr(p, mda.id) - av.value = value if isinstance(value, str) else repr(value) + value = getattr(package, mda.id) + match value: + case str(): + av.value = value + case list() if len(value) and isinstance(value[0], ModelProxy): + # Restore ModelProxy contents to JSON-like instead of short __repr__ + av.value = repr([obj.asdict() for obj in value]) + case _: + av.value = repr(value) + mdr.metadata.append(av) return mdr diff --git a/transport_data/util/ckan.py b/transport_data/util/ckan.py index bb919f5..86697f4 100644 --- a/transport_data/util/ckan.py +++ b/transport_data/util/ckan.py @@ -94,7 +94,7 @@ def get_item(self, name: str, index: int | None = None): data = self.__dict__[name][index] cls = get_class(name) assert cls - return cls(data) + return data if isinstance(data, cls) else cls(data) def _process_collections(self) -> None: """Convert the :attr:`_collections` to the designated types.""" @@ -174,6 +174,29 @@ class Package(ModelProxy): `_. """ + _collections = { + "resources": (list, "Resource"), + } + + # Type hints + name: str + organization: dict[str, str] + resources: list["Resource"] + tdc_category: str + + def portal_url(self) -> str: + """Infer the TDC Portal URL for the package. + + The URL is not provided by the API, so we construct it with similar logic to + the portal. + """ + return ( + "https://portal.transport-data.org/@" + + self.organization["title"].lower() + + "/" + + self.name + ) + class Resource(ModelProxy): """Proxy for `ckan.model.Resource From cc145349990e9ec7add9818c931922db4dd0bf91 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Thu, 15 Jan 2026 23:35:51 +0100 Subject: [PATCH 03/17] Improve .util.ckan.Resource - Add .fetch() method. - Add type hints for known attributes. --- transport_data/util/ckan.py | 62 +++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/transport_data/util/ckan.py b/transport_data/util/ckan.py index 86697f4..50ab61a 100644 --- a/transport_data/util/ckan.py +++ b/transport_data/util/ckan.py @@ -14,6 +14,7 @@ class that provides conveniences used by other code in :mod:`transport_data`. from functools import partialmethod from importlib.metadata import version from itertools import count +from pathlib import Path from typing import TYPE_CHECKING, ClassVar, TypeVar from warnings import filterwarnings @@ -203,6 +204,67 @@ class Resource(ModelProxy): `_. """ + # Type hints + hash: str + name: str + size: int + url: str + + def fetch(self, max_size: int = 10_000_000) -> Path: + """Fetch the resource file and cache it locally. + + Parameters + ---------- + max_size + Maximum size of file to download. + + Raises + ------ + AssertionError + if the size of the file is equal to or greater than `max_size`. + """ + from hashlib import file_digest + + import requests + + from transport_data import CONFIG + + assert (self.size or 0) <= max_size, ( + f"File size {self.size} >= maxiumum {max_size} B" + ) + + # Identify the target local path. Use directory hierarchy to avoid directories + # with many files. + assert self.id is not None + target = CONFIG.cache_path.joinpath( + "resource", self.id[0], self.id[:2], self.id, self.name + ) + + # Ensure the target directory exists + target.parent.mkdir(parents=True, exist_ok=True) + + file_hash = "" + try: + # Check existence and hash of local file + with open(target, "rb") as fd: + file_hash = file_digest(fd, "md5").hexdigest() + + # Allow that self.hash is empty; don't force download in this case + assert self.hash in ("", file_hash) + except (AssertionError, FileNotFoundError) as e: + # Hash does not match or file does not exist + if isinstance(e, AssertionError): + print( + f"Hash {file_hash} of {target} does not match expected {self.hash};" + " will re-download" + ) + response = requests.get(self.url, stream=True) + with open(target, "wb") as fd: + for chunk in response.iter_content(): + fd.write(chunk) + + return target + class Tag(ModelProxy): """Proxy for the CKAN 'Tag' model. From e4d394ce9bbb6720b64de8cba1c551dcd490382c Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Thu, 15 Jan 2026 23:36:40 +0100 Subject: [PATCH 04/17] Add `tdc check-record` CLI command --- transport_data/cli/__init__.py | 85 ++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/transport_data/cli/__init__.py b/transport_data/cli/__init__.py index f2e6dad..c0898ec 100644 --- a/transport_data/cli/__init__.py +++ b/transport_data/cli/__init__.py @@ -1,12 +1,19 @@ """Command-line interface.""" +from collections import defaultdict +from collections.abc import MutableMapping +from dataclasses import dataclass from importlib import import_module from pathlib import Path +from typing import TYPE_CHECKING, Any import click from transport_data import CONFIG # noqa: F401 +if TYPE_CHECKING: + from transport_data.util.ckan import Package + @click.group("tdc") def main(): @@ -171,3 +178,81 @@ def check(structure_urn: str, path: Path, sheets, verbose, **options): # noqa: print(sdmx.to_pandas(ds)) else: print(sdmx.to_pandas(ds).to_string()) + + +@main.command() +@click.argument("id") +def check_record(id: str) -> None: + """Check record NAME on the TDC.""" + # TODO Use .org.ckan.instance_option + from transport_data.org.ckan import PROD + + # Retrieve the record, converted to an instance of Package + package = PROD.package_show(id) + + # Print general package information + print( + f"{package!r}", + package.portal_url(), + f"Title: {package.title!r}", + f"Category: {package.tdc_category}", + sep="\n- ", + ) + + check_package0(package) + + +SUFFIXES = { + "data": {".xlsx", ".csv"}, +} + + +def check_package0(package: "Package") -> None: + """Print some checks about a `package`.""" + # Convert resource file names to Path instances; count suffixes + files = [] + suffix_count: MutableMapping[str, int] = defaultdict(lambda: 0) + for resource in package.resources: + path = Path(resource.name) + files.append(path) + suffix_count[path.suffix.lower()] += 1 + + @dataclass + class Check: + label: str + value: Any + + def __str__(self) -> str: + return f"{self.label}: {self.value}" + + c0 = Check( + "Number of files by extension", + ", ".join(f"{c} {s}" for s, c in sorted(suffix_count.items())), + ) + c1 = Check( + "Number of data files", + sum(c for s, c in suffix_count.items() if s in SUFFIXES["data"]), + ) + c2 = Check("Number of possible SDMX-CSV files", suffix_count[".csv"]) + checks = [c0, c1, c2] + + lines = [""] + [f"- {check}" for check in checks] + [""] + + lines.append("Criteria for a TDC Formatted record:") + c3 = Check("At least one file in CSV format", c2.value >= 1) + c4 = Check( + "Correct category assigned", + package.tdc_category in {"tdc_formatted", "tdc_harmonized"}, + ) + c5 = Check("CSV file(s) are in SDMX-CSV format (not implemented yet)", True) + c6 = Check("Overall", "YES" if (c3.value and c4.value and c5.value) else "NO") + + lines.extend(f"- {check}" for check in (c3, c4, c5, c6)) + + lines.extend(["", "Criteria for a TDC Harmonized record—all of the above, plus:"]) + c7 = Check("Correct category assigned", package.tdc_category == "tdc_harmonized") + c8 = Check("Overall", "YES" if c7.value else "NO") + + lines.extend(f"- {check}" for check in (c7, c8)) + + print(*lines, sep="\n") From cce0e2240535861c3a95964cd44f9224de54b628 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 16 Jan 2026 11:42:06 +0100 Subject: [PATCH 05/17] Add cli_modules() plugin hook - Use in existing modules. --- transport_data/ipcc/__init__.py | 6 +++--- transport_data/other/__init__.py | 4 ++-- transport_data/util/hooks.py | 12 +++++++++++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/transport_data/ipcc/__init__.py b/transport_data/ipcc/__init__.py index 2183019..b88ce6c 100644 --- a/transport_data/ipcc/__init__.py +++ b/transport_data/ipcc/__init__.py @@ -1,9 +1,9 @@ """Intergovernmental Panel on Climate Change metadata provider.""" -from transport_data.util.pluggy import hookimpl +from transport_data import hook -@hookimpl +@hook def get_agencies(): """Return the IPCC :class:`.Agency`.""" from sdmx.model import common @@ -16,7 +16,7 @@ def get_agencies(): return (a,) -@hookimpl +@hook def provides(): return ( "Codelist=TDCI:CL_IPCC_2006_V2_T3.1.1", diff --git a/transport_data/other/__init__.py b/transport_data/other/__init__.py index ba634ea..b3a8f46 100644 --- a/transport_data/other/__init__.py +++ b/transport_data/other/__init__.py @@ -1,9 +1,9 @@ """Other data providers.""" -from transport_data.util.pluggy import hookimpl +from transport_data import hook -@hookimpl +@hook def get_agencies(): from sdmx.model.common import Agency, Contact diff --git a/transport_data/util/hooks.py b/transport_data/util/hooks.py index e23aed6..a9a883f 100644 --- a/transport_data/util/hooks.py +++ b/transport_data/util/hooks.py @@ -7,9 +7,19 @@ if TYPE_CHECKING: import sdmx.model.v21 + hookspec = pluggy.HookspecMarker("transport_data") +@hookspec +def cli_modules() -> str | Iterable[str]: + """Return the fully-qualified name(s) of (a) module(s) with :mod:`click` commands. + + The module(s) **must** contain a :class:`click.Group` or command named :py:`main`. + """ + raise NotImplementedError + + @hookspec def get_agencies() -> Iterable["sdmx.model.v21.Agency"]: """Return :class:`sdmx.model.common.Agency` identifying (meta)data provider(s). @@ -21,5 +31,5 @@ def get_agencies() -> Iterable["sdmx.model.v21.Agency"]: @hookspec def provides() -> Iterable[str]: - """Return the URNs of SDMX artefacts available from a module.""" + """Return 0 or more URNs of SDMX artefacts available from a module.""" raise NotImplementedError From 6e241fcdfe16491d9c4f3f0b52cb1531b2d6cd0b Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 16 Jan 2026 11:42:31 +0100 Subject: [PATCH 06/17] Add transport_data.hook as a top-level item --- transport_data/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/transport_data/__init__.py b/transport_data/__init__.py index 34853c1..47300b9 100644 --- a/transport_data/__init__.py +++ b/transport_data/__init__.py @@ -4,8 +4,15 @@ from .config import Config from .store import UnionStore +from .util.pluggy import hookimpl as hook from .util.pluggy import register_internal +__all__ = [ + "CONFIG", + "STORE", + "hook", +] + filterwarnings("ignore", "pkg_resources is deprecated", UserWarning, "ckanapi.version") log = logging.getLogger(__name__) From 02b63808c03ee473241ad241d7efef02ad822c03 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 16 Jan 2026 11:46:10 +0100 Subject: [PATCH 07/17] Add cli_modules() hooks to existing modules - Adjust test. --- transport_data/ato/__init__.py | 11 ++++++++--- transport_data/estat/__init__.py | 8 +++++++- transport_data/iamc/__init__.py | 11 ++++++++--- transport_data/iso/__init__.py | 9 +++++++-- transport_data/itdp/__init__.py | 9 +++++++-- transport_data/jrc/__init__.py | 12 ++++++++---- transport_data/oica/__init__.py | 11 ++++++++--- transport_data/org/__init__.py | 12 +++++++++--- transport_data/tests/util/test_pluggy.py | 4 ++-- 9 files changed, 64 insertions(+), 23 deletions(-) diff --git a/transport_data/ato/__init__.py b/transport_data/ato/__init__.py index 2f0cb07..620778c 100644 --- a/transport_data/ato/__init__.py +++ b/transport_data/ato/__init__.py @@ -12,7 +12,7 @@ import sdmx.model.v21 as m from sdmx.model import common, v21 -from transport_data.util.pluggy import hookimpl +from transport_data import hook from transport_data.util.pooch import Pooch from transport_data.util.sdmx import anno_generated @@ -373,7 +373,12 @@ def format_data_provider(value: str) -> str: return value + "—republished by ATO" -@hookimpl +@hook +def cli_modules(): + return f"{__name__}.cli" + + +@hook def get_agencies(): a = m.Agency( id="ATO", @@ -392,7 +397,7 @@ def get_agencies(): return (a,) -@hookimpl +@hook def provides(): return ( "Codelist=TDCI:CL_ATO_ECONOMY", diff --git a/transport_data/estat/__init__.py b/transport_data/estat/__init__.py index 0b87e0a..10747c2 100644 --- a/transport_data/estat/__init__.py +++ b/transport_data/estat/__init__.py @@ -12,7 +12,13 @@ import click import sdmx -from transport_data import STORE +from transport_data import STORE, hook + + +@hook +def cli_modules(): + return __name__ + # General functions diff --git a/transport_data/iamc/__init__.py b/transport_data/iamc/__init__.py index b0776b3..eb4708a 100644 --- a/transport_data/iamc/__init__.py +++ b/transport_data/iamc/__init__.py @@ -10,12 +10,17 @@ import sdmx.model.v21 as m from sdmx.message import StructureMessage -from transport_data.util.pluggy import hookimpl +from transport_data import hook log = logging.getLogger(__name__) -@hookimpl +@hook +def cli_modules(): + return f"{__name__}.cli" + + +@hook def get_agencies(): a = m.Agency( id="IAMC", @@ -25,7 +30,7 @@ def get_agencies(): return (a,) -@hookimpl +@hook def provides(): return ("ConceptScheme=TDCI:CS_IAMC",) diff --git a/transport_data/iso/__init__.py b/transport_data/iso/__init__.py index f2ab05c..3ea0bda 100644 --- a/transport_data/iso/__init__.py +++ b/transport_data/iso/__init__.py @@ -6,7 +6,7 @@ from sdmx.model import common, v21 -from transport_data.util.pluggy import hookimpl +from transport_data import hook from transport_data.util.pycountry import LOCALIZABLE, get_database, load_translations if TYPE_CHECKING: @@ -16,7 +16,12 @@ log = logging.getLogger(__name__) -@hookimpl +@hook +def cli_modules(): + return f"{__name__}.cli" + + +@hook def get_agencies(): """Return the ``ISO`` :class:`~.sdmx.model.common.Agency`.""" a = common.Agency( diff --git a/transport_data/itdp/__init__.py b/transport_data/itdp/__init__.py index 6c9f0f5..a7258e5 100644 --- a/transport_data/itdp/__init__.py +++ b/transport_data/itdp/__init__.py @@ -1,9 +1,14 @@ """Institute for Transport & Development Policy (ITDP) provider.""" -from transport_data.util.pluggy import hookimpl +from transport_data import hook -@hookimpl +@hook +def cli_modules(): + return f"{__name__}.cli" + + +@hook def get_agencies(): from sdmx.model import common diff --git a/transport_data/jrc/__init__.py b/transport_data/jrc/__init__.py index defb394..8375ee9 100644 --- a/transport_data/jrc/__init__.py +++ b/transport_data/jrc/__init__.py @@ -22,13 +22,17 @@ import pandas as pd import sdmx.model.v21 as m -from transport_data import STORE -from transport_data.util.pluggy import hookimpl +from transport_data import STORE, hook from transport_data.util.pooch import Pooch from transport_data.util.sdmx import anno_generated -@hookimpl +@hook +def cli_modules(): + return f"{__name__}.cli" + + +@hook def get_agencies(): """Return information about the agency providing the data set. @@ -50,7 +54,7 @@ def get_agencies(): return (a,) -@hookimpl +@hook def provides(): return ("ConceptScheme=TDCI:CS_JRC_MEASURE",) diff --git a/transport_data/oica/__init__.py b/transport_data/oica/__init__.py index 498cf87..6d95ed6 100644 --- a/transport_data/oica/__init__.py +++ b/transport_data/oica/__init__.py @@ -20,7 +20,7 @@ import pandas as pd -from transport_data.util.pluggy import hookimpl +from transport_data import hook from transport_data.util.pooch import Pooch if TYPE_CHECKING: @@ -329,7 +329,12 @@ def _make_code(value: str): return id_for_name -@hookimpl +@hook +def cli_modules(): + return f"{__name__}.cli" + + +@hook def get_agencies(): """Return the OICA Agency.""" from sdmx.model import v21 @@ -342,7 +347,7 @@ def get_agencies(): return (a,) -@hookimpl +@hook def provides(): return ( "Codelist=TDCI:CL_OICA_GEO", diff --git a/transport_data/org/__init__.py b/transport_data/org/__init__.py index 4a5266e..94af42b 100644 --- a/transport_data/org/__init__.py +++ b/transport_data/org/__init__.py @@ -6,13 +6,19 @@ import sdmx.model.v21 as m -from transport_data.util.pluggy import hookimpl, pm +from transport_data import hook +from transport_data.util.pluggy import pm if TYPE_CHECKING: import sdmx.model.v21 -@hookimpl +@hook +def cli_modules(): + return f"{__name__}.cli" + + +@hook def get_agencies() -> "sdmx.model.v21.Agency": """Return agencies and organizations including and subsidiary to TDCI itself.""" # Agency @@ -48,7 +54,7 @@ def get_agencies() -> "sdmx.model.v21.Agency": return a1, a2 -@hookimpl +@hook def provides(): return ("AgencyScheme=TDCI:TDCI",) diff --git a/transport_data/tests/util/test_pluggy.py b/transport_data/tests/util/test_pluggy.py index b332d0b..6820ee7 100644 --- a/transport_data/tests/util/test_pluggy.py +++ b/transport_data/tests/util/test_pluggy.py @@ -1,4 +1,4 @@ -def test_plugin_manager(): +def test_plugin_manager() -> None: from transport_data.util.pluggy import pm - assert 9 == len(pm.list_name_plugin()) + assert 10 == len(pm.list_name_plugin()) From 666980ea242f40136e99cef26ff5c3eab689c68e Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 16 Jan 2026 11:47:37 +0100 Subject: [PATCH 08/17] Add CLI commands from cli_modules() hooks - Reduce MODULES_WITH_CLI to internal/non-provider modules. --- transport_data/__init__.py | 1 + transport_data/cli/__init__.py | 26 +++++++++----------------- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/transport_data/__init__.py b/transport_data/__init__.py index 47300b9..1dd32f4 100644 --- a/transport_data/__init__.py +++ b/transport_data/__init__.py @@ -28,6 +28,7 @@ # Register plugin hooks register_internal( "ato", + "estat", "iamc", "ipcc", "iso", diff --git a/transport_data/cli/__init__.py b/transport_data/cli/__init__.py index c0898ec..b40804d 100644 --- a/transport_data/cli/__init__.py +++ b/transport_data/cli/__init__.py @@ -4,12 +4,14 @@ from collections.abc import MutableMapping from dataclasses import dataclass from importlib import import_module +from itertools import chain from pathlib import Path from typing import TYPE_CHECKING, Any import click from transport_data import CONFIG # noqa: F401 +from transport_data.util.pluggy import pm if TYPE_CHECKING: from transport_data.util.ckan import Package @@ -23,27 +25,17 @@ def main(): #: List of (sub)modules that define CLI (sub)commands. Each should contain a #: @click.command() named "main". MODULES_WITH_CLI = [ - "ato.cli", - "config", - "cli.interactive", - "estat", - "iamc.cli", - "iso.cli", - "itdp.cli", - "jrc.cli", - "oica.cli", - "org.cli", - "org.ckan", - "proto.cli", - "store", - "testing.cli", + "transport_data.config", + "transport_data.cli.interactive", + "transport_data.org.ckan", + "transport_data.proto.cli", + "transport_data.store", + "transport_data.testing.cli", ] - # Add commands from each module that defines them -for name in MODULES_WITH_CLI: +for full_name in chain(MODULES_WITH_CLI, pm.hook.cli_modules()): try: - full_name = f"transport_data.{name}" module = import_module(full_name) except ImportError as e: print(f"{full_name} commands not available: {e.args[0]}") From 83d996199e49a1535fde51ffcf7d320a8de1fa98 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Fri, 16 Jan 2026 14:13:18 +0100 Subject: [PATCH 09/17] Separate check- CLI commands into distinct modules - Adjust tests. - Type hint tests. --- transport_data/cli/__init__.py | 217 +------------------------ transport_data/cli/check_file.py | 134 +++++++++++++++ transport_data/cli/check_record.py | 88 ++++++++++ transport_data/testing/__init__.py | 6 +- transport_data/tests/test_cli.py | 36 ++-- transport_data/tests/util/test_ckan.py | 2 +- 6 files changed, 252 insertions(+), 231 deletions(-) create mode 100644 transport_data/cli/check_file.py create mode 100644 transport_data/cli/check_record.py diff --git a/transport_data/cli/__init__.py b/transport_data/cli/__init__.py index b40804d..4caf8a6 100644 --- a/transport_data/cli/__init__.py +++ b/transport_data/cli/__init__.py @@ -1,21 +1,13 @@ """Command-line interface.""" -from collections import defaultdict -from collections.abc import MutableMapping -from dataclasses import dataclass from importlib import import_module from itertools import chain -from pathlib import Path -from typing import TYPE_CHECKING, Any import click from transport_data import CONFIG # noqa: F401 from transport_data.util.pluggy import pm -if TYPE_CHECKING: - from transport_data.util.ckan import Package - @click.group("tdc") def main(): @@ -27,6 +19,8 @@ def main(): MODULES_WITH_CLI = [ "transport_data.config", "transport_data.cli.interactive", + "transport_data.cli.check_file", + "transport_data.cli.check_record", "transport_data.org.ckan", "transport_data.proto.cli", "transport_data.store", @@ -41,210 +35,3 @@ def main(): print(f"{full_name} commands not available: {e.args[0]}") else: main.add_command(getattr(module, "main")) - - -@main.command() -@click.argument("structure_urn", metavar="URN") -@click.argument( - "path", metavar="FILE", type=click.Path(exists=True, dir_okay=False, path_type=Path) -) -@click.option("--sheets", help="Sheet(s) in .xlsx FILE to check.") -@click.option("-v", "--verbose", count=True, help="Increase verbosity.") -@click.option("--structure", help="Value for STRUCTURE field.") -@click.option("--structure-id", "structure_id", help="Value for STRUCTURE_ID field.") -@click.option("--action", default="I", help="Value for ACTION field.") -def check(structure_urn: str, path: Path, sheets, verbose, **options): # noqa: C901 - """Check that FILE can be read as SDMX-CSV. - - URN is the shortened SDMX URN of a data flow or data structure definition that - describes the data in FILE, for example "Dataflow=PROVIDER:EXAMPLE(1.2.3)" (the - version is not required). This artefact must already be present in the local store. - - FILE may have a ".csv" or ".xlsx" suffix. In the latter case, it is converted to a - temporary set of CSV files. If --sheets are given, only these worksheets are - converted and checked. - - If not given, --structure and --structure-id are inferred from URN. - """ - from traceback import format_exception - - import sdmx - import sdmx.urn - from sdmx.model import common - - from transport_data import STORE - from transport_data.util.sdmx import read_csv - - # Pieces of any error message - message = [] - - # Handle `structure_urn`: retrieve a data structure that describes the data - try: - structure = STORE.get(structure_urn) - except Exception: - message.append(f"Structure {structure_urn!r} could not be loaded") - structure = structure_cls = structure_id = None - else: - structure_cls = type(structure).__name__.lower().replace("definition", "") - structure_id = sdmx.urn.shorten(structure.urn).split("=")[-1] - - if isinstance(structure, common.BaseDataflow): - # Also retrieve the data structure definition - STORE.resolve(structure, "structure") - assert len(structure.structure.dimensions) - - # Construct keyword arguments for CSVAdapter - # TODO Check if this works for full SDMX-CSV - adapt = { - "structure": options.pop("structure") or structure_cls, - "structure_id": options.pop("structure_id") or structure_id, - "action": options.pop("action"), - } - - # Handle `path`; construct a sequence of (label, path) of CSV files to be processed - label_path = [] - - if path.suffix == ".csv": - label_path.append((f"File: {path}", path)) - elif path.suffix == ".xlsx": - # Explode an Excel file into one or more CSV files in a temporary directory - import pandas as pd - from platformdirs import user_cache_path - - # Create a cache directory - cache_dir = user_cache_path("transport-data").joinpath("check") - cache_dir.mkdir(parents=True, exist_ok=True) - - # Explode Excel file into one CSV file per sheet - ef = pd.ExcelFile(path) - _sheets = set(sheets.split(",")) if sheets else set(ef.sheet_names) - for sheet_name in filter(_sheets.__contains__, ef.sheet_names): - # Construct a temporary path - label_path.append( - ( - f"File: {path}\nSheet: {sheet_name}", - cache_dir.joinpath(f"{path.stem}_xlsx_{sheet_name}.csv"), - ) - ) - # Read the sheet from the ExcelFile and write to a CSV file - pd.read_excel(ef, sheet_name).to_csv(label_path[-1][1], index=False) - ef.close() - else: - raise click.UsageError(f"Unsupported file extension: {path.suffix!r}") - - # Process `label_path` - for label, p in label_path: - print(f"\n{label}") - - # Read the file into an SDMX data message - try: - dm = read_csv(p, structure, adapt) - except Exception as e: - message.append(f"read failed with\n{type(e).__name__}: {' '.join(e.args)}") - - if len(e.args) and "line 1" in e.args[0]: - message.append( - "Hint: try giving --structure= or --structure-id argument(s) to " - "adapt to SDMX-CSV." - ) - elif structure is None: - pass - else: # pragma: no cover - message.append("\n".join(format_exception(e))) - - print("") - raise click.ClickException("\n\n".join(message)) - - # Show the contents of the data message - dfd_urn = sdmx.urn.shorten(sdmx.urn.make(dm.dataflow)) - print(f"\n{len(dm.data)} data set(s) in: {dfd_urn!s}") - - # Show information about each data set - for i, ds in enumerate(dm.data): - print(f"\nData set {i}: action={ds.action}") - - # Show the data set contents or summary, according to verbosity - if verbose == 0: - print(f"{len(ds)} observations") - elif verbose == 1: - print(sdmx.to_pandas(ds)) - else: - print(sdmx.to_pandas(ds).to_string()) - - -@main.command() -@click.argument("id") -def check_record(id: str) -> None: - """Check record NAME on the TDC.""" - # TODO Use .org.ckan.instance_option - from transport_data.org.ckan import PROD - - # Retrieve the record, converted to an instance of Package - package = PROD.package_show(id) - - # Print general package information - print( - f"{package!r}", - package.portal_url(), - f"Title: {package.title!r}", - f"Category: {package.tdc_category}", - sep="\n- ", - ) - - check_package0(package) - - -SUFFIXES = { - "data": {".xlsx", ".csv"}, -} - - -def check_package0(package: "Package") -> None: - """Print some checks about a `package`.""" - # Convert resource file names to Path instances; count suffixes - files = [] - suffix_count: MutableMapping[str, int] = defaultdict(lambda: 0) - for resource in package.resources: - path = Path(resource.name) - files.append(path) - suffix_count[path.suffix.lower()] += 1 - - @dataclass - class Check: - label: str - value: Any - - def __str__(self) -> str: - return f"{self.label}: {self.value}" - - c0 = Check( - "Number of files by extension", - ", ".join(f"{c} {s}" for s, c in sorted(suffix_count.items())), - ) - c1 = Check( - "Number of data files", - sum(c for s, c in suffix_count.items() if s in SUFFIXES["data"]), - ) - c2 = Check("Number of possible SDMX-CSV files", suffix_count[".csv"]) - checks = [c0, c1, c2] - - lines = [""] + [f"- {check}" for check in checks] + [""] - - lines.append("Criteria for a TDC Formatted record:") - c3 = Check("At least one file in CSV format", c2.value >= 1) - c4 = Check( - "Correct category assigned", - package.tdc_category in {"tdc_formatted", "tdc_harmonized"}, - ) - c5 = Check("CSV file(s) are in SDMX-CSV format (not implemented yet)", True) - c6 = Check("Overall", "YES" if (c3.value and c4.value and c5.value) else "NO") - - lines.extend(f"- {check}" for check in (c3, c4, c5, c6)) - - lines.extend(["", "Criteria for a TDC Harmonized record—all of the above, plus:"]) - c7 = Check("Correct category assigned", package.tdc_category == "tdc_harmonized") - c8 = Check("Overall", "YES" if c7.value else "NO") - - lines.extend(f"- {check}" for check in (c7, c8)) - - print(*lines, sep="\n") diff --git a/transport_data/cli/check_file.py b/transport_data/cli/check_file.py new file mode 100644 index 0000000..5ebd89d --- /dev/null +++ b/transport_data/cli/check_file.py @@ -0,0 +1,134 @@ +from pathlib import Path + +import click + +from transport_data import CONFIG # noqa: F401 + + +@click.command("check-file") +@click.argument("structure_urn", metavar="URN") +@click.argument( + "path", metavar="FILE", type=click.Path(exists=True, dir_okay=False, path_type=Path) +) +@click.option("--sheets", help="Sheet(s) in .xlsx FILE to check.") +@click.option("-v", "--verbose", count=True, help="Increase verbosity.") +@click.option("--structure", help="Value for STRUCTURE field.") +@click.option("--structure-id", "structure_id", help="Value for STRUCTURE_ID field.") +@click.option("--action", default="I", help="Value for ACTION field.") +def main(structure_urn: str, path: Path, sheets, verbose, **options): # noqa: C901 + """Check that FILE can be read as SDMX-CSV. + + URN is the shortened SDMX URN of a data flow or data structure definition that + describes the data in FILE, for example "Dataflow=PROVIDER:EXAMPLE(1.2.3)" (the + version is not required). This artefact must already be present in the local store. + + FILE may have a ".csv" or ".xlsx" suffix. In the latter case, it is converted to a + temporary set of CSV files. If --sheets are given, only these worksheets are + converted and checked. + + If not given, --structure and --structure-id are inferred from URN. + """ + from traceback import format_exception + + import sdmx + import sdmx.urn + from sdmx.model import common + + from transport_data import STORE + from transport_data.util.sdmx import read_csv + + # Pieces of any error message + message = [] + + # Handle `structure_urn`: retrieve a data structure that describes the data + try: + structure = STORE.get(structure_urn) + except Exception: + message.append(f"Structure {structure_urn!r} could not be loaded") + structure = structure_cls = structure_id = None + else: + structure_cls = type(structure).__name__.lower().replace("definition", "") + structure_id = sdmx.urn.shorten(structure.urn).split("=")[-1] + + if isinstance(structure, common.BaseDataflow): + # Also retrieve the data structure definition + STORE.resolve(structure, "structure") + assert len(structure.structure.dimensions) + + # Construct keyword arguments for CSVAdapter + # TODO Check if this works for full SDMX-CSV + adapt = { + "structure": options.pop("structure") or structure_cls, + "structure_id": options.pop("structure_id") or structure_id, + "action": options.pop("action"), + } + + # Handle `path`; construct a sequence of (label, path) of CSV files to be processed + label_path = [] + + if path.suffix == ".csv": + label_path.append((f"File: {path}", path)) + elif path.suffix == ".xlsx": + # Explode an Excel file into one or more CSV files in a temporary directory + import pandas as pd + from platformdirs import user_cache_path + + # Create a cache directory + cache_dir = user_cache_path("transport-data").joinpath("check") + cache_dir.mkdir(parents=True, exist_ok=True) + + # Explode Excel file into one CSV file per sheet + ef = pd.ExcelFile(path) + _sheets = set(sheets.split(",")) if sheets else set(ef.sheet_names) + for sheet_name in filter(_sheets.__contains__, ef.sheet_names): + # Construct a temporary path + label_path.append( + ( + f"File: {path}\nSheet: {sheet_name}", + cache_dir.joinpath(f"{path.stem}_xlsx_{sheet_name}.csv"), + ) + ) + # Read the sheet from the ExcelFile and write to a CSV file + pd.read_excel(ef, sheet_name).to_csv(label_path[-1][1], index=False) + ef.close() + else: + raise click.UsageError(f"Unsupported file extension: {path.suffix!r}") + + # Process `label_path` + for label, p in label_path: + print(f"\n{label}") + + # Read the file into an SDMX data message + try: + dm = read_csv(p, structure, adapt) + except Exception as e: + message.append(f"read failed with\n{type(e).__name__}: {' '.join(e.args)}") + + if len(e.args) and "line 1" in e.args[0]: + message.append( + "Hint: try giving --structure= or --structure-id argument(s) to " + "adapt to SDMX-CSV." + ) + elif structure is None: + pass + else: # pragma: no cover + message.append("\n".join(format_exception(e))) + + print("") + raise click.ClickException("\n\n".join(message)) + + # Show the contents of the data message + dfd_urn = sdmx.urn.shorten(sdmx.urn.make(dm.dataflow)) + print(f"\n{len(dm.data)} data set(s) in: {dfd_urn!s}") + + # Show information about each data set + for i, ds in enumerate(dm.data): + print(f"\nData set {i}: action={ds.action}") + + # Show the data set contents or summary, according to verbosity + if verbose == 0: + print(f"{len(ds)} observations") + elif verbose == 1: + print(sdmx.to_pandas(ds)) + else: + print(sdmx.to_pandas(ds).to_string()) diff --git a/transport_data/cli/check_record.py b/transport_data/cli/check_record.py new file mode 100644 index 0000000..25771e7 --- /dev/null +++ b/transport_data/cli/check_record.py @@ -0,0 +1,88 @@ +from collections import defaultdict +from collections.abc import MutableMapping +from dataclasses import dataclass +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import click + +if TYPE_CHECKING: + from transport_data.util.ckan import Package + + +@click.command("check-record") +@click.argument("id") +def main(id: str) -> None: + """Check record NAME on the TDC.""" + # TODO Use .org.ckan.instance_option + from transport_data.org.ckan import PROD + + # Retrieve the record, converted to an instance of Package + package = PROD.package_show(id) + + # Print general package information + print( + f"{package!r}", + package.portal_url(), + f"Title: {package.title!r}", + f"Category: {package.tdc_category}", + sep="\n- ", + ) + + check_package0(package) + + +SUFFIXES = { + "data": {".xlsx", ".csv"}, +} + + +def check_package0(package: "Package") -> None: + """Print some checks about a `package`.""" + # Convert resource file names to Path instances; count suffixes + files = [] + suffix_count: MutableMapping[str, int] = defaultdict(lambda: 0) + for resource in package.resources: + path = Path(resource.name) + files.append(path) + suffix_count[path.suffix.lower()] += 1 + + @dataclass + class Check: + label: str + value: Any + + def __str__(self) -> str: + return f"{self.label}: {self.value}" + + c0 = Check( + "Number of files by extension", + ", ".join(f"{c} {s}" for s, c in sorted(suffix_count.items())), + ) + c1 = Check( + "Number of data files", + sum(c for s, c in suffix_count.items() if s in SUFFIXES["data"]), + ) + c2 = Check("Number of possible SDMX-CSV files", suffix_count[".csv"]) + checks = [c0, c1, c2] + + lines = [""] + [f"- {check}" for check in checks] + [""] + + lines.append("Criteria for a TDC Formatted record:") + c3 = Check("At least one file in CSV format", c2.value >= 1) + c4 = Check( + "Correct category assigned", + package.tdc_category in {"tdc_formatted", "tdc_harmonized"}, + ) + c5 = Check("CSV file(s) are in SDMX-CSV format (not implemented yet)", True) + c6 = Check("Overall", "YES" if (c3.value and c4.value and c5.value) else "NO") + + lines.extend(f"- {check}" for check in (c3, c4, c5, c6)) + + lines.extend(["", "Criteria for a TDC Harmonized record—all of the above, plus:"]) + c7 = Check("Correct category assigned", package.tdc_category == "tdc_harmonized") + c8 = Check("Overall", "YES" if c7.value else "NO") + + lines.extend(f"- {check}" for check in (c7, c8)) + + print(*lines, sep="\n") diff --git a/transport_data/testing/__init__.py b/transport_data/testing/__init__.py index e97b5a1..eab6181 100644 --- a/transport_data/testing/__init__.py +++ b/transport_data/testing/__init__.py @@ -3,7 +3,7 @@ import platform import re import zipfile -from collections.abc import Generator, Iterator +from collections.abc import Iterator from typing import TYPE_CHECKING, cast import click.testing @@ -170,7 +170,7 @@ def test_data_path() -> Iterator["Traversable"]: @pytest.fixture(scope="session") -def tmp_config(tmp_path_factory) -> Generator[Config, None, None]: +def tmp_config(tmp_path_factory) -> Iterator[Config]: """A :class:`.Config` instance pointing to a temporary directory.""" from platformdirs import user_data_path @@ -189,7 +189,7 @@ def tmp_config(tmp_path_factory) -> Generator[Config, None, None]: @pytest.fixture(scope="session") -def tmp_store(tmp_config) -> Generator[UnionStore, None, None]: +def tmp_store(tmp_config) -> Iterator[UnionStore]: """A :class`.UnionStore` in a temporary directory per :func:`.tmp_config`.""" result = UnionStore(tmp_config) diff --git a/transport_data/tests/test_cli.py b/transport_data/tests/test_cli.py index 2d681bd..282cb98 100644 --- a/transport_data/tests/test_cli.py +++ b/transport_data/tests/test_cli.py @@ -1,11 +1,13 @@ import re +from pathlib import Path import pytest from prompt_toolkit.input.ansi_escape_sequences import REVERSE_ANSI_SEQUENCES from prompt_toolkit.keys import Keys from transport_data.cli.interactive import Editor -from transport_data.testing import ember_dfd +from transport_data.store import UnionStore +from transport_data.testing import CliRunner, ember_dfd @pytest.mark.parametrize( @@ -14,7 +16,8 @@ ("--help",), ("ato", "--help"), ("ato", "fetch", "--all"), - ("check", "--help"), + ("check-file", "--help"), + ("check-record", "--help"), ("config", "--help"), ("estat", "--help"), ("estat", "fetch", "--help"), @@ -28,7 +31,7 @@ ("store", "--help"), ), ) -def test_cli(tdc_cli, command): +def test_cli(tdc_cli: CliRunner, command: tuple[str, ...]) -> None: tdc_cli.invoke(command) @@ -41,12 +44,14 @@ def test_cli(tdc_cli, command): ] -def test_check0(tdc_cli, test_data_path, tmp_store): +def test_check_file0( + tdc_cli: CliRunner, test_data_path: Path, tmp_store: UnionStore +) -> None: """Check a successful read of a .xlsx file.""" ember_dfd(tmp_store) path = test_data_path.joinpath("read-csv-2.xlsx") - result = tdc_cli.invoke(["check"] + CHECK_ARGS + [str(path)]) + result = tdc_cli.invoke(["check-file"] + CHECK_ARGS + [str(path)]) # Command runs without error assert 0 == result.exit_code, result.output @@ -76,12 +81,19 @@ def test_check0(tdc_cli, test_data_path, tmp_store): (CHECK_ARGS + ["-vv"], 0, ""), # Show pd.DataFrame full string repr ), ) -def test_check1(tdc_cli, test_data_path, tmp_store, args, exit_code, text): +def test_check_file1( + tdc_cli: CliRunner, + test_data_path: Path, + tmp_store: UnionStore, + args: list[str], + exit_code: int, + text: str, +) -> None: """Check various other argument combinations.""" ember_dfd(tmp_store) path = test_data_path.joinpath("read-csv-1.csv") - result = tdc_cli.invoke(["check"] + args + [str(path)]) + result = tdc_cli.invoke(["check-file"] + args + [str(path)]) # Command gives the expected exit code assert exit_code == result.exit_code, result.output @@ -91,10 +103,10 @@ def test_check1(tdc_cli, test_data_path, tmp_store, args, exit_code, text): assert text in result.output -def test_check2(tdc_cli, tmp_path): +def test_check_file2(tdc_cli: CliRunner, tmp_path: Path) -> None: path = tmp_path.joinpath("foo.txt") path.touch() - result = tdc_cli.invoke(["check", "X", str(path)]) + result = tdc_cli.invoke(["check-file", "X", str(path)]) assert 2 == result.exit_code, result.output assert "Unsupported file extension" in result.output @@ -149,7 +161,7 @@ def _backspace(text: str) -> str: @pytest.mark.timeout(1) -def test_edit2(tmp_store) -> None: +def test_edit2(tmp_store: UnionStore) -> None: # CLI runs and accepts the input without error run_script(SCRIPT_2) @@ -176,7 +188,7 @@ def test_edit2(tmp_store) -> None: @pytest.mark.timeout(1) @pytest.mark.usefixtures("sdmx_structures") -def test_edit4(tmp_store) -> None: +def test_edit4(tmp_store: UnionStore) -> None: # CLI runs and accepts the input without error run_script(SCRIPT_4) @@ -205,7 +217,7 @@ def test_edit4(tmp_store) -> None: @pytest.mark.timeout(1) -def test_edit5(tmp_store) -> None: +def test_edit5(tmp_store: UnionStore) -> None: # CLI runs and accepts the input without error run_script([REVERSE_ANSI_SEQUENCES[Keys.ControlC]]) # Nothing was saved because ControlC was given diff --git a/transport_data/tests/util/test_ckan.py b/transport_data/tests/util/test_ckan.py index 399c7c3..0cefa0c 100644 --- a/transport_data/tests/util/test_ckan.py +++ b/transport_data/tests/util/test_ckan.py @@ -48,7 +48,7 @@ def obj(self, test_data_path) -> Package: def test_asdict(self, obj) -> None: obj.asdict() - def test_get_item(self, obj) -> None: + def test_get_item(self, obj: Package) -> None: g = obj.get_item("groups", 0) assert isinstance(g, Group) From 241cad8ff1ced38a6c061d36f8a4ade1110cd17c Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Mon, 26 Jan 2026 15:53:23 +0100 Subject: [PATCH 10/17] Add .util.sdmx.structure_from_csv() --- transport_data/util/sdmx.py | 74 +++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/transport_data/util/sdmx.py b/transport_data/util/sdmx.py index 97b89e8..bcd07a6 100644 --- a/transport_data/util/sdmx.py +++ b/transport_data/util/sdmx.py @@ -1,6 +1,7 @@ """Utilities for :mod:`sdmx`.""" import io +import logging from dataclasses import fields from datetime import datetime from importlib.metadata import version @@ -33,6 +34,9 @@ class MAKeywords(VAKeywords): maintainer: sdmx.model.common.Agency | None +log = logging.getLogger(__name__) + + class CSVAdapter(io.RawIOBase): """Adapt CSV content from `path` into SDMX-CSV. @@ -195,6 +199,76 @@ def read_csv( ) +def structure_from_csv( + path: "pathlib.Path", +) -> tuple["sdmx.model.v30.Dataflow", dict]: + """Infer a data flow and arguments for :func:`.read_csv` from `path`. + + Returns + ------- + tuple + with 2 elements: + + 1. a :class:`sdmx.model.v30.Dataflow`. + 2. :class:`dict`, a value for the :py:`adapt` argument of :func:`read_csv`. + """ + + import csv + + from sdmx.model import v30 + + from transport_data.org import get_agencyscheme + + # Parse the first line of the file as CSV + with open(path, "r") as f: + reader = csv.reader(f) + row = next(reader) + + dsd = v30.DataStructureDefinition( + id="DS_INFERRED", + description=f"Inferred from the contents of {path}", + maintainer=get_agencyscheme()["TDCI"], + ) + adapt = dict() + + for column, default in ( + ("STRUCTURE", "datastructure"), + ("STRUCTURE_ID", dsd.id), + ("ACTION", "I"), + ): + try: + row.remove(column) + except ValueError: + adapt[column.lower()] = default + + # Assume the measure ID "OBS_VALUE" + index_obs_value = row.index("OBS_VALUE") + dsd.measures.getdefault(id="OBS_VALUE") + + # Preceding columns are dimensions + for dim_id in row[:index_obs_value]: + dsd.dimensions.getdefault(id=dim_id) + + # Following columns are attributes + for attr_id in row[index_obs_value + 1 :]: + dsd.attributes.getdefault(id=attr_id) + + log.info( + f"Inferred structure {dsd} with {len(dsd.dimensions)} dimension(s): " + + " ".join(d.id for d in dsd.dimensions) + ) + + # Construct a dataflow definition matching `dsd` + dfd = v30.Dataflow( + id="DF_INFERRED", + description=dsd.description, + maintainer=dsd.maintainer, + structure=dsd, + ) + + return dfd, adapt + + def fields_to_mda( cls: type, rs: "sdmx.model.v21.ReportStructure", From 781a450745a1c89695b014017d4127972426df4b Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Sun, 1 Feb 2026 15:26:01 +0100 Subject: [PATCH 11/17] Adjust oica.update_registry() per CI failures - Use sub-paths from registry in .is_available() call. - Handle ConnectionError/HTTPSConnectionPool max retries exceeded. This may be caused by repeated queries to incorrect URLs. --- transport_data/oica/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/transport_data/oica/__init__.py b/transport_data/oica/__init__.py index 6d95ed6..6912b73 100644 --- a/transport_data/oica/__init__.py +++ b/transport_data/oica/__init__.py @@ -19,6 +19,7 @@ from typing import TYPE_CHECKING import pandas as pd +from requests.exceptions import ConnectionError from transport_data import hook from transport_data.util.pooch import Pooch @@ -513,11 +514,13 @@ def update_registry() -> None: for dfd, _ in map(get_structures, ["PROD", "SALES", "STOCK"]): for file in filenames_for_dfd(dfd, fetch=False): - filename = file.name + filename = str(file) existing_hash = POOCH.registry.setdefault(filename, None) - if not POOCH.is_available(filename): - # File doesn't exist on the remote + try: + assert POOCH.is_available(filename) + except (AssertionError, ConnectionError): + # File doesn't exist on the remote, or request times out POOCH.registry.pop(filename) continue From 1ac323de83fc3855cf56daffdde526adcfdca6ba Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 28 Apr 2026 19:35:21 +0200 Subject: [PATCH 12/17] Fetch local copy of data files in check-record --- transport_data/cli/check_record.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transport_data/cli/check_record.py b/transport_data/cli/check_record.py index 25771e7..6a92f44 100644 --- a/transport_data/cli/check_record.py +++ b/transport_data/cli/check_record.py @@ -1,7 +1,6 @@ from collections import defaultdict from collections.abc import MutableMapping from dataclasses import dataclass -from pathlib import Path from typing import TYPE_CHECKING, Any import click @@ -43,7 +42,8 @@ def check_package0(package: "Package") -> None: files = [] suffix_count: MutableMapping[str, int] = defaultdict(lambda: 0) for resource in package.resources: - path = Path(resource.name) + # Fetch a local copy of the resource; return its path + path = resource.fetch() files.append(path) suffix_count[path.suffix.lower()] += 1 From 026723b66548b6781ac160e4b023a1b6e83474db Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 28 Apr 2026 19:47:38 +0200 Subject: [PATCH 13/17] Order .util.sdmx alphabetically --- transport_data/util/sdmx.py | 120 ++++++++++++++++++------------------ 1 file changed, 60 insertions(+), 60 deletions(-) diff --git a/transport_data/util/sdmx.py b/transport_data/util/sdmx.py index bcd07a6..d625b44 100644 --- a/transport_data/util/sdmx.py +++ b/transport_data/util/sdmx.py @@ -144,6 +144,66 @@ def anno_generated(obj: "sdmx.model.common.AnnotableArtefact") -> None: ) +def fields_to_mda( + cls: type, + rs: "sdmx.model.v21.ReportStructure", + cs: "sdmx.model.common.ConceptScheme | None" = None, +) -> None: + """Populate `rs` with MetadataAttributes corresponding to dataclass fields of `cls`. + + Examples + -------- + >>> @dataclass + ... class MDSExample: + ... #: Foo + ... #: + ... #: Description of Foo. + ... foo: str + ... + ... bar: int + ... + ... fields_to_mda(MDSExample) + + In this example, two metadata attributes will be added to `rs`: + + 1. With id="foo" and an annotation with id="data-type" and text="". + The concept identity for the metadata attribute will also have id="foo", + name="Foo", and description="Description of Foo." + 2. With id="bar" and an annotation with id="data-type" and text="". + """ + from sdmx.model import common, v21 + + # Assemble info about the dataclass fields of `cls` + field_info = {f.name: (f, "") for f in fields(cls)} + + # Tokenize the source code of `cls` and update `field_info` with the Sphinx-style + # comments that precede each of the fields + # + # Thanks to https://davidism.com/attribute-docstrings/ and + # https://stackoverflow.com/a/7457047 + comments = [] # Accumulate comment tokens + for tok in generate_tokens(StringIO(dedent(getsource(cls))).readline): + if tok.type == COMMENT: + comments.append(tok.string.lstrip("#: ")) # Store + elif tok.type == NAME and tok.string in field_info and len(comments): + # Reached the definition of a field, and there are accumulated comments + field_info[tok.string] = (field_info[tok.string][0], "\n".join(comments)) + comments = [] # Reset + + cs = cs or common.ConceptScheme() + + for id_, (f, docstring) in field_info.items(): + # Split the docstring, if any, to a name and optional description + name, _, desc = docstring.partition("\n\n") + + # Construct the ConceptIdentity and add to `cs` + ci = cs.setdefault(id=id_, name=name or None, description=desc or None) + # Construct the data type annotation + type_anno = v21.Annotation(id="data-type", text={"zxx": repr(f.type)}) + # Add the metadata attribute to the report structure + rs.getdefault(id=id_, concept_identity=ci, annotations=[type_anno]) + + def make_obs( row: "pd.Series", dsd: "sdmx.model.v21.DataStructureDefinition" ) -> "sdmx.model.v21.Observation": @@ -267,63 +327,3 @@ def structure_from_csv( ) return dfd, adapt - - -def fields_to_mda( - cls: type, - rs: "sdmx.model.v21.ReportStructure", - cs: "sdmx.model.common.ConceptScheme | None" = None, -) -> None: - """Populate `rs` with MetadataAttributes corresponding to dataclass fields of `cls`. - - Examples - -------- - >>> @dataclass - ... class MDSExample: - ... #: Foo - ... #: - ... #: Description of Foo. - ... foo: str - ... - ... bar: int - ... - ... fields_to_mda(MDSExample) - - In this example, two metadata attributes will be added to `rs`: - - 1. With id="foo" and an annotation with id="data-type" and text="". - The concept identity for the metadata attribute will also have id="foo", - name="Foo", and description="Description of Foo." - 2. With id="bar" and an annotation with id="data-type" and text="". - """ - from sdmx.model import common, v21 - - # Assemble info about the dataclass fields of `cls` - field_info = {f.name: (f, "") for f in fields(cls)} - - # Tokenize the source code of `cls` and update `field_info` with the Sphinx-style - # comments that precede each of the fields - # - # Thanks to https://davidism.com/attribute-docstrings/ and - # https://stackoverflow.com/a/7457047 - comments = [] # Accumulate comment tokens - for tok in generate_tokens(StringIO(dedent(getsource(cls))).readline): - if tok.type == COMMENT: - comments.append(tok.string.lstrip("#: ")) # Store - elif tok.type == NAME and tok.string in field_info and len(comments): - # Reached the definition of a field, and there are accumulated comments - field_info[tok.string] = (field_info[tok.string][0], "\n".join(comments)) - comments = [] # Reset - - cs = cs or common.ConceptScheme() - - for id_, (f, docstring) in field_info.items(): - # Split the docstring, if any, to a name and optional description - name, _, desc = docstring.partition("\n\n") - - # Construct the ConceptIdentity and add to `cs` - ci = cs.setdefault(id=id_, name=name or None, description=desc or None) - # Construct the data type annotation - type_anno = v21.Annotation(id="data-type", text={"zxx": repr(f.type)}) - # Add the metadata attribute to the report structure - rs.getdefault(id=id_, concept_identity=ci, annotations=[type_anno]) From 755618f61ad54ce86b26f6003b8fcaa61921a2ba Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 28 Apr 2026 20:44:43 +0200 Subject: [PATCH 14/17] Check readable SDMX-CSV in .check_record --- transport_data/cli/check_record.py | 38 +++++++++++++++++++++++------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/transport_data/cli/check_record.py b/transport_data/cli/check_record.py index 6a92f44..d14a2c3 100644 --- a/transport_data/cli/check_record.py +++ b/transport_data/cli/check_record.py @@ -6,6 +6,10 @@ import click if TYPE_CHECKING: + from pathlib import Path + + from sdmx.model.v30 import Dataflow + from transport_data.util.ckan import Package @@ -38,14 +42,28 @@ def main(id: str) -> None: def check_package0(package: "Package") -> None: """Print some checks about a `package`.""" - # Convert resource file names to Path instances; count suffixes - files = [] + from transport_data.util.sdmx import structure_from_csv + + # Mapping from Path instances to args for read_csv() or None + file_args: dict["Path", tuple["Dataflow", dict] | None] = {} + + # Counts of suffixes suffix_count: MutableMapping[str, int] = defaultdict(lambda: 0) for resource in package.resources: # Fetch a local copy of the resource; return its path path = resource.fetch() - files.append(path) - suffix_count[path.suffix.lower()] += 1 + + # Count the file suffix + suffix_lower = path.suffix.lower() + suffix_count[suffix_lower] += 1 + + try: + # Infer a data structure definition from the file + file_args[path] = structure_from_csv(path) + except Exception: + # Not a CSV file or cannot infer a DSD + file_args[path] = None + continue @dataclass class Check: @@ -74,15 +92,19 @@ def __str__(self) -> str: "Correct category assigned", package.tdc_category in {"tdc_formatted", "tdc_harmonized"}, ) - c5 = Check("CSV file(s) are in SDMX-CSV format (not implemented yet)", True) + c5 = Check("≥1 CSV file is in SDMX-CSV format", any(map(bool, file_args.values()))) c6 = Check("Overall", "YES" if (c3.value and c4.value and c5.value) else "NO") lines.extend(f"- {check}" for check in (c3, c4, c5, c6)) lines.extend(["", "Criteria for a TDC Harmonized record—all of the above, plus:"]) - c7 = Check("Correct category assigned", package.tdc_category == "tdc_harmonized") - c8 = Check("Overall", "YES" if c7.value else "NO") + c7 = Check( + "Data structure dimension IDs are all in TDCI:CS_CONCEPTS (not implemented yet)", + True, + ) + c8 = Check("Correct category assigned", package.tdc_category == "tdc_harmonized") + c9 = Check("Overall", "YES" if (c7.value and c8.value) else "NO") - lines.extend(f"- {check}" for check in (c7, c8)) + lines.extend(f"- {check}" for check in (c7, c8, c9)) print(*lines, sep="\n") From 1ba4de8aebf2da57e219103951d5c47f061d32cc Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 28 Apr 2026 23:02:59 +0200 Subject: [PATCH 15/17] Test check_package0 --- transport_data/tests/test_cli.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/transport_data/tests/test_cli.py b/transport_data/tests/test_cli.py index 282cb98..50c5ee0 100644 --- a/transport_data/tests/test_cli.py +++ b/transport_data/tests/test_cli.py @@ -5,9 +5,17 @@ from prompt_toolkit.input.ansi_escape_sequences import REVERSE_ANSI_SEQUENCES from prompt_toolkit.keys import Keys +from transport_data.cli.check_record import check_package0 from transport_data.cli.interactive import Editor from transport_data.store import UnionStore from transport_data.testing import CliRunner, ember_dfd +from transport_data.util.ckan import Package + + +@pytest.fixture +def package(test_data_path: Path) -> Package: + """A :class:`.Package` from a test specimen.""" + return Package.from_file(test_data_path.joinpath("ckan", "package.json")) @pytest.mark.parametrize( @@ -112,6 +120,12 @@ def test_check_file2(tdc_cli: CliRunner, tmp_path: Path) -> None: assert "Unsupported file extension" in result.output +def test_check_package0(package: Package) -> None: + # Function runs + check_package0(package) + # TODO extend with further assertions about stdout + + def run_script(lines: list[str]) -> None: """Create a contained instance of :class:`.Editor` and feed it `lines`.""" from prompt_toolkit.application import create_app_session From fc9945adbc7e94e5d50e1afb6852afe4685f9dcb Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 28 Apr 2026 23:16:49 +0200 Subject: [PATCH 16/17] Handle hashlib.file_digest() not in Python 3.10 --- transport_data/util/ckan.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/transport_data/util/ckan.py b/transport_data/util/ckan.py index 50ab61a..84a6ac6 100644 --- a/transport_data/util/ckan.py +++ b/transport_data/util/ckan.py @@ -223,7 +223,6 @@ def fetch(self, max_size: int = 10_000_000) -> Path: AssertionError if the size of the file is equal to or greater than `max_size`. """ - from hashlib import file_digest import requests @@ -245,19 +244,24 @@ def fetch(self, max_size: int = 10_000_000) -> Path: file_hash = "" try: + from hashlib import file_digest + # Check existence and hash of local file with open(target, "rb") as fd: file_hash = file_digest(fd, "md5").hexdigest() # Allow that self.hash is empty; don't force download in this case assert self.hash in ("", file_hash) - except (AssertionError, FileNotFoundError) as e: + except (AssertionError, FileNotFoundError, ImportError) as e: # Hash does not match or file does not exist - if isinstance(e, AssertionError): - print( - f"Hash {file_hash} of {target} does not match expected {self.hash};" - " will re-download" - ) + match e: + case AssertionError(): + print( + f"Hash {file_hash} of {target} does not match expected " + f"{self.hash}; will re-download" + ) + case ImportError(): + print("hashlib.file_digest() not available in Python 3.10") response = requests.get(self.url, stream=True) with open(target, "wb") as fd: for chunk in response.iter_content(): From 438982fbf2803782172e060db83d2264288e6f60 Mon Sep 17 00:00:00 2001 From: Paul Natsuo Kishimoto Date: Tue, 28 Apr 2026 19:46:07 +0200 Subject: [PATCH 17/17] Add #58 to doc/whatsnew --- doc/whatsnew.rst | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst index a7462a8..62f66cc 100644 --- a/doc/whatsnew.rst +++ b/doc/whatsnew.rst @@ -6,12 +6,28 @@ Next release - :mod:`transport_data` supports and is tested with `Pandas 3.0.0 `_, - released 2026-01-21 (:pull:`59`). + released 2026-01-21 (:pull:`59`). - Update for pycountry 26.2.16, released 2026-02-17 (:pull:`61`). -- :program:`tdc` command-line interface warns but does not error - if some modules/commands are not available (:pull:`63`). -- New CLI command :program:`tdc org qr` (:pull:`63`). - New HOWTO :doc:`Get involved ` (:pull:`62`). +- Improvements to the :program:`tdc` command-line interface (CLI): + + - :program:`tdc` warns but does not error + if some modules/commands are not available (:pull:`63`). + - New command :program:`tdc org qr` (:pull:`63`). + - New command :program:`tdc check-record` (:pull:`58`). + - Rename command :program:`tdc check` to :program:`tdc check-file` (:pull:`58`). + +- Improve :mod:`.util.ckan` (:pull:`58`): + + - New method :meth:`.Package.portal_url`. + - New method :meth:`.Resource.fetch` to fetch and cache files. + - Add type hints for commonly-used attributes of :class:`.Package`, :class:`.Resource`. + +- Improve utility code (:pull:`58`): + + - :any:`transport_data.hook` is available as a top-level import for marking hook implementations. + - New hook :func:`~.util.hooks.cli_modules`. + - New function :func:`.sdmx.structure_from_csv`. v26.1.13 ========