diff --git a/transport_data/__init__.py b/transport_data/__init__.py
index 1dd32f4..784d46b 100644
--- a/transport_data/__init__.py
+++ b/transport_data/__init__.py
@@ -37,4 +37,5 @@
     "oica",
     "org",
     "other",
+    "tumi",
 )
diff --git a/transport_data/tumi/__init__.py b/transport_data/tumi/__init__.py
new file mode 100644
index 0000000..15fa152
--- /dev/null
+++ b/transport_data/tumi/__init__.py
@@ -0,0 +1,6 @@
+from transport_data import hook
+
+
+@hook
+def cli_modules() -> str:
+    return f"{__name__}.cli"
diff --git a/transport_data/tumi/cli.py b/transport_data/tumi/cli.py
new file mode 100644
index 0000000..23509bc
--- /dev/null
+++ b/transport_data/tumi/cli.py
@@ -0,0 +1,101 @@
+from enum import Enum, auto
+from itertools import chain
+from pathlib import Path
+
+import click
+
+
+@click.group("tumi", short_help="TUMI provider.")
+def main() -> None:
+    """Transforming Urban Mobility Initiative (TUMI) provider."""
+
+
+@main.command("import")
+@click.argument(
+    "path",
+    type=click.Path(
+        exists=True, file_okay=False, dir_okay=True, readable=True, path_type=Path
+    ),
+)
+def import_(path: Path) -> None:
+    """Import data from an export.
+
+    The tool expects a single argument with the path of a directory. The directory MUST
+    contain the following:
+
+    \b
+    datasets.jsonlines
+    ckan/
+      resources/
+        000/
+          000/
+            00-a1b2-c3d4-e5f6-a7b8c9d0e1f2
+    """
+    import json
+
+    from transport_data.util.ckan import Package
+
+    # Required contents of the export directory
+    datasets_path = path.joinpath("datasets.jsonlines")
+    resources_path = path.joinpath("ckan", "resources")
+
+    # Report a CLI usage error rather than `assert`, which is stripped by `python -O`
+    for required in (datasets_path, resources_path):
+        if not required.exists():
+            raise click.UsageError(f"Required path does not exist: {required}")
+
+    # Read datasets.jsonlines, convert to a list of Package objects. Blank lines (for
+    # instance a trailing one) are skipped instead of failing in json.loads().
+    with open(datasets_path, encoding="utf-8") as f:
+        packages = [Package(data=json.loads(line)) for line in f if line.strip()]
+
+    # Set of all paths to resource files. Entries matched to a package are discarded
+    # below, so that only unmatched files remain at the end.
+    resources = set(
+        chain.from_iterable(
+            map(d.joinpath, names) for d, _, names in resources_path.walk()
+        )
+    )
+
+    print(f"Read {len(packages)} packages from {datasets_path}")
+    print(f"{len(resources)} total resource files in {resources_path}")
+
+    class STATE(Enum):
+        """State of Resources associated with a Package."""
+
+        NO_RESOURCES = auto()
+        ALL_PRESENT_OR_URL = auto()
+        ALL_MISSING = auto()
+        MIXED = auto()
+
+    # Group packages according to the state of their resources
+    packages1: dict[STATE, list[Package]] = {}
+    for p in packages:
+        # One entry per resource: whether its expected local file (if any) exists
+        present = []
+        for r in p.resources:
+            if r.has_local_file:
+                local_path = r.local_path(resources_path)
+                present.append(local_path.exists())
+                # discard(), not remove(): the path of a missing file is not in the
+                # set, and this must be recorded as missing instead of raising
+                resources.discard(local_path)
+            else:
+                # No local file is expected (e.g. the resource is only a URL)
+                present.append(True)
+
+        if not present:
+            key = STATE.NO_RESOURCES
+        elif all(present):
+            key = STATE.ALL_PRESENT_OR_URL
+        elif not any(present):
+            key = STATE.ALL_MISSING
+        else:
+            key = STATE.MIXED
+
+        packages1.setdefault(key, []).append(p)
+
+    for k, v in packages1.items():
+        print(f"\n{len(v)} packages with state {k!r}")
+
+    print(f"{len(resources)} resources not matched to packages")
diff --git a/transport_data/util/ckan.py b/transport_data/util/ckan.py
index 84a6ac6..1e1bdeb 100644
--- a/transport_data/util/ckan.py
+++ b/transport_data/util/ckan.py
@@ -205,10 +205,18 @@ class Resource(ModelProxy):
     """
 
     # Type hints
+    format: str
     hash: str
+    mimetype: str | None
     name: str
-    size: int
+    size: int | None
     url: str
+    url_type: str | None
+
+    @property
+    def has_local_file(self) -> bool:
+        """:any:`True` if the Resource **should** have a local file."""
+        return self.url_type == "upload"
 
     def fetch(self, max_size: int = 10_000_000) -> Path:
         """Fetch the resource file and cache it locally.
@@ -269,6 +277,26 @@ def fetch(self, max_size: int = 10_000_000) -> Path:
 
         return target
 
+    def local_path(self, base_path: Path | None = None) -> Path:
+        """Return a path in the CKAN file system layout.
+
+        The first 3 characters of the resource ID give a directory within `base_path`,
+        the next 3 a sub-directory, and the remaining characters the file name.
+
+        If `base_path` is not given, the "resource" directory within the configured
+        cache path is used. :class:`ValueError` is raised if the Resource has no ID.
+        """
+        if not self.id:
+            # Explicit check instead of `assert`, which is stripped by `python -O`
+            raise ValueError("Resource has no ID; cannot construct a local path")
+
+        if base_path is None:
+            from transport_data import CONFIG
+
+            base_path = CONFIG.cache_path.joinpath("resource")
+
+        return base_path.joinpath(self.id[:3], self.id[3:6], self.id[6:])
+
 
 class Tag(ModelProxy):
     """Proxy for the CKAN 'Tag' model.