Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions transport_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@
"oica",
"org",
"other",
"tumi",
)
6 changes: 6 additions & 0 deletions transport_data/tumi/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from transport_data import hook


@hook
def cli_modules() -> str:
    """Return the dotted name of the ``cli`` submodule of this package."""
    return ".".join((__name__, "cli"))
102 changes: 102 additions & 0 deletions transport_data/tumi/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
from enum import Enum, auto
from itertools import chain
from pathlib import Path

import click


@click.group("tumi", short_help="TUMI provider.")
def main() -> None:
    """Transforming Urban Mobility Initiative (TUMI) provider."""
    # Intentionally has no body: the group only acts as the parent for the
    # sub-commands attached to it with @main.command(...).


@main.command("import")
@click.argument(
    "path",
    type=click.Path(
        exists=True, file_okay=False, dir_okay=True, readable=True, path_type=Path
    ),
)
def import_(path: Path) -> None:
    """Import data from an export.

    The tool expects a single argument with the path of a directory. The directory MUST
    contain the following:

    \b
    datasets.jsonlines
    ckan/
      resources/
        000/
          000/
            00-a1b2-c3d4-e5f6-a7b8c9d0e1f2
    """
    # NOTE: the docstring above is what click displays as the command's help
    # text, so it describes the command for the user rather than the code.
    #
    # The command currently only *reports*: it reads the package records,
    # checks which resource files they refer to exist in the export, groups the
    # packages by the state of their resources, and prints a summary.
    import json

    from transport_data.util.ckan import Package

    # Required contents of the export directory
    datasets_path = path.joinpath("datasets.jsonlines")
    resources_path = path.joinpath("ckan", "resources")

    # Validate explicitly instead of with `assert`, which is stripped when
    # Python runs with -O and gives the user an unhelpful traceback otherwise
    missing = []
    if not datasets_path.is_file():
        missing.append(str(datasets_path))
    if not resources_path.is_dir():
        missing.append(str(resources_path))
    if missing:
        raise click.ClickException(
            f"Required export content not found: {', '.join(missing)}"
        )

    # Read datasets.jsonlines: one JSON document per line, each converted to a
    # Package object. Blank lines (e.g. a trailing newline) are skipped.
    with open(datasets_path, encoding="utf-8") as f:
        packages = [Package(data=json.loads(line)) for line in f if line.strip()]

    # Set of all paths to resource files. A set (rather than a list) gives
    # constant-time removal as resources are matched to packages below.
    resources = set(
        chain.from_iterable(
            map(d.joinpath, names) for d, _, names in resources_path.walk()
        )
    )

    print(f"Read {len(packages)} packages from {datasets_path}")
    print(f"{len(resources)} total resource files in {resources_path}")

    class STATE(Enum):
        """State of Resources associated with a Package."""

        NO_RESOURCES = auto()
        ALL_PRESENT_OR_URL = auto()
        ALL_MISSING = auto()
        MIXED = auto()

    # Group packages according to the state of their resources
    packages_by_state: dict[STATE, list[Package]] = {}
    for package in packages:
        # One entry per resource: True if its file is present or not needed
        present = []
        for resource in package.resources:
            if resource.has_local_file:
                local_path = resource.local_path(resources_path)
                present.append(local_path.exists())
                # discard(), not remove(): the file may be missing from the
                # export, or referenced by more than one resource; neither
                # case should abort the whole report
                resources.discard(local_path)
            else:
                # Resource without an uploaded file, e.g. only a URL
                present.append(True)

        if not present:
            key = STATE.NO_RESOURCES
        elif all(present):
            key = STATE.ALL_PRESENT_OR_URL
        elif not any(present):
            key = STATE.ALL_MISSING
        else:
            key = STATE.MIXED

        packages_by_state.setdefault(key, []).append(package)

    # Summary: number of packages in each state that occurred
    for state, members in packages_by_state.items():
        print(f"\n{len(members)} packages with state {state!r}")

    # Whatever remains was not referenced by any package
    print(f"{len(resources)} resources not matched to packages")
19 changes: 18 additions & 1 deletion transport_data/util/ckan.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,18 @@ class Resource(ModelProxy):
"""

# Type hints
format: str
hash: str
mimetype: str | None
name: str
size: int
size: int | None
url: str
url_type: str | None

@property
def has_local_file(self) -> bool:
    """:any:`True` if the Resource **should** have a local file.

    This is the case when :attr:`url_type` is ``"upload"``. The property does not
    check whether such a file actually exists.
    """
    upload_type = "upload"
    return self.url_type == upload_type

def fetch(self, max_size: int = 10_000_000) -> Path:
"""Fetch the resource file and cache it locally.
Expand Down Expand Up @@ -269,6 +277,15 @@ def fetch(self, max_size: int = 10_000_000) -> Path:

return target

def local_path(self, base_path: Path | None) -> Path:
"""Return a path in the CKAN file system layout."""
from transport_data import CONFIG

base_path = base_path or CONFIG.cache_path.joinpath("resource")

assert self.id
return base_path.joinpath(self.id[:3], self.id[3:6], self.id[6:])


class Tag(ModelProxy):
"""Proxy for the CKAN 'Tag' model.
Expand Down
Loading