Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions scripts/importer/build_cancers_from_oncotree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""Build a tab-delimited cancers.txt file from the OncoTree tumorTypes API.

This mirrors ImporterImpl.importTypesOfCancer logic:
- query {oncotree_url}/tumorTypes[?version=...]
- write rows: code<TAB>name<TAB>color<TAB>parent
- fail if response is empty
"""

from __future__ import annotations

import argparse
import json
import sys
import time
from pathlib import Path
from typing import Any, Iterable
from urllib.error import HTTPError, URLError
from urllib.parse import urlencode, urljoin
from urllib.request import Request, urlopen

DEFAULT_FILENAME = "cancers.txt"


def _build_url(base_url: str, version: str | None) -> str:
    """Return the ``tumorTypes`` endpoint URL located under *base_url*.

    A trailing slash is appended to *base_url* when it is missing, so that
    ``urljoin`` adds ``tumorTypes`` as a new path segment rather than
    replacing the last one. A truthy *version* is URL-encoded into a
    ``version`` query parameter; ``None`` or ``""`` yields the bare endpoint.
    """
    if not base_url.endswith("/"):
        base_url += "/"
    endpoint = urljoin(base_url, "tumorTypes")
    if not version:
        return endpoint
    query = urlencode({"version": version})
    return endpoint + "?" + query


def _query_oncotree(
    url: str,
    *,
    retries: int = 1,
    retry_delay: float = 5,
    timeout: float = 60,
) -> list[dict[str, Any]]:
    """Fetch *url* and return the decoded JSON list of tumor types.

    Args:
        url: Fully built ``tumorTypes`` endpoint URL.
        retries: Number of additional attempts made after the first one
            fails with a network-level error. Negative values are treated
            as zero.
        retry_delay: Seconds to sleep before each re-attempt.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The parsed response body, which must be a JSON list.

    Raises:
        OSError: The last attempt still failed. ``HTTPError``, ``URLError``
            and ``TimeoutError`` are all ``OSError`` subclasses, so callers
            catching those keep working.
        ValueError: The body is not valid JSON/UTF-8, or is not a JSON list.
            These are never retried.
    """
    request = Request(url, headers={"Content-Type": "application/x-www-form-urlencoded"})
    attempts = max(retries, 0) + 1
    for attempt in range(attempts):
        try:
            with urlopen(request, timeout=timeout) as response:
                payload = response.read().decode("utf-8")
        except OSError as exc:
            # OSError rather than (HTTPError, URLError, TimeoutError): socket
            # errors raised while *reading* the body (connection reset, and
            # socket.timeout before Python 3.10) are not wrapped in URLError
            # and would otherwise bypass the retry.
            if attempt + 1 >= attempts:
                raise
            print(
                f"Warning: error loading OncoTree data, reattempting in {retry_delay:g} seconds...",
                file=sys.stderr,
            )
            print(f"Warning detail: {exc}", file=sys.stderr)
            time.sleep(retry_delay)
            continue

        data = json.loads(payload)
        if not isinstance(data, list):
            raise ValueError("Unexpected OncoTree response: expected a JSON list")
        return data

    # Unreachable (the final attempt either returns or re-raises) but keeps
    # static analyzers happy about the declared return type.
    raise RuntimeError("Failed to query OncoTree")


def _normalize(value: Any) -> str:
    """Render *value* as text for one output column; ``None`` becomes ``""``."""
    return "" if value is None else str(value)


def _to_lines(rows: Iterable[dict[str, Any]]) -> list[str]:
    """Convert tumor-type records into tab-separated output lines.

    Each record contributes one line with the columns ``code``, ``name``,
    ``color`` and ``parent`` in that order. Keys that are missing or map to
    ``None`` are written as empty fields.
    """
    columns = ("code", "name", "color", "parent")
    return [
        "\t".join([_normalize(row.get(column)) for column in columns])
        for row in rows
    ]


def build(oncotree_url: str, oncotree_version: str | None = None, output: str = DEFAULT_FILENAME) -> str:
    """Download the OncoTree tumor types and save them as a tab-delimited file.

    Args:
        oncotree_url: OncoTree base URL; the ``tumorTypes`` endpoint is
            resolved beneath it.
        oncotree_version: Optional value for the ``version`` query parameter.
            An empty string is treated the same as ``None``.
        output: Path of the file to write (UTF-8, one row per line, with a
            trailing newline).

    Returns:
        The output path as a string.

    Raises:
        RuntimeError: The service returned an empty list.
    """
    records = _query_oncotree(_build_url(oncotree_url, oncotree_version or None))
    if not records:
        raise RuntimeError("No oncotree data returned!")

    lines = _to_lines(records)
    content = "\n".join(lines) + "\n"

    destination = Path(output)
    destination.write_text(content, encoding="utf-8")

    print(f"Wrote {len(lines)} rows to {destination}")
    return str(destination)


def main(argv: list[str] | None = None) -> int:
    """Parse command-line options and build the cancers file.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so the script entry point is unaffected;
            passing a list allows the CLI to be driven programmatically.

    Returns:
        ``0`` once the file has been written. Failures propagate as
        exceptions (argparse itself exits via ``SystemExit`` on bad usage).
    """
    parser = argparse.ArgumentParser(
        description="Build cancers.txt from OncoTree tumorTypes API using importer-equivalent logic."
    )
    parser.add_argument(
        "--oncotree-url",
        required=True,
        help="OncoTree base URL (equivalent to importer property oncotree.url)",
    )
    parser.add_argument(
        "--oncotree-version",
        default="",
        help="Optional OncoTree version value passed as query parameter 'version'.",
    )
    parser.add_argument(
        "--output",
        default=DEFAULT_FILENAME,
        help=f"Output filename (default: {DEFAULT_FILENAME})",
    )

    args = parser.parse_args(argv)
    # An empty --oncotree-version means "no version": pass None downstream.
    build(args.oncotree_url, args.oncotree_version or None, args.output)
    return 0


if __name__ == "__main__":
    # Script entry point: report any failure as a one-line message on stderr
    # and exit with status 1 instead of dumping a traceback.
    try:
        exit_code = main()
    except Exception as exc:  # noqa: BLE001
        print(f"Error: {exc}", file=sys.stderr)
        exit_code = 1
    sys.exit(exit_code)
29 changes: 23 additions & 6 deletions scripts/importer/cbioportalImporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import argparse
import logging
import re
import tempfile
from pathlib import Path
from typing import Dict, Tuple

Expand All @@ -27,6 +28,7 @@
importlib.import_module(__package__)

from . import cbioportal_common
from . import build_cancers_from_oncotree
from .cbioportal_common import OUTPUT_FILE
from .cbioportal_common import ERROR_FILE
from .cbioportal_common import MetaFileTypes
Expand Down Expand Up @@ -65,11 +67,20 @@
# ------------------------------------------------------------------------------
# sub-routines

def import_cancer_type(jvm_args, data_filename=None, oncotree_url=None, oncotree_version=None, clobber=False):
    """Import cancer types from a local file or from an OncoTree service.

    Exactly one of `data_filename` and `oncotree_url` must be given. With
    `oncotree_url`, the cancer types are first downloaded into a temporary
    file by build_cancers_from_oncotree.build(), and that file is imported.

    Raises RuntimeError when neither or both sources are supplied.
    """
    if data_filename is None and oncotree_url is None:
        raise RuntimeError("import-cancer-type requires either --data_filename or --oncotree-url")
    if data_filename is not None and oncotree_url is not None:
        raise RuntimeError("import-cancer-type: --data_filename and --oncotree-url are mutually exclusive")
    generated_filename = None
    try:
        if oncotree_url is not None:
            tmp = tempfile.NamedTemporaryFile(suffix='.txt', prefix='cancers_', delete=False)
            tmp.close()  # release fh so the other script can write to it
            generated_filename = tmp.name
            data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version, output=generated_filename)
        args = jvm_args.split(' ')
        args.append(IMPORT_CANCER_TYPE_CLASS)
        args.append(data_filename)
        if not clobber:
            args.append("false")  # don't clobber existing table
        args.append("--noprogress")  # don't report memory usage and % progress
        run_java(*args)
    finally:
        # The temp file was created with delete=False, so nothing else removes
        # it; clean up whether the download or the import succeeded or failed.
        # NOTE(review): assumes run_java blocks until the JVM exits - confirm.
        if generated_filename is not None:
            try:
                os.remove(generated_filename)
            except OSError:
                pass  # best-effort cleanup; never mask the original error

Expand Down Expand Up @@ -235,9 +246,9 @@ def process_case_lists(jvm_args, case_list_dir):
if not (case_list.startswith('.') or case_list.endswith('~')):
import_case_list(jvm_args, os.path.join(case_list_dir, case_list))

def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity = None):
def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity=None, oncotree_url=None, oncotree_version=None):
if command == IMPORT_CANCER_TYPE:
import_cancer_type(jvm_args, data_filename)
import_cancer_type(jvm_args, data_filename=data_filename, oncotree_url=oncotree_url, oncotree_version=oncotree_version, clobber=True)
elif command == IMPORT_STUDY:
import_study(jvm_args, meta_filename)
elif command == REMOVE_STUDY:
Expand Down Expand Up @@ -384,7 +395,7 @@ def process_study_directory(jvm_args, study_directory, update_generic_assay_enti

# First, import cancer types
for meta_filename, data_filename in cancer_type_filepairs:
import_cancer_type(jvm_args, data_filename)
import_cancer_type(jvm_args, data_filename, clobber=False)

# Then define the study
if study_meta_filename is None:
Expand Down Expand Up @@ -575,6 +586,10 @@ def interface(args=None):
subparsers = parser.add_subparsers(title='subcommands', dest='subcommand',
help='Command for import. Allowed commands: ' + allowed_commands_csv)
import_cancer_type = subparsers.add_parser('import-cancer-type', parents=[parent_parser], add_help=False)
import_cancer_type.add_argument('--oncotree-url', type=str, required=False,
help='OncoTree base URL to fetch cancer types from (alternative to --data_filename)')
import_cancer_type.add_argument('--oncotree-version', type=str, required=False, default=None,
help='OncoTree version (used with --oncotree-url)')
import_study = subparsers.add_parser('import-study', parents=[parent_parser], add_help=False)
import_study_data = subparsers.add_parser('import-study-data', parents=[parent_parser], add_help=False)
import_case_list = subparsers.add_parser('import-case-list', parents=[parent_parser], add_help=False)
Expand Down Expand Up @@ -690,7 +705,9 @@ def main(args):
args.study_ids,
args.patient_ids if hasattr(args, 'patient_ids') else None,
args.sample_ids if hasattr(args, 'sample_ids') else None,
args.update_generic_assay_entity)
args.update_generic_assay_entity,
oncotree_url=getattr(args, 'oncotree_url', None),
oncotree_version=getattr(args, 'oncotree_version', None))

# ------------------------------------------------------------------------------
# ready to roll
Expand Down
Loading