From 322a66047c34337ca434e9d482d495c63a706a38 Mon Sep 17 00:00:00 2001 From: James Ko Date: Thu, 12 Feb 2026 12:18:59 -0500 Subject: [PATCH 1/3] add script to build cancer types from oncotree --- .../importer/build_cancers_from_oncotree.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 scripts/importer/build_cancers_from_oncotree.py diff --git a/scripts/importer/build_cancers_from_oncotree.py b/scripts/importer/build_cancers_from_oncotree.py new file mode 100644 index 00000000..31ed59b1 --- /dev/null +++ b/scripts/importer/build_cancers_from_oncotree.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +"""Build a tab-delimited cancers.txt file from the OncoTree tumorTypes API. + +This mirrors ImporterImpl.importTypesOfCancer logic: +- query {oncotree_url}/tumorTypes[?version=...] +- write rows: code<TAB>name<TAB>color<TAB>parent +- fail if response is empty +""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from pathlib import Path +from typing import Any, Iterable +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode, urljoin +from urllib.request import Request, urlopen + +DEFAULT_FILENAME = "cancers.txt" + + +def _build_url(base_url: str, version: str | None) -> str: + base = base_url if base_url.endswith("/") else base_url + "/" + url = urljoin(base, "tumorTypes") + if version: + return f"{url}?{urlencode({'version': version})}" + return url + + +def _query_oncotree(url: str) -> list[dict[str, Any]]: + request = Request(url, headers={"Content-Type": "application/x-www-form-urlencoded"}) + for attempt in range(2): + try: + with urlopen(request, timeout=60) as response: + payload = response.read().decode("utf-8") + data = json.loads(payload) + if not isinstance(data, list): + raise ValueError("Unexpected OncoTree response: expected a JSON list") + return data + except (HTTPError, URLError, TimeoutError) as exc: + if attempt == 0: + print("Warning: error loading OncoTree data, 
reattempting in 5 seconds...", file=sys.stderr) + print(f"Warning detail: {exc}", file=sys.stderr) + time.sleep(5) + continue + raise + + # Unreachable but keeps static analyzers happy. + raise RuntimeError("Failed to query OncoTree") + + +def _normalize(value: Any) -> str: + if value is None: + return "" + return str(value) + + +def _to_lines(rows: Iterable[dict[str, Any]]) -> list[str]: + lines: list[str] = [] + for row in rows: + code = _normalize(row.get("code")) + name = _normalize(row.get("name")) + color = _normalize(row.get("color")) + parent = _normalize(row.get("parent")) + lines.append("\t".join([code, name, color, parent])) + return lines + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Build cancers.txt from OncoTree tumorTypes API using importer-equivalent logic." + ) + parser.add_argument( + "--oncotree-url", + required=True, + help="OncoTree base URL (equivalent to importer property oncotree.url)", + ) + parser.add_argument( + "--oncotree-version", + default="", + help="Optional OncoTree version value passed as query parameter 'version'.", + ) + parser.add_argument( + "--output", + default=DEFAULT_FILENAME, + help=f"Output filename (default: {DEFAULT_FILENAME})", + ) + + args = parser.parse_args() + + url = _build_url(args.oncotree_url, args.oncotree_version or None) + oncotree_data = _query_oncotree(url) + + if not oncotree_data: + raise RuntimeError("No oncotree data returned!") + + lines = _to_lines(oncotree_data) + output_path = Path(args.output) + output_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + print(f"Wrote {len(lines)} rows to {output_path}") + return 0 + + +if __name__ == "__main__": + try: + raise SystemExit(main()) + except Exception as exc: # noqa: BLE001 + print(f"Error: {exc}", file=sys.stderr) + raise SystemExit(1) From cca30e72931e77b30589c69dc32f5ca675b1cb10 Mon Sep 17 00:00:00 2001 From: James Ko Date: Thu, 12 Feb 2026 13:14:18 -0500 Subject: [PATCH 2/3] add support to 
cbioportalImporter for loading cancer types from the Oncotree API --- .../importer/build_cancers_from_oncotree.py | 29 +++++++++++-------- scripts/importer/cbioportalImporter.py | 26 +++++++++++++---- 2 files changed, 37 insertions(+), 18 deletions(-) diff --git a/scripts/importer/build_cancers_from_oncotree.py b/scripts/importer/build_cancers_from_oncotree.py index 31ed59b1..6dc4ccca 100644 --- a/scripts/importer/build_cancers_from_oncotree.py +++ b/scripts/importer/build_cancers_from_oncotree.py @@ -69,6 +69,22 @@ def _to_lines(rows: Iterable[dict[str, Any]]) -> list[str]: return lines +def build(oncotree_url: str, oncotree_version: str | None = None, output: str = DEFAULT_FILENAME) -> str: + """Fetch cancer types from OncoTree and write to a file. Returns the output file path.""" + url = _build_url(oncotree_url, oncotree_version or None) + oncotree_data = _query_oncotree(url) + + if not oncotree_data: + raise RuntimeError("No oncotree data returned!") + + lines = _to_lines(oncotree_data) + output_path = Path(output) + output_path.write_text("\n".join(lines) + "\n", encoding="utf-8") + + print(f"Wrote {len(lines)} rows to {output_path}") + return str(output_path) + + def main() -> int: parser = argparse.ArgumentParser( description="Build cancers.txt from OncoTree tumorTypes API using importer-equivalent logic." 
@@ -90,18 +106,7 @@ def main() -> int: ) args = parser.parse_args() - - url = _build_url(args.oncotree_url, args.oncotree_version or None) - oncotree_data = _query_oncotree(url) - - if not oncotree_data: - raise RuntimeError("No oncotree data returned!") - - lines = _to_lines(oncotree_data) - output_path = Path(args.output) - output_path.write_text("\n".join(lines) + "\n", encoding="utf-8") - - print(f"Wrote {len(lines)} rows to {output_path}") + build(args.oncotree_url, args.oncotree_version or None, args.output) return 0 diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index c29035a7..7dc6742a 100644 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -27,6 +27,7 @@ importlib.import_module(__package__) from . import cbioportal_common +from . import build_cancers_from_oncotree from .cbioportal_common import OUTPUT_FILE from .cbioportal_common import ERROR_FILE from .cbioportal_common import MetaFileTypes @@ -65,11 +66,18 @@ # ------------------------------------------------------------------------------ # sub-routines -def import_cancer_type(jvm_args, data_filename): +def import_cancer_type(jvm_args, data_filename=None, oncotree_url=None, oncotree_version=None, clobber=False): + if data_filename is None and oncotree_url is None: + raise RuntimeError("import-cancer-type requires either --data_filename or --oncotree-url") + if data_filename is not None and oncotree_url is not None: + raise RuntimeError("import-cancer-type: --data_filename and --oncotree-url are mutually exclusive") + if oncotree_url is not None: + data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version) args = jvm_args.split(' ') args.append(IMPORT_CANCER_TYPE_CLASS) args.append(data_filename) - args.append("false") # don't clobber existing table + if not clobber: + args.append("false") # don't clobber existing table args.append("--noprogress") # don't report memory usage and % progress 
run_java(*args) @@ -235,9 +243,9 @@ def process_case_lists(jvm_args, case_list_dir): if not (case_list.startswith('.') or case_list.endswith('~')): import_case_list(jvm_args, os.path.join(case_list_dir, case_list)) -def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity = None): +def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity=None, oncotree_url=None, oncotree_version=None): if command == IMPORT_CANCER_TYPE: - import_cancer_type(jvm_args, data_filename) + import_cancer_type(jvm_args, data_filename=data_filename, oncotree_url=oncotree_url, oncotree_version=oncotree_version, clobber=True) elif command == IMPORT_STUDY: import_study(jvm_args, meta_filename) elif command == REMOVE_STUDY: @@ -384,7 +392,7 @@ def process_study_directory(jvm_args, study_directory, update_generic_assay_enti # First, import cancer types for meta_filename, data_filename in cancer_type_filepairs: - import_cancer_type(jvm_args, data_filename) + import_cancer_type(jvm_args, data_filename, clobber=False) # Then define the study if study_meta_filename is None: @@ -575,6 +583,10 @@ def interface(args=None): subparsers = parser.add_subparsers(title='subcommands', dest='subcommand', help='Command for import. 
Allowed commands: ' + allowed_commands_csv) import_cancer_type = subparsers.add_parser('import-cancer-type', parents=[parent_parser], add_help=False) + import_cancer_type.add_argument('--oncotree-url', type=str, required=False, + help='OncoTree base URL to fetch cancer types from (alternative to --data_filename)') + import_cancer_type.add_argument('--oncotree-version', type=str, required=False, default=None, + help='OncoTree version (used with --oncotree-url)') import_study = subparsers.add_parser('import-study', parents=[parent_parser], add_help=False) import_study_data = subparsers.add_parser('import-study-data', parents=[parent_parser], add_help=False) import_case_list = subparsers.add_parser('import-case-list', parents=[parent_parser], add_help=False) @@ -690,7 +702,9 @@ def main(args): args.study_ids, args.patient_ids if hasattr(args, 'patient_ids') else None, args.sample_ids if hasattr(args, 'sample_ids') else None, - args.update_generic_assay_entity) + args.update_generic_assay_entity, + oncotree_url=getattr(args, 'oncotree_url', None), + oncotree_version=getattr(args, 'oncotree_version', None)) # ------------------------------------------------------------------------------ # ready to roll From 71f088bc8f9addf49e9725b31a0cf89612aaffef Mon Sep 17 00:00:00 2001 From: James Ko Date: Thu, 12 Feb 2026 13:14:57 -0500 Subject: [PATCH 3/3] write to a tempfile --- scripts/importer/cbioportalImporter.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index 7dc6742a..bf6a1a45 100644 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -11,6 +11,7 @@ import argparse import logging import re +import tempfile from pathlib import Path from typing import Dict, Tuple @@ -72,7 +73,9 @@ def import_cancer_type(jvm_args, data_filename=None, oncotree_url=None, oncotree if data_filename is not None and oncotree_url is not None: raise 
RuntimeError("import-cancer-type: --data_filename and --oncotree-url are mutually exclusive") if oncotree_url is not None: - data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version) + tmp = tempfile.NamedTemporaryFile(suffix='.txt', prefix='cancers_', delete=False) + tmp.close() # release fh so the other script can write to it + data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version, output=tmp.name) args = jvm_args.split(' ') args.append(IMPORT_CANCER_TYPE_CLASS) args.append(data_filename)