cBioPortal · jamesqo · Feb 12, 2026 · Feb 12, 2026 · Feb 12, 2026
diff --git a/scripts/importer/build_cancers_from_oncotree.py b/scripts/importer/build_cancers_from_oncotree.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Build a tab-delimited cancers.txt file from the OncoTree tumorTypes API.
+
+This mirrors ImporterImpl.importTypesOfCancer logic:
+- query {oncotree_url}/tumorTypes[?version=...]
+- write rows: code<TAB>name<TAB>color<TAB>parent
+- fail if response is empty
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Any, Iterable
+from urllib.error import HTTPError, URLError
+from urllib.parse import urlencode, urljoin
+from urllib.request import Request, urlopen
+
+DEFAULT_FILENAME = "cancers.txt"  # output file name used by build() and --output when none is supplied
+
+
+def _build_url(base_url: str, version: str | None) -> str:
+    """Return the tumorTypes endpoint below base_url, with ?version=... appended when version is non-empty."""
+    root = base_url if base_url.endswith("/") else f"{base_url}/"
+    endpoint = urljoin(root, "tumorTypes")
+    query = f"?{urlencode({'version': version})}" if version else ""
+    return endpoint + query
+
+
+def _query_oncotree(url: str) -> list[dict[str, Any]]:
+    """Fetch url and return its JSON list; one retry after 5 s on OSError, ValueError if the JSON is not a list."""
+    request = Request(url, headers={"Content-Type": "application/x-www-form-urlencoded"})
+    for attempt in range(2):
+        try:
+            with urlopen(request, timeout=60) as response:
+                data = json.loads(response.read().decode("utf-8"))
+        except OSError as exc:
+            # OSError is the base of URLError/HTTPError/TimeoutError and of resets or socket timeouts hit in read().
+            if attempt > 0:
+                raise  # the single retry also failed: let the caller see the network error
+            print("Warning: error loading OncoTree data, reattempting in 5 seconds...", file=sys.stderr)
+            print(f"Warning detail: {exc}", file=sys.stderr)
+            time.sleep(5)
+            continue
+        if not isinstance(data, list):  # malformed payloads are not retried
+            raise ValueError("Unexpected OncoTree response: expected a JSON list")
+        return data
+
+    raise RuntimeError("Failed to query OncoTree")  # unreachable; keeps static analyzers happy
+
+
+def _normalize(value: Any) -> str:
+    """Return value converted with str(), except that None becomes the empty string."""
+    # Without the special case, None would be written out as the literal text "None".
+    return "" if value is None else str(value)
+
+
+def _to_lines(rows: Iterable[dict[str, Any]]) -> list[str]:
+    """Format each OncoTree record as one tab-delimited line.
+
+    Columns are code, name, color, parent; a missing or None field becomes "".
+    """
+    columns = ("code", "name", "color", "parent")
+    return [
+        "\t".join(_normalize(record.get(column)) for column in columns) for record in rows
+    ]
+
+
+def build(oncotree_url: str, oncotree_version: str | None = None, output: str = DEFAULT_FILENAME) -> str:
+    """Download the OncoTree tumor types and save them as a tab-delimited file.
+
+    Returns the output path as a string; raises RuntimeError when OncoTree returns no rows.
+    """
+    records = _query_oncotree(_build_url(oncotree_url, oncotree_version or None))
+    if not records:
+        raise RuntimeError("No oncotree data returned!")
+
+    rows = _to_lines(records)
+    destination = Path(output)
+    destination.write_text("\n".join(rows) + "\n", encoding="utf-8")
+    print(f"Wrote {len(rows)} rows to {destination}")
+    return str(destination)
+
+
+def main() -> int:
+    """Parse the command-line options, run build(), and return exit status 0."""
+    cli = argparse.ArgumentParser(
+        description="Build cancers.txt from OncoTree tumorTypes API using importer-equivalent logic."
+    )
+
+    # Each entry is (flag, keyword arguments for add_argument); options are registered in this order.
+    options = [
+        ("--oncotree-url", {"required": True,
+                            "help": "OncoTree base URL (equivalent to importer property oncotree.url)"}),
+        ("--oncotree-version", {"default": "",
+                                "help": "Optional OncoTree version value passed as query parameter 'version'."}),
+        ("--output", {"default": DEFAULT_FILENAME,
+                      "help": f"Output filename (default: {DEFAULT_FILENAME})"}),
+    ]
+    for flag, settings in options:
+        cli.add_argument(flag, **settings)
+
+    parsed = cli.parse_args()
+    # An empty --oncotree-version is handed to build() as None.
+    version = parsed.oncotree_version or None
+    build(parsed.oncotree_url, version, parsed.output)
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except Exception as error:  # noqa: BLE001 - any failure is reported as a one-line message on stderr
+        print(f"Error: {error}", file=sys.stderr)
+        sys.exit(1)
diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py
@@ -11,6 +11,7 @@
 import argparse
 import logging
 import re
+import tempfile
 from pathlib import Path
 from typing import Dict, Tuple
 
@@ -27,6 +28,7 @@
     importlib.import_module(__package__)
 
 from . import cbioportal_common
+from . import build_cancers_from_oncotree
 from .cbioportal_common import OUTPUT_FILE
 from .cbioportal_common import ERROR_FILE
 from .cbioportal_common import MetaFileTypes
@@ -65,11 +67,20 @@
 # ------------------------------------------------------------------------------
 # sub-routines
 
-def import_cancer_type(jvm_args, data_filename):
-    args = jvm_args.split(' ')
-    args.append(IMPORT_CANCER_TYPE_CLASS)
-    args.append(data_filename)
-    args.append("false") # don't clobber existing table
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
+def import_cancer_type(jvm_args, data_filename=None, oncotree_url=None, oncotree_version=None, clobber=False):
+    """Import cancer types from data_filename, or from a scratch file built from oncotree_url (exactly one is required)."""
+    if data_filename is None and oncotree_url is None:
+        raise RuntimeError("import-cancer-type requires either --data_filename or --oncotree-url")
+    if data_filename is not None and oncotree_url is not None:
+        raise RuntimeError("import-cancer-type: --data_filename and --oncotree-url are mutually exclusive")
+    with tempfile.TemporaryDirectory(prefix='cancers_') as tmp_dir: # removed on exit, even when the import fails
+        if oncotree_url is not None:
+            data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version, output=os.path.join(tmp_dir, 'cancers.txt'))
+        args = jvm_args.split(' ')
+        args.append(IMPORT_CANCER_TYPE_CLASS)
+        args.append(data_filename)
+        if not clobber:
+            args.append("false") # don't clobber existing table
+        args.append("--noprogress") # don't report memory usage and % progress
+        run_java(*args) # NOTE(review): assumes run_java blocks until the JVM exits, as the scratch file is deleted right after
 
@@ -235,9 +246,9 @@ def process_case_lists(jvm_args, case_list_dir):
         if not (case_list.startswith('.') or case_list.endswith('~')):
             import_case_list(jvm_args, os.path.join(case_list_dir, case_list))
 
-def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity = None):
+def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity=None, oncotree_url=None, oncotree_version=None):
     if command == IMPORT_CANCER_TYPE:
-        import_cancer_type(jvm_args, data_filename)
+        import_cancer_type(jvm_args, data_filename=data_filename, oncotree_url=oncotree_url, oncotree_version=oncotree_version, clobber=True)
     elif command == IMPORT_STUDY:
         import_study(jvm_args, meta_filename)
     elif command == REMOVE_STUDY:
@@ -384,7 +395,7 @@ def process_study_directory(jvm_args, study_directory, update_generic_assay_enti
 
     # First, import cancer types
     for meta_filename, data_filename in cancer_type_filepairs:
-        import_cancer_type(jvm_args, data_filename)
+        import_cancer_type(jvm_args, data_filename, clobber=False)
 
     # Then define the study
     if study_meta_filename is None:
@@ -575,6 +586,10 @@ def interface(args=None):
     subparsers = parser.add_subparsers(title='subcommands', dest='subcommand',
                           help='Command for import. Allowed commands: ' + allowed_commands_csv)
     import_cancer_type = subparsers.add_parser('import-cancer-type', parents=[parent_parser], add_help=False)
+    import_cancer_type.add_argument('--oncotree-url', type=str, required=False,
+                        help='OncoTree base URL to fetch cancer types from (alternative to --data_filename)')
+    import_cancer_type.add_argument('--oncotree-version', type=str, required=False, default=None,
+                        help='OncoTree version (used with --oncotree-url)')
     import_study = subparsers.add_parser('import-study', parents=[parent_parser], add_help=False)
     import_study_data = subparsers.add_parser('import-study-data', parents=[parent_parser], add_help=False)
     import_case_list = subparsers.add_parser('import-case-list', parents=[parent_parser], add_help=False)
@@ -690,7 +705,9 @@ def main(args):
             args.study_ids,
             args.patient_ids if hasattr(args, 'patient_ids') else None,
             args.sample_ids if hasattr(args, 'sample_ids') else None,
-            args.update_generic_assay_entity)
+            args.update_generic_assay_entity,
+            oncotree_url=getattr(args, 'oncotree_url', None),
+            oncotree_version=getattr(args, 'oncotree_version', None))
 
 # ------------------------------------------------------------------------------
 # ready to roll