From 322a66047c34337ca434e9d482d495c63a706a38 Mon Sep 17 00:00:00 2001
From: James Ko <jamesqko@gmail.com>
Date: Thu, 12 Feb 2026 12:18:59 -0500
Subject: [PATCH 1/3] add script to build cancer types from oncotree

---
 .../importer/build_cancers_from_oncotree.py   | 113 ++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 scripts/importer/build_cancers_from_oncotree.py
diff --git a/scripts/importer/build_cancers_from_oncotree.py b/scripts/importer/build_cancers_from_oncotree.py
new file mode 100644
index 00000000..31ed59b1
--- /dev/null
+++ b/scripts/importer/build_cancers_from_oncotree.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+"""Build a tab-delimited cancers.txt file from the OncoTree tumorTypes API.
+
+This mirrors ImporterImpl.importTypesOfCancer logic:
+- query {oncotree_url}/tumorTypes[?version=...]
+- write rows: code<TAB>name<TAB>color<TAB>parent
+- fail if response is empty
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+import time
+from pathlib import Path
+from typing import Any, Iterable
+from urllib.error import HTTPError, URLError
+from urllib.parse import urlencode, urljoin
+from urllib.request import Request, urlopen
+
+DEFAULT_FILENAME = "cancers.txt"
+
+
+def _build_url(base_url: str, version: str | None) -> str:
+    """Return the tumorTypes endpoint under base_url, adding ?version=... if given."""
+    root = base_url if base_url.endswith("/") else f"{base_url}/"
+    endpoint = urljoin(root, "tumorTypes")
+    query = f"?{urlencode({'version': version})}" if version else ""
+    return endpoint + query
+
+
+def _query_oncotree(url: str) -> list[dict[str, Any]]:
+    """GET url and return the decoded JSON list, retrying once on a network/HTTP error."""
+    request = Request(url, headers={"Content-Type": "application/x-www-form-urlencoded"})
+    # Two tries in total: the initial request plus a single retry.
+    retries_left = 1
+    while True:
+        try:
+            with urlopen(request, timeout=60) as response:
+                body = response.read().decode("utf-8")
+            parsed = json.loads(body)
+            if not isinstance(parsed, list):
+                raise ValueError("Unexpected OncoTree response: expected a JSON list")
+            return parsed
+        except (HTTPError, URLError, TimeoutError) as exc:
+            if retries_left == 0:
+                raise
+            retries_left -= 1
+            print("Warning: error loading OncoTree data, reattempting in 5 seconds...", file=sys.stderr)
+            print(f"Warning detail: {exc}", file=sys.stderr)
+            time.sleep(5)
+
+
+def _normalize(value: Any) -> str:
+    """Render one OncoTree field as the text of a single output cell."""
+    # A missing/null field becomes an empty cell rather than the literal "None".
+    return "" if value is None else str(value)
+
+
+def _to_lines(rows: Iterable[dict[str, Any]]) -> list[str]:
+    """Format each tumor type as one code<TAB>name<TAB>color<TAB>parent line."""
+    columns = ("code", "name", "color", "parent")
+    # Keys that are absent or null come out as empty cells (see _normalize).
+    lines = [
+        "\t".join(_normalize(row.get(column)) for column in columns)
+        for row in rows
+    ]
+    return lines
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Build cancers.txt from OncoTree tumorTypes API using importer-equivalent logic."
+    )  # main(): parse the CLI options, fetch the tumor types, write the file, return status 0
+    parser.add_argument(
+        "--oncotree-url",
+        required=True,  # no built-in default endpoint; the caller must supply one
+        help="OncoTree base URL (equivalent to importer property oncotree.url)",
+    )
+    parser.add_argument(
+        "--oncotree-version",
+        default="",  # "" is turned into None via `or None` below, so no version parameter is sent
+        help="Optional OncoTree version value passed as query parameter 'version'.",
+    )
+    parser.add_argument(
+        "--output",
+        default=DEFAULT_FILENAME,  # a relative name resolves against the current working directory
+        help=f"Output filename (default: {DEFAULT_FILENAME})",
+    )
+
+    args = parser.parse_args()
+
+    url = _build_url(args.oncotree_url, args.oncotree_version or None)
+    oncotree_data = _query_oncotree(url)
+
+    if not oncotree_data:
+        raise RuntimeError("No oncotree data returned!")
+
+    lines = _to_lines(oncotree_data)
+    output_path = Path(args.output)
+    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
+
+    print(f"Wrote {len(lines)} rows to {output_path}")
+    return 0
+
+
+if __name__ == "__main__":
+    # Report any failure as a one-line message on stderr and exit with status 1.
+    try:
+        sys.exit(main())
+    except Exception as exc:  # noqa: BLE001
+        sys.exit(f"Error: {exc}")

From cca30e72931e77b30589c69dc32f5ca675b1cb10 Mon Sep 17 00:00:00 2001
From: James Ko <jamesqko@gmail.com>
Date: Thu, 12 Feb 2026 13:14:18 -0500
Subject: [PATCH 2/3] add support to cbioportalImporter for loading cancer
 types from the Oncotree API

---
 .../importer/build_cancers_from_oncotree.py   | 29 +++++++++++--------
 scripts/importer/cbioportalImporter.py        | 26 +++++++++++++----
 2 files changed, 37 insertions(+), 18 deletions(-)

diff --git a/scripts/importer/build_cancers_from_oncotree.py b/scripts/importer/build_cancers_from_oncotree.py
index 31ed59b1..6dc4ccca 100644
--- a/scripts/importer/build_cancers_from_oncotree.py
+++ b/scripts/importer/build_cancers_from_oncotree.py
@@ -69,6 +69,22 @@ def _to_lines(rows: Iterable[dict[str, Any]]) -> list[str]:
     return lines
 
 
+def build(oncotree_url: str, oncotree_version: str | None = None, output: str = DEFAULT_FILENAME) -> str:
+    """Download the OncoTree tumor types and save them as a tab-delimited file.
+
+    Raises RuntimeError when OncoTree returns an empty list; returns the path written.
+    """
+    records = _query_oncotree(_build_url(oncotree_url, oncotree_version or None))
+    if not records:
+        raise RuntimeError("No oncotree data returned!")
+
+    rows = _to_lines(records)
+    destination = Path(output)
+    destination.write_text("\n".join(rows) + "\n", encoding="utf-8")
+    print(f"Wrote {len(rows)} rows to {destination}")
+    return str(destination)
+
+
 def main() -> int:
     parser = argparse.ArgumentParser(
         description="Build cancers.txt from OncoTree tumorTypes API using importer-equivalent logic."
@@ -90,18 +106,7 @@ def main() -> int:
     )
 
     args = parser.parse_args()
-
-    url = _build_url(args.oncotree_url, args.oncotree_version or None)
-    oncotree_data = _query_oncotree(url)
-
-    if not oncotree_data:
-        raise RuntimeError("No oncotree data returned!")
-
-    lines = _to_lines(oncotree_data)
-    output_path = Path(args.output)
-    output_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
-
-    print(f"Wrote {len(lines)} rows to {output_path}")
+    build(args.oncotree_url, args.oncotree_version or None, args.output)
     return 0
 
 
diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py
index c29035a7..7dc6742a 100644
--- a/scripts/importer/cbioportalImporter.py
+++ b/scripts/importer/cbioportalImporter.py
@@ -27,6 +27,7 @@
     importlib.import_module(__package__)
 
 from . import cbioportal_common
+from . import build_cancers_from_oncotree
 from .cbioportal_common import OUTPUT_FILE
 from .cbioportal_common import ERROR_FILE
 from .cbioportal_common import MetaFileTypes
@@ -65,11 +66,18 @@
 # ------------------------------------------------------------------------------
 # sub-routines
 
-def import_cancer_type(jvm_args, data_filename):
+def import_cancer_type(jvm_args, data_filename=None, oncotree_url=None, oncotree_version=None, clobber=False):  # import cancer types from a file or from OncoTree
+    if data_filename is None and oncotree_url is None:  # exactly one of the two sources must be given
+        raise RuntimeError("import-cancer-type requires either --data_filename or --oncotree-url")
+    if data_filename is not None and oncotree_url is not None:
+        raise RuntimeError("import-cancer-type: --data_filename and --oncotree-url are mutually exclusive")
+    if oncotree_url is not None:
+        data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version)
     args = jvm_args.split(' ')
     args.append(IMPORT_CANCER_TYPE_CLASS)
     args.append(data_filename)
-    args.append("false") # don't clobber existing table
+    if not clobber:  # NOTE(review): with clobber=True no flag is passed at all; confirm the Java class then clobbers
+        args.append("false") # don't clobber existing table
     args.append("--noprogress") # don't report memory usage and % progress
     run_java(*args)
 
@@ -235,9 +243,9 @@ def process_case_lists(jvm_args, case_list_dir):
         if not (case_list.startswith('.') or case_list.endswith('~')):
             import_case_list(jvm_args, os.path.join(case_list_dir, case_list))
 
-def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity = None):
+def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity=None, oncotree_url=None, oncotree_version=None):
     if command == IMPORT_CANCER_TYPE:
-        import_cancer_type(jvm_args, data_filename)
+        import_cancer_type(jvm_args, data_filename=data_filename, oncotree_url=oncotree_url, oncotree_version=oncotree_version, clobber=True)
     elif command == IMPORT_STUDY:
         import_study(jvm_args, meta_filename)
     elif command == REMOVE_STUDY:
@@ -384,7 +392,7 @@ def process_study_directory(jvm_args, study_directory, update_generic_assay_enti
 
     # First, import cancer types
     for meta_filename, data_filename in cancer_type_filepairs:
-        import_cancer_type(jvm_args, data_filename)
+        import_cancer_type(jvm_args, data_filename, clobber=False)
 
     # Then define the study
     if study_meta_filename is None:
@@ -575,6 +583,10 @@ def interface(args=None):
     subparsers = parser.add_subparsers(title='subcommands', dest='subcommand',
                           help='Command for import. Allowed commands: ' + allowed_commands_csv)
     import_cancer_type = subparsers.add_parser('import-cancer-type', parents=[parent_parser], add_help=False)
+    import_cancer_type.add_argument('--oncotree-url', type=str, required=False,
+                        help='OncoTree base URL to fetch cancer types from (alternative to --data_filename)')
+    import_cancer_type.add_argument('--oncotree-version', type=str, required=False, default=None,
+                        help='OncoTree version (used with --oncotree-url)')
     import_study = subparsers.add_parser('import-study', parents=[parent_parser], add_help=False)
     import_study_data = subparsers.add_parser('import-study-data', parents=[parent_parser], add_help=False)
     import_case_list = subparsers.add_parser('import-case-list', parents=[parent_parser], add_help=False)
@@ -690,7 +702,9 @@ def main(args):
             args.study_ids,
             args.patient_ids if hasattr(args, 'patient_ids') else None,
             args.sample_ids if hasattr(args, 'sample_ids') else None,
-            args.update_generic_assay_entity)
+            args.update_generic_assay_entity,
+            oncotree_url=getattr(args, 'oncotree_url', None),
+            oncotree_version=getattr(args, 'oncotree_version', None))
 
 # ------------------------------------------------------------------------------
 # ready to roll

From 71f088bc8f9addf49e9725b31a0cf89612aaffef Mon Sep 17 00:00:00 2001
From: James Ko <jamesqko@gmail.com>
Date: Thu, 12 Feb 2026 13:14:57 -0500
Subject: [PATCH 3/3] write to a tempfile

---
 scripts/importer/cbioportalImporter.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py
index 7dc6742a..bf6a1a45 100644
--- a/scripts/importer/cbioportalImporter.py
+++ b/scripts/importer/cbioportalImporter.py
@@ -11,6 +11,7 @@
 import argparse
 import logging
 import re
+import tempfile
 from pathlib import Path
 from typing import Dict, Tuple
 
@@ -72,7 +73,9 @@ def import_cancer_type(jvm_args, data_filename=None, oncotree_url=None, oncotree
     if data_filename is not None and oncotree_url is not None:
         raise RuntimeError("import-cancer-type: --data_filename and --oncotree-url are mutually exclusive")
     if oncotree_url is not None:
-        data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version)
+        tmp = tempfile.NamedTemporaryFile(suffix='.txt', prefix='cancers_', delete=False)  # delete=False keeps the file on disk after close()
+        tmp.close() # release fh so the other script can write to it
+        data_filename = build_cancers_from_oncotree.build(oncotree_url, oncotree_version, output=tmp.name)  # NOTE(review): tmp.name is never removed in the visible code; confirm cleanup after the import
     args = jvm_args.split(' ')
     args.append(IMPORT_CANCER_TYPE_CLASS)
     args.append(data_filename)