From 3761b2c71a5b2414e1719ccc54754a4862628d9e Mon Sep 17 00:00:00 2001
From: Dheeraj Rawandhe <Dheerajrawandhe57@gmail.com>
Date: Tue, 31 Mar 2026 12:59:32 +0530
Subject: [PATCH] Update cbioportalImporter.py

---
 scripts/importer/cbioportalImporter.py | 716 ++-----------------------
 1 file changed, 40 insertions(+), 676 deletions(-)

diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py
index c29035a7..73084f95 100644
--- a/scripts/importer/cbioportalImporter.py
+++ b/scripts/importer/cbioportalImporter.py
@@ -1,114 +1,7 @@
-#!/usr/bin/env python3
-
-# ------------------------------------------------------------------------------
-# Script which imports portal data.
-#
-# ------------------------------------------------------------------------------
-
-import os
-import sys
-import importlib
-import argparse
-import logging
-import re
-from pathlib import Path
-from typing import Dict, Tuple
-
-# configure relative imports if running as a script; see PEP 366
-# it might passed as empty string by certain tooling to mark a top level module
-if __name__ == "__main__" and (__package__ is None or __package__ == ''):
-    # replace the script's location in the Python search path by the main
-    # scripts/ folder, above it, so that the importer package folder is in
-    # scope and *not* directly in sys.path; see PEP 395
-    sys.path[0] = str(Path(sys.path[0]).resolve().parent)
-    __package__ = 'importer'
-    # explicitly load the package, which is needed on CPython 3.4 because it
-    # doesn't include https://github.com/python/cpython/pull/2639
-    importlib.import_module(__package__)
-
-from . import cbioportal_common
-from .cbioportal_common import OUTPUT_FILE
-from .cbioportal_common import ERROR_FILE
-from .cbioportal_common import MetaFileTypes
-from .cbioportal_common import IMPORTER_CLASSNAME_BY_META_TYPE
-from .cbioportal_common import IMPORTER_REQUIRES_METADATA
-from .cbioportal_common import IMPORT_CANCER_TYPE_CLASS
-from .cbioportal_common import IMPORT_STUDY_CLASS
-from .cbioportal_common import UPDATE_STUDY_STATUS_CLASS
-from .cbioportal_common import REMOVE_STUDY_CLASS
-from .cbioportal_common import REMOVE_SAMPLES_CLASS
-from .cbioportal_common import REMOVE_PATIENTS_CLASS
-from .cbioportal_common import IMPORT_CASE_LIST_CLASS
-from .cbioportal_common import ADD_CASE_LIST_CLASS
-from .cbioportal_common import VERSION_UTIL_CLASS
-from .cbioportal_common import run_java
-from .cbioportal_common import UPDATE_CASE_LIST_CLASS
-from .cbioportal_common import INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES
-
-
-# ------------------------------------------------------------------------------
-# globals
-
-LOGGER = None
-
-# commands
-IMPORT_CANCER_TYPE = "import-cancer-type"
-IMPORT_STUDY = "import-study"
-REMOVE_STUDY = "remove-study"
-REMOVE_SAMPLES = "remove-samples"
-REMOVE_PATIENTS = "remove-patients"
-IMPORT_STUDY_DATA = "import-study-data"
-IMPORT_CASE_LIST = "import-case-list"
-
-COMMANDS = [IMPORT_CANCER_TYPE, IMPORT_STUDY, IMPORT_STUDY_DATA, IMPORT_CASE_LIST, REMOVE_STUDY, REMOVE_SAMPLES, REMOVE_PATIENTS]
-
-# ------------------------------------------------------------------------------
-# sub-routines
-
-def import_cancer_type(jvm_args, data_filename):
-    args = jvm_args.split(' ')
-    args.append(IMPORT_CANCER_TYPE_CLASS)
-    args.append(data_filename)
-    args.append("false") # don't clobber existing table
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
-def import_study(jvm_args, meta_filename):
-    args = jvm_args.split(' ')
-    args.append(IMPORT_STUDY_CLASS)
-    args.append(meta_filename)
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
-def update_study_status(jvm_args, study_id):
-    args = jvm_args.split(' ')
-    args.append(UPDATE_STUDY_STATUS_CLASS)
-    args.append(study_id)
-    args.append("AVAILABLE")
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
-def remove_study_meta(jvm_args, meta_filename):
-    args = jvm_args.split(' ')
-    args.append(REMOVE_STUDY_CLASS)
-    meta_dictionary = cbioportal_common.parse_metadata_file(
-        meta_filename, logger=LOGGER)
-    if meta_dictionary['meta_file_type'] != MetaFileTypes.STUDY:
-        # invalid file, skip
-        print('Not a study meta file: ' + meta_filename, file=ERROR_FILE)
-        return
-    args.append(meta_dictionary['cancer_study_identifier'])
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
-def remove_study_id(jvm_args, study_id):
-    args = jvm_args.split(' ')
-    args.append(REMOVE_STUDY_CLASS)
-    args.append(study_id)
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
 def remove_samples(jvm_args, study_ids, sample_ids):
+    """
+    Remove samples AND also delete patients if they become sample-less
+    """
     args = jvm_args.split(' ')
     args.append(REMOVE_SAMPLES_CLASS)
     args.append("--study_ids")
@@ -117,584 +10,55 @@ def remove_samples(jvm_args, study_ids, sample_ids):
     args.append(sample_ids)
     run_java(*args)
 
-def remove_patients(jvm_args, study_ids, patient_ids):
-    args = jvm_args.split(' ')
-    args.append(REMOVE_PATIENTS_CLASS)
-    args.append("--study_ids")
-    args.append(study_ids)
-    args.append("--patient_ids")
-    args.append(patient_ids)
-    run_java(*args)
-
-def update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir = None):
-    args = jvm_args.split(' ')
-    args.append(UPDATE_CASE_LIST_CLASS)
-    args.append("--meta")
-    args.append(meta_filename)
-    if case_lists_file_or_dir:
-        args.append("--case-lists")
-        args.append(case_lists_file_or_dir)
-    run_java(*args)
-
-def import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity = None, meta_file_dictionary = None, incremental = False):
-    args = jvm_args.split(' ')
 
-    # In case the meta file is already parsed in a previous function, it is not
-    # necessary to parse it again
-    if meta_file_dictionary is None:
-        meta_file_dictionary = cbioportal_common.parse_metadata_file(
-        meta_filename, logger=LOGGER)
 
-    # Retrieve meta file type
-    meta_file_type = meta_file_dictionary['meta_file_type']
-
-    # Do not update entities by default
-    shouldUpdateGenericAssayEntities = False
-    if update_generic_assay_entity != None and update_generic_assay_entity.casefold() == "True".casefold():
-        shouldUpdateGenericAssayEntities = True
-
-    # invalid file, skip
-    if meta_file_type is None:
-        print(("Unrecognized meta file type '%s', skipping file"
-                              % (meta_file_type)), file=ERROR_FILE)
-        return
-
-    if not data_filename.endswith(meta_file_dictionary['data_filename']):
-        print(("'data_filename' in meta file contradicts "
-                              "data filename in command, skipping file"), file=ERROR_FILE)
-        return
-
-    importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type]
-
-    args.append(importer)
-    if incremental:
-        if meta_file_type not in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES:
-            raise NotImplementedError("This type does not support incremental upload: {}".format(meta_file_type))
-        args.append("--overwrite-existing")
-    if IMPORTER_REQUIRES_METADATA[importer]:
-        args.append("--meta")
-        args.append(meta_filename)
-        args.append("--loadMode")
-        args.append("bulkload")
-    if importer == "org.mskcc.cbio.portal.scripts.ImportProfileData" and shouldUpdateGenericAssayEntities:
-        args.append("--update-info")
-        args.append("True")
-    elif importer == "org.mskcc.cbio.portal.scripts.ImportProfileData" and not shouldUpdateGenericAssayEntities:
-        args.append("--update-info")
-        args.append("False")
-    if importer in ("org.mskcc.cbio.portal.scripts.ImportMutSigData", "org.mskcc.cbio.portal.scripts.ImportGisticData"):
-        args.append("--data")
-        args.append(data_filename)
-        args.append("--study")
-        args.append(meta_file_dictionary['cancer_study_identifier'])
-    elif importer == "org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap":
-        args.append("--meta")
-        args.append(meta_filename)
-        args.append("--data")
-        args.append(data_filename)
-    else:
-        args.append("--data")
-        args.append(data_filename)
-
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
-def import_case_list(jvm_args, meta_filename):
-    args = jvm_args.split(' ')
-    args.append(IMPORT_CASE_LIST_CLASS)
-    args.append(meta_filename)
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
-def add_global_case_list(jvm_args, study_id):
-    args = jvm_args.split(' ')
-    args.append(ADD_CASE_LIST_CLASS)
-    args.append(study_id)
-    args.append("all")
-    args.append("--noprogress") # don't report memory usage and % progress
-    run_java(*args)
-
-def check_version(jvm_args):
-    args = jvm_args.split(' ')
-    args.append(VERSION_UTIL_CLASS)
     try:
-        run_java(*args)
-    except:
-        print(
-            'Error, probably due to this version of the portal '
-            'being out of sync with the database. '
-            'Run the database migration script located at '
-            'CBIOPORTAL_SRC/core/src/main/scripts/migrate_db.py '
-            'before continuing.',
-            file=OUTPUT_FILE)
-        raise
-
-def process_case_lists(jvm_args, case_list_dir):
-    for case_list in os.listdir(case_list_dir):
-        # skip "temp"/backup files made by some text editors:
-        if not (case_list.startswith('.') or case_list.endswith('~')):
-            import_case_list(jvm_args, os.path.join(case_list_dir, case_list))
-
-def process_command(jvm_args, command, meta_filename, data_filename, study_ids, patient_ids, sample_ids, update_generic_assay_entity = None):
-    if command == IMPORT_CANCER_TYPE:
-        import_cancer_type(jvm_args, data_filename)
-    elif command == IMPORT_STUDY:
-        import_study(jvm_args, meta_filename)
-    elif command == REMOVE_STUDY:
-        if study_ids == None:
-            remove_study_meta(jvm_args, meta_filename)
-        elif meta_filename == None:
-            study_ids = study_ids.split(",")
-            for study_id in study_ids:
-                remove_study_id(jvm_args, study_id)
-        else:
-            raise RuntimeError('Your command uses both -id and -meta. Please, use only one of the two parameters.')
-    elif command == REMOVE_SAMPLES:
-        remove_samples(jvm_args, study_ids, sample_ids)
-    elif command == REMOVE_PATIENTS:
-        remove_patients(jvm_args, study_ids, patient_ids)
-    elif command == IMPORT_STUDY_DATA:
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity)
-    elif command == IMPORT_CASE_LIST:
-        import_case_list(jvm_args, meta_filename)
-
-def get_meta_filenames(data_directory):
-    meta_filenames = [
-        os.path.join(data_directory, meta_filename) for
-        meta_filename in os.listdir(data_directory) if
-        re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename,
-                  flags=re.IGNORECASE) and
-        not (meta_filename.startswith('.') or meta_filename.endswith('~'))]
-    return meta_filenames
-
-def process_study_directory(jvm_args, study_directory, update_generic_assay_entity = None):
-    """
-    Import an entire study directory based on meta files found.
-
-    1. Determine meta files in study directory.
-    2. Read all meta files and determine file types.
-    3. Import data files in specific order by file type.
-    """
-
-    study_id = None
-    study_meta_filename = None
-    study_meta_dictionary = {}
-    cancer_type_filepairs = []
-    sample_attr_filepair = None
-    sample_resource_filepair = None
-    resource_definition_filepair = None
-    regular_filepairs = []
-    gene_panel_matrix_filepair = None
-    zscore_filepairs = []
-    gsva_score_filepair = None
-    gsva_pvalue_filepair = None
-    structural_variant_filepair = None
-    cna_long_filepair = None
-
-    # Determine meta filenames in study directory
-    meta_filenames = get_meta_filenames(study_directory)
-
-    # Read all meta files (excluding case lists) to determine what to import
-    for meta_filename in meta_filenames:
-
-        # Parse meta file
-        meta_dictionary = cbioportal_common.parse_metadata_file(
-            meta_filename, study_id=study_id, logger=LOGGER)
-
-        # Save meta dictionary in study meta dictionary
-        study_meta_dictionary[meta_filename] = meta_dictionary
-
-        # Retrieve meta file type
-        meta_file_type = meta_dictionary['meta_file_type']
-        if meta_file_type is None:
-            # invalid meta file, let's die
-            raise RuntimeError('Invalid meta file: ' + meta_filename)
-
-        # remember study id to give an error in case any other file is referencing a different one
-        if study_id is None and 'cancer_study_identifier' in meta_dictionary:
-            study_id = meta_dictionary['cancer_study_identifier']
-
-        # Check the type of metafile. It is to know which metafile types the
-        # study contains because at a later stage we want to import in a
-        # specific order.
-
-        # Check for cancer type file
-        if meta_file_type == MetaFileTypes.CANCER_TYPE:
-            cancer_type_filepairs.append(
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-        # Check for meta study file
-        elif meta_file_type == MetaFileTypes.STUDY:
-            if study_meta_filename is not None:
-                raise RuntimeError(
-                    'Multiple meta_study files found: {} and {}'.format(
-                        study_meta_filename, meta_filename))
-            # Determine the study meta filename
-            study_meta_filename = meta_filename
-            study_meta_dictionary[study_meta_filename] = meta_dictionary
-        # Check for resource definitions
-        elif meta_file_type == MetaFileTypes.RESOURCES_DEFINITION:
-            if resource_definition_filepair is not None:
-                raise RuntimeError(
-                    'Multiple resource definition files found: {} and {}'.format(
-                        resource_definition_filepair[0], meta_filename))   # pylint: disable=unsubscriptable-object
-            resource_definition_filepair = (
-                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))            
-        # Check for sample attributes
-        elif meta_file_type == MetaFileTypes.SAMPLE_ATTRIBUTES:
-            if sample_attr_filepair is not None:
-                raise RuntimeError(
-                    'Multiple sample attribute files found: {} and {}'.format(
-                        sample_attr_filepair[0], meta_filename))   # pylint: disable=unsubscriptable-object
-            sample_attr_filepair = (
-                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))
-        elif meta_file_type == MetaFileTypes.SAMPLE_RESOURCES:
-            if sample_resource_filepair is not None:
-                raise RuntimeError(
-                    'Multiple sample resource files found: {} and {}'.format(
-                        sample_resource_filepair[0], meta_filename))   # pylint: disable=unsubscriptable-object
-            sample_resource_filepair = (
-                meta_filename, os.path.join(study_directory, meta_dictionary['data_filename']))
-        # Check for gene panel matrix
-        elif meta_file_type == MetaFileTypes.GENE_PANEL_MATRIX:
-            gene_panel_matrix_filepair = (
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-        # Check for z-score exression files
-        elif meta_file_type == MetaFileTypes.EXPRESSION and meta_dictionary['datatype'] == "Z-SCORE":
-            zscore_filepairs.append(
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-        # Check for GSVA scores
-        elif meta_file_type == MetaFileTypes.GSVA_SCORES:
-            gsva_score_filepair = (
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-        # Check for GSVA p-values
-        elif meta_file_type == MetaFileTypes.GSVA_PVALUES:
-            gsva_pvalue_filepair = (
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-        # Check for structural variant data
-        elif meta_file_type == MetaFileTypes.STRUCTURAL_VARIANT:
-            structural_variant_filepair = (
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-        elif meta_file_type == MetaFileTypes.CNA_DISCRETE_LONG:
-            cna_long_filepair = (
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-        # Add all other types of data
-        else:
-            regular_filepairs.append(
-                (meta_filename, os.path.join(study_directory, meta_dictionary['data_filename'])))
-
-    # First, import cancer types
-    for meta_filename, data_filename in cancer_type_filepairs:
-        import_cancer_type(jvm_args, data_filename)
-
-    # Then define the study
-    if study_meta_filename is None:
-        raise RuntimeError('No meta_study file found')
-    else:
-        # First remove study if exists
-        remove_study_meta(jvm_args, study_meta_filename)
-        import_study(jvm_args, study_meta_filename)
-
-    # Next, we need to import sample definitions
-    if sample_attr_filepair is None:
-        raise RuntimeError('No sample attribute file found')
-    else:
-        meta_filename, data_filename = sample_attr_filepair
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    # Next, we need to import resource definitions for resource data
-    if resource_definition_filepair is not None:
-        meta_filename, data_filename = resource_definition_filepair
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    # Next, we need to import sample definitions for resource data
-    if sample_resource_filepair is not None:
-        meta_filename, data_filename = sample_resource_filepair
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    # Next, import everything else except gene panel, structural variant data, GSVA and
-    # z-score expression. If in the future more types refer to each other, (like
-    # in a tree structure) this could be programmed in a recursive fashion.
-    for meta_filename, data_filename in regular_filepairs:
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    # Import structural variant data
-    if structural_variant_filepair is not None:
-        meta_filename, data_filename = structural_variant_filepair
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    # Import cna data
-    if cna_long_filepair is not None:
-        meta_filename, data_filename = cna_long_filepair
-        import_data(jvm_args=jvm_args, meta_filename=meta_filename, data_filename=data_filename,
-                    meta_file_dictionary=study_meta_dictionary[meta_filename])
-
-    # Import expression z-score (after expression)
-    for meta_filename, data_filename in zscore_filepairs:
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    # Import GSVA genetic profiles (after expression and z-scores)
-    if gsva_score_filepair is not None:
-
-        # First import the GSVA score data
-        meta_filename, data_filename = gsva_score_filepair
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-        # Second import the GSVA p-value data
-        meta_filename, data_filename = gsva_pvalue_filepair
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    if gene_panel_matrix_filepair is not None:
-        meta_filename, data_filename = gene_panel_matrix_filepair
-        import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename])
-
-    # Import the case lists
-    case_list_dirname = os.path.join(study_directory, 'case_lists')
-    if os.path.isdir(case_list_dirname):
-        process_case_lists(jvm_args, case_list_dirname)
-
-    if study_meta_dictionary[study_meta_filename].get('add_global_case_list', 'false').lower() == 'true':
-        add_global_case_list(jvm_args, study_id)
-
-    # enable study
-    update_study_status(jvm_args, study_id)
-
-def get_meta_filenames_by_type(data_directory) -> Dict[str, Tuple[str, Dict]]:
-    """
-    Read all meta files in the data directory and return meta information (filename, content) grouped by type.
-    """
-    meta_file_type_to_meta_files = {}
+       
+        from .cbioportal_common import get_db_connection
 
-    # Determine meta filenames in study directory
-    meta_filenames = get_meta_filenames(data_directory)
+        conn = get_db_connection()
+        cursor = conn.cursor()
 
-    # Read all meta files (excluding case lists) to determine what to import
-    for meta_filename in meta_filenames:
-
-        # Parse meta file
-        meta_dictionary = cbioportal_common.parse_metadata_file(
-            meta_filename, logger=LOGGER)
-
-        # Retrieve meta file type
-        meta_file_type = meta_dictionary['meta_file_type']
-        if meta_file_type is None:
-            # invalid meta file, let's die
-            raise RuntimeError('Invalid meta file: ' + meta_filename)
-        if meta_file_type not in meta_file_type_to_meta_files:
-            meta_file_type_to_meta_files[meta_file_type] = []
-
-        meta_file_type_to_meta_files[meta_file_type].append((meta_filename, meta_dictionary))
-    return meta_file_type_to_meta_files
-
-def import_incremental_data(jvm_args, data_directory, update_generic_assay_entity, meta_file_type_to_meta_files):
-    """
-    Load all data types that are available and support incremental upload
-    """
-    for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES:
-        if meta_file_type not in meta_file_type_to_meta_files:
-            continue
-        meta_pairs = meta_file_type_to_meta_files[meta_file_type]
-        for meta_pair in meta_pairs:
-            meta_filename, meta_dictionary = meta_pair
-            data_filename = os.path.join(data_directory, meta_dictionary['data_filename'])
-            import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_dictionary, incremental=True)
-
-def update_case_lists_from_folder(jvm_args, data_directory, meta_file_type_to_meta_files):
-    """
-    Updates case lists if clinical sample provided.
-    The command takes case_list/ folder as optional argument.
-    If folder exists case lists will be updated accordingly.
-    """
-    if MetaFileTypes.SAMPLE_ATTRIBUTES in meta_file_type_to_meta_files:
-        case_list_dirname = os.path.join(data_directory, 'case_lists')
-        sample_attributes_metas = meta_file_type_to_meta_files[MetaFileTypes.SAMPLE_ATTRIBUTES]
-        for meta_pair in sample_attributes_metas:
-            meta_filename, meta_dictionary = meta_pair
-            LOGGER.info('Updating case lists with sample ids', extra={'filename_': meta_filename})
-            update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir=case_list_dirname if os.path.isdir(case_list_dirname) else None)
-
-def process_data_directory(jvm_args, data_directory, update_generic_assay_entity = None):
-    """
-    Incremental import of data directory based on meta files found.
-    """
-
-    meta_file_type_to_meta_files = get_meta_filenames_by_type(data_directory)
-
-    not_supported_meta_types = meta_file_type_to_meta_files.keys() - INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES
-    if not_supported_meta_types:
-        raise NotImplementedError("These types do not support incremental upload: {}".format(", ".join(not_supported_meta_types)))
-    import_incremental_data(jvm_args, data_directory, update_generic_assay_entity, meta_file_type_to_meta_files)
-    update_case_lists_from_folder(jvm_args, data_directory, meta_file_type_to_meta_files)
-
-def usage():
-    # TODO : replace this by usage string from interface()
-    print(('cbioportalImporter.py --jar-path (path to scripts jar file) ' +
-                           '--command [%s] --study_directory <path to directory> '
-                           '--meta_filename <path to metafile>'
-                           '--data_filename <path to datafile>'
-                           '--study_ids <cancer study ids for remove-study or remove-samples command, comma separated>' % (COMMANDS)), file=OUTPUT_FILE)
-
-def check_args(command):
-    if command not in COMMANDS:
-        usage()
-        sys.exit(2)
-
-
-def check_files(meta_filename, data_filename):
-    if meta_filename and not os.path.exists(meta_filename):
-        print('meta-file cannot be found: ' + meta_filename, file=ERROR_FILE)
-        sys.exit(2)
-    if data_filename  and not os.path.exists(data_filename):
-        print('data-file cannot be found:' + data_filename, file=ERROR_FILE)
-        sys.exit(2)
-
-def check_dir(data_directory):
-    # check existence of directory
-    if not os.path.exists(data_directory) and data_directory != '':
-        print('Directory cannot be found: ' + data_directory, file=ERROR_FILE)
-        sys.exit(2)
-
-def add_parser_args(parser):
-    data_source_group = parser.add_mutually_exclusive_group()
-    data_source_group.add_argument('-s', '--study_directory', type=str, help='Path to Study Directory')
-    data_source_group.add_argument('-d', '--data_directory', type=str, help='Path to Data Directory')
-    parser.add_argument('-jvo', '--java_opts', type=str, default=os.environ.get('JAVA_OPTS'),
-                        help='Path to specify JAVA_OPTS for the importer. \
-                            (default: gets the JAVA_OPTS from the environment)')
-    parser.add_argument('-jar', '--jar_path', type=str, required=False,
-                            help='Path to scripts JAR file')
-    parser.add_argument('-meta', '--meta_filename', type=str, required=False,
-                        help='Path to meta file')
-    parser.add_argument('-data', '--data_filename', type=str, required=False,
-                        help='Path to Data file')
-
-def interface(args=None):
-    parent_parser = argparse.ArgumentParser(description='cBioPortal meta Importer')
-    add_parser_args(parent_parser)
-    parser = argparse.ArgumentParser()
-    allowed_commands_csv = ', '.join(COMMANDS)
-    subparsers = parser.add_subparsers(title='subcommands', dest='subcommand',
-                          help='Command for import. Allowed commands: ' + allowed_commands_csv)
-    import_cancer_type = subparsers.add_parser('import-cancer-type', parents=[parent_parser], add_help=False)
-    import_study = subparsers.add_parser('import-study', parents=[parent_parser], add_help=False)
-    import_study_data = subparsers.add_parser('import-study-data', parents=[parent_parser], add_help=False)
-    import_case_list = subparsers.add_parser('import-case-list', parents=[parent_parser], add_help=False)
-    remove_study = subparsers.add_parser('remove-study', parents=[parent_parser], add_help=False)
-    remove_study.add_argument('-id', '--study_ids', type=str, required=False,
-                        help='Cancer Study IDs for `remove-study` command, comma separated')
-
-    remove_samples = subparsers.add_parser('remove-samples', parents=[], add_help=True)
-    remove_samples.add_argument('--study_ids', type=str, required=True,
-                        help='Cancer Study ID(s) that contains sample(s). Comma separated, if multiple.')
-    remove_samples.add_argument('--sample_ids', type=str, required=True,
-                        help='Sample ID(s). Comma separated, if multiple.')
-
-    remove_patients = subparsers.add_parser('remove-patients', parents=[], add_help=True)
-    remove_patients.add_argument('--study_ids', type=str, required=True,
-                        help='Cancer Study ID(s) that contains sample(s). Comma separated, if multiple.')
-    remove_patients.add_argument('--patient_ids', type=str, required=True,
-                        help='Patient ID(s). Comma separated, if multiple.')
-
-    parser.add_argument('-c', '--command', type=str, required=False,
-                        help='This argument is outdated. Please use the listed subcommands, without the -c flag. '
-                        'Command for import. Allowed commands: ' + allowed_commands_csv)
-    add_parser_args(parser)
-    parser.add_argument('-id', '--study_ids', type=str, required=False,
-                        help='Cancer Study IDs for `remove-study` command, comma separated')
-    
-    parser.add_argument('-update', '--update_generic_assay_entity', type=str, required=False,
-                        help='Set as True to update the existing generic assay entities, set as False to keep the existing generic assay entities for generic assay')
-    # TODO - add same argument to metaimporter
-    # TODO - harmonize on - and _
-
-    parser = parser.parse_args(args)
-    if parser.command is not None and parser.subcommand is not None:
-        print('Cannot call multiple commands')
-        sys.exit(2)
-    elif parser.subcommand is not None:
-        parser.command = parser.subcommand
-    return parser
-
-
-def locate_jar():
-    """Locate the scripts jar file relative to this script.
-
-    Throws a FileNotFoundError with a message if the jar file couldn't be
-    identified.
-    """
-    # get the directory name of the currently running script,
-    # resolving any symlinks
-    this_file = Path(__file__).resolve()
-    importer_dir = this_file.parent
-    scripts_dir = importer_dir.parent
-    root_dir = scripts_dir.parent
-    jars = list((root_dir).glob('core-*.jar'))
-    if len(jars) != 1:
-        raise FileNotFoundError(
-            'Expected to find 1 scripts-*.jar, but found ' + str(len(jars)))
-    return str(jars[0])
+      
+        query = """
+            SELECT DISTINCT patient_id
+            FROM sample
+            WHERE cancer_study_identifier = %s
+        """
+        cursor.execute(query, (study_ids,))
+        all_patients = {row[0] for row in cursor.fetchall()}
 
+      
+        patients_to_delete = []
 
-def main(args):
-    global LOGGER
+        for patient in all_patients:
+            cursor.execute("""
+                SELECT COUNT(*) FROM sample
+                WHERE cancer_study_identifier = %s AND patient_id = %s
+            """, (study_ids, patient))
 
-    # get the logger with a handler to print logged error messages to stderr
-    module_logger = logging.getLogger(__name__)
-    error_handler = logging.StreamHandler(sys.stderr)
-    error_handler.setFormatter(cbioportal_common.LogfileStyleFormatter(
-            os.getcwd()))
-    error_handler.setLevel(logging.ERROR)
-    module_logger.addHandler(error_handler)
-    LOGGER = module_logger
+            count = cursor.fetchone()[0]
 
-    # move jar_path to java_opts if it exists
-    if args.jar_path:
-        args.java_opts = f"-cp {args.jar_path} {args.java_opts}"
+            if count == 0:
+                patients_to_delete.append(patient)
 
-    # java_opts is optional. If class (jar) path is not set (-cp), try to find the jar path relative to this script
-    locate_jar_path = True
-    if args.java_opts is not None and '-cp' in args.java_opts:
-        locate_jar_path = False
-    if locate_jar_path:
-        try:
-            jar_path = locate_jar()
-        except FileNotFoundError as e:
-            print(e)
-            sys.exit(2)
-        print('Data loading step using', jar_path)
-        print()
-        if args.java_opts is None:
-            args.java_opts = f"-cp {jar_path}"
-        else:
-            args.java_opts = f"-cp {jar_path} {args.java_opts}"
+       
+        if patients_to_delete:
+            patient_ids_str = ",".join(patients_to_delete)
 
-    # process the options
-    jvm_args = "-Dspring.profiles.active=dbcp " + args.java_opts
+            args = jvm_args.split(' ')
+            args.append(REMOVE_PATIENTS_CLASS)
+            args.append("--study_ids")
+            args.append(study_ids)
+            args.append("--patient_ids")
+            args.append(patient_ids_str)
 
-    # check if DB version and application version are in sync
-    check_version(jvm_args)
+            run_java(*args)
 
-    if args.data_directory is not None:
-        check_dir(args.data_directory)
-        process_data_directory(jvm_args, args.data_directory, args.update_generic_assay_entity)
-    elif args.study_directory is not None:
-        check_dir(args.study_directory)
-        process_study_directory(jvm_args, args.study_directory, args.update_generic_assay_entity)
-    else:
-        check_args(args.command)
-        check_files(args.meta_filename, args.data_filename)
-        process_command(
-            jvm_args,
-            args.command,
-            args.meta_filename,
-            args.data_filename,
-            args.study_ids,
-            args.patient_ids if hasattr(args, 'patient_ids') else None,
-            args.sample_ids if hasattr(args, 'sample_ids') else None,
-            args.update_generic_assay_entity)
+            print(f"Removed sample-less patients: {patient_ids_str}")
 
-# ------------------------------------------------------------------------------
-# ready to roll
+        cursor.close()
+        conn.close()
 
-if __name__ == '__main__':
-    parsed_args = interface()
-    main(parsed_args)
+    except Exception as e:
+        print(f"Warning: Could not clean up sample-less patients: {e}", file=ERROR_FILE)