From 49577f91e1bf3a08ff3da0272844867507f3a99d Mon Sep 17 00:00:00 2001 From: Ino de Bruijn Date: Fri, 29 May 2026 19:47:56 +0000 Subject: [PATCH 1/3] perf(import): bulk-load all tab-delim matrix data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable ClickHouseBulkLoader for every tab-delim matrix import in ImportTabDelimData (mRNA + z-scores, methylation, RPPA + z-scores, log2 CNA, generic assay, …). The loader streams rows via INSERT ... FORMAT TSVWithNames — one round-trip per profile instead of one JDBC INSERT per gene. For a TCGA Pancan-shaped matrix (~20K genes × ~92 samples) this is the difference between seconds and minutes per profile. flushAll() at the end of importDataInternal() is already gated on isBulkLoad(), so flipping the loader on also auto-flushes. The discretized CNA existingCnaEvents path is untouched. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index ff346e77..6eeca0ad 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -280,11 +280,15 @@ void importDataInternal() throws Exception { geneticAlterationImporter.initialize(); + // Bulk-load matrix data via ClickHouseBulkLoader's + // INSERT ... FORMAT TSVWithNames path — one round-trip per profile + // instead of one JDBC INSERT per gene. 
+ ClickHouseBulkLoader.bulkLoadOn(); + //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); if (isDiscretizedCnaProfile) { existingCnaEvents.addAll(DaoCnaEvent.getAllCnaEvents()); - ClickHouseBulkLoader.bulkLoadOn(); } // load entities map from database From 15fce5eda3ebe19f658e6c68a5a6451df5a35d85 Mon Sep 17 00:00:00 2001 From: Ino de Bruijn Date: Sat, 30 May 2026 01:30:16 +0000 Subject: [PATCH 2/3] perf(import): bulk-load genetic_entity inserts + flush in ImportGenericAssayEntity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror the genetic_alteration bulk-load path in addNewGeneticEntity so the loader-aware code path is used whenever bulkLoadOn() is active. For GENERIC_ASSAY profiles, ImportProfileData runs ImportGenericAssayEntity and ImportTabDelimData in the same JVM, where the latter builds its stable-id→entity-id map from a SELECT. Without an explicit flush between those two steps, the entities buffered by the former are invisible to the SELECT and all rows get skipped. Add ClickHouseBulkLoader.flushAll() at the end of ImportGenericAssayEntity.importData() so the rows are in the DB before the next importer's lookup. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../org/mskcc/cbio/portal/dao/DaoGeneticEntity.java | 13 +++++++++++-- .../portal/scripts/ImportGenericAssayEntity.java | 11 ++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticEntity.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticEntity.java index 8f2ed303..b0ae2d33 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticEntity.java @@ -30,18 +30,27 @@ private enum SqlAction { public static GeneticEntity addNewGeneticEntity(GeneticEntity geneticEntity) throws DaoException { + long entityId = ClickHouseAutoIncrement.nextId("seq_genetic_entity"); + geneticEntity.setId((int) entityId); + + if (ClickHouseBulkLoader.isBulkLoad()) { + ClickHouseBulkLoader.getClickHouseBulkLoader("genetic_entity").insertRecord( + Long.toString(entityId), + geneticEntity.getEntityType(), + geneticEntity.getStableId()); + return geneticEntity; + } + Connection con = null; PreparedStatement pstmt = null; try { con = JdbcUtil.getDbConnection(DaoGeneticEntity.class); - long entityId = ClickHouseAutoIncrement.nextId("seq_genetic_entity"); pstmt = con.prepareStatement("INSERT INTO genetic_entity (`id`, `entity_type`, `stable_id`) " + "VALUES(?,?,?)"); pstmt.setLong(1, entityId); pstmt.setString(2, geneticEntity.getEntityType()); pstmt.setString(3, geneticEntity.getStableId()); pstmt.executeUpdate(); - geneticEntity.setId((int) entityId); } catch (SQLException e) { throw new DaoException(e); } finally { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index 0c629364..d6cb7d1d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -41,6 +41,7 @@ import 
joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; +import org.mskcc.cbio.portal.dao.ClickHouseBulkLoader; import org.mskcc.cbio.portal.dao.DaoGenericAssay; import org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.model.shared.EntityType; @@ -247,7 +248,15 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera } reader.close(); - + + // Flush any buffered genetic_entity inserts so that a subsequent SELECT + // (e.g. GenericAssayMetaUtils.buildGenericAssayStableIdToEntityIdMap) + // in the same JVM sees them. ImportProfileData runs this method + // immediately before ImportTabDelimData for GENERIC_ASSAY profiles. + if (ClickHouseBulkLoader.isBulkLoad()) { + ClickHouseBulkLoader.flushAll(); + } + ProgressMonitor.setCurrentMessage("Finished loading generic assay.\n"); } From 58dfce590a521ca4cf6ca5c87c820752d5c05738 Mon Sep 17 00:00:00 2001 From: Ino de Bruijn Date: Sat, 30 May 2026 09:30:17 +0000 Subject: [PATCH 3/3] perf(import): bulkLoadOff() after flush to match codebase convention MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bulkLoadOn/Off is the established pattern (7 production callers — DaoGene, DaoReferenceGenomeGene, ImportGeneData, ImportCopyNumberSegmentData, ImportMicroRNAIDs, MutSigReader, ConsoleUtil — and the integration tests). Leaving the loader in the "on" state after a flush leaves the global flag sticky for any downstream DAO call in the same JVM that has a bulk-load branch (e.g. addNewGeneticEntity via DaoGeneset.addGeneset, which does not pre-emptively call bulkLoadOff()). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java | 3 +++ .../java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 1 + 2 files changed, 4 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index d6cb7d1d..e55bc716 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -253,8 +253,11 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera // (e.g. GenericAssayMetaUtils.buildGenericAssayStableIdToEntityIdMap) // in the same JVM sees them. ImportProfileData runs this method // immediately before ImportTabDelimData for GENERIC_ASSAY profiles. + // bulkLoadOff() restores the global flag — ImportTabDelimData will + // turn it back on for the matrix import. if (ClickHouseBulkLoader.isBulkLoad()) { ClickHouseBulkLoader.flushAll(); + ClickHouseBulkLoader.bulkLoadOff(); } ProgressMonitor.setCurrentMessage("Finished loading generic assay.\n"); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 6eeca0ad..b51023ea 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -372,6 +372,7 @@ void importDataInternal() throws Exception { DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); if (ClickHouseBulkLoader.isBulkLoad()) { ClickHouseBulkLoader.flushAll(); + ClickHouseBulkLoader.bulkLoadOff(); } geneticAlterationImporter.complete();