diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java index 2a947c5e0eed..ede30bfb946d 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java @@ -1977,8 +1977,9 @@ private void removeSemijoinOptimizationByBenefit(OptimizeTezProcContext procCtx) LOG.debug("Old stats for {}: {}", roi.filterOperator, roi.filterStats); LOG.debug("Number of rows reduction: {}/{}", newNumRows, roi.filterStats.getNumRows()); } + boolean useColStats = roi.filterStats.getColumnStats() != null; StatsUtils.updateStats(roi.filterStats, newNumRows, - true, roi.filterOperator, roi.colNames); + useColStats, roi.filterOperator, roi.colNames); if (LOG.isDebugEnabled()) { LOG.debug("New stats for {}: {}", roi.filterOperator, roi.filterStats); } diff --git a/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q b/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q new file mode 100644 index 000000000000..d867451d4e6f --- /dev/null +++ b/ql/src/test/queries/clientpositive/semijoin_removal_missing_colstats.q @@ -0,0 +1,11 @@ +-- HIVE-29516: NPE in StatsUtils.updateStats when removing semijoin by benefit and column statistics are missing +set hive.stats.fetch.column.stats=false; + +create table big (id int, val string) partitioned by (bday int); +alter table big add partition (bday=20260410); +alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000'); + +create table small (id int, val string); +alter table small update statistics set ('numRows' = '1000'); + +explain select big.val, small.val from big join small on big.id = small.id; diff --git a/ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out b/ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out new file mode 100644 index 000000000000..dc6e3bc3faf2 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/semijoin_removal_missing_colstats.q.out @@ -0,0 +1,167 @@ +PREHOOK: query: create table big (id int, val string) partitioned by (bday int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@big +POSTHOOK: query: create table big (id int, val string) partitioned by (bday int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@big +PREHOOK: query: alter table big add partition (bday=20260410) +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@big +POSTHOOK: query: alter table big add partition (bday=20260410) +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@big +POSTHOOK: Output: default@big@bday=20260410 +PREHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000') +PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS +PREHOOK: Input: default@big +PREHOOK: Output: default@big@bday=20260410 +POSTHOOK: query: alter table big partition (bday=20260410) update statistics set ('numRows' = '1000000000') +POSTHOOK: type: ALTERTABLE_UPDATEPARTSTATS +POSTHOOK: Input: default@big +POSTHOOK: Input: default@big@bday=20260410 +POSTHOOK: Output: default@big@bday=20260410 +PREHOOK: query: create table small (id int, val string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@small +POSTHOOK: query: create table small (id int, val string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@small +PREHOOK: query: alter table small update statistics set ('numRows' = '1000') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@small +PREHOOK: Output: default@small +POSTHOOK: query: alter table small update statistics set ('numRows' = '1000') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@small +POSTHOOK: Output: default@small +PREHOOK: query: explain select big.val, small.val from big join small on big.id = small.id +PREHOOK: type: QUERY +PREHOOK: Input: default@big +PREHOOK: Input: default@big@bday=20260410 +PREHOOK: Input: default@small +#### A masked pattern was here #### +POSTHOOK: query: explain select big.val, small.val from big join small on big.id = small.id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@big +POSTHOOK: Input: default@big@bday=20260410 +POSTHOOK: Input: default@small +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 4 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE) + Reducer 4 <- Map 3 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: big + filterExpr: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (id is not null and id BETWEEN DynamicValue(RS_7_small_id_min) AND DynamicValue(RS_7_small_id_max) and in_bloom_filter(id, DynamicValue(RS_7_small_id_bloom_filter))) (type: boolean) + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: id (type: int), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1000000000 Data size: 296000000000 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: small + filterExpr: id is not null (type: boolean) + Statistics: Num rows: 1000 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Filter Operator + predicate: id is not null (type: boolean) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: id (type: int), val (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + value expressions: _col1 (type: string) + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000000) + minReductionHashAggr: 0.99 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1, _col3 + Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: _col1 (type: string), _col3 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1100000023 Data size: 325600007057 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, 1, expectedEntries=1000000) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 152 Basic stats: PARTIAL Column stats: NONE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +