Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions grammar/FuncTestCaseParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ scalarType
| dateType #date
| intervalYearType #intervalYear
| UUID isnull=QMark? #uuid
| Unknown isnull=QMark? #unknown
| UserDefined Identifier isnull=QMark? #userDefined
;

Expand Down
2 changes: 2 additions & 0 deletions grammar/SubstraitLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ Number
: '-'? Int
;

// Keyword token for the `unknown` placeholder type.
// Must stay above Identifier: the text 'UNKNOWN' also matches the Identifier
// rule, and ANTLR resolves equal-length matches in favor of the lexer rule
// that is declared first.
Unknown: 'UNKNOWN';

Identifier
: ('A'..'Z' | '_' | '$') ('A'..'Z' | '_' | '$' | Digit)*
;
Expand Down
1 change: 1 addition & 0 deletions grammar/SubstraitType.g4
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ scalarType
| Date #date
| Interval_Year #intervalYear
| UUID #uuid
| Unknown #unknown
;

parameterizedType
Expand Down
30 changes: 27 additions & 3 deletions proto/substrait/algebra.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,7 @@ message Expression {
Lambda lambda = 15;
LambdaInvocation lambda_invocation = 16;
ExecutionContextVariable execution_context_variable = 17;
NamedExpression named_expression = 18;

// deprecated: enum literals are only sensible in the context of
// function arguments, for which FunctionArgument should now be
Expand Down Expand Up @@ -1245,6 +1246,17 @@ message Expression {
Nested.Struct arguments = 2;
}

// A named expression that is not yet resolved to a positional reference or
// concrete expression. The names field is used to represent namespacing
// (e.g. qualifier and field name). Producers and consumers must share enough
// external context to resolve this expression before execution unless the
// consumer defines semantics for unresolved named expressions. Until
// resolved, a NamedExpression's type is Type.Unknown.
message NamedExpression {
// One or more name components used to represent namespacing, e.g.
// ["foo"] for an unqualified name or ["orders", "amount"] for a qualifier
// followed by a field name. How the components are resolved is external to
// this message and must be agreed on by producer and consumer.
repeated string names = 1;
// Extension payload associated with this named expression; see
// substrait.extensions.AdvancedExtension for its semantics.
substrait.extensions.AdvancedExtension advanced_extension = 10;
}

// A scalar function call.
message ScalarFunction {
// Points to a function_anchor defined in this plan, which must refer
Expand All @@ -1262,6 +1274,9 @@ message Expression {
// - Enum arguments must be bound using FunctionArgument.enum
// followed by Enum.specified, with a string that case-insensitively
// matches one of the allowed options.
//
// In a partially bound expression, Type.Unknown may stand in for any
// concrete type while binding value or type arguments.
repeated FunctionArgument arguments = 4;

// Options to specify behavior for corner cases, or leave behavior
Expand All @@ -1270,7 +1285,8 @@ message Expression {
repeated FunctionOption options = 5;

// Must be set to the return type of the function, exactly as derived
// using the declaration in the extension.
// using the declaration in the extension. In a partially bound expression,
// this may be Type.Unknown if the concrete return type is unresolved.
Type output_type = 3;

// Deprecated; use arguments instead.
Expand Down Expand Up @@ -1301,6 +1317,9 @@ message Expression {
// - Enum arguments must be bound using FunctionArgument.enum
// followed by Enum.specified, with a string that case-insensitively
// matches one of the allowed options.
//
// In a partially bound expression, Type.Unknown may stand in for any
// concrete type while binding value or type arguments.
repeated FunctionArgument arguments = 9;

// Options to specify behavior for corner cases, or leave behavior
Expand All @@ -1309,7 +1328,8 @@ message Expression {
repeated FunctionOption options = 11;

// Must be set to the return type of the function, exactly as derived
// using the declaration in the extension.
// using the declaration in the extension. In a partially bound expression,
// this may be Type.Unknown if the concrete return type is unresolved.
Type output_type = 7;

// Describes which part of the window function to perform within the
Expand Down Expand Up @@ -1822,6 +1842,9 @@ message AggregateFunction {
// - Optional enum arguments must be bound using FunctionArgument.enum
// followed by either Enum.specified or Enum.unspecified. If specified,
// the string must case-insensitively match one of the allowed options.
//
// In a partially bound expression, Type.Unknown may stand in for any
// concrete type while binding value or type arguments.
repeated FunctionArgument arguments = 7;

// Options to specify behavior for corner cases, or leave behavior
Expand All @@ -1830,7 +1853,8 @@ message AggregateFunction {
repeated FunctionOption options = 8;

// Must be set to the return type of the function, exactly as derived
// using the declaration in the extension.
// using the declaration in the extension. In a partially bound expression,
// this may be Type.Unknown if the concrete return type is unresolved.
Type output_type = 5;

// Describes which part of the aggregation to perform within the context of
Expand Down
13 changes: 13 additions & 0 deletions proto/substrait/type.proto
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ message Type {
Map map = 28;
Func func = 38;

// A placeholder type whose concrete type has not yet been resolved.
// Unknown may be used where a concrete type would normally appear in
// partially bound plans and expressions, and must be resolved to a
// concrete type before execution unless the consumer defines semantics for
// unknown-typed values.
Unknown unknown = 39;

UserDefined user_defined = 30;

// Deprecated in favor of user_defined, which allows nullability and
Expand Down Expand Up @@ -225,6 +232,12 @@ message Type {
Nullability nullability = 3;
}

// A placeholder for a type whose concrete type has not been resolved yet.
// It may appear where a concrete type would normally appear in partially
// bound plans and expressions, and must be resolved to a concrete type before
// execution unless the consumer defines semantics for unknown-typed values.
message Unknown {
// Optional nullability constraint for this unknown type. If unspecified,
// both the concrete type and its nullability are unresolved.
Nullability nullability = 1;
}

message UserDefined {
// References a type_anchor defined in the plan's extension declarations.
uint32 type_reference = 1;
Expand Down
1 change: 1 addition & 0 deletions site/docs/expressions/_config
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
arrange:
- field_references.md
- unbound_expressions.md
- scalar_functions.md
- aggregate_functions.md
- specialized_record_expressions.md
Expand Down
8 changes: 8 additions & 0 deletions site/docs/expressions/extended_expression.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,11 @@ For a message with multiple expressions, users may produce each Extended Express
## Function extensions

Function extensions work the same for both Extended Expression and the original Expression defined in the Substrait protocol.

## Partially Bound Expressions

Extended Expression can also carry partially bound expressions for producers that do not yet know the full input schema. In this form, field-like references can be represented as `NamedExpression`, and any schema field type or expression type that has not yet been resolved can use `unknown`. A consumer must bind these names and types before execution unless it defines its own unresolved-expression semantics.

```protobuf
--8<-- "examples/proto-textformat/extended_expression/unbound_named_projection.textproto"
```
46 changes: 46 additions & 0 deletions site/docs/expressions/unbound_expressions.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Unbound Expressions

Substrait normally represents bound relational expressions: field references are positional, value types are known, and function invocations identify implementations whose argument and return types have been derived. Some producers need to serialize expressions earlier in planning, before names and types have been resolved.

An expression tree is partially bound when it contains either an [`unknown`](../types/type_classes.md#unknown-type) type or a `NamedExpression`. Consumers may validate and transform partially bound expressions, but must resolve them before execution unless they define their own semantics for unresolved names or unknown-typed values.

## Detecting Partial Binding

There is no separate "unbound expression" message. Instead, partially bound state is detected structurally:

- If any expression, function argument, or schema field type is `unknown`, the expression is partially bound.
- If any expression contains `NamedExpression`, the expression is partially bound.

This is the canonical way to distinguish fully bound expressions from partially bound expressions in Substrait.

## Unknown Type

The `unknown` type marks an expression whose concrete type is not known yet. It may be used anywhere a concrete type would normally be expected in a partially bound plan or expression, including function arguments, function output types, and schema fields. If only the nullability is known, set the nullability field; otherwise leave it unspecified.

## Named Expression

`NamedExpression` represents a reference by name instead of ordinal position. The `names` field stores one or more namespace components, such as `["foo"]` for an unqualified name or `["orders", "amount"]` for a qualified name. Until resolved, a named expression's type is `unknown`. Resolution of these components is intentionally external to Substrait and must be understood by both producer and consumer.

```protobuf
--8<-- "examples/proto-textformat/unbound_expression/named_expression.textproto"
```

## Function Example

Partially bound function calls can use named expressions as value arguments and `unknown` as the output type when the return type cannot be derived yet.

```protobuf
--8<-- "examples/proto-textformat/unbound_expression/scalar_function_unknown.textproto"
```

## Extended Expression Protocols

Expression-level APIs, such as filters and projections exchanged outside a full `Plan`, should use `ExtendedExpression`. This lets the producer include output names, any known input names, and extension declarations next to the expression tree.

If the input names are known but their types are not, `base_schema` can contain fields with `unknown` types. If the function overload is also unresolved, the function can refer to the `extension:io.substrait:unknown` extension until a downstream binder replaces it with a concrete function reference and concrete output type. In that case, the referenced function name should use the normal Substrait function-signature form with `unknown` short names, such as `add:unknown_unknown`.

```protobuf
--8<-- "examples/proto-textformat/extended_expression/unbound_named_projection.textproto"
```

Consumers that execute expressions must reject or bind away all `NamedExpression` nodes and `unknown` types before execution unless they explicitly support unresolved semantics. A typical binder resolves `NamedExpression` values to `FieldReference`, replaces `unknown` input and output types with concrete types, and updates unresolved function references to concrete overloads.
3 changes: 3 additions & 0 deletions site/docs/extensions/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,11 @@ A function signature uniquely identifies a function implementation within a sing
| map&lt;K,V&gt; | map |
| func&lt;T-&gt;R&gt;, func&lt;(T1,...,TN)-&gt;R&gt; | func |
| any[\d]? | any |
| unknown | unknown |
| user-defined type &lt;name&gt; | u!&lt;name&gt; |

The `unknown` short type is reserved for partially bound expressions. When a producer does not yet know the concrete overload of a function, it may use a placeholder signature such as `add:unknown_unknown` until a downstream binder resolves the argument and return types.

#### Examples

| Function Signature | Function Name |
Expand Down
7 changes: 7 additions & 0 deletions site/docs/types/type_classes.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ Simple type classes are those that don't support any form of configuration. For
| date | A date within [1000-01-01..9999-12-31]. | `int32` days since `1970-01-01`
| interval_year | Interval year to month. Supports a range of [-10,000..10,000] years with month precision (= [-120,000..120,000] months). Usually stored as separate integers for years and months, but only the total number of months is significant, i.e. `1y 0m` is considered equal to `0y 12m` or `1001y -12000m`. | `int32` years and `int32` months, with the added constraint that each component can never independently specify more than 10,000 years, even if the components have opposite signs (e.g. `-10000y 200000m` is **not** allowed)
| uuid | A universally-unique identifier composed of 128 bits. Typically presented to users in the following hexadecimal format: `c48ffa9e-64f4-44cb-ae47-152b4e60e77b`. Any 128-bit value is allowed, without specific adherence to RFC4122. | 16-byte `binary`
| unknown | A placeholder for a type that has not been resolved yet. `unknown` may be used in partially bound expressions and can stand in for any concrete type during binding. It must be resolved before execution unless the consumer defines semantics for unknown-typed values. | n/a

### Unknown Type

The `unknown` type class is intended for producers that serialize expressions before all schema and catalog information is available. For example, a front end can represent `a + b` with unresolved named operands and an `unknown` result type, then a downstream binder can resolve `a` and `b` to positional field references and concrete types.

`unknown` is not a wildcard type that a fully bound, executable plan may contain. A plan that still contains `unknown` is partially bound.

### Compound Types

Expand Down
9 changes: 9 additions & 0 deletions site/docs/types/type_parsing.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,12 @@ func<(any1, any2, any3) -> any4>
```

Function types use the arrow syntax (`->`) to separate parameter types from the return type. For multiple parameters, use parentheses to group the parameter types. See [Lambda Expressions](../expressions/lambda_expressions.md) for more details on lambda expressions and their usage.

### Unknown Type

The `unknown` type is written as `unknown` and may include the normal nullability marker:

```
unknown
unknown?
```
2 changes: 2 additions & 0 deletions site/docs/types/type_system.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ Refer to [Type Parsing](type_parsing.md) for a description of the syntax used to

!!! note "Note"
Substrait employs a strict type system without any coercion rules. All changes in types must be made explicit via [cast expressions](../expressions/specialized_record_expressions.md).

Partially bound expressions may use the [`unknown`](type_classes.md#unknown-type) type as a placeholder until a downstream binder resolves the concrete type.
1 change: 1 addition & 0 deletions site/examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ By storing examples as separate files instead of inline in markdown, we can easi
```
examples/
├── extensions/ # Extension function examples (e.g., any types)
├── proto-textformat/ # Protobuf text format examples
├── types/ # User-defined type examples
└── README.md # This file
```
Expand Down
2 changes: 2 additions & 0 deletions site/examples/proto-textformat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ We use protobuf text format (textproto) rather than JSON for these examples beca
- `lambda/` - Examples of `Expression.Lambda` messages
- `lambda_invocation/` - Examples of `Expression.LambdaInvocation` messages
- `field_reference/` - Examples of `Expression.FieldReference` messages
- `extended_expression/` - Examples of `ExtendedExpression` messages
- `unbound_expression/` - Examples of unresolved `Expression` messages
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# message ExtendedExpression
# Arrow-style expression protocols should use ExtendedExpression so the message
# carries expression names and extension declarations with the expression tree.
version {
major_number: 0
minor_number: 0
patch_number: 0
producer: "substrait"
}
# Placeholder extension referenced while the concrete function overload is
# still unresolved.
extension_urns {
extension_urn_anchor: 1
urn: "extension:io.substrait:unknown"
}
extensions {
extension_function {
extension_urn_reference: 1
function_anchor: 1
# Placeholder signature: both arguments use the `unknown` short type name.
name: "add:unknown_unknown"
}
}
# The input names are known, but their types have not been resolved yet.
base_schema {
names: "a"
names: "b"
struct {
types {
unknown {}
}
types {
unknown {}
}
nullability: NULLABILITY_REQUIRED
}
}
referred_expr {
expression {
scalar_function {
# Refers to function_anchor 1 declared in `extensions` above.
function_reference: 1
arguments {
value {
# References input "a" by name instead of by ordinal position.
named_expression {
names: "a"
}
}
}
arguments {
value {
# References input "b" by name instead of by ordinal position.
named_expression {
names: "b"
}
}
}
# The concrete return type is unresolved in this partially bound expression.
output_type {
unknown {}
}
}
}
output_names: "sum"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# message Expression
# An unresolved reference to a named expression, such as a column name entered
# before a schema has been bound.
named_expression {
# A single component is an unqualified name; a qualified name lists each
# namespace component in order, e.g. "orders" followed by "amount".
names: "foo"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# message Expression
# A partially bound scalar function over unresolved named expressions. The
# function_reference is still plan-local and must be declared by the enclosing
# Plan or ExtendedExpression. The unknown output type records that concrete
# type derivation has not completed.
scalar_function {
function_reference: 1
arguments {
value {
# First operand, referenced by name instead of by ordinal position.
named_expression {
names: "a"
}
}
}
arguments {
value {
# Second operand, referenced by name instead of by ordinal position.
named_expression {
names: "b"
}
}
}
# Placeholder until the return type can be derived.
output_type {
unknown {}
}
}
Loading