diff --git a/gnovm/adr/pr5807_string_conversion_unicode_range.md b/gnovm/adr/pr5807_string_conversion_unicode_range.md new file mode 100644 index 00000000000..c9bdb891b39 --- /dev/null +++ b/gnovm/adr/pr5807_string_conversion_unicode_range.md @@ -0,0 +1,68 @@ +# ADR: Respect the Unicode code point range in integer-to-string conversions + +## Status + +Proposed (AI-assisted fix; found via differential testing against the Go +toolchain). + +## Context + +Per the Go spec ("Conversions to and from a string type"), converting an +integer value to a string yields the UTF-8 representation of the code point, +and "values outside the range of valid Unicode code points are converted to +`�`". + +GnoVM implemented the conversion for the 64-bit-capable kinds as +`string(rune(tv.GetInt64()))` (and likewise for `int`, `uint`, `uint64`) in +`values_conversions.go`. The `rune(...)` wrapper narrows the value to int32 +*before* Go's own `string(rune)` range check runs, so an out-of-range 64-bit +value can alias onto a valid code point instead of yielding `�`: + +- `string(uint64(0x10001F600))` returned `"😀"` (Go: `"�"`) +- `string(int(-4294967231))` returned `"A"` (Go: `"�"`) +- `string(uint64(0x100000000))` returned `"\x00"` (Go: `"�"`) + +The divergence is deterministic but silently wrong, and it affects both the +runtime path and constant evaluation (both flow through `ConvertTo`). Note +that Go accepts out-of-range *constants* in this conversion too — +`string(0x100000041)` compiles in Go and yields `"�"` — so rejecting at +preprocess time would not match Go either; producing `"�"` is correct in +both paths. + +## Decision + +Add two helpers, `runeStrFromInt64` and `runeStrFromUint64`, that check +whether the value fits in an int32 before converting, and return +`string(utf8.RuneError)` when it does not. Once a value fits in int32, Go's +native `string(rune)` conversion already maps every invalid case (negative +values, surrogate halves, values above 0x10FFFF) to `�`, so only the +truncation hole needs plugging. The helpers are used at the four +64-bit-capable conversion sites (`IntKind`, `Int64Kind`, `UintKind`, +`Uint64Kind`). + +The remaining integer kinds are intentionally unchanged: + +- `Int32Kind` converts via `string(tv.GetInt32())` with no narrowing. +- `Int8/Int16/Uint8/Uint16` cannot exceed the int32 range. +- `Uint32Kind` uses `string(rune(tv.GetUint32()))`; values above `MaxInt32` + reinterpret as negative runes and values in `(0x10FFFF, MaxInt32]` exceed + the code point range, so both classes already map to `�` — provably + equivalent to the spec behavior for every uint32 input. + +## Alternatives considered + +- Range-checking against `0..0x10FFFF` directly in each case arm: equivalent + behavior, but duplicates the bound logic at four sites and re-implements + what `string(rune)` already does for the in-int32 cases. +- Rejecting out-of-range constant conversions at preprocess time: would + diverge from Go, which accepts them and produces `"�"`. + +## Consequences + +- `string(x)` conversions match Go for all integer kinds and all values, in + both runtime and constant evaluation paths. +- Programs that (incorrectly) relied on the truncating behavior change + output; such programs were already non-portable Go. +- Covered by `gnovm/tests/files/str_conv_overflow.gno` (boundary table + including truncation-aliasing values, in-range values, and in-int32 invalid + values). diff --git a/gnovm/pkg/gnolang/values_conversions.go b/gnovm/pkg/gnolang/values_conversions.go index d0b0629bb72..8dcb67623c5 100644 --- a/gnovm/pkg/gnolang/values_conversions.go +++ b/gnovm/pkg/gnolang/values_conversions.go @@ -10,6 +10,27 @@ import ( "github.com/gnolang/gno/gnovm/pkg/gnolang/internal/softfloat" ) +// runeStrFromInt64 converts an integer value to the string containing the +// UTF-8 representation of the code point, per the Go spec: values that do not +// fit in an int32 (rune) are necessarily outside the range of valid Unicode +// code points and convert to "�". Values that do fit in an int32 are +// handled by Go's native string(rune) conversion, which already maps +// negatives, surrogates, and values > 0x10FFFF to "�". +func runeStrFromInt64(v int64) string { + if v != int64(rune(v)) { + return string(utf8.RuneError) + } + return string(rune(v)) +} + +// runeStrFromUint64 is like runeStrFromInt64 for unsigned values. +func runeStrFromUint64(v uint64) string { + if v > math.MaxInt32 { + return string(utf8.RuneError) + } + return string(rune(v)) +} + // t cannot be nil or untyped or DataByteType. // the conversion is forced and overflow/underflow is ignored. // TODO: return error, and let caller also print the file and line. @@ -131,7 +152,7 @@ func ConvertTo(alloc *Allocator, store Store, tv *TypedValue, t Type, isConst bo tv.T = t tv.SetFloat64(x) case StringKind: - tv.V = alloc.NewString(string(rune(tv.GetInt()))) + tv.V = alloc.NewString(runeStrFromInt64(tv.GetInt())) tv.T = t tv.ClearNum() default: @@ -419,7 +440,7 @@ func ConvertTo(alloc *Allocator, store Store, tv *TypedValue, t Type, isConst bo tv.T = t tv.SetFloat64(x) case StringKind: - tv.V = alloc.NewString(string(rune(tv.GetInt64()))) + tv.V = alloc.NewString(runeStrFromInt64(tv.GetInt64())) tv.T = t tv.ClearNum() default: @@ -494,7 +515,7 @@ func ConvertTo(alloc *Allocator, store Store, tv *TypedValue, t Type, isConst bo tv.T = t tv.SetFloat64(x) case StringKind: - tv.V = alloc.NewString(string(rune(tv.GetUint()))) + tv.V = alloc.NewString(runeStrFromUint64(tv.GetUint())) tv.T = t tv.ClearNum() default: @@ -778,7 +799,7 @@ func ConvertTo(alloc *Allocator, store Store, tv *TypedValue, t Type, isConst bo tv.T = t tv.SetFloat64(x) case StringKind: - tv.V = alloc.NewString(string(rune(tv.GetUint64()))) + tv.V = alloc.NewString(runeStrFromUint64(tv.GetUint64())) tv.T = t tv.ClearNum() default: diff --git a/gnovm/tests/files/str_conv_overflow.gno b/gnovm/tests/files/str_conv_overflow.gno new file mode 100644 index 00000000000..dba40fcf05c --- /dev/null +++ b/gnovm/tests/files/str_conv_overflow.gno @@ -0,0 +1,42 @@ +package main + +// Per the Go spec, converting an integer to string yields the UTF-8 of the +// code point; values outside the valid Unicode code point range yield "�". +// Values that do not fit in an int32 must NOT be truncated first. +func main() { + // 64-bit values that would wrap to valid code points if truncated to int32. + println(string(0x100000041)) // NOT "A" + println(string(-4294967231)) // NOT "A" (== -(1<<32) + 65) + println(string(0x10001F600)) // NOT an emoji + var i64 int64 = 0x100000041 + println(string(i64)) + var i int = 0x100000041 + println(string(i)) + var u64 uint64 = 0x100000041 + println(string(u64)) + var u uint = 18446744073709551615 + println(string(u)) + // in-range values still convert normally. + println(string(rune(65)), string(int64(0x1F600)), string(uint64(65))) + // invalid in-range values yield "�". + println(string(-1), string(0xD800), string(0x110000)) + // uint32 uses the unchanged conversion path: values > MaxInt32 + // reinterpret as negative runes and (0x10FFFF, MaxInt32] is out of + // range, so every invalid uint32 already yields "�". + println(string(uint32(0x80000041)), string(uint32(0x7FFFFFFF))) + // untyped rune constants fold through the same conversion. + println(string('A' + 0x100000000)) +} + +// Output: +// � +// � +// � +// � +// � +// � +// � +// A 😀 A +// � � � +// � � +// �