From 096d52c60612f5212d3b7f1f422b7cc3ec4e7936 Mon Sep 17 00:00:00 2001 From: James Prior Date: Mon, 6 Apr 2026 18:54:39 +0100 Subject: [PATCH] An alternative template tokenizer implementation --- lib/liquid/parse_context.rb | 1 - lib/liquid/tokenizer.rb | 170 +++++++++++++------------------- performance/theme_runner.rb | 2 - test/unit/tag_unit_test.rb | 5 +- test/unit/template_unit_test.rb | 2 +- 5 files changed, 68 insertions(+), 112 deletions(-) diff --git a/lib/liquid/parse_context.rb b/lib/liquid/parse_context.rb index 855acc64e..9ce5832b3 100644 --- a/lib/liquid/parse_context.rb +++ b/lib/liquid/parse_context.rb @@ -44,7 +44,6 @@ def new_parser(input) def new_tokenizer(source, start_line_number: nil, for_liquid_tag: false) Tokenizer.new( source: source, - string_scanner: @string_scanner, line_number: start_line_number, for_liquid_tag: for_liquid_tag, ) diff --git a/lib/liquid/tokenizer.rb b/lib/liquid/tokenizer.rb index 8b331d93c..f455705c5 100644 --- a/lib/liquid/tokenizer.rb +++ b/lib/liquid/tokenizer.rb @@ -1,22 +1,11 @@ # frozen_string_literal: true -require "strscan" - module Liquid class Tokenizer attr_reader :line_number, :for_liquid_tag - TAG_END = /%\}/ - TAG_OR_VARIABLE_START = /\{[\{\%]/ - NEWLINE = /\n/ - - OPEN_CURLEY = "{".ord - CLOSE_CURLEY = "}".ord - PERCENTAGE = "%".ord - def initialize( source:, - string_scanner:, line_numbers: false, line_number: nil, for_liquid_tag: false @@ -28,8 +17,6 @@ def initialize( @tokens = [] if @source - @ss = string_scanner - @ss.string = @source tokenize end end @@ -51,111 +38,86 @@ def shift private def tokenize - if @for_liquid_tag - @tokens = @source.split("\n") + @tokens = if @for_liquid_tag + @source.split("\n") else - @tokens << shift_normal until @ss.eos? + scan(@source) end @source = nil - @ss = nil - end - - def shift_normal - token = next_token - - return unless token - - token - end - - def next_token - # possible states: :text, :tag, :variable - byte_a = @ss.peek_byte - - if byte_a == OPEN_CURLEY - @ss.scan_byte - - byte_b = @ss.peek_byte - - if byte_b == PERCENTAGE - @ss.scan_byte - return next_tag_token - elsif byte_b == OPEN_CURLEY - @ss.scan_byte - return next_variable_token - end - - @ss.pos -= 1 - end - - next_text_token end - def next_text_token - start = @ss.pos - - unless @ss.skip_until(TAG_OR_VARIABLE_START) - token = @ss.rest - @ss.terminate - return token - end - - pos = @ss.pos -= 2 - @source.byteslice(start, pos - start) - rescue ::ArgumentError => e - if e.message == "invalid byte sequence in #{@ss.string.encoding}" - raise SyntaxError, "Invalid byte sequence in #{@ss.string.encoding}" - else - raise - end - end - - def next_variable_token - start = @ss.pos - 2 - - byte_a = byte_b = @ss.scan_byte + # @param source [String] + # @return [Array] + def scan(source) + raise SyntaxError, "Invalid byte sequence in #{source.encoding}" unless source.valid_encoding? + + tokens = [] # : Array[String] + pos = 0 + eos = source.bytesize + + # rubocop:disable Metrics/BlockNesting + while pos < eos + byte = source.getbyte(pos) + next_byte = source.getbyte(pos + 1) + + if byte == 123 && next_byte == 123 # {{ + if (index = source.byteindex("}", pos + 2)) + if source.getbyte(index + 1) == 125 # }} + tokens << source.byteslice(pos, index + 2 - pos) + pos = index + 2 + else # } or %} + tokens << source.byteslice(pos, index + 1 - pos) + pos = index + 1 + end + else + tokens << "{{" + pos += 2 + end + elsif byte == 123 && next_byte == 37 # {% + if (index = source.byteindex("%}", pos + 2)) + tokens << source.byteslice(pos, index + 2 - pos) + pos = index + 2 + else + tokens << "{%" + pos += 2 + end + else + # Not markup. Scan until but not including {{ or {% + index = source.byteindex("{", pos) + + unless index + # No more markup. Scan until end of string. + tokens << source.byteslice(pos, eos - pos) + break + end - while byte_b - byte_a = @ss.scan_byte while byte_a && byte_a != CLOSE_CURLEY && byte_a != OPEN_CURLEY + next_byte = source.getbyte(index + 1) - break unless byte_a + while next_byte != 37 && next_byte != 123 + index = source.byteindex("{", index + 1) + break unless index - if @ss.eos? - return byte_a == CLOSE_CURLEY ? @source.byteslice(start, @ss.pos - start) : "{{" - end - - byte_b = @ss.scan_byte + next_byte = source.getbyte(index + 1) + unless next_byte + index = nil + break + end + end - if byte_a == CLOSE_CURLEY - if byte_b == CLOSE_CURLEY - return @source.byteslice(start, @ss.pos - start) - elsif byte_b != CLOSE_CURLEY - @ss.pos -= 1 - return @source.byteslice(start, @ss.pos - start) + if index + tokens << source.byteslice(pos, index - pos) + pos = index + else + # No more markup. Scan until end of string. + tokens << source.byteslice(pos, eos - pos) + break end - elsif byte_a == OPEN_CURLEY && byte_b == PERCENTAGE - return next_tag_token_with_start(start) end - - byte_a = byte_b end + # rubocop:enable Metrics/BlockNesting - "{{" - end - - def next_tag_token - start = @ss.pos - 2 - if (len = @ss.skip_until(TAG_END)) - @source.byteslice(start, len + 2) - else - "{%" - end - end - - def next_tag_token_with_start(start) - @ss.skip_until(TAG_END) - @source.byteslice(start, @ss.pos - start) + tokens end end end diff --git a/performance/theme_runner.rb b/performance/theme_runner.rb index 469503670..d0efb3896 100644 --- a/performance/theme_runner.rb +++ b/performance/theme_runner.rb @@ -50,11 +50,9 @@ def compile # `tokenize` will just test the tokenizen portion of liquid without any templates def tokenize - ss = StringScanner.new("") @tests.each do |test_hash| tokenizer = Liquid::Tokenizer.new( source: test_hash[:liquid], - string_scanner: ss, line_numbers: true, ) while tokenizer.shift; end diff --git a/test/unit/tag_unit_test.rb b/test/unit/tag_unit_test.rb index 9b4bf88b6..c5050fa3a 100644 --- a/test/unit/tag_unit_test.rb +++ b/test/unit/tag_unit_test.rb @@ -33,9 +33,6 @@ def test_tag_render_to_output_buffer_nil_value private def new_tokenizer - Tokenizer.new( - source: "", - string_scanner: StringScanner.new(""), - ) + Tokenizer.new(source: "") end end diff --git a/test/unit/template_unit_test.rb b/test/unit/template_unit_test.rb index 1f349657a..c1650a04d 100644 --- a/test/unit/template_unit_test.rb +++ b/test/unit/template_unit_test.rb @@ -39,7 +39,7 @@ def test_template_inheritance def test_invalid_utf8 input = "\xff\x00" error = assert_raises(SyntaxError) do - Liquid::Tokenizer.new(source: input, string_scanner: StringScanner.new(input)) + Liquid::Tokenizer.new(source: input) end assert_equal( 'Liquid syntax error: Invalid byte sequence in UTF-8',