diff --git a/doc/pages/regex.asciidoc b/doc/pages/regex.asciidoc index 416efd1f04..188502a0fb 100644 --- a/doc/pages/regex.asciidoc +++ b/doc/pages/regex.asciidoc @@ -165,6 +165,8 @@ them: * `(?i)` starts case-insensitive matching. * `(?I)` starts case-sensitive matching (default). +* `(?c)` starts smart-case matching - same as case-insensitive matching + unless the pattern contains an uppercase character. * `(?s)` allows `.` to match newlines (default). * `(?S)` prevents `.` from matching newlines. diff --git a/src/regex_impl.cc b/src/regex_impl.cc index dd90012556..881993576d 100644 --- a/src/regex_impl.cc +++ b/src/regex_impl.cc @@ -151,6 +151,7 @@ struct RegexParser None = 0, IgnoreCase = 1 << 0, DotMatchesNewLine = 1 << 1, + SmartCase = 1 << 2, }; friend constexpr bool with_bit_ops(Meta::Type) { return true; } @@ -206,10 +207,20 @@ struct RegexParser auto m = *it++; switch (m) { - case 'i': m_flags |= Flags::IgnoreCase; break; - case 'I': m_flags &= ~Flags::IgnoreCase; break; + case 'i': + m_flags |= Flags::IgnoreCase; + m_flags &= ~Flags::SmartCase; + break; + case 'I': + m_flags &= ~Flags::IgnoreCase; + m_flags &= ~Flags::SmartCase; + break; case 's': m_flags |= Flags::DotMatchesNewLine; break; case 'S': m_flags &= ~Flags::DotMatchesNewLine; break; + case 'c': + m_flags |= Flags::SmartCase | Flags::IgnoreCase; + m_smartcase_start_index = m_parsed_regex.nodes.size(); + break; case ')': m_pos = Iterator{it, m_regex}; return true; @@ -434,7 +445,6 @@ struct RegexParser { CharacterClass character_class; - character_class.ignore_case = (m_flags & Flags::IgnoreCase); character_class.negative = m_pos != m_regex.end() and *m_pos == '^'; if (character_class.negative) ++m_pos; @@ -505,15 +515,6 @@ struct RegexParser parse_error("unclosed character class"); ++m_pos; - if (character_class.ignore_case) - { - for (auto& range : character_class.ranges) - { - range.min = to_lower(range.min); - range.max = to_lower(range.max); - } - } - normalize_ranges(character_class.ranges); // Optimize the relatively common case of using a character class to @@ -592,6 +593,28 @@ struct RegexParser if (res == max_nodes) parse_error(format("regex parsed to more than {} ast nodes", max_nodes)); const NodeIndex next = res+1; + + if (m_flags & Flags::SmartCase) + { + bool ignore_case = true; + if (op == ParsedRegex::Literal and is_upper(value)) + ignore_case = false; + else if (op == ParsedRegex::CharClass) + for (auto& [min, max] : m_parsed_regex.character_classes[value].ranges) + if (is_upper(min) or is_upper(max)) + { + ignore_case = false; + break; + } + if (not ignore_case) + { + for (auto i = m_smartcase_start_index; i < m_parsed_regex.nodes.size(); ++i) + m_parsed_regex.nodes[i].ignore_case = false; + m_flags &= ~(Flags::SmartCase | Flags::IgnoreCase); + m_smartcase_start_index = -1; + } + } + m_parsed_regex.nodes.push_back({op, m_flags & Flags::IgnoreCase, next, value, quantifier}); return res; } @@ -636,6 +659,8 @@ struct RegexParser Flags m_flags = Flags::DotMatchesNewLine; + ParsedRegex::NodeIndex m_smartcase_start_index = -1; + static constexpr struct CharacterClassEscape { Codepoint cp; CharacterType ctype; @@ -729,7 +754,7 @@ struct RegexCompiler push_inst(CompiledRegex::AnyCharExceptNewLine); break; case ParsedRegex::CharClass: - push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)}); + push_inst(CompiledRegex::CharClass, {.character_class={.index=int16_t(node.value), .ignore_case=ignore_case}}); break; case ParsedRegex::CharType: push_inst(CompiledRegex::CharType, {.character_type=CharacterType{(unsigned char)node.value}}); @@ -955,7 +980,7 @@ struct RegexCompiler auto& character_class = m_parsed_regex.character_classes[node.value]; if (character_class.ctypes == CharacterType::None and not character_class.negative and - not character_class.ignore_case) + not node.ignore_case) { for (auto& range : character_class.ranges) { @@ -970,7 +995,7 @@ struct RegexCompiler { for (Codepoint cp = 0; cp < single_byte_limit; ++cp) { - if (start_desc.map[cp] or character_class.matches(cp)) + if (start_desc.map[cp] or character_class.matches(cp, node.ignore_case)) start_desc.map[cp] = true; } } @@ -1108,7 +1133,7 @@ String dump_regex(const CompiledRegex& program) res += "anything but newline\n"; break; case CompiledRegex::CharClass: - res += format("character class {}\n", inst.param.character_class_index); + res += format("character class {}\n", inst.param.character_class.index); break; case CompiledRegex::CharType: res += format("character type {}\n", to_underlying(inst.param.character_type)); @@ -1408,6 +1433,78 @@ auto test_regex = UnitTest{[]{ kak_assert(vm.exec("ABC")); } + { + TestVM<> vm{R"((?i)[@-C]+)"}; + kak_assert(vm.exec("aBc")); + kak_assert(not vm.exec("aBc_")); + } + + { + TestVM<> vm{R"((?i)[@-_])"}; + kak_assert(vm.exec("A")); + kak_assert(vm.exec("a")); + } + + { + TestVM<> vm{R"((?i)[@-C]+)"}; + kak_assert(vm.exec("aBc")); + kak_assert(not vm.exec("aBc_")); + } + + { + TestVM<> vm{R"((?i)[@-_])"}; + kak_assert(vm.exec("A")); + kak_assert(vm.exec("a")); + } + + { + TestVM<> vm{R"((?c)foobar)"}; + kak_assert(vm.exec("foobar")); + kak_assert(vm.exec("fooBar")); + kak_assert(vm.exec("FOOBAR")); + } + + { + TestVM<> vm{R"((?c)fooBar)"}; + kak_assert(not vm.exec("foobar")); + kak_assert(vm.exec("fooBar")); + kak_assert(not vm.exec("FOOBAR")); + } + + { + TestVM<> vm{R"((?c)foo\x42ar)"}; + kak_assert(not vm.exec("foobar")); + kak_assert(vm.exec("fooBar")); + kak_assert(not vm.exec("FOOBAR")); + } + + { + TestVM<> vm{R"((?c)foo[B]ar)"}; + kak_assert(not vm.exec("foobar")); + kak_assert(vm.exec("fooBar")); + kak_assert(not vm.exec("FOOBAR")); + } + + { + TestVM<> vm{R"((?c)foo[\x42]ar)"}; + kak_assert(not vm.exec("foobar")); + kak_assert(vm.exec("fooBar")); + kak_assert(not vm.exec("FOOBAR")); + } + + { + TestVM<> vm{R"((?c)foo[a-cQ]ar)"}; + kak_assert(vm.exec("foobar")); + kak_assert(not vm.exec("fooBar")); + kak_assert(not vm.exec("FOObAR")); + } + + { + TestVM<> vm{R"((?c)foo(?i)BAR(?c)baZ)"}; + kak_assert(vm.exec("FooBarbaZ")); + kak_assert(not vm.exec("FooBarbaz")); + } + { TestVM<> vm{R"([^\]]+)"}; kak_assert(not vm.exec("a]c")); diff --git a/src/regex_impl.hh b/src/regex_impl.hh index b04d99e701..7b44294bdc 100644 --- a/src/regex_impl.hh +++ b/src/regex_impl.hh @@ -46,21 +46,25 @@ struct CharacterClass Vector ranges; CharacterType ctypes = CharacterType::None; bool negative = false; - bool ignore_case = false; friend bool operator==(const CharacterClass&, const CharacterClass&) = default; - bool matches(Codepoint cp) const + bool matches(Codepoint cp, bool ignore_case) const { + Codepoint cp_flip = -1; if (ignore_case) - cp = to_lower(cp); + { + if (is_upper(cp)) + cp_flip = to_lower(cp); + else if (is_lower(cp)) + cp_flip = to_upper(cp); + } - for (auto& [min, max] : ranges) + for (const auto& range : ranges) { - if (cp < min) - break; - else if (cp <= max) - return not negative; + if ((cp >= range.min && cp <= range.max) || + (cp_flip != -1 && cp_flip >= range.min && cp_flip <= range.max)) + return !negative; } return (ctypes != CharacterType::None and is_ctype(ctypes, cp)) != negative; @@ -105,7 +109,11 @@ struct CompiledRegex : UseMemoryDomain uint32_t codepoint : 24; bool ignore_case : 1; } literal; - int16_t character_class_index; + struct CharacterClass + { + int16_t index; + bool ignore_case : 1; + } character_class; CharacterType character_type; int16_t jump_offset; int16_t save_index; @@ -405,7 +413,7 @@ private: return failed(); case CompiledRegex::CharClass: if (pos != config.end and - m_program.character_classes[inst.param.character_class_index].matches(cp)) + m_program.character_classes[inst.param.character_class.index].matches(cp, inst.param.character_class.ignore_case)) return consumed(); return failed(); case CompiledRegex::CharType: @@ -609,7 +617,7 @@ private: else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType) { auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass); - if (not m_program.character_classes[index].matches(cp)) + if (not m_program.character_classes[index].matches(cp, param.ignore_case)) return false; } else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)