Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/pages/regex.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ them:

* `(?i)` starts case-insensitive matching.
* `(?I)` starts case-sensitive matching (default).
* `(?c)` starts smart-case matching: the part of the pattern that follows
is matched case-insensitively unless it contains an uppercase character,
in which case it is matched case-sensitively.
* `(?s)` allows `.` to match newlines (default).
* `(?S)` prevents `.` from matching newlines.

Expand Down
129 changes: 113 additions & 16 deletions src/regex_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ struct RegexParser
None = 0,
IgnoreCase = 1 << 0,
DotMatchesNewLine = 1 << 1,
SmartCase = 1 << 2,
};
friend constexpr bool with_bit_ops(Meta::Type<Flags>) { return true; }

Expand Down Expand Up @@ -206,10 +207,20 @@ struct RegexParser
auto m = *it++;
switch (m)
{
case 'i': m_flags |= Flags::IgnoreCase; break;
case 'I': m_flags &= ~Flags::IgnoreCase; break;
case 'i':
m_flags |= Flags::IgnoreCase;
m_flags &= ~Flags::SmartCase;
break;
case 'I':
m_flags &= ~Flags::IgnoreCase;
m_flags &= ~Flags::SmartCase;
break;
case 's': m_flags |= Flags::DotMatchesNewLine; break;
case 'S': m_flags &= ~Flags::DotMatchesNewLine; break;
case 'c':
m_flags |= Flags::SmartCase | Flags::IgnoreCase;
m_smartcase_start_index = m_parsed_regex.nodes.size();
break;
case ')':
m_pos = Iterator{it, m_regex};
return true;
Expand Down Expand Up @@ -434,7 +445,6 @@ struct RegexParser
{
CharacterClass character_class;

character_class.ignore_case = (m_flags & Flags::IgnoreCase);
character_class.negative = m_pos != m_regex.end() and *m_pos == '^';
if (character_class.negative)
++m_pos;
Expand Down Expand Up @@ -505,15 +515,6 @@ struct RegexParser
parse_error("unclosed character class");
++m_pos;

if (character_class.ignore_case)
{
for (auto& range : character_class.ranges)
{
range.min = to_lower(range.min);
range.max = to_lower(range.max);
}
}

normalize_ranges(character_class.ranges);

// Optimize the relatively common case of using a character class to
Expand Down Expand Up @@ -592,6 +593,28 @@ struct RegexParser
if (res == max_nodes)
parse_error(format("regex parsed to more than {} ast nodes", max_nodes));
const NodeIndex next = res+1;

if (m_flags & Flags::SmartCase)
{
bool ignore_case = true;
if (op == ParsedRegex::Literal and is_upper(value))
ignore_case = false;
else if (op == ParsedRegex::CharClass)
for (auto& [min, max] : m_parsed_regex.character_classes[value].ranges)
if (is_upper(min) or is_upper(max))
{
ignore_case = false;
break;
}
if (not ignore_case)
{
for (auto i = m_smartcase_start_index; i < m_parsed_regex.nodes.size(); ++i)
m_parsed_regex.nodes[i].ignore_case = false;
m_flags &= ~(Flags::SmartCase | Flags::IgnoreCase);
m_smartcase_start_index = -1;
}
}

m_parsed_regex.nodes.push_back({op, m_flags & Flags::IgnoreCase, next, value, quantifier});
return res;
}
Expand Down Expand Up @@ -636,6 +659,8 @@ struct RegexParser

Flags m_flags = Flags::DotMatchesNewLine;

ParsedRegex::NodeIndex m_smartcase_start_index = -1;

static constexpr struct CharacterClassEscape {
Codepoint cp;
CharacterType ctype;
Expand Down Expand Up @@ -729,7 +754,7 @@ struct RegexCompiler
push_inst(CompiledRegex::AnyCharExceptNewLine);
break;
case ParsedRegex::CharClass:
push_inst(CompiledRegex::CharClass, {.character_class_index=int16_t(node.value)});
push_inst(CompiledRegex::CharClass, {.character_class={.index=int16_t(node.value), .ignore_case=ignore_case}});
break;
case ParsedRegex::CharType:
push_inst(CompiledRegex::CharType, {.character_type=CharacterType{(unsigned char)node.value}});
Expand Down Expand Up @@ -955,7 +980,7 @@ struct RegexCompiler
auto& character_class = m_parsed_regex.character_classes[node.value];
if (character_class.ctypes == CharacterType::None and
not character_class.negative and
not character_class.ignore_case)
not node.ignore_case)
{
for (auto& range : character_class.ranges)
{
Expand All @@ -970,7 +995,7 @@ struct RegexCompiler
{
for (Codepoint cp = 0; cp < single_byte_limit; ++cp)
{
if (start_desc.map[cp] or character_class.matches(cp))
if (start_desc.map[cp] or character_class.matches(cp, node.ignore_case))
start_desc.map[cp] = true;
}
}
Expand Down Expand Up @@ -1108,7 +1133,7 @@ String dump_regex(const CompiledRegex& program)
res += "anything but newline\n";
break;
case CompiledRegex::CharClass:
res += format("character class {}\n", inst.param.character_class_index);
res += format("character class {}\n", inst.param.character_class.index);
break;
case CompiledRegex::CharType:
res += format("character type {}\n", to_underlying(inst.param.character_type));
Expand Down Expand Up @@ -1408,6 +1433,78 @@ auto test_regex = UnitTest{[]{
kak_assert(vm.exec("ABC"));
}

{
// Case-insensitive class whose range starts on a non-letter: '@'..'C'
// holds '@', 'A', 'B' and 'C', so lowercase a/b/c have to match through
// case-insensitivity, while '_' stays outside the class.
TestVM<> vm{R"((?i)[@-C]+)"};
kak_assert(vm.exec("aBc"));
kak_assert(not vm.exec("aBc_"));
}

{
// '@'..'_' spans the ASCII uppercase letters but none of the lowercase
// ones, so "a" can only match if the tested codepoint is compared in
// both cases instead of the range bounds being lowercased.
TestVM<> vm{R"((?i)[@-_])"};
kak_assert(vm.exec("A"));
kak_assert(vm.exec("a"));
}

{
TestVM<> vm{R"((?c)foobar)"};
kak_assert(vm.exec("foobar"));
kak_assert(vm.exec("fooBar"));
kak_assert(vm.exec("FOOBAR"));
}

{
TestVM<> vm{R"((?c)fooBar)"};
kak_assert(not vm.exec("foobar"));
kak_assert(vm.exec("fooBar"));
kak_assert(not vm.exec("FOOBAR"));
}

{
TestVM<> vm{R"((?c)foo\x42ar)"};
kak_assert(not vm.exec("foobar"));
kak_assert(vm.exec("fooBar"));
kak_assert(not vm.exec("FOOBAR"));
}

{
TestVM<> vm{R"((?c)foo[B]ar)"};
kak_assert(not vm.exec("foobar"));
kak_assert(vm.exec("fooBar"));
kak_assert(not vm.exec("FOOBAR"));
}

{
TestVM<> vm{R"((?c)foo[\x42]ar)"};
kak_assert(not vm.exec("foobar"));
kak_assert(vm.exec("fooBar"));
kak_assert(not vm.exec("FOOBAR"));
}

{
TestVM<> vm{R"((?c)foo[a-cQ]ar)"};
kak_assert(vm.exec("foobar"));
kak_assert(not vm.exec("fooBar"));
kak_assert(not vm.exec("FOObAR"));
}

{
TestVM<> vm{R"((?c)foo(?i)BAR(?c)baZ)"};
kak_assert(vm.exec("FooBarbaZ"));
kak_assert(not vm.exec("FooBarbaz"));
}

{
TestVM<> vm{R"([^\]]+)"};
kak_assert(not vm.exec("a]c"));
Expand Down
30 changes: 19 additions & 11 deletions src/regex_impl.hh
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,25 @@ struct CharacterClass
Vector<Range, MemoryDomain::Regex> ranges;
CharacterType ctypes = CharacterType::None;
bool negative = false;
bool ignore_case = false;

friend bool operator==(const CharacterClass&, const CharacterClass&) = default;

bool matches(Codepoint cp) const
bool matches(Codepoint cp, bool ignore_case) const
{
Codepoint cp_flip = -1;
if (ignore_case)
cp = to_lower(cp);
{
if (is_upper(cp))
cp_flip = to_lower(cp);
else if (is_lower(cp))
cp_flip = to_upper(cp);
}

for (auto& [min, max] : ranges)
for (const auto& range : ranges)
{
if (cp < min)
break;
else if (cp <= max)
return not negative;
if ((cp >= range.min && cp <= range.max) ||
(cp_flip != -1 && cp_flip >= range.min && cp_flip <= range.max))
return !negative;
}

return (ctypes != CharacterType::None and is_ctype(ctypes, cp)) != negative;
Expand Down Expand Up @@ -105,7 +109,11 @@ struct CompiledRegex : UseMemoryDomain<MemoryDomain::Regex>
uint32_t codepoint : 24;
bool ignore_case : 1;
} literal;
int16_t character_class_index;
// Parameter carried by a CharClass instruction: identifies the character
// class to test against and whether that test should ignore case.
struct CharacterClass
{
// Position of the class in the program's character_classes table.
int16_t index;
// Forwarded as the ignore_case argument of CharacterClass::matches.
bool ignore_case : 1;
} character_class;
CharacterType character_type;
int16_t jump_offset;
int16_t save_index;
Expand Down Expand Up @@ -405,7 +413,7 @@ private:
return failed();
case CompiledRegex::CharClass:
if (pos != config.end and
m_program.character_classes[inst.param.character_class_index].matches(cp))
m_program.character_classes[inst.param.character_class.index].matches(cp, inst.param.character_class.ignore_case))
return consumed();
return failed();
case CompiledRegex::CharType:
Expand Down Expand Up @@ -609,7 +617,7 @@ private:
else if (op >= Lookaround::CharacterClass and op < Lookaround::CharacterType)
{
auto index = to_underlying(op) - to_underlying(Lookaround::CharacterClass);
if (not m_program.character_classes[index].matches(cp))
if (not m_program.character_classes[index].matches(cp, param.ignore_case))
return false;
}
else if (op >= Lookaround::CharacterType and op < Lookaround::OpEnd)
Expand Down