Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 30 additions & 6 deletions moo.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
/**
 * Escape every RegExp metacharacter in `s` so the result matches `s` literally.
 *
 * `-` is emitted as `\x2d` instead of `\-`: the identity escape `\-` is only
 * legal inside a character class when the /u flag is set, so `\-` would make
 * `new RegExp(..., 'u')` throw "Invalid escape". `\x2d` is accepted with or
 * without /u, both inside and outside a character class.
 *
 * @param {string} s - literal text
 * @returns {string} RegExp source matching exactly `s`
 */
function reEscape(s) {
  return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, function (ch) {
    return ch === '-' ? '\\x2d' : '\\' + ch
  })
}
function reGroups(s) {
var re = new RegExp('|' + s)
function reGroups(s, flags = '') {
var re = new RegExp('|' + s, flags)
return re.exec('').length - 1
}
function reCapture(s) {
Expand All @@ -41,19 +41,36 @@
return '(?:' + reEscape(obj) + ')'

} else if (isRegExp(obj)) {
// TODO: consider /u support
if (obj.ignoreCase) throw new Error('RegExp /i flag not allowed')
if (obj.global) throw new Error('RegExp /g flag is implied')
if (obj.sticky) throw new Error('RegExp /y flag is implied')
if (obj.multiline) throw new Error('RegExp /m flag is implied')
if (obj.unicode) throw new Error('RegExp /u flag is not allowed')
return obj.source

} else {
throw new Error('Not a pattern: ' + obj)
}
}

/**
 * Records whether the RegExp patterns fed to `transition` use the /u flag,
 * and rejects a mix of /u and non-/u patterns.
 *
 * `state` is one of:
 *   'init'    - no RegExp pattern seen yet
 *   'hasFlag' - the RegExp patterns seen so far have /u
 *   'noFlag'  - the RegExp patterns seen so far lack /u
 */
function UnicodePatternStateMachine () {
  this.state = 'init' // init | hasFlag | noFlag
}

/**
 * @returns {boolean} true when the current state is 'hasFlag'
 */
UnicodePatternStateMachine.prototype.hasFlag = function () {
  return this.state === 'hasFlag'
}

/**
 * Feed one pattern into the machine.
 *
 * @param {*} pattern - values for which isRegExp() is false are ignored
 * @throws {Error} when `pattern` disagrees with the /u usage already recorded
 */
UnicodePatternStateMachine.prototype.transition = function (pattern) {
  if (!isRegExp(pattern)) { return }

  var next = pattern.unicode ? 'hasFlag' : 'noFlag'
  var conflicting = pattern.unicode ? 'noFlag' : 'hasFlag'
  if (this.state === conflicting) {
    // Build the Error only on the failing path so that accepted patterns do
    // not pay for an allocation and stack capture.
    throw new Error('RegExp /u flag must be used on all or no patterns')
  }
  this.state = next
}

function objectToRules(object) {
var keys = Object.getOwnPropertyNames(object)
var result = []
Expand Down Expand Up @@ -156,6 +173,7 @@
var fastAllowed = true
var groups = []
var parts = []
var unicodeState = new UnicodePatternStateMachine()

// If there is a fallback rule, then disable fast matching
for (var i = 0; i < rules.length; i++) {
Expand Down Expand Up @@ -210,15 +228,20 @@

groups.push(options)

match.forEach(function (pattern) {
unicodeState.transition(pattern)
})

// convert to RegExp
var pat = reUnion(match.map(regexpOrLiteral))

// validate
var regexp = new RegExp(pat)
var flags = unicodeState.hasFlag() ? 'u' : ''
var regexp = new RegExp(pat, flags)

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

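Every time we build a RegExp we need to pass the u flag, if applicable; otherwise the RegExp constructor throws "nothing to repeat" for patterns like /\u{0075}+/, because without the u flag \u{0075} is read as the letter u followed by the quantifier {0075}, which leaves the trailing + with nothing to repeat.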

if (regexp.test("")) {
throw new Error("RegExp matches empty string: " + regexp)
}
var groupCount = reGroups(pat)
var groupCount = reGroups(pat, flags)
if (groupCount > 0) {
throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: … ) instead")
}
Expand All @@ -240,6 +263,7 @@
// match (i.e. an empty pattern).
var fallbackRule = errorRule && errorRule.fallback
var flags = hasSticky && !fallbackRule ? 'ym' : 'gm'
if (unicodeState.hasFlag()) { flags += 'u' }
var suffix = hasSticky || fallbackRule ? '' : '|'
var combined = new RegExp(reUnion(parts) + suffix, flags)

Expand Down
42 changes: 40 additions & 2 deletions test/test.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@ const python = require('./python')

// Run the lexer to exhaustion and return everything it yields as an array.
function lexAll(lexer) {
  const tokens = Array.from(lexer)
  return tokens
}

let supportsUnicodePropertyEscapes = false

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe there's a better way to feature detect this?

// Probe with the RegExp constructor rather than a regex literal: an
// unsupported /\p{...}/u literal is rejected when the file is parsed, before
// this try/catch can run, which would abort the whole test file instead of
// leaving the flag false.
try {
  new RegExp('\\p{ID_Start}', 'u')
  supportsUnicodePropertyEscapes = true
} catch (e) {
  // The engine rejected the pattern: property escapes are unavailable, so the
  // flag keeps its initial value.
}

describe('compiler', () => {

Expand All @@ -29,13 +34,12 @@ describe('compiler', () => {
expect(lex4.next()).toMatchObject({type: 'err', text: 'nope!'})
})

test("warns for /g, /y, /i, /m, /u", () => {
test("warns for /g, /y, /i, /m", () => {
expect(() => compile({ word: /foo/ })).not.toThrow()
expect(() => compile({ word: /foo/g })).toThrow('implied')
expect(() => compile({ word: /foo/i })).toThrow('not allowed')
expect(() => compile({ word: /foo/y })).toThrow('implied')
expect(() => compile({ word: /foo/m })).toThrow('implied')
expect(() => compile({ word: /foo/u })).toThrow('not allowed')
})

// TODO warns if no lineBreaks: true
Expand Down Expand Up @@ -147,6 +151,40 @@ describe('compiler', () => {
expect(tokens.shift()).toMatchObject({type: 'number', value: '3.14'})
})

test('accepts unicode RegExps', () => {
  // Every RegExp rule carries /u, so compilation must succeed and the
  // \u{...} escape must be interpreted as a code point.
  const rules = {
    uSequence: /\u{0075}+/u,
    space: / +/u,
  }
  const lexer = compile(rules)
  lexer.reset('uuuuu uu uuu')

  const [first, second, third] = lexAll(lexer).filter(({type}) => type !== 'space')
  expect(first).toMatchObject({type: 'uSequence', value: 'uuuuu'})
  expect(second).toMatchObject({type: 'uSequence', value: 'uu'})
  expect(third).toMatchObject({type: 'uSequence', value: 'uuu'})
})

test('accepts unicode property escapes in RegExps, where supported', () => {
  if (!supportsUnicodePropertyEscapes) { return }
  // The identifier pattern is built with the RegExp constructor, not a
  // literal: a /\p{...}/u literal is a parse-time SyntaxError on engines
  // without property escapes, which would break loading of this file even
  // though the guard above skips the test body at runtime.
  const lexer = compile({
    identifier: new RegExp('[$_\\p{ID_Start}][$\\p{ID_Continue}]*', 'u'),
    space: / +/u,
    operator: ["+"],
  })
  lexer.reset('$foo π ভরা')
  var tokens = lexAll(lexer).filter(t => t.type !== 'space')
  expect(tokens.shift()).toMatchObject({type: 'identifier', value: '$foo'})
  expect(tokens.shift()).toMatchObject({type: 'identifier', value: 'π'})
  expect(tokens.shift()).toMatchObject({type: 'identifier', value: 'ভরা'})
})

test('rejects mixed unicode and non-unicode RegExps', () => {
  // One rule has /u and the other does not; compile() must refuse the set.
  const mixedRules = {
    uSequence: /\u{0075}+/u,
    space: / +/,
  }
  const compileMixed = () => { compile(mixedRules) }
  expect(compileMixed).toThrow('RegExp /u flag must be used on all or no patterns')
})
})

describe('compiles literals', () => {
Expand Down