Allow RegExp patterns to carry the /u flag, as long as either all or none of
the RegExp patterns compiled together use it (string patterns are ignored).
The flag is then applied to the per-rule validation RegExps and to the
combined RegExp.

diff --git a/moo.js b/moo.js
index cbeace4..e21c821 100644
--- a/moo.js
+++ b/moo.js
@@ -21,8 +21,8 @@
   function reEscape(s) {
     return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&')
   }
-  function reGroups(s) {
-    var re = new RegExp('|' + s)
+  function reGroups(s, flags) {
+    var re = new RegExp('|' + s, flags || '')
     return re.exec('').length - 1
   }
   function reCapture(s) {
@@ -41,12 +41,10 @@
       return '(?:' + reEscape(obj) + ')'
 
     } else if (isRegExp(obj)) {
-      // TODO: consider /u support
       if (obj.ignoreCase) throw new Error('RegExp /i flag not allowed')
       if (obj.global) throw new Error('RegExp /g flag is implied')
       if (obj.sticky) throw new Error('RegExp /y flag is implied')
       if (obj.multiline) throw new Error('RegExp /m flag is implied')
-      if (obj.unicode) throw new Error('RegExp /u flag is not allowed')
       return obj.source
 
     } else {
@@ -54,6 +52,26 @@
     }
   }
 
+  // Tracks whether the RegExp patterns seen so far use the /u flag, so that
+  // the combined RegExp can be built with one consistent flag. Non-RegExp
+  // patterns are ignored and never change the state.
+  function UnicodePatternStateMachine () {
+    this.state = 'init' // init | hasFlag | noFlag
+  }
+  UnicodePatternStateMachine.prototype.hasFlag = function () {
+    return this.state === 'hasFlag'
+  }
+  UnicodePatternStateMachine.prototype.transition = function (pattern) {
+    if (!isRegExp(pattern)) { return }
+
+    var next = pattern.unicode ? 'hasFlag' : 'noFlag'
+    if (this.state !== 'init' && this.state !== next) {
+      // Only build the Error (and its stack trace) when actually throwing.
+      throw new Error('RegExp /u flag must be used on all or no patterns')
+    }
+    this.state = next
+  }
+
   function objectToRules(object) {
     var keys = Object.getOwnPropertyNames(object)
     var result = []
@@ -156,6 +174,7 @@
     var fastAllowed = true
     var groups = []
     var parts = []
+    var unicodeState = new UnicodePatternStateMachine()
 
     // If there is a fallback rule, then disable fast matching
     for (var i = 0; i < rules.length; i++) {
@@ -210,15 +229,21 @@
 
       groups.push(options)
 
+      // RegExp patterns must agree on /u; transition() throws on a mix
+      match.forEach(function (pattern) {
+        unicodeState.transition(pattern)
+      })
+
       // convert to RegExp
       var pat = reUnion(match.map(regexpOrLiteral))
 
       // validate
-      var regexp = new RegExp(pat)
+      var ruleFlags = unicodeState.hasFlag() ? 'u' : ''
+      var regexp = new RegExp(pat, ruleFlags)
       if (regexp.test("")) {
         throw new Error("RegExp matches empty string: " + regexp)
       }
-      var groupCount = reGroups(pat)
+      var groupCount = reGroups(pat, ruleFlags)
       if (groupCount > 0) {
         throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: … ) instead")
       }
@@ -240,6 +265,7 @@
     // match (i.e. an empty pattern).
     var fallbackRule = errorRule && errorRule.fallback
     var flags = hasSticky && !fallbackRule ? 'ym' : 'gm'
+    if (unicodeState.hasFlag()) { flags += 'u' }
     var suffix = hasSticky || fallbackRule ? '' : '|'
     var combined = new RegExp(reUnion(parts) + suffix, flags)
 
diff --git a/test/test.js b/test/test.js
index 95c15fe..766cc53 100644
--- a/test/test.js
+++ b/test/test.js
@@ -8,6 +8,14 @@ const python = require('./python')
 
 function lexAll(lexer) {return Array.from(lexer)}
 
+// A RegExp literal using \p{…} is an early SyntaxError on engines without
+// property-escape support, which try/catch cannot intercept; build the
+// pattern at runtime instead so that detection actually works.
+let supportsUnicodePropertyEscapes = false
+try {
+  new RegExp('\\p{ID_Start}', 'u')
+  supportsUnicodePropertyEscapes = true
+} catch (e) {}
 
 describe('compiler', () => {
 
@@ -29,13 +37,12 @@ describe('compiler', () => {
     expect(lex4.next()).toMatchObject({type: 'err', text: 'nope!'})
   })
 
-  test("warns for /g, /y, /i, /m, /u", () => {
+  test("warns for /g, /y, /i, /m", () => {
     expect(() => compile({ word: /foo/ })).not.toThrow()
     expect(() => compile({ word: /foo/g })).toThrow('implied')
     expect(() => compile({ word: /foo/i })).toThrow('not allowed')
     expect(() => compile({ word: /foo/y })).toThrow('implied')
     expect(() => compile({ word: /foo/m })).toThrow('implied')
-    expect(() => compile({ word: /foo/u })).toThrow('not allowed')
   })
 
   // TODO warns if no lineBreaks: true
@@ -147,6 +154,42 @@ describe('compiler', () => {
     expect(tokens.shift()).toMatchObject({type: 'number', value: '3.14'})
   })
 
+  test('accepts unicode RegExps', () => {
+    const lexer = compile({
+      uSequence: /\u{0075}+/u,
+      space: / +/u,
+    })
+    lexer.reset('uuuuu uu uuu')
+    var tokens = lexAll(lexer).filter(t => t.type !== 'space')
+    expect(tokens.shift()).toMatchObject({type: 'uSequence', value: 'uuuuu'})
+    expect(tokens.shift()).toMatchObject({type: 'uSequence', value: 'uu'})
+    expect(tokens.shift()).toMatchObject({type: 'uSequence', value: 'uuu'})
+  })
+
+  test('accepts unicode property escapes in RegExps, where supported', () => {
+    if (!supportsUnicodePropertyEscapes) { return }
+    // Use the RegExp constructor rather than a literal: a \p{…} literal
+    // would stop this whole file from parsing where it is unsupported.
+    const lexer = compile({
+      identifier: new RegExp('[$_\\p{ID_Start}][$\\p{ID_Continue}]*', 'u'),
+      space: / +/u,
+      operator: ["+"],
+    })
+    lexer.reset('$foo π ভরা')
+    var tokens = lexAll(lexer).filter(t => t.type !== 'space')
+    expect(tokens.shift()).toMatchObject({type: 'identifier', value: '$foo'})
+    expect(tokens.shift()).toMatchObject({type: 'identifier', value: 'π'})
+    expect(tokens.shift()).toMatchObject({type: 'identifier', value: 'ভরা'})
+  })
+
+  test('rejects mixed unicode and non-unicode RegExps', () => {
+    expect(() => {
+      compile({
+        uSequence: /\u{0075}+/u,
+        space: / +/,
+      })
+    }).toThrow('RegExp /u flag must be used on all or no patterns')
+  })
 })
 
 describe('compiles literals', () => {