no-context · modernserf · Jan 10, 2019 · modernserf · Jan 12, 2019 · modernserf
diff --git a/moo.js b/moo.js
@@ -21,8 +21,8 @@
   function reEscape(s) {
     return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&')
   }
-  function reGroups(s) {
-    var re = new RegExp('|' + s)
+  function reGroups(s, flags) { // counts capture groups in `s`; `flags` is optional
+    var re = new RegExp('|' + s, flags || '') // no default-parameter syntax: it would not parse on ES5 engines
     return re.exec('').length - 1
   }
   function reCapture(s) {
@@ -41,19 +41,36 @@
       return '(?:' + reEscape(obj) + ')'
 
     } else if (isRegExp(obj)) {
-      // TODO: consider /u support
       if (obj.ignoreCase) throw new Error('RegExp /i flag not allowed')
       if (obj.global) throw new Error('RegExp /g flag is implied')
       if (obj.sticky) throw new Error('RegExp /y flag is implied')
       if (obj.multiline) throw new Error('RegExp /m flag is implied')
-      if (obj.unicode) throw new Error('RegExp /u flag is not allowed')
       return obj.source
 
     } else {
       throw new Error('Not a pattern: ' + obj)
     }
   }
 
+  // Tracks whether every RegExp rule seen so far agrees on the /u flag.
+  function UnicodePatternStateMachine () {
+    this.state = 'init' // init | hasFlag | noFlag
+  }
+  // True once a /u pattern has been recorded.
+  UnicodePatternStateMachine.prototype.hasFlag = function () {
+    return this.state === 'hasFlag'
+  }
+  // Records one pattern; non-RegExp patterns are skipped. Throws when the
+  // pattern's /u flag disagrees with the RegExps recorded before it.
+  UnicodePatternStateMachine.prototype.transition = function (pattern) {
+    if (!isRegExp(pattern)) { return }
+    var next = pattern.unicode ? 'hasFlag' : 'noFlag' // `var` matches the rest of this file
+    if (this.state !== 'init' && this.state !== next) {
+      throw new Error('RegExp /u flag must be used on all or no patterns')
+    }
+    this.state = next
+  }
+
   function objectToRules(object) {
     var keys = Object.getOwnPropertyNames(object)
     var result = []
@@ -156,6 +173,7 @@
     var fastAllowed = true
     var groups = []
     var parts = []
+    var unicodeState = new UnicodePatternStateMachine()
 
     // If there is a fallback rule, then disable fast matching
     for (var i = 0; i < rules.length; i++) {
@@ -210,15 +228,20 @@
 
       groups.push(options)
 
+      match.forEach(function (pattern) {
+        unicodeState.transition(pattern)
+      })
+
       // convert to RegExp
       var pat = reUnion(match.map(regexpOrLiteral))
 
       // validate
-      var regexp = new RegExp(pat)
+      var flags = unicodeState.hasFlag() ? 'u' : ''
+      var regexp = new RegExp(pat, flags)
       if (regexp.test("")) {
         throw new Error("RegExp matches empty string: " + regexp)
       }
-      var groupCount = reGroups(pat)
+      var groupCount = reGroups(pat, flags)
       if (groupCount > 0) {
         throw new Error("RegExp has capture groups: " + regexp + "\nUse (?: … ) instead")
       }
@@ -240,6 +263,7 @@
     // match (i.e. an empty pattern).
     var fallbackRule = errorRule && errorRule.fallback
     var flags = hasSticky && !fallbackRule ? 'ym' : 'gm'
+    if (unicodeState.hasFlag()) { flags += 'u' }
     var suffix = hasSticky || fallbackRule ? '' : '|'
     var combined = new RegExp(reUnion(parts) + suffix, flags)
 

diff --git a/test/test.js b/test/test.js
@@ -8,6 +8,11 @@ const python = require('./python')
 
 function lexAll(lexer) {return Array.from(lexer)}
 
+let supportsUnicodePropertyEscapes = false
+try {
+  new RegExp('\\p{ID_Start}', 'u') // built at runtime: an unsupported regex *literal* is a parse-time SyntaxError that try/catch cannot intercept
+  supportsUnicodePropertyEscapes = true
+} catch (e) {} // engine lacks \p{…}: leave the flag false
 
 describe('compiler', () => {
 
@@ -29,13 +34,12 @@ describe('compiler', () => {
     expect(lex4.next()).toMatchObject({type: 'err', text: 'nope!'})
   })
 
-  test("warns for /g, /y, /i, /m, /u", () => {
+  test("warns for /g, /y, /i, /m", () => {
     expect(() => compile({ word: /foo/ })).not.toThrow()
     expect(() => compile({ word: /foo/g })).toThrow('implied')
     expect(() => compile({ word: /foo/i })).toThrow('not allowed')
     expect(() => compile({ word: /foo/y })).toThrow('implied')
     expect(() => compile({ word: /foo/m })).toThrow('implied')
-    expect(() => compile({ word: /foo/u })).toThrow('not allowed')
   })
 
   // TODO warns if no lineBreaks: true
@@ -147,6 +151,40 @@ describe('compiler', () => {
     expect(tokens.shift()).toMatchObject({type: 'number', value: '3.14'})
   })
 
+  // Rules that all carry /u compile into one lexer and tokenize as usual.
+  test('accepts unicode RegExps', () => {
+    const rules = {uSequence: /\u{0075}+/u, space: / +/u}
+    const lexer = compile(rules)
+    lexer.reset('uuuuu uu uuu')
+    const words = lexAll(lexer).filter(({type}) => type !== 'space')
+    const expected = ['uuuuu', 'uu', 'uuu']
+    for (const value of expected) {
+      expect(words.shift()).toMatchObject({type: 'uSequence', value})
+    }
+  })
+
+  test('accepts unicode property escapes in RegExps, where supported', () => {
+    if (!supportsUnicodePropertyEscapes) { return }
+    // Built at runtime: as a literal, \p{…} is a parse-time SyntaxError on
+    // engines without property escapes, which the guard above cannot prevent.
+    const identifier = new RegExp('[$_\\p{ID_Start}][$\\p{ID_Continue}]*', 'u')
+    const lexer = compile({identifier, space: / +/u, operator: ["+"]})
+    lexer.reset('$foo π ভরা')
+
+    var tokens = lexAll(lexer).filter(t => t.type !== 'space')
+    expect(tokens.shift()).toMatchObject({type: 'identifier', value: '$foo'})
+    expect(tokens.shift()).toMatchObject({type: 'identifier', value: 'π'})
+    expect(tokens.shift()).toMatchObject({type: 'identifier', value: 'ভরা'})
+  })
+
+  test('rejects mixed unicode and non-unicode RegExps', () => {
+    // one /u rule next to one plain rule: compile() itself must refuse
+    const mixedRules = {
+      uSequence: /\u{0075}+/u,
+      space: / +/,
+    }
+    expect(() => compile(mixedRules)).toThrow('RegExp /u flag must be used on all or no patterns')
+  })
 })
 
 describe('compiles literals', () => {