diff --git a/source/BNFC.cabal b/source/BNFC.cabal index 346e3b00f..2ef47ef87 100644 --- a/source/BNFC.cabal +++ b/source/BNFC.cabal @@ -29,6 +29,9 @@ Description: a pretty-printer as a Haskell, Agda, C, C++, Java, or Ocaml module, an XML representation, a LaTeX file containing a readable specification of the language. +data-files: + test/BNFC/Backend/TreeSitter/*.cf + test/BNFC/Backend/TreeSitter/*.expected.js -- Support range when build with cabal tested-with: @@ -272,6 +275,7 @@ library -- Tree-sitter backend BNFC.Backend.TreeSitter BNFC.Backend.TreeSitter.CFtoTreeSitter + BNFC.Backend.TreeSitter.MatchesEmpty BNFC.Backend.TreeSitter.RegToJSReg ----- Testing -------------------------------------------------------------- diff --git a/source/src/BNFC/Backend/TreeSitter.hs b/source/src/BNFC/Backend/TreeSitter.hs index 885124c38..b208c28ab 100644 --- a/source/src/BNFC/Backend/TreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter.hs @@ -13,8 +13,14 @@ module BNFC.Backend.TreeSitter where +import Prelude hiding ((<>)) +import System.FilePath + import BNFC.Backend.Base import BNFC.Backend.TreeSitter.CFtoTreeSitter (cfToTreeSitter) +import BNFC.Backend.Common.Makefile(mkMakefile, mkRule, mkVar) +import BNFC.GetCF(fixTokenCats) +import BNFC.Utils(cstring, kebabCase_, snakeCase_, camelCase_) import BNFC.CF import BNFC.Options hiding (Backend) import BNFC.PrettyPrint @@ -22,11 +28,71 @@ import BNFC.PrettyPrint -- | Entry point: create grammar.js file makeTreeSitter :: SharedOptions -> CF -> Backend makeTreeSitter opts cf = do - mkfile "grammar.js" comment (render $ cfToTreeSitter name cf) + + mkfile (dir "grammar.js") comment $ + render (cfToTreeSitter name wordCat cf) + + mkfile (dir "tree-sitter.json") (const "") $ + render (treeSitterJson name) + + mkMakefile (fmap (dir ) (optMake opts)) $ + const treeSitterMakefile + where - name = lang opts + name = snakeCase_ (lang opts) + dir = "tree-sitter-" ++ kebabCase_ (lang opts) + wordCat = fixTokenCats (tokenNames cf) (strToCat (treeSitterWord opts)) comment :: String -> String comment = ("// " ++) --- | TODO: Add Makefile generation for tree-sitter +treeSitterMakefile :: Doc +treeSitterMakefile = vcat' + [ mkVar "TREE_SITTER" "tree-sitter" + , ".PHONY: parse clean" + , mkRule "src/parser.c" ["grammar.js"] ["$(TREE_SITTER) generate"] + , mkRule "parse" ["src/parser.c"] ["$(TREE_SITTER) parse --cst"] + , mkRule "clean" [] ["rm -rfv src"] + ] + +treeSitterJson :: String -> Doc +treeSitterJson name = + jsonObject + [ p "$schema" $ str "https://tree-sitter.github.io/tree-sitter/assets/schemas/config.schema.json" + , p "grammars" $ jsonArray1 $ jsonObject + [ p "name" $ str $ snakeCase_ name + , p "camelcase" $ str $ camelCase_ name + , p "scope" $ str $ "source." ++ snakeCase_ name + ] + , p "metadata" $ jsonObject + [ p "version" $ str "0.1.0" + , p "authors" $ jsonArray1 $ jsonObject [p "name" $ str "BNFC"] + ] + , p "bindings" $ jsonObject + [ p "c" "false" + , p "go" "false" + , p "java" "false" + , p "node" "false" + , p "python" "false" + , p "rust" "false" + , p "swift" "false" + , p "zig" "false" + ] + ] + where + str = cstring + p = (,) + +jsonLines :: [Doc] -> Doc +jsonLines = nest 2 . sep . punctuate "," + +jsonObject :: [(String, Doc)] -> Doc +jsonObject pairs = "{" $$ jsonLines (map jsonPair pairs) <+> "}" + where + jsonPair (k, v) = cstring k <> ":" <+> v + +jsonArray :: [Doc] -> Doc +jsonArray elems = "[" $$ jsonLines elems <+> "]" + +jsonArray1 :: Doc -> Doc +jsonArray1 = jsonArray . pure diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index eae9b17bb..5d95d898b 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -13,232 +13,224 @@ module BNFC.Backend.TreeSitter.CFtoTreeSitter where -import BNFC.Abs (Reg (RSeq, RSeqs, RStar, RAny)) +import BNFC.Abs (Reg) import BNFC.Backend.TreeSitter.RegToJSReg +import BNFC.Backend.TreeSitter.MatchesEmpty(fixPointKnownEmpty, transformEmptyMatches, KnownEmpty, OptSym(..), OptSentForm, isKnownEmpty) import BNFC.CF -import BNFC.Lexing (mkRegMultilineComment) +import BNFC.Utils(when, applyWhen, cstring, mkNames, NameStyle(..)) +import BNFC.Lexing (mkLexer, LexType(..), mkRegMultilineComment, mkRegSingleLineComment) import BNFC.PrettyPrint + import Prelude hiding ((<>)) --- | Indent one level of 2 spaces -indent :: Doc -> Doc -indent = nest 2 +import qualified Data.List as List +import qualified Data.Maybe as Maybe +import qualified Data.List.NonEmpty as List1 + +-- * Main entry point -- | Create content of grammar.js file -cfToTreeSitter :: String -> CF -> Doc -cfToTreeSitter name cf = +cfToTreeSitter :: String -> Cat -> CF -> Doc +cfToTreeSitter name wordCat cf = -- Overall structure of grammar.js text "module.exports = grammar({" $+$ indent - ( text "name: '" <> text name <> text "'," + ( text "name:" <+> cstring name <> "," $+$ extrasSection $+$ wordSection $+$ rulesSection ) $+$ text "});" where - extrasSection = prExtras cf - wordSection = prWord cf + lexTokens = Maybe.mapMaybe keepToken (mkLexer cf) + keepToken (r, LexToken nm) | not (isUnusedBuiltin nm) = Just (r, nm) + keepToken _ = Nothing + isUnusedBuiltin x = x `elem` specialCatsP && not (isUsedCat cf (TokenCat x)) + + -- generate rules for comment tokens so they can be used in highlighting + (mlComments, slComments) = comments cf + commentTokens = + disambig "CommentSingle" (map mkRegSingleLineComment slComments) + ++ disambig "CommentMulti" (map (uncurry mkRegMultilineComment) mlComments) + disambig base regs = zip regs names + where names = mkNames (map snd lexTokens) CamelCase (base <$ regs) + + extrasSection = prExtras (map snd commentTokens) + wordSection = prWord wordCat cf rulesSection = text "rules: {" $+$ indent ( prRules cf - $+$ prUsrTokenRules cf - $+$ prBuiltinTokenRules cf - ) + $+$ prTokenRules (lexTokens ++ commentTokens) ) $+$ text "}," +-- * Functions to build parts of grammar.js + -- | Print rules for comments -prExtras :: CF -> Doc -prExtras cf = - if extraNeeded - then - defineSymbol "extras" <> "[" - $+$ indent - ( -- default rule for white spaces - text "/\\s/," - $+$ mRules - $+$ sRules - ) - $+$ text "]," - else empty - where - extraNeeded = length commentMRules + length commentSRules > 0 - (commentMRules, commentSRules) = comments cf - mRules = vcat' $ map mkOneMRule commentMRules - sRules = vcat' $ map mkOneSRule commentSRules - mkOneSRule s = text (printRegJSReg $ RSeq (RSeqs s) (RStar RAny)) <> text "," - mkOneMRule (s, e) = text (printRegJSReg $ mkRegMultilineComment s e) <> text "," +prExtras :: [TokenCat] -> Doc +prExtras commentTokens = + defineSymbol "extras" <> "[" + $+$ indent + ( -- default rule for white spaces + text "/\\s/," + $+$ vcat' (map (appendComma . text . refName . formatTokenName) commentTokens) + ) + $+$ text "]," -- | Print word section, this section is needed for tree-sitter -- to do keyword extraction before any parsing/lexing, see -- https://tree-sitter.github.io/tree-sitter/creating-parsers#keyword-extraction --- TODO: currently, we just add every user defined token as well --- as the predefined Ident token to this list to be safe. Ideally, --- we should enumerate all defined tokens against all occurrences of --- keywords. Any tokens patterns that could accept a keyword will go --- into this list. This will require integration of a regex engine. -prWord :: CF -> Doc -prWord cf = - if wordNeeded - then - defineSymbol "word" - $+$ indent - ( wrapChoice - ( usrTokensFormatted - ++ [text "$.token_Ident" | identUsed] - ) - ) - <> "," - else empty - where - wordNeeded = identUsed || usrTokens /= [] - identUsed = isUsedCat cf (TokenCat catIdent) - usrTokens = tokenPragmas cf - usrTokensFormatted = - map (text . refName . formatCatName False . TokenCat . fst) $ usrTokens - --- | Print builtin token rules according to their usage -prBuiltinTokenRules :: CF -> Doc -prBuiltinTokenRules cf = - ifC catInteger integerRule - $+$ ifC catDouble doubleRule - $+$ ifC catChar charRule - $+$ ifC catString stringRule - $+$ ifC catIdent identRule - where - ifC cat d = if isUsedCat cf (TokenCat cat) then d else empty - --- | Predefined builtin token rules -integerRule, doubleRule, charRule, stringRule, identRule :: Doc -integerRule = defineSymbol "token_Integer" <+> text "/\\d+/" <> "," -doubleRule = defineSymbol "token_Double" <+> text "/\\d+\\.\\d+(e-?\\d+)?/" <> "," -charRule = - defineSymbol "token_Char" <+> text "/'([^'\\\\]|(\\\\[\"'\\\\tnrf]))'/" <> "," -stringRule = - defineSymbol "token_String" <+> text "/\"([^'\\\\]|(\\\\[\"'\\\\tnrf]))*\"/" <> "," -identRule = - defineSymbol "token_Ident" <+> text "/[a-zA-Z][a-zA-Z\\d_']*/" <> "," - --- | First print the entrypoint rule, tree-sitter always use the --- first rule as entrypoint and does not support multi-entrypoint. --- Then print rest of the rules +-- +-- This should be defined as a rule which matches a /superset/ of keywords +-- in the language. Usually, this would be some general identifier token. So, +-- this defaults to the built-in Ident token and can be specified by the user +-- with a command-line flag. +prWord :: Cat -> CF -> Doc +prWord wordCat cf = + when (isUsedCat cf wordCat) $ + defineSymbol "word" + <+> formatSent [NonOptional (Left wordCat)] <> "," + +-- | Prints the rules in the grammar with the entry point first. +-- +-- Since Treesitter requires a unique entry point, this will build a "virtual" +-- entry point which dispatches to each of the declared BNFC entry points via +-- a choice list. Additionally, the virtual entry point can match the empty string +-- (and is the only rule which can). prRules :: CF -> Doc prRules cf = - if onlyOneEntry - then - prOneCat entryRules entryCat - $+$ prOtherRules entryCat cf - else error "Tree-sitter only supports one entrypoint" + prOneCat knownEmpty True virtEntryCat virtEntryRhsRules + $+$ vcat' (map (uncurry (prOneCat knownEmpty False)) groups) where - --If entrypoint is defined, there must be only one entrypoint - --If it is not defined, defaults to use the first rule as entrypoint - onlyOneEntry = not (hasEntryPoint cf) || onlyOneEntryDefined - onlyOneEntryDefined = length (allEntryPoints cf) == 1 - entryCat = firstEntry cf - entryRules = rulesForCat' cf entryCat - --- | Print all other rules except the entrypoint -prOtherRules :: Cat -> CF -> Doc -prOtherRules entryCat cf = vcat' $ map mkOne rules + groups = ruleGroups cf + + virtEntryCat = Cat "BNFCStart" + virtEntryRhsCats = + (if hasEntryPoint cf then List1.toList else List1.take 1) + (allEntryPoints cf) + virtEntryRhsRules = toVirtRule <$> virtEntryRhsCats + + toVirtRule rhsCat = + npRule + (identCat virtEntryCat ++ "_" ++ identCat rhsCat) + virtEntryCat + [Left rhsCat] + Parsable + + knownEmpty = fixPointKnownEmpty ((virtEntryCat, virtEntryRhsRules) : groups) + +prTokenRules :: [(Reg, TokenCat)] -> Doc +prTokenRules = vcat' . map prOneToken + +-- | Generate one tree-sitter rule for one terminal token. +prOneToken :: (Reg, TokenCat) -> Doc +prOneToken (reg, name) = + defineSymbol (formatTokenName name) + $+$ indent (text $ printRegJSReg reg) <> "," + +-- | Generates one tree-sitter rule for one non-terminal from CF. +prOneCat :: KnownEmpty -> Bool -> NonTerminal -> [Rule] -> Doc +prOneCat knownEmpty allowEmpty nt rules = + defineSymbol (formatCatName False nt) + $+$ indent (appendComma (wrapRhs parRhs)) where - rules = [(c, r) | (c, r) <- ruleGroupsInternals cf, c /= entryCat] - mkOne (cat, rules) = prOneCat rules cat + wrapRhs = applyWhen (allowEmpty && Left nt `isKnownEmpty` knownEmpty) $ + wrapOptional' + + (parsableRules, _) = List.partition isParsable rules + + parRhs = wrapChoice (genRules parsableRules) + + genRules = map genRule + genRule rule = + ("//" <+> text (renderOneLine (pretty rule)) <+> ";") + $+$ (formatRhs . transformEmptyMatches knownEmpty) (rhsRule rule) + + renderOneLine = renderStyle (style { mode = OneLineMode }) + +-- * Builds right-hand side of rules -prUsrTokenRules :: CF -> Doc -prUsrTokenRules cf = vcat' $ map prOneToken tokens +-- | Format right hand side into list of strings +formatRhs :: [OptSentForm] -> Doc +formatRhs = wrapChoice . map formatSent + +formatSent :: OptSentForm -> Doc +formatSent = wrapSeq . map fmtOpt where - tokens = tokenPragmas cf - --- | Check if a set of rules contains internal rules -hasInternal :: [Rule] -> Bool -hasInternal = not . all isParsable - --- | Generates one or two tree-sitter rule(s) for one non-terminal from CF. --- Uses choice function from tree-sitter to combine rules for the non-terminal --- If the non-terminal has internal rules, an internal version of the non-terminal --- will be created (prefixed with "_" in tree-sitter), and all internal rules will --- be sectioned as such. -prOneCat :: [Rule] -> NonTerminal -> Doc -prOneCat rules nt = - defineSymbol (formatCatName False nt) - $+$ indent (appendComma parRhs) - $+$ internalRules + fmtOpt (Optional x) = wrapOptional (fmt x) + fmtOpt (NonOptional x) = fmt x + + fmt (Left c) = text $ refName $ formatCatName False c + fmt (Right term) = cstring term + +formatTokenName :: TokenCat -> String +formatTokenName = formatCatName False . TokenCat + +-- | Format string for cat name, prefix "_" if the name is for internal rules +formatCatName :: Bool -> Cat -> String +formatCatName internal c = + if internal + then "_" ++ formatted + else formatted where - int = hasInternal rules - internalRules = - if int - then defineSymbol (formatCatName True nt) $+$ indent (appendComma intRhs) - else empty - parRhs = wrapChoice $ transChoice ++ genChoice (filter isParsable rules) - transChoice = [text $ refName $ formatCatName True nt | int] - intRhs = wrapChoice $ genChoice (filter (not . isParsable) rules) - genChoice = map (wrapSeq . formatRhs . rhsRule) - --- | Generate one tree-sitter rule for one defined token -prOneToken :: (TokenCat, Reg) -> Doc -prOneToken (cat, exp) = - defineSymbol (formatCatName False $ TokenCat cat) - $+$ indent (text $ printRegJSReg exp) <> "," + formatted = formatName c + formatName (Cat name) = name + formatName (TokenCat name) = "token_" ++ name + formatName (ListCat c) = "list_" ++ formatName c + formatName (CoercCat name i) = name ++ show i + + +-- * Treesitter-related formatting helpers -- | Start a defined symbol block in tree-sitter grammar defineSymbol :: String -> Doc defineSymbol name = hsep [text name <> ":", text "$", text "=>"] -appendComma :: Doc -> Doc -appendComma = (<> text ",") - -commaJoin :: Bool -> [Doc] -> Doc -commaJoin newline = - foldl comma empty - where - comma a b - | isEmpty a = b - | isEmpty b = a - | otherwise = (if newline then ($+$) else (<>)) (a <> ",") b - wrapSeq :: [Doc] -> Doc wrapSeq = wrapOptListFun "seq" False wrapChoice :: [Doc] -> Doc wrapChoice = wrapOptListFun "choice" True +wrapOptional :: Doc -> Doc +wrapOptional = wrapFun "optional" False + +wrapOptional' :: Doc -> Doc +wrapOptional' = wrapFun "optional" True + -- | Wrap list using tree-sitter fun if the list contains multiple items -- Returns the only item without wrapping otherwise wrapOptListFun :: String -> Bool -> [Doc] -> Doc -wrapOptListFun fun newline list = - if length list == 1 - then head list - else wrapFun fun newline (commaJoin newline list) +wrapOptListFun _ _ [x] = x +wrapOptListFun fun _ [ ] = wrapFun fun False empty +wrapOptListFun fun newline list = wrapFun fun newline (commaJoin newline list) wrapFun :: String -> Bool -> Doc -> Doc -wrapFun fun newline arg = joinOp [text fun <> text "(", indent arg, text ")"] +wrapFun fun newline arg = joinOp [text fun <> text "(", indentOp arg, text ")"] where joinOp = if newline then vcat' else hcat + indentOp = if newline then indent else id -- | Helper for referring to non-terminal names in tree-sitter refName :: String -> String refName = ("$." ++) --- | Format right hand side into list of strings -formatRhs :: SentForm -> [Doc] -formatRhs = - map (\case - Left c -> text $ refName $ formatCatName False c - Right term -> quoted term) +-- * Generic formatting helpers -quoted :: String -> Doc -quoted s = text "\"" <> text s <> text "\"" +-- | Indent one level of 2 spaces +indent :: Doc -> Doc +indent = nest 2 --- | Format string for cat name, prefix "_" if the name is for internal rules -formatCatName :: Bool -> Cat -> String -formatCatName internal c = - if internal - then "_" ++ formatted - else formatted +appendComma :: Doc -> Doc +appendComma = (<> text ",") + +commaJoin :: Bool -> [Doc] -> Doc +commaJoin newline = + foldl comma empty where - formatted = formatName c - formatName (Cat name) = name - formatName (TokenCat name) = "token_" ++ name - formatName (ListCat c) = "list_" ++ formatName c - formatName (CoercCat name i) = name ++ show i \ No newline at end of file + commaString = if newline then "," else ", " + comma a b + | isEmpty a = b + | isEmpty b = a + | otherwise = (if newline then ($+$) else (<>)) (a <> commaString) b + diff --git a/source/src/BNFC/Backend/TreeSitter/MatchesEmpty.hs b/source/src/BNFC/Backend/TreeSitter/MatchesEmpty.hs new file mode 100644 index 000000000..f4e1d946a --- /dev/null +++ b/source/src/BNFC/Backend/TreeSitter/MatchesEmpty.hs @@ -0,0 +1,279 @@ +{- + BNF Converter: TreeSitter Grammar Generator + Copyright (C) 2004 Author: Markus Forsberg, Michael Pellauer, + Bjorn Bringert + + Author : Kangjing Huang (huangkangjing@gmail.com) + Created : 23 Nov, 2023 + +-} + +{-# LANGUAGE CPP #-} +{-# LANGUAGE LambdaCase #-} + +{-| +Description: Identifies and transforms rules which match the empty string, + as required by Treesitter. +Maintainer: Kait Lam + +This module identifies and transforms rules which match the empty string, +as required by constraints of Treesitter. + +Treesitter requires that rules do /not/ match the empty string. +Although this is not made explicit in [their documentation](https://tree-sitter.github.io/tree-sitter/creating-parsers/3-writing-the-grammar.html), +rules which match empty will be thoroughly rejected by the tree-sitter +compiler. + +For example, this Treesitter grammar is not allowed because @$.listItem@ could match +the empty string. + +> list: $ => seq("[", $.listItem, "]"), +> listItem: $ => choice( +> seq(), +> "item" +> ), + +Instead, Treesitter wants empty matches to be moved to /use-sites/ of that +rule. The above grammar would be rewritten as: + +> list: $ => seq("[", optional($.listItem), "]"), +> listItem: $ => choice( +> choice(), +> "item" +> ), + +Unfortunately, the style Treesitter needs is quite incompatible with LBNF. LBNF +has no way to express "choice" occuring within the right hand side of a rule, +which forces any choice (including potential optionality) to happen at the +top-level of a rule. This is in direct conflict with what Treesitter expects. + +This modules bridges the gap by transforming LBNF's rules using process +outlined above. This happens in two steps: first, we compute which rules could +match empty by using a fixpoint algorithm, then, we transform the rules by +eliminating empty matches from all rules and wrapping non-terminals in +@optional@ if their rule could match empty. BNFC's "BNFC.CF" types have no +notion of "optional" within the RHS, so this module also introduces 'OptSym' to +represent this. + +Of course, this transformation affects the parse tree for certain strings. +Users of BNFC who want to generate Treesitter grammars should be aware of this +change. + +For users of this library, the main functions of interest are in the [Fixpoint +and transformations]("BNFC.Backend.TreeSitter.MatchesEmpty#g:fixpoint") +section. +-} +module BNFC.Backend.TreeSitter.MatchesEmpty where + +import BNFC.Utils((>.>)) +import BNFC.CF(SentForm, Cat, Rule, rhsRule) + +import qualified Data.Maybe as Maybe +import qualified Data.List as List +import qualified Data.Set as Set +#if !MIN_VERSION_base(4,11,0) +import Data.Semigroup (Semigroup(..)) +#endif + +-- * Basic types + +-- | A symbol which is either a non-terminal ('Cat') or terminal token name ('String'). +-- A list of these 'Sym's is a sentential form, 'SentForm'. +type Sym = Either Cat String + +-- | Set of 'Sym' which are known to match the empty string. +newtype KnownEmpty = KnownEmpty (Set.Set Sym) deriving (Eq, Show) + +-- | Returns whether the given symbol matches the empty string, according +-- to the given known empty set. +isKnownEmpty :: Sym -> KnownEmpty -> Bool +isKnownEmpty x ks = x `Set.member` (knownEmptySet ks) + +knownEmptySet :: KnownEmpty -> Set.Set Sym +knownEmptySet (KnownEmpty x) = x + +-- | Represents a 'Sym' which might be wrapped in a @optional(...)@ function +-- in the produced Treesitter grammar. +data OptSym = + -- | A 'Sym' which is wrapped in @optional([SYM])@, indicating that + -- it should match @[SYM]@ /or/ the empty string. + Optional Sym | + -- | A plain 'Sym' which matches only the 'Sym' itself. + NonOptional Sym deriving (Eq, Show) + +-- | A sentential form where each symbol may be wrapped in an optional function. +-- Analagous to 'SentForm', but containing 'OptSym' instead of 'Sym'. +type OptSentForm = [OptSym] + +-- * "Matches empty" type + +-- | Represents whether the wrapped value matches the empty string, or whether +-- it is known to be non-empty. +-- +-- Because this analysis is done on context-free grammars, the analysis is +-- precise. A value of 'MatchesEmpty' /will/ accept the empty string, and a +-- value of 'NonEmpty' will not. There is no uncertainty in this analysis. +data MatchesEmpty a = + -- | The contained value /accepts/ the empty string. + MatchesEmpty a | + -- | The contained value /does not/ accept the empty string. + NonEmpty a deriving (Eq, Show) + +matchesEmpty :: MatchesEmpty a -> Bool +matchesEmpty (MatchesEmpty _) = True +matchesEmpty (NonEmpty _) = False + +unMatchesEmpty :: MatchesEmpty a -> a +unMatchesEmpty (MatchesEmpty x) = x +unMatchesEmpty (NonEmpty x) = x + +-- ** Sequential operators + +-- | Combines the two values /in sequence/. Returns v'MatchesEmpty' if both values +-- are v'MatchesEmpty', otherwise returns v'NonEmpty'. In all cases, the inner +-- values are joined using the semigroup operation. +seqMatchesEmpty :: Semigroup a => MatchesEmpty a -> MatchesEmpty a -> MatchesEmpty a +seqMatchesEmpty (MatchesEmpty x) (MatchesEmpty y) = MatchesEmpty (x <> y) +seqMatchesEmpty x y = NonEmpty (unMatchesEmpty x <> unMatchesEmpty y) + +-- | Combines the list of values /in sequence/ (i.e., @seq(x1, ..., xn)@), returning +-- v'MatchesEmpty' if all are v'MatchesEmpty', otherwise v'NonEmpty'. Inner values +-- are joined using the semigroup operation. +seqListMatchesEmpty + :: Monoid a +#if !MIN_VERSION_base(4,11,0) + => Semigroup a +#endif + => [MatchesEmpty a] + -> MatchesEmpty a +seqListMatchesEmpty = foldr seqMatchesEmpty (MatchesEmpty mempty) + + +-- ** Alternation operators + +-- | Combines the two values as a /parallel choice/. Returns v'NonEmpty' if +-- both values are v'NonEmpty', otherwise returns v'MatchesEmpty'. In all +-- cases, the inner values are joined using the semigroup operation. +choiceMatchesEmpty :: Semigroup a => MatchesEmpty a -> MatchesEmpty a -> MatchesEmpty a +choiceMatchesEmpty (NonEmpty x) (NonEmpty y) = NonEmpty (x <> y) +choiceMatchesEmpty x y = MatchesEmpty (unMatchesEmpty x <> unMatchesEmpty y) + +-- | Combines the list of values /in choice/ (i.e., @choice(x1, ..., xn)@), returning +-- v'NonEmpty' if all are v'NonEmpty', otherwise v'MatchesEmpty'. Inner values +-- are joined using the semigroup operation. +choiceListMatchesEmpty + :: Monoid a +#if !MIN_VERSION_base(4,11,0) + => Semigroup a +#endif + => [MatchesEmpty a] + -> MatchesEmpty a +choiceListMatchesEmpty = foldr choiceMatchesEmpty (NonEmpty mempty) + +-- * Analysis of non-terminals + +-- | Determines whether the given symbol can match empty, according to the +-- given known empty set. If it /can/ match empty, the symbol is returned as +-- v'Optional' to indicate that uses of the symbol should match empty. +-- +-- TODO: This does not yet handle /tokens/ (terminals) which might be empty. +-- At the moment, all terminals are assumed to be non-empty. +possiblyEmptySym :: KnownEmpty -> Sym -> OptSym +possiblyEmptySym knownEmpty sym = + if sym `isKnownEmpty` knownEmpty then + Optional sym + else + NonOptional sym + +-- | Determines whether the given sentential form could match empty. +-- +-- The returned list is a /choice/ list of 'OptSentForm', with v'Optional' +-- applied to symbols which are within the known empty set. When combined using +-- choice, the returned list is equivalent to the original rule, /except/ that +-- the returned list has empty matches removed. If the rule previously matched +-- empty, this is encoded as the v'MatchesEmpty' variant. +-- +-- __Implementation Detail:__ A sentential form is a sequence of symbols, and a +-- sequence will match empty if and only if all of its parts can match empty. +-- Therefore, to eliminate the empty match from a sequence, we have to make at +-- least one of the terms non-empty. For a nullable sequence like @A? B? C?@, +-- this is done by transforming it to @A B? C? | B C? | C@. This should accept +-- an equivalent language modulo empty string, and it should preserve +-- unambiguity. +possiblyEmptyRule :: KnownEmpty -> SentForm -> MatchesEmpty [OptSentForm] +possiblyEmptyRule knownEmpty = + map (possiblyEmptySym knownEmpty) + >.> map fromOpt + >.> seqListMatchesEmpty + >.> \case + MatchesEmpty sent -> MatchesEmpty (subtractEmptyString sent) + NonEmpty sent -> NonEmpty [sent] + where + fromOpt (Optional x) = MatchesEmpty [Optional x] + fromOpt (NonOptional x) = NonEmpty [NonOptional x] + + subtractEmptyString = Maybe.mapMaybe headNonOptional . List.tails + + headNonOptional (Optional x : xs) = Just (NonOptional x : xs) + headNonOptional (NonOptional _ : _) = error "headNonOptional: unexpected that head is already NonOptional" + headNonOptional [] = Nothing + +-- | Determines whether the given non-terminal category with the given +-- production rules could match empty. +-- +-- The returned list is a /choice/ list of 'OptSentForm', with v'Optional' +-- applied to symbols which are within the known empty set. When combined using +-- choice, the returned list is equivalent to the original rules, /except/ that +-- the returned list has empty matches removed. If the category previously +-- matched empty, this is encoded as the v'MatchesEmpty' variant. +possiblyEmptyCat :: KnownEmpty -> (Cat, [Rule]) -> MatchesEmpty [OptSentForm] +possiblyEmptyCat knownEmpty (_, rules) = + choiceListMatchesEmpty $ map (possiblyEmptyRule knownEmpty . rhsRule) rules + +-- | Updates the set of known empty symbols according to the given grammar. +-- Returns the new set, which is made up of the previous set unioned with any +-- newly-discovered empty matching symbols. +-- +-- This is one step of the fixpoint computation in 'fixPointKnownEmpty'. +possiblyEmptyCats :: [(Cat, [Rule])] -> KnownEmpty -> KnownEmpty +possiblyEmptyCats cats knownEmpty = + KnownEmpty $ + Set.fromList (map (Left . fst) newEmptyCats) + `Set.union` knownEmptySet knownEmpty + where + newEmptyCats = filter (matchesEmpty . possiblyEmptyCat knownEmpty) cats + +-- * Fixpoint and transformations #fixpoint# +-- +-- $fixpoint +-- For users of this module, these are the main functions of interest. + +-- | Computes the complete set of symbols which are known to match empty, +-- using the given non-terminal production rules. +-- +-- This should be given the list of parsable grammar rules, e.g., from +-- 'BNFC.CF.ruleGroups. +fixPointKnownEmpty :: [(Cat, [Rule])] -> KnownEmpty +fixPointKnownEmpty cats = go (KnownEmpty Set.empty) + where + step = possiblyEmptyCats cats + + go x = if x == x' then x else go x' + where x' = step x + +-- | Transforms the given sentence such that the returned sentential form does +-- not match the empty string, and contains v'Optional' terms where needed. +-- +-- The returned list is a /choice/ list which is equivalent to the given +-- sentential form, but for the (potential) subtraction of empty matches. +-- +-- v'Optional' is inserted around symbols which previously matched the empty +-- string (according to the given 'KnownEmpty'). This compensates for +-- v'transformEmptyMatches' being applied to /other/ rules of the grammar. +-- +-- After this transformation is applied to all rules of the grammar, the +-- grammar should accept an identical language. However, the exact nodes which +-- match certain strings might change. +transformEmptyMatches :: KnownEmpty -> SentForm -> [OptSentForm] +transformEmptyMatches knownEmpty = unMatchesEmpty . possiblyEmptyRule knownEmpty + diff --git a/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs b/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs index 92ca550db..2529834cf 100644 --- a/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs +++ b/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs @@ -49,6 +49,10 @@ emptyPat = " [^\\u0000-\\uFFFF]?" -- | escape character according to Javascript regex format escapeCharFrom :: String -> Char -> String +escapeCharFrom _ '\n' = "\\n" +escapeCharFrom _ '\r' = "\\r" +escapeCharFrom _ '\t' = "\\t" +escapeCharFrom _ '\f' = "\\f" escapeCharFrom reservedChars x | x `elem` reservedChars = '\\' : [x] | otherwise = [x] diff --git a/source/src/BNFC/GetCF.hs b/source/src/BNFC/GetCF.hs index 3a8c681a6..385436181 100644 --- a/source/src/BNFC/GetCF.hs +++ b/source/src/BNFC/GetCF.hs @@ -20,6 +20,7 @@ module BNFC.GetCF ( parseCF, parseRawCF , checkRule, transItem + , fixTokenCats ) where import Control.Arrow (left) diff --git a/source/src/BNFC/Lexing.hs b/source/src/BNFC/Lexing.hs index 86f71d0e7..fd40947a6 100644 --- a/source/src/BNFC/Lexing.hs +++ b/source/src/BNFC/Lexing.hs @@ -1,7 +1,10 @@ {-# LANGUAGE PatternGuards #-} module BNFC.Lexing - ( mkLexer, LexType(..), mkRegMultilineComment + ( mkLexer + , LexType(..) + , mkRegMultilineComment + , mkRegSingleLineComment , debugPrint -- to avoid warning about unused definition ) where @@ -20,7 +23,7 @@ debugPrint = putStrLn . concat . words . printTree -- Abstract lexer -data LexType = LexComment | LexToken String | LexSymbols +data LexType = LexComment | LexToken String | LexSymbols deriving (Eq, Show) mkLexer :: CF -> [(Reg, LexType)] mkLexer cf = concat diff --git a/source/src/BNFC/Options.hs b/source/src/BNFC/Options.hs index d9a4df35c..2fec9c1e7 100644 --- a/source/src/BNFC/Options.hs +++ b/source/src/BNFC/Options.hs @@ -40,7 +40,7 @@ import System.FilePath (takeBaseName) import Text.Printf (printf) import Paths_BNFC (version) -import BNFC.CF (CF) +import BNFC.CF (CF, catIdent) import BNFC.Utils (unless) -- ~~~ Option data structures ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -147,6 +147,8 @@ data SharedOptions = Options --- C# specific , visualStudio :: Bool -- ^ Generate Visual Studio solution/project files. , wcf :: Bool -- ^ Windows Communication Foundation. + --- Tree-sitter specific + , treeSitterWord :: String -- ^ Option @--tree-sitter-word@. } deriving (Eq, Ord, Show) -- We take this opportunity to define the type of the backend functions. @@ -181,6 +183,8 @@ defaultOptions = Options -- C# specific , visualStudio = False , wcf = False + --- Tree-sitter specific + , treeSitterWord = catIdent } -- | Check whether an option is unchanged from the default. @@ -390,6 +394,10 @@ specificOptions = , ( Option [] ["agda"] (NoArg (\o -> o { agda = True, tokenText = TextToken })) "Also generate Agda bindings for the abstract syntax" , [TargetHaskell] ) + -- Tree-sitter backend: + , ( Option [] ["tree-sitter-word"] (ReqArg (\x o -> o { treeSitterWord = x }) "TOKEN") + "Use the given BNFC symbol as tree-sitter's \"word\" token" + , [TargetTreeSitter] ) ] -- | The list of specific options for a target. @@ -452,7 +460,7 @@ help = unlines $ title ++ , usageInfo "TARGET languages" targetOptions ] ++ map targetUsage helpTargets where - helpTargets = [ TargetHaskell, TargetJava, TargetC, TargetCpp ] + helpTargets = [ TargetHaskell, TargetJava, TargetC, TargetCpp, TargetTreeSitter ] targetUsage t = usageInfo (printf "Special options for the %s backend" (show t)) (specificOptions' t) diff --git a/source/src/BNFC/Utils.hs b/source/src/BNFC/Utils.hs index ebc6e1384..4d28aca8a 100644 --- a/source/src/BNFC/Utils.hs +++ b/source/src/BNFC/Utils.hs @@ -20,6 +20,7 @@ module BNFC.Utils , lowerCase, upperCase, mixedCase , camelCase, camelCase_ , snakeCase, snakeCase_ + , kebabCase, kebabCase_ , replace , writeFileRep , cstring @@ -301,6 +302,7 @@ data NameStyle = LowerCase -- ^ e.g. @lowercase@ | UpperCase -- ^ e.g. @UPPERCASE@ | SnakeCase -- ^ e.g. @snake_case@ + | KebabCase -- ^ e.g. @kebab-case@ | CamelCase -- ^ e.g. @CamelCase@ | MixedCase -- ^ e.g. @mixedCase@ | OrigCase -- ^ Keep original capitalization and form. @@ -341,8 +343,9 @@ data NameStyle mkName :: [String] -> NameStyle -> String -> String mkName reserved style s = notReserved name' where + suffix = if style == KebabCase then "-" else "_" notReserved name - | name `elem` reserved = notReserved (name ++ "_") + | name `elem` reserved = notReserved (name ++ suffix) | otherwise = name tokens = parseIdent s name' = case style of @@ -351,6 +354,7 @@ mkName reserved style s = notReserved name' CamelCase -> concatMap capitalize tokens MixedCase -> mapHead toLower $ concatMap capitalize tokens SnakeCase -> map toLower $ intercalate "_" tokens + KebabCase -> map toLower $ intercalate "-" tokens OrigCase -> s -- | Make first letter uppercase. @@ -485,6 +489,12 @@ snakeCase = text . snakeCase_ snakeCase_ :: String -> String snakeCase_ = mkName [] SnakeCase +kebabCase :: String -> Doc +kebabCase = text . kebabCase_ + +kebabCase_ :: String -> String +kebabCase_ = mkName [] KebabCase + -- ESCAPING -- | A function that renders a c-like string with escaped characters. diff --git a/source/test/BNFC/Backend/TreeSitter/.gitignore b/source/test/BNFC/Backend/TreeSitter/.gitignore new file mode 100644 index 000000000..38515f874 --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/.gitignore @@ -0,0 +1,2 @@ +tree-sitter-* +tmp-generate-* diff --git a/source/test/BNFC/Backend/TreeSitter/Makefile b/source/test/BNFC/Backend/TreeSitter/Makefile new file mode 100644 index 000000000..fde1ba846 --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/Makefile @@ -0,0 +1,23 @@ +.PHONY: all generate-all clean + +all: + bash -ec 'for f in *.cf; do js="$${f%.cf}".expected.js && rm -rfv $$js && $(MAKE) $$js; done' + $(MAKE) generate-all + git diff --exit-code --stat './*.js' + +%.expected.js: %.cf + d=`mktemp -d` && \ + cabal run bnfc -- -o $$d --tree-sitter $< && \ + grep -v 'File generated by' $$d/tree-sitter-*/grammar.js > $@ && \ + rm -rf $$d + +generate-all: + bash -ec 'for f in *.cf; do $(MAKE) generate-"$${f%.cf}"; done' + +generate-%: %.cf + cabal run bnfc -- -o tmp-$@ --tree-sitter --makefile $< + make -C tmp-$@/tree-sitter-* + +clean: + rm -rfv test-* + git restore -- './*.js' diff --git a/source/test/BNFC/Backend/TreeSitter/basic.cf b/source/test/BNFC/Backend/TreeSitter/basic.cf new file mode 100644 index 000000000..bbd40f6f0 --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/basic.cf @@ -0,0 +1,9 @@ +-- start symbol should be allowed to be optional +rules Start ::= MyIdent Start | ; + +comment "//"; +comment "/*" "*/"; + +token MyIdent 'a'+; + + diff --git a/source/test/BNFC/Backend/TreeSitter/basic.expected.js b/source/test/BNFC/Backend/TreeSitter/basic.expected.js new file mode 100644 index 000000000..e76e80d3c --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/basic.expected.js @@ -0,0 +1,29 @@ + +module.exports = grammar({ + name: "basic", + extras: $ =>[ + /\s/, + $.token_CommentSingle, + $.token_CommentMulti, + ], + rules: { + BNFCStart: $ => + optional( + // BNFCStart_Start. BNFCStart ::= Start ; + $.Start + ), + Start: $ => + choice( + // Start1. Start ::= MyIdent Start ; + seq($.token_MyIdent, optional($.Start)), + // Start2. Start ::= ; + choice() + ), + token_MyIdent: $ => + /a+/, + token_CommentSingle: $ => + /\/\/.*\n/, + token_CommentMulti: $ => + /\/\*[^*]*\*([^\*\/][^*]*\*|\*)*\//, + }, +}); diff --git a/source/test/BNFC/Backend/TreeSitter/list.cf b/source/test/BNFC/Backend/TreeSitter/list.cf new file mode 100644 index 000000000..abaf1900a --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/list.cf @@ -0,0 +1,36 @@ +rules Start ::= + "1" [NonEmptySeparator] + | "2" [MaybeEmptySeparator] + + | "3" [NonEmptyTerminator] + | "4" [MaybeEmptyTerminator] + + | "5" [MaybeEmptySeparatorEmptySep] + | "6" [MaybeEmptyTerminatorEmptySep] + + | "7" [NonEmptySeparatorEmptySep] + | "8" [NonEmptyTerminatorEmptySep]; + +rules NonEmptySeparator ::= "a"; +separator nonempty NonEmptySeparator ","; + +rules MaybeEmptySeparator ::= "a"; +separator MaybeEmptySeparator ","; + +rules NonEmptyTerminator ::= "a"; +terminator nonempty NonEmptyTerminator ","; + +rules MaybeEmptyTerminator ::= "a"; +terminator MaybeEmptyTerminator ","; + +rules MaybeEmptySeparatorEmptySep ::= "a"; +separator MaybeEmptySeparatorEmptySep ""; + +rules MaybeEmptyTerminatorEmptySep ::= "a"; +terminator MaybeEmptyTerminatorEmptySep ""; + +rules NonEmptySeparatorEmptySep ::= "a"; +separator nonempty NonEmptySeparatorEmptySep ""; + +rules NonEmptyTerminatorEmptySep ::= "a"; +terminator nonempty NonEmptyTerminatorEmptySep ""; diff --git a/source/test/BNFC/Backend/TreeSitter/list.expected.js b/source/test/BNFC/Backend/TreeSitter/list.expected.js new file mode 100644 index 000000000..82555a86a --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/list.expected.js @@ -0,0 +1,113 @@ + +module.exports = grammar({ + name: "list", + extras: $ =>[ + /\s/, + ], + rules: { + BNFCStart: $ => + // BNFCStart_Start. BNFCStart ::= Start ; + $.Start, + Start: $ => + choice( + // Start1. Start ::= "1" [NonEmptySeparator] ; + seq("1", $.list_NonEmptySeparator), + // Start2. Start ::= "2" [MaybeEmptySeparator] ; + seq("2", optional($.list_MaybeEmptySeparator)), + // Start3. Start ::= "3" [NonEmptyTerminator] ; + seq("3", $.list_NonEmptyTerminator), + // Start4. Start ::= "4" [MaybeEmptyTerminator] ; + seq("4", optional($.list_MaybeEmptyTerminator)), + // Start5. Start ::= "5" [MaybeEmptySeparatorEmptySep] ; + seq("5", optional($.list_MaybeEmptySeparatorEmptySep)), + // Start6. Start ::= "6" [MaybeEmptyTerminatorEmptySep] ; + seq("6", optional($.list_MaybeEmptyTerminatorEmptySep)), + // Start7. Start ::= "7" [NonEmptySeparatorEmptySep] ; + seq("7", $.list_NonEmptySeparatorEmptySep), + // Start8. Start ::= "8" [NonEmptyTerminatorEmptySep] ; + seq("8", $.list_NonEmptyTerminatorEmptySep) + ), + NonEmptySeparator: $ => + // NonEmptySeparator_a. NonEmptySeparator ::= "a" ; + "a", + list_NonEmptySeparator: $ => + choice( + // (:[]). [NonEmptySeparator] ::= NonEmptySeparator ; + $.NonEmptySeparator, + // (:). [NonEmptySeparator] ::= NonEmptySeparator "," [NonEmptySeparator] ; + seq($.NonEmptySeparator, ",", $.list_NonEmptySeparator) + ), + MaybeEmptySeparator: $ => + // MaybeEmptySeparator_a. MaybeEmptySeparator ::= "a" ; + "a", + list_MaybeEmptySeparator: $ => + choice( + // []. [MaybeEmptySeparator] ::= ; + choice(), + // (:[]). [MaybeEmptySeparator] ::= MaybeEmptySeparator ; + $.MaybeEmptySeparator, + // (:). [MaybeEmptySeparator] ::= MaybeEmptySeparator "," [MaybeEmptySeparator] ; + seq($.MaybeEmptySeparator, ",", optional($.list_MaybeEmptySeparator)) + ), + NonEmptyTerminator: $ => + // NonEmptyTerminator_a. NonEmptyTerminator ::= "a" ; + "a", + list_NonEmptyTerminator: $ => + choice( + // (:[]). [NonEmptyTerminator] ::= NonEmptyTerminator "," ; + seq($.NonEmptyTerminator, ","), + // (:). [NonEmptyTerminator] ::= NonEmptyTerminator "," [NonEmptyTerminator] ; + seq($.NonEmptyTerminator, ",", $.list_NonEmptyTerminator) + ), + MaybeEmptyTerminator: $ => + // MaybeEmptyTerminator_a. MaybeEmptyTerminator ::= "a" ; + "a", + list_MaybeEmptyTerminator: $ => + choice( + // []. [MaybeEmptyTerminator] ::= ; + choice(), + // (:). [MaybeEmptyTerminator] ::= MaybeEmptyTerminator "," [MaybeEmptyTerminator] ; + seq($.MaybeEmptyTerminator, ",", optional($.list_MaybeEmptyTerminator)) + ), + MaybeEmptySeparatorEmptySep: $ => + // MaybeEmptySeparatorEmptySep_a. MaybeEmptySeparatorEmptySep ::= "a" ; + "a", + list_MaybeEmptySeparatorEmptySep: $ => + choice( + // []. [MaybeEmptySeparatorEmptySep] ::= ; + choice(), + // (:). [MaybeEmptySeparatorEmptySep] ::= MaybeEmptySeparatorEmptySep [MaybeEmptySeparatorEmptySep] ; + seq($.MaybeEmptySeparatorEmptySep, optional($.list_MaybeEmptySeparatorEmptySep)) + ), + MaybeEmptyTerminatorEmptySep: $ => + // MaybeEmptyTerminatorEmptySep_a. MaybeEmptyTerminatorEmptySep ::= "a" ; + "a", + list_MaybeEmptyTerminatorEmptySep: $ => + choice( + // []. [MaybeEmptyTerminatorEmptySep] ::= ; + choice(), + // (:). [MaybeEmptyTerminatorEmptySep] ::= MaybeEmptyTerminatorEmptySep [MaybeEmptyTerminatorEmptySep] ; + seq($.MaybeEmptyTerminatorEmptySep, optional($.list_MaybeEmptyTerminatorEmptySep)) + ), + NonEmptySeparatorEmptySep: $ => + // NonEmptySeparatorEmptySep_a. NonEmptySeparatorEmptySep ::= "a" ; + "a", + list_NonEmptySeparatorEmptySep: $ => + choice( + // (:[]). [NonEmptySeparatorEmptySep] ::= NonEmptySeparatorEmptySep ; + $.NonEmptySeparatorEmptySep, + // (:). [NonEmptySeparatorEmptySep] ::= NonEmptySeparatorEmptySep [NonEmptySeparatorEmptySep] ; + seq($.NonEmptySeparatorEmptySep, $.list_NonEmptySeparatorEmptySep) + ), + NonEmptyTerminatorEmptySep: $ => + // NonEmptyTerminatorEmptySep_a. NonEmptyTerminatorEmptySep ::= "a" ; + "a", + list_NonEmptyTerminatorEmptySep: $ => + choice( + // (:[]). [NonEmptyTerminatorEmptySep] ::= NonEmptyTerminatorEmptySep ; + $.NonEmptyTerminatorEmptySep, + // (:). [NonEmptyTerminatorEmptySep] ::= NonEmptyTerminatorEmptySep [NonEmptyTerminatorEmptySep] ; + seq($.NonEmptyTerminatorEmptySep, $.list_NonEmptyTerminatorEmptySep) + ), + }, +}); diff --git a/source/test/BNFC/Backend/TreeSitter/list_trailing.cf b/source/test/BNFC/Backend/TreeSitter/list_trailing.cf new file mode 100644 index 000000000..be1245489 --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/list_trailing.cf @@ -0,0 +1,6 @@ +rules Start ::= "1" [A]; + +rules Opt ::= "X" | ; + +rules A ::= Opt; +separator A ","; diff --git a/source/test/BNFC/Backend/TreeSitter/list_trailing.expected.js b/source/test/BNFC/Backend/TreeSitter/list_trailing.expected.js new file mode 100644 index 000000000..036ec7520 --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/list_trailing.expected.js @@ -0,0 +1,34 @@ + +module.exports = grammar({ + name: "list_trailing", + extras: $ =>[ + /\s/, + ], + rules: { + BNFCStart: $ => + // BNFCStart_Start. BNFCStart ::= Start ; + $.Start, + Start: $ => + // Start1. Start ::= "1" [A] ; + seq("1", optional($.list_A)), + Opt: $ => + choice( + // Opt_X. Opt ::= "X" ; + "X", + // Opt1. Opt ::= ; + choice() + ), + A: $ => + // AOpt. A ::= Opt ; + $.Opt, + list_A: $ => + choice( + // []. [A] ::= ; + choice(), + // (:[]). [A] ::= A ; + $.A, + // (:). [A] ::= A "," [A] ; + seq(optional($.A), ",", optional($.list_A)) + ), + }, +}); diff --git a/source/test/BNFC/Backend/TreeSitter/seq.cf b/source/test/BNFC/Backend/TreeSitter/seq.cf new file mode 100644 index 000000000..d1e8f6653 --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/seq.cf @@ -0,0 +1,10 @@ +rules Start ::= TransitivelyOptional | NonOptional; +entrypoints Start; + +rules TransitivelyOptional ::= A B C; + +rules A ::= "a" | ; +rules B ::= "b" | ; +rules C ::= "c" | ; + +rules NonOptional ::= "x" A B C; diff --git a/source/test/BNFC/Backend/TreeSitter/seq.expected.js b/source/test/BNFC/Backend/TreeSitter/seq.expected.js new file mode 100644 index 000000000..1754f78a6 --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/seq.expected.js @@ -0,0 +1,52 @@ + +module.exports = grammar({ + name: "seq", + extras: $ =>[ + /\s/, + ], + rules: { + BNFCStart: $ => + optional( + // BNFCStart_Start. BNFCStart ::= Start ; + $.Start + ), + Start: $ => + choice( + // StartTransitivelyOptional. Start ::= TransitivelyOptional ; + $.TransitivelyOptional, + // StartNonOptional. Start ::= NonOptional ; + $.NonOptional + ), + TransitivelyOptional: $ => + // TransitivelyOptional1. TransitivelyOptional ::= A B C ; + choice( + seq($.A, optional($.B), optional($.C)), + seq($.B, optional($.C)), + $.C + ), + A: $ => + choice( + // A_a. A ::= "a" ; + "a", + // A1. A ::= ; + choice() + ), + B: $ => + choice( + // B_b. B ::= "b" ; + "b", + // B1. B ::= ; + choice() + ), + C: $ => + choice( + // C_c. C ::= "c" ; + "c", + // C1. C ::= ; + choice() + ), + NonOptional: $ => + // NonOptional1. NonOptional ::= "x" A B C ; + seq("x", optional($.A), optional($.B), optional($.C)), + }, +}); diff --git a/source/test/BNFC/Backend/TreeSitter/tree-sitter.json b/source/test/BNFC/Backend/TreeSitter/tree-sitter.json new file mode 100644 index 000000000..382d50b0c --- /dev/null +++ b/source/test/BNFC/Backend/TreeSitter/tree-sitter.json @@ -0,0 +1,38 @@ +{ + "$schema": "https://tree-sitter.github.io/tree-sitter/assets/schemas/config.schema.json", + "grammars": [ + { + "name": "test", + "camelcase": "Test", + "title": "Test", + "scope": "source.test", + "file-types": [ + "test" + ], + "injection-regex": "^test$", + "class-name": "TreeSitterTest" + } + ], + "metadata": { + "version": "0.1.0", + "license": "", + "description": "Test grammar for tree-sitter", + "authors": [ + { + "name": "BNFC Test" + } + ], + "links": { + "repository": "https://test.invalid" + } + }, + "bindings": { + "c": false, + "go": false, + "node": false, + "python": false, + "rust": false, + "swift": false, + "zig": false + } +} diff --git a/source/test/BNFC/Backend/TreeSitterSpec.hs b/source/test/BNFC/Backend/TreeSitterSpec.hs index 1651a10ad..93ae6f879 100644 --- a/source/test/BNFC/Backend/TreeSitterSpec.hs +++ b/source/test/BNFC/Backend/TreeSitterSpec.hs @@ -1,9 +1,18 @@ module BNFC.Backend.TreeSitterSpec where +import qualified Paths_BNFC + +import System.FilePath +import System.Directory(listDirectory) + +import qualified Data.List as List + +import BNFC.Backend.Base(fileName, execBackend) import BNFC.Options import BNFC.GetCF import Test.Hspec +import Test.HUnit ((@?)) import BNFC.Hspec import BNFC.Backend.TreeSitter -- SUT @@ -17,9 +26,42 @@ getCalc = parseCF calcOptions TargetTreeSitter $ , "EInt. Exp2 ::= Integer ;" , "coercions Exp 2 ;" ] +listDataFiles = do + dataDir <- Paths_BNFC.getDataDir + let dir = dataDir "test/BNFC/Backend/TreeSitter" + + files <- listDirectory dir + pure $ map (dir ) $ filter ((== ".cf") . takeExtension) files + +runFileTest filename = do + let opts = (defaultOptions { lang = takeBaseName filename}) + + bnfc <- readFile (filename -<.> "cf") + expected <- readFile (filename -<.> "expected.js") + + cf <- parseCF opts TargetTreeSitter bnfc + let backend = makeTreeSitter opts cf + + -- get the name of the grammar.js file within a subfolder + fileNames <- map fileName <$> execBackend backend + let (Just grammarJs) = List.find ((== "grammar.js") . takeFileName) fileNames + + backend `shouldGenerateText` (grammarJs, expected) + +makeFileTest filename = + it ("tree-sitter expect test: " ++ filename) $ + runFileTest filename + spec = do describe "Tree-Sitter backend" $ do it "creates the grammar.js file" $ do calc <- getCalc - makeTreeSitter calcOptions calc `shouldGenerate` "grammar.js" + makeTreeSitter calcOptions calc `shouldGenerate` ("tree-sitter-calc" "grammar.js") + + cfFiles <- runIO listDataFiles + + it "should find at least one expect test" $ do + not (null cfFiles) @? "no .cf files found" + + mapM_ makeFileTest cfFiles diff --git a/source/test/BNFC/Hspec.hs b/source/test/BNFC/Hspec.hs index dc91aad62..62c9ca639 100644 --- a/source/test/BNFC/Hspec.hs +++ b/source/test/BNFC/Hspec.hs @@ -3,11 +3,12 @@ module BNFC.Hspec where import Text.Printf +import Data.Char(isSpace) import BNFC.Backend.Base import Test.Hspec -import Test.HUnit ((@?)) +import Test.HUnit ((@?), (@=?), assertFailure) -- | Expectation that a backend generates a particular file. @@ -21,3 +22,16 @@ backend `shouldGenerate` file = do let filenames = map fileName files file `elem` filenames @? printf "file %s not found in %s" file (show filenames) + +shouldGenerateText + :: Backend -- ^ Backend to run. + -> (String, String) -- ^ Expected name of file and its expected contents. + -> Expectation +backend `shouldGenerateText` (file, expected) = do + backendFiles <- execBackend backend + let files = map (\x -> (fileName x, fileContent x)) backendFiles + let canon = dropWhile isSpace + case lookup file files of + Nothing -> assertFailure $ printf "file %s not found in %s" file (show (map fst files)) + Just content -> canon expected @=? canon content + diff --git a/testing/src/ParameterizedTests.hs b/testing/src/ParameterizedTests.hs index ce0c945c5..54fc05662 100644 --- a/testing/src/ParameterizedTests.hs +++ b/testing/src/ParameterizedTests.hs @@ -61,7 +61,8 @@ allWithParams params = makeTestSuite (tpName params) $ concat $ -- Use it while working in connection with a certain test case. (For quicker response.) current :: Test -- current = currentExampleTest -current = currentRegressionTest +-- current = currentRegressionTest +current = makeTestSuite "TS" [allWithParams p | p <- parameters ] -- current = layoutTest currentExampleTest :: Test @@ -421,6 +422,8 @@ parameters = concat , javaParams { tpName = "Java (with jflex and line numbers)" , tpBnfcOptions = ["--java", "--jflex", "-l"] } ] + -- Tree-sitter + , [ treeSitter ] ] where base = baseParameters @@ -444,6 +447,14 @@ parameters = concat , tpBnfcOptions = ["--ocaml"] , tpRunTestProg = haskellRunTestProg } + treeSitter = TP + { tpName = "tree-sitter" + , tpBuild = do + cmd "tree-sitter" "generate" . (:[]) =<< findFile "grammar.js" + , tpBnfcOptions = ["--tree-sitter"] + , tpRunTestProg = \ _lang args -> do + cmd "tree-sitter" "parse" args + } -- | Helper function that runs bnfc with the context's options and an -- option to generate 'tpMakefile'.