diff --git a/CHANGELOG.md b/CHANGELOG.md index e610530f..83402d2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog for `distributors` +## 0.6.0.0 - 2026-04-24 + +### New Module + +- Added `Control.Lens.Grammar.Machine` as the transducer/matching runtime layer. + +### New Types + +- Added `Transducer` and `TransducerStep` as the finite-state representation for + compiled grammar machines. + +### New APIs (Machine Runtime) + +- Added `transducer` to compile `Bnf (RegEx token)` into `Transducer`. +- Added `parseForest` to reconstruct parse forests with rule labels and token spans/slices, + returning the remaining unparsed suffix. +- Added `expectNext` to compute scanner-frontier expected token classes after a prefix. +- Added `languageSample` to lazily enumerate sampled language words from shortest length upward. +- Added `unreachableRules` to report dead nonterminals unreachable from the start expression. + +### Internal Machinery + +- Implemented Thompson-style transducer construction over `RegEx`-extended BNF. +- Implemented Earley-style chart runtime (`initialChart`, `closeChartAt`, `scanClassOptions`, + `prefixGen`) with predict/complete closure and scanner grouping. +- Added completion-time caller indexing/cache optimizations and precomputed rule nullability/first-state + indexing to speed machine execution. + +### Grammar Integration + +- `Control.Lens.Grammar` now exposes machine-backed generators: + `transducerG` and the parse-forest examples/docs built on the Machine runtime. + ## 0.5.0.0 - 2026-04-16 ### Changes diff --git a/distributors.cabal b/distributors.cabal index 546551fa..2794cd48 100644 --- a/distributors.cabal +++ b/distributors.cabal @@ -5,7 +5,7 @@ cabal-version: 2.2 -- see: https://github.com/sol/hpack name: distributors -version: 0.5.0.0 +version: 0.6.0.0 synopsis: Unifying Parsers, Printers & Grammars description: Distributors provides mathematically inspired abstractions for coders to write parsers that can also be inverted to printers. category: Profunctors, Optics, Parsing @@ -36,6 +36,7 @@ library Control.Lens.Grammar.Internal.NestedPrismTH Control.Lens.Grammar.Internal.Orphanage Control.Lens.Grammar.Kleene + Control.Lens.Grammar.Machine Control.Lens.Grammar.Symbol Control.Lens.Grammar.Token Control.Lens.Grate @@ -98,6 +99,7 @@ library ghc-options: -Wall -Wcompat -Widentities -Wincomplete-record-updates -Wincomplete-uni-patterns -Wmissing-export-lists -Wmissing-home-modules -Wpartial-fields -Wredundant-constraints build-depends: MemoTrie >=0.6 && <1 + , QuickCheck >=2.14 && <3 , adjunctions >=4.4 && <5 , base >=4.15 && <5 , bifunctors >=5.5 && <6 @@ -106,12 +108,15 @@ library , contravariant >=1.5 && <2 , distributive >=0.6 && <1 , lens >=5.0 && <6 + , monad-loops >=0.4.3 && <1 , mtl >=2.2 && <3 , profunctors >=5.6 && <6 + , random >=1.2 && <2 , tagged >=0.8 && <1 , template-haskell >=2.17 && <3 , text ==2.* , th-abstraction >=0.4 && <1 + , transformers >=0.5 && <1 , vector >=0.12 && <1 , witherable >=0.4 && <1 default-language: Haskell2010 @@ -188,12 +193,15 @@ test-suite test , hspec >=2.7 && <3 , lens >=5.0 && <6 , megaparsec >=9.0 && <10 + , monad-loops >=0.4.3 && <1 , mtl >=2.2 && <3 , profunctors >=5.6 && <6 + , random >=1.2 && <2 , tagged >=0.8 && <1 , template-haskell >=2.17 && <3 , text ==2.* , th-abstraction >=0.4 && <1 + , transformers >=0.5 && <1 , vector >=0.12 && <1 , witherable >=0.4 && <1 default-language: Haskell2010 diff --git a/package.yaml b/package.yaml index 9f1c9a07..4319745c 100644 --- a/package.yaml +++ b/package.yaml @@ -1,5 +1,5 @@ name: distributors -version: 0.5.0.0 +version: 0.6.0.0 github: "morphismtech/distributors" license: BSD-3-Clause author: "Eitan Chatav" @@ -27,7 +27,11 @@ dependencies: - distributive >= 0.6 && < 1 - lens >= 5.0 && < 6 - MemoTrie >= 0.6 && < 1 +- QuickCheck >= 2.14 && < 3 +- monad-loops >=0.4.3 && < 1 - mtl >= 2.2 && < 3 +- random >= 1.2 && < 2 +- transformers >= 0.5 && < 1 - profunctors >= 5.6 && < 6 - tagged >= 0.8 && < 1 - template-haskell >= 2.17 && < 3 diff --git a/src/Control/Lens/Grammar.hs b/src/Control/Lens/Grammar.hs index 0cf344f7..8ac3779e 100644 --- a/src/Control/Lens/Grammar.hs +++ b/src/Control/Lens/Grammar.hs @@ -24,6 +24,7 @@ module Control.Lens.Grammar , regbnfG , regbnfGrammar , applicativeG + , transducerG -- * Context-sensitive grammar , CtxGrammar , printG @@ -45,6 +46,7 @@ import Control.Lens.PartialIso import Control.Lens.Grammar.BackusNaur import Control.Lens.Grammar.Boole import Control.Lens.Grammar.Kleene +import Control.Lens.Grammar.Machine import Control.Lens.Grammar.Token import Control.Lens.Grammar.Symbol import Data.Bifunctor.Joker @@ -67,6 +69,7 @@ import Witherable import Control.Lens.Grammar.BackusNaur as X import Control.Lens.Grammar.Boole as X import Control.Lens.Grammar.Kleene as X +import Control.Lens.Grammar.Machine as X import Control.Lens.Grammar.Symbol as X import Control.Lens.Grammar.Token as X import Control.Lens.PartialIso as X @@ -789,6 +792,71 @@ It can apply to a `RegGrammar`. regbnfG :: Grammar Char a -> RegBnf regbnfG bnf = runGrammor bnf +{- | Compile a `Grammar` into a `Transducer`. + +>>> let regexMachine = transducerG @Char regexGrammar + +A transducer is a form of finite state machine, +usable as an intermediary for further generators like +`=~`, `expectNext`, `languageSample`, `parseForest` & `unreachableRules`. + +>>> import Test.QuickCheck +>>> let regexLang = languageSample @Char regexMachine +>>> words100 <- generate (take 100 <$> regexLang) +>>> quickCheck (property (all (=~ regexMachine) words100)) ++++ OK, passed 1 test. +>>> import Control.Monad.State +>>> import System.Random +>>> let gen = mkStdGen 69 +>>> evalState (take 15 <$> regexLang) gen +["","|","\776269","()","[]","\\[","||","|\249908","\770923*","\1008821+","\318904?","\845807|","\477898\1026934","()*","()+"] + +>>> import Data.Tree (drawForest) + +@>>> let (forest, _) = parseForest regexMachine "xy|z" in putStr (drawForest (map (fmap show) forest)) +("regex",0,4,"xy|z") +| +`- ("alternate",0,4,"xy|z") + | + +- ("sequence",0,2,"xy") + | | + | +- ("expression",0,1,"x") + | | | + | | `- ("atom",0,1,"x") + | | | + | | `- ("class",0,1,"x") + | | | + | | `- ("class-one-of",0,1,"x") + | | | + | | `- ("char",0,1,"x") + | | + | `- ("expression",1,2,"y") + | | + | `- ("atom",1,2,"y") + | | + | `- ("class",1,2,"y") + | | + | `- ("class-one-of",1,2,"y") + | | + | `- ("char",1,2,"y") + | + `- ("sequence",3,4,"z") + | + `- ("expression",3,4,"z") + | + `- ("atom",3,4,"z") + | + `- ("class",3,4,"z") + | + `- ("class-one-of",3,4,"z") + | + `- ("char",3,4,"z") +@ + +-} +transducerG :: Categorized token => Grammar token a -> Transducer token +transducerG bnf = transducer (runGrammor bnf) + {- | `printG` generates a printer from a `CtxGrammar`. Since both `RegGrammar`s and context-free `Grammar`s are `CtxGrammar`s, the type system will allow `printG` to be applied to them. diff --git a/src/Control/Lens/Grammar/BackusNaur.hs b/src/Control/Lens/Grammar/BackusNaur.hs index bd47607e..ebfe058a 100644 --- a/src/Control/Lens/Grammar/BackusNaur.hs +++ b/src/Control/Lens/Grammar/BackusNaur.hs @@ -19,13 +19,10 @@ module Control.Lens.Grammar.BackusNaur , liftBnf0 , liftBnf1 , liftBnf2 - -- * Matching - , Matching (..) , diffB ) where import Control.Lens -import Control.Lens.Extras import Control.Lens.Grammar.Kleene import Control.Lens.Grammar.Token import Control.Lens.Grammar.Symbol @@ -107,11 +104,6 @@ liftBnf2 liftBnf2 f (Bnf start0 rules0) (Bnf start1 rules1) = Bnf (f start0 start1) (Set.map coerce rules0 <> Set.map coerce rules1) --- | Does a word match a pattern? -class Matching word pattern | pattern -> word where - (=~) :: word -> pattern -> Bool - infix 2 =~ - {- | The [Brzozowski derivative] (https://dl.acm.org/doi/pdf/10.1145/321239.321249) of a @@ -210,11 +202,3 @@ instance (Ord rule, Monoid rule) => Monoid (Bnf rule) where mempty = liftBnf0 mempty instance (Ord rule, Semigroup rule) => Semigroup (Bnf rule) where (<>) = liftBnf2 (<>) -instance (Categorized token, HasTrie token) - => Matching [token] (Bnf (RegEx token)) where - (=~) word = δ . diffB word -instance (Categorized token, HasTrie token) - => Matching [token] (RegEx token) where - word =~ pattern = word =~ liftBnf0 pattern -instance Matching s (APrism s t a b) where - word =~ pattern = is pattern word diff --git a/src/Control/Lens/Grammar/Kleene.hs b/src/Control/Lens/Grammar/Kleene.hs index 52d74473..fe8018a5 100644 --- a/src/Control/Lens/Grammar/Kleene.hs +++ b/src/Control/Lens/Grammar/Kleene.hs @@ -29,6 +29,8 @@ import Control.Applicative import Control.Lens.Grammar.Boole import Control.Lens.Grammar.Symbol import Control.Lens.Grammar.Token +import Control.Monad.Loops +import Control.Monad.State (StateT, state) import Data.Bifunctor.Joker import Data.Foldable import Data.MemoTrie @@ -38,6 +40,10 @@ import Data.Profunctor.Distributor import Data.Set (Set) import qualified Data.Set as Set import GHC.Generics +import System.Random (RandomGen, Random, random) +import Test.QuickCheck.Arbitrary +import Test.QuickCheck.Gen (Gen) +import qualified Test.QuickCheck.Gen as Gen import Text.ParserCombinators.ReadP (ReadP) import qualified Text.ParserCombinators.ReadP as ReadP @@ -225,6 +231,25 @@ instance TokenAlgebra token (f token) tokenClass = Joker . tokenClass instance TokenAlgebra Char (ReadP Char) where tokenClass = ReadP.satisfy . tokenClass +instance (Categorized token, Arbitrary token) => TokenAlgebra token (Gen token) where + tokenClass (TokenClass exam) = case exam of + OneOf xs -> oneOf xs + NotOneOf xs (AndAsIn cat) -> arbitrary `Gen.suchThat` + (\x -> x `notElem` xs && categorize x == cat) + NotOneOf xs (AndNotAsIn cats) -> arbitrary `Gen.suchThat` + (\x -> x `notElem` xs && categorize x `notElem` cats) + Alternate cls1 cls2 -> Gen.oneof [tokenClass cls1, tokenClass cls2] +instance (RandomGen g, Monad m, Categorized token, Random token) + => TokenAlgebra token (StateT g m token) where + tokenClass (TokenClass exam) = case exam of + OneOf xs -> oneOf xs + NotOneOf xs (AndAsIn cat) -> + iterateUntil (\x -> x `notElem` xs && categorize x == cat) anyToken + NotOneOf xs (AndNotAsIn cats) -> + iterateUntil (\x -> x `notElem` xs && categorize x `notElem` cats) anyToken + Alternate cls1 cls2 -> do + b <- state random + if (b :: Bool) then tokenClass cls1 else tokenClass cls2 instance Categorized token => Monoid (RegEx token) where mempty = SeqEmpty instance Categorized token => Semigroup (RegEx token) where diff --git a/src/Control/Lens/Grammar/Machine.hs b/src/Control/Lens/Grammar/Machine.hs new file mode 100644 index 00000000..4f778b95 --- /dev/null +++ b/src/Control/Lens/Grammar/Machine.hs @@ -0,0 +1,556 @@ +{- | +Module : Control.Lens.Grammar.Machine +Description : matching & transducers +Copyright : (C) 2026 - Eitan Chatav +License : BSD-style (see the file LICENSE) +Maintainer : Eitan Chatav +Stability : provisional +Portability : non-portable +-} + +module Control.Lens.Grammar.Machine + ( -- * Matching + Matching (..) + -- * Transducer + , transducer + , parseForest + , languageSample + , expectNext + , unreachableRules + , Transducer (..) + , TransducerStep (..) + ) where + +import Control.Lens +import Control.Lens.Extras +import Control.Lens.Grammar.BackusNaur +import Control.Lens.Grammar.Boole +import Control.Lens.Grammar.Kleene +import Control.Lens.Grammar.Token +import Data.Foldable +import qualified Data.IntMap.Strict as IntMap +import Data.IntMap.Strict (IntMap) +import qualified Data.IntSet as IntSet +import Data.IntSet (IntSet) +import qualified Data.Map.Strict as Map +import Data.Map.Strict (Map) +import qualified Data.Set as Set +import Data.Set (Set) +import Data.Tree (Tree (..)) + +-- | Does a word match a pattern? +class Matching word pattern | pattern -> word where + (=~) :: word -> pattern -> Bool + infix 2 =~ +-- instances +instance Categorized token + => Matching [token] (Transducer token) where + word =~ et = acceptsChart n chart + where + (n, chart) = prefixGen et word +instance Categorized token + => Matching [token] (Bnf (RegEx token)) where + word =~ bnf = word =~ transducer bnf +instance Categorized token + => Matching [token] (RegEx token) where + word =~ pattern = word =~ liftBnf0 pattern +instance Matching s (APrism s t a b) where + word =~ pattern = is pattern word + +{-| A `Transducer` is a tuple + +@ +T = (Σ, Δ, Q, I ⊆ Q, F ∈ Q, transition ⊆ Q × (Σ ∪ ∆) × Q, output ⊆ Q × ∆) +@ + +* @Σ@ is a (possibly infinite) set of terminal token classes, represented by `TokenClass`es. +* @Δ@ is a finite set of nonterminals, represented by the key set of `transducerRules`. +* @Q@ is a set of states, which is represented by the key set of `transducerRelations`. +* @I@ are initial states represented by `transducerStarts`. +* @F@ is a final state represented by @0@. +* @transition@ is a relation represented by `transducerRelations` + with `TransitionTokenClass` and `TransitionNonTerminal` transitions. +* @output@ is a relation represented by `transducerRelations` with `EmitNonTerminal` outputs. +-} +data Transducer token = Transducer + { transducerRelations :: IntMap (TransducerStep token) + , transducerRules :: Map String (IntSet, Bool) + -- ^ an index into `transducerRelations` for nonterminals with precomputed nullability + , transducerStarts :: IntSet + -- ^ an index into `transducerRelations` for the starting rule + } + +-- | A `TransducerStep` in a `Transducer`. +data TransducerStep token + = TransitionTokenClass (TokenClass token) IntSet + | TransitionNonTerminal String IntSet + | EmitNonTerminal String + +{- | Compile a `RegEx`tended `Bnf` into a `Transducer`, +using a combination of Thompson's algorithm for regular expressions +and Earley's algorithm for context-free grammars. See Jim & Mandelbaum, +[Efficient Earley Parsing with Regular Right-hand Sides] +(http://trevorjim.com/papers/ldta-2009.pdf), +and McIlroy, [Enumerating the strings of regular languages] +(https://www.cs.dartmouth.edu/~doug/nfa.pdf). + +A transducer is a form of finite state machine +that can be run in various ways like +`=~`, `expectNext`, `languageSample`, `parseForest` & `unreachableRules`. +-} +transducer :: Bnf (RegEx token) -> Transducer token +transducer (Bnf start rules) = Transducer + { transducerRelations = IntMap.fromList allStates + , transducerRules = Map.fromList + [ ( n + , ( Map.findWithDefault IntSet.empty n firstsMap + , Set.member n nullSet + ) + ) + | n <- Map.keys ruleMap + ] + , transducerStarts = startStates + } + + where + + ruleMap = foldr + (\(n, r) -> Map.insertWith (++) n [r]) Map.empty (toList rules) + + rexNullable nm = \case + SeqEmpty -> True + NonTerminal n -> Set.member n nm + Sequence x y -> rexNullable nm x && rexNullable nm y + KleeneStar _ -> True + KleeneOpt _ -> True + KleenePlus x -> rexNullable nm x + RegExam (Alternate x y) -> rexNullable nm x || rexNullable nm y + RegExam (OneOf _) -> False + RegExam (NotOneOf _ _) -> False + + ruleNames = Map.keys ruleMap + + iterNull ns = + let ns' = Set.fromList + [ n + | n <- ruleNames + , any (rexNullable ns) (Map.findWithDefault [] n ruleMap) + ] + in if ns == ns' then ns else iterNull ns' + + nullSet = iterNull Set.empty + + transducerAcceptId0 = 0 + + (finalMap, nextIdAfterFinals) = + foldl' alloc (Map.empty, transducerAcceptId0 + 1) ruleNames + where alloc (m, i) n = (Map.insert n i m, i + 1) + + finalStatesList = [(finalMap Map.! n, EmitNonTerminal n) | n <- ruleNames] + + (rulesStatesList, firstsMap, nextIdAfterRules) = + foldl' compileRule ([], Map.empty, nextIdAfterFinals) (Map.toList ruleMap) + where + compileRule (sts, fm, nid) (name, prods) = + let finalId = finalMap Map.! name + (newSts, newFirsts, nid') = + foldl' compileProd ([], IntSet.empty, nid) prods + compileProd (s, fs, i) prod = + let (f, st, i', _) = + thompson prod i (IntSet.singleton finalId) + in (s <> st, fs <> f, i') + in (sts <> newSts, Map.insert name newFirsts fm, nid') + + (startFirsts, startStatesRaw, _, startBypass) = + thompson start nextIdAfterRules (IntSet.singleton transducerAcceptId0) + + startStates = + startFirsts <> bypassStates startBypass (IntSet.singleton transducerAcceptId0) + + allStates = finalStatesList <> rulesStatesList <> startStatesRaw + + bypassStates True = id + bypassStates False = const IntSet.empty + + thompson rex nextId dests = case rex of + SeqEmpty -> (IntSet.empty, [], nextId, True) + NonTerminal name -> + ( IntSet.singleton nextId + , [(nextId, TransitionNonTerminal name dests)] + , nextId + 1 + , Set.member name nullSet + ) + Sequence rex0 rex1 -> + let + (firsts1, states1, nextId1, bypass1) = thompson rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = + thompson rex0 nextId1 (firsts1 <> bypassStates bypass1 dests) + in + ( firsts0 <> bypassStates bypass0 firsts1 + , states0 <> states1 + , nextId0 + , bypass0 && bypass1 + ) + KleeneStar rex0 -> + let + (firsts, states, nextId', _) = thompson rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', True) + KleeneOpt rex0 -> + let + (firsts, states, nextId', _) = thompson rex0 nextId dests + in + (firsts, states, nextId', True) + KleenePlus rex0 -> + let + (firsts, states, nextId', bypass) = thompson rex0 nextId (firsts <> dests) + in + (firsts, states, nextId', bypass) + RegExam (OneOf chars) + | Set.null chars -> (IntSet.empty, [], nextId, False) + | otherwise -> + ( IntSet.singleton nextId + , [(nextId, TransitionTokenClass (TokenClass (OneOf chars)) dests)] + , nextId + 1 + , False + ) + RegExam (NotOneOf chars catTest) -> + ( IntSet.singleton nextId + , [(nextId, TransitionTokenClass (TokenClass (NotOneOf chars catTest)) dests)] + , nextId + 1 + , False + ) + RegExam (Alternate rex0 rex1) -> + let + (firsts1, states1, nextId1, bypass1) = thompson rex1 nextId dests + (firsts0, states0, nextId0, bypass0) = thompson rex0 nextId1 dests + in + ( firsts0 <> firsts1 + , states0 <> states1 + , nextId0 + , bypass0 || bypass1 + ) + +{- | The parse forest of a string of tokens. -} +parseForest + :: Categorized token + => Transducer token + -> [token] -- ^ string + -> ([Tree (String, Int, Int, [token])], [token]) + {- ^ parse forest & remaining unparsed tokens -} +parseForest et word = (concat (itemForests Set.empty Nothing 0 acceptedLen 0), drop acceptedLen word) + where + (n, chart) = prefixGen et word + relations = transducerRelations et + acceptedLen = maximum [j | j <- [0 .. n], acceptsChart j chart] + + acceptedWord = take acceptedLen word + sliceAt start end = take (end - start) (drop start acceptedWord) + itemsAt j = IntMap.findWithDefault IntMap.empty j chart + ruleInfo name = Map.findWithDefault (IntSet.empty, False) name (transducerRules et) + + edgesAt :: IntMap (IntMap [edge]) -> Int -> Int -> [edge] + edgesAt table pos stateId = + IntMap.findWithDefault [] stateId (IntMap.findWithDefault IntMap.empty pos table) + + insertEdges :: edge -> IntSet -> IntMap [edge] -> IntMap [edge] + insertEdges edge dests acc = IntSet.foldr + (\stateId m -> IntMap.insertWith (++) stateId [edge] m) + acc + dests + + scanBack = IntMap.fromList + [ (end, backRow (end - 1) input) + | (end, input) <- zip [1 .. acceptedLen] acceptedWord + ] + where + backRow prev input = IntMap.foldrWithKey step IntMap.empty (itemsAt prev) + where + step prevState origins acc = case IntMap.lookup prevState relations of + Just (TransitionTokenClass cls dests) | tokenClass cls input -> + insertEdges (prevState, origins) dests acc + _ -> acc + + completeBack = IntMap.fromList + [ (split, IntMap.foldrWithKey step IntMap.empty (itemsAt split)) + | split <- [0 .. acceptedLen] + ] + where + step caller origins acc = case IntMap.lookup caller relations of + Just (TransitionNonTerminal name dests) -> + insertEdges (caller, origins, name) dests acc + _ -> acc + + ruleFinals = IntMap.foldrWithKey finalStates Map.empty relations + finalStates stateId step acc = case step of + EmitNonTerminal name -> Map.insert name stateId acc + _ -> acc + + entryStates Nothing = transducerStarts et + entryStates (Just name) = fst (ruleInfo name) + + ruleNullable = snd . ruleInfo + + itemForests guards entry origin end stateId + | Set.member itemKey guards = [] + | otherwise = baseForests <> scannedForests <> completedForests + where + itemKey = Left (entry, origin, end, stateId) + guards' = Set.insert itemKey guards + + baseForests + | end == origin && IntSet.member stateId (entryStates entry) = [[]] + | otherwise = [] + + scannedForests + | end <= origin = [] + | otherwise = + [ forest + | (prevState, origins) <- edgesAt scanBack end stateId + , IntSet.member origin origins + , let prev = end - 1 + , forest <- itemForests guards' entry origin prev prevState + ] + + completedForests = + [ prefix <> [subtree] + | split <- [origin .. end] + , (caller, origins, name) <- edgesAt completeBack split stateId + , IntSet.member origin origins + , prefix <- itemForests guards' entry origin split caller + , subtree <- ruleTrees guards' name split end + ] + + ruleTrees guards name start end + | Set.member ruleKey guards = [] + | otherwise = nullableTrees <> derivedTrees + where + ruleKey = Right (name, start, end) + guards' = Set.insert ruleKey guards + + nullableTrees + | start == end && ruleNullable name = [Node (name, start, end, []) []] + | otherwise = [] + + derivedTrees = case Map.lookup name ruleFinals of + Nothing -> [] + Just finalState -> + [ Node (name, start, end, sliceAt start end) subtrees + | subtrees <- itemForests guards' (Just name) start end finalState + ] + +prefixGen + :: Categorized token + => Transducer token + -> [token] + -> (Int, IntMap (IntMap IntSet)) +prefixGen et word = go 0 (initialChart et) word + where + go j chart [] = (j, chart) + go j chart (x : xs) = + let scanned = scanFrom j x chart + closed = closeChartAt et (j + 1) (IntMap.insert (j + 1) scanned chart) + in go (j + 1) closed xs + + scanFrom j input chart = IntMap.foldrWithKey advance IntMap.empty eJ + where + eJ = IntMap.findWithDefault IntMap.empty j chart + advance s origs acc = case IntMap.lookup s (transducerRelations et) of + Just (TransitionTokenClass cls ds) | tokenClass cls input -> + IntSet.foldr + (\d -> IntMap.insertWith IntSet.union d origs) acc ds + _ -> acc + +{- | What token is expected next? +The scanner frontier, `expectNext` returns the `TokenClass` +that can be scanned next after the given input prefix. +A `falseB` result means the current chart has no scanner transitions, +i.e. the prefix is a dead end for recognition. +-} +expectNext + :: Categorized token + => Transducer token -> [token] {- ^ prefix -} -> TokenClass token +expectNext et word = anyB fst (scanClassOptions et n chart) + where + (n, chart) = prefixGen et word + +{- | +Rule names that can never be entered from the start +expression — dead productions. A non-empty result is a grammar-hygiene +warning: those rules can be deleted without changing the recognized language. +-} +unreachableRules :: Transducer token -> Set String +unreachableRules et = + Map.keysSet (transducerRules et) `Set.difference` called + where + called = bfs (transducerStarts et) IntSet.empty Set.empty + + bfs frontier seen calls + | IntSet.null fresh = calls + | otherwise = bfs next (seen <> fresh) calls' + where + fresh = IntSet.difference frontier seen + (next, calls') = IntSet.foldr step (IntSet.empty, calls) fresh + + step s (acc, cs) = case IntMap.lookup s (transducerRelations et) of + Just (TransitionTokenClass _ ds) -> (acc <> ds, cs) + Just (TransitionNonTerminal name ds) -> + let firsts = maybe IntSet.empty fst (Map.lookup name (transducerRules et)) + in (acc <> ds <> firsts, Set.insert name cs) + Just (EmitNonTerminal _) -> (acc, cs) + Nothing -> (acc, cs) + +{- | +`languageSample` lazily produces all words in a language from shortest to longest. +However since `TokenClass`es can resolve to infinite sets of tokens, +and the relevant case of `Char` tokens while not infinite is huge, +it samples tokens in an `Applicative` `TokenAlgebra`. +-} +languageSample + :: (TokenAlgebra token (f token), Applicative f) + => Transducer token -- ^ transducer + -> f [[token]] +languageSample et = sequenceA (fmap sampleWord classWords) + where + + classWords = enumerateByLength [(0, [], initialChart et)] Set.empty + + sampleWord = traverse tokenClass . reverse + + enumerateByLength [] _ = [] + enumerateByLength frontier seen = + let + (accepted, seen') = acceptedAtFrontier frontier seen + next = concatMap expand frontier + in accepted <> enumerateByLength next seen' + + acceptedAtFrontier frontier seen0 = + let (acceptedRev, seen') = foldl' step ([], seen0) frontier + in (reverse acceptedRev, seen') + where + step (acc, seen) (j, revWord, chart) + | acceptsChart j chart = + if Set.member revWord seen + then (acc, seen) + else (revWord : acc, Set.insert revWord seen) + | otherwise = (acc, seen) + + expand (j, revWord, chart) = + [ (j + 1, cls : revWord, nextChart) + | (cls, nextChart) <- scanClassOptions et j chart + ] + +initialChart + :: Transducer token + -> IntMap (IntMap IntSet) +initialChart et = closeChartAt et 0 (IntMap.singleton 0 initialE0) + where + initialE0 = IntMap.fromList + [ (s, IntSet.singleton 0) | s <- IntSet.toList (transducerStarts et) ] + +-- Accept iff (q_accept, 0) is in E_n. +acceptsChart + :: Int + -> IntMap (IntMap IntSet) + -> Bool +acceptsChart j chart = IntSet.member 0 acceptOrigins + where + eJ = IntMap.findWithDefault IntMap.empty j chart + acceptOrigins = IntMap.findWithDefault IntSet.empty 0 eJ + +-- Group all scanner moves from E_j by token class; each result also carries the +-- closed successor chart at j+1. +scanClassOptions + :: Categorized token + => Transducer token + -> Int + -> IntMap (IntMap IntSet) + -> [(TokenClass token, IntMap (IntMap IntSet))] +scanClassOptions et j chart = + [ (cls, closeChartAt et (j + 1) (IntMap.insert (j + 1) scanned chart)) + | (cls, scanned) <- Map.toAscList grouped + ] + where + grouped = IntMap.foldrWithKey advance Map.empty eJ + eJ = IntMap.findWithDefault IntMap.empty j chart + + advance s origs acc = case IntMap.lookup s (transducerRelations et) of + Just (TransitionTokenClass cls ds) -> + Map.insertWith (IntMap.unionWith IntSet.union) cls scanned acc + where + scanned = IntSet.foldr + (\d -> IntMap.insertWith IntSet.union d origs) IntMap.empty ds + _ -> acc + +closeChartAt + :: Transducer token + -> Int + -> IntMap (IntMap IntSet) + -> IntMap (IntMap IntSet) +closeChartAt et j initialChart0 = loop initialWork initialChart0 IntMap.empty + where + initialEJ = IntMap.findWithDefault IntMap.empty j initialChart0 + initialWork = + [ (s, i) | (s, os) <- IntMap.toList initialEJ, i <- IntSet.toList os ] + + -- For fixed i < j, E_i does not change while closing E_j. Cache an index + -- from nonterminal name to caller origins/continuations to speed completion. + -- IntMap key: origin index i + -- Map key: nonterminal name + -- Value: list of (caller origins, continuation destinations) + + -- Earley closure at E_j: apply predict/complete until fixed point. + loop [] chart _ = chart + loop ((s, i) : rest) chart callerCache = case IntMap.lookup s (transducerRelations et) of + Just (TransitionNonTerminal name ds) -> + let + (firsts, isNull) = Map.findWithDefault + (IntSet.empty, False) name (transducerRules et) + predItems = [(f, j) | f <- IntSet.toList firsts] + nullItems = + if isNull then [(d, i) | d <- IntSet.toList ds] else [] + (chart', new) = addEarleyItems (predItems <> nullItems) chart + in loop (new <> rest) chart' callerCache + Just (EmitNonTerminal name) -> + let + (ixed, callerCache') = callerEntries i chart callerCache + callerRows = Map.findWithDefault [] name ixed + completions = + [ (d, i') + | (os, ds) <- callerRows + , i' <- IntSet.toList os + , d <- IntSet.toList ds + ] + (chart', new) = addEarleyItems completions chart + in loop (new <> rest) chart' callerCache' + _ -> loop rest chart callerCache + + callerEntries i chart callerCache + -- E_j mutates during closure, so do not cache index for i == j. + | i == j = (buildCallerIndex (IntMap.findWithDefault IntMap.empty i chart), callerCache) + | otherwise = case IntMap.lookup i callerCache of + Just ixed -> (ixed, callerCache) + Nothing -> + let ixed = buildCallerIndex (IntMap.findWithDefault IntMap.empty i chart) + in (ixed, IntMap.insert i ixed callerCache) + + buildCallerIndex eI = IntMap.foldrWithKey step Map.empty eI + where + step t os acc = case IntMap.lookup t (transducerRelations et) of + Just (TransitionNonTerminal n ds) -> + Map.insertWith (++) n [(os, ds)] acc + _ -> acc + + addEarleyItems items chart = foldl' ins (chart, []) items + where + ins (acc, new) (state, origin) = + let + eJ = IntMap.findWithDefault IntMap.empty j acc + os = IntMap.findWithDefault IntSet.empty state eJ + in if IntSet.member origin os + then (acc, new) + else + let + eJ' = IntMap.insert state (IntSet.insert origin os) eJ + acc' = IntMap.insert j eJ' acc + in (acc', (state, origin) : new) diff --git a/src/Control/Lens/Grammar/Token.hs b/src/Control/Lens/Grammar/Token.hs index 884d5963..5ad21da5 100644 --- a/src/Control/Lens/Grammar/Token.hs +++ b/src/Control/Lens/Grammar/Token.hs @@ -20,11 +20,18 @@ module Control.Lens.Grammar.Token import Control.Lens import Control.Lens.PartialIso +import Control.Monad.Loops (iterateUntil) import Data.Bifunctor.Joker import Data.Char +import Data.Foldable import Data.Profunctor import Data.Profunctor.Monoidal import Data.Word +import Control.Monad.State (StateT, state) +import System.Random (RandomGen, Random, random, randomR) +import Test.QuickCheck.Arbitrary +import Test.QuickCheck.Gen (Gen) +import qualified Test.QuickCheck.Gen as Gen import Text.ParserCombinators.ReadP (ReadP) import qualified Text.ParserCombinators.ReadP as ReadP @@ -137,3 +144,21 @@ instance Tokenized Char (ReadP Char) where notOneOf = ReadP.satisfy . notOneOf asIn = ReadP.satisfy . asIn notAsIn = ReadP.satisfy . notAsIn +instance (Categorized token, Arbitrary token) => Tokenized token (Gen token) where + anyToken = arbitrary @token + token = pure + oneOf = Gen.elements . toList + notOneOf xs = arbitrary `Gen.suchThat` (`notElem` xs) + asIn cat = arbitrary `Gen.suchThat` ((==) cat . categorize) + notAsIn cat = arbitrary `Gen.suchThat` ((/=) cat . categorize) +instance (RandomGen g, Monad m, Categorized token, Random token) + => Tokenized token (StateT g m token) where + anyToken = state random + token = pure + oneOf xs = do + let ys = toList xs + i <- state (randomR (0, length ys - 1)) + pure (ys !! i) + notOneOf xs = iterateUntil (`notElem` xs) anyToken + asIn cat = iterateUntil ((== cat) . categorize) anyToken + notAsIn cat = iterateUntil ((/= cat) . categorize) anyToken diff --git a/src/Data/Profunctor/Grammar.hs b/src/Data/Profunctor/Grammar.hs index b721355d..ee88cc91 100644 --- a/src/Data/Profunctor/Grammar.hs +++ b/src/Data/Profunctor/Grammar.hs @@ -27,6 +27,7 @@ import Control.Lens import Control.Lens.Extras import Control.Lens.Grammar.BackusNaur import Control.Lens.Grammar.Kleene +import Control.Lens.Grammar.Machine import Control.Lens.Grammar.Symbol import Control.Lens.Grammar.Token import Control.Monad diff --git a/test/Examples/Chain.hs b/test/Examples/Chain.hs index 5d0f964e..1b54285a 100644 --- a/test/Examples/Chain.hs +++ b/test/Examples/Chain.hs @@ -16,7 +16,7 @@ data Chain makePrisms ''Chain -chainGrammar :: CtxGrammar Char Chain +chainGrammar :: Grammar Char Chain chainGrammar = ruleRec "chain" seqG where seqG chn = rule "seq" $ diff --git a/test/Examples/Json.hs b/test/Examples/Json.hs index e3aec526..eb1b24d6 100644 --- a/test/Examples/Json.hs +++ b/test/Examples/Json.hs @@ -24,12 +24,17 @@ data Json -- Generate prisms makePrisms ''Json --- | JSON grammar following the McKeeman Form specification from json.org +-- | JSON grammar following the McKeeman Form specification from json.org. +-- The inner rules are mutually recursive: element ↔ value ↔ array ↔ +-- elements ↔ element, and element ↔ value ↔ object ↔ members ↔ member +-- ↔ element. Only a rule bound via `ruleRec` produces a stub that +-- breaks the cycle; a plain `rule` invocation forces its body, so +-- every cyclic back-edge must instead use the `element` stub produced +-- by the inner `ruleRec "element"` below. jsonGrammar :: Grammar Char Json -jsonGrammar = ruleRec "json" elementG +jsonGrammar = rule "json" elementG where - -- element = ws value ws - elementG json = rule "element" $ + elementG = ruleRec "element" $ \json -> ws >* valueG json *< ws -- value = object | array | string | number | "true" | "false" | "null" @@ -57,7 +62,7 @@ jsonGrammar = ruleRec "json" elementG -- member = ws string ws ':' element memberG json = rule "member" $ - ws >* stringG *< ws *< terminal ":" >*< elementG json + ws >* stringG *< ws *< terminal ":" >*< json -- array = '[' ws ']' | '[' elements ']' arrayG json = rule "array" $ choice @@ -67,7 +72,7 @@ jsonGrammar = ruleRec "json" elementG -- elements = element | element ',' elements elementsG json = rule "elements" $ - several1 (sepWith ",") (elementG json) + several1 (sepWith ",") json -- string = '"' characters '"' stringG = rule "string" $ diff --git a/test/Main.hs b/test/Main.hs index 9398364f..f73df07f 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -8,9 +8,11 @@ import Data.Function (fix) import Data.List (genericLength) import Data.Maybe (isJust) import Data.Profunctor.Types (Star (..)) +import Data.Tree (Tree (..)) import System.Environment (lookupEnv) import Test.DocTest import Test.Hspec +import Test.QuickCheck (generate) import qualified Text.Megaparsec as M import Examples.Arithmetic @@ -30,19 +32,42 @@ main = do when shouldRunDoctests $ describe "doctest" $ it "should run haddock examples" doctests - describe "regexGrammar" $ for_ regexExamples $ testGrammar False regexGrammar - describe "semverGrammar" $ for_ semverExamples $ testCtxGrammar True semverGrammar - describe "semverCtxGrammar" $ for_ semverExamples $ testCtxGrammar True semverCtxGrammar - describe "arithGrammar" $ for_ arithExamples $ testGrammar True arithGrammar - describe "jsonGrammar" $ for_ jsonExamples $ testCtxGrammar False jsonGrammar - describe "sexprGrammar" $ for_ sexprExamples $ testCtxGrammar True sexprGrammar - describe "lambdaGrammar" $ for_ lambdaExamples $ testCtxGrammar True lambdaGrammar - describe "lenvecGrammar" $ for_ lenvecExamples $ testCtxGrammar True lenvecGrammar - describe "chainGrammar" $ for_ chainExamples $ testCtxGrammar True chainGrammar + describe "regexGrammar" $ testCfg False regexExamples regexGrammar + describe "semverGrammar" $ testCfg True semverExamples semverGrammar + describe "semverCtxGrammar" $ testCsg True semverExamples semverCtxGrammar + describe "arithGrammar" $ testCfg True arithExamples arithGrammar + describe "jsonGrammar" $ testCfg False jsonExamples jsonGrammar + describe "sexprGrammar" $ testCfg True sexprExamples sexprGrammar + describe "lambdaGrammar" $ testCfg True lambdaExamples lambdaGrammar + describe "lenvecGrammar" $ testCsg True lenvecExamples lenvecGrammar + describe "chainGrammar" $ testCfg True chainExamples chainGrammar + describe "parseForest" parseForestTests describe "Parsector try rollback" tryRollbackTests describe "Kleene" kleeneProperties describe "meander" meanderProperties +parseForestTests :: Spec +parseForestTests = do + it "returns the nested rule forest for a full parse" $ do + let (actualForest, actualRest) = parseForest (transducerG arithGrammar) "2*3+4;;;" + actualForest `shouldBe` + [ Node ("arith", 0, 5, "2*3+4") + [ Node ("sum", 0, 5, "2*3+4") + [ Node ("product", 0, 3, "2*3") + [ Node ("factor", 0, 1, "2") + [Node ("number", 0, 1, "2") []] + , Node ("factor", 2, 3, "3") + [Node ("number", 2, 3, "3") []] + ] + , Node ("product", 4, 5, "4") + [ Node ("factor", 4, 5, "4") + [Node ("number", 4, 5, "4") []] + ] + ] + ] + ] + actualRest `shouldBe` ";;;" + tryRollbackTests :: Spec tryRollbackTests = do it "rolls back parse stream/offset on failed try" $ do @@ -60,9 +85,22 @@ tryRollbackTests = do doctests :: IO () doctests = do + stackExe <- lookupEnv "STACK_EXE" + ghcEnvironment <- lookupEnv "GHC_ENVIRONMENT" let modulePaths = [ "src/Control/Lens/Grammar.hs" ] + sourceDirs = + [ "-isrc" + , "-itest" + ] + packageEnvFlags = case ghcEnvironment of + Just "-" -> [] + Just path -> ["-package-env=" <> path] + Nothing -> [] + runnerFlags + | isJust stackExe = [] + | otherwise = sourceDirs <> packageEnvFlags languageExtensions = [ "-XAllowAmbiguousTypes" , "-XArrows" @@ -104,7 +142,7 @@ doctests = do for_ modulePaths $ \modulePath -> do putStr "Testing module documentation in " putStrLn modulePath - doctest (modulePath : languageExtensions) + doctest (modulePath : runnerFlags <> languageExtensions) meanderProperties :: Spec meanderProperties = @@ -117,12 +155,23 @@ meanderProperties = seen `shouldBe` input units `shouldBe` replicate (length input) () -testGrammar :: (Show a, Eq a) => Bool -> Grammar Char a -> (a, String) -> Spec -testGrammar isLL1 grammar (expectedSyntax, expectedString) = do - testCtxGrammar isLL1 grammar (expectedSyntax, expectedString) - it ("should match " <> expectedString <> " correctly") $ do - let actualMatch = expectedString =~ regbnfG grammar - actualMatch `shouldBe` True +testCfg :: (Show a, Eq a) => Bool -> [(a, String)] -> Grammar Char a -> Spec +testCfg isLL1 examples grammar = do + describe "examples" $ for_ examples $ \(expectedSyntax, expectedString) -> do + testCtxGrammar isLL1 grammar (expectedSyntax, expectedString) + it ("should match " <> expectedString <> " correctly") $ do + let actualMatch = expectedString =~ regbnfG grammar + actualMatch `shouldBe` True + describe "transducerG" $ do + it "should generate the hundred shorted valid words in a language" $ do + generated <- generate (take 100 <$> languageSample (transducerG grammar)) + for_ generated $ \word -> do + let fullParses = [() | (_, "") <- parseG grammar word] + fullParses `shouldBe` [()] + +testCsg :: (Show a, Eq a) => Bool -> [(a, String)] -> CtxGrammar Char a -> Spec +testCsg isLL1 examples grammar = + describe "examples" $ for_ examples $ testCtxGrammar isLL1 grammar testCtxGrammar :: (Show a, Eq a) => Bool -> CtxGrammar Char a -> (a, String) -> Spec testCtxGrammar isLL1 grammar (expectedSyntax, expectedString) = do diff --git a/test/Properties/Kleene.hs b/test/Properties/Kleene.hs index 7aa7ca3a..ab5f1c9d 100644 --- a/test/Properties/Kleene.hs +++ b/test/Properties/Kleene.hs @@ -2,6 +2,7 @@ module Properties.Kleene (kleeneProperties) where import Control.Lens.Grammar +import Data.Foldable (for_) import Test.Hspec import Test.Hspec.QuickCheck (prop) import Test.QuickCheck @@ -95,6 +96,26 @@ kleeneProperties = do prop "notAsIn cat = tokenClass (notAsIn cat)" $ \(cat :: GeneralCategory) -> (notAsIn cat :: RegEx Char) == tokenClass (notAsIn cat) + it "matching agrees with lifted Bnf on examples" $ do + let + cases = + [ ("", mempty :: RegEx Char) + , ("a", token 'a') + , ("b", token 'a') + , ("ab", token 'a' <> token 'b') + , ("a", token 'a' >|< token 'b') + , ("bbb", starK (token 'b')) + , ("bbb", plusK (token 'b')) + , ("", optK (token 'b')) + , ("x", oneOf "xyz") + , ("x", notOneOf "abc") + , ("A", asIn UppercaseLetter) + , ("a", notAsIn UppercaseLetter) + , ("abbb", token 'a' <> starK (token 'b')) + , ("cat", terminal "cat" >|< terminal "dog") + ] + for_ cases $ \(word, rex) -> + (word =~ rex) `shouldBe` (word =~ liftBnf0 rex) describe "BooleanAlgebra TokenClass" $ do it "trueB = anyToken" $ (trueB :: TokenClass Char) `shouldBe` anyToken