From 868abd63b41ee8a86c6cea084a94258680d7405d Mon Sep 17 00:00:00 2001 From: chtenb Date: Sat, 5 Mar 2022 11:39:11 +0100 Subject: [PATCH 1/4] add basic code point parsers --- spago.dhall | 1 + src/Text/Parsing/StringParser/CodePoints.purs | 37 +++++++++++++++---- test/BasicSpecs.purs | 20 ++++++++-- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/spago.dhall b/spago.dhall index 4f548fb..28b0041 100644 --- a/spago.dhall +++ b/spago.dhall @@ -13,6 +13,7 @@ , "maybe" , "minibench" , "nonempty" + , "partial" , "prelude" , "psci-support" , "strings" diff --git a/src/Text/Parsing/StringParser/CodePoints.purs b/src/Text/Parsing/StringParser/CodePoints.purs index 5e31e0b..5687177 100644 --- a/src/Text/Parsing/StringParser/CodePoints.purs +++ b/src/Text/Parsing/StringParser/CodePoints.purs @@ -6,10 +6,13 @@ module Text.Parsing.StringParser.CodePoints ( eof , anyChar + , anyCodePoint , anyDigit , string , satisfy + , satisfyCodePoint , char + , codePoint , whiteSpace , skipSpaces , oneOf @@ -31,13 +34,14 @@ import Data.Either (Either(..)) import Data.Enum (fromEnum) import Data.Foldable (class Foldable, foldMap, elem, notElem) import Data.Maybe (Maybe(..)) +import Data.String (CodePoint) import Data.String.CodePoints as SCP import Data.String.CodeUnits as SCU import Data.String.Regex as Regex import Data.String.Regex.Flags (noFlags) import Text.Parsing.StringParser (Parser(..), try, fail) -import Text.Parsing.StringParser.Combinators (many, ()) import Text.Parsing.StringParser.CodeUnits as CodeUnitsParser +import Text.Parsing.StringParser.Combinators (many, ()) -- | Match the end of the file. eof :: Parser Unit @@ -46,17 +50,23 @@ eof = Parser \s -> { substring, position } | 0 < SCP.length substring -> Left { pos: position, error: "Expected EOF" } _ -> Right { result: unit, suffix: s } --- | Match any character. +-- | Match any character from the Basic Multilingual Plane. anyChar :: Parser Char -anyChar = Parser \{ substring, position } -> - case SCP.codePointAt 0 substring of - Just cp -> case toChar cp of - Just chr -> Right { result: chr, suffix: { substring: SCP.drop 1 substring, position: position + 1 } } - Nothing -> Left { pos: position, error: "CodePoint " <> show cp <> " is not a character" } - Nothing -> Left { pos: position, error: "Unexpected EOF" } +anyChar = do + cp <- anyCodePoint + case toChar cp of + Just chr -> pure chr + Nothing -> fail $ "Code point " <> show cp <> " is not a character" where toChar = fromCharCode <<< fromEnum +-- | Match any code point. +anyCodePoint :: Parser CodePoint +anyCodePoint = Parser \{ substring, position } -> + case SCP.codePointAt 0 substring of + Just cp -> Right { result: cp, suffix: { substring: SCP.drop 1 substring, position: position + 1 } } + Nothing -> Left { pos: position, error: "Unexpected EOF" } + -- | Match any digit. anyDigit :: Parser Char anyDigit = try do @@ -81,10 +91,21 @@ satisfy f = try do if f c then pure c else fail $ "Character " <> show c <> " did not satisfy predicate" +-- | Match a code point satisfying the given predicate. +satisfyCodePoint :: (CodePoint -> Boolean) -> Parser CodePoint +satisfyCodePoint f = try do + cp <- anyCodePoint + if f cp then pure cp + else fail $ "Code point " <> show cp <> " did not satisfy predicate" + -- | Match the specified character. char :: Char -> Parser Char char c = satisfy (_ == c) "Could not match character " <> show c +-- | Match the specified code point. +codePoint :: CodePoint -> Parser CodePoint +codePoint c = satisfyCodePoint (_ == c) "Could not match code point " <> show c + -- | Match many whitespace characters. whiteSpace :: Parser String whiteSpace = do diff --git a/test/BasicSpecs.purs b/test/BasicSpecs.purs index abff61b..c9cdffa 100644 --- a/test/BasicSpecs.purs +++ b/test/BasicSpecs.purs @@ -2,30 +2,44 @@ module Test.BasicSpecs where import Prelude hiding (between) -import Test.Utils (AnyParser(..), mkAnyParser) import Control.Alt ((<|>)) import Control.Monad.Writer (Writer, execWriter, tell) import Data.Either (isRight) import Data.List (List) import Data.List as List +import Data.Maybe (fromJust) +import Data.String (CodePoint, codePointAt) import Data.Traversable (traverse) import Effect (Effect) import Effect.Class.Console (log) +import Partial.Unsafe (unsafePartial) import Test.Assert (assert') +import Test.Utils (AnyParser(..), mkAnyParser) import Text.Parsing.StringParser (Parser, runParser, try) -import Text.Parsing.StringParser.CodePoints (anyChar, anyDigit, anyLetter, char, eof, skipSpaces, string) +import Text.Parsing.StringParser.CodePoints (anyChar, anyCodePoint, anyDigit, anyLetter, char, codePoint, eof, skipSpaces, string) import Text.Parsing.StringParser.Combinators (between, chainl, chainl1, endBy, endBy1, lookAhead, many, many1, manyTill, sepBy, sepBy1, sepEndBy, sepEndBy1) type TestInputs = { successes :: Array String, failures :: Array String } type TestCase = { name :: String, parser :: AnyParser, inputs :: TestInputs } +codePointLiteral :: String -> CodePoint +codePointLiteral s = unsafePartial $ fromJust $ codePointAt 0 s + testCases :: Array TestCase testCases = [ { name: "anyChar" , parser: mkAnyParser anyChar -- TODO: test "🙂" which should fail -- this is an open upstream issue https://github.com/purescript/purescript-strings/issues/153 - , inputs: { successes: [ "a", "%" ], failures: [ "" ] } + , inputs: { successes: [ "a", "%" ], failures: [ "", "aa" ] } + } + , { name: "anyCodePoint" + , parser: mkAnyParser anyCodePoint + , inputs: { successes: [ "a", "%", "🙂" ], failures: [ "", "aa" ] } + } + , { name: "codePoint" + , parser: mkAnyParser $ codePoint $ codePointLiteral "🙂" + , inputs: { successes: [ "🙂" ], failures: [ "", "a", "aa" ] } } , { name: "anyLetter" , parser: mkAnyParser anyLetter From d34eb868f9248890e225761ff4454323d6989408 Mon Sep 17 00:00:00 2001 From: chtenb Date: Sat, 5 Mar 2022 11:41:19 +0100 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 573c288..a1158b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Breaking changes: - Fix semantics of endBy and sepEndBy parser combinators (#84 by @chtenb) New features: +- Introduce code point parsers `anyCodePoint`, `codePoint'` and `satisfyCodePoint` (#88 by @chtenb) Bugfixes: - Do not export `chainl'` and `chainr'` helper functions (#84 by @chtenb) From 1b4a43f70580b212385e938e3a90fc6e23730b76 Mon Sep 17 00:00:00 2001 From: chtenb Date: Mon, 7 Mar 2022 20:47:28 +0100 Subject: [PATCH 3/4] fix anyChar --- src/Text/Parsing/StringParser/CodePoints.purs | 16 +++++++++++----- test/BasicSpecs.purs | 8 +++++--- test/CodePoints.purs | 2 +- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/Text/Parsing/StringParser/CodePoints.purs b/src/Text/Parsing/StringParser/CodePoints.purs index e45a165..dcc732f 100644 --- a/src/Text/Parsing/StringParser/CodePoints.purs +++ b/src/Text/Parsing/StringParser/CodePoints.purs @@ -53,12 +53,18 @@ eof = Parser \s -> -- | Match any character from the Basic Multilingual Plane. anyChar :: Parser Char anyChar = do - cp <- anyCodePoint - case toChar cp of - Just chr -> pure chr - Nothing -> fail $ "Code point " <> show cp <> " is not a character" + cc <- anyCodePoint <#> fromEnum + case fromCharCode cc of + Just chr -> + -- the `fromCharCode` function doesn't check if this is beyond the + -- BMP, so we check that ourselves. + -- https://github.com/purescript/purescript-strings/issues/153 + if cc > 65535 -- BMP + then notAChar cc + else pure chr + Nothing -> notAChar cc where - toChar = fromCharCode <<< fromEnum + notAChar cc = fail $ "Code point " <> show cc <> " is not a character" -- | Match any code point. anyCodePoint :: Parser CodePoint diff --git a/test/BasicSpecs.purs b/test/BasicSpecs.purs index cfc2d48..c94ffb0 100644 --- a/test/BasicSpecs.purs +++ b/test/BasicSpecs.purs @@ -29,9 +29,11 @@ testCases :: Array TestCase testCases = [ { name: "anyChar" , parser: mkAnyParser anyChar - -- TODO: test "🙂" which should fail - -- this is an open upstream issue https://github.com/purescript/purescript-strings/issues/153 - , inputs: { successes: [ "a", "%" ], failures: [ "", "aa" ] } + , inputs: { successes: [ "a", "%" ], failures: [ "", "aa", "🙂" ] } + } + , { name: "many anyChar" + , parser: mkAnyParser $ many anyChar + , inputs: { successes: [ "", "a", "%", "aa" ], failures: [ "🙂" ] } } , { name: "anyCodePoint" , parser: mkAnyParser anyCodePoint diff --git a/test/CodePoints.purs b/test/CodePoints.purs index 20cf0a3..26b468b 100644 --- a/test/CodePoints.purs +++ b/test/CodePoints.purs @@ -120,7 +120,7 @@ testCodePoints = do assert $ expectResult "\x458CA" (string "\x458CA" <* string ")" <* eof) "\x458CA)" assert $ expectResult '\xEEE2' (char '\xEEE2' <* eof) "\xEEE2" assert $ expectPosition 1 anyChar "\xEEE2" - assert $ expectPosition 1 anyChar "\x458CA" + assert $ parseFail anyChar "\x458CA" -- Is beyond BMP log "Running overflow tests (may take a while)" From 64f4257af02d6de21eed5b2daf90b98e4b8a6074 Mon Sep 17 00:00:00 2001 From: chtenb Date: Mon, 7 Mar 2022 20:50:04 +0100 Subject: [PATCH 4/4] address review comment --- src/Text/Parsing/StringParser/CodePoints.purs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Text/Parsing/StringParser/CodePoints.purs b/src/Text/Parsing/StringParser/CodePoints.purs index dcc732f..1640ab2 100644 --- a/src/Text/Parsing/StringParser/CodePoints.purs +++ b/src/Text/Parsing/StringParser/CodePoints.purs @@ -40,8 +40,8 @@ import Data.String.CodeUnits as SCU import Data.String.Regex as Regex import Data.String.Regex.Flags (noFlags) import Text.Parsing.StringParser (Parser(..), fail) -import Text.Parsing.StringParser.Combinators (try, many, ()) import Text.Parsing.StringParser.CodeUnits as CodeUnitsParser +import Text.Parsing.StringParser.Combinators (try, many, ()) -- | Match the end of the file. eof :: Parser Unit @@ -69,9 +69,9 @@ anyChar = do -- | Match any code point. anyCodePoint :: Parser CodePoint anyCodePoint = Parser \{ substring, position } -> - case SCP.codePointAt 0 substring of - Just cp -> Right { result: cp, suffix: { substring: SCP.drop 1 substring, position: position + 1 } } + case SCP.uncons substring of Nothing -> Left { pos: position, error: "Unexpected EOF" } + Just { head, tail } -> Right { result: head, suffix: { substring: tail, position: position + 1 } } -- | Match any digit. anyDigit :: Parser Char