diff --git a/CHANGELOG.md b/CHANGELOG.md index 4af4f13..ab3d4e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Breaking changes: - Fix semantics of endBy and sepEndBy parser combinators (#84 by @chtenb) New features: +- Introduce code point parsers `anyCodePoint`, `codePoint'` and `satisfyCodePoint` (#88 by @chtenb) Bugfixes: - Do not export `chainl'` and `chainr'` helper functions (#84 by @chtenb) diff --git a/spago.dhall b/spago.dhall index 4f548fb..28b0041 100644 --- a/spago.dhall +++ b/spago.dhall @@ -13,6 +13,7 @@ , "maybe" , "minibench" , "nonempty" + , "partial" , "prelude" , "psci-support" , "strings" diff --git a/src/Text/Parsing/StringParser/CodePoints.purs b/src/Text/Parsing/StringParser/CodePoints.purs index 0baa62a..1640ab2 100644 --- a/src/Text/Parsing/StringParser/CodePoints.purs +++ b/src/Text/Parsing/StringParser/CodePoints.purs @@ -6,10 +6,13 @@ module Text.Parsing.StringParser.CodePoints ( eof , anyChar + , anyCodePoint , anyDigit , string , satisfy + , satisfyCodePoint , char + , codePoint , whiteSpace , skipSpaces , oneOf @@ -31,13 +34,14 @@ import Data.Either (Either(..)) import Data.Enum (fromEnum) import Data.Foldable (class Foldable, foldMap, elem, notElem) import Data.Maybe (Maybe(..)) +import Data.String (CodePoint) import Data.String.CodePoints as SCP import Data.String.CodeUnits as SCU import Data.String.Regex as Regex import Data.String.Regex.Flags (noFlags) import Text.Parsing.StringParser (Parser(..), fail) -import Text.Parsing.StringParser.Combinators (try, many, ()) import Text.Parsing.StringParser.CodeUnits as CodeUnitsParser +import Text.Parsing.StringParser.Combinators (try, many, ()) -- | Match the end of the file. eof :: Parser Unit @@ -46,16 +50,28 @@ eof = Parser \s -> { substring, position } | 0 < SCP.length substring -> Left { pos: position, error: "Expected EOF" } _ -> Right { result: unit, suffix: s } --- | Match any character. +-- | Match any character from the Basic Multilingual Plane. anyChar :: Parser Char -anyChar = Parser \{ substring, position } -> - case SCP.codePointAt 0 substring of - Just cp -> case toChar cp of - Just chr -> Right { result: chr, suffix: { substring: SCP.drop 1 substring, position: position + 1 } } - Nothing -> Left { pos: position, error: "CodePoint " <> show cp <> " is not a character" } - Nothing -> Left { pos: position, error: "Unexpected EOF" } +anyChar = do + cc <- anyCodePoint <#> fromEnum + case fromCharCode cc of + Just chr -> + -- the `fromCharCode` function doesn't check if this is beyond the + -- BMP, so we check that ourselves. + -- https://github.com/purescript/purescript-strings/issues/153 + if cc > 65535 -- BMP + then notAChar cc + else pure chr + Nothing -> notAChar cc where - toChar = fromCharCode <<< fromEnum + notAChar cc = fail $ "Code point " <> show cc <> " is not a character" + +-- | Match any code point. +anyCodePoint :: Parser CodePoint +anyCodePoint = Parser \{ substring, position } -> + case SCP.uncons substring of + Nothing -> Left { pos: position, error: "Unexpected EOF" } + Just { head, tail } -> Right { result: head, suffix: { substring: tail, position: position + 1 } } -- | Match any digit. anyDigit :: Parser Char @@ -81,10 +97,21 @@ satisfy f = try do if f c then pure c else fail $ "Character " <> show c <> " did not satisfy predicate" +-- | Match a code point satisfying the given predicate. +satisfyCodePoint :: (CodePoint -> Boolean) -> Parser CodePoint +satisfyCodePoint f = try do + cp <- anyCodePoint + if f cp then pure cp + else fail $ "Code point " <> show cp <> " did not satisfy predicate" + -- | Match the specified character. char :: Char -> Parser Char char c = satisfy (_ == c) "Could not match character " <> show c +-- | Match the specified code point. +codePoint :: CodePoint -> Parser CodePoint +codePoint c = satisfyCodePoint (_ == c) "Could not match code point " <> show c + -- | Match many whitespace characters. whiteSpace :: Parser String whiteSpace = do diff --git a/test/BasicSpecs.purs b/test/BasicSpecs.purs index 141479c..c94ffb0 100644 --- a/test/BasicSpecs.purs +++ b/test/BasicSpecs.purs @@ -7,25 +7,41 @@ import Control.Monad.Writer (Writer, execWriter, tell) import Data.Either (isRight) import Data.List (List) import Data.List as List +import Data.Maybe (fromJust) +import Data.String (CodePoint, codePointAt) import Data.Traversable (traverse) import Effect (Effect) import Effect.Class.Console (log) +import Partial.Unsafe (unsafePartial) import Test.Assert (assert') import Test.Utils (AnyParser(..), mkAnyParser) import Text.Parsing.StringParser (Parser, runParser) -import Text.Parsing.StringParser.CodePoints (anyChar, anyDigit, anyLetter, char, eof, skipSpaces, string) +import Text.Parsing.StringParser.CodePoints (anyChar, anyCodePoint, anyDigit, anyLetter, char, codePoint, eof, skipSpaces, string) import Text.Parsing.StringParser.Combinators (try, tryAhead, between, chainl, chainl1, endBy, endBy1, lookAhead, many, many1, manyTill, many1Till, optionMaybe, sepBy, sepBy1, sepEndBy, sepEndBy1) type TestInputs = { successes :: Array String, failures :: Array String } type TestCase = { name :: String, parser :: AnyParser, inputs :: TestInputs } +codePointLiteral :: String -> CodePoint +codePointLiteral s = unsafePartial $ fromJust $ codePointAt 0 s + testCases :: Array TestCase testCases = [ { name: "anyChar" , parser: mkAnyParser anyChar - -- TODO: test "🙂" which should fail - -- this is an open upstream issue https://github.com/purescript/purescript-strings/issues/153 - , inputs: { successes: [ "a", "%" ], failures: [ "" ] } + , inputs: { successes: [ "a", "%" ], failures: [ "", "aa", "🙂" ] } + } + , { name: "many anyChar" + , parser: mkAnyParser $ many anyChar + , inputs: { successes: [ "", "a", "%", "aa" ], failures: [ "🙂" ] } + } + , { name: "anyCodePoint" + , parser: mkAnyParser anyCodePoint + , inputs: { successes: [ "a", "%", "🙂" ], failures: [ "", "aa" ] } + } + , { name: "codePoint" + , parser: mkAnyParser $ codePoint $ codePointLiteral "🙂" + , inputs: { successes: [ "🙂" ], failures: [ "", "a", "aa" ] } } , { name: "anyLetter" , parser: mkAnyParser anyLetter diff --git a/test/CodePoints.purs b/test/CodePoints.purs index 20cf0a3..26b468b 100644 --- a/test/CodePoints.purs +++ b/test/CodePoints.purs @@ -120,7 +120,7 @@ testCodePoints = do assert $ expectResult "\x458CA" (string "\x458CA" <* string ")" <* eof) "\x458CA)" assert $ expectResult '\xEEE2' (char '\xEEE2' <* eof) "\xEEE2" assert $ expectPosition 1 anyChar "\xEEE2" - assert $ expectPosition 1 anyChar "\x458CA" + assert $ parseFail anyChar "\x458CA" -- Is beyond BMP log "Running overflow tests (may take a while)"