66module Text.Parsing.StringParser.CodePoints
77 ( eof
88 , anyChar
9+ , anyCodePoint
910 , anyDigit
1011 , string
1112 , satisfy
13+ , satisfyCodePoint
1214 , char
15+ , codePoint
1316 , whiteSpace
1417 , skipSpaces
1518 , oneOf
@@ -31,13 +34,14 @@ import Data.Either (Either(..))
3134import Data.Enum (fromEnum )
3235import Data.Foldable (class Foldable , foldMap , elem , notElem )
3336import Data.Maybe (Maybe (..))
37+ import Data.String (CodePoint )
3438import Data.String.CodePoints as SCP
3539import Data.String.CodeUnits as SCU
3640import Data.String.Regex as Regex
3741import Data.String.Regex.Flags (noFlags )
3842import Text.Parsing.StringParser (Parser (..), fail )
39- import Text.Parsing.StringParser.Combinators (try , many , (<?>))
4043import Text.Parsing.StringParser.CodeUnits as CodeUnitsParser
44+ import Text.Parsing.StringParser.Combinators (try , many , (<?>))
4145
4246-- | Match the end of the file.
4347eof :: Parser Unit
@@ -46,16 +50,28 @@ eof = Parser \s ->
4650 { substring, position } | 0 < SCP .length substring -> Left { pos: position, error: " Expected EOF" }
4751 _ -> Right { result: unit, suffix: s }
4852
-- | Match any character from the Basic Multilingual Plane.
-- |
-- | Reads one code point with `anyCodePoint` and converts its numeric value
-- | to a `Char`. Fails when `anyCodePoint` fails, when `fromCharCode` returns
-- | `Nothing`, or when the value is greater than 65535.
-- |
-- | The whole parser is wrapped in `try`, the same way `satisfy` and
-- | `satisfyCodePoint` are, because the rejection happens only after
-- | `anyCodePoint` has already consumed input.
-- | NOTE(review): this relies on `try` restoring the failure position to the
-- | start of the attempt -- confirm against the Combinators module.
anyChar :: Parser Char
anyChar = try do
  cc <- anyCodePoint <#> fromEnum
  case fromCharCode cc of
    -- The `fromCharCode` function doesn't check if this is beyond the BMP,
    -- so we check that ourselves (65535 is the last BMP code point).
    -- https://github.com/purescript/purescript-strings/issues/153
    Just chr | cc <= 65535 -> pure chr
    _ -> notAChar cc
  where
  -- Shared failure for every value that cannot be returned as a `Char`.
  notAChar cc = fail $ " Code point " <> show cc <> " is not a character"
68+
-- | Match any code point.
-- |
-- | Splits the remaining input with `SCP.uncons`. When there is a head code
-- | point it is returned and the position advances by one; when nothing is
-- | left the parser fails at the current position.
anyCodePoint :: Parser CodePoint
anyCodePoint = Parser step
  where
  step { substring, position } =
    case SCP.uncons substring of
      Just { head: cp, tail: rest } ->
        Right { result: cp, suffix: { substring: rest, position: position + 1 } }
      Nothing ->
        Left { pos: position, error: " Unexpected EOF" }
5975
6076-- | Match any digit.
6177anyDigit :: Parser Char
@@ -81,10 +97,21 @@ satisfy f = try do
8197 if f c then pure c
8298 else fail $ " Character " <> show c <> " did not satisfy predicate"
8399
-- | Match a code point satisfying the given predicate.
-- |
-- | Reads one code point with `anyCodePoint`, returns it when `f` accepts it
-- | and fails otherwise. The whole attempt is wrapped in `try`.
satisfyCodePoint :: (CodePoint -> Boolean) -> Parser CodePoint
satisfyCodePoint f = try $ anyCodePoint >>= \candidate ->
  case f candidate of
    true -> pure candidate
    false -> fail (" Code point " <> show candidate <> " did not satisfy predicate")
106+
-- | Match the specified character.
-- |
-- | Built from `satisfy` with an equality test against `c`; the message
-- | naming the expected character is attached with `<?>`.
char :: Char -> Parser Char
char c = satisfy isExpected <?> message
  where
  isExpected candidate = candidate == c
  message = " Could not match character " <> show c
87110
-- | Match the specified code point.
-- |
-- | Built from `satisfyCodePoint` with an equality test against `c`; the
-- | message naming the expected code point is attached with `<?>`.
codePoint :: CodePoint -> Parser CodePoint
codePoint c = satisfyCodePoint isExpected <?> message
  where
  isExpected candidate = candidate == c
  message = " Could not match code point " <> show c
114+
88115-- | Match many whitespace characters.
89116whiteSpace :: Parser String
90117whiteSpace = do
0 commit comments