Skip to content

Commit efbe004

Browse files
authored
Add basic code point parsers (#88)
* add basic code point parsers * update changelog
1 parent b6b5ffd commit efbe004

File tree

5 files changed

+59
-14
lines changed

5 files changed

+59
-14
lines changed

β€ŽCHANGELOG.mdβ€Ž

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Breaking changes:
99
- Fix semantics of endBy and sepEndBy parser combinators (#84 by @chtenb)
1010

1111
New features:
12+
- Introduce code point parsers `anyCodePoint`, `codePoint'` and `satisfyCodePoint` (#88 by @chtenb)
1213

1314
Bugfixes:
1415
- Do not export `chainl'` and `chainr'` helper functions (#84 by @chtenb)

β€Žspago.dhallβ€Ž

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
, "maybe"
1414
, "minibench"
1515
, "nonempty"
16+
, "partial"
1617
, "prelude"
1718
, "psci-support"
1819
, "strings"

β€Žsrc/Text/Parsing/StringParser/CodePoints.pursβ€Ž

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@
66
module Text.Parsing.StringParser.CodePoints
77
( eof
88
, anyChar
9+
, anyCodePoint
910
, anyDigit
1011
, string
1112
, satisfy
13+
, satisfyCodePoint
1214
, char
15+
, codePoint
1316
, whiteSpace
1417
, skipSpaces
1518
, oneOf
@@ -31,13 +34,14 @@ import Data.Either (Either(..))
3134
import Data.Enum (fromEnum)
3235
import Data.Foldable (class Foldable, foldMap, elem, notElem)
3336
import Data.Maybe (Maybe(..))
37+
import Data.String (CodePoint)
3438
import Data.String.CodePoints as SCP
3539
import Data.String.CodeUnits as SCU
3640
import Data.String.Regex as Regex
3741
import Data.String.Regex.Flags (noFlags)
3842
import Text.Parsing.StringParser (Parser(..), fail)
39-
import Text.Parsing.StringParser.Combinators (try, many, (<?>))
4043
import Text.Parsing.StringParser.CodeUnits as CodeUnitsParser
44+
import Text.Parsing.StringParser.Combinators (try, many, (<?>))
4145

4246
-- | Match the end of the file.
4347
eof :: Parser Unit
@@ -46,16 +50,28 @@ eof = Parser \s ->
4650
{ substring, position } | 0 < SCP.length substring -> Left { pos: position, error: "Expected EOF" }
4751
_ -> Right { result: unit, suffix: s }
4852

49-
-- | Match any character.
53+
-- | Match any character from the Basic Multilingual Plane.
5054
anyChar :: Parser Char
51-
anyChar = Parser \{ substring, position } ->
52-
case SCP.codePointAt 0 substring of
53-
Just cp -> case toChar cp of
54-
Just chr -> Right { result: chr, suffix: { substring: SCP.drop 1 substring, position: position + 1 } }
55-
Nothing -> Left { pos: position, error: "CodePoint " <> show cp <> " is not a character" }
56-
Nothing -> Left { pos: position, error: "Unexpected EOF" }
55+
anyChar = do
56+
cc <- anyCodePoint <#> fromEnum
57+
case fromCharCode cc of
58+
Just chr ->
59+
-- the `fromCharCode` function doesn't check if this is beyond the
60+
-- BMP, so we check that ourselves.
61+
-- https://github.com/purescript/purescript-strings/issues/153
62+
if cc > 65535 -- BMP
63+
then notAChar cc
64+
else pure chr
65+
Nothing -> notAChar cc
5766
where
58-
toChar = fromCharCode <<< fromEnum
67+
notAChar cc = fail $ "Code point " <> show cc <> " is not a character"
68+
69+
-- | Match any code point.
70+
anyCodePoint :: Parser CodePoint
71+
anyCodePoint = Parser \{ substring, position } ->
72+
case SCP.uncons substring of
73+
Nothing -> Left { pos: position, error: "Unexpected EOF" }
74+
Just { head, tail } -> Right { result: head, suffix: { substring: tail, position: position + 1 } }
5975

6076
-- | Match any digit.
6177
anyDigit :: Parser Char
@@ -81,10 +97,21 @@ satisfy f = try do
8197
if f c then pure c
8298
else fail $ "Character " <> show c <> " did not satisfy predicate"
8399

100+
-- | Match a code point satisfying the given predicate.
101+
satisfyCodePoint :: (CodePoint -> Boolean) -> Parser CodePoint
102+
satisfyCodePoint f = try do
103+
cp <- anyCodePoint
104+
if f cp then pure cp
105+
else fail $ "Code point " <> show cp <> " did not satisfy predicate"
106+
84107
-- | Match the specified character.
85108
char :: Char -> Parser Char
86109
char c = satisfy (_ == c) <?> "Could not match character " <> show c
87110

111+
-- | Match the specified code point.
112+
codePoint :: CodePoint -> Parser CodePoint
113+
codePoint c = satisfyCodePoint (_ == c) <?> "Could not match code point " <> show c
114+
88115
-- | Match many whitespace characters.
89116
whiteSpace :: Parser String
90117
whiteSpace = do

β€Žtest/BasicSpecs.pursβ€Ž

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,25 +7,41 @@ import Control.Monad.Writer (Writer, execWriter, tell)
77
import Data.Either (isRight)
88
import Data.List (List)
99
import Data.List as List
10+
import Data.Maybe (fromJust)
11+
import Data.String (CodePoint, codePointAt)
1012
import Data.Traversable (traverse)
1113
import Effect (Effect)
1214
import Effect.Class.Console (log)
15+
import Partial.Unsafe (unsafePartial)
1316
import Test.Assert (assert')
1417
import Test.Utils (AnyParser(..), mkAnyParser)
1518
import Text.Parsing.StringParser (Parser, runParser)
16-
import Text.Parsing.StringParser.CodePoints (anyChar, anyDigit, anyLetter, char, eof, skipSpaces, string)
19+
import Text.Parsing.StringParser.CodePoints (anyChar, anyCodePoint, anyDigit, anyLetter, char, codePoint, eof, skipSpaces, string)
1720
import Text.Parsing.StringParser.Combinators (try, tryAhead, between, chainl, chainl1, endBy, endBy1, lookAhead, many, many1, manyTill, many1Till, optionMaybe, sepBy, sepBy1, sepEndBy, sepEndBy1)
1821

1922
type TestInputs = { successes :: Array String, failures :: Array String }
2023
type TestCase = { name :: String, parser :: AnyParser, inputs :: TestInputs }
2124

25+
codePointLiteral :: String -> CodePoint
26+
codePointLiteral s = unsafePartial $ fromJust $ codePointAt 0 s
27+
2228
testCases :: Array TestCase
2329
testCases =
2430
[ { name: "anyChar"
2531
, parser: mkAnyParser anyChar
26-
-- TODO: test "πŸ™‚" which should fail
27-
-- this is an open upstream issue https://github.com/purescript/purescript-strings/issues/153
28-
, inputs: { successes: [ "a", "%" ], failures: [ "" ] }
32+
, inputs: { successes: [ "a", "%" ], failures: [ "", "aa", "πŸ™‚" ] }
33+
}
34+
, { name: "many anyChar"
35+
, parser: mkAnyParser $ many anyChar
36+
, inputs: { successes: [ "", "a", "%", "aa" ], failures: [ "πŸ™‚" ] }
37+
}
38+
, { name: "anyCodePoint"
39+
, parser: mkAnyParser anyCodePoint
40+
, inputs: { successes: [ "a", "%", "πŸ™‚" ], failures: [ "", "aa" ] }
41+
}
42+
, { name: "codePoint"
43+
, parser: mkAnyParser $ codePoint $ codePointLiteral "πŸ™‚"
44+
, inputs: { successes: [ "πŸ™‚" ], failures: [ "", "a", "aa" ] }
2945
}
3046
, { name: "anyLetter"
3147
, parser: mkAnyParser anyLetter

β€Žtest/CodePoints.pursβ€Ž

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ testCodePoints = do
120120
assert $ expectResult "\x458CA" (string "\x458CA" <* string ")" <* eof) "\x458CA)"
121121
assert $ expectResult '\xEEE2' (char '\xEEE2' <* eof) "\xEEE2"
122122
assert $ expectPosition 1 anyChar "\xEEE2"
123-
assert $ expectPosition 1 anyChar "\x458CA"
123+
assert $ parseFail anyChar "\x458CA" -- Is beyond BMP
124124

125125
log "Running overflow tests (may take a while)"
126126

0 commit comments

Comments
Β (0)