Skip to content

Commit 14c9f3e

Browse files
authored
Merge pull request #2 from Kyrela/master
Fix regex bomb + fix missing NodeTypes + use relative imports + better typehints
2 parents 3a340b6 + 4d6ddcf commit 14c9f3e

File tree

3 files changed

+20
-20
lines changed

3 files changed

+20
-20
lines changed

discord_markdown_ast_parser/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
1-
from typing import Any, Dict, List, Optional
1+
from typing import Any, Dict, List, Union
22

3-
from lexer import lex, Lexing
4-
from parser import Node, parse_tokens
3+
from .lexer import lex, Lexing
4+
from .parser import Node, parse_tokens
55

66

7-
def lexing_list_convert(lexing: Lexing) -> List[Lexing]:
7+
def lexing_list_convert(lexing: Union[List[Lexing], Lexing]) -> List[Lexing]:
88
if not isinstance(lexing, list):
99
lexing = [lexing]
1010
return [Lexing(item) if isinstance(item, str) else item for item in lexing]

discord_markdown_ast_parser/lexer.py

Lines changed: 13 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import re
2-
from dataclasses import dataclass, InitVar, field
2+
from dataclasses import dataclass, field
33
from enum import Enum
44
from typing import Optional, List, Generator, Dict
55
import itertools
@@ -8,31 +8,30 @@
88
class Lexing:
99
def __init__(self, pattern: Optional[str] = None, flags: re.RegexFlag = re.NOFLAG):
1010
self.regex = re.compile(pattern, flags=flags) if pattern else None
11-
11+
1212
def __call__(self, text: str):
1313
return self.regex and self.regex.match(text)
14-
14+
1515
def __repr__(self):
1616
return f"{self.__class__.__name__}({self.regex and self.regex.pattern!r})"
1717

18-
# stolen from https://www.urlregex.com/
19-
URL_REGEX = (
20-
r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
21-
)
18+
19+
URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
20+
2221

2322
class LexingRule(Lexing, Enum):
2423
USER_MENTION = r"<@!?(\d{15,20})>"
2524
ROLE_MENTION = r"<@&(\d{15,20})>"
26-
SLASH_COMMAND_MENTION = r"</([a-zA-Z0-9_ ]{2,}):(\d{15,20})>"
25+
SLASH_COMMAND_MENTION = r"</([a-zA-Z0-9_ ]{2,}):(\d{15,20})>"
2726
CHANNEL_MENTION = r"<#(\d{15,20})>"
2827
TIMESTAMP = r"<t:(-?\d+)(?::([tTdDfFR]))?>"
2928
EMOJI_CUSTOM = r"<:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
3029
EMOJI_CUSTOM_ANIMATED = r"<a:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
3130
EMOJI_UNICODE = r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])"
3231
EMOJI_UNICODE_ENCODED = r":([a-zA-Z0-9_]+):"
33-
URL_WITHOUT_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(<({URL_REGEX})>\)"
34-
URL_WITH_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(({URL_REGEX})\)"
35-
URL_WITHOUT_PREVIEW = f"<{URL_REGEX}>"
32+
URL_WITHOUT_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(<({URL_REGEX})>\)"
33+
URL_WITH_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(({URL_REGEX})\)"
34+
URL_WITHOUT_PREVIEW = fr"<{URL_REGEX}>"
3635
URL_WITH_PREVIEW = URL_REGEX
3736
QUOTE_LINE_PREFIX = r"(>>)?> "
3837
TILDE = r"~"
@@ -50,12 +49,11 @@ class Token:
5049
value: str = ""
5150
lexing_rule: Lexing = LexingRule.TEXT_INLINE
5251
groups: List[str] = field(default_factory=list)
53-
52+
5453
def __contains__(self, rule: Lexing):
5554
return self.lexing_rule == rule
5655

5756

58-
5957
def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Generator[Token, None, None]:
6058
"""Lexes the input text and returns a generator of tokens.
6159
The generator will yield a token for each lexing rule that matches the input text.
@@ -68,7 +66,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge
6866
"""
6967
seen_simple_text = ""
7068
custom = custom or {}
71-
69+
7270
while input_text:
7371
for rule in itertools.chain(*custom.values(), LexingRule):
7472
match = rule(input_text)
@@ -81,7 +79,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge
8179
continue # don't yield a token in this run
8280

8381
# cut off matched part
84-
input_text = input_text[len(match[0]) :]
82+
input_text = input_text[len(match[0]):]
8583

8684
# yield inline text if we have some left
8785
if len(seen_simple_text) > 0:

discord_markdown_ast_parser/parser.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import itertools
55
from typing import Optional, Generator, Any, List, Dict, Tuple, Iterable
66

7-
from lexer import Token, LexingRule, Lexing
7+
from .lexer import Token, LexingRule, Lexing
88

99

1010
NodeType = Enum(
@@ -21,12 +21,14 @@
2121
"CHANNEL",
2222
"SLASH_COMMAND",
2323
"EMOJI_CUSTOM",
24+
"EMOJI_CUSTOM_ANIMATED",
2425
"EMOJI_UNICODE",
2526
"EMOJI_UNICODE_ENCODED",
2627
"URL_WITH_PREVIEW_EMBEDDED",
2728
"URL_WITHOUT_PREVIEW_EMBEDDED",
2829
"URL_WITH_PREVIEW",
2930
"URL_WITHOUT_PREVIEW",
31+
"TIMESTAMP",
3032
"QUOTE_BLOCK",
3133
"CODE_BLOCK",
3234
"CODE_INLINE",

0 commit comments

Comments
 (0)