diff --git a/discord_markdown_ast_parser/__init__.py b/discord_markdown_ast_parser/__init__.py index 608dbe8..1c7f83c 100644 --- a/discord_markdown_ast_parser/__init__.py +++ b/discord_markdown_ast_parser/__init__.py @@ -1,10 +1,10 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Union -from lexer import lex, Lexing -from parser import Node, parse_tokens +from .lexer import lex, Lexing +from .parser import Node, parse_tokens -def lexing_list_convert(lexing: Lexing) -> List[Lexing]: +def lexing_list_convert(lexing: Union[List[Lexing], Lexing]) -> List[Lexing]: if not isinstance(lexing, list): lexing = [lexing] return [Lexing(item) if isinstance(item, str) else item for item in lexing] diff --git a/discord_markdown_ast_parser/lexer.py b/discord_markdown_ast_parser/lexer.py index d890243..fbfbc32 100644 --- a/discord_markdown_ast_parser/lexer.py +++ b/discord_markdown_ast_parser/lexer.py @@ -1,5 +1,5 @@ import re -from dataclasses import dataclass, InitVar, field +from dataclasses import dataclass, field from enum import Enum from typing import Optional, List, Generator, Dict import itertools @@ -8,31 +8,30 @@ class Lexing: def __init__(self, pattern: Optional[str] = None, flags: re.RegexFlag = re.NOFLAG): self.regex = re.compile(pattern, flags=flags) if pattern else None - + def __call__(self, text: str): return self.regex and self.regex.match(text) - + def __repr__(self): return f"{self.__class__.__name__}({self.regex and self.regex.pattern!r})" -# stolen from https://www.urlregex.com/ -URL_REGEX = ( - r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" -) + +URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)" + class LexingRule(Lexing, Enum): USER_MENTION = r"<@!?(\d{15,20})>" ROLE_MENTION = r"<@&(\d{15,20})>" - SLASH_COMMAND_MENTION = r"" + SLASH_COMMAND_MENTION = r"" CHANNEL_MENTION = r"<#(\d{15,20})>" TIMESTAMP = r"" EMOJI_CUSTOM = r"<:([a-zA-Z0-9_]{2,}):(\d{15,20})>" EMOJI_CUSTOM_ANIMATED = r"" EMOJI_UNICODE = r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])" EMOJI_UNICODE_ENCODED = r":([a-zA-Z0-9_]+):" - URL_WITHOUT_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(<({URL_REGEX})>\)" - URL_WITH_PREVIEW_EMBEDDED = f"\[([^\]]+)\]\(({URL_REGEX})\)" - URL_WITHOUT_PREVIEW = f"<{URL_REGEX}>" + URL_WITHOUT_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(<({URL_REGEX})>\)" + URL_WITH_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(({URL_REGEX})\)" + URL_WITHOUT_PREVIEW = fr"<{URL_REGEX}>" URL_WITH_PREVIEW = URL_REGEX QUOTE_LINE_PREFIX = r"(>>)?> " TILDE = r"~" @@ -50,12 +49,11 @@ class Token: value: str = "" lexing_rule: Lexing = LexingRule.TEXT_INLINE groups: List[str] = field(default_factory=list) - + def __contains__(self, rule: Lexing): return self.lexing_rule == rule - def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Generator[Token, None, None]: """Lexes the input text and returns a generator of tokens. The generator will yield a token for each lexing rule that matches the input text. @@ -68,7 +66,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge """ seen_simple_text = "" custom = custom or {} - + while input_text: for rule in itertools.chain(*custom.values(), LexingRule): match = rule(input_text) @@ -81,7 +79,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge continue # don't yield a token in this run # cut off matched part - input_text = input_text[len(match[0]) :] + input_text = input_text[len(match[0]):] # yield inline text if we have some left if len(seen_simple_text) > 0: diff --git a/discord_markdown_ast_parser/parser.py b/discord_markdown_ast_parser/parser.py index cb814d2..5386727 100644 --- a/discord_markdown_ast_parser/parser.py +++ b/discord_markdown_ast_parser/parser.py @@ -4,7 +4,7 @@ import itertools from typing import Optional, Generator, Any, List, Dict, Tuple, Iterable -from lexer import Token, LexingRule, Lexing +from .lexer import Token, LexingRule, Lexing NodeType = Enum( @@ -21,12 +21,14 @@ "CHANNEL", "SLASH_COMMAND", "EMOJI_CUSTOM", + "EMOJI_CUSTOM_ANIMATED", "EMOJI_UNICODE", "EMOJI_UNICODE_ENCODED", "URL_WITH_PREVIEW_EMBEDDED", "URL_WITHOUT_PREVIEW_EMBEDDED", "URL_WITH_PREVIEW", "URL_WITHOUT_PREVIEW", + "TIMESTAMP", "QUOTE_BLOCK", "CODE_BLOCK", "CODE_INLINE",