11import re
2- from dataclasses import dataclass , InitVar , field
2+ from dataclasses import dataclass , field
33from enum import Enum
44from typing import Optional , List , Generator , Dict
55import itertools
88class Lexing :
99 def __init__ (self , pattern : Optional [str ] = None , flags : re .RegexFlag = re .NOFLAG ):
1010 self .regex = re .compile (pattern , flags = flags ) if pattern else None
11-
11+
1212 def __call__ (self , text : str ):
1313 return self .regex and self .regex .match (text )
14-
14+
1515 def __repr__ (self ):
1616 return f"{ self .__class__ .__name__ } ({ self .regex and self .regex .pattern !r} )"
1717
# Matches http/https URLs.
# NOTE(review): this pattern contains two capturing groups of its own
# ((www\.)? and the trailing path/query group), so any rule that embeds
# it inside further groups gets shifted group numbers -- confirm the
# downstream token.groups consumers account for this.
# NOTE(review): the TLD part [a-z]{2,4} rejects longer TLDs such as
# ".museum" or ".online" -- confirm this is intended.
URL_REGEX = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
class LexingRule(Lexing, Enum):
    # Lexing rules for Discord-flavoured markdown. Each member's value is a
    # regex source string; the Lexing mixin's __init__ compiles it, so every
    # member is directly callable on input text.
    # Mention rules: the 15-20 digit group is a Discord snowflake ID.
    USER_MENTION = r"<@!?(\d{15,20})>"
    ROLE_MENTION = r"<@&(\d{15,20})>"
    SLASH_COMMAND_MENTION = r"</([a-zA-Z0-9_ ]{2,}):(\d{15,20})>"
    CHANNEL_MENTION = r"<#(\d{15,20})>"
    # <t:unix[:style]>; optional style letter is one of tTdDfFR.
    TIMESTAMP = r"<t:(-?\d+)(?::([tTdDfFR]))?>"
    # Custom guild emoji <:name:id>, animated variant <a:name:id>.
    EMOJI_CUSTOM = r"<:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
    EMOJI_CUSTOM_ANIMATED = r"<a:([a-zA-Z0-9_]{2,}):(\d{15,20})>"
    # NOTE(review): the \ud83c.. alternatives are lone UTF-16 surrogate
    # halves, which cannot appear in a well-formed Python str -- confirm the
    # expected input encoding for this rule.
    EMOJI_UNICODE = r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])"
    EMOJI_UNICODE_ENCODED = r":([a-zA-Z0-9_]+):"
    # URL forms. URL_REGEX itself contains capturing groups, so these rules
    # produce extra match groups beyond the ones written here.
    URL_WITHOUT_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(<({URL_REGEX})>\)"
    URL_WITH_PREVIEW_EMBEDDED = fr"\[([^\]]+)\]\(({URL_REGEX})\)"
    URL_WITHOUT_PREVIEW = fr"<{URL_REGEX}>"
    URL_WITH_PREVIEW = URL_REGEX
    # One "> " prefix, optionally preceded by ">>" (block-quote markers).
    QUOTE_LINE_PREFIX = r"(>>)?> "
    TILDE = r"~"
@@ -50,12 +49,11 @@ class Token:
    # Raw source text covered by this token.
    value: str = ""
    # Rule that produced this token; plain inline text by default.
    lexing_rule: Lexing = LexingRule.TEXT_INLINE
    # Capture groups from the matching rule's regex (empty for inline text).
    groups: List[str] = field(default_factory=list)

5453 def __contains__ (self , rule : Lexing ):
5554 return self .lexing_rule == rule
5655
5756
58-
5957def lex (input_text : str , custom : Optional [Dict [str , List [Lexing ]]] = None ) -> Generator [Token , None , None ]:
6058 """Lexes the input text and returns a generator of tokens.
6159 The generator will yield a token for each lexing rule that matches the input text.
@@ -68,7 +66,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge
6866 """
6967 seen_simple_text = ""
7068 custom = custom or {}
71-
69+
7270 while input_text :
7371 for rule in itertools .chain (* custom .values (), LexingRule ):
7472 match = rule (input_text )
@@ -81,7 +79,7 @@ def lex(input_text: str, custom: Optional[Dict[str, List[Lexing]]] = None) -> Ge
8179 continue # don't yield a token in this run
8280
8381 # cut off matched part
84- input_text = input_text [len (match [0 ]) :]
82+ input_text = input_text [len (match [0 ]):]
8583
8684 # yield inline text if we have some left
8785 if len (seen_simple_text ) > 0 :
0 commit comments