From 3ee424d41c735729898085a50653e2923bcc5f28 Mon Sep 17 00:00:00 2001 From: Gracjan Polak Date: Wed, 6 Jan 2016 14:18:32 +0100 Subject: [PATCH] Lexeme parsing improvements --- haskell-lexeme.el | 105 ++++++++++++++++++++++++++-------- tests/haskell-lexeme-tests.el | 22 +++++++ 2 files changed, 103 insertions(+), 24 deletions(-) diff --git a/haskell-lexeme.el b/haskell-lexeme.el index f4a397905..03957cafa 100644 --- a/haskell-lexeme.el +++ b/haskell-lexeme.el @@ -121,26 +121,73 @@ Note that negative sign char is not part of a number.") (rx-to-string `(: "'" (regexp ,haskell-lexeme-char-literal-inside) "'")) "Regexp matching a character literal.") -(defconst haskell-lexeme-string-literal-inside - (rx (* (| (regexp "[^\n\"\\]") - (: "\\" - (| "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'" "&" - "NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" - "BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE" - "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN" - "EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL" - (: "^" (regexp "[]A-Z@^_\\[]")) - (regexp "[ \t\n\r\v\f]*\\\\")))))) - "Regexp matching an inside of a string literal.") +(defconst haskell-lexeme-string-literal-inside-item + (rx (| (not (any "\n\"\\")) + (: "\\" + (| "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'" "&" + "NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK" + "BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE" + "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN" + "EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL" + (: "^" (regexp "[]A-Z@^_\\[]")) + (regexp "[ \t\n\r\v\f]*\\\\"))))) + "Regexp matching an item that is a single character or a single +escape sequence inside of a string literal. + +Note that `haskell-lexeme-string-literal-inside-item' matches +strictly only escape sequences defined in Haskell Report.") (defconst haskell-lexeme-string-literal - (rx-to-string `(: "\"" (regexp ,haskell-lexeme-string-literal-inside) "\"")) - "Regexp matching a string literal.") + (rx (: (group "\"") + (group (* (| (regexp "\\\\[ \t\n\r\v\f]*\\\\") + "\\\"" + (regexp "[^\"\n]")))) + (group (| "\"" (regexp "$"))))) + "Regexp matching a string literal lookalike. + +Note that `haskell-lexeme-string-literal' matches more than +Haskell Report specifies because we want to support also code +under edit. + +String literals end with double quote or unescaped newline or end +of buffer. + +Regexp has subgroup expressions: + (match-text 1) matches the opening doublequote. + (match-text 2) matches the inside of the string. + (match-text 3) matches the closing quote, or a closing + newline or empty string at the end of the buffer.") + +(defconst haskell-lexeme-quasi-quote-literal + (rx-to-string `(: "[" (optional "$") + (group (regexp ,haskell-lexeme-id)) + (group "|") + (group (* (| (not (any "|")) + (: "|" (not (any "]")))) + )) + (group "|") + "]")) + "Regexp matching a quasi quote literal. + +Quasi quotes start with '[xxx|' or '[$xxx|' sequence and end with +'|]'. The 'xxx' is quoter name There is no escaping mechanism +provided for the ending sequence. + +Regexp has subgroup expressions: + (match-text 1) matches the quoter name (without $ sign if present). + (match-text 2) matches the opening vertical bar. + (match-text 3) matches the inside of the quoted string. + (match-text 4) matches the closing vertical bar + or empty string if at the end of the buffer. + +Note that this regexp admits 'e', 't', 'd', 'p' as quoter names +although template Haskell explicitly rejects those.") (defun haskell-lexeme-classify-by-first-char (char) "Classify token by CHAR. -CHAR is a chararacter that is assumed to be first character of a token." +CHAR is a chararacter that is assumed to be the first character +of a token." (let ((category (get-char-code-property char 'general-category))) (cond @@ -158,7 +205,7 @@ CHAR is a chararacter that is assumed to be first character of a token." 'conid) ((or (equal char ?_) (member category '(Ll Lo))) - 'varsym) + 'varid) ((and (>= char ?0) (<= char 9)) 'number) ((member char '(?\] ?\[ ?\( ?\) ?\{ ?\} ?\` ?\, ?\;)) @@ -179,8 +226,9 @@ Possible results are: - 'qsymid: for qualified identifiers or symbols - 'string: for strings literals - 'char: for char literals -- 'decimal: for decimal, float, hexadecimal and octal number literals +- 'number: for decimal, float, hexadecimal and octal number literals - 'template-haskell-quote: for a string of apostrophes for template haskell +- 'template-haskell-quasi-quote: for a string of apostrophes for template haskell Note that for qualified symbols (match-string 1) returns the unqualified identifier or symbol. Further qualification for @@ -195,20 +243,27 @@ See `haskell-lexeme-classify-by-first-char' for details." (point (point-marker))) (or (and (looking-at "{-") - (progn - (save-excursion - (forward-comment 1) - (set-match-data (list point (point-marker)))) + (save-excursion + (forward-comment 1) + (set-match-data (list point (point-marker))) 'nested-comment)) (and (looking-at haskell-lexeme-char-literal) 'char) (and (looking-at haskell-lexeme-string-literal) 'string) (and (looking-at "[][(){}`,;]") - 'special) + (let ((match-data (match-data))) + (if (and (equal "[" (match-string-no-properties 0)) + (looking-at haskell-lexeme-quasi-quote-literal)) + (if (member (match-string-no-properties 1) '("e" "d" "p" "t")) + (progn + (set-match-data match-data) + 'special) + 'template-haskell-quasi-quote) + 'special))) (and (looking-at haskell-lexeme-qid-or-qsym) - (if (and (eq (- (match-end 0) (match-beginning 0)) 2) - (equal (match-string 0) "--")) + (if (save-match-data + (string-match "\\`---*\\'" (match-string-no-properties 0))) (progn (set-match-data (list point (set-marker (make-marker) (line-end-position)))) 'comment) @@ -216,7 +271,9 @@ See `haskell-lexeme-classify-by-first-char' for details." (and (looking-at haskell-lexeme-number) 'number) (and (looking-at "'+") - 'template-haskell-quote)))) + 'template-haskell-quote) + (and (looking-at ".") + 'illegal)))) (provide 'haskell-lexeme) diff --git a/tests/haskell-lexeme-tests.el b/tests/haskell-lexeme-tests.el index b6fc76faa..3b9efb6a9 100644 --- a/tests/haskell-lexeme-tests.el +++ b/tests/haskell-lexeme-tests.el @@ -195,3 +195,25 @@ order." (check-lexemes '("0.12 34.22.33 1e+23 1e23 1e+33 455.33E1456.4") '("0.12" "34.22" "." "33" "1e+23" "1e23" "1e+33" "455.33E1456" "." "4"))) + +(ert-deftest haskell-lexeme-quasi-quote-1 () + (check-lexemes + '("[xml| |]") + '("[xml| |]"))) + +(ert-deftest haskell-lexeme-quasi-quote-2 () + (check-lexemes + '("[xml| |] |]") + '("[xml| |]" "|" "]"))) + +(ert-deftest haskell-lexeme-quasi-quote-3 () + :expected-result :failed + (check-lexemes + '("[xml| |") + '("[xml| |"))) + +(ert-deftest haskell-lexeme-quasi-quote-4 () + :expected-result :failed + (check-lexemes + '("[xml| ") + '("[xml| ")))