;;; haskell-lexeme.el --- Haskell lexical tokens   -*- coding: utf-8; lexical-binding: t -*-

;; Copyright (C) 2015 Gracjan Polak

;; This file is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.

;; This file is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with this program.  If not, see <http://www.gnu.org/licenses/>.

;;; Commentary:

;;; Code:

(require 'rx)
|
||
|
|
||
|
;; Define category ?P ("Haskell symbol constituent characters") and
;; populate it.  The `category-docstring' guard skips the whole setup
;; when ?P already has a docstring, e.g. when this file is loaded again.
(unless (category-docstring ?P)
  (define-category ?P "Haskell symbol constituent characters")
  ;; Characters above code 128: add those whose Unicode general
  ;; category is one of the punctuation/symbol categories listed below.
  (map-char-table
   (lambda (key val)
     ;; KEY is either a single character code or a (FROM . TO) range.
     (when (and (or (and (consp key) (> (car key) 128))
                    (and (numberp key) (> key 128)))
                (memq val '(Pc Pd Po Sm Sc Sk So)))
       (modify-category-entry key ?P)))
   unicode-category-table)

  ;; ASCII symbol characters are listed explicitly.
  (dolist (key (string-to-list "!#$%&*+./<=>?@^|~\\-"))
    (modify-category-entry key ?P)))
|
||
|
|
||
|
(defconst haskell-lexeme-modid
  "[[:upper:]][[:alnum:]'_]*"
  "Regexp matching a valid Haskell module identifier.

Note that GHC accepts Unicode category UppercaseLetter as a first
character.  Following letters are from Unicode categories
UppercaseLetter, LowercaseLetter, OtherLetter, TitlecaseLetter,
ModifierLetter, DecimalNumber, OtherNumber, apostrophe or
underscore.

Note that this differs from constructor identifier as the latter
one can have any number of hash character at the end to
accommodate MagicHash extension.")
|
||
|
|
||
|
(defconst haskell-lexeme-id
  "[[:alpha:]_][[:alnum:]'_]*#*"
  "Regexp matching a valid Haskell identifier.

GHC accepts a string starting with any alphabetic character or
underscore followed by any alphanumeric character or underscore
or apostrophe.  Any number of trailing hash characters is also
matched.")
|
||
|
|
||
|
(defconst haskell-lexeme-sym
  ;; One or more characters, each either a colon or a member of
  ;; category ?P.  The group is shy so that embedding this regexp in
  ;; a larger one does not introduce an extra capture group.
  "\\(?:\\cP\\|:\\)+"
  "Regexp matching a valid Haskell variable or constructor symbol.

GHC accepts a string of chars from the set
[:!#$%&*+./<=>?@^|~\\-] or Unicode category Symbol for chars with
codes larger than 128 only.")
|
||
|
|
||
|
(defconst haskell-lexeme-modid-opt-prefix
  ;; Zero or more "ModId." components, wrapped in a shy group.
  (concat "\\(?:" haskell-lexeme-modid "\\.\\)*")
  "Regexp matching a valid Haskell module prefix, potentially empty.

Module path prefix is separated by dots and finishes with a
dot.  For path component syntax see `haskell-lexeme-modid'.")
|
||
|
|
||
|
(defconst haskell-lexeme-qid-or-qsym
  (rx-to-string `(seq (regexp ,haskell-lexeme-modid-opt-prefix)
                      (group (or (regexp ,haskell-lexeme-id)
                                 (regexp ,haskell-lexeme-sym)))))
  "Regexp matching a valid qualified identifier or symbol.

The optional module prefix is described by
`haskell-lexeme-modid-opt-prefix'.

Note that (match-string 1) returns the unqualified part.")
|
||
|
|
||
|
(defconst haskell-lexeme-qid
  (rx-to-string `(seq (regexp "'*")
                      (regexp ,haskell-lexeme-modid-opt-prefix)
                      (group (regexp ,haskell-lexeme-id))))
  "Regexp matching a valid qualified identifier.

Any number of leading apostrophes is matched before the optional
module prefix.

Note that (match-string 1) returns the unqualified part.")
|
||
|
|
||
|
(defconst haskell-lexeme-qsym
  (rx-to-string `(seq (regexp "'*")
                      (regexp ,haskell-lexeme-modid-opt-prefix)
                      ;; The unqualified part is a symbol, so it is
                      ;; matched with `haskell-lexeme-sym' rather than
                      ;; `haskell-lexeme-id'.
                      (group (regexp ,haskell-lexeme-sym))))
  "Regexp matching a valid qualified symbol.

Any number of leading apostrophes is matched before the optional
module prefix.

Note that (match-string 1) returns the unqualified part.")
|
||
|
|
||
|
(defconst haskell-lexeme-number
  ;; Alternatives are tried in order, so the float, hexadecimal and
  ;; octal forms come before the plain decimal fallback.
  (rx (or (seq (regexp "[0-9]+\\.[0-9]+") (opt (regexp "[eE][-+]?[0-9]+")))
          (regexp "[0-9]+[eE][-+]?[0-9]+")
          (regexp "0[xX][0-9a-fA-F]+")
          (regexp "0[oO][0-7]+")
          (regexp "[0-9]+")))
  "Regexp matching a floating point, decimal, octal or hexadecimal number.

Note that negative sign char is not part of a number.")
|
||
|
|
||
|
(defconst haskell-lexeme-char-literal-inside
  (rx (or
       ;; Any single character except newline, apostrophe and backslash.
       (regexp "[^\n'\\]")
       ;; A backslash escape sequence.
       (seq "\\"
            (or "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'"
                "NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK"
                "BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE"
                "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN"
                "EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL"
                ;; Numeric escapes: decimal, hexadecimal and octal.
                (regexp "[0-9]+")
                (seq "x" (regexp "[0-9a-fA-F]+"))
                (seq "o" (regexp "[0-7]+"))
                ;; Control character escapes such as \\^A.
                (seq "^" (regexp "[]A-Z@^_\\[]"))))))
  "Regexp matching an inside of a character literal.

Matches either one ordinary character or one backslash escape
sequence, including the numeric escapes in decimal, hexadecimal
\(\\x...) and octal (\\o...) notation.  The surrounding
apostrophes are not part of the match.")
|
||
|
|
||
|
(defconst haskell-lexeme-char-literal
  ;; An apostrophe, one character or escape sequence, an apostrophe.
  (rx-to-string `(seq "'" (regexp ,haskell-lexeme-char-literal-inside) "'"))
  "Regexp matching a character literal.

The part between the apostrophes is described by
`haskell-lexeme-char-literal-inside'.")
|
||
|
|
||
|
(defconst haskell-lexeme-string-literal-inside
  (rx (* (or
          ;; Any character except newline, double quote and backslash.
          (regexp "[^\n\"\\]")
          ;; A backslash escape sequence.
          (seq "\\"
               (or "a" "b" "f" "n" "r" "t" "v" "\\" "\"" "'" "&"
                   "NUL" "SOH" "STX" "ETX" "EOT" "ENQ" "ACK"
                   "BEL" "BS" "HT" "LF" "VT" "FF" "CR" "SO" "SI" "DLE"
                   "DC1" "DC2" "DC3" "DC4" "NAK" "SYN" "ETB" "CAN"
                   "EM" "SUB" "ESC" "FS" "GS" "RS" "US" "SP" "DEL"
                   ;; Numeric escapes: decimal, hexadecimal and octal.
                   (regexp "[0-9]+")
                   (seq "x" (regexp "[0-9a-fA-F]+"))
                   (seq "o" (regexp "[0-7]+"))
                   ;; Control character escapes such as \\^A.
                   (seq "^" (regexp "[]A-Z@^_\\[]"))
                   ;; String gap: whitespace enclosed in backslashes.
                   (regexp "[ \t\n\r\v\f]*\\\\"))))))
  "Regexp matching an inside of a string literal.

Matches any number of ordinary characters and backslash escape
sequences, including numeric escapes and string gaps.  The
surrounding double quotes are not part of the match.")
|
||
|
|
||
|
(defconst haskell-lexeme-string-literal
  ;; A double quote, the string contents, a closing double quote.
  (rx-to-string `(seq "\"" (regexp ,haskell-lexeme-string-literal-inside) "\""))
  "Regexp matching a string literal.

The part between the double quotes is described by
`haskell-lexeme-string-literal-inside'.")
|
||
|
|
||
|
(defun haskell-lexeme-classify-by-first-char (char)
  "Classify token by CHAR.

CHAR is a character that is assumed to be the first character of
a token.

Return one of the symbols `varsym', `consym', `char', `string',
`conid', `varid', `number' or `special'.  Return nil when CHAR
does not fall into any of these classes."
  (let ((category (get-char-code-property char 'general-category)))
    (cond
     ;; ASCII operator characters are listed explicitly; above ASCII
     ;; the Unicode general category decides.
     ((or (memq char '(?! ?# ?$ ?% ?& ?* ?+ ?. ?/ ?< ?= ?> ?? ?@ ?^ ?| ?~ ?\\ ?-))
          (and (> char 127)
               (memq category '(Pc Pd Po Sm Sc Sk So))))
      'varsym)
     ((eq char ?:)
      'consym)
     ((eq char ?\')
      'char)
     ((eq char ?\")
      'string)
     ;; Uppercase or titlecase letter.
     ((memq category '(Lu Lt))
      'conid)
     ;; Underscore, lowercase letter or other letter.
     ((or (eq char ?_)
          (memq category '(Ll Lo)))
      'varid)
     ;; ASCII digit.
     ((<= ?0 char ?9)
      'number)
     ((memq char '(?\] ?\[ ?\( ?\) ?\{ ?\} ?\` ?\, ?\;))
      'special))))
|
||
|
|
||
|
(defun haskell-lexeme-looking-at-token ()
  "Like `looking-at' but understands Haskell lexemes.

Moves point forward over whitespace.  Returns a symbol describing
type of Haskell token recognized, or nil when nothing is
recognized.  Use `match-string', `match-beginning' and
`match-end' with argument 0 to query match result.

Possible results are:
- `special': for chars [](){}`,;
- `comment': for single line comments
- `nested-comment': for multiline comments
- `qsymid': for qualified identifiers or symbols
- `string': for strings literals
- `char': for char literals
- `number': for decimal, float, hexadecimal and octal number literals
- `template-haskell-quote': for a string of apostrophes for template haskell

Note that for qualified symbols (match-string 1) returns the
unqualified identifier or symbol.  Further qualification for
symbol or identifier can be done with:

    (haskell-lexeme-classify-by-first-char (char-after (match-beginning 1)))

See `haskell-lexeme-classify-by-first-char' for details."
  (skip-syntax-forward "->")
  (let ((case-fold-search nil)
        ;; Start of the token; used as match beginning for comments.
        (point (point-marker)))
    (or
     ;; Nested comment: the match spans from the token start to
     ;; wherever `forward-comment' leaves point.
     (and (looking-at "{-")
          (progn
            (save-excursion
              (forward-comment 1)
              (set-match-data (list point (point-marker))))
            'nested-comment))
     ;; Literals are tried before identifiers and symbols.
     (and (looking-at haskell-lexeme-char-literal)
          'char)
     (and (looking-at haskell-lexeme-string-literal)
          'string)
     (and (looking-at "[][(){}`,;]")
          'special)
     (and (looking-at haskell-lexeme-qid-or-qsym)
          ;; A lexeme made solely of two or more dashes starts a line
          ;; comment; dashes that are part of a longer symbol such as
          ;; "-->" are an ordinary operator.  `string-match-p' keeps
          ;; the match data from `looking-at' intact.
          (if (string-match-p "\\`---*\\'" (match-string-no-properties 0))
              (progn
                ;; The comment extends to the end of the line.
                (set-match-data
                 (list point (set-marker (make-marker) (line-end-position))))
                'comment)
            'qsymid))
     (and (looking-at haskell-lexeme-number)
          'number)
     (and (looking-at "'+")
          'template-haskell-quote))))
|
||
|
|
||
|
(provide 'haskell-lexeme)
|
||
|
|
||
|
;;; haskell-lexeme.el ends here
|