You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
167 lines
8.2 KiB
167 lines
8.2 KiB
;;; haskell-string.el --- Haskell related string utilities -*- lexical-binding: t -*- |
|
|
|
;; Copyright (C) 2013 Herbert Valerio Riedel |
|
|
|
;; Author: Herbert Valerio Riedel <hvr@gnu.org> |
|
|
|
;; This file is not part of GNU Emacs. |
|
|
|
;; This file is free software; you can redistribute it and/or modify |
|
;; it under the terms of the GNU General Public License as published by |
|
;; the Free Software Foundation; either version 3 of the License, or |
|
;; (at your option) any later version. |
|
|
|
;; This file is distributed in the hope that it will be useful, |
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
;; GNU General Public License for more details. |
|
|
|
;; You should have received a copy of the GNU General Public License |
|
;; along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
|
|
;;; Commentary: |
|
|
|
;;; Todo: |
|
|
|
;; - write ERT tests |
|
|
|
;;; Code: |
|
|
|
(defun haskell-string-trim (string) |
|
"Remove whitespace around STRING. |
|
|
|
A Whitespace character is defined in the Haskell Report as follows |
|
|
|
whitechar -> newline | vertab | space | tab | uniWhite |
|
newline -> return linefeed | return | linefeed | formfeed |
|
uniWhite -> any Unicode character defined as whitespace |
|
|
|
Note: The implementation currently only supports ASCII |
|
white-space characters, i.e. the implemention doesn't |
|
consider uniWhite." |
|
|
|
(let ((s1 (if (string-match "[\t\n\v\f\r ]+\\'" string) (replace-match "" t t string) string))) |
|
(if (string-match "\\`[\t\n\v\f\r ]+" s1) (replace-match "" t t s1) s1))) |
|
|
|
(defun haskell-string-only-spaces-p (string) |
|
"Return t if STRING contains only whitespace (or is empty)." |
|
(string= "" (haskell-string-trim string))) |
|
|
|
(defun haskell-string-take (string n) |
|
"Return (up to) N character length prefix of STRING." |
|
(substring string 0 (min (length string) n))) |
|
|
|
(defconst haskell-string-literal-encode-ascii-array |
|
[ "\\NUL" "\\SOH" "\\STX" "\\ETX" "\\EOT" "\\ENQ" "\\ACK" "\\a" "\\b" "\\t" "\\n" "\\v" "\\f" "\\r" "\\SO" "\\SI" "\\DLE" "\\DC1" "\\DC2" "\\DC3" "\\DC4" "\\NAK" "\\SYN" "\\ETB" "\\CAN" "\\EM" "\\SUB" "\\ESC" "\\FS" "\\GS" "\\RS" "\\US" " " "!" "\\\"" "#" "$" "%" "&" "'" "(" ")" "*" "+" "," "-" "." "/" "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" ":" ";" "<" "=" ">" "?" "@" "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O" "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y" "Z" "[" "\\\\" "]" "^" "_" "`" "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q" "r" "s" "t" "u" "v" "w" "x" "y" "z" "{" "|" "}" "~" "\\DEL" ] |
|
"Array of encodings for 7-bit ASCII character points indexed by ASCII value.") |
|
|
|
(defun haskell-string-literal-encode (str &optional no-quotes) |
|
"Encode STR according Haskell escape rules using 7-bit ASCII representation. |
|
|
|
The serialization has been implement to closely match the |
|
behaviour of GHC's Show instance for Strings. |
|
|
|
If NO-QUOTES is non-nil, omit wrapping result in quotes. |
|
|
|
This is the dual operation to `haskell-string-literal-decode'." |
|
|
|
(let ((lastc -1)) |
|
(let ((encode (lambda (c) |
|
(let ((lc lastc)) |
|
(setq lastc c) |
|
(if (>= c 128) ;; if non-ASCII code point |
|
(format "\\%d" c) |
|
;; else, for ASCII code points |
|
(if (or (and (= lc 14) (= c ?H)) ;; "\SO\&H" |
|
(and (>= lc 128) (>= c ?0) (<= c ?9))) ;; "\123\&4" |
|
(concat "\\&" (aref haskell-string-literal-encode-ascii-array c)) |
|
(aref haskell-string-literal-encode-ascii-array c) |
|
)))))) |
|
|
|
(if no-quotes |
|
(mapconcat encode str "") |
|
(concat "\"" (mapconcat encode str "") "\""))))) |
|
|
|
(defconst haskell-string-literal-escapes-regexp |
|
(concat "[\\]\\(?:" |
|
(regexp-opt (append |
|
(mapcar (lambda (c) (format "%c" c)) |
|
"abfnrtv\\\"'&") ;; "charesc" escape sequences |
|
(mapcar (lambda (c) (format "^%c" c)) |
|
"ABCDEFGHIJKLMNOPQRSTUVWXYZ@[\\]^_") ;; "cntrl" escape sequences |
|
(mapcar (lambda (s) (format "%s" s)) |
|
(split-string "NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR |
|
SO SI DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC |
|
FS GS RS US SP DEL")))) ;; "ascii" (w\o "cntrl") escape sequences |
|
"\\|" "[\t\n\v\f\r ]+[\\]" ;; whitespace gaps |
|
"\\|" "[0-9]+" ;; decimal escape sequence |
|
"\\|" "o[0-7]+" ;; octal escape sequence |
|
"\\|" "x[0-9a-f]+" ;; hex escape sequence |
|
"\\)?") ;; everything else is an invalid escape sequence |
|
"Regexp for matching escape codes in string literals. |
|
See Haskell Report Sect 2.6, |
|
URL `http://www.haskell.org/onlinereport/haskell2010/haskellch2.html#x7-200002.6', |
|
for more details.") |
|
|
|
(defconst haskell-string-literal-decode1-table |
|
(let ((h (make-hash-table :test 'equal))) |
|
(mapc (lambda (c) (puthash (concat "\\" (car c)) (cdr c) h)) |
|
'(;; ascii-escapes |
|
("NUL" . "\x00") ("SOH" . "\x01") ("STX" . "\x02") ("ETX" . "\x03") ("EOT" . "\x04") ("ENQ" . "\x05") |
|
("ACK" . "\x06") ("BEL" . "\x07") ("BS" . "\x08") ("HT" . "\x09") ("LF" . "\x0a") ("VT" . "\x0b") |
|
("FF" . "\x0c") ("CR" . "\x0d") ("SO" . "\x0e") ("SI" . "\x0f") ("DLE" . "\x10") ("DC1" . "\x11") |
|
("DC2" . "\x12") ("DC3" . "\x13") ("DC4" . "\x14") ("NAK" . "\x15") ("SYN" . "\x16") ("ETB" . "\x17") |
|
("CAN" . "\x18") ("EM" . "\x19") ("SUB" . "\x1a") ("ESC" . "\x1b") ("FS" . "\x1c") ("GS" . "\x1d") |
|
("RS" . "\x1e") ("US" . "\x1f") ("SP" . "\x20") ("DEL" . "\x7f" ) |
|
;; C-compatible single-char escape sequences |
|
("a" . "\x07") ("b" . "\x08") ("f" . "\x0c") ("n" . "\x0a") ("r" . "\x0d") ("t" . "\x09") ("v" . "\x0b") |
|
;; trivial escapes |
|
("\\" . "\\") ("\"" . "\"") ("'" . "'") |
|
;; "empty" escape |
|
("&" . ""))) |
|
h) |
|
"Hash table containing irregular escape sequences and their decoded strings. |
|
Used by `haskell-string-literal-decode1'.") |
|
|
|
(defun haskell-string-literal-decode1 (l) |
|
"Decode a single string literal escape sequence. |
|
L must contain exactly one escape sequence. |
|
This is an internal function used by `haskell-string-literal-decode'." |
|
(let ((case-fold-search nil)) |
|
(cond |
|
((gethash l haskell-string-literal-decode1-table)) |
|
((string-match "\\`[\\][0-9]+\\'" l) (char-to-string (string-to-number (substring l 1) 10))) |
|
((string-match "\\`[\\]x[[:xdigit:]]+\\'" l) (char-to-string (string-to-number (substring l 2) 16))) |
|
((string-match "\\`[\\]o[0-7]+\\'" l) (char-to-string (string-to-number (substring l 2) 8))) |
|
((string-match "\\`[\\]\\^[@-_]\\'" l) (char-to-string (- (aref l 2) ?@))) ;; "cntrl" escapes |
|
((string-match "\\`[\\][\t\n\v\f\r ]+[\\]\\'" l) "") ;; whitespace gap |
|
(t (error "Invalid escape sequence"))))) |
|
|
|
(defun haskell-string-literal-decode (estr &optional no-quotes) |
|
"Decode a Haskell string-literal. |
|
If NO-QUOTES is nil, ESTR must be surrounded by quotes. |
|
|
|
This is the dual operation to `haskell-string-literal-encode'." |
|
(if (and (not no-quotes) |
|
(string-match-p "\\`\"[^\\\"[:cntrl:]]*\"\\'" estr)) |
|
(substring estr 1 -1) ;; optimized fast-path for trivial strings |
|
(let ((s (if no-quotes ;; else: do general decoding |
|
estr |
|
(if (string-match-p "\\`\".*\"\\'" estr) |
|
(substring estr 1 -1) |
|
(error "String literal must be delimited by quotes")))) |
|
(case-fold-search nil)) |
|
(replace-regexp-in-string haskell-string-literal-escapes-regexp #'haskell-string-literal-decode1 s t t)))) |
|
|
|
(defun haskell-string-ellipsize (string n) |
|
"Return STRING truncated to (at most) N characters. |
|
If truncation occured, last character in string is replaced by `…'. |
|
See also `haskell-string-take'." |
|
(cond |
|
((<= (length string) n) string) ;; no truncation needed |
|
((< n 1) "") |
|
(t (concat (substring string 0 (1- n)) "…")))) |
|
|
|
(provide 'haskell-string) |
|
|
|
;;; haskell-string.el ends here
|
|
|