FG42/nix/elisp_reader.nix

753 lines
30 KiB
Nix

# This file is copied from https://github.com/talyz/fromElisp which
# distributes under the compatible license MIT.
#
# We have changed this according to our needs
{ lib
, commentMaxLength ? 300
, stringMaxLength ? 3000
, characterMaxLength ? 50
, integerMaxLength ? 50
, floatMaxLength ? 50
, boolVectorMaxLength ? 50
, symbolMaxLength ? 50
, orgModeBabelCodeBlockHeaderMaxLength ? 200
, orgModeBabelCodeBlockArgMaxLength ? 30
}:
with lib;
with builtins;
let
# Create a matcher from a regex string and maximum length. A
# matcher takes a string and returns the first match produced by
# running its regex on it, or null if the match is unsuccessful,
# but only as far in as specified by maxLength.
mkMatcher = regex: maxLength:
string:
let
substr = substring 0 maxLength string;
matched = match regex substr;
in
if matched != null then head matched else null;
removeStrings = stringsToRemove: string:
let
len = length stringsToRemove;
listOfNullStrings = genList (const "") len;
in
replaceStrings stringsToRemove listOfNullStrings string;
# Split a string of elisp into individual tokens and add useful
# metadata.
tokenizeElisp' = { elisp, startLineNumber ? 1 }:
let
# These are the only characters that can not be unescaped in a
# symbol name. We match the inverse of these to get the actual
# symbol characters and use them to differentiate between
# symbols and tokens that could potentially look like symbols,
# such as numbers. Due to the leading bracket, this has to be
# placed _first_ inside a bracket expression.
notInSymbol = '']["'`,#;\\()[:space:][:cntrl:]'';
matchComment = mkMatcher "(;[^\n]*).*" commentMaxLength;
matchString = mkMatcher ''("([^"\\]|\\.)*").*'' stringMaxLength;
matchCharacter = mkMatcher ''([?]((\\[sSHMAC]-)|\\\^)*(([^][\\()]|\\[][\\()])|\\[^^SHMACNuUx0-7]|\\[uU][[:digit:]a-fA-F]+|\\x[[:digit:]a-fA-F]*|\\[0-7]{1,3}|\\N\{[^}]+}))([${notInSymbol}?]|$).*'' characterMaxLength;
matchNonBase10Integer = mkMatcher ''(#([BbOoXx]|[[:digit:]]{1,2}r)[[:digit:]a-fA-F]+)([${notInSymbol}]|$).*'' integerMaxLength;
matchInteger = mkMatcher ''([+-]?[[:digit:]]+[.]?)([${notInSymbol}]|$).*'' integerMaxLength;
matchBoolVector = mkMatcher ''(#&[[:digit:]]+"([^"\\]|\\.)*").*'' boolVectorMaxLength;
matchFloat = mkMatcher ''([+-]?([[:digit:]]*[.][[:digit:]]+|([[:digit:]]*[.])?[[:digit:]]+e([+-]?[[:digit:]]+|[+](INF|NaN))))([${notInSymbol}]|$).*'' floatMaxLength;
matchDot = mkMatcher ''([.])([${notInSymbol}]|$).*'' 2;
# Symbols can contain pretty much any characters - the general
# rule is that if nothing else matches, it's a symbol, so we
# should be pretty generous here and match for symbols last. See
# https://www.gnu.org/software/emacs/manual/html_node/elisp/Symbol-Type.html
matchSymbol =
let
symbolChar = ''([^${notInSymbol}]|\\.)'';
in mkMatcher ''(${symbolChar}+)([${notInSymbol}]|$).*'' symbolMaxLength;
maxTokenLength = foldl' max 0 [
commentMaxLength
stringMaxLength
characterMaxLength
integerMaxLength
floatMaxLength
boolVectorMaxLength
symbolMaxLength
];
# Fold over all the characters in a string, checking for
# matching tokens.
#
# The implementation is a bit obtuse, for optimization reasons:
# nix doesn't have tail-call optimization, thus a strict fold,
# which should essentially force a limited version of tco when
# iterating a list, is our best alternative.
#
# The string read from is split into a list of its constituent
# characters, which is then folded over. Each character is then
# used to determine a likely matching regex "matcher" to run on
# the string, starting at the position of the aforementioned
# character. When an appropriate matcher has been found and run
# successfully on the string, its result is added to
# `state.acc`, a list of all matched tokens. The length of the
# matched token is determined and passed on to the following
# iteration through `state.skip`. If `state.skip` is positive,
# nothing will be done in the current iteration, except
# decrementing `state.skip` for the next one: this skips the
# characters we've already matched. At each iteration,
# `state.pos` is also incremented, to keep track of the current
# string position.
#
# The order of the matches is significant - matchSymbol will,
# for example, also match numbers and characters, so we check
# for symbols last.
readToken = state: char:
let
rest = substring state.pos maxTokenLength elisp;
comment = matchComment rest;
character = matchCharacter rest;
nonBase10Integer = matchNonBase10Integer rest;
integer = matchInteger rest;
float = matchFloat rest;
function = matchFunction rest;
boolVector = matchBoolVector rest;
string = matchString rest;
dot = matchDot rest;
symbol = matchSymbol rest;
in
if state.skip > 0 then
state // {
pos = state.pos + 1;
skip = state.skip - 1;
line = if char == "\n" then state.line + 1 else state.line;
}
else if char == "\n" then
let
mod = state.line / 1000;
newState = {
pos = state.pos + 1;
line = state.line + 1;
inherit mod;
};
in
state // (
# Force evaluation of old state every 1000 lines. Nix
# doesn't have a modulo builtin, so we have to save
# the result of an integer division and compare
# between runs.
if mod > state.mod then
seq state.acc newState
else
newState
)
else if elem char [ " " "\t" "\r" ] then
state // {
pos = state.pos + 1;
inherit (state) line;
}
else if char == ";" then
if comment != null then
state // {
pos = state.pos + 1;
skip = (stringLength comment) - 1;
}
else throw "Unrecognized token on line ${toString state.line}: ${rest}"
else if char == "(" then
state // {
acc = state.acc ++ [{ type = "openParen"; value = "("; inherit (state) line; }];
pos = state.pos + 1;
}
else if char == ")" then
state // {
acc = state.acc ++ [{ type = "closeParen"; value = ")"; inherit (state) line; }];
pos = state.pos + 1;
}
else if char == "[" then
state // {
acc = state.acc ++ [{ type = "openBracket"; value = "["; inherit (state) line; }];
pos = state.pos + 1;
}
else if char == "]" then
state // {
acc = state.acc ++ [{ type = "closeBracket"; value = "]"; inherit (state) line; }];
pos = state.pos + 1;
}
else if char == "'" then
state // {
acc = state.acc ++ [{ type = "quote"; value = "'"; inherit (state) line; }];
pos = state.pos + 1;
}
else if char == ''"'' then
if string != null then
state // {
acc = state.acc ++ [{ type = "string"; value = string; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength string) - 1;
}
else throw "Unrecognized token on line ${toString state.line}: ${rest}"
else if char == "#" then
let nextChar = substring 1 1 rest;
in
if nextChar == "'" then
state // {
acc = state.acc ++ [{ type = "function"; value = "#'"; inherit (state) line; }];
pos = state.pos + 1;
skip = 1;
}
else if nextChar == "&" then
if boolVector != null then
state // {
acc = state.acc ++ [{ type = "boolVector"; value = boolVector; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength boolVector) - 1;
}
else throw "Unrecognized token on line ${toString state.line}: ${rest}"
else if nextChar == "s" then
if substring 2 1 rest == "(" then
state // {
acc = state.acc ++ [{ type = "record"; value = "#s"; inherit (state) line; }];
pos = state.pos + 1;
skip = 1;
}
else throw "List must follow #s in record on line ${toString state.line}: ${rest}"
else if nextChar == "[" then
state // {
acc = state.acc ++ [{ type = "byteCode"; value = "#"; inherit (state) line; }];
pos = state.pos + 1;
}
else if nonBase10Integer != null then
state // {
acc = state.acc ++ [{ type = "nonBase10Integer"; value = nonBase10Integer; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength nonBase10Integer) - 1;
}
else throw "Unrecognized token on line ${toString state.line}: ${rest}"
else if elem char [ "+" "-" "." "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" ] then
if integer != null then
state // {
acc = state.acc ++ [{ type = "integer"; value = integer; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength integer) - 1;
}
else if float != null then
state // {
acc = state.acc ++ [{ type = "float"; value = float; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength float) - 1;
}
else if dot != null then
state // {
acc = state.acc ++ [{ type = "dot"; value = dot; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength dot) - 1;
}
else if symbol != null then
state // {
acc = state.acc ++ [{ type = "symbol"; value = symbol; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength symbol) - 1;
}
else throw "Unrecognized token on line ${toString state.line}: ${rest}"
else if char == "?" then
if character != null then
state // {
acc = state.acc ++ [{ type = "character"; value = character; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength character) - 1;
}
else throw "Unrecognized token on line ${toString state.line}: ${rest}"
else if char == "`" then
state // {
acc = state.acc ++ [{ type = "backquote"; value = "`"; inherit (state) line; }];
pos = state.pos + 1;
}
else if char == "," then
if substring 1 1 rest == "@" then
state // {
acc = state.acc ++ [{ type = "slice"; value = ",@"; inherit (state) line; }];
skip = 1;
pos = state.pos + 1;
}
else
state // {
acc = state.acc ++ [{ type = "expand"; value = ","; inherit (state) line; }];
pos = state.pos + 1;
}
else if symbol != null then
state // {
acc = state.acc ++ [{ type = "symbol"; value = symbol; inherit (state) line; }];
pos = state.pos + 1;
skip = (stringLength symbol) - 1;
}
else
throw "Unrecognized token on line ${toString state.line}: ${rest}";
in (builtins.foldl' readToken { acc = []; pos = 0; skip = 0; line = startLineNumber; mod = 0; } (stringToCharacters elisp)).acc;
tokenizeElisp = elisp:
tokenizeElisp' { inherit elisp; };
# Produce an AST from a list of tokens produced by `tokenizeElisp`.
parseElisp' = tokens:
let
# Convert literal value tokens in a flat list to their
# corresponding nix representation.
parseValues = tokens:
map (token:
if token.type == "string" then
token // {
value = substring 1 (stringLength token.value - 2) token.value;
}
else if token.type == "integer" then
token // {
value = fromJSON (removeStrings ["+" "."] token.value);
}
else if token.type == "symbol" && token.value == "t" then
token // {
value = true;
}
else if token.type == "float" then
let
initial = head (match "([+-]?([[:digit:]]*[.])?[[:digit:]]+(e([+-]?[[:digit:]]+|[+](INF|NaN)))?)" token.value);
isSpecial = (match "(.+(e[+](INF|NaN)))" initial) != null;
withoutPlus = removeStrings ["+"] initial;
withPrefix =
if substring 0 1 withoutPlus == "." then
"0" + withoutPlus
else if substring 0 2 withoutPlus == "-." then
"-0" + removeStrings ["-"] withoutPlus
else
withoutPlus;
in
if !isSpecial && withPrefix != null then
token // {
value = fromJSON withPrefix;
}
else
token
else
token
) tokens;
# Convert pairs of opening and closing tokens to their
# respective collection types, i.e. lists and vectors. Also,
# normalize the forms of nil, which can be written as either
# `nil` or `()`, to empty lists.
#
# For performance reasons, this is implemented as a fold over
# the list of tokens, rather than as a recursive function. To
# keep track of list depth when sublists are parsed, a list,
# `state.acc`, is used as a stack. When entering a sublist, an
# empty list is pushed to `state.acc`, and items in the sublist
# are subsequently added to this list. When exiting the list,
# `state.acc` is popped and the completed list is added to the
# new head of `state.acc`, i.e. the outer list, which we were
# parsing before entering the sublist.
#
# Evaluation of old state is forced with `seq` in a few places,
# because nix otherwise keeps it around, eventually resulting in
# a stack overflow.
parseCollections = tokens:
let
parseToken = state: token:
let
openColl = if token.type == "openParen" then "list" else if token.type == "openBracket" then "vector" else null;
closeColl = if token.type == "closeParen" then "list" else if token.type == "closeBracket" then "vector" else null;
in
if openColl != null then
state // {
acc = [ [] ] ++ seq (head state.acc) state.acc;
inColl = [ openColl ] ++ state.inColl;
depth = state.depth + 1;
line = [ token.line ] ++ state.line;
}
else if closeColl != null then
if (head state.inColl) == closeColl then
let
outerColl = elemAt state.acc 1;
currColl = {
type = closeColl;
value = head state.acc;
line = head state.line;
inherit (state) depth;
};
rest = tail (tail state.acc);
in
state // seq state.acc {
acc = [ (outerColl ++ [ currColl ]) ] ++ rest;
inColl = tail state.inColl;
depth = state.depth - 1;
line = tail state.line;
}
else
throw "Unmatched ${token.type} on line ${toString token.line}"
else if token.type == "symbol" && token.value == "nil" then
let
currColl = head state.acc;
rest = tail state.acc;
emptyList = {
type = "list";
depth = state.depth + 1;
value = [];
};
in
state // seq currColl { acc = [ (currColl ++ [ emptyList ]) ] ++ rest; }
else
let
currColl = head state.acc;
rest = tail state.acc;
in
state // seq currColl { acc = [ (currColl ++ [ token ]) ] ++ rest; };
in
head (builtins.foldl' parseToken { acc = [ [] ]; inColl = [ null ]; depth = -1; line = []; } tokens).acc;
# Handle dotted pair notation, a syntax where the car and cdr
# are represented explicitly. See
# https://www.gnu.org/software/emacs/manual/html_node/elisp/Dotted-Pair-Notation.html#Dotted-Pair-Notation
# for more info.
#
# This mainly entails handling lists that are the cdrs of a
# dotted pairs, concatenating the lexically distinct lists into
# the logical list they actually represent.
#
# For example:
# (a . (b . (c . nil))) -> (a b c)
parseDots = tokens:
let
parseToken = state: token:
if token.type == "dot" then
if state.inList then
state // {
dotted = true;
depthReduction = state.depthReduction + 1;
}
else
throw ''"Dotted pair notation"-dot outside list on line ${toString token.line}''
else if isList token.value then
let
collectionContents = foldl' parseToken {
acc = [];
dotted = false;
inList = token.type == "list";
inherit (state) depthReduction;
} token.value;
in
state // {
acc = state.acc ++ (
if state.dotted then
collectionContents.acc
else
[
(token // {
value = collectionContents.acc;
depth = token.depth - state.depthReduction;
})
]
);
dotted = false;
}
else
state // {
acc = state.acc ++ [token];
};
in
(foldl' parseToken { acc = []; dotted = false; inList = false; depthReduction = 0; } tokens).acc;
parseQuotes = tokens:
let
parseToken = state: token':
let
token =
if isList token'.value then
token' // {
value = (foldl' parseToken { acc = []; quotes = []; } token'.value).acc;
}
else
token';
in
if elem token.type [ "quote" "expand" "slice" "backquote" "function" "record" "byteCode" ] then
state // {
quotes = [ token ] ++ state.quotes;
}
else if state.quotes != [] then
let
quote = value: token:
token // {
inherit value;
};
quotedValue = foldl' quote token state.quotes;
in
state // {
acc = state.acc ++ [ quotedValue ];
quotes = [];
}
else
state // {
acc = state.acc ++ [ token ];
};
in
(foldl' parseToken { acc = []; quotes = []; } tokens).acc;
in
parseQuotes (parseDots (parseCollections (parseValues tokens)));
parseElisp = elisp:
parseElisp' (tokenizeElisp elisp);
fromElisp' = ast:
let
readObject = object:
if isList object.value then
map readObject object.value
else if object.type == "quote" then
["quote" (readObject object.value)]
else if object.type == "backquote" then
["`" (readObject object.value)]
else if object.type == "expand" then
["," (readObject object.value)]
else if object.type == "slice" then
[",@" (readObject object.value)]
else if object.type == "function" then
["#'" (readObject object.value)]
else if object.type == "byteCode" then
["#"] ++ (readObject object.value)
else if object.type == "record" then
["#s"] ++ (readObject object.value)
else
object.value;
in
map readObject ast;
fromElisp = elisp:
fromElisp' (parseElisp elisp);
# Parse an Org mode babel text and return a list of all code blocks
# with metadata.
#
# The general operation is similar to tokenizeElisp', so check its
# documentation for a more in-depth description.
#
# As in tokenizeElisp', the string read from is split into a list of
# its constituent characters, which is then folded over. Each
# character is then used to determine whether we should try to run a
# match for a `#+begin_src` header or `#+end_src` footer, starting
# at the position of the aforementioned character. These matches
# should only be attempted if the current character is `#` and the
# line has nothing but whitespace before it (noted by
# `state.leadingWhitespace`).
#
# When an appropriate match for a header has been found, its
# arguments are further parsed and the result is put into the code
# block's `flags` attribute. The subsequent characters are added to
# the code block's `body` attribute, until a footer is successfully
# matched and the block is added to the list of parsed blocks,
# `state.acc`.
parseOrgModeBabel = text:
let
matchBeginCodeBlock = mkMatcher "(#[+][bB][eE][gG][iI][nN]_[sS][rR][cC])([[:space:]]+).*" orgModeBabelCodeBlockHeaderMaxLength;
matchHeader = mkMatcher "(#[+][hH][eE][aA][dD][eE][rR][sS]?:)([[:space:]]+).*" orgModeBabelCodeBlockHeaderMaxLength;
matchEndCodeBlock = mkMatcher "(#[+][eE][nN][dD]_[sS][rR][cC][^\n]*).*" orgModeBabelCodeBlockHeaderMaxLength;
matchBeginCodeBlockLang = match "([[:blank:]]*)([[:alnum:]][[:alnum:]-]*).*";
matchBeginCodeBlockFlags = mkMatcher "([^\n]*[\n]).*" orgModeBabelCodeBlockHeaderMaxLength;
parseToken = state: char:
let
rest = substring state.pos orgModeBabelCodeBlockHeaderMaxLength text;
beginCodeBlock = matchBeginCodeBlock rest;
header = matchHeader rest;
endCodeBlock = matchEndCodeBlock rest;
language = matchBeginCodeBlockLang rest;
flags = matchBeginCodeBlockFlags rest;
force = expr: seq state.pos (seq state.line expr);
in
if state.skip > 0 then
state // force {
pos = state.pos + 1;
skip = state.skip - 1;
line = if char == "\n" then state.line + 1 else state.line;
leadingWhitespace = char == "\n" || (state.leadingWhitespace && elem char [ " " "\t" "\r" ]);
}
else if char == "#" && state.leadingWhitespace && !state.readBody && beginCodeBlock != null then
state // {
pos = state.pos + 1;
skip = (stringLength beginCodeBlock) - 1;
leadingWhitespace = false;
readLanguage = true;
}
else if char == "#" && state.leadingWhitespace && !state.readBody && header != null then
state // {
pos = state.pos + 1;
skip = (stringLength header) - 1;
leadingWhitespace = false;
readFlags = true;
}
else if state.readLanguage then
if language != null then
state // {
block = state.block // {
language = elemAt language 1;
};
pos = state.pos + 1;
skip = (foldl' (total: string: total + (stringLength string)) 0 language) - 1;
leadingWhitespace = false;
readLanguage = false;
readFlags = true;
readBody = true;
}
else throw "Language missing or invalid for code block on line ${toString state.line}!"
else if state.readFlags then
if flags != null then
let
parseFlag = state: item:
let
prefix = if isString item then substring 0 1 item else null;
in
if elem prefix [ ":" "-" "+" ] then
state // {
acc = state.acc // { ${item} = true; };
flag = item;
}
else if state.flag != null then
state // {
acc = state.acc // { ${state.flag} = item; };
flag = null;
}
else
state;
in
state // {
block = state.block // {
flags =
(foldl'
parseFlag
{ acc = state.block.flags;
flag = null;
inherit (state) line;
}
(fromElisp flags)).acc;
startLineNumber = state.line + 1;
};
pos = state.pos + 1;
skip = (stringLength flags) - 1;
line = if char == "\n" then state.line + 1 else state.line;
leadingWhitespace = char == "\n";
readFlags = false;
}
else throw "Arguments malformed for code block on line ${toString state.line}!"
else if char == "#" && state.leadingWhitespace && endCodeBlock != null then
state // {
acc = state.acc ++ [ state.block ];
block = {
language = null;
body = "";
flags = {};
};
pos = state.pos + 1;
skip = (stringLength endCodeBlock) - 1;
leadingWhitespace = false;
readBody = false;
}
else if state.readBody then
let
mod = state.pos / 100;
newState = {
block = state.block // {
body = state.block.body + char;
};
inherit mod;
pos = state.pos + 1;
line = if char == "\n" then state.line + 1 else state.line;
leadingWhitespace = char == "\n" || (state.leadingWhitespace && elem char [ " " "\t" "\r" ]);
};
in
if mod > state.mod then
state // seq state.block.body (force newState)
else
state // newState
else
state // force {
pos = state.pos + 1;
line = if char == "\n" then state.line + 1 else state.line;
leadingWhitespace = char == "\n" || (state.leadingWhitespace && elem char [ " " "\t" "\r" ]);
};
in
(foldl'
parseToken
{ acc = [];
mod = 0;
pos = 0;
skip = 0;
line = 1;
block = {
language = null;
body = "";
flags = {};
};
leadingWhitespace = true;
readLanguage = false;
readFlags = false;
readBody = false;
}
(stringToCharacters text)).acc;
# Run tokenizeElisp' on all Elisp code blocks (with `:tangle yes`
# set) from an Org mode babel text. If the block doesn't have a
# `tangle` attribute, it's determined by `defaultArgs`.
tokenizeOrgModeBabelElisp' = defaultArgs: text:
let
codeBlocks =
filter
(block:
let
tangle = toLower (block.flags.":tangle" or defaultArgs.":tangle" or "no");
language = toLower block.language;
in
elem language [ "elisp" "emacs-lisp" ]
&& elem tangle [ "yes" ''"yes"'' ])
(parseOrgModeBabel text);
in
foldl'
(result: codeBlock:
result ++ (tokenizeElisp' {
elisp = codeBlock.body;
inherit (codeBlock) startLineNumber;
})
)
[]
codeBlocks;
tokenizeOrgModeBabelElisp =
tokenizeOrgModeBabelElisp' {
":tangle" = "no";
};
parseOrgModeBabelElisp' = defaultArgs: text:
parseElisp' (tokenizeOrgModeBabelElisp' defaultArgs text);
parseOrgModeBabelElisp = text:
parseElisp' (tokenizeOrgModeBabelElisp text);
fromOrgModeBabelElisp' = defaultArgs: text:
fromElisp' (parseOrgModeBabelElisp' defaultArgs text);
fromOrgModeBabelElisp = text:
fromElisp' (parseOrgModeBabelElisp text);
in
{
inherit tokenizeElisp parseElisp fromElisp;
inherit tokenizeElisp' parseElisp' fromElisp';
inherit tokenizeOrgModeBabelElisp parseOrgModeBabelElisp fromOrgModeBabelElisp;
inherit tokenizeOrgModeBabelElisp' parseOrgModeBabelElisp' fromOrgModeBabelElisp';
}