serene-golang-implementation/bootstrap/pkg/core/parser.go

/*
 Serene --- Yet an other Lisp

Copyright (c) 2020  Sameer Rahmani <lxsameer@gnu.org>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

package core

// Parser Implementation:
// * `ParseToAST` is the entry point of the parser
// * It's a manual parser with look ahead factor of (1)
// * It parses the input string to a tree of `IExpr`s
//
// TODOs:
// * Add a shortcut for anonymous functions similar to `#(...)` clojure
//   syntax
// * Add the support for strings
// * Add the support for keywords
// * Add a shortcut for the `deref` function like `@x` => `(deref x)`
// * A line of comment at the end of a list definition causes a syntax error.
//   We need to fix it. For example:
//   (asdb xyz
//   ;; problematic comment line
//   )
// This will fail. The reason is that we call `readExpr` in `readList`, and in
// `readExpr`, when we read a line of comment, we jump to a label and try to
// read another expr, which in this case reads the end of the list and throws
// an error.

import (
	"strings"
	"unicode"

	"serene-lang.org/bootstrap/pkg/ast"
)

// validChars lists the non-alphanumeric characters that can be used in a
// symbol. Letters and digits are accepted separately by isValidForSymbol.
var validChars = []rune{'!', '$', '%', '&', '*', '+', '-', '.', '~', '/', ':', '<', '=', '>', '?', '@', '^', '_'}

// IParsable defines the common interface which any parser has to implement.
// Characters are exchanged as pointers to one-character strings.
type IParsable interface {
	// next reads the next character in the buffer and moves the position
	// forward. When skipWhitespace is true it jumps over whitespace and the
	// conceptual equivalents of whitespace such as '\n'.
	next(skipWhitespace bool) *string

	// peek is similar to `next` but it does not consume the character it
	// returns, so an immediate `next` call after a `peek` will read the same
	// character (and move the position), and a series of `peek` calls will
	// read the same character over and over again.
	peek(skipWhitespace bool) *string

	// back moves the position back by one in the buffer.
	back()

	// GetLocation returns the current position in the buffer.
	GetLocation() int
	// GetSource returns the source description that nodes are created with.
	GetSource() *ast.Source
	// Buffer returns a pointer to the underlying character buffer.
	Buffer() *[]string
}

// StringParser is an implementation of the IParsable that operates on strings.
// To put it simply it parses input strings.
type StringParser struct {
	// buffer holds the input as a slice of strings (ParseToAST fills it with
	// strings.Split(input, "")), pos is the index of the next element to read
	// and source is the name that GetSource reports as the namespace.
	buffer []string
	pos    int
	source string

	// This slice holds the boundaries of lines in the buffer. Basically
	// each element determines the position at which a line ends (the index
	// right after its "\n") and the line number directly maps to the
	// position of its boundary in the slice.
	lineIndex []int
}

// Implementing IParsable for StringParser ---

// updateLineIndex inspects the character at `pos` and, when it is an end of
// line, records the boundary of that line (the index right after the "\n") in
// the line index. A boundary equal to the most recently recorded one is not
// added again, so re-reading the same newline does not duplicate it.
func (sp *StringParser) updateLineIndex(pos int) {
	if pos >= len(sp.buffer) || sp.buffer[pos] != "\n" {
		return
	}

	// Including the \n itself
	boundary := pos + 1
	if count := len(sp.lineIndex); count > 0 && sp.lineIndex[count-1] == boundary {
		return
	}

	sp.lineIndex = append(sp.lineIndex, boundary)
}

// next consumes and returns the next character in the buffer, or nil when the
// buffer is exhausted. With skipWhitespace set, separator characters are
// consumed and skipped until a non-separator is found. Every consumed
// character is passed through updateLineIndex.
func (sp *StringParser) next(skipWhitespace bool) *string {
	for sp.pos < len(sp.buffer) {
		current := sp.buffer[sp.pos]
		sp.updateLineIndex(sp.pos)
		sp.pos++

		if !skipWhitespace || !isSeparator(&current) {
			return &current
		}
	}

	return nil
}

// isSeparator returns a boolean indicating whether the given character `c`
// starts with a separator or not. In a Lisp whitespace and some other
// characters are conceptually the same and we need to treat them the same as
// well. Only the first rune of `*c` is examined; space, tab, newline and form
// feed are separators.
//
// A nil pointer or an empty string is not a separator.
func isSeparator(c *string) bool {
	// Guard the empty string as well: indexing the rune slice of "" would
	// panic.
	if c == nil || *c == "" {
		return false
	}

	switch []rune(*c)[0] {
	case ' ', '\t', '\n', '\f':
		return true
	}

	return false
}

// peek returns the character at the current position without consuming it, or
// nil when the buffer is exhausted. With skipWhitespace set, leading separator
// characters ARE consumed (the position moves past them and the line index is
// updated) and the first non-separator is returned without being consumed.
func (sp *StringParser) peek(skipWhitespace bool) *string {
	for sp.pos < len(sp.buffer) {
		current := sp.buffer[sp.pos]
		if !skipWhitespace || !isSeparator(&current) {
			return &current
		}

		sp.updateLineIndex(sp.pos)
		sp.pos++
	}

	return nil
}

// back moves the position one character backwards. It is a no-op when the
// position is already at the beginning of the buffer.
func (sp *StringParser) back() {
	if sp.pos <= 0 {
		return
	}

	sp.pos--
}

// GetLocation returns the current position of the parser, which is the index
// in the buffer of the character that the next read will return.
func (sp *StringParser) GetLocation() int {
	return sp.pos
}

// GetSource builds a new `ast.Source` describing the input of this parser.
// The returned value points at the parser's own buffer and line index (it
// does not copy them) and carries the parser's source name as its `NS`.
func (sp *StringParser) GetSource() *ast.Source {
	return &ast.Source{
		Buffer:    &sp.buffer,
		NS:        sp.source,
		LineIndex: &sp.lineIndex,
	}
}

// Buffer returns a pointer to the parser's character buffer (not a copy).
func (sp *StringParser) Buffer() *[]string {
	return &sp.buffer
}

// END: IParsable ---

// makeErrorAtPoint builds a syntax `IError` whose node is a single point at
// the parser's current position. `msg` and `a` are forwarded to
// `MakeSyntaxErrorf` as the format string and its arguments.
func makeErrorAtPoint(p IParsable, msg string, a ...interface{}) IError {
	return MakeSyntaxErrorf(
		MakeSinglePointNode(p.GetSource(), p.GetLocation()),
		msg,
		a...,
	)
}

// makeErrorFromError is a function which wraps a Golang error in an IError
// that points at the current position of the given parser. The error is
// passed to makeErrorAtPoint as the only argument of a "%w" format.
// NOTE(review): "%w" is only understood by fmt.Errorf; whether it renders as
// intended depends on how MakeSyntaxErrorf formats its message — confirm.
func makeErrorFromError(parser IParsable, e error) IError {
	return makeErrorAtPoint(parser, "%w", e)
}

// contains reports whether the rune `c` is an element of the slice `s`.
// A nil or empty slice contains nothing.
func contains(s []rune, c rune) bool {
	found := false
	for i := 0; i < len(s) && !found; i++ {
		found = s[i] == c
	}

	return found
}

// isValidForSymbol reports whether the character `char` may appear in a
// symbol: it has to be one of `validChars`, a letter or a digit.
//
// The decision is made on the first rune of `char`. (Converting only the
// first byte would misclassify every multi-byte UTF-8 character, because the
// lead byte would be judged instead of the character itself.) An empty string
// is never valid.
func isValidForSymbol(char string) bool {
	if char == "" {
		return false
	}

	c := []rune(char)[0]
	return contains(validChars, c) || unicode.IsLetter(c) || unicode.IsDigit(c)
}

// readKeyword reads a keyword. It reads a raw symbol from the current
// position and turns it into a keyword whose name is the symbol's name
// prefixed with ":". The keyword is created with a node made from the symbol
// expression.
func readKeyword(parser IParsable) (IExpr, IError) {
	rawSym, err := readRawSymbol(parser)
	if err != nil {
		return nil, err
	}

	kwNode := MakeNodeFromExpr(rawSym)
	kwName := ":" + rawSym.(*Symbol).String()
	return MakeKeyword(kwNode, kwName)
}

// readRawSymbol reads a symbol from the current position forward.
//
// The first character has to be valid for a symbol (see isValidForSymbol),
// otherwise a syntax error pointing at the current position is returned. The
// symbol then extends over every following valid character; the first invalid
// character is pushed back so the caller can read it again.
//
// The node of the resulting symbol spans from the position of its first
// character to the position right after its last one. The start is recorded
// up front instead of being derived from the byte length of the name, which
// would be wrong for names containing multi-byte characters.
func readRawSymbol(parser IParsable) (IExpr, IError) {
	c := parser.peek(false)
	if c == nil {
		return nil, makeErrorAtPoint(parser, "unexpected enf of file while parsing a symbol")
	}

	// Does the symbol start with a valid character or not
	if !isValidForSymbol(*c) {
		return nil, makeErrorAtPoint(parser,
			"unexpected character: got '%s', expected a symbol at %d",
			*c,
			parser.GetLocation(),
		)
	}

	start := parser.GetLocation()
	parser.next(false)

	var name strings.Builder
	name.WriteString(*c)

	// read the rest of the symbol
	for {
		c := parser.next(false)
		if c == nil {
			break
		}

		if !isValidForSymbol(*c) {
			// Not part of the symbol, leave it for the caller.
			parser.back()
			break
		}

		name.WriteString(*c)
	}

	node := MakeNode(parser.GetSource(), start, parser.GetLocation())
	sym, err := MakeSymbol(node, name.String())
	if err != nil {
		err.SetNode(&node)
		return nil, err
	}

	return sym, nil
}

// readString reads a string literal up to (and including) its closing double
// quote. It starts reading at the current position, so the caller is expected
// to have consumed the opening quote already.
//
// The supported escape sequences are \n, \t, \r, \\ and \"; any other
// character after a backslash is a syntax error. Reaching the end of the
// input before the closing quote, including right after a backslash, is a
// syntax error as well.
//
// The node of the resulting string spans from the position where scanning
// began to the position right after the closing quote. The start is recorded
// up front because the length of the unescaped value does not match the
// number of characters consumed once escapes or multi-byte characters are
// involved.
func readString(parser IParsable) (IExpr, IError) {
	start := parser.GetLocation()

	var str strings.Builder
	for {
		c := parser.next(false)
		if c == nil {
			return nil, makeErrorAtPoint(parser, "reached end of file while scanning a string")
		}

		switch *c {
		case "\"":
			node := MakeNode(parser.GetSource(), start, parser.GetLocation())
			return MakeString(node, str.String()), nil

		case "\\":
			esc := parser.next(false)
			if esc == nil {
				// The input ended in the middle of an escape sequence.
				return nil, makeErrorAtPoint(parser, "reached end of file while scanning a string")
			}

			switch *esc {
			case "n":
				str.WriteString("\n")
			case "t":
				str.WriteString("\t")
			case "r":
				str.WriteString("\r")
			case "\\":
				str.WriteString("\\")
			case "\"":
				str.WriteString("\"")
			default:
				return nil, makeErrorAtPoint(parser, "Unsupported escape character: \\%s", *esc)
			}

		default:
			str.WriteString(*c)
		}
	}
}

// readNumber reads a number with respect to its sign and whether it is an
// integer or a double. When `neg` is true the number is read as a negative
// one (the caller has already consumed the '-' sign).
//
// Digits and at most one '.' are collected; a second '.' or any other
// character that is valid for a symbol is a syntax error. Any other character
// ends the number and is pushed back for the caller.
func readNumber(parser IParsable, neg bool) (IExpr, IError) {
	sawDot := false
	literal := ""
	if neg {
		literal = "-"
	}

scan:
	for {
		c := parser.next(false)
		if c == nil {
			break
		}

		tok := *c
		switch {
		case tok == "." && sawDot:
			return nil, makeErrorAtPoint(parser, "a double with more that one '.' ???")

		case tok == ".":
			sawDot = true
			literal += tok

		case unicode.IsDigit(rune(tok[0])):
			literal += tok

		case isValidForSymbol(tok):
			return nil, makeErrorAtPoint(parser, "Illegal token while scanning for a number.")

		default:
			// End of the number, leave the character for the caller.
			parser.back()
			break scan
		}
	}

	// Start from the current point and stretch the node back over the literal.
	n := MakeSinglePointNode(parser.GetSource(), parser.GetLocation())
	n.location.DecStart(len(literal))

	value, err := MakeNumberFromStr(n, literal, sawDot)
	if err != nil {
		return nil, makeErrorFromError(parser, err)
	}

	return value, nil
}

// readSymbol reads a symbol and returns the appropriate type of expression
// based on how it starts:
//   - a '"' starts a string (the quote is consumed before calling readString),
//   - a digit starts a number,
//   - a '-' directly followed by a digit starts a negative number,
//   - anything else, including a '-' that is not followed by a digit, is read
//     as a raw symbol.
//
// Reaching the end of the input before anything could be read is a syntax
// error. A '-' that is the very last character of the input is read as a raw
// symbol.
func readSymbol(parser IParsable) (IExpr, IError) {
	c := parser.peek(false)
	if c == nil {
		return nil, makeErrorAtPoint(parser, "unexpected end of file while scanning a symbol")
	}

	first := *c
	switch {
	case first == "\"":
		parser.next(false)
		return readString(parser)

	case unicode.IsDigit(rune(first[0])):
		return readNumber(parser, false)

	case first == "-":
		parser.next(true)

		// The input may end right after the sign, in which case there is
		// nothing to look at and the '-' is just a symbol.
		following := parser.peek(false)
		if following != nil && unicode.IsDigit(rune((*following)[0])) {
			return readNumber(parser, true)
		}

		// Unread '-'
		parser.back()
		return readRawSymbol(parser)
	}

	return readRawSymbol(parser)
}

// readList reads a List recursively. The opening parenthesis has to be
// consumed by the caller; elements are read with readExpr until the matching
// ")" is found. Reaching the end of the input before that is a syntax error.
//
// Comments between the elements are consumed here rather than by readExpr.
// readExpr skips a comment and then insists on reading an expression, so a
// comment placed right before the closing ")" would otherwise make it trip
// over the ")" and report a syntax error.
//
// The node of the list covers its elements, widened by one position on each
// side. For an empty list a single point node at the current position is
// used as the base instead.
func readList(parser IParsable) (IExpr, IError) {
	list := []IExpr{}

	for {
		c := parser.peek(true)
		if c == nil {
			return nil, makeErrorAtPoint(parser, "reaching the end of file while reading a list")
		}

		if *c == ")" {
			parser.next(true)
			break
		}

		if *c == ";" {
			// Drop the ';' and the rest of the line, then look again.
			parser.next(false)
			if _, err := readComment(parser); err != nil {
				return nil, err
			}
			continue
		}

		val, err := readExpr(parser)
		if err != nil {
			return nil, err
		}
		list = append(list, val)
	}

	node := MakeNodeFromExprs(list)
	if node == nil {
		n := MakeSinglePointNode(parser.GetSource(), parser.GetLocation())
		node = &n
	}

	node.location.DecStart(1)
	node.location.IncEnd(1)

	return MakeList(*node, list), nil
}

// readComment consumes the rest of a comment: every character up to and
// including the next "\n", or up to the end of the input. It produces no
// expression and no error, so it always returns (nil, nil).
func readComment(parser IParsable) (IExpr, IError) {
	c := parser.next(false)
	for c != nil && *c != "\n" {
		c = parser.next(false)
	}

	return nil, nil
}

// readQuotedExpr reads a quoted expression (like 'something) by replacing the
// quote with a call to the `quote` special form, so
// 'something => (quote something).
//
// The quote character has to be consumed by the caller. If the input ends
// before an expression could be read, a syntax error is returned instead of
// building a list around a missing expression.
func readQuotedExpr(parser IParsable) (IExpr, IError) {
	expr, err := readExpr(parser)
	if err != nil {
		return nil, err
	}

	// readExpr reports the end of the input as (nil, nil).
	if expr == nil {
		return nil, makeErrorAtPoint(parser, "end of file while reading a quoted expression")
	}

	symNode := MakeNode(parser.GetSource(), parser.GetLocation(), parser.GetLocation())
	sym, err := MakeSymbol(symNode, "quote")
	if err != nil {
		err.SetNode(&symNode)
		return nil, err
	}

	listElems := []IExpr{
		sym,
		expr,
	}

	listNode := MakeNodeFromExprs(listElems)
	if listNode == nil {
		n := MakeSinglePointNode(parser.GetSource(), parser.GetLocation())
		listNode = &n
	}

	// NOTE(review): DecStart(1) followed by IncStart(1) looks like it leaves
	// the start where it was; readList uses DecStart/IncEnd. Confirm what
	// the intended span is.
	listNode.location.DecStart(1)
	listNode.location.IncStart(1)
	return MakeList(*listNode, listElems), nil
}

// readUnquotedExpr reads different unquoting expressions from their short
// representations.
// ~a => (unquote a)
// ~@a => (unquote-splicing a)
// Note: `unquote` and `unquote-splicing` are not global functions or special
// forms, they are bound to quasiquoted expressions only.
//
// The '~' has to be consumed by the caller. If the input ends right after it,
// or before the unquoted expression could be read, a syntax error is
// returned.
func readUnquotedExpr(parser IParsable) (IExpr, IError) {
	c := parser.peek(true)
	if c == nil {
		return nil, makeErrorAtPoint(parser, "end of file while reading an unquoted expression")
	}

	// The node of the symbol is taken before the optional '@' is consumed.
	node := MakeNode(parser.GetSource(), parser.GetLocation(), parser.GetLocation())

	name := "unquote"
	if *c == "@" {
		parser.next(true)
		name = "unquote-splicing"
	}

	var (
		sym  IExpr
		expr IExpr
		err  IError
	)

	sym, err = MakeSymbol(node, name)
	if err != nil {
		err.SetNode(&node)
		return nil, err
	}

	expr, err = readExpr(parser)
	if err != nil {
		return nil, err
	}

	// readExpr reports the end of the input as (nil, nil).
	if expr == nil {
		return nil, makeErrorAtPoint(parser, "end of file while reading an unquoted expression")
	}

	listElems := []IExpr{sym, expr}

	listNode := MakeNodeFromExprs(listElems)

	// listNode won't be nil in this case but it doesn't
	// mean we shouldn't check
	if listNode == nil {
		n := MakeSinglePointNode(parser.GetSource(), parser.GetLocation())
		listNode = &n
	}

	// NOTE(review): DecStart(1) followed by IncStart(1) looks like it leaves
	// the start where it was; readList uses DecStart/IncEnd. Confirm what
	// the intended span is.
	listNode.location.DecStart(1)
	listNode.location.IncStart(1)
	return MakeList(*listNode, listElems), nil
}

// readQuasiquotedExpr reads the expression following a backquote and wraps it
// in a call to the `quasiquote` macro, so `something => (quasiquote something).
//
// The backquote has to be consumed by the caller. If the input ends before an
// expression could be read, a syntax error is returned instead of building a
// list around a missing expression.
func readQuasiquotedExpr(parser IParsable) (IExpr, IError) {
	expr, err := readExpr(parser)
	if err != nil {
		return nil, err
	}

	// readExpr reports the end of the input as (nil, nil).
	if expr == nil {
		return nil, makeErrorAtPoint(parser, "end of file while reading a quasiquoted expression")
	}

	node := MakeNode(parser.GetSource(), parser.GetLocation(), parser.GetLocation())
	sym, err := MakeSymbol(node, "quasiquote")
	if err != nil {
		err.SetNode(&node)
		return nil, err
	}

	listElems := []IExpr{sym, expr}
	listNode := MakeNodeFromExprs(listElems)
	// listNode won't be nil in this case but it doesn't
	// mean we shouldn't check
	if listNode == nil {
		n := MakeSinglePointNode(parser.GetSource(), parser.GetLocation())
		listNode = &n
	}

	// NOTE(review): DecStart(1) followed by IncStart(1) looks like it leaves
	// the start where it was; readList uses DecStart/IncEnd. Confirm what
	// the intended span is.
	listNode.location.DecStart(1)
	listNode.location.IncStart(1)

	return MakeList(*listNode, listElems), nil
}

// readExpr reads one expression from the input. It is the dispatcher of the
// parser: it consumes the first non-whitespace character and hands over to
// the reader function that belongs to it. Comments are skipped and reading
// starts over after them. Any character without a dedicated reader is pushed
// back and read by readSymbol.
//
// When the input is exhausted it returns (nil, nil).
func readExpr(parser IParsable) (IExpr, IError) {
	for {
		c := parser.next(true)
		if c == nil {
			// We're done reading
			return nil, nil
		}

		switch *c {
		case "'":
			return readQuotedExpr(parser)
		case "~":
			return readUnquotedExpr(parser)
		case "`":
			return readQuasiquotedExpr(parser)
		case "(":
			return readList(parser)
		case ";":
			// Skip the comment and try again with whatever follows it.
			readComment(parser)
			continue
		case ":":
			return readKeyword(parser)
			// case "[":
			// 	readVector(parser)
			// case "{":
			// 	readMap(parser)
		}

		parser.back()
		return readSymbol(parser)
	}
}

// ParseToAST is the entry function to the reader/parser which converts the
// `input` string to a `Block` of code, using `ns` as the name of the source.
// A block by itself is not something available to the language. It's just an
// abstraction for an ordered collection of expressions. It doesn't have
// anything to do with the concept of blocks from other programming languages.
//
// Expressions are read one after another until the input is exhausted. The
// first error stops the parsing and is returned without a block.
func ParseToAST(ns string, input string) (*Block, IError) {
	parser := &StringParser{
		buffer: strings.Split(input, ""),
		source: ns,
	}

	var block Block
	for {
		expr, err := readExpr(parser)
		if err != nil {
			return nil, err
		}

		// A nil expression marks the end of the input.
		if expr == nil {
			return &block, nil
		}

		block.Append(expr)
	}
}