From 794beed3bd645a9cf9dc1d121b8695c8911c3d4f Mon Sep 17 00:00:00 2001 From: John Yang Date: Sat, 16 Dec 2023 09:37:24 +0900 Subject: [PATCH] Refactor parsing from compiler to parser --- makefile | 2 +- .../{Compiler.cpp => CodeGenerator.cpp} | 393 +++++------------- .../{Compiler.hpp => CodeGenerator.hpp} | 78 ++-- src/compile/ParsedSrc.hpp | 18 + src/compile/Parser.cpp | 190 +++++++++ src/compile/Parser.hpp | 49 +++ src/compile/SrcMap.hpp | 14 + src/repl/repl.cpp | 23 +- 8 files changed, 421 insertions(+), 346 deletions(-) rename src/compile/{Compiler.cpp => CodeGenerator.cpp} (53%) rename src/compile/{Compiler.hpp => CodeGenerator.hpp} (76%) create mode 100644 src/compile/ParsedSrc.hpp create mode 100644 src/compile/Parser.cpp create mode 100644 src/compile/Parser.hpp create mode 100644 src/compile/SrcMap.hpp diff --git a/makefile b/makefile index 2db036b..ec27357 100644 --- a/makefile +++ b/makefile @@ -23,7 +23,7 @@ To set it to the /lib folder of this project. endef _DEPS = code/Code.hpp \ - compile/Compiler.hpp \ + compile/CodeGenerator.hpp compile/Parser.hpp \ error/RuntimeError.hpp error/SyntaxError.hpp error/TypeError.hpp \ fn/CPPFnImpls.hpp \ repl/repl.hpp \ diff --git a/src/compile/Compiler.cpp b/src/compile/CodeGenerator.cpp similarity index 53% rename from src/compile/Compiler.cpp rename to src/compile/CodeGenerator.cpp index 1f2d95a..ca854b2 100644 --- a/src/compile/Compiler.cpp +++ b/src/compile/CodeGenerator.cpp @@ -1,9 +1,8 @@ -#include "Compiler.hpp" +#include "CodeGenerator.hpp" #include "../code/OpCode.hpp" #include "../error/SyntaxError.hpp" #include "../runtime/VM.hpp" #include "../sexpr/Casting.hpp" -#include "../sexpr/String.hpp" #include "Grammar.hpp" #include "SrcLoc.hpp" #include @@ -17,199 +16,29 @@ using namespace compile; using namespace runtime; using namespace error; -std::vector Compiler::tokenize(std::vector lines) { - std::vector tokens; - for (unsigned int row{1}; const auto &line : lines) { - auto newTokens = tokenize(line, row); - tokens.insert(tokens.cend(), newTokens.cbegin(), newTokens.cend()); - ++row; - } - return tokens; -} - -std::vector -Compiler::tokenize(std::string line, const unsigned int row) { - std::vector tokens; - std::regex rgx( - "\\\"(?:[^\"\\\\]*(?:\\\\.)?)*\\\"|;|\\(|\\)|,@|,|`|'|[^\\s(),@,`']+" - ); - auto begin = std::sregex_iterator(line.cbegin(), line.cend(), rgx); - auto end = std::sregex_iterator(); - for (std::sregex_iterator i = begin; i != end; ++i) { - std::smatch match = *i; - tokens.push_back(Token{ - match.str(), - { - row, - (unsigned int)match.position(), - } - }); - } - return tokens; -} - -bool Compiler::isNum(const std::string s) { - try { - std::stod(s); - } catch (...) { - return false; - } - return true; -} - -const SExprs *Compiler::parse() { - auto tokens = tokenize(source); - auto it = tokens.cbegin(); - return cast(parseLists(it, tokens.cend())); -} - -const SExpr *Compiler::parseLists(TokenIter &it, const TokenIter &end) { - if (it == end) { - return vm.heap.alloc(); - } - const auto [row, col] = it->srcLoc; - const auto cur = parseList(it, end); - const auto sexprs = vm.heap.alloc(cur, parseLists(it, end)); - srcMap[sexprs] = {row, col}; - return sexprs; -} - -const SExpr *Compiler::parseList(TokenIter &it, const TokenIter &end) { - auto token = *it; - it += 1; - if (token.str == "(") { - const auto sExprs = parseElem(it, end); - srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); - return sExprs; - } - if (token.str == "'" || token.str == "`" || token.str == "," || - token.str == ",@") { - const auto rest = - vm.heap.alloc(parseList(it, end), vm.heap.alloc()); - srcMap.insert({rest, {token.srcLoc.row, token.srcLoc.col}}); - const auto sExprs = vm.heap.alloc(parseAtom(token), rest); - srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); - return sExprs; - } - const auto atom = parseAtom(token); - return atom; -} - -const SExpr *Compiler::parseElem(TokenIter &it, const TokenIter &end) { - auto token = *it; - if (token.str == ")") { - it += 1; - return vm.heap.alloc(); - } else if (token.str == "(") { - it += 1; - const auto first = parseElem(it, end); - const auto rest = parseElem(it, end); - const auto sExprs = vm.heap.alloc(first, rest); - srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); - return sExprs; - } - return parseSexprs(it, end); -} - -const SExpr *Compiler::parseSexprs(TokenIter &it, const TokenIter &end) { - auto token = *it; - const auto first = parseList(it, end); - if (it->str == ".") { - it += 1; - const auto rest = parseList(it, end); - if (it == end) { - handleTypeError(dotGrammer, "datum", rest); - } - it += 1; - const auto sExprs = vm.heap.alloc(first, rest); - srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); - return sExprs; - } - const auto rest = parseElem(it, end); - const auto sExprs = vm.heap.alloc(first, rest); - srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); - return sExprs; -} - -const SExpr *Compiler::parseAtom(Token token) { - if (isNum(token.str)) { - return vm.heap.alloc(std::stod(token.str)); - } - if (token.str.front() == '\"' && token.str.back() == '\"') { - return vm.heap.alloc(token.str); - } - if (token.str == "#") { - return vm.heap.alloc(); - } - if (token.str == "#t") { - return vm.heap.alloc(true); - } - if (token.str == "#f") { - return vm.heap.alloc(false); - } - if (token.str == "'") { - return vm.heap.alloc("quote"); - } - if (token.str == "`") { - return vm.heap.alloc("quasiquote"); - } - if (token.str == ",") { - return vm.heap.alloc("unquote"); - } - if (token.str == ",@") { - return vm.heap.alloc("unquote-splicing"); - } - return vm.heap.alloc(token.str); -} - -void Compiler::handleUnexpectedToken( - const Token &token, const std::string &line -) { - std::stringstream ss; - ss << "Unexpected \"" << token.str << "\"."; - throw SyntaxError(ss.str(), line, token.srcLoc.row, token.srcLoc.col); -} - -Compiler::Compiler( - const std::vector source, - SrcMap sourceLoc, - const SExpr *param, - const SExprs *body, - Compiler &enclosing, - VM &vm +CodeGenerator::CodeGenerator( + runtime::VM &vm, + CodeGenerator &enclosing, + ParsedSrc &parsedSrc, + const sexpr::SExpr *param, + const sexpr::SExprs *body ) : vm(vm), enclosing(enclosing), - source(source), - srcMap(sourceLoc), - curSrcLoc({srcMap[param].row, srcMap[param].col}), + parsedSrc(parsedSrc), + curSrcLoc({parsedSrc.srcMap[param].row, parsedSrc.srcMap[param].col}), param(param), + body(body), arity(countArity()), variadic(isVariadic()), - body(body), - stackOffset(1) { - - if (const auto sExprs = dynCast(param)) { - visitEach(sExprs.value(), [this](const auto sExpr) { - const auto sym = cast(sExpr); - locals.push_back({sym, stackOffset, false}); - stackOffset += 1; - }); - } + stackOffset(1), + proto(generate()) {} - const auto lastParam = last(param); - - if (const auto sym = dynCast(lastParam)) { - locals.push_back({sym.value(), stackOffset, false}); - stackOffset += 1; - } -} - -void Compiler::updateCurSrcLoc(const sexpr::SExprs *sExpr) { - curSrcLoc = srcMap[sExpr]; +void CodeGenerator::updateCurSrcLoc(const sexpr::SExprs *sExpr) { + curSrcLoc = parsedSrc.srcMap[sExpr]; } -std::optional Compiler::resolveLocal(const Sym *sym) { +std::optional CodeGenerator::resolveLocal(const Sym *sym) { auto it = std::find_if(locals.rbegin(), locals.rend(), [&sym](const auto &local) { return *local.symbol == *sym; @@ -221,7 +50,7 @@ std::optional Compiler::resolveLocal(const Sym *sym) { } std::optional -Compiler::resolveUpvalue(Compiler &caller, const Sym *sym) { +CodeGenerator::resolveUpvalue(CodeGenerator &caller, const Sym *sym) { if (!enclosing.has_value()) { return std::nullopt; } @@ -236,7 +65,7 @@ Compiler::resolveUpvalue(Compiler &caller, const Sym *sym) { return std::nullopt; } -std::size_t Compiler::addUpvalue(int idx, bool isLocal) { +std::size_t CodeGenerator::addUpvalue(int idx, bool isLocal) { if (auto it = std::find_if( upValues.cbegin(), upValues.cend(), @@ -251,22 +80,22 @@ std::size_t Compiler::addUpvalue(int idx, bool isLocal) { return upValues.size() - 1; } -bool Compiler::isVariadic() { return isa(last(param)); } +bool CodeGenerator::isVariadic() { return isa(last(param)); } -uint8_t Compiler::countArity() { +uint8_t CodeGenerator::countArity() { if (isa(param) || isa(param)) { return 0; } return visitEach(cast(param), [](const auto) {}); } -code::InstrPtr Compiler::emitConst(const sexpr::SExpr *sExpr) { +code::InstrPtr CodeGenerator::emitConst(const sexpr::SExpr *sExpr) { return code.pushConst(sExpr); } -void Compiler::patchJump(const code::InstrPtr idx) { code.patchJump(idx); } +void CodeGenerator::patchJump(const code::InstrPtr idx) { code.patchJump(idx); } -const SExpr *Compiler::last(const SExpr *sExpr) { +const SExpr *CodeGenerator::last(const SExpr *sExpr) { if (isa(sExpr)) { return sExpr; } @@ -275,7 +104,7 @@ const SExpr *Compiler::last(const SExpr *sExpr) { return last(sExprs->rest); } -unsigned int Compiler::visitEach(const SExpr *sExpr, Visitor visitor) { +unsigned int CodeGenerator::visitEach(const SExpr *sExpr, Visitor visitor) { if (isa(sExpr)) { return 0; } @@ -285,7 +114,7 @@ unsigned int Compiler::visitEach(const SExpr *sExpr, Visitor visitor) { return 1 + visitEach(sExprs->rest, visitor); } -void Compiler::traverse(const SExpr *sExpr, Visitor visitor) { +void CodeGenerator::traverse(const SExpr *sExpr, Visitor visitor) { if (isa(sExpr)) { const auto sexprs = cast(sExpr); traverse(sexprs->first, visitor); @@ -294,23 +123,49 @@ void Compiler::traverse(const SExpr *sExpr, Visitor visitor) { visitor(sExpr); } -void Compiler::compileStmts(const SExpr *sExpr) { +const Prototype *CodeGenerator::generate() { + if (const auto sExprs = dynCast(param)) { + visitEach(sExprs.value(), [this](const auto sExpr) { + const auto sym = cast(sExpr); + locals.push_back({sym, stackOffset, false}); + stackOffset += 1; + }); + } + + const auto lastParam = last(param); + + if (const auto sym = dynCast(lastParam)) { + locals.push_back({sym.value(), stackOffset, false}); + stackOffset += 1; + } + + if (variadic) { + emitCode(OpCode::MAKE_LIST, arity + 1); + } + + emitStmts(body); + emitRet(); + + return vm.heap.alloc(upValues.size(), arity, variadic, code); +} + +void CodeGenerator::emitStmts(const SExpr *sExpr) { emitCode(OpCode::MAKE_NIL); visitEach(sExpr, [this](const auto sExpr) { stackOffset += 1; - compileStmt(sExpr); + emitStmt(sExpr); }); } -void Compiler::compileExprs(const SExpr *sExpr) { +void CodeGenerator::emitExprs(const SExpr *sExpr) { emitCode(OpCode::MAKE_NIL); visitEach(sExpr, [this](const auto sExpr) { emitCode(OpCode::POP_TOP); - compileExpr(sExpr); + emitExpr(sExpr); }); } -void Compiler::compileStmt(const SExpr *sExpr) { +void CodeGenerator::emitStmt(const SExpr *sExpr) { if (matchForm( sExpr, { @@ -318,22 +173,22 @@ void Compiler::compileStmt(const SExpr *sExpr) { {&DEFMACRO_SYM, [this](const auto &matched) { execDefMacro(matched); }}, {&BEGIN_SYM, - [this](const auto &matched) { compileStmts(matched.get()); }}, + [this](const auto &matched) { emitStmts(matched.get()); }}, }, [this, &sExpr](const auto &sym, const auto) { if (vm.env.isMacro(sym.get())) { - compileStmt(execMacro(sExpr)); + emitStmt(execMacro(sExpr)); return; } - compileExpr(sExpr); + emitExpr(sExpr); } )) { return; } - compileExpr(sExpr); + emitExpr(sExpr); } -void Compiler::compileExpr(const SExpr *sExpr) { +void CodeGenerator::emitExpr(const SExpr *sExpr) { if (matchForm( sExpr, {{&DEFINE_SYM, [this](const auto &) { handleInvalidDef(); }}, @@ -343,29 +198,29 @@ void Compiler::compileExpr(const SExpr *sExpr) { {&IF_SYM, [this](const auto &matched) { emitIf(matched); }}, {&LAMBDA_SYM, [this](const auto &matched) { emitLambda(matched); }}, {&BEGIN_SYM, - [this](const auto &matched) { compileExprs(matched.get()); }}}, + [this](const auto &matched) { emitExprs(matched.get()); }}}, [this, &sExpr](const auto sym, const auto) { if (vm.env.isMacro(sym.get())) { - compileExpr(execMacro(sExpr)); + emitExpr(execMacro(sExpr)); return; } - compileCall(cast(sExpr)); + emitCall(cast(sExpr)); } )) { return; }; if (const auto atom = dynCast(sExpr)) { - compileAtom(atom.value()); + emitAtom(atom.value()); return; } - compileCall(cast(sExpr)); + emitCall(cast(sExpr)); } -void Compiler::compileAtom(const Atom *atom) { +void CodeGenerator::emitAtom(const Atom *atom) { if (isa(atom)) { throw error::SyntaxError( "Expected a non-empty list.", - source[curSrcLoc.row - 1], + parsedSrc.source[curSrcLoc.row - 1], curSrcLoc.row, curSrcLoc.col ); @@ -377,11 +232,10 @@ void Compiler::compileAtom(const Atom *atom) { emitCode(OpCode::LOAD_CONST, emitConst(atom)); } -void Compiler::compileCall(const SExprs *sExprs) { - compileExpr(sExprs->first); - const auto argc = visitEach(sExprs->rest, [this](const auto &sExpr) { - compileExpr(sExpr); - }); +void CodeGenerator::emitCall(const SExprs *sExprs) { + emitExpr(sExprs->first); + const auto argc = + visitEach(sExprs->rest, [this](const auto &sExpr) { emitExpr(sExpr); }); try { cast(last(sExprs)); @@ -391,7 +245,7 @@ void Compiler::compileCall(const SExprs *sExprs) { emitCode(OpCode::CALL, argc); } -void Compiler::emitLambda(const MatchedSExpr matched) { +void CodeGenerator::emitLambda(const MatchedSExpr matched) { try { const auto [lambdaParam, lambdaBody] = unpackPartial(matched.get()); @@ -402,18 +256,13 @@ void Compiler::emitLambda(const MatchedSExpr matched) { assertType(last(lambdaParam.get())); } - Compiler compiler( - source, - srcMap, - lambdaParam.get(), - cast(lambdaBody.get()), - *this, - vm + CodeGenerator codeGenerator( + vm, *this, parsedSrc, lambdaParam.get(), cast(lambdaBody.get()) ); - const auto function = compiler.compile(); + const auto function = codeGenerator.getGenerated(); emitCode(OpCode::MAKE_CLOSURE, emitConst(function)); - for (const auto &upValue : compiler.upValues) { + for (const auto &upValue : codeGenerator.upValues) { emitCode(upValue.isLocal ? 1 : 0, upValue.idx); } } catch (error::TypeError &te) { @@ -421,7 +270,7 @@ void Compiler::emitLambda(const MatchedSExpr matched) { } } -void Compiler::emitSym(const sexpr::Sym *sym) { +void CodeGenerator::emitSym(const sexpr::Sym *sym) { if (vm.env.isNatFn(sym)) { emitCode(OpCode::LOAD_CONST, emitConst(vm.env.load(sym))); return; @@ -437,7 +286,7 @@ void Compiler::emitSym(const sexpr::Sym *sym) { emitCode(OpCode::LOAD_SYM, emitConst(sym)); } -void Compiler::emitQuote(const MatchedSExpr matched) { +void CodeGenerator::emitQuote(const MatchedSExpr matched) { try { const auto [expr] = unpack(matched.get()); @@ -447,11 +296,11 @@ void Compiler::emitQuote(const MatchedSExpr matched) { } } -void Compiler::emitDef(const MatchedSExpr matched) { +void CodeGenerator::emitDef(const MatchedSExpr matched) { try { const auto [sym, expr] = unpack(matched.get()); - compileExpr(expr.get()); + emitExpr(expr.get()); if (enclosing.has_value()) { locals.push_back({sym.get(), stackOffset, false}); } else { @@ -462,12 +311,12 @@ void Compiler::emitDef(const MatchedSExpr matched) { } } -void Compiler::execDefMacro(const MatchedSExpr matched) { +void CodeGenerator::execDefMacro(const MatchedSExpr matched) { if (enclosing.has_value()) { const auto [row, col] = curSrcLoc; throw error::SyntaxError( "Invalid syntax for define-macro: must define macros in top level", - source[row - 1], + parsedSrc.source[row - 1], row, col ); @@ -476,22 +325,17 @@ void Compiler::execDefMacro(const MatchedSExpr matched) { const auto [macroSym, macroArgNames, macroBody] = unpackPartial(matched.get()); - Compiler compiler( - source, - srcMap, - macroArgNames.get(), - cast(macroBody.get()), - *this, - vm + CodeGenerator codeGenerator( + vm, *this, parsedSrc, macroArgNames.get(), cast(macroBody.get()) ); - const auto function = compiler.compile(); + const auto function = codeGenerator.getGenerated(); Code def; def.pushCode(OpCode::MAKE_CLOSURE, curSrcLoc.row); def.pushCode(def.pushConst(function)); - for (const auto &upValue : compiler.upValues) { + for (const auto &upValue : codeGenerator.upValues) { def.pushCode(upValue.isLocal ? 1 : 0); def.pushCode(upValue.idx); } @@ -510,11 +354,11 @@ void Compiler::execDefMacro(const MatchedSExpr matched) { } } -void Compiler::emitSet(const MatchedSExpr matched) { +void CodeGenerator::emitSet(const MatchedSExpr matched) { try { const auto [sym, expr] = unpack(matched.get()); - compileExpr(expr.get()); + emitExpr(expr.get()); if (const auto idx = resolveLocal(sym.get()); idx.has_value()) { emitCode(OpCode::SET_STACK, locals[*idx].stackOffset); return; @@ -529,26 +373,26 @@ void Compiler::emitSet(const MatchedSExpr matched) { } } -void Compiler::emitIf(const MatchedSExpr matched) { +void CodeGenerator::emitIf(const MatchedSExpr matched) { try { const auto [test, conseq, alt] = unpack(matched.get()); - compileExpr(test.get()); + emitExpr(test.get()); const auto jifIdx = emitCode(OpCode::POP_JUMP_IF_FALSE, UINT8_MAX, UINT8_MAX) + 1; - compileExpr(conseq.get()); + emitExpr(conseq.get()); const auto jIdx = emitCode(OpCode::JUMP, UINT8_MAX, UINT8_MAX) + 1; patchJump(jifIdx); - compileExpr(alt.get()); + emitExpr(alt.get()); patchJump(jIdx); } catch (error::TypeError &te) { handleTypeError(ifGrammar, te.expected, te.actual); } } -void Compiler::emitRet() { +void CodeGenerator::emitRet() { try { cast(last(body)); } catch (error::TypeError &te) { @@ -564,7 +408,7 @@ void Compiler::emitRet() { emitCode(OpCode::RETURN); } -const SExpr *Compiler::execMacro(const SExpr *sExpr) { +const SExpr *CodeGenerator::execMacro(const SExpr *sExpr) { Code fExpr; const auto sExprs = cast(sExpr); @@ -585,70 +429,43 @@ const SExpr *Compiler::execMacro(const SExpr *sExpr) { const auto res = vm.eval(); traverse(res, [this](const auto &sExpr) { - srcMap.insert({sExpr, curSrcLoc}); + parsedSrc.srcMap.insert({sExpr, curSrcLoc}); }); return res; } -void Compiler::handleInvalidDef() { +void CodeGenerator::handleInvalidDef() { const auto [row, col] = curSrcLoc; throw error::SyntaxError( "Invalid syntax for define: cannot use define as an " "expression", - source[row - 1], + parsedSrc.source[row - 1], row, col ); } -void Compiler::handleTypeError( +void CodeGenerator::handleTypeError( const std::string grammar, const std::string expected, const SExpr *actual ) { std::stringstream ss; ss << "Invalid syntax for " << grammar << "." << std::endl << "Expected " << expected << ", but got " << actual << "."; const auto [row, col] = curSrcLoc; - throw SyntaxError(ss.str(), source[row - 1], row, col); + throw SyntaxError(ss.str(), parsedSrc.source[row - 1], row, col); } -Compiler::Compiler(std::vector source, VM &vm) +CodeGenerator::CodeGenerator(runtime::VM &vm, ParsedSrc &parsedSrc) : vm(vm), gcGuard(vm.heap.pauseGC()), - source(source), + parsedSrc(parsedSrc), curSrcLoc({1, 0}), param(vm.heap.alloc()), + body(parsedSrc.root), arity(0), variadic(false), - body(parse()), - stackOffset(1) {} + stackOffset(1), + proto(generate()) {} -const Prototype *Compiler::compile() { - if (variadic) { - emitCode(OpCode::MAKE_LIST, arity + 1); - } - - compileStmts(body); - emitRet(); - - return vm.heap.alloc(upValues.size(), arity, variadic, code); -} - -void Compiler::verifyLex( - const std::string &line, - const unsigned int curSrcLoc, - unsigned int &openParen, - unsigned int &closedParen -) { - auto tokens = tokenize(line, curSrcLoc); - for (auto it = tokens.cbegin(); it != tokens.cend(); ++it) { - if (openParen == closedParen && it->str == ")") { - handleUnexpectedToken(*it, line); - } - if (it->str == "(") { - openParen += 1; - } else if (it->str == ")") { - closedParen += 1; - } - } -} +const Prototype *CodeGenerator::getGenerated() const { return proto; } diff --git a/src/compile/Compiler.hpp b/src/compile/CodeGenerator.hpp similarity index 76% rename from src/compile/Compiler.hpp rename to src/compile/CodeGenerator.hpp index 2861ccc..2eb622a 100644 --- a/src/compile/Compiler.hpp +++ b/src/compile/CodeGenerator.hpp @@ -1,5 +1,5 @@ -#ifndef LISP_SRC_COMPILE_COMPILER_HPP_ -#define LISP_SRC_COMPILE_COMPILER_HPP_ +#ifndef LISP_SRC_COMPILE_CODEGENERATOR_HPP_ +#define LISP_SRC_COMPILE_CODEGENERATOR_HPP_ #include "../code/Code.hpp" #include "../runtime/GCGuard.hpp" @@ -10,7 +10,7 @@ #include "../sexpr/SExprs.hpp" #include "../sexpr/Sym.hpp" #include "Local.hpp" -#include "Token.hpp" +#include "ParsedSrc.hpp" #include "Upvalue.hpp" #include #include @@ -30,44 +30,28 @@ const sexpr::Sym LAMBDA_SYM("lambda"); const sexpr::Sym DEFINE_SYM("define"); const sexpr::Sym DEFMACRO_SYM("defmacro"); -class Compiler { +class CodeGenerator { private: - using SrcMap = std::unordered_map; - using TokenIter = std::vector::const_iterator; using Visitor = std::function; runtime::VM &vm; const std::optional gcGuard; - const std::optional> enclosing; + const std::optional> enclosing; - std::vector source; - SrcMap srcMap; + ParsedSrc &parsedSrc; SrcLoc curSrcLoc; const sexpr::SExpr *param; - const uint8_t arity; - const bool variadic; - const sexpr::SExprs *body; + const uint8_t arity; + const bool variadic; std::vector locals; std::vector upValues; uint8_t stackOffset; code::Code code; - - static bool isNum(const std::string s); - static std::vector tokenize(std::vector lines); - static std::vector tokenize(std::string line, const unsigned int row); - static void - handleUnexpectedToken(const Token &token, const std::string &line); - - const sexpr::SExprs *parse(); - const sexpr::SExpr *parseLists(TokenIter &it, const TokenIter &end); - const sexpr::SExpr *parseList(TokenIter &it, const TokenIter &end); - const sexpr::SExpr *parseElem(TokenIter &it, const TokenIter &end); - const sexpr::SExpr *parseSexprs(TokenIter &it, const TokenIter &end); - const sexpr::SExpr *parseAtom(Token token); + const sexpr::Prototype *proto; template class MatchedSExpr { private: @@ -83,6 +67,14 @@ class Compiler { } }; + CodeGenerator( + runtime::VM &vm, + CodeGenerator &enclosing, + ParsedSrc &parsedSrc, + const sexpr::SExpr *param, + const sexpr::SExprs *body + ); + template std::tuple, const MatchedSExpr...> unpack(const sexpr::SExpr *sExpr) { @@ -173,19 +165,10 @@ class Compiler { return false; } - Compiler( - const std::vector source, - SrcMap sourceLoc, - const sexpr::SExpr *param, - const sexpr::SExprs *body, - Compiler &enclosing, - runtime::VM &vm - ); - void updateCurSrcLoc(const sexpr::SExprs *sExpr); std::optional resolveLocal(const sexpr::Sym *sym); std::optional - resolveUpvalue(Compiler &caller, const sexpr::Sym *sym); + resolveUpvalue(CodeGenerator &caller, const sexpr::Sym *sym); std::size_t addUpvalue(int idx, bool isLocal); bool isVariadic(); uint8_t countArity(); @@ -205,12 +188,14 @@ class Compiler { const sexpr::SExpr *last(const sexpr::SExpr *sExpr); unsigned int visitEach(const sexpr::SExpr *sExpr, Visitor visitor); void traverse(const sexpr::SExpr *sExpr, Visitor visitor); - void compileStmts(const sexpr::SExpr *sExpr); - void compileExprs(const sexpr::SExpr *sExpr); - void compileStmt(const sexpr::SExpr *sExpr); - void compileExpr(const sexpr::SExpr *sExpr); - void compileAtom(const sexpr::Atom *atom); - void compileCall(const sexpr::SExprs *sExprs); + + const sexpr::Prototype *generate(); + void emitStmts(const sexpr::SExpr *sExpr); + void emitExprs(const sexpr::SExpr *sExpr); + void emitStmt(const sexpr::SExpr *sExpr); + void emitExpr(const sexpr::SExpr *sExpr); + void emitAtom(const sexpr::Atom *atom); + void emitCall(const sexpr::SExprs *sExprs); void emitDef(const MatchedSExpr matched); void emitSet(const MatchedSExpr matched); void emitSym(const sexpr::Sym *sym); @@ -229,16 +214,9 @@ class Compiler { ); public: - Compiler(std::vector source, runtime::VM &vm); + CodeGenerator(runtime::VM &vm, ParsedSrc &parsedSrc); - const sexpr::Prototype *compile(); - - static void verifyLex( - const std::string &line, - const unsigned int lineNum, - unsigned int &openParen, - unsigned int &closedParen - ); + const sexpr::Prototype *getGenerated() const; }; } // namespace compile diff --git a/src/compile/ParsedSrc.hpp b/src/compile/ParsedSrc.hpp new file mode 100644 index 0000000..f9489fd --- /dev/null +++ b/src/compile/ParsedSrc.hpp @@ -0,0 +1,18 @@ +#ifndef LISP_SRC_COMPILE_PARSEDSRC_HPP_ +#define LISP_SRC_COMPILE_PARSEDSRC_HPP_ + +#include "../sexpr/SExprs.hpp" +#include "SrcMap.hpp" +#include + +namespace compile { + +struct ParsedSrc { + std::vector source; + SrcMap srcMap; + const sexpr::SExprs *root; +}; + +} // namespace compile + +#endif diff --git a/src/compile/Parser.cpp b/src/compile/Parser.cpp new file mode 100644 index 0000000..6824134 --- /dev/null +++ b/src/compile/Parser.cpp @@ -0,0 +1,190 @@ +#include "Parser.hpp" +#include "../error/SyntaxError.hpp" +#include "../sexpr/Casting.hpp" +#include "../sexpr/Num.hpp" +#include "../sexpr/SExprs.hpp" +#include "../sexpr/Sym.hpp" +#include "Grammar.hpp" +#include "ParsedSrc.hpp" +#include + +using namespace compile; +using namespace sexpr; +using namespace error; + +bool Parser::isNum(const std::string s) { + try { + std::stod(s); + } catch (...) { + return false; + } + return true; +} + +std::vector Parser::tokenize(std::vector lines) { + std::vector tokens; + for (unsigned int row{1}; const auto &line : lines) { + auto newTokens = tokenize(line, row); + tokens.insert(tokens.cend(), newTokens.cbegin(), newTokens.cend()); + ++row; + } + return tokens; +} + +std::vector Parser::tokenize(std::string line, const unsigned int row) { + std::vector tokens; + std::regex rgx( + "\\\"(?:[^\"\\\\]*(?:\\\\.)?)*\\\"|;|\\(|\\)|,@|,|`|'|[^\\s(),@,`']+" + ); + auto begin = std::sregex_iterator(line.cbegin(), line.cend(), rgx); + auto end = std::sregex_iterator(); + for (std::sregex_iterator i = begin; i != end; ++i) { + std::smatch match = *i; + tokens.push_back(Token{ + match.str(), + { + row, + (unsigned int)match.position(), + } + }); + } + return tokens; +} + +const sexpr::SExprs *Parser::parse() { + auto tokens = tokenize(source); + auto it = tokens.cbegin(); + return cast(parseLists(it, tokens.cend())); +} + +const SExpr *Parser::parseLists(TokenIter &it, const TokenIter &end) { + if (it == end) { + return vm.heap.alloc(); + } + const auto [row, col] = it->srcLoc; + const auto cur = parseList(it, end); + const auto sexprs = vm.heap.alloc(cur, parseLists(it, end)); + srcMap[sexprs] = {row, col}; + return sexprs; +} + +const SExpr *Parser::parseList(TokenIter &it, const TokenIter &end) { + auto token = *it; + it += 1; + if (token.str == "(") { + const auto sExprs = parseElem(it, end); + srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); + return sExprs; + } + if (token.str == "'" || token.str == "`" || token.str == "," || + token.str == ",@") { + const auto rest = + vm.heap.alloc(parseList(it, end), vm.heap.alloc()); + srcMap.insert({rest, {token.srcLoc.row, token.srcLoc.col}}); + const auto sExprs = vm.heap.alloc(parseAtom(token), rest); + srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); + return sExprs; + } + const auto atom = parseAtom(token); + return atom; +} + +const SExpr *Parser::parseElem(TokenIter &it, const TokenIter &end) { + auto token = *it; + if (token.str == ")") { + it += 1; + return vm.heap.alloc(); + } else if (token.str == "(") { + it += 1; + const auto first = parseElem(it, end); + const auto rest = parseElem(it, end); + const auto sExprs = vm.heap.alloc(first, rest); + srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); + return sExprs; + } + return parseSexprs(it, end); +} + +const SExpr *Parser::parseSexprs(TokenIter &it, const TokenIter &end) { + auto token = *it; + const auto first = parseList(it, end); + if (it->str == ".") { + it += 1; + const auto rest = parseList(it, end); + if (it == end) { + std::stringstream ss; + ss << "Invalid syntax for " << dotGrammer << "." << std::endl + << "Expected datum, but got " << rest << "."; + const auto [row, col] = it->srcLoc; + throw SyntaxError(ss.str(), source[row - 1], row, col); + } + it += 1; + const auto sExprs = vm.heap.alloc(first, rest); + srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); + return sExprs; + } + const auto rest = parseElem(it, end); + const auto sExprs = vm.heap.alloc(first, rest); + srcMap.insert({sExprs, {token.srcLoc.row, token.srcLoc.col}}); + return sExprs; +} + +const SExpr *Parser::parseAtom(Token token) { + if (isNum(token.str)) { + return vm.heap.alloc(std::stod(token.str)); + } + if (token.str.front() == '\"' && token.str.back() == '\"') { + return vm.heap.alloc(token.str); + } + if (token.str == "#") { + return vm.heap.alloc(); + } + if (token.str == "#t") { + return vm.heap.alloc(true); + } + if (token.str == "#f") { + return vm.heap.alloc(false); + } + if (token.str == "'") { + return vm.heap.alloc("quote"); + } + if (token.str == "`") { + return vm.heap.alloc("quasiquote"); + } + if (token.str == ",") { + return vm.heap.alloc("unquote"); + } + if (token.str == ",@") { + return vm.heap.alloc("unquote-splicing"); + } + return vm.heap.alloc(token.str); +} + +void Parser::verifyLex( + const std::string &line, + const unsigned int curSrcLoc, + unsigned int &openParen, + unsigned int &closedParen +) { + auto tokens = tokenize(line, curSrcLoc); + for (auto it = tokens.cbegin(); it != tokens.cend(); ++it) { + if (openParen == closedParen && it->str == ")") { + std::stringstream ss; + ss << "Unexpected \"" << it->str << "\"."; + throw SyntaxError(ss.str(), line, it->srcLoc.row, it->srcLoc.col); + } + if (it->str == "(") { + openParen += 1; + } else if (it->str == ")") { + closedParen += 1; + } + } +} + +Parser::Parser(runtime::VM &vm, const std::vector source) + : vm(vm), + gcGuard(vm.heap.pauseGC()), + source(source), + parsedSrc({source, srcMap, parse()}) {} + +const ParsedSrc Parser::getParsed() const { return parsedSrc; } diff --git a/src/compile/Parser.hpp b/src/compile/Parser.hpp new file mode 100644 index 0000000..563464b --- /dev/null +++ b/src/compile/Parser.hpp @@ -0,0 +1,49 @@ +#ifndef LISP_SRC_COMPILE_PARSER_HPP_ +#define LISP_SRC_COMPILE_PARSER_HPP_ + +#include "../runtime/GCGuard.hpp" +#include "../runtime/VM.hpp" +#include "ParsedSrc.hpp" +#include "Token.hpp" +#include + +namespace compile { + +class Parser { +private: + using TokenIter = std::vector::const_iterator; + + runtime::VM &vm; + runtime::GCGuard gcGuard; + + const std::vector source; + SrcMap srcMap; + ParsedSrc parsedSrc; + + static bool isNum(const std::string s); + static std::vector tokenize(std::vector lines); + static std::vector tokenize(std::string line, const unsigned int row); + + const sexpr::SExprs *parse(); + const sexpr::SExpr *parseLists(TokenIter &it, const TokenIter &end); + const sexpr::SExpr *parseList(TokenIter &it, const TokenIter &end); + const sexpr::SExpr *parseElem(TokenIter &it, const TokenIter &end); + const sexpr::SExpr *parseSexprs(TokenIter &it, const TokenIter &end); + const sexpr::SExpr *parseAtom(Token token); + +public: + static void verifyLex( + const std::string &line, + const unsigned int lineNum, + unsigned int &openParen, + unsigned int &closedParen + ); + + Parser(runtime::VM &vm, const std::vector source); + + const ParsedSrc getParsed() const; +}; + +} // namespace compile + +#endif diff --git a/src/compile/SrcMap.hpp b/src/compile/SrcMap.hpp new file mode 100644 index 0000000..3586e38 --- /dev/null +++ b/src/compile/SrcMap.hpp @@ -0,0 +1,14 @@ +#ifndef LISP_SRC_COMPILE_SRCMAP_HPP_ +#define LISP_SRC_COMPILE_SRCMAP_HPP_ + +#include "../sexpr/SExpr.hpp" +#include "SrcLoc.hpp" +#include + +namespace compile { + +using SrcMap = std::unordered_map; + +} + +#endif diff --git a/src/repl/repl.cpp b/src/repl/repl.cpp index ff3b266..5763da0 100644 --- a/src/repl/repl.cpp +++ b/src/repl/repl.cpp @@ -1,5 +1,6 @@ #include "repl.hpp" -#include "../compile/Compiler.hpp" +#include "../compile/CodeGenerator.hpp" +#include "../compile/Parser.hpp" #include "../error/RuntimeError.hpp" #include "../error/SyntaxError.hpp" #include "../runtime/VM.hpp" @@ -38,7 +39,7 @@ bool getConsoleInput( continue; } add_history(line.c_str()); - Compiler::verifyLex(line, lines.size() + 1, openParen, closedParen); + Parser::verifyLex(line, lines.size() + 1, openParen, closedParen); lines.push_back(line + " "); if (openParen == closedParen) { return true; @@ -55,7 +56,7 @@ bool getFileInput(std::istream &in, std::vector &lines) { line = std::regex_replace( line, std::regex("(\\\\\"|\"(?:\\\\\"|[^\"])*\")|(;.*$)"), "$1" ); - Compiler::verifyLex(line, lines.size() + 1, openParen, closedParen); + Parser::verifyLex(line, lines.size() + 1, openParen, closedParen); lines.push_back(line + " "); } return lines.size() > 0; @@ -75,8 +76,12 @@ int execFile(const std::string filePath, VM &vm) { try { if (getFileInput(fs, lines)) { { - Compiler compiler(lines, vm); - vm.load(compiler.compile()); + Parser parser(vm, lines); + auto parsedSrc = parser.getParsed(); + CodeGenerator codeGenerator(vm, parsedSrc); + auto main = codeGenerator.getGenerated(); + + vm.load(main); } vm.eval(); } @@ -127,8 +132,12 @@ int repl::repl() { try { if (getConsoleInput(lines, "lisp> ", " ... ")) { { - Compiler compiler(lines, vm); - vm.load(compiler.compile()); + Parser parser(vm, lines); + auto parsedSrc = parser.getParsed(); + CodeGenerator codeGenerator(vm, parsedSrc); + auto main = codeGenerator.getGenerated(); + + vm.load(main); } const auto res = vm.eval(); if (!isa(res)) {