/*
* Kai Programming Language Compiler
* by John Ryland
* (c) Copyright 2019
*/
#include <fstream>
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cctype>
#include <string>
#include <vector>
#include <map>
enum class TokenType
{
// [a-z|A-Z] [a-z|A-Z|0-9|_] *
Identifier,
// literals
IntegerLiteral,
FloatLiteral,
StringLiteral,
// Specific parsed identifiers
Type, // ?
Constant, // ?
Function, // ?
Module, // ?
// Language specific tokens
Keyword,
Symbol,
// end of file
EndOfFile
};
enum class SymbolType
{
// Unary
Not,
Tilda,
// Binary
Add,
Subtract,
Multiply,
Divide,
Modulus,
LogicalAnd,
LogicalOr,
BinaryAnd,
BinaryOr,
BinaryXor,
Equality,
InEquality,
LessThan,
LessThanOrEqual,
GreaterThan,
GreaterThanOrEqual,
Assignment,
Produces,
// Other
OpenSquareBracket,
ClosSquareeBracket,
OpenBracket,
CloseBracket,
OpenBrace,
CloseBrace,
QuestionMark,
Colon,
SemiColon,
Period,
Comma,
At,
EnumSize
};
enum class ExpressionType
{
Constant,
Variable,
Unary,
Binary,
Ternary
};
enum class KeywordType
{
Module,
Information,
Imports,
Interface,
Implementation,
Tests,
Type,
Const,
Enum,
Struct,
Function,
Var,
If,
Else,
For,
While,
EnumSize
};
enum class TypeType
{
Basic,
Enum,
Struct,
Function,
Alias
};
enum class BasicType
{
Int8,
Int16,
Int32,
Int64,
UInt8,
UInt16,
UInt32,
UInt64,
Flt32,
Flt64,
String,
Meta, // template type
EnumSize
};
enum class ControlFlowType
{
If,
For,
While,
DoWhile,
Continue,
Break,
Switch
};
enum class StatementType
{
Block,
ControlFlow,
Expression
};
enum class ConstExpressionType
{
IntegerLiteral,
FloatLiteral,
StringLiteral,
CompileTimeConstant, // expression which can be evaluated at compile-time
TypeName
};
enum class UnaryExpressionType
{
/* C unary operators
PreIncrement,
PreDecrement,
PostIncrement,
PostDecrement,
AddressOf,
Indirection,
Sizeof,
Cast,
*/
Positive,
Negative,
OnesComplement,
Not
};
enum class BinaryExpressionType
{
Add,
Subtract,
Multiply,
Divide,
Modulus,
LogicalAnd,
LogicalOr,
BinaryAnd,
BinaryOr,
BinaryXor,
Equality,
InEquality,
LessThan,
LessThanOrEqual,
GreaterThan,
GreaterThanOrEqual,
};
enum class TernaryExpressionType
{
Conditional
};
struct TypeDeclaration;
struct Expression;
struct Statement;
struct FunctionBlock;
struct Module;
struct Identifier
{
std::string m_name;
};
struct StringLiteral
{
std::string m_string;
};
struct IntegerLiteral
{
bool m_negative;
uint64_t m_value;
};
struct FloatLiteral
{
double m_value;
};
struct EnumValues
{
uint64_t m_nextValue;
std::map<std::string,uint64_t> m_values;
};
struct Variable
{
std::string m_name;
TypeDeclaration* m_type;
};
struct StructMembers
{
std::vector<Variable> m_members;
};
struct FunctionDeclaration
{
std::string m_name;
TypeDeclaration* m_returnType;
std::vector<Variable> m_parameters;
};
struct TypeDeclaration
{
std::string m_name;
// TODO: C++ lacks a typesafe union
// C++17 adds variant which tries to address this, but the syntax is very clunky, bolted on using boost style library code.
// Ideally language should have built-in a proper typesafe union which kind of fuses together what is in C++ an enum and a union.
// Below is the pattern that is being used here, an enum and union. The type needs to identify which member of the union is valid.
// In the Kai language, perhaps this can be done right. Need to figure out the correct syntax for it.
// These are called 'tagged unions' as opposed to an anonymous union.
// Seems rust has something similar to what would be expected.
TypeType m_type;
union
{
BasicType m_basic;
EnumValues* m_enum;
StructMembers* m_struct;
TypeDeclaration* m_alias; // perhaps strong and weak options?
FunctionDeclaration* m_function;
};
};
/*
Safer pointer concepts
void foo(char* a_pointer);
char* normal_pointer = (char*)malloc(1024);
foo(normal_pointer); // the passed in pointer contains no information about the size
Perhaps better:
template <typename T>
struct Pointer {
Pointer(T* p, size_t s) { m_p = p; m_s = s; }
T* m_p;
size_t m_s;
}
void foo(Pointer<char> a_pointer);
Pointer<char> normal_pointer = Pointer<char>((char*)malloc(1024), 1024);
foo(normal_pointer); // the passed in pointer now contains information about the size
It is possible to build in pointer arithmatic bounds checking.
It is also possible to create spans/views which also follow a similar style.
A span/view however needs to understand object lifetime.
Lifetime management is another concern. Possibly the Qt way with a hierarchy of ownership
from a root object is one way.
In terms of VM and functional style programming and multi-threading, was thinking about
the idea that in the VM it could enforce that when no-synchronization is being performed,
that a thread of execution just has read-only access to heap and constant data, but can
read/write to it's stack. Multiple threads can run concurrently just working off their
stack. Then there can be some synchronization primitives that can join the work of the
threads giving them controlled access to the head where work can be transferred from the
stacks to the heap and the heap generally modified. The heap might be generational.
*/
/*
Tagged union concept
*/
class tagged_union
{
enum
{
m_basic_tag,
m_enum_tag,
m_struct_tag,
m_alias_tag,
m_function_tag,
} m_type;
union
{
BasicType m_basic;
EnumValues* m_enum;
StructMembers* m_struct;
TypeDeclaration* m_alias;
FunctionDeclaration* m_function;
};
public:
tagged_union& operator=(const BasicType& b) { m_type = m_basic_tag; m_basic = b; return *this; }
tagged_union& operator=(EnumValues* e) { m_type = m_enum_tag; m_enum = e; return *this; }
// ...
// How to access? :
// runtime-checking way:
BasicType& get_basic() { assert(m_type == m_basic_tag); return m_basic; }
// Better would be to statically analyse and not even need the tag!!
// other way is access is only via a switch/, but
/*
heavy approach is using inheritance:
class tagged_union { virtual void f() {} }; // base class with a v-table
class basicC : base { BasicType m_basic; }
class enumC : base { EnumValues* m_enum; }
class structC : base { StructMembers* m_struct; }
// ...
tagged_union* thing; // needs to be pointer
// setting:
delete thing; thing = new basicC{ b };
then access using dynamic_cast
if (dynamic_cast<basicC*>(thing))
... // it is of BasicType
*/
/*
// syntax in other languages:
tree match
{
case Node(x, _, _) => println("top level node value: " + x)
case Leaf => println("top level node is a leaf")
}
*/
/*
// desired syntax:
// declared same as a C style union
union Shape
{
struct { int size; } Square;
struct { int width; int height; } Rectangle;
struct { int radius; } Circle;
};
// alternative idea, same as C style enum
// perhaps use the name 'variant' to make it clear it is different
variant Shape
{
Square = struct { int size; },
Rectangle = struct { int width; int height; },
Circle = struct { int radius; },
}
void func(Shape shape)
{
// access similar to an enum, this is the only provided way to access the members
switch (value in shape)
{
case Square: printf("square : %i\n", value.size);
case Rectangle: printf("rectangle: %i\n", value.width);
case Circle: printf("circle: %i\n", value.radius);
}
shape =
}
// C++ has optional, also rust, basically something like this is possible with tagged unions
template <typename T>
variant Optional
{
Some = T,
None = void,
}
template <typename T>
variant Result
{
Okay = T,
Error = struct { int reasonCode; string errorMessage; },
}
*/
private:
};
/*
// Bad code: returns stack memory
char *itoa(int i)
{
char buf[20], *z;
sprintf(buf,"%d",i);
z = buf;
return z;
}
However if the language was designed to, it might be possible
for the caller to take ownership of stack from the callee, as if
the caller had just declared some local variables within its scope.
This perhaps could apply all the way up the call stack. It would
allow things like stings and small allocations to happen from the
stack instead of needing the heap for everything.
Probably this could be implemented using alloca and some classes.
Ideally something like this would work and the buffer for string
would be on the stack
string itoa(int i)
{
string buf(20);
sprintf(buf, "%d", i);
return buf;
}
void caller()
{
string s;
s = itoa(100); // have compiler do some combination of RVO and stack transfer
}
Extending on this idea, there will be areas of the stack which will be used
for the lifetime of a block, and others which will have their lifetimes extended
by being allocated from a parent block or a parent's parent's block etc.
This is part of the concept of RVO, a returned value could be in the parent's
stack frame and it was as if that memory was past by reference to the function
and it allocated in to it, however potentially without the constructor being
run in the parent etc.
This is fine for fixed sized values. Anything of a varying size would otherwise
need to be in the heap, which is pointed out above, but is not a general limitation
of the stack. alloca is an example how varying sized objects can be allocated from
the stack. So the challenge is to do something like RVO with varying sized objects.
When the stack is used to store short lived objects like loop variables and temporary
variables, and is also being used to store preserved values between blocks/stack frames
for larger variable sized allocations, then something has to give. Either the short
lived objects can't be reclaimed, or the compiler has to move memory around to keep
these short lived objects in memory after the preserved ones. This rearranging could
happen at the end of a block, but anytime memory is moved, it means it is not
possible to have pointers to things, or need another level of indirection. Essentially
we create something like an incremental GC which is incrementally working within
the thread and at an increment of each block. Generally it will slow down the program.
So instead of moving memory around, perhaps the concept could be to have two stacks.
One a preserved stack, and another a temporary or scratch stack.
The preserved stack is for RVO type objects, ones with varying size. The size can only
be varied in the frame in which the object is constructed, not in subsequent frames.
That is so that future objects won't need to be moved in memory to accommodate the change
in size. This is achieved by virtue of the rule that parameters to functions are passed
by const reference, so deeper function calls shouldn't be modifying it anyway.
Problem is if multiple values in a current frame are varying in size. This is a tricky
case, but conceptually that might need to be either one thing or in different frames/
blocks.
Also another concern is guaranteeing safety. The preserved stack could run out of space.
For a high safety critical system, it would be nice to make guarantees about the
maximum amount of memory which could be used and to be able to say that the preserved
stack size is large enough for any case. This might mean making a program specify the
upper bound on the size of varying sized objects. It might also mean not allowing
recursion or perhaps some way to bound the recursion as part of the language.
*/
struct ConstExpression
{
ConstExpressionType m_type;
union
{
// TODO: enum value - has a value and a type
// TODO: constant variable
IntegerLiteral* m_integer;
FloatLiteral* m_float;
StringLiteral* m_string;
// CompileTimeConstant m_expression;
TypeDeclaration* m_typeDeclaration;
};
};
struct ConstantVariable
{
std::string m_name;
TypeDeclaration* m_type;
// TODO: const structs, where value is inside { }
// eg: const vec3 origin := { 0, 0, 0 };
ConstExpression m_value;
};
struct IfStatement
{
Expression* m_condition;
Statement* m_statement;
Statement* m_else;
};
struct ForStatement
{
Statement* m_initializer;
Expression* m_condition;
Statement* m_statement;
Statement* m_increment;
};
struct WhileStatement
{
Expression* m_condition;
Statement* m_statement;
};
struct SwitchStatement
{
Expression* m_condition;
std::map<ConstExpression,Expression*> m_cases;
};
struct ControlFlow
{
ControlFlowType m_type;
union
{
IfStatement* m_if;
ForStatement* m_for;
WhileStatement* m_while;
WhileStatement* m_doWhile;
SwitchStatement* m_switch;
};
};
struct UnaryExpression
{
UnaryExpressionType m_operator;
Expression* m_first;
};
struct BinaryExpression
{
BinaryExpressionType m_operator;
Expression* m_first;
Expression* m_second;
};
struct TernaryExpression
{
TernaryExpressionType m_operator;
Expression* m_first;
Expression* m_second;
Expression* m_third;
};
// These are RHS expressions
struct Expression
{
ExpressionType m_type;
union
{
ConstExpression m_constant;
Variable* m_variable;
Identifier* m_variableId; // TODO: place-holder member, until can do contexted lookup of variable by name
UnaryExpression m_unary;
BinaryExpression m_binary;
TernaryExpression m_ternary;
};
};
struct Statement
{
StatementType m_type;
union
{
FunctionBlock* m_block;
ControlFlow* m_controlFlow;
Expression* m_expression;
};
};
struct FunctionBlock
{
std::vector<Variable> m_localVariables;
std::vector<Statement> m_statements;
std::vector<Expression> m_expressions;
};
struct Function
{
FunctionDeclaration* m_interface;
FunctionBlock m_implementation;
};
struct ModuleInformation
{
std::string m_description;
std::string m_version;
std::string m_copyright;
std::string m_date;
std::string m_author;
std::string m_authorURL;
std::string m_authorURLToPublicKey;
std::string m_authorDigitalSignature;
std::string m_checksum;
std::string m_interfaceHash;
std::string m_implementationHash;
std::string m_digitalSignature; // online digital timestamp notary service?
std::string m_history;
std::string m_profile; // 8-bit,16-bit,32-bit,no-float,no-mmu,no-IO,low-ram,...
// scheme where generate checksum/hash of a file
// author has publish public key inside DNS or somewhere public
// author signs file/hash with private key and appends
// anyone can verify that only the author was the creator of this file
// however anyone can strip this digital signature and repeat the process with their own key pair
// what is needed is to also timestamp it in a verifiable way that can see who published first
// https://github.com/trbs/rfc3161ng/
// actually perhaps author doesn't need to have a key-pair. It may be enough that they put their
// name on it, and the time-stamping server is signing the hash of the file which include details
// of authorship.
// I guess that doesn't prevent someone from attributing something to someone else.
// If the author wants to sign it, say to prove it came from their reputable company, then they
// can sign it too. This might be authorship signature.
};
struct ModuleImports
{
std::map<std::string,Module*> m_imports;
};
struct ModuleInterface
{
// TODO: should they be pointers to the decls in CompilerState?
std::vector<TypeDeclaration*> m_types;
std::vector<ConstantVariable*> m_constants;
std::vector<FunctionDeclaration*> m_functions;
};
struct ModuleImplementation
{
ModuleInterface m_details;
// TODO: should be pointers to the decls in CompilerState?
std::map<std::string,Function> m_functions;
};
struct ModuleTests
{
std::map<std::string,Function> m_tests;
};
struct Module
{
std::string m_name;
std::string m_fileName;
ModuleInformation m_information;
ModuleImports m_imports;
ModuleInterface m_interface;
ModuleImplementation m_implementation;
ModuleTests m_tests;
};
struct Program
{
std::string m_name;
std::string m_fileName;
ModuleInformation m_information;
ModuleImports m_imports;
Function m_mainFunction;
};
struct Token
{
uint32_t m_lineNumber;
uint32_t m_columnNumber;
TokenType m_tokenType;
union
{
Identifier* m_identifier;
IntegerLiteral* m_integerLiteral;
FloatLiteral* m_floatLiteral;
StringLiteral* m_stringLiteral;
ConstantVariable* m_constant; // ?
TypeDeclaration* m_type; // ?
FunctionDeclaration* m_function; // ?
Module* m_module; // ?
KeywordType m_keyword;
SymbolType m_symbol;
};
};
struct CompilerState
{
// TODO: nothing is namespaced here. perhaps multi-maps, but would be better to avoid
// depends if everything needs to be explicitly scoped when used, or can have a 'using' keyword.
std::map<std::string,TokenType> m_tokens;
std::map<std::string,Identifier> m_identifiers;
std::map<std::string,IntegerLiteral> m_integerLiteralTable;
std::map<std::string,FloatLiteral> m_floatLiteralTable;
std::map<std::string,StringLiteral> m_stringLiteralTable;
// TODO: perhaps these belong in the modules
std::map<std::string,ConstantVariable> m_constants;
std::map<std::string,EnumValues> m_enums;
std::map<std::string,StructMembers> m_structs;
std::map<std::string,TypeDeclaration> m_types;
std::map<std::string,FunctionDeclaration> m_functions;
std::map<std::string,Module> m_modules;
Program m_program;
// language grammar
std::map<std::string,KeywordType> m_keywordTable;
std::map<std::string,SymbolType> m_symbolTable;
// Lexical parsing state
bool m_insideMultiLineComment;
bool m_insideMultiLineString;
// Function pointers
std::function<std::string(void)> m_getInput;
std::function<bool(void)> m_getEof;
std::function<Token(void)> m_getToken;
std::function<void(void)> m_parser;
};
std::string basicTypeToString(BasicType a_basicType)
{
switch (a_basicType)
{
case BasicType::Int8: return "int8";
case BasicType::Int16: return "int16";
case BasicType::Int32: return "int32";
case BasicType::Int64: return "int64";
case BasicType::UInt8: return "uint8";
case BasicType::UInt16: return "uint16";
case BasicType::UInt32: return "uint32";
case BasicType::UInt64: return "uint64";
case BasicType::Flt32: return "flt32";
case BasicType::Flt64: return "flt64";
case BasicType::String: return "string";
case BasicType::Meta: return "type";
case BasicType::EnumSize: return "";
}
}
std::string symbolToString(SymbolType a_operator)
{
switch (a_operator)
{
case SymbolType::Not: return "!";
case SymbolType::Tilda: return "~";
case SymbolType::Add: return "+";
case SymbolType::Subtract: return "-";
case SymbolType::Multiply: return "*";
case SymbolType::Divide: return "/";
case SymbolType::Modulus: return "%";
case SymbolType::LogicalAnd: return "&&";
case SymbolType::LogicalOr: return "||";
case SymbolType::BinaryAnd: return "&";
case SymbolType::BinaryOr: return "|";
case SymbolType::BinaryXor: return "^";
case SymbolType::Equality: return "==";
case SymbolType::InEquality: return "!=";
case SymbolType::LessThan: return "<";
case SymbolType::LessThanOrEqual: return "<=";
case SymbolType::GreaterThan: return ">";
case SymbolType::GreaterThanOrEqual: return ">=";
case SymbolType::Assignment: return ":=";
case SymbolType::Produces: return "=>";
case SymbolType::OpenSquareBracket: return "[";
case SymbolType::ClosSquareeBracket: return "]";
case SymbolType::OpenBracket: return "(";
case SymbolType::CloseBracket: return ")";
case SymbolType::OpenBrace: return "{";
case SymbolType::CloseBrace: return "}";
case SymbolType::QuestionMark: return "?";
case SymbolType::Colon: return ":";
case SymbolType::SemiColon: return ";";
case SymbolType::Period: return ".";
case SymbolType::Comma: return ",";
case SymbolType::At: return "@";
case SymbolType::EnumSize: return "";
}
}
std::string keywordToString(KeywordType a_keyword)
{
switch (a_keyword)
{
case KeywordType::Module: return "module";
case KeywordType::Information: return "information";
case KeywordType::Imports: return "imports";
case KeywordType::Interface: return "interface";
case KeywordType::Implementation: return "implementation";
case KeywordType::Tests: return "tests";
case KeywordType::Type: return "type";
case KeywordType::Const: return "const";
case KeywordType::Enum: return "enum";
case KeywordType::Struct: return "struct";
case KeywordType::Function: return "function";
case KeywordType::Var: return "var";
case KeywordType::If: return "if";
case KeywordType::Else: return "else";
case KeywordType::For: return "for";
case KeywordType::While: return "while";
case KeywordType::EnumSize: return "";
}
}
void AddBasicTypeToCompilerState(CompilerState& a_state, BasicType a_type)
{
std::string typeString = basicTypeToString(a_type);
a_state.m_tokens[typeString] = TokenType::Type;
a_state.m_types[typeString] = TypeDeclaration{ typeString, TypeType::Basic, a_type };
}
void AddSymbolToCompilerState(CompilerState& a_state, SymbolType a_symbol)
{
std::string symbolString = symbolToString(a_symbol);
a_state.m_tokens[symbolString] = TokenType::Symbol;
a_state.m_symbolTable[symbolString] = a_symbol;
}
void AddKeywordToCompilerState(CompilerState& a_state, KeywordType a_keyword)
{
std::string keywordString = keywordToString(a_keyword);
a_state.m_tokens[keywordString] = TokenType::Keyword;
a_state.m_keywordTable[keywordString] = a_keyword;
}
template <typename T>
class EnumIterator
{
public:
EnumIterator()
: m_value(T::EnumSize)
{
}
EnumIterator(T a_value)
: m_value(a_value)
{
}
T operator*(void) const
{
return m_value;
}
void operator++()
{
m_value = static_cast<T>(static_cast<int>(m_value) + 1);
}
bool operator!=(EnumIterator rhs)
{
return m_value != rhs.m_value;
}
private:
T m_value;
};
template <typename T>
EnumIterator<T> begin(EnumIterator<T>)
{
return EnumIterator<T>(static_cast<T>(0));
}
template <typename T>
EnumIterator<T> end(EnumIterator<T>)
{
return EnumIterator<T>(T::EnumSize);
}
CompilerState InitializeCompilerState()
{
CompilerState state;
state.m_insideMultiLineComment = false;
// Currently not-supported
state.m_insideMultiLineString = false;
/*
constexpr int startLine1 = __LINE__;
AddBasicTypeToCompilerState(state, BasicType::Int8);
static_assert((__LINE__ - startLine1 - 1) == static_cast<int>(BasicType::EnumSize), "Not all BasicType enum cases handled");
*/
// Register basic types
for (BasicType basicType: EnumIterator<BasicType>())
{
AddBasicTypeToCompilerState(state, basicType);
}
// Register symbols
for (SymbolType symbol: EnumIterator<SymbolType>())
{
AddSymbolToCompilerState(state, symbol);
}
// Register keywords
for (KeywordType keyword: EnumIterator<KeywordType>())
{
AddKeywordToCompilerState(state, keyword);
}
return state;
}
void parser(CompilerState& a_state, Token& a_token)
{
// TODO: from this stream of tokens, we need to build a syntax tree / parse expressions etc.
switch (a_token.m_tokenType)
{
case TokenType::Identifier: printf("i: %s\n", a_token.m_identifier->m_name.c_str()); break;
case TokenType::Type: printf("t: %s\n", a_token.m_type->m_name.c_str()); break;
case TokenType::Function: printf("f: %s\n", a_token.m_function->m_name.c_str()); break;
case TokenType::Constant: printf("c: %s\n", a_token.m_constant->m_name.c_str()); break;
case TokenType::IntegerLiteral: printf("n: %c%llu\n", a_token.m_integerLiteral->m_negative ? '-' : '+',
a_token.m_integerLiteral->m_value); break;
case TokenType::FloatLiteral: printf("f: %lf\n", a_token.m_floatLiteral->m_value); break;
case TokenType::StringLiteral: printf("s: %s\n", a_token.m_stringLiteral->m_string.c_str()); break;
case TokenType::Module: printf("m: %s\n", a_token.m_module->m_name.c_str()); break;
case TokenType::Keyword: printf("k: %s\n", keywordToString(a_token.m_keyword).c_str()); break;
case TokenType::Symbol: printf("o: %s\n", symbolToString(a_token.m_symbol).c_str()); break;
case TokenType::EndOfFile: printf("EOF\n"); break;
}
}
void dumpToken(const Token& a_token)
{
switch (a_token.m_tokenType)
{
case TokenType::Identifier: printf("i: %s\n", a_token.m_identifier->m_name.c_str()); break;
case TokenType::Type: printf("t: %s\n", a_token.m_type->m_name.c_str()); break;
case TokenType::Function: printf("f: %s\n", a_token.m_function->m_name.c_str()); break;
case TokenType::Constant: printf("c: %s\n", a_token.m_constant->m_name.c_str()); break;
case TokenType::IntegerLiteral: printf("n: %c%llu\n", a_token.m_integerLiteral->m_negative ? '-' : '+',
a_token.m_integerLiteral->m_value); break;
case TokenType::FloatLiteral: printf("f: %lf\n", a_token.m_floatLiteral->m_value); break;
case TokenType::StringLiteral: printf("s: %s\n", a_token.m_stringLiteral->m_string.c_str()); break;
case TokenType::Module: printf("m: %s\n", a_token.m_module->m_name.c_str()); break;
case TokenType::Keyword: printf("k: %s\n", keywordToString(a_token.m_keyword).c_str()); break;
case TokenType::Symbol: printf("o: %s\n", symbolToString(a_token.m_symbol).c_str()); break;
case TokenType::EndOfFile: printf("EOF\n"); break;
}
}
bool scanForEndOfComment(size_t& a_idx, const char* a_lineBuffer)
{
while (a_lineBuffer[a_idx] != '\n' && a_lineBuffer[a_idx] != '\0')
{
if (a_lineBuffer[a_idx] == '*')
{
a_idx++;
if (a_lineBuffer[a_idx] == '/')
{
a_idx++;
return false;
}
a_idx--;
}
a_idx++;
}
return true;
}
// <letter> ::= [a-ZA-Z]
// <digit> ::= [0-9]
// <id> ::= <letter> { <letter> | <digit> | "_" }
// <digits> ::= <digit> { <digit> }
// <integer literal> ::= <digits>
// <string literal> ::= "\"" { < a char or escape char > } "\""
// <real literal> ::= <digits> "." <digits> [ "e" [ <sign> ] <digits>] <string literal> ::= "\"" { < a char or escape char > } "\""
// <literal> ::= <integer literal> | <real literal> | <string literal>
// <operators> ::= "+" | "-" | "*" | "%" | "=" | "!=" | "<" | ">" | "<=" | ">=" | "(" | ")" | "[" | "]" | ":=" | "." | "," | ";" | ":" |
// "!" | "||" | "|" | "&" | "&&" | "^" | "{" | "}"
// <keywords> ::= "if" | "then" | "else" | "of" | "while" | "do" | "begin" | "end" | "var" | "array" | "procedure" | "function" | "program" | "assert"
// <predefined id> ::= "Boolean" | "false" | "integer" | "read" | "real" | "size" | "string" | "true" | "writeln"
bool tokenFromString(CompilerState& a_state, std::string& a_string, Token& newToken, uint32_t& column)
{
// TODO: replace a_string[x] with str[x]
const char* str = a_string.c_str();
// ignore new-lines
if (str[0] == '\n' && str[1] == '\0')
{
a_string = "";
return false;
}
if (a_state.m_insideMultiLineComment)
{
size_t idx = 0;
a_state.m_insideMultiLineComment = scanForEndOfComment(idx, str);
if (a_state.m_insideMultiLineComment)
{
// ignore lines that are inside of a multi-line comment
a_string = "";
return false;
}
a_string = (a_string.length() == idx) ? "" : a_string.substr(idx);
column += static_cast<uint32_t>(idx);
return false;
}
// currently multi-line strings are not supported
a_state.m_insideMultiLineString = false;
char* uint_end;
uint64_t asUInt = std::strtoull(str, &uint_end, 0);
char* flt_end;
double asDouble = std::strtod(str, &flt_end);
std::string token;
size_t end = 0;
// TODO: all these parsings need validating of the token (string, id, int, float)
if (isalpha(a_string[0]))
{
while (isalnum(a_string[end]) || a_string[end] == '_')
{
end++;
}
// might be a type or a module, so will look-up against a_state
newToken.m_tokenType = TokenType::Identifier;
}
else if (isdigit(a_string[0]))
{
if (uint_end >= flt_end)
{
newToken.m_tokenType = TokenType::IntegerLiteral;
}
else
{
newToken.m_tokenType = TokenType::FloatLiteral;
}
end = std::max(uint_end, flt_end) - str;
}
else if (a_string[0] == '"')
{
newToken.m_tokenType = TokenType::StringLiteral;
end = a_string.find('"', 1);
if (end == std::string::npos)
{
printf("line: %i:%i bad string literal, couldn't find closing quotes: %s\n", newToken.m_lineNumber, newToken.m_columnNumber, a_string.c_str());
exit(0);
}
end++;
}
else
{
if (a_string[0] == '/')
{
if (a_string[1] == '/')
{
a_string = "";
return false;
}
else if (a_string[1] == '*')
{
size_t idx = 2;
a_state.m_insideMultiLineComment = scanForEndOfComment(idx, str);
if (a_state.m_insideMultiLineComment)
{
a_string = "";
return false;
}
a_string = (a_string.length() == idx) ? "" : a_string.substr(idx);
column += static_cast<uint32_t>(idx);
return false;
}
}
// symbol
while (!isalnum(a_string[end]) && a_string[end] != 0)
{
end++;
}
while (end && !a_state.m_tokens.count(a_string.substr(0, end)))
{
end--;
}
if (end == 0)
{
printf("line %i:%i unrecognized character sequence / symbol: %s\n", newToken.m_lineNumber, newToken.m_columnNumber, a_string.c_str());
exit(0);
}
assert(a_state.m_tokens.count(a_string.substr(0, end)));
}
token = a_string.substr(0, end);
a_string = a_string.substr(end);
column += static_cast<uint32_t>(end);
// printf("got token : %s\n", token.c_str());
// Handle existing/same tokens as previously parsed
if (!a_state.m_tokens.count(token))
{
a_state.m_tokens[token] = newToken.m_tokenType;
switch (newToken.m_tokenType)
{
case TokenType::Identifier: a_state.m_identifiers[token] = Identifier{ token }; break;
case TokenType::IntegerLiteral: a_state.m_integerLiteralTable[token] = { false, asUInt }; break;
case TokenType::FloatLiteral: a_state.m_floatLiteralTable[token] = FloatLiteral{ asDouble }; break;
case TokenType::StringLiteral: a_state.m_stringLiteralTable[token] = // strips quotes from string
StringLiteral{ token.substr(1, token.length()-2) }; break;
default: break;
}
}
else
{
newToken.m_tokenType = a_state.m_tokens[token];
}
switch (newToken.m_tokenType)
{
case TokenType::Identifier: newToken.m_identifier = &a_state.m_identifiers[token]; break;
case TokenType::Type: newToken.m_type = &a_state.m_types[token]; break;
case TokenType::Function: newToken.m_function = &a_state.m_functions[token]; break;
case TokenType::Constant: newToken.m_constant = &a_state.m_constants[token]; break;
case TokenType::IntegerLiteral: newToken.m_integerLiteral = &a_state.m_integerLiteralTable[token]; break;
case TokenType::FloatLiteral: newToken.m_floatLiteral = &a_state.m_floatLiteralTable[token]; break;
case TokenType::StringLiteral: newToken.m_stringLiteral = &a_state.m_stringLiteralTable[token]; break;
case TokenType::Module: newToken.m_module = &a_state.m_modules[token]; break;
case TokenType::Keyword: newToken.m_keyword = a_state.m_keywordTable[token]; break;
case TokenType::Symbol: newToken.m_symbol = a_state.m_symbolTable[token]; break;
case TokenType::EndOfFile: break;
}
return true;
}
// Returns the number of characters trimmed
uint32_t trimLeadingWhiteSpace(std::string& str)
{
const std::string whitespace = " \t\n\v\f\r";
size_t first = str.find_first_not_of(whitespace);
first = first == std::string::npos ? str.length() : first;
str = str.substr(first);
return static_cast<uint32_t>(first);
}
/*
void push_tokenizer(CompilerState& a_state, std::ifstream& a_input)
{
uint32_t lineNumber = 1;
while (!a_input.eof())
{
std::string lineBuffer;
std::getline(a_input, lineBuffer);
uint32_t column = 1 + trimLeadingWhiteSpace(lineBuffer);
while (!lineBuffer.empty())
{
Token token{ lineNumber, column };
// token generator
if (tokenFromString(a_state, lineBuffer, token, column))
{
// token consumer
a_state.m_parser(a_state, token);
}
column += trimLeadingWhiteSpace(lineBuffer);
}
lineNumber++;
}
}
*/
struct TokenizerState
{
uint32_t m_lineNumber = 0;
uint32_t m_column = 1;
std::string m_lineBuffer;
};
Token getNextToken(CompilerState& a_state, TokenizerState& a_tokenizer)
{
Token token{ a_tokenizer.m_lineNumber, a_tokenizer.m_column };
while (!a_tokenizer.m_lineBuffer.empty())
{
if (tokenFromString(a_state, a_tokenizer.m_lineBuffer, token, a_tokenizer.m_column))
{
a_tokenizer.m_column += trimLeadingWhiteSpace(a_tokenizer.m_lineBuffer);
return token;
}
a_tokenizer.m_column += trimLeadingWhiteSpace(a_tokenizer.m_lineBuffer);
}
if (a_state.m_getEof())
{
token.m_tokenType = TokenType::EndOfFile;
return token;
}
a_tokenizer.m_lineNumber++;
a_tokenizer.m_lineBuffer = a_state.m_getInput();
a_tokenizer.m_column = 1 + trimLeadingWhiteSpace(a_tokenizer.m_lineBuffer);
return getNextToken(a_state, a_tokenizer);
}
/*
void tokenizer(CompilerState& a_state)
{
while (true)
{
Token token = a_state.m_getToken();
if (token.m_tokenType == TokenType::EndOfFile)
{
return;
}
// token consumer
a_state.m_parser(a_state, token);
}
}
*/
void dumpTokens(CompilerState& a_state)
{
while (true)
{
Token token = a_state.m_getToken();
if (token.m_tokenType == TokenType::EndOfFile)
{
return;
}
dumpToken(token);
}
}
bool tokenIs(Token& a_token, TokenType a_type)
{
return a_token.m_tokenType == a_type;
}
bool tokenIs(Token& a_token, KeywordType a_keyword)
{
return a_token.m_tokenType == TokenType::Keyword && a_token.m_keyword == a_keyword;
}
bool tokenIs(Token& a_token, SymbolType a_symbol)
{
return a_token.m_tokenType == TokenType::Symbol && a_token.m_symbol == a_symbol;
}
bool tokenIs(Token& a_token, const std::string& a_string)
{
return a_token.m_tokenType == TokenType::Identifier && a_token.m_identifier->m_name == a_string;
}
#define tokenAssert(cond, error) \
if (!(cond)) \
{ \
printf("input line %i:%i, error parsing token of type %i, %s.\n", token.m_lineNumber, token.m_columnNumber, token.m_tokenType, error); \
return false; \
}
template <typename T>
bool valid(const T& a_arg)
{
return false;
}
template <>
bool valid(const std::string& a_string)
{
return !a_string.empty();
}
struct GenericProcessElement
{
// Because can not have virtual templated functions
// we use this mode for polymorphism
enum class Mode
{
Parse,
Validate
};
// working state
bool done = false;
Mode mode;
// captured state
bool& okay;
Token& token;
CompilerState& a_state;
GenericProcessElement(bool& b, Token& t, CompilerState& s, Mode m)
: okay(b), token(t), a_state(s), mode(m) {}
// generic lambda emulation
template <typename EL, typename T1, typename T2>
void operator()(EL elem, T1 func, T2& var, bool validate = false)
{
if (mode == Mode::Parse)
{
if (!done && tokenIs(token, elem))
{
done = true;
okay = func(a_state, token, var);
}
}
else if (mode == Mode::Validate)
{
if (okay && validate)
{
okay = valid(var);
}
}
}
};
template <typename T, typename T2>
bool processSection(CompilerState& a_state, T a_blockProcessor, T2 a_terminal)
{
Token token;
bool okay = true;
GenericProcessElement processElement(okay, token, a_state, GenericProcessElement::Mode::Parse);
while (okay)
{
token = a_state.m_getToken();
processElement.done = false;
a_blockProcessor(processElement);
if (tokenIs(token, a_terminal))
{
break;
}
tokenAssert(processElement.done, "invalid token");
}
return okay;
}
template <typename T>
bool processBlock(CompilerState& a_state,
Token& a_token,
T a_blockProcessor,
SymbolType a_open = SymbolType::OpenBrace,
SymbolType a_close = SymbolType::CloseBrace)
{
bool okay = false;
a_token = a_state.m_getToken();
if (tokenIs(a_token, a_open))
{
okay = processSection(a_state, a_blockProcessor, a_close);
}
if (okay)
{
GenericProcessElement validateElement(okay, a_token, a_state, GenericProcessElement::Mode::Validate);
a_blockProcessor(validateElement);
}
return okay;
}
bool processInformationParameter(CompilerState& a_state, Token& a_token, std::string& a_parameter)
{
Token token = a_state.m_getToken();
tokenAssert(tokenIs(token, SymbolType::Assignment), "expected ':='");
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::StringLiteral, "expected a string");
a_parameter = token.m_stringLiteral->m_string;
printf("processing %s\n", a_parameter.c_str());
token = a_state.m_getToken();
tokenAssert(tokenIs(token, SymbolType::SemiColon), "expected a semi-colon");
return true;
}
bool processInformation(CompilerState& a_state, Token& a_token, ModuleInformation& a_information)
{
const bool required = true;
bool okay = processBlock(a_state, a_token, [&a_information](GenericProcessElement& processElement) {
processElement("description", processInformationParameter, a_information.m_description, required);
processElement("version", processInformationParameter, a_information.m_version, required);
processElement("copyright", processInformationParameter, a_information.m_copyright, required);
processElement("author", processInformationParameter, a_information.m_author, required);
processElement("checksum", processInformationParameter, a_information.m_checksum, required);
});
return okay;
}
bool processImport(CompilerState& a_state, Token& a_token, ModuleImports& a_imports)
{
std::string moduleName = a_token.m_identifier->m_name;
if (!a_state.m_modules.count(moduleName))
{
Module& module = a_state.m_modules[moduleName];
// TODO: populate
printf("need to parse module %s\n", moduleName.c_str());
}
a_imports.m_imports[a_token.m_identifier->m_name] = &a_state.m_modules[moduleName];
a_token = a_state.m_getToken();
return tokenIs(a_token, SymbolType::Comma) || tokenIs(a_token, SymbolType::CloseBrace);
}
bool processImports(CompilerState& a_state, Token& a_token, ModuleImports& a_imports)
{
bool okay = processBlock(a_state, a_token, [&a_imports](GenericProcessElement& processElement) {
processElement(TokenType::Identifier, processImport, a_imports);
});
return okay;
}
bool processTypedef(CompilerState& a_state, Token& a_token, ModuleInterface& a_interface)
{
Token token = a_state.m_getToken();
tokenAssert(token.m_tokenType != TokenType::Type, "redefinition of type");
tokenAssert(token.m_tokenType == TokenType::Identifier, "expected an identifier");
std::string typeName = token.m_identifier->m_name;
tokenAssert(a_state.m_types.count(typeName) == 0, "redefinition of type (2)");
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Symbol, "expected a ':=' symbol");
tokenAssert(token.m_symbol == SymbolType::Assignment, "expected a ':=' symbol");
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Type, "expected a type");
// register the type
TypeDeclaration* typeDef = &a_state.m_types[typeName];
a_state.m_tokens[typeName] = TokenType::Type;
// construct it
a_interface.m_types.emplace_back(typeDef);
typeDef->m_name = typeName;
typeDef->m_type = TypeType::Alias;
typeDef->m_alias = token.m_type;
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Symbol, "expected a ';' symbol");
tokenAssert(token.m_symbol == SymbolType::SemiColon, "expected a ';' symbol");
return true;
}
bool processConst(CompilerState& a_state, Token& a_token, ModuleInterface& a_interface)
{
Token token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Type, "expected a type");
TypeDeclaration* typeDef = token.m_type;
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Identifier, "expected an identifier");
std::string varName = token.m_identifier->m_name;
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Symbol, "expected a ':=' symbol");
tokenAssert(token.m_symbol == SymbolType::Assignment, "expected a ':=' symbol");
// processConstExpression
token = a_state.m_getToken();
bool tokenTypeOkay = token.m_tokenType == TokenType::IntegerLiteral
|| token.m_tokenType == TokenType::FloatLiteral || token.m_tokenType == TokenType::StringLiteral;
tokenAssert(tokenTypeOkay, "expected a constant expression");
// register the constant variable
ConstantVariable* constVar = &a_state.m_constants[varName];
a_state.m_tokens[varName] = TokenType::Constant;
// construct it
a_interface.m_constants.emplace_back(constVar);
constVar->m_name = varName;
constVar->m_type = typeDef;
switch (token.m_tokenType)
{
case TokenType::IntegerLiteral:
constVar->m_value.m_type = ConstExpressionType::IntegerLiteral;
constVar->m_value.m_integer = token.m_integerLiteral;
break;
case TokenType::FloatLiteral:
constVar->m_value.m_type = ConstExpressionType::FloatLiteral;
constVar->m_value.m_float = token.m_floatLiteral;
break;
case TokenType::StringLiteral:
constVar->m_value.m_type = ConstExpressionType::StringLiteral;
constVar->m_value.m_string = token.m_stringLiteral;
break;
default:
break;
}
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Symbol, "expected a ';' symbol");
tokenAssert(token.m_symbol == SymbolType::SemiColon, "expected a ';' symbol");
return true;
}
bool processEnumValue(CompilerState& a_state, Token& a_token, EnumValues& a_values)
{
Token& token = a_token;
std::string enumValueName = a_token.m_identifier->m_name;
tokenAssert(a_values.m_values.count(enumValueName) == 0, "duplicate enum value\n");
a_token = a_state.m_getToken();
if (tokenIs(a_token, SymbolType::Assignment))
{
a_token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::IntegerLiteral, "expected an integer");
a_values.m_nextValue = a_token.m_integerLiteral->m_value;
a_token = a_state.m_getToken();
}
a_values.m_values[enumValueName] = a_values.m_nextValue;
a_values.m_nextValue++;
return tokenIs(a_token, SymbolType::Comma) || tokenIs(a_token, SymbolType::CloseBrace);
}
bool processEnum(CompilerState& a_state, Token& a_token, ModuleInterface& a_interface)
{
Token token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Identifier, "expected an identifier");
std::string enumName = token.m_identifier->m_name;
TypeDeclaration* enumDef = &a_state.m_types[enumName];
EnumValues* enumValues = &a_state.m_enums[enumName];
a_state.m_tokens[enumName] = TokenType::Type;
enumDef->m_name = enumName;
enumDef->m_type = TypeType::Enum;
enumDef->m_enum = enumValues;
enumValues->m_nextValue = 0;
bool okay = processBlock(a_state, a_token, [&enumValues](GenericProcessElement& processElement) {
processElement(TokenType::Identifier, processEnumValue, *enumValues);
});
return okay;
}
// TODO: move to utils
template <typename Container, typename Functor>
typename Container::const_iterator find(const Container& a_container, Functor&& a_functor)
{
return std::find_if(a_container.begin(), a_container.end(), a_functor);
}
template <typename Container>
bool processVariableNameHelper(CompilerState& a_state, Token& a_token, Variable& a_variable, Container& a_collection, const char* a_errorMessage)
{
Token& token = a_token;
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Identifier, "expected an identifier");
a_variable.m_name = token.m_identifier->m_name;
std::string name = a_variable.m_name;
auto sameName = [&name](const Variable& a_var) { return a_var.m_name == name; };
tokenAssert(find(a_collection, sameName) == a_collection.end(), a_errorMessage);
return true;
}
template <typename Container>
bool processVariableHelper(CompilerState& a_state, Token& a_token, Variable& a_variable, Container& a_collection, const char* a_errorMessage)
{
Token& token = a_token;
tokenAssert(token.m_tokenType == TokenType::Type, "expected a type");
a_variable.m_type = token.m_type;
return processVariableNameHelper(a_state, a_token, a_variable, a_collection, a_errorMessage);
}
bool processStructMember(CompilerState& a_state, Token& a_token, StructMembers& a_members)
{
Variable member;
if (!processVariableHelper(a_state, a_token, member, a_members.m_members, "existing member with same name exists"))
return false;
a_members.m_members.push_back(member);
Token& token = a_token;
token = a_state.m_getToken();
tokenAssert(tokenIs(token, SymbolType::SemiColon), "expected a ';' symbol");
return true;
}
bool processStruct(CompilerState& a_state, Token& a_token, ModuleInterface& a_interface)
{
Token& token = a_token;
token = a_state.m_getToken();
tokenAssert(token.m_tokenType == TokenType::Identifier, "expected an identifier");
std::string structName = token.m_identifier->m_name;
TypeDeclaration* structDef = &a_state.m_types[structName];
StructMembers* structMembers = &a_state.m_structs[structName];
a_state.m_tokens[structName] = TokenType::Type;
structDef->m_name = structName;
structDef->m_type = TypeType::Struct;
structDef->m_struct = structMembers;
if (!processBlock(a_state, a_token, [&structMembers](GenericProcessElement& processElement) {
processElement(TokenType::Type, processStructMember, *structMembers); }))
{
// Forward declaration
structDef->m_struct = nullptr; // if the members are nullptr, it is a forward declaration
tokenAssert(tokenIs(token, SymbolType::SemiColon), "expected a ';' symbol");
}
return true;
}
bool processFunctionParameter(CompilerState& a_state, Token& a_token, FunctionDeclaration& a_function)
{
Variable parameter;
if (!processVariableHelper(a_state, a_token, parameter, a_function.m_parameters, "existing parameter with same name exists"))
return false;
a_function.m_parameters.push_back(parameter);
Token& token = a_token;
token = a_state.m_getToken();
return tokenIs(a_token, SymbolType::Comma) || tokenIs(a_token, SymbolType::CloseBracket);
}
bool processFunctionCommon(CompilerState& a_state, Token& a_token, ModuleInterface& a_interface)
{
FunctionDeclaration* functionDef = nullptr;
Token& token = a_token;
token = a_state.m_getToken();
if (token.m_tokenType == TokenType::Identifier)
{
// tokenAssert(token.m_tokenType == TokenType::Identifier, "expected an identifier");
std::string functionName = token.m_identifier->m_name;
functionDef = &a_state.m_functions[functionName];
a_state.m_tokens[functionName] = TokenType::Function;
functionDef->m_name = functionName;
functionDef->m_returnType = nullptr;
}
else if (token.m_tokenType == TokenType::Function)
{
// Existing function declaration - either it is redeclaring/overriding existing function declaration
// or it is defining the body of a forward/interface of a function - TODO: validate the arguments are not changed
// TODO: perhaps need to think about if function is declared and it is a separate definition of the function
// then there is no need to repeat what the arguments etc are, might be easier if just assign to it like this:
// function MyFunc(int8 arg1) => int8; // This is the declaration
// MyFunc := { return arg1 + 1; } // This is the definition - note without the repetition but hard to see the
// // args and return types in the scope it is defined, so good and bad
functionDef = token.m_function;
}
if (!functionDef)
{
return false;
}
bool okay = processBlock(a_state, a_token, [&functionDef](GenericProcessElement& processElement) {
processElement(TokenType::Type, processFunctionParameter, *functionDef);
}, SymbolType::OpenBracket, SymbolType::CloseBracket);
if (okay)
{
token = a_state.m_getToken();
tokenAssert(tokenIs(token, SymbolType::Produces), "expected '=>' and return type");
token = a_state.m_getToken();
tokenAssert(tokenIs(token, TokenType::Type), "expected a type");
functionDef->m_returnType = token.m_type;
}
return true;
}
bool processFunctionDeclaration(CompilerState& a_state, Token& a_token, ModuleInterface& a_interface)
{
bool okay = processFunctionCommon(a_state, a_token, a_interface);
if (okay)
{
Token token = a_state.m_getToken();
tokenAssert(tokenIs(token, SymbolType::SemiColon), "expected ';'");
}
return true;
}
//bool processLeftExpression(CompilerState& a_state, Token& a_token, Function& a_function)
struct RightHandSideExpression
{
TypeDeclaration* m_deducedType;
Expression m_expressionTree;
};
struct LeftHandSideExpression
{
// var declaration and assignment
// var
// struct var member -> recursive
};
/*
bool processLeftExpression(CompilerState& a_state, Token& a_token, LeftHandSideExpression& a_lhsExpression)
{
}
*/
/*
struct Expression
{
ExpressionType m_type;
union
{
ConstExpression m_constant;
Variable* m_variable;
UnaryExpression m_unary;
BinaryExpression m_binary;
TernaryExpression m_ternary;
};
};
*/
Expression& newExpression(FunctionBlock& a_functionBlock)
{
Expression newExpr;
a_functionBlock.m_expressions.push_back(newExpr);
return a_functionBlock.m_expressions.back();
}
bool processRightLeft(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock, Expression& a_left)
{
if (tokenIs(a_token, TokenType::IntegerLiteral))
{
a_left.m_type = ExpressionType::Constant;
a_left.m_constant.m_type = ConstExpressionType::IntegerLiteral;
a_left.m_constant.m_integer = a_token.m_integerLiteral;
}
else if (tokenIs(a_token, TokenType::FloatLiteral))
{
a_left.m_type = ExpressionType::Constant;
a_left.m_constant.m_type = ConstExpressionType::FloatLiteral;
a_left.m_constant.m_float = a_token.m_floatLiteral;
}
else if (tokenIs(a_token, TokenType::StringLiteral))
{
a_left.m_type = ExpressionType::Constant;
a_left.m_constant.m_type = ConstExpressionType::StringLiteral;
a_left.m_constant.m_string = a_token.m_stringLiteral;
}
else if (tokenIs(a_token, TokenType::Function))
{
// TODO: need to add support for making function calls - this is another expression type
}
else if (tokenIs(a_token, TokenType::Identifier))
{
// TODO: assuming here that all identifiers are local variables - could be non-local, could be in outer block, could be parameter
// TODO: perhaps variable lookup could be built in to the tokenizing. as enter/leave blocks, could update the CompilerState with the
// table of variable for the given context? Perhaps variables can have a type, local,parameter,const global.
a_left.m_type = ExpressionType::Variable;
a_left.m_variableId = a_token.m_identifier;
}
else if (tokenIs(a_token, TokenType::Constant))
{
// TODO:
}
else if (tokenIs(a_token, TokenType::Type))
{
// TODO: could perhaps assign a type to a variable if the variable is of meta type
}
else if (tokenIs(a_token, TokenType::Module))
{
// TODO: perhaps namespace - in general need to consider this/namespaces
// TODO: build test case with Math module, and reference a constant like Math::pi.
}
else if (tokenIs(a_token, TokenType::Keyword))
{
// TODO: keyword could be control-flow, like if/while/do/done/for etc - need to process somewhere
// although this is not valid mid-way through an expression, only for the first token on the lhs.
}
else if (tokenIs(a_token, TokenType::Symbol))
{
// ++ and -- are not supported, perhaps should consider if need or not
a_left.m_type = ExpressionType::Unary;
if (tokenIs(a_token, SymbolType::Add))
{
a_left.m_unary.m_operator = UnaryExpressionType::Positive;
}
else if (tokenIs(a_token, SymbolType::Subtract))
{
a_left.m_unary.m_operator = UnaryExpressionType::Negative;
}
else if (tokenIs(a_token, SymbolType::Tilda))
{
a_left.m_unary.m_operator = UnaryExpressionType::OnesComplement;
}
else if (tokenIs(a_token, SymbolType::Not))
{
a_left.m_unary.m_operator = UnaryExpressionType::Not;
}
a_left.m_unary.m_first = &newExpression(a_functionBlock);
a_token = a_state.m_getToken();
processRightLeft(a_state, a_token, a_functionBlock, *a_left.m_unary.m_first);
}
else
{
return false;
}
return true;
}
bool processTerminal(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock, Expression& a_expression)
{
return processRightLeft(a_state, a_token, a_functionBlock, a_expression);
}
bool processRightExpressionHelper(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock, Expression& a_expression);
bool processRightRight(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock, Expression& a_left, Expression& a_expression)
{
Token& token = a_token;
a_token = a_state.m_getToken();
if (tokenIs(a_token, SymbolType::SemiColon))
{
Expression& left = newExpression(a_functionBlock);
left = a_left;
a_expression = left;
//a_expression = a_left;
return true;
}
else if (tokenIs(a_token, SymbolType::Add))
{
Expression& left = newExpression(a_functionBlock);
Expression& right = newExpression(a_functionBlock);
left = a_left;
if (!processRightExpressionHelper(a_state, a_token, a_functionBlock, right))
{
return false;
}
a_expression.m_type = ExpressionType::Binary;
a_expression.m_binary.m_first = &left;
a_expression.m_binary.m_second = &right;
a_expression.m_binary.m_operator = BinaryExpressionType::Add;
return true;
}
else if (tokenIs(a_token, SymbolType::Multiply))
{
a_expression.m_type = ExpressionType::Binary;
Expression& left = newExpression(a_functionBlock);
Expression& right = newExpression(a_functionBlock);
left = a_left;
a_token = a_state.m_getToken();
if (!processRightLeft(a_state, a_token, a_functionBlock, right))
{
return false;
}
//Expression& product = newExpression(a_functionBlock);
Expression product;
product.m_type = ExpressionType::Binary;
product.m_binary.m_first = &left;
product.m_binary.m_second = &right;
product.m_binary.m_operator = BinaryExpressionType::Multiply;
return processRightRight(a_state, a_token, a_functionBlock, product, a_expression);
}
return false;
}
bool processTerm(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock, Expression& a_expression)
{
Token& token = a_token;
a_token = a_state.m_getToken();
//Expression left;
Expression& left = newExpression(a_functionBlock);
if (!processTerminal(a_state, a_token, a_functionBlock, left))
{
return false;
}
Expression& exp = newExpression(a_functionBlock);
if (!processRightRight(a_state, a_token, a_functionBlock, left, exp))
// if (!processRightRight(a_state, a_token, a_functionBlock, left, a_expression))
{
return false;
}
a_expression = exp;
return true;
}
bool processRightExpressionHelper(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock, Expression& a_expression)
{
Token& token = a_token;
a_token = a_state.m_getToken();
#if 0
while (!tokenIs(token, SymbolType::SemiColon))
{
a_token = a_state.m_getToken();
}
#else
Expression& left = newExpression(a_functionBlock);
if (!processRightLeft(a_state, a_token, a_functionBlock, left))
{
return false;
}
if (!processRightRight(a_state, a_token, a_functionBlock, left, a_expression))
{
return false;
}
#endif
return true;
}
void printSpaces(int a_spaces)
{
for (int i = 0; i < a_spaces; ++i)
printf(" ");
}
void dumpExpressionTree(Expression& a_expression, int a_depth = 0)
{
printf("\n");
printSpaces(a_depth);
switch (a_expression.m_type)
{
case ExpressionType::Constant:
printf("const -%llu-", a_expression.m_constant.m_integer->m_value);
break;
case ExpressionType::Variable:
printf("var -%s-", a_expression.m_variableId->m_name.c_str());
break;
case ExpressionType::Unary:
printf("unary ");
break;
case ExpressionType::Binary:
printf("binary L:");
dumpExpressionTree(*a_expression.m_binary.m_first, a_depth + 1);
printf("\n");
printSpaces(a_depth);
printf("binary R:");
dumpExpressionTree(*a_expression.m_binary.m_second, a_depth + 1);
break;
case ExpressionType::Ternary:
printf("ternary ");
break;
}
}
std::string expressionTreeString(const Expression& a_expression)
{
std::string str;
switch (a_expression.m_type)
{
case ExpressionType::Constant:
str += std::to_string(a_expression.m_constant.m_integer->m_value);
break;
case ExpressionType::Variable:
str += a_expression.m_variableId->m_name;
break;
case ExpressionType::Unary:
str += "(";
switch (a_expression.m_unary.m_operator)
{
case UnaryExpressionType::Not: str += "!"; break;
case UnaryExpressionType::OnesComplement: str += "~"; break;
case UnaryExpressionType::Positive: str += "+"; break;
case UnaryExpressionType::Negative: str += "-"; break;
}
str += expressionTreeString(*a_expression.m_unary.m_first) + ")";
break;
case ExpressionType::Binary:
str += "(" + expressionTreeString(*a_expression.m_binary.m_first);
if (a_expression.m_binary.m_operator == BinaryExpressionType::Multiply)
str += " x ";
else if (a_expression.m_binary.m_operator == BinaryExpressionType::Add)
str += " + ";
else
str += " ? ";
str += expressionTreeString(*a_expression.m_binary.m_second) + ")";
break;
case ExpressionType::Ternary:
str += " !!!!!!! ";
break;
}
return str;
}
std::string expressionTreeStack(const Expression& a_expression)
{
std::string str;
switch (a_expression.m_type)
{
case ExpressionType::Constant:
if (a_expression.m_constant.m_integer)
str += "push " + std::to_string(a_expression.m_constant.m_integer->m_value) + "\n";
break;
case ExpressionType::Variable:
if (a_expression.m_variableId)
str += "push " + a_expression.m_variableId->m_name + "\n";
break;
case ExpressionType::Unary:
if (a_expression.m_unary.m_first)
str += expressionTreeStack(*a_expression.m_unary.m_first);
str += "op unary ";
switch (a_expression.m_unary.m_operator)
{
case UnaryExpressionType::Not: str += "!"; break;
case UnaryExpressionType::OnesComplement: str += "~"; break;
case UnaryExpressionType::Positive: str += "+"; break;
case UnaryExpressionType::Negative: str += "-"; break;
}
str += "\n";
break;
case ExpressionType::Binary:
if (a_expression.m_binary.m_first)
str += expressionTreeStack(*a_expression.m_binary.m_first);
if (a_expression.m_binary.m_second)
str += expressionTreeStack(*a_expression.m_binary.m_second);
str += "op binary ";
if (a_expression.m_binary.m_operator == BinaryExpressionType::Multiply)
str += " x ";
else if (a_expression.m_binary.m_operator == BinaryExpressionType::Add)
str += " + ";
else
str += " ? ";
str += "\n";
break;
case ExpressionType::Ternary:
str += " !!!!!!! ";
break;
}
return str;
}
std::string expressionTreeRegisters(const Expression& a_expression, int a_currentRegister = 0)
{
std::string str;
std::string reg1 = "%r" + std::to_string(a_currentRegister);
std::string reg2 = "%r" + std::to_string(a_currentRegister + 1);
switch (a_expression.m_type)
{
case ExpressionType::Constant:
if (a_expression.m_constant.m_integer)
str += "mov " + reg1 + ", " + std::to_string(a_expression.m_constant.m_integer->m_value) + "\n";
break;
case ExpressionType::Variable:
if (a_expression.m_variableId)
str += "mov " + reg1 + ", " + a_expression.m_variableId->m_name + "\n";
break;
case ExpressionType::Unary:
if (a_expression.m_unary.m_first)
str += expressionTreeRegisters(*a_expression.m_unary.m_first, a_currentRegister);
switch (a_expression.m_unary.m_operator)
{
case UnaryExpressionType::Not: str += "not"; break;
case UnaryExpressionType::OnesComplement: str += "not"; break;
case UnaryExpressionType::Positive: str += "pos"; break;
case UnaryExpressionType::Negative: str += "neg"; break;
}
str += " " + reg1 + "\n";
break;
case ExpressionType::Binary:
if (a_expression.m_binary.m_first)
str += expressionTreeRegisters(*a_expression.m_binary.m_first, a_currentRegister);
if (a_expression.m_binary.m_second)
str += expressionTreeRegisters(*a_expression.m_binary.m_second, a_currentRegister + 1);
if (a_expression.m_binary.m_operator == BinaryExpressionType::Multiply)
str += "mul";
else if (a_expression.m_binary.m_operator == BinaryExpressionType::Add)
str += "add";
else
str += "???";
str += " " + reg1 + ", " + reg2 + "\n";
break;
case ExpressionType::Ternary:
str += " !!!!!!! ";
break;
}
return str;
}
bool processRightExpression(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock, RightHandSideExpression& a_rhsExpression)
{
// if (!processRightExpressionHelper(a_state, a_token, a_functionBlock, a_rhsExpression.m_expressionTree))
if (!processTerm(a_state, a_token, a_functionBlock, a_rhsExpression.m_expressionTree))
{
return false;
}
//dumpExpressionTree(a_rhsExpression.m_expressionTree);
std::string expTreeStr = expressionTreeString(a_rhsExpression.m_expressionTree);
std::string expTreeReg = expressionTreeRegisters(a_rhsExpression.m_expressionTree);
std::string expTreeStack = expressionTreeStack(a_rhsExpression.m_expressionTree);
printf("expression tree as string: %s\n", expTreeStr.c_str());
printf("expression tree as registers:\n%s\n", expTreeReg.c_str());
printf("expression tree as stack:\n%s\n", expTreeStack.c_str());
// TODO: process the expression tree to determine the type
a_rhsExpression.m_deducedType = &a_state.m_types["int8"];
return true;
}
bool processExpression(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock)
{
Token& token = a_token;
if (tokenIs(a_token, KeywordType::Var))
{
Variable variable;
if (!processVariableNameHelper(a_state, a_token, variable, a_functionBlock.m_localVariables, "existing variable with same name exists"))
{
return false;
}
token = a_state.m_getToken();
tokenAssert(tokenIs(a_token, SymbolType::Assignment), "expected an assignment to deduce the type of var");
// TODO: save the assigned value/variable/expression
RightHandSideExpression rhsExpression;
if (!processRightExpression(a_state, a_token, a_functionBlock, rhsExpression))
{
return false;
}
variable.m_type = rhsExpression.m_deducedType;
a_functionBlock.m_localVariables.push_back(variable);
tokenAssert(tokenIs(token, SymbolType::SemiColon), "expected a ';' symbol");
return true;
}
else
{
// TODO: probably block or control flow statements
}
return true;
}
bool processStatement(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock)
{
return true;
}
bool processVariable(CompilerState& a_state, Token& a_token, FunctionBlock& a_functionBlock)
{
Variable variable;
if (!processVariableHelper(a_state, a_token, variable, a_functionBlock.m_localVariables, "existing variable with same name exists"))
{
return false;
}
a_functionBlock.m_localVariables.push_back(variable);
Token& token = a_token;
token = a_state.m_getToken();
if (tokenIs(a_token, SymbolType::Assignment))
{
// TODO: save the assigned value/variable/expression
RightHandSideExpression rhsExpression;
if (!processRightExpression(a_state, a_token, a_functionBlock, rhsExpression))
{
return false;
}
}
tokenAssert(tokenIs(token, SymbolType::SemiColon), "expected a ';' symbol");
return true;
}
/*
struct Statement
{
StatementType m_type;
union
{
FunctionBlock* m_block;
ControlFlow* m_controlFlow;
Expression* m_expression;
};
};
struct FunctionBlock
{
std::vector<Variable> m_localVariables;
std::vector<Statement> m_statements;
std::vector<Expression> m_expressions;
};
struct Function
{
FunctionDeclaration* m_interface;
FunctionBlock m_implementation;
};
*/
bool processFunctionBody(CompilerState& a_state, Token& a_token, ModuleImplementation& a_implementation)
{
bool okay = processFunctionCommon(a_state, a_token, a_implementation.m_details);
if (okay)
{
Function function;
okay = processBlock(a_state, a_token, [&function](GenericProcessElement& processElement) {
processElement(TokenType::Type, processVariable, function.m_implementation); // variable declaration
processElement(TokenType::Identifier, processStatement, function.m_implementation); // variable assignment
processElement(TokenType::Keyword, processExpression, function.m_implementation); // different
});
if (!okay && tokenIs(a_token, SymbolType::SemiColon))
{
return true;
}
}
return true;
}
bool processInterface(CompilerState& a_state, Token& a_token, ModuleInterface& a_interface)
{
bool okay = processBlock(a_state, a_token, [&a_interface](GenericProcessElement& processElement) {
processElement(KeywordType::Type, processTypedef, a_interface);
processElement(KeywordType::Const, processConst, a_interface);
processElement(KeywordType::Enum, processEnum, a_interface);
processElement(KeywordType::Struct, processStruct, a_interface);
processElement(KeywordType::Function, processFunctionDeclaration, a_interface);
});
Token& token = a_token;
tokenAssert(okay, "bad interface");
return okay;
}
bool processImplementation(CompilerState& a_state, Token& a_token, ModuleImplementation& a_implementation)
{
bool okay = processBlock(a_state, a_token, [&a_implementation](GenericProcessElement& processElement) {
processElement(KeywordType::Type, processTypedef, a_implementation.m_details);
processElement(KeywordType::Const, processConst, a_implementation.m_details);
processElement(KeywordType::Enum, processEnum, a_implementation.m_details);
processElement(KeywordType::Struct, processStruct, a_implementation.m_details);
processElement(KeywordType::Function, processFunctionBody, a_implementation);
});
Token& token = a_token;
tokenAssert(okay, "bad interface");
return okay;
}
bool processTests(CompilerState& a_state, Token& a_token, ModuleTests& a_tests)
{
bool okay = true;
return okay;
}
bool processModule(CompilerState& a_state, Token& a_token, Module& a_module)
{
const bool required = true;
bool okay = processBlock(a_state, a_token, [&a_module](GenericProcessElement& processElement) {
processElement(KeywordType::Information, processInformation, a_module.m_information);
processElement(KeywordType::Imports, processImports, a_module.m_imports);
processElement(KeywordType::Interface, processInterface, a_module.m_interface);
processElement(KeywordType::Implementation, processImplementation, a_module.m_implementation);
processElement(KeywordType::Tests, processTests, a_module.m_tests);
});
return okay;
}
bool processProgram(CompilerState& a_state, const char* a_fileName)
{
bool okay = true;
Token token = a_state.m_getToken();
if (tokenIs(token, KeywordType::Module))
{
token = a_state.m_getToken();
if (token.m_tokenType != TokenType::Identifier)
{
tokenAssert(false, "expected an identifier");
return false;
}
const std::string& moduleName = token.m_identifier->m_name;
Module& module = a_state.m_modules[moduleName];
module.m_name = moduleName;
module.m_fileName = a_fileName;
okay = processModule(a_state, token, module);
}
else
{
tokenAssert(false, "invalid token");
okay = false;
}
return okay;
}
int main(int argc, char* argv[])
{
if (argc < 2)
{
printf("bad number of arguments\n");
return -1;
}
std::ifstream input(argv[1]);
if (!input.is_open())
{
printf("couldn't open input file\n");
return -1;
}
TokenizerState tokenizerState;
CompilerState state = InitializeCompilerState();
// Setup handlers
// i/o input
state.m_getInput = [&input](){ std::string line; std::getline(input, line); return line; };
// eof
state.m_getEof = [&input](){ return input.eof(); };
// tokenizer
state.m_getToken = [&state, &tokenizerState](){ return getNextToken(state, tokenizerState); };
// parser
state.m_parser = [&state](){ dumpTokens(state); };
//state.m_parser = &parser;
//state.m_parser();
bool okay = processProgram(state, argv[1]);
printf("%sokay\n", !okay ? "not " : "");
return okay ? 0 : -1;
}