diff --git a/compile.sh b/compile.sh old mode 100644 new mode 100755 diff --git a/man/man1/crate.1.gz b/man/man1/crate.1.gz index 1764189..a01c3b6 100644 Binary files a/man/man1/crate.1.gz and b/man/man1/crate.1.gz differ diff --git a/src/crate/crate b/src/crate/crate index 3e27d8e..810ab50 100755 Binary files a/src/crate/crate and b/src/crate/crate differ diff --git a/src/crate/crate.hpp b/src/crate/crate.hpp index c17dd9a..05a3022 100644 --- a/src/crate/crate.hpp +++ b/src/crate/crate.hpp @@ -199,7 +199,7 @@ vector lex(const string); Program parse(const vector); int compile(Program); int run(string); -int strparse(string); +string strparse(string); int numparse(string); void nodes(Node, int); void prog(Program); diff --git a/src/crate/extralex.cpp b/src/crate/extralex.cpp index 8ea61da..365af4a 100644 --- a/src/crate/extralex.cpp +++ b/src/crate/extralex.cpp @@ -1,5 +1,156 @@ #include "crate.hpp" -bool isUpper(const string& s) { - return all_of(s.begin(), s.end(), [](unsigned char c){ return (toupper(c) == c); }); -} +/* string strparse(string str) +{ + // this code assumes that the string is syntactically correct, thanks to the lexer. all it does is produce the actual string, not a string containing it. + // Ex. if i put `strparse("\\0");`, I will get a string containing the EOF character. + // Ex. if i put `strparse("\\u1234);`, I will get a string containing the Unicode character U+1234. + + char c; + string out = ""; + string unicode = ""; + string octal = ""; + int uni = 4; + int oct = 3; + int uchar = 0; + int ochar = 0; + bool usequence = false; + bool osequence = false; + bool slash = false; + + vector octal {"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ + "\017", "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\027", "\030", "\031", "\032", "\033", "\034", "\035",\ + "\036", "\037", "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047", "\050", "\051", "\052", "\053", "\054",\ + "\055", "\056", "\057", "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067", "\070", "\071", "\072", "\073",\ + "\074", "\075", "\076", "\077", "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107", "\110", "\111", "\112",\ + "\113", "\114", "\115", "\116", "\117", "\120", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ + "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ + "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ + "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ + "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ + "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ }; + + for (int i = 0; i < str.size(); i++) { + c = str[i]; + + switch (c) { + case '\\': + if (slash) { + out += c; + slash = false; + } else { + slash = true; + } + break; + case '0': + if (slash) { + out += '\0'; + slash = false; + } else { + out += c; + } + case 'a': + if (slash) { + out += '\a'; + slash = false; + } else { + out += c; + } + case '"': + if (slash) { + out += '\"'; + slash = false; + } else { + // this won't run: it'll have been corrected in the lexer. + ; + } + case '\'': + if (slash) { + out += '\''; + slash = false; + } else { + out += c; + } + case '?': + if (slash) { + out += '\?'; + slash = false; + } else { + out += c; + } + case 'b': + if (slash) { + out += '\b'; + slash = false; + } else { + out += c; + } + case 'f': + if (slash) { + out += '\f'; + slash = false; + } else { + out += c; + } + case 'n': + if (slash) { + out += '\n'; + slash = false; + } else { + out += c; + } + case 'r': + if (slash) { + out += '\r'; + slash = false; + } else { + out += c; + } + case 't': + if (slash) { + out += '\t'; + slash = false; + } else { + out += c; + } + case 'v': + if (slash) { + out += '\v'; + slash = false; + } else { + out += c; + } + case 'u': + if (slash) { + usequence = true; + } else { + out += c; + } + case 'o': + if (slash) { + osequence = true; + } else { + out += c; + } + default: + if (slash & !usequence & !osequence) { + cout << "Invalid escape sequence \\" << c << " ."; + exit(1); + } else if (usequence) { + } else if (osequence) { + if (ochars.find(c) != 1) { + oct--; + + if (oct == 0) { + oct = 3; + out += "" + } + } else { + cout << "Invalid octal sequence with " << c << " ."; + exit(1); + } + } else { + out += c; + } + } +} */ diff --git a/src/crate/lex.cpp b/src/crate/lex.cpp index cefe43b..eced124 100644 --- a/src/crate/lex.cpp +++ b/src/crate/lex.cpp @@ -25,6 +25,7 @@ vector lex(const string src) bool unicode = false; bool octal = false; + bool done = false; int uni = 4; int oct = 3; @@ -155,6 +156,11 @@ vector lex(const string src) load_type = ""; ok = false; slash = false; + } else if (load_type == "char") { + cout << "[" << row << ", " << col << "] Unexpected EOL (unfinished-chr-with-eol)"; + load_type = ""; + ok = false; + slash = false; } col = 0; eol = true; @@ -169,7 +175,6 @@ vector lex(const string src) } // string else if (load_type == "string") { - load_var += c; if (c == '\\') { if (slash) { slash = false; @@ -224,11 +229,99 @@ vector lex(const string src) cur.row = row; cur.col = col; tlist.push_back(cur); + + load_type = ""; + load_var = ""; } } + + if (load_type == "") { + // no more string + ; + } else { + load_var += c; + } + } + } + // character + else if (load_type == "char") { + if (done == true & c != '\'') { + cout << "[" << row << ", " << col << "] Expected end of character (expect-char-end)"; + ok = false; + load_type = ""; + load_var = ""; } + + if (c == '\\') { + if (slash) { + slash = false; + done = true; + } else { + slash = true; + } + } else { + if (slash) { + if (c == 'u') { + unicode = true; + } else if (c == 'o') { + octal = true; + } else { + if (unicode) { + if (hex.find(c) != -1) { + uni--; + if (uni == 0) { + unicode = false; + uni = 4; + } + } else { + unicode = false; + uni = 4; + load_type = ""; + load_var = ""; + + cout << "[" << row << ", " << col << "] Non-hexadecimal character in unicode sequence (bad-unicode)."; + } + } else if (octal) { + if (octl.find(c) != -1) { + oct--; + if (oct == 0) { + octal = false; + oct = 3; + } + } else { + octal = false; + oct = 3; + load_type = ""; + load_var = ""; - load_var += c; + cout << "[" << row << ", " << col << "] Non-octal character in octal sequence (bad-octal)."; + } + } else { + slash = false; + done = true; + } + } + } else { + if (c == '\'') { + cur.ttype = CHR; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + + done = false; + load_type = ""; + load_var = ""; + } + } + + if (load_type == "") { + // no more string + ; + } else { + load_var += c; + } + } } // start of comment else if (c == '$') { @@ -361,7 +454,72 @@ vector lex(const string src) load_type = ""; load_var = ""; load_type = "string"; - load_var += c; + } + // start of character + else if (c == '\'') { + if (load_type == "") { + ; + } else if (load_type == "alpha") { + if (keys.find(load_var) != keys.end()) { + cur.ttype = keys[load_var]; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + } else { + cur.ttype = ID; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + } + } else if (load_type == "int") { + cur.ttype = INT; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + } else if (load_type == "float") { + cur.ttype = FLOAT; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + } else if (load_type == "operational") { + if (ops.find(load_var) != ops.end()) { + cur.ttype = ops[load_var]; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + } else { + cout << "[" << row << ", " << col << "] Invalid operator " << load_var << " .\n"; + ok = false; + load_type = ""; + load_var = ""; + } + } else if (load_type == "period") { + cur.ttype = ARGS; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + } else if (load_type == "splat") { + cur.ttype = KWARGS; + cur.value = load_var; + cur.row = row; + cur.col = col; + tlist.push_back(cur); + } else { + cout << "[" << row << ", " << col << "] we're so sorry. something went wrong with the lexical analyzer. \n\tplease notify me at silas-wr/crate on github.\n"; + ok = false; // make it uncompilable + load_type = ""; + load_var = ""; + } + + load_type = ""; + load_var = ""; + load_type = "char"; } // alphabetical else if (alphabet.find(c) != -1) { diff --git a/src/crate/testlex.crate b/src/crate/testlex.crate index 1361a54..9d01c42 100644 --- a/src/crate/testlex.crate +++ b/src/crate/testlex.crate @@ -117,7 +117,7 @@ $ params ... $ objects -$ 'a' +'a' "hello" $ 1 $ 12