Finishing up some things. I'm going to soon add the `numparse`, `strparse`, and `chrparse` functions.
silas-wr · Jul 27, 2024 · 2826326 · 2826326
1 parent fd1ddae
commit 2826326
Show file tree

Hide file tree

Showing 7 changed files with 317 additions and 8 deletions.
diff --git a/compile.sh b/compile.sh
diff --git a/man/man1/crate.1.gz b/man/man1/crate.1.gz
diff --git a/src/crate/crate b/src/crate/crate
diff --git a/src/crate/crate.hpp b/src/crate/crate.hpp
@@ -199,7 +199,7 @@ vector<Token> lex(const string);
 Program parse(const vector<Token>);
 int compile(Program);
 int run(string);
-int strparse(string);
+string strparse(string);
 int numparse(string);
 void nodes(Node, int);
 void prog(Program);
diff --git a/src/crate/extralex.cpp b/src/crate/extralex.cpp
@@ -1,5 +1,156 @@
 #include "crate.hpp"
 
-bool isUpper(const string& s) {
-    return all_of(s.begin(), s.end(), [](unsigned char c){ return (toupper(c) == c); });
-}
+/* string strparse(string str)
+{
+  // This code assumes that the string is syntactically correct, thanks to the lexer. All it does is produce the string's actual value (escape sequences resolved), not the source text that spells it.
+  // Ex. if I call `strparse("\\0");`, I should get a string containing the NUL character (U+0000), not EOF.
+  // Ex. if I call `strparse("\\u1234");`, I should get a string containing the Unicode character U+1234.
+
+  char c;
+  string out = "";
+  string unicode = "";
+  string octal = "";
+  int uni = 4;
+  int oct = 3;
+  int uchar = 0;
+  int ochar = 0;
+  bool usequence = false;
+  bool osequence = false;
+  bool slash = false;
+  
+  vector<string> octal {"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\
+  "\017", "\020", "\021", "\022", "\023", "\024", "\025", "\026", "\027", "\030", "\031", "\032", "\033", "\034", "\035",\ 
+  "\036", "\037", "\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047", "\050", "\051", "\052", "\053", "\054",\
+  "\055", "\056", "\057", "\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067", "\070", "\071", "\072", "\073",\
+  "\074", "\075", "\076", "\077", "\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107", "\110", "\111", "\112",\
+  "\113", "\114", "\115", "\116", "\117", "\120", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\
+  "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ 
+  "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\
+  "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\
+  "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\
+  "\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007", "\010", "\011", "\012", "\013", "\014", "\015", "\016",\ };
+  
+  for (int i = 0; i < str.size(); i++) {
+    c = str[i];
+    
+    switch (c) {
+      case '\\':
+      	if (slash) {
+      	  out += c;
+      	  slash = false;
+      	} else {
+      	  slash = true;
+      	}
+        break;
+      case '0':
+      	if (slash) {
+      	  out += '\0';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 'a':
+      	if (slash) {
+      	  out += '\a';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case '"':
+      	if (slash) {
+      	  out += '\"';
+      	  slash = false;
+      	} else {
+	  // this won't run: it'll have been corrected in the lexer.
+	  ;	
+      	}
+      case '\'':
+      	if (slash) {
+      	  out += '\'';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case '?':
+      	if (slash) {
+      	  out += '\?';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 'b':
+      	if (slash) {
+      	  out += '\b';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 'f':
+      	if (slash) {
+      	  out += '\f';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 'n':
+      	if (slash) {
+      	  out += '\n';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 'r':
+      	if (slash) {
+      	  out += '\r';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 't':
+      	if (slash) {
+      	  out += '\t';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 'v':
+      	if (slash) {
+      	  out += '\v';
+      	  slash = false;
+      	} else {
+      	  out += c;
+      	}
+      case 'u':
+      	if (slash) {
+      	  usequence = true;
+      	} else {
+      	  out += c;
+      	}
+      case 'o':
+        if (slash) {
+      	  osequence = true;
+      	} else {
+      	  out += c;
+      	}
+      default:
+      	if (slash & !usequence & !osequence) {
+      	  cout << "Invalid escape sequence \\" << c << " .";
+      	  exit(1);
+      	} else if (usequence) {
+      	} else if (osequence) {
+      	  if (ochars.find(c) != 1) {
+      	    oct--;
+      	    
+      	    if (oct == 0) {
+      	      oct = 3;
+      	      out += ""
+      	    }
+      	  } else {
+      	    cout << "Invalid octal sequence with " << c << " .";
+       	    exit(1);
+      	  }
+      	} else {
+      	  out += c;
+      	}
+  }
+} */
diff --git a/src/crate/lex.cpp b/src/crate/lex.cpp
@@ -25,6 +25,7 @@ vector<Token> lex(const string src)
 
   bool unicode = false;
   bool octal = false;
+  bool done = false;
 
   int uni = 4;
   int oct = 3;
@@ -155,6 +156,11 @@ vector<Token> lex(const string src)
         load_type = "";
         ok = false;
         slash = false;
+      } else if (load_type == "char") {
+        cout << "[" << row << ", " << col << "] Unexpected EOL (unfinished-chr-with-eol)";
+        load_type = "";
+        ok = false;
+        slash = false;
       }
       col = 0;
       eol = true;
@@ -169,7 +175,6 @@ vector<Token> lex(const string src)
     }
     // string
     else if (load_type == "string") {
-      load_var += c;
       if (c == '\\') {
         if (slash) {
           slash = false;
@@ -224,11 +229,99 @@ vector<Token> lex(const string src)
             cur.row = row;
             cur.col = col;
             tlist.push_back(cur);
+
+	    load_type = "";
+	    load_var = "";
           }
         }
+
+	if (load_type == "") {
+	  // no more string
+	  ;
+	} else {
+	  load_var += c;
+	}
+      }
+    }
+    // character
+    else if (load_type == "char") {
+      if (done == true & c != '\'') {
+      	cout << "[" << row << ", " << col << "] Expected end of character (expect-char-end)";
+      	ok = false;
+      	load_type = "";
+      	load_var = "";
       }
+
+      if (c == '\\') {
+        if (slash) {
+          slash = false;
+          done = true;
+        } else {
+          slash = true;
+        }
+      } else {
+        if (slash) {
+          if (c == 'u') { 
+            unicode = true; 
+          } else if (c == 'o') {
+            octal = true;
+          } else {
+            if (unicode) {
+              if (hex.find(c) != -1) {
+                uni--;
+                if (uni == 0) {
+                  unicode = false;
+                  uni = 4;
+                }
+              } else {
+                unicode = false;
+                uni = 4;
+                load_type = "";
+                load_var = "";
+
+                cout << "[" << row << ", " << col << "] Non-hexadecimal character in unicode sequence (bad-unicode).";
+              }
+            } else if (octal) {
+              if (octl.find(c) != -1) {
+                oct--;
+                if (oct == 0) {
+                  octal = false;
+                  oct = 3;
+                }
+              } else {
+                octal = false;
+                oct = 3;
+                load_type = "";
+                load_var = "";
 
-      load_var += c;
+                cout << "[" << row << ", " << col << "] Non-octal character in octal sequence (bad-octal).";
+              }
+            } else {
+              slash = false;
+              done = true;
+            }
+          }
+        } else {
+          if (c == '\'') {
+            cur.ttype = CHR;        
+            cur.value = load_var;  
+            cur.row = row;
+            cur.col = col;
+            tlist.push_back(cur);
+
+	    done = false;
+	    load_type = "";
+	    load_var = "";
+          }
+        }
+
+	if (load_type == "") {
+	  // no more character
+	  ;
+	} else {
+	  load_var += c;
+	}
+      }
     }
     // start of comment
     else if (c == '$') { 
@@ -361,7 +454,72 @@ vector<Token> lex(const string src)
       load_type = "";
       load_var = "";
       load_type = "string";
-      load_var += c;
+    }
+    // start of character
+    else if (c == '\'') {
+      if (load_type == "") {
+        ;
+      } else if (load_type == "alpha") {
+        if (keys.find(load_var) != keys.end()) {
+          cur.ttype = keys[load_var];
+          cur.value = load_var;
+          cur.row = row;
+          cur.col = col;
+          tlist.push_back(cur);
+        } else {
+          cur.ttype = ID;
+          cur.value = load_var;
+          cur.row = row;
+          cur.col = col;
+          tlist.push_back(cur);
+        }
+      } else if (load_type == "int") {
+        cur.ttype = INT;        
+        cur.value = load_var;
+        cur.row = row;
+        cur.col = col;
+        tlist.push_back(cur);
+      } else if (load_type == "float") {
+        cur.ttype = FLOAT;        
+        cur.value = load_var;  
+        cur.row = row;
+        cur.col = col;
+        tlist.push_back(cur);
+      } else if (load_type == "operational") {
+        if (ops.find(load_var) != ops.end()) {
+          cur.ttype = ops[load_var];
+          cur.value = load_var;
+          cur.row = row;
+          cur.col = col;
+          tlist.push_back(cur);
+        } else {
+          cout << "[" << row << ", " << col << "] Invalid operator " << load_var << " .\n";
+          ok = false;
+          load_type = "";
+          load_var = "";
+        }
+      } else if (load_type == "period") {
+        cur.ttype = ARGS;
+        cur.value = load_var;
+        cur.row = row;
+        cur.col = col;
+        tlist.push_back(cur);
+      } else if (load_type == "splat") {
+        cur.ttype = KWARGS;
+        cur.value = load_var;
+        cur.row = row;
+        cur.col = col;
+        tlist.push_back(cur);
+      } else {
+        cout << "[" << row << ", " << col << "] we're so sorry. something went wrong with the lexical analyzer. \n\tplease notify me at silas-wr/crate on github.\n";
+        ok = false; // make it uncompilable
+        load_type = "";
+        load_var = "";
+      }
+
+      load_type = "";
+      load_var = "";
+      load_type = "char";
     }
     // alphabetical
     else if (alphabet.find(c) != -1) {

diff --git a/src/crate/testlex.crate b/src/crate/testlex.crate
@@ -117,7 +117,7 @@ $ params
 ...
 
 $ objects
-$ 'a'
+'a'
 "hello"
 $ 1
 $ 12
-Original file line number
+Diff line change
@@ Expand Up / @@ -117,7 +117,7 @@ $ params @@
     ...
     $ objects
-    $ 'a'
+    'a'
     "hello"
     $ 1
     $ 12
@@ Expand Down @@