- refactored lexer

- updated lexer to allow hex and binary integer literals
- updated parser with support for writing to pointers
- updated code generation to support writing to pointers
- fixed a codegen bug where call arguments were loaded from incorrect offsets
  because caller-saved registers were saved before the arguments were evaluated.
This commit is contained in:
2026-02-03 15:37:38 +00:00
parent ce2eda72a0
commit 7973b2afca
3 changed files with 580 additions and 118 deletions
+34 -8
View File
@@ -29,8 +29,10 @@ static GLOBAL_METHODS: LazyLock<HashMap<&str, &str>> = LazyLock::new(|| {
("println", "print::println"), ("println", "print::println"),
("printnum", "print::print_num"), ("printnum", "print::print_num"),
("print_space", "print::print_whitespace"), ("print_space", "print::print_whitespace"),
("print_newline", "print::print_newline"),
("print_char", "print::print_byte"), ("print_char", "print::print_byte"),
("print_word", "print::print_word"), ("print_word", "print::print_word"),
("print_hex", "print::print_hex_word"),
]) ])
}); });
@@ -252,6 +254,19 @@ impl CodeGenerator {
Statement::Break => unimplemented!(), Statement::Break => unimplemented!(),
Statement::Continue => unimplemented!(), Statement::Continue => unimplemented!(),
// Pointer store: evaluate the value, then the destination address, then emit the store.
Statement::PtrWrite { ptr, value } => {
    // The value is generated first, so its register is still allocated
    // while the address expression is generated.
    let (value_reg, value_code) = self.generate_expression(value, true)?;
    code.extend(value_code);

    let (addr_reg, addr_code) = self.generate_expression(ptr, true)?;
    code.extend(addr_code);

    // NOTE(review): presumably `stw <value>, <address>` is a word store;
    // confirm the operand order against the target assembler.
    code.push(format!("\tstw {}, {}", value_reg, addr_reg));

    // Neither temporary is needed once the store has been emitted.
    self.allocator.free_temp(&value_reg);
    self.allocator.free_temp(&addr_reg);
}
Statement::Assign { varname, value } => { Statement::Assign { varname, value } => {
// Evaluate expression // Evaluate expression
let (result_reg, expr_code) = self.generate_expression(value, true)?; let (result_reg, expr_code) = self.generate_expression(value, true)?;
@@ -540,6 +555,14 @@ impl CodeGenerator {
} }
Expression::Call { name, args } => { Expression::Call { name, args } => {
// first evaluate all the args we're going to need
let mut arg_regs = Vec::new();
for arg in args.iter().rev() {
let (arg_reg, arg_code) = self.generate_expression(arg, true)?;
code.extend(arg_code);
arg_regs.push(arg_reg);
}
// Save caller-saved registers and track which ones we saved // Save caller-saved registers and track which ones we saved
let saved_regs = self.allocator.get_caller_saved_registers(); let saved_regs = self.allocator.get_caller_saved_registers();
for reg in &saved_regs { for reg in &saved_regs {
@@ -547,12 +570,12 @@ impl CodeGenerator {
} }
// Evaluate and push arguments in reverse order // Evaluate and push arguments in reverse order
let mut arg_regs = Vec::new(); for (i, arg_reg) in arg_regs.iter().enumerate() {
for arg in args.iter().rev() { code.push(format!(
let (arg_reg, arg_code) = self.generate_expression(arg, true)?; "\tpush {} // push arg {}",
code.extend(arg_code); arg_reg,
code.push(format!("\tpush {}", arg_reg)); args.len() - 1 - i
arg_regs.push(arg_reg); ));
} }
if GLOBAL_METHODS.contains_key(name.name.as_str()) { if GLOBAL_METHODS.contains_key(name.name.as_str()) {
@@ -564,10 +587,11 @@ impl CodeGenerator {
return Err(CompilerError::Undefined(name.clone())); return Err(CompilerError::Undefined(name.clone()));
} }
let result_reg = String::new(); let result_reg: String;
if use_result { if use_result {
let (result_reg, result_alloc) = self.allocator.alloc_temp()?; let (temp_result_reg, result_alloc) = self.allocator.alloc_temp()?;
result_reg = temp_result_reg;
code.extend(result_alloc); code.extend(result_alloc);
code.push(format!("\tpop {}", result_reg)); code.push(format!("\tpop {}", result_reg));
@@ -579,6 +603,8 @@ impl CodeGenerator {
} }
} }
} else { } else {
result_reg = "zero".to_string();
// Clean up arguments // Clean up arguments
if args.len() > 0 { if args.len() > 0 {
for _ in 0..(args.len()) { for _ in 0..(args.len()) {
+503 -110
View File
@@ -20,7 +20,7 @@ pub enum Token {
// Identifiers and literals // Identifiers and literals
Identifier(String), Identifier(String),
String(String), String(String),
Integer(u32), Integer(u64),
Char(char), Char(char),
// Symbols // Symbols
@@ -31,13 +31,12 @@ pub enum Token {
Semicolon, // ; Semicolon, // ;
Colon, // : Colon, // :
Comma, // , Comma, // ,
// Pipe, // |
// Operators // Operators
Plus, // + Plus, // +
Minus, // - Minus, // -
Star, // * Star, // *
Amphersand, Amphersand, // &
Slash, // / Slash, // /
Assign, // = Assign, // =
EqualEqual, // == EqualEqual, // ==
@@ -80,7 +79,6 @@ impl Token {
Token::Colon => "Colon", Token::Colon => "Colon",
Token::Comma => "Comma", Token::Comma => "Comma",
Token::RightArrow => "RightArrow", Token::RightArrow => "RightArrow",
// Token::Pipe => "Pipe",
Token::Plus => "Plus", Token::Plus => "Plus",
Token::Minus => "Minus", Token::Minus => "Minus",
Token::Star => "Star", Token::Star => "Star",
@@ -139,30 +137,258 @@ impl<'a> Lexer<'a> {
} }
} }
/// Skips a `//` comment, including the newline that terminates it.
///
/// Expects `self.current` to be the first `/`. Stops at end of input if the
/// comment is not followed by a newline. Bumps `self.line` when the
/// terminating newline is consumed.
fn skip_line_comment(&mut self) {
    // Step over the leading `//`.
    self.advance();
    self.advance();

    loop {
        match self.current {
            // End of input: nothing left to skip.
            None => return,
            // The newline belongs to the comment; consume it and stop.
            Some('\n') => {
                self.line += 1;
                self.advance();
                return;
            }
            Some(_) => {
                self.advance();
            }
        }
    }
}
/// Skips a `/* ... */` comment. Nesting is not supported: the first `*/` ends it.
///
/// Expects `self.current` to be the opening `/`. Every newline inside the
/// comment bumps `self.line`.
///
/// # Errors
/// Returns a message naming the comment's starting line when the input ends
/// before a closing `*/` is found.
fn skip_block_comment(&mut self) -> Result<(), String> {
    // Step over the opening `/*`.
    self.advance();
    self.advance();
    let opened_on = self.line;

    while let Some(ch) = self.current {
        match ch {
            '\n' => self.line += 1,
            // `*` directly followed by `/` closes the comment.
            '*' if matches!(self.peek(), Some(&'/')) => {
                self.advance();
                self.advance();
                return Ok(());
            }
            _ => {}
        }
        self.advance();
    }

    Err(format!(
        "Unterminated block comment starting at line {}",
        opened_on
    ))
}
/// Skips any run of whitespace, `//` comments and `/* */` comments.
///
/// Returns with `self.current` on the first character that starts a real
/// token (or at end of input). A lone `/` is left in place so it can be lexed
/// as an operator. An unterminated block comment is reported on stderr and
/// otherwise ignored.
fn skip_whitespace_and_comments(&mut self) {
    loop {
        self.skip_whitespace();

        // Only a `/` can start a comment.
        if self.current != Some('/') {
            return;
        }

        match self.peek().copied() {
            Some('/') => self.skip_line_comment(),
            Some('*') => {
                if let Err(msg) = self.skip_block_comment() {
                    eprintln!("Lexer error: {}", msg);
                }
            }
            // `/` followed by anything else (or nothing) is not a comment.
            _ => return,
        }
    }
}
fn read_identifier(&mut self) -> String { fn read_identifier(&mut self) -> String {
let mut ident = String::new(); let mut ident = String::new();
// Include the current character if it's valid
if let Some(c) = self.current {
if c.is_alphabetic() || c == '_' {
ident.push(c);
}
}
// Read remaining characters
while let Some(&c) = self.peek() { while let Some(&c) = self.peek() {
if c.is_alphanumeric() || c == '_' { if c.is_alphanumeric() || c == '_' {
ident.push(c);
self.advance(); self.advance();
ident.push(c);
} else { } else {
break; break;
} }
} }
ident ident
} }
fn read_number(&mut self) -> i64 { fn keyword_or_identifier(&mut self) -> Token {
let mut num_str = String::from(self.current.unwrap()); let ident = self.read_identifier();
match ident.as_str() {
"fn" => Token::Fn,
"if" => Token::If,
"else" => Token::Else,
"while" => Token::While,
"loop" => Token::Loop,
"break" => Token::Break,
"return" => Token::Return,
"continue" => Token::Continue,
"include" => Token::Include,
"let" => Token::Let,
"const" => Token::Const,
"static" => Token::Static,
_ => Token::Identifier(ident),
}
}
/// Reads an integer literal, dispatching on an optional `0x`/`0b` prefix.
///
/// A leading `0` followed by `x`/`X` selects hexadecimal and `b`/`B` selects
/// binary; in both cases the two prefix characters are consumed before the
/// digit reader runs. Everything else is read as decimal.
///
/// # Panics
/// Panics if called at end of input (`self.current` is `None`).
fn read_number(&mut self) -> Result<u64, String> {
    let first = self.current.unwrap();

    // Only a leading zero can introduce a radix prefix.
    if first != '0' {
        return self.read_decimal_number();
    }

    match self.peek().copied() {
        Some('x') | Some('X') => {
            self.advance(); // consume '0'
            self.advance(); // consume 'x'
            self.read_hex_number()
        }
        Some('b') | Some('B') => {
            self.advance(); // consume '0'
            self.advance(); // consume 'b'
            self.read_binary_number()
        }
        _ => self.read_decimal_number(),
    }
}
fn read_decimal_number(&mut self) -> Result<u64, String> {
let mut num_str = String::new();
if let Some(c) = self.current {
num_str.push(c);
}
while let Some(&c) = self.peek() { while let Some(&c) = self.peek() {
if c.is_ascii_digit() { if c.is_ascii_digit() {
num_str.push(c);
self.advance(); self.advance();
num_str.push(c);
} else { } else {
break; break;
} }
} }
num_str.parse().unwrap()
num_str
.parse::<u64>()
.map_err(|_| format!("Invalid decimal number: {}", num_str))
}
/// Parses the digits of a hexadecimal literal after its `0x`/`0X` prefix.
///
/// Expects `self.current` to be the first character following the prefix.
/// The lexer only advances onto characters already seen to be hex digits, so
/// on success `self.current` is the last digit of the literal.
///
/// # Errors
/// Returns `Err` when the prefix is not immediately followed by a hex digit,
/// or when the digits cannot be parsed into a `u64`.
fn read_hex_number(&mut self) -> Result<u64, String> {
    // The literal must start with a digit right here. Without this early
    // return the lookahead loop below would step over the current character
    // and absorb hex digits that follow it (e.g. `0x 12` or `0xgA`),
    // defeating the "no digits" check.
    let first = match self.current {
        Some(c) if c.is_ascii_hexdigit() => c,
        _ => return Err("Invalid hexadecimal number: no digits after 0x".to_string()),
    };

    let mut num_str = String::from(first);
    while let Some(&c) = self.peek() {
        if !c.is_ascii_hexdigit() {
            break;
        }
        self.advance();
        num_str.push(c);
    }

    u64::from_str_radix(&num_str, 16)
        .map_err(|_| format!("Invalid hexadecimal number: {}", num_str))
}
/// Parses the digits of a binary literal after its `0b`/`0B` prefix.
///
/// Expects `self.current` to be the first character following the prefix.
/// The lexer only advances onto characters already seen to be `0` or `1`, so
/// on success `self.current` is the last digit of the literal.
///
/// # Errors
/// Returns `Err` when the prefix is not immediately followed by `0` or `1`,
/// or when the digits cannot be parsed into a `u64`.
fn read_binary_number(&mut self) -> Result<u64, String> {
    // The literal must start with a digit right here. Without this early
    // return the lookahead loop below would step over the current character
    // and absorb binary digits that follow it (e.g. `0b21` would yield 1),
    // defeating the "no digits" check.
    let first = match self.current {
        Some(c @ ('0' | '1')) => c,
        _ => return Err("Invalid binary number: no digits after 0b".to_string()),
    };

    let mut num_str = String::from(first);
    while let Some(&c) = self.peek() {
        if c != '0' && c != '1' {
            break;
        }
        self.advance();
        num_str.push(c);
    }

    u64::from_str_radix(&num_str, 2)
        .map_err(|_| format!("Invalid binary number: {}", num_str))
}
/// Reads a double-quoted string literal and returns its contents with escapes resolved.
///
/// Expects `self.current` to be the opening `"`. On success the lexer is left
/// on the closing `"`; the quote itself is not consumed here.
///
/// Recognised escapes are `\n`, `\t`, `\r`, `\\` and `\"`; any other escaped
/// character is kept as-is without the backslash.
///
/// # Errors
/// Returns `Err` when the input ends before the closing quote, or directly
/// after a backslash.
///
/// NOTE(review): newlines inside the literal do not update `self.line` in this function.
fn read_string(&mut self) -> Result<String, String> {
self.advance(); // Skip the opening quote
let mut s = String::new();
while let Some(c) = self.current {
// An unescaped quote ends the literal; leave `current` on it.
if c == '"' {
return Ok(s);
}
// Handle escape sequences
if c == '\\' {
// Move onto the character being escaped.
self.advance();
if let Some(escaped) = self.current {
let escaped_char = match escaped {
'n' => '\n',
't' => '\t',
'r' => '\r',
'\\' => '\\',
'"' => '"',
_ => escaped, // For now, just use the character as-is
};
s.push(escaped_char);
} else {
return Err("Unexpected end of string after escape".to_string());
}
} else {
s.push(c);
}
self.advance();
}
// Ran out of input without seeing the closing quote.
Err("Unterminated string literal".to_string())
} }
fn match_next(&mut self, expected: char) -> bool { fn match_next(&mut self, expected: char) -> bool {
@@ -175,104 +401,140 @@ impl<'a> Lexer<'a> {
} }
} }
pub fn next_token(&mut self) -> Token { fn scan_single_char_token(&mut self, c: char) -> Option<Token> {
self.skip_whitespace(); match c {
'(' => Some(Token::LeftParen),
')' => Some(Token::RightParen),
'{' => Some(Token::LeftBrace),
'}' => Some(Token::RightBrace),
';' => Some(Token::Semicolon),
':' => Some(Token::Colon),
',' => Some(Token::Comma),
'&' => Some(Token::Amphersand),
'+' => Some(Token::Plus),
'*' => Some(Token::Star),
_ => None,
}
}
let token = match self.current { fn scan_operator(&mut self, c: char) -> Option<Token> {
Some('(') => Token::LeftParen, match c {
Some(')') => Token::RightParen, '-' => Some(if self.match_next('>') {
Some('{') => Token::LeftBrace, Token::RightArrow
Some('}') => Token::RightBrace, } else {
Some(';') => Token::Semicolon, Token::Minus
Some(':') => Token::Colon, }),
Some(',') => Token::Comma, '!' => Some(if self.match_next('=') {
Some('&') => Token::Amphersand, Token::BangEqual
// Some('|') => Token::Pipe, } else {
Some('+') => Token::Plus, Token::Bang
Some('*') => Token::Star, }),
Some('/') => Token::Slash, '=' => Some(if self.match_next('=') {
Some('-') => { Token::EqualEqual
if self.match_next('>') { } else {
Token::RightArrow Token::Assign
} else { }),
Token::Minus '<' => Some(if self.match_next('=') {
} Token::LessEqual
} } else {
Some('!') => { Token::Less
if self.match_next('=') { }),
Token::BangEqual '>' => Some(if self.match_next('=') {
} else { Token::GreaterEqual
Token::Bang } else {
} Token::Greater
} }),
Some('=') => { '/' => {
if self.match_next('=') { // Check if it's a comment or division
Token::EqualEqual if let Some(&next) = self.peek() {
} else { if next == '/' || next == '*' {
Token::Assign // It's a comment, don't consume it here
} // Let skip_whitespace_and_comments handle it
} None
Some('<') => { } else {
if self.match_next('=') { Some(Token::Slash)
Token::LessEqual
} else {
Token::Less
}
}
Some('>') => {
if self.match_next('=') {
Token::GreaterEqual
} else {
Token::Greater
}
}
Some('"') => {
self.advance(); // Skip the opening quote
let mut s = String::new();
while let Some(c) = self.current {
if c == '"' {
break;
} }
s.push(c);
self.advance();
}
Token::String(s)
}
Some(c) => {
if c.is_alphabetic() || c == '_' {
let mut ident = c.to_string();
ident.push_str(&self.read_identifier());
match ident.as_str() {
"fn" => Token::Fn,
"if" => Token::If,
"else" => Token::Else,
"while" => Token::While,
"loop" => Token::Loop,
"break" => Token::Break,
"return" => Token::Return,
"continue" => Token::Continue,
"include" => Token::Include,
"let" => Token::Let,
"const" => Token::Const,
"static" => Token::Static,
_ => Token::Identifier(ident),
}
} else if c.is_ascii_digit() {
Token::Integer(self.read_number() as u32)
} else { } else {
// Skip unknown characters for now Some(Token::Slash)
self.advance();
return self.next_token();
} }
} }
None => Token::Eof, _ => None,
}
}
pub fn next_token(&mut self) -> Token {
self.skip_whitespace_and_comments();
let Some(c) = self.current else {
return Token::Eof;
}; };
if token != Token::Eof { // Try single-character tokens first
if let Some(token) = self.scan_single_char_token(c) {
self.advance(); self.advance();
return token;
} }
token // Try operators (may be multi-character)
if let Some(token) = self.scan_operator(c) {
self.advance();
return token;
}
// String literals
if c == '"' {
let token = match self.read_string() {
Ok(s) => Token::String(s),
Err(e) => {
eprintln!("Lexer error on line {}: {}", self.line, e);
// Skip to next quote or end
while let Some(ch) = self.current {
if ch == '"' || ch == '\n' {
break;
}
self.advance();
}
Token::String(String::new())
}
};
self.advance();
return token;
}
// Identifiers and keywords
if c.is_alphabetic() || c == '_' {
let token = self.keyword_or_identifier();
self.advance();
return token;
}
// Numbers (decimal, hex, binary)
if c.is_ascii_digit() {
let token = match self.read_number() {
Ok(num) => Token::Integer(num),
Err(e) => {
eprintln!("Lexer error on line {}: {}", self.line, e);
// Skip invalid number
while let Some(&ch) = self.peek() {
if !ch.is_alphanumeric() {
break;
}
self.advance();
}
Token::Integer(0)
}
};
self.advance();
return token;
}
// Unknown character - skip it
eprintln!(
"Lexer warning on line {}: Skipping unknown character '{}'",
self.line, c
);
self.advance();
self.next_token()
} }
} }
@@ -318,6 +580,41 @@ mod tests {
assert_eq!(lexer.next_token(), Token::Eof); assert_eq!(lexer.next_token(), Token::Eof);
} }
/// Hexadecimal literals lex to their numeric value, one token per literal.
#[test]
fn test_hex_numbers() {
    let mut lexer = Lexer::new("0xFF 0x10 0xDEADBEEF 0x0");

    for expected in [0xFF, 0x10, 0xDEADBEEF, 0x0] {
        assert_eq!(lexer.next_token(), Token::Integer(expected));
    }
    assert_eq!(lexer.next_token(), Token::Eof);
}
/// Binary literals lex to their numeric value, one token per literal.
#[test]
fn test_binary_numbers() {
    let mut lexer = Lexer::new("0b1010 0b0 0b11111111 0b1");

    for expected in [0b1010, 0b0, 0b11111111, 0b1] {
        assert_eq!(lexer.next_token(), Token::Integer(expected));
    }
    assert_eq!(lexer.next_token(), Token::Eof);
}
/// Decimal, hexadecimal and binary literals can appear side by side.
#[test]
fn test_mixed_number_formats() {
    let mut lexer = Lexer::new("42 0xFF 0b1010");

    for expected in [42, 255, 10] {
        assert_eq!(lexer.next_token(), Token::Integer(expected));
    }
    assert_eq!(lexer.next_token(), Token::Eof);
}
#[test] #[test]
fn test_operators() { fn test_operators() {
let input = "= == ! != < <= > >="; let input = "= == ! != < <= > >=";
@@ -334,6 +631,19 @@ mod tests {
assert_eq!(lexer.next_token(), Token::Eof); assert_eq!(lexer.next_token(), Token::Eof);
} }
/// `\n` and `\t` inside string literals are translated to the real control characters.
#[test]
fn test_string_with_escapes() {
    let mut lexer = Lexer::new(r#""hello\nworld" "tab\there""#);

    for expected in ["hello\nworld", "tab\there"] {
        assert_eq!(lexer.next_token(), Token::String(expected.to_string()));
    }
    assert_eq!(lexer.next_token(), Token::Eof);
}
#[test] #[test]
fn test_example_syntax() { fn test_example_syntax() {
let input = r#" let input = r#"
@@ -349,25 +659,108 @@ mod tests {
let mut lexer = Lexer::new(input); let mut lexer = Lexer::new(input);
// Skip whitespace and newlines
while let Some(c) = lexer.current {
if !c.is_whitespace() {
break;
}
lexer.advance();
}
// Test the first few tokens // Test the first few tokens
assert_eq!(lexer.next_token(), Token::Identifier("main".to_string())); assert_eq!(lexer.next_token(), Token::Identifier("main".to_string()));
assert_eq!(lexer.next_token(), Token::Colon); assert_eq!(lexer.next_token(), Token::Colon);
assert_eq!(lexer.next_token(), Token::Identifier("Func".to_string())); assert_eq!(lexer.next_token(), Token::Identifier("Func".to_string()));
assert_eq!(lexer.next_token(), Token::Assign); assert_eq!(lexer.next_token(), Token::Assign);
// assert_eq!(lexer.next_token(), Token::Pipe);
assert_eq!(lexer.next_token(), Token::Identifier("x".to_string())); assert_eq!(lexer.next_token(), Token::Identifier("x".to_string()));
assert_eq!(lexer.next_token(), Token::Colon); assert_eq!(lexer.next_token(), Token::Colon);
assert_eq!(lexer.next_token(), Token::Identifier("U32".to_string())); assert_eq!(lexer.next_token(), Token::Identifier("U32".to_string()));
assert_eq!(lexer.next_token(), Token::Comma); assert_eq!(lexer.next_token(), Token::Comma);
}
// The rest of the tokens would be tested similarly #[test]
fn test_line_comments() {
    // Both a trailing comment and a whole-line comment must vanish from the token stream.
    let input = r#"
let x = 5; // this is a comment
// this is another comment
let y = 10;
"#;
    let mut lexer = Lexer::new(input);
    let ident = |name: &str| Token::Identifier(name.to_string());

    let expected = [
        Token::Let,
        ident("x"),
        Token::Assign,
        Token::Integer(5),
        Token::Semicolon,
        // Comment should be skipped
        Token::Let,
        ident("y"),
        Token::Assign,
        Token::Integer(10),
        Token::Semicolon,
        Token::Eof,
    ];
    for token in expected {
        assert_eq!(lexer.next_token(), token);
    }
}
/// A block comment spanning several lines is skipped entirely.
#[test]
fn test_block_comments() {
    let input = r#"
let x = 5; /* this is a
multiline block comment */
let y = 10;
"#;
    let mut lexer = Lexer::new(input);
    let ident = |name: &str| Token::Identifier(name.to_string());

    let expected = [
        Token::Let,
        ident("x"),
        Token::Assign,
        Token::Integer(5),
        Token::Semicolon,
        // Block comment should be skipped
        Token::Let,
        ident("y"),
        Token::Assign,
        Token::Integer(10),
        Token::Semicolon,
        Token::Eof,
    ];
    for token in expected {
        assert_eq!(lexer.next_token(), token);
    }
}
/// A `/` that does not start a comment is lexed as the division operator.
#[test]
fn test_division_operator() {
    let mut lexer = Lexer::new("x / y");

    let expected = [
        Token::Identifier("x".to_string()),
        Token::Slash,
        Token::Identifier("y".to_string()),
        Token::Eof,
    ];
    for token in expected {
        assert_eq!(lexer.next_token(), token);
    }
}
/// Division, both comment styles and `/=` (lexed as `/` then `=`) in one input.
#[test]
fn test_mixed_comments_and_operators() {
    let input = r#"
x / y // division
/* block comment */ z = 10
a /= b // this won't work yet
"#;
    let mut lexer = Lexer::new(input);
    let ident = |name: &str| Token::Identifier(name.to_string());

    let expected = [
        // `x / y // division`
        ident("x"),
        Token::Slash,
        ident("y"),
        // `/* block comment */ z = 10`
        ident("z"),
        Token::Assign,
        Token::Integer(10),
        // `a /= b`: there is no compound-assignment token, so two tokens come out
        ident("a"),
        Token::Slash,
        Token::Assign,
        ident("b"),
        Token::Eof,
    ];
    for token in expected {
        assert_eq!(lexer.next_token(), token);
    }
}
/// Pins down the non-nesting behaviour of block comments.
#[test]
fn test_nested_block_comment_attempt() {
// Note: This lexer doesn't support nested block comments
// The input opens a second `/*` inside the first comment to exercise that case.
let input = "/* outer /* inner */ still in comment? */ x";
let mut lexer = Lexer::new(input);
// The comment ends at the first */
// so the words after it are lexed as ordinary identifiers.
assert_eq!(lexer.next_token(), Token::Identifier("still".to_string()));
assert_eq!(lexer.next_token(), Token::Identifier("in".to_string()));
assert_eq!(lexer.next_token(), Token::Identifier("comment".to_string()));
// The remaining input (`? */ x`) is not asserted by this test.
} }
} }
+43
View File
@@ -247,6 +247,45 @@ impl Parser {
return ParseResult::Accept(Statement::Continue); return ParseResult::Accept(Statement::Continue);
} }
// Handle writes through a pointer: `*ident = expr;` or `*(expr) = expr;`.
if expect_tt!(self.peek_next()?, Star).accepted() {
    // Consume the `*`.
    self.next()?;

    // The pointer operand is either a bare identifier or a parenthesised expression.
    let left = if expect_tt!(self.peek_next()?, Identifier).accepted() {
        let identifier = self.parse_identifier()?;
        Expression::Variable {
            name: identifier,
            expr_type: None,
        }
    } else if expect_tt!(self.peek_next()?, LeftParen).accepted() {
        self.next()?;
        let expr = self.parse_expression()?;
        // A missing `)` is a syntax error: propagate it, as is done for `=`
        // and `;` below, instead of discarding the result of the check.
        expect_tt!(self.next()?, RightParen)?;
        expr
    } else {
        return ParseResult::Reject(CompilerError::UnexpectedToken(
            self.peek_next()?,
        ));
    };

    expect_tt!(self.next()?, Assign)?;

    let right = self.parse_expression()?;

    // The statement must be terminated by a semicolon.
    expect_tt!(self.next()?, Semicolon)?;

    return ParseResult::Accept(Statement::PtrWrite {
        ptr: left,
        value: right,
    });
}
// handle let statements (declarations) // handle let statements (declarations)
if expect_tt!(self.peek_next()?, Let).accepted() { if expect_tt!(self.peek_next()?, Let).accepted() {
self.next(); self.next();
@@ -573,6 +612,10 @@ pub enum Statement {
varname: String, varname: String,
value: Expression, value: Expression,
}, },
/// Store through a pointer, written in source as `*ptr = value;`.
PtrWrite {
/// Expression that yields the destination address.
ptr: Expression,
/// Expression whose result is written to that address.
value: Expression,
},
Expression { Expression {
expr: Expression, expr: Expression,
}, },