From 7973b2afcaa26a37ac153aeb85757823a99fa8f7 Mon Sep 17 00:00:00 2001 From: zxq5 Date: Tue, 3 Feb 2026 15:37:38 +0000 Subject: [PATCH] - refactored lexer - updated lexer to allow hex and binary integer literals - updated parser with support for writing to pointers - updated code generation to support writing to pointers - fixed a bug with codegen where args are loaded from incorrect offsets due to saving registers prior to calling. --- compiler/src/codegen.rs | 42 ++- compiler/src/lexer.rs | 613 +++++++++++++++++++++++++++++++++------- compiler/src/parser.rs | 43 +++ 3 files changed, 580 insertions(+), 118 deletions(-) diff --git a/compiler/src/codegen.rs b/compiler/src/codegen.rs index 878f167..54d0ba3 100644 --- a/compiler/src/codegen.rs +++ b/compiler/src/codegen.rs @@ -29,8 +29,10 @@ static GLOBAL_METHODS: LazyLock> = LazyLock::new(|| { ("println", "print::println"), ("printnum", "print::print_num"), ("print_space", "print::print_whitespace"), + ("print_newline", "print::print_newline"), ("print_char", "print::print_byte"), ("print_word", "print::print_word"), + ("print_hex", "print::print_hex_word"), ]) }); @@ -252,6 +254,19 @@ impl CodeGenerator { Statement::Break => unimplemented!(), Statement::Continue => unimplemented!(), + Statement::PtrWrite { ptr, value } => { + let (result_reg, expr_code) = self.generate_expression(value, true)?; + code.extend(expr_code); + + let (ptr_reg, ptr_code) = self.generate_expression(ptr, true)?; + code.extend(ptr_code); + + code.push(format!("\tstw {}, {}", result_reg, ptr_reg)); + + self.allocator.free_temp(&result_reg); + self.allocator.free_temp(&ptr_reg); + } + Statement::Assign { varname, value } => { // Evaluate expression let (result_reg, expr_code) = self.generate_expression(value, true)?; @@ -540,6 +555,14 @@ impl CodeGenerator { } Expression::Call { name, args } => { + // first evaluate all the args we're going to need + let mut arg_regs = Vec::new(); + for arg in args.iter().rev() { + let (arg_reg, arg_code) = self.generate_expression(arg, true)?; + code.extend(arg_code); + arg_regs.push(arg_reg); + } + // Save caller-saved registers and track which ones we saved let saved_regs = self.allocator.get_caller_saved_registers(); for reg in &saved_regs { @@ -547,12 +570,12 @@ impl CodeGenerator { } // Evaluate and push arguments in reverse order - let mut arg_regs = Vec::new(); - for arg in args.iter().rev() { - let (arg_reg, arg_code) = self.generate_expression(arg, true)?; - code.extend(arg_code); - code.push(format!("\tpush {}", arg_reg)); - arg_regs.push(arg_reg); + for (i, arg_reg) in arg_regs.iter().enumerate() { + code.push(format!( + "\tpush {} // push arg {}", + arg_reg, + args.len() - 1 - i + )); } if GLOBAL_METHODS.contains_key(name.name.as_str()) { @@ -564,10 +587,11 @@ impl CodeGenerator { return Err(CompilerError::Undefined(name.clone())); } - let result_reg = String::new(); + let result_reg: String; if use_result { - let (result_reg, result_alloc) = self.allocator.alloc_temp()?; + let (temp_result_reg, result_alloc) = self.allocator.alloc_temp()?; + result_reg = temp_result_reg; code.extend(result_alloc); code.push(format!("\tpop {}", result_reg)); @@ -579,6 +603,8 @@ impl CodeGenerator { } } } else { + result_reg = "zero".to_string(); + // Clean up arguments if args.len() > 0 { for _ in 0..(args.len()) { diff --git a/compiler/src/lexer.rs b/compiler/src/lexer.rs index 142343f..0c6c3f9 100644 --- a/compiler/src/lexer.rs +++ b/compiler/src/lexer.rs @@ -20,7 +20,7 @@ pub enum Token { // Identifiers and literals Identifier(String), String(String), - Integer(u32), + Integer(u64), Char(char), // Symbols @@ -31,13 +31,12 @@ pub enum Token { Semicolon, // ; Colon, // : Comma, // , - // Pipe, // | // Operators - Plus, // + - Minus, // - - Star, // * - Amphersand, + Plus, // + + Minus, // - + Star, // * + Amphersand, // & Slash, // / Assign, // = EqualEqual, // == @@ -80,7 +79,6 @@ impl Token { Token::Colon => "Colon", Token::Comma => "Comma", Token::RightArrow => "RightArrow", - // Token::Pipe => "Pipe", Token::Plus => "Plus", Token::Minus => "Minus", Token::Star => "Star", @@ -139,30 +137,258 @@ impl<'a> Lexer<'a> { } } + fn skip_line_comment(&mut self) { + // Skip the two slashes + self.advance(); // first / + self.advance(); // second / + + // Skip until newline or EOF + while let Some(c) = self.current { + if c == '\n' { + self.line += 1; + self.advance(); + break; + } + self.advance(); + } + } + + fn skip_block_comment(&mut self) -> Result<(), String> { + // Skip the /* + self.advance(); // / + self.advance(); // * + + let start_line = self.line; + + // Look for */ + while let Some(c) = self.current { + if c == '\n' { + self.line += 1; + } + + if c == '*' { + if let Some(&next) = self.peek() { + if next == '/' { + self.advance(); // * + self.advance(); // / + return Ok(()); + } + } + } + + self.advance(); + } + + Err(format!( + "Unterminated block comment starting at line {}", + start_line + )) + } + + fn skip_whitespace_and_comments(&mut self) { + loop { + self.skip_whitespace(); + + // Check for comments + if let Some('/') = self.current { + if let Some(&next) = self.peek() { + match next { + '/' => { + self.skip_line_comment(); + continue; + } + '*' => { + if let Err(e) = self.skip_block_comment() { + eprintln!("Lexer error: {}", e); + } + continue; + } + _ => break, + } + } + } + + break; + } + } + fn read_identifier(&mut self) -> String { let mut ident = String::new(); + + // Include the current character if it's valid + if let Some(c) = self.current { + if c.is_alphabetic() || c == '_' { + ident.push(c); + } + } + + // Read remaining characters while let Some(&c) = self.peek() { if c.is_alphanumeric() || c == '_' { - ident.push(c); self.advance(); + ident.push(c); } else { break; } } + ident } - fn read_number(&mut self) -> i64 { - let mut num_str = String::from(self.current.unwrap()); + fn keyword_or_identifier(&mut self) -> Token { + let ident = self.read_identifier(); + + match ident.as_str() { + "fn" => Token::Fn, + "if" => Token::If, + "else" => Token::Else, + "while" => Token::While, + "loop" => Token::Loop, + "break" => Token::Break, + "return" => Token::Return, + "continue" => Token::Continue, + "include" => Token::Include, + "let" => Token::Let, + "const" => Token::Const, + "static" => Token::Static, + _ => Token::Identifier(ident), + } + } + + fn read_number(&mut self) -> Result { + let current = self.current.unwrap(); + + // Check for hex (0x) or binary (0b) prefix + if current == '0' { + if let Some(&next_char) = self.peek() { + match next_char { + 'x' | 'X' => { + self.advance(); // consume '0' + self.advance(); // consume 'x' + return self.read_hex_number(); + } + 'b' | 'B' => { + self.advance(); // consume '0' + self.advance(); // consume 'b' + return self.read_binary_number(); + } + _ => {} + } + } + } + + // Read decimal number + self.read_decimal_number() + } + + fn read_decimal_number(&mut self) -> Result { + let mut num_str = String::new(); + + if let Some(c) = self.current { + num_str.push(c); + } + while let Some(&c) = self.peek() { if c.is_ascii_digit() { - num_str.push(c); self.advance(); + num_str.push(c); } else { break; } } - num_str.parse().unwrap() + + num_str + .parse::() + .map_err(|_| format!("Invalid decimal number: {}", num_str)) + } + + fn read_hex_number(&mut self) -> Result { + let mut num_str = String::new(); + + // Read current character if it's a hex digit + if let Some(c) = self.current { + if c.is_ascii_hexdigit() { + num_str.push(c); + } + } + + while let Some(&c) = self.peek() { + if c.is_ascii_hexdigit() { + self.advance(); + num_str.push(c); + } else { + break; + } + } + + if num_str.is_empty() { + return Err("Invalid hexadecimal number: no digits after 0x".to_string()); + } + + u64::from_str_radix(&num_str, 16) + .map_err(|_| format!("Invalid hexadecimal number: {}", num_str)) + } + + fn read_binary_number(&mut self) -> Result { + let mut num_str = String::new(); + + // Read current character if it's a binary digit + if let Some(c) = self.current { + if c == '0' || c == '1' { + num_str.push(c); + } + } + + while let Some(&c) = self.peek() { + if c == '0' || c == '1' { + self.advance(); + num_str.push(c); + } else { + break; + } + } + + if num_str.is_empty() { + return Err("Invalid binary number: no digits after 0b".to_string()); + } + + u64::from_str_radix(&num_str, 2) + .map_err(|_| format!("Invalid binary number: {}", num_str)) + } + + fn read_string(&mut self) -> Result { + self.advance(); // Skip the opening quote + let mut s = String::new(); + + while let Some(c) = self.current { + if c == '"' { + return Ok(s); + } + + // Handle escape sequences + if c == '\\' { + self.advance(); + if let Some(escaped) = self.current { + let escaped_char = match escaped { + 'n' => '\n', + 't' => '\t', + 'r' => '\r', + '\\' => '\\', + '"' => '"', + _ => escaped, // For now, just use the character as-is + }; + s.push(escaped_char); + } else { + return Err("Unexpected end of string after escape".to_string()); + } + } else { + s.push(c); + } + + self.advance(); + } + + Err("Unterminated string literal".to_string()) } fn match_next(&mut self, expected: char) -> bool { @@ -175,104 +401,140 @@ impl<'a> Lexer<'a> { } } - pub fn next_token(&mut self) -> Token { - self.skip_whitespace(); + fn scan_single_char_token(&mut self, c: char) -> Option { + match c { + '(' => Some(Token::LeftParen), + ')' => Some(Token::RightParen), + '{' => Some(Token::LeftBrace), + '}' => Some(Token::RightBrace), + ';' => Some(Token::Semicolon), + ':' => Some(Token::Colon), + ',' => Some(Token::Comma), + '&' => Some(Token::Amphersand), + '+' => Some(Token::Plus), + '*' => Some(Token::Star), + _ => None, + } + } - let token = match self.current { - Some('(') => Token::LeftParen, - Some(')') => Token::RightParen, - Some('{') => Token::LeftBrace, - Some('}') => Token::RightBrace, - Some(';') => Token::Semicolon, - Some(':') => Token::Colon, - Some(',') => Token::Comma, - Some('&') => Token::Amphersand, - // Some('|') => Token::Pipe, - Some('+') => Token::Plus, - Some('*') => Token::Star, - Some('/') => Token::Slash, - Some('-') => { - if self.match_next('>') { - Token::RightArrow - } else { - Token::Minus - } - } - Some('!') => { - if self.match_next('=') { - Token::BangEqual - } else { - Token::Bang - } - } - Some('=') => { - if self.match_next('=') { - Token::EqualEqual - } else { - Token::Assign - } - } - Some('<') => { - if self.match_next('=') { - Token::LessEqual - } else { - Token::Less - } - } - Some('>') => { - if self.match_next('=') { - Token::GreaterEqual - } else { - Token::Greater - } - } - Some('"') => { - self.advance(); // Skip the opening quote - let mut s = String::new(); - while let Some(c) = self.current { - if c == '"' { - break; + fn scan_operator(&mut self, c: char) -> Option { + match c { + '-' => Some(if self.match_next('>') { + Token::RightArrow + } else { + Token::Minus + }), + '!' => Some(if self.match_next('=') { + Token::BangEqual + } else { + Token::Bang + }), + '=' => Some(if self.match_next('=') { + Token::EqualEqual + } else { + Token::Assign + }), + '<' => Some(if self.match_next('=') { + Token::LessEqual + } else { + Token::Less + }), + '>' => Some(if self.match_next('=') { + Token::GreaterEqual + } else { + Token::Greater + }), + '/' => { + // Check if it's a comment or division + if let Some(&next) = self.peek() { + if next == '/' || next == '*' { + // It's a comment, don't consume it here + // Let skip_whitespace_and_comments handle it + None + } else { + Some(Token::Slash) } - s.push(c); - self.advance(); - } - Token::String(s) - } - Some(c) => { - if c.is_alphabetic() || c == '_' { - let mut ident = c.to_string(); - ident.push_str(&self.read_identifier()); - match ident.as_str() { - "fn" => Token::Fn, - "if" => Token::If, - "else" => Token::Else, - "while" => Token::While, - "loop" => Token::Loop, - "break" => Token::Break, - "return" => Token::Return, - "continue" => Token::Continue, - "include" => Token::Include, - "let" => Token::Let, - "const" => Token::Const, - "static" => Token::Static, - _ => Token::Identifier(ident), - } - } else if c.is_ascii_digit() { - Token::Integer(self.read_number() as u32) } else { - // Skip unknown characters for now - self.advance(); - return self.next_token(); + Some(Token::Slash) } } - None => Token::Eof, + _ => None, + } + } + + pub fn next_token(&mut self) -> Token { + self.skip_whitespace_and_comments(); + + let Some(c) = self.current else { + return Token::Eof; }; - if token != Token::Eof { + // Try single-character tokens first + if let Some(token) = self.scan_single_char_token(c) { self.advance(); + return token; } - token + // Try operators (may be multi-character) + if let Some(token) = self.scan_operator(c) { + self.advance(); + return token; + } + + // String literals + if c == '"' { + let token = match self.read_string() { + Ok(s) => Token::String(s), + Err(e) => { + eprintln!("Lexer error on line {}: {}", self.line, e); + // Skip to next quote or end + while let Some(ch) = self.current { + if ch == '"' || ch == '\n' { + break; + } + self.advance(); + } + Token::String(String::new()) + } + }; + self.advance(); + return token; + } + + // Identifiers and keywords + if c.is_alphabetic() || c == '_' { + let token = self.keyword_or_identifier(); + self.advance(); + return token; + } + + // Numbers (decimal, hex, binary) + if c.is_ascii_digit() { + let token = match self.read_number() { + Ok(num) => Token::Integer(num), + Err(e) => { + eprintln!("Lexer error on line {}: {}", self.line, e); + // Skip invalid number + while let Some(&ch) = self.peek() { + if !ch.is_alphanumeric() { + break; + } + self.advance(); + } + Token::Integer(0) + } + }; + self.advance(); + return token; + } + + // Unknown character - skip it + eprintln!( + "Lexer warning on line {}: Skipping unknown character '{}'", + self.line, c + ); + self.advance(); + self.next_token() } } @@ -318,6 +580,41 @@ mod tests { assert_eq!(lexer.next_token(), Token::Eof); } + #[test] + fn test_hex_numbers() { + let input = "0xFF 0x10 0xDEADBEEF 0x0"; + let mut lexer = Lexer::new(input); + + assert_eq!(lexer.next_token(), Token::Integer(0xFF)); + assert_eq!(lexer.next_token(), Token::Integer(0x10)); + assert_eq!(lexer.next_token(), Token::Integer(0xDEADBEEF)); + assert_eq!(lexer.next_token(), Token::Integer(0x0)); + assert_eq!(lexer.next_token(), Token::Eof); + } + + #[test] + fn test_binary_numbers() { + let input = "0b1010 0b0 0b11111111 0b1"; + let mut lexer = Lexer::new(input); + + assert_eq!(lexer.next_token(), Token::Integer(0b1010)); + assert_eq!(lexer.next_token(), Token::Integer(0b0)); + assert_eq!(lexer.next_token(), Token::Integer(0b11111111)); + assert_eq!(lexer.next_token(), Token::Integer(0b1)); + assert_eq!(lexer.next_token(), Token::Eof); + } + + #[test] + fn test_mixed_number_formats() { + let input = "42 0xFF 0b1010"; + let mut lexer = Lexer::new(input); + + assert_eq!(lexer.next_token(), Token::Integer(42)); + assert_eq!(lexer.next_token(), Token::Integer(255)); + assert_eq!(lexer.next_token(), Token::Integer(10)); + assert_eq!(lexer.next_token(), Token::Eof); + } + #[test] fn test_operators() { let input = "= == ! != < <= > >="; @@ -334,6 +631,19 @@ mod tests { assert_eq!(lexer.next_token(), Token::Eof); } + #[test] + fn test_string_with_escapes() { + let input = r#""hello\nworld" "tab\there""#; + let mut lexer = Lexer::new(input); + + assert_eq!( + lexer.next_token(), + Token::String("hello\nworld".to_string()) + ); + assert_eq!(lexer.next_token(), Token::String("tab\there".to_string())); + assert_eq!(lexer.next_token(), Token::Eof); + } + #[test] fn test_example_syntax() { let input = r#" @@ -349,25 +659,108 @@ mod tests { let mut lexer = Lexer::new(input); - // Skip whitespace and newlines - while let Some(c) = lexer.current { - if !c.is_whitespace() { - break; - } - lexer.advance(); - } - // Test the first few tokens assert_eq!(lexer.next_token(), Token::Identifier("main".to_string())); assert_eq!(lexer.next_token(), Token::Colon); assert_eq!(lexer.next_token(), Token::Identifier("Func".to_string())); assert_eq!(lexer.next_token(), Token::Assign); - // assert_eq!(lexer.next_token(), Token::Pipe); assert_eq!(lexer.next_token(), Token::Identifier("x".to_string())); assert_eq!(lexer.next_token(), Token::Colon); assert_eq!(lexer.next_token(), Token::Identifier("U32".to_string())); assert_eq!(lexer.next_token(), Token::Comma); + } - // The rest of the tokens would be tested similarly + #[test] + fn test_line_comments() { + let input = r#" + let x = 5; // this is a comment + // this is another comment + let y = 10; + "#; + + let mut lexer = Lexer::new(input); + + assert_eq!(lexer.next_token(), Token::Let); + assert_eq!(lexer.next_token(), Token::Identifier("x".to_string())); + assert_eq!(lexer.next_token(), Token::Assign); + assert_eq!(lexer.next_token(), Token::Integer(5)); + assert_eq!(lexer.next_token(), Token::Semicolon); + // Comment should be skipped + assert_eq!(lexer.next_token(), Token::Let); + assert_eq!(lexer.next_token(), Token::Identifier("y".to_string())); + assert_eq!(lexer.next_token(), Token::Assign); + assert_eq!(lexer.next_token(), Token::Integer(10)); + assert_eq!(lexer.next_token(), Token::Semicolon); + assert_eq!(lexer.next_token(), Token::Eof); + } + + #[test] + fn test_block_comments() { + let input = r#" + let x = 5; /* this is a + multiline block comment */ + let y = 10; + "#; + + let mut lexer = Lexer::new(input); + + assert_eq!(lexer.next_token(), Token::Let); + assert_eq!(lexer.next_token(), Token::Identifier("x".to_string())); + assert_eq!(lexer.next_token(), Token::Assign); + assert_eq!(lexer.next_token(), Token::Integer(5)); + assert_eq!(lexer.next_token(), Token::Semicolon); + // Block comment should be skipped + assert_eq!(lexer.next_token(), Token::Let); + assert_eq!(lexer.next_token(), Token::Identifier("y".to_string())); + assert_eq!(lexer.next_token(), Token::Assign); + assert_eq!(lexer.next_token(), Token::Integer(10)); + assert_eq!(lexer.next_token(), Token::Semicolon); + assert_eq!(lexer.next_token(), Token::Eof); + } + + #[test] + fn test_division_operator() { + let input = "x / y"; + let mut lexer = Lexer::new(input); + + assert_eq!(lexer.next_token(), Token::Identifier("x".to_string())); + assert_eq!(lexer.next_token(), Token::Slash); + assert_eq!(lexer.next_token(), Token::Identifier("y".to_string())); + assert_eq!(lexer.next_token(), Token::Eof); + } + + #[test] + fn test_mixed_comments_and_operators() { + let input = r#" + x / y // division + /* block comment */ z = 10 + a /= b // this won't work yet + "#; + + let mut lexer = Lexer::new(input); + + assert_eq!(lexer.next_token(), Token::Identifier("x".to_string())); + assert_eq!(lexer.next_token(), Token::Slash); + assert_eq!(lexer.next_token(), Token::Identifier("y".to_string())); + assert_eq!(lexer.next_token(), Token::Identifier("z".to_string())); + assert_eq!(lexer.next_token(), Token::Assign); + assert_eq!(lexer.next_token(), Token::Integer(10)); + assert_eq!(lexer.next_token(), Token::Identifier("a".to_string())); + assert_eq!(lexer.next_token(), Token::Slash); + assert_eq!(lexer.next_token(), Token::Assign); + assert_eq!(lexer.next_token(), Token::Identifier("b".to_string())); + assert_eq!(lexer.next_token(), Token::Eof); + } + + #[test] + fn test_nested_block_comment_attempt() { + // Note: This lexer doesn't support nested block comments + let input = "/* outer /* inner */ still in comment? */ x"; + let mut lexer = Lexer::new(input); + + // The comment ends at the first */ + assert_eq!(lexer.next_token(), Token::Identifier("still".to_string())); + assert_eq!(lexer.next_token(), Token::Identifier("in".to_string())); + assert_eq!(lexer.next_token(), Token::Identifier("comment".to_string())); } } diff --git a/compiler/src/parser.rs b/compiler/src/parser.rs index 755df29..2da5de2 100644 --- a/compiler/src/parser.rs +++ b/compiler/src/parser.rs @@ -247,6 +247,45 @@ impl Parser { return ParseResult::Accept(Statement::Continue); } + // handle writes to pointers! + if expect_tt!(self.peek_next()?, Star).accepted() { + self.next()?; + + let left = if expect_tt!(self.peek_next()?, Identifier).accepted() { + let identifier = self.parse_identifier()?; + + Expression::Variable { + name: identifier, + expr_type: None, + } + } else if expect_tt!(self.peek_next()?, LeftParen).accepted() { + self.next()?; + + let expr = self.parse_expression()?; + + let _ = expect_tt!(self.next()?, RightParen).accepted(); + + expr + } else { + return ParseResult::Reject(CompilerError::UnexpectedToken( + self.peek_next()?, + )); + }; + + let _ = expect_tt!(self.next()?, Assign)?; + + let right = self.parse_expression()?; + + // expect semicolon + expect_tt!(self.next()?, Semicolon)?; + + // return result + return ParseResult::Accept(Statement::PtrWrite { + ptr: left, + value: right, + }); + } + // handle let statements (declarations) if expect_tt!(self.peek_next()?, Let).accepted() { self.next(); @@ -573,6 +612,10 @@ pub enum Statement { varname: String, value: Expression, }, + PtrWrite { + ptr: Expression, + value: Expression, + }, Expression { expr: Expression, },