diff --git a/compiler/src/frontend/c/lexer.rs b/compiler/src/frontend/c/lexer.rs new file mode 100644 index 0000000..1cc5df4 --- /dev/null +++ b/compiler/src/frontend/c/lexer.rs @@ -0,0 +1,336 @@ +// ============================================================================ +// Token Types +// ============================================================================ + +#[derive(Debug, Clone, PartialEq)] +pub enum TokenType { + // Keywords + Int, + If, + Else, + While, + Return, + Include, + + // Identifiers and literals + Identifier(String), + Number(i32), + String(String), + Char(char), + + // Operators + Plus, + Minus, + Star, + Slash, + Assign, + Eq, + Ne, + Lt, + Gt, + Le, + Ge, + + // Delimiters + LParen, + RParen, + LBrace, + RBrace, + Semicolon, + Comma, + Colon, + Namespace, + + Eof, +} + +#[allow(unused)] +pub enum Type { + Int32, + Int16, + Int8, + Uint32, + Uint16, + Uint8, + Char, +} + +#[derive(Debug, Clone)] +pub struct Token { + pub token_type: TokenType, + pub line: usize, + pub col: usize, +} + +impl Token { + pub fn new(token_type: TokenType, line: usize, col: usize) -> Self { + Self { + token_type, + line, + col, + } + } +} + +// ============================================================================ +// Lexer +// ============================================================================ + +pub struct Lexer { + source: Vec, + pos: usize, + line: usize, + col: usize, +} + +impl Lexer { + pub fn new(source: &str) -> Self { + Self { + source: source.chars().collect(), + pos: 0, + line: 1, + col: 1, + } + } + + fn error(&self, msg: &str) -> String { + format!( + "Lexer error at line {}, col {}: {}", + self.line, self.col, msg + ) + } + + fn peek(&self, offset: usize) -> Option { + self.source.get(self.pos + offset).copied() + } + + fn advance(&mut self) -> Option { + if self.pos >= self.source.len() { + return None; + } + let ch = self.source[self.pos]; + self.pos += 1; + if ch == '\n' { + self.line += 1; + self.col = 1; + } else { + self.col += 1; + } + Some(ch) + } + + fn skip_whitespace(&mut self) { + while let Some(ch) = self.peek(0) { + if ch.is_whitespace() { + self.advance(); + } else { + break; + } + } + } + + fn skip_comment(&mut self) { + if self.peek(0) == Some('/') && self.peek(1) == Some('/') { + while let Some(ch) = self.peek(0) { + if ch == '\n' { + break; + } + self.advance(); + } + } + } + + fn read_number(&mut self) -> i32 { + let mut num_str = String::new(); + while let Some(ch) = self.peek(0) { + if ch.is_ascii_digit() { + num_str.push(ch); + self.advance(); + } else { + break; + } + } + num_str.parse().unwrap_or(0) + } + + fn read_identifier(&mut self) -> String { + let mut ident = String::new(); + while let Some(ch) = self.peek(0) { + if ch.is_alphanumeric() || ch == '_' { + ident.push(ch); + self.advance(); + } else { + break; + } + } + ident + } + + fn read_string(&mut self) -> Result { + let mut string = String::new(); + self.advance(); // Consume the opening quote + + while let Some(ch) = self.peek(0) { + if ch == '"' { + self.advance(); // Consume the closing quote + return Ok(string); + } else if ch == '\\' { + self.advance(); // Consume the backslash + if let Some(escaped_char) = self.peek(0) { + string.push(escaped_char); + self.advance(); + } + } else { + string.push(ch); + self.advance(); + } + } + + Err(String::from("Unexpected EOF")) + } + + fn read_char(&mut self) -> Result { + self.advance(); // Consume the opening quote + + if let Some(ch) = self.peek(0) { + self.advance(); + if self.peek(0) == Some('\'') { + self.advance(); + return Ok(ch); + } else { + Err(String::from("expected closing quote")) + } + } else { + Err(String::from("expected character")) + } + } + + pub fn tokenize(&mut self) -> Result, String> { + let mut tokens = Vec::new(); + + loop { + self.skip_whitespace(); + self.skip_comment(); + + if self.pos >= self.source.len() { + break; + } + + let line = self.line; + let col = self.col; + let ch = self.peek(0).unwrap(); + + let token_type = if ch.is_ascii_digit() { + let num = self.read_number(); + TokenType::Number(num) + } else if ch == '"' { + let string = self.read_string()?; + TokenType::String(string) + } else if ch == '\'' { + let char = self.read_char()?; + TokenType::Char(char) + } else if ch.is_alphabetic() || ch == '_' { + let ident = self.read_identifier(); + match ident.as_str() { + "int" => TokenType::Int, + "if" => TokenType::If, + "else" => TokenType::Else, + "while" => TokenType::While, + "return" => TokenType::Return, + "include" => TokenType::Include, + _ => TokenType::Identifier(ident), + } + } else { + match ch { + ':' if self.peek(1) == Some(':') => { + self.advance(); + self.advance(); + TokenType::Namespace + } + ':' => { + self.advance(); + TokenType::Colon + } + '=' if self.peek(1) == Some('=') => { + self.advance(); + self.advance(); + TokenType::Eq + } + '!' if self.peek(1) == Some('=') => { + self.advance(); + self.advance(); + TokenType::Ne + } + '<' if self.peek(1) == Some('=') => { + self.advance(); + self.advance(); + TokenType::Le + } + '>' if self.peek(1) == Some('=') => { + self.advance(); + self.advance(); + TokenType::Ge + } + '+' => { + self.advance(); + TokenType::Plus + } + '-' => { + self.advance(); + TokenType::Minus + } + '*' => { + self.advance(); + TokenType::Star + } + '/' => { + self.advance(); + TokenType::Slash + } + '=' => { + self.advance(); + TokenType::Assign + } + '<' => { + self.advance(); + TokenType::Lt + } + '>' => { + self.advance(); + TokenType::Gt + } + '(' => { + self.advance(); + TokenType::LParen + } + ')' => { + self.advance(); + TokenType::RParen + } + '{' => { + self.advance(); + TokenType::LBrace + } + '}' => { + self.advance(); + TokenType::RBrace + } + ';' => { + self.advance(); + TokenType::Semicolon + } + ',' => { + self.advance(); + TokenType::Comma + } + _ => return Err(self.error(&format!("Unexpected character: {}", ch))), + } + }; + + tokens.push(Token::new(token_type, line, col)); + } + + tokens.push(Token::new(TokenType::Eof, self.line, self.col)); + Ok(tokens) + } +} diff --git a/compiler/src/frontend/c/mod.rs b/compiler/src/frontend/c/mod.rs new file mode 100644 index 0000000..d2e0055 --- /dev/null +++ b/compiler/src/frontend/c/mod.rs @@ -0,0 +1,25 @@ +use common::logging::log; + +use crate::model::{CompilerError, Program}; +use parser::Parser; + +pub mod lexer; +pub mod parser; + +pub fn generate_ast(input: &str) -> Result { + log("Tokenising Input..."); + + let mut lexer = lexer::Lexer::new(&input); + let tokens = lexer.tokenize().map_err(|e| CompilerError::Generic(e))?; + // println!("{tokens:?}"); + + log(&format!("Parsing {} Tokens...", tokens.len())); + + let mut parser = Parser::new(tokens); + let ast = match parser.parse() { + Ok(ast) => ast, + Err(e) => return Err(CompilerError::Generic(e)), + }; + + Ok(ast) +} diff --git a/compiler/src/frontend/c/parser.rs b/compiler/src/frontend/c/parser.rs new file mode 100644 index 0000000..9d4c2bf --- /dev/null +++ b/compiler/src/frontend/c/parser.rs @@ -0,0 +1,471 @@ +// ============================================================================ +// AST Node Types +// ============================================================================ + +use crate::model::{ + BinaryOperator, Block, ConstExpr, Declaration, Dependency, Expression, Name, Program, + Statement, TypeId, UnaryOperator, Variable, +}; + +use super::lexer::{Token, TokenType}; + +// ============================================================================ +// Parser +// ============================================================================ + +pub struct Parser { + tokens: Vec, + pos: usize, +} + +impl Parser { + pub fn new(tokens: Vec) -> Self { + Self { tokens, pos: 0 } + } + + fn error(&self, msg: &str) -> String { + let token = self.current(); + format!( + "Parser error at line {}, col {}: {}", + token.line, token.col, msg + ) + } + + fn current(&self) -> &Token { + self.tokens + .get(self.pos) + .unwrap_or_else(|| self.tokens.last().unwrap()) + } + + fn peek(&self, offset: usize) -> &Token { + self.tokens + .get(self.pos + offset) + .unwrap_or_else(|| self.tokens.last().unwrap()) + } + + fn advance(&mut self) -> &Token { + if self.pos < self.tokens.len() - 1 { + self.pos += 1; + } + self.current() + } + + fn expect(&mut self, expected: TokenType) -> Result { + let token = self.current().clone(); + if std::mem::discriminant(&token.token_type) != std::mem::discriminant(&expected) + { + return Err(self.error(&format!( + "Expected {:?}, got {:?}", + expected, token.token_type + ))); + } + self.advance(); + Ok(token) + } + + pub fn parse(&mut self) -> Result { + let mut declarations = Vec::new(); + + while !matches!(self.current().token_type, TokenType::Eof) { + declarations.push(self.parse_declaration()?); + } + + Ok(Program { declarations }) + } + + fn parse_declaration(&mut self) -> Result { + // check for an import + if let TokenType::Include = self.current().token_type { + self.advance(); + + let name = + if let TokenType::Identifier(id) = self.current().clone().token_type { + Some(id) + } else { + None + } + .ok_or(String::from("Expected identifier"))?; + + self.advance(); + self.expect(TokenType::Colon)?; + + let path = if let TokenType::String(id) = self.current().clone().token_type { + Some(id) + } else { + None + } + .ok_or(String::from("Expected string literal"))?; + + self.advance(); + return Ok(Declaration::Dependency(Dependency { name, path })); + } + + self.expect(TokenType::Int)?; + + let name = match &self.current().token_type { + TokenType::Identifier(s) => s.clone(), + _ => return Err(self.error("Expected identifier")), + }; + self.advance(); + + match &self.current().token_type { + TokenType::LParen => { + // Function declaration + self.advance(); + let mut params = Vec::::new(); + + if !matches!(self.current().token_type, TokenType::RParen) { + self.expect(TokenType::Int)?; + + match &self.current().token_type { + TokenType::Identifier(s) => { + params.push(Variable { + name: s.clone(), + type_id: TypeId::U32, + }); + self.advance(); + } + _ => return Err(self.error("Expected parameter name")), + } + + while matches!(self.current().token_type, TokenType::Comma) { + self.advance(); + self.expect(TokenType::Int)?; + + match &self.current().token_type { + TokenType::Identifier(s) => { + params.push(Variable { + name: s.clone(), + type_id: TypeId::U32, + }); + self.advance(); + } + _ => return Err(self.error("Expected parameter name")), + } + } + } + + self.expect(TokenType::RParen)?; + let body = self.parse_block()?; + + Ok(Declaration::Function { + name, + params, + body, + return_type: TypeId::U32, + }) + } + _ => { + // Variable declaration + let init = if matches!(self.current().token_type, TokenType::Assign) { + self.advance(); + + if let TokenType::Number(n) = self.current().token_type { + self.advance(); + Some(ConstExpr::Number(n)) + } else { + return Err(self + .error("Expected constant in global variable declaration")); + } + } else { + None + }; + + self.expect(TokenType::Semicolon)?; + Ok(Declaration::Variable { + var: Variable { + name, + type_id: TypeId::U32, + }, + init, + is_const: false, + }) + } + } + } + + fn parse_block(&mut self) -> Result { + self.expect(TokenType::LBrace)?; + let mut statements = Vec::new(); + + while !matches!(self.current().token_type, TokenType::RBrace) { + statements.push(self.parse_statement()?); + } + + self.expect(TokenType::RBrace)?; + Ok(statements) + } + + fn parse_statement(&mut self) -> Result { + match &self.current().token_type { + TokenType::LBrace => Ok(Statement::Block(self.parse_block()?)), + TokenType::If => self.parse_if_stmt(), + TokenType::While => self.parse_while_stmt(), + TokenType::Return => self.parse_return_stmt(), + TokenType::Identifier(name) => { + let name = name.clone(); + + // peek ahead for open paren (func call expr) + if matches!(self.peek(1).token_type, TokenType::LParen) { + let expr = self.parse_expression()?; // a function call expr + self.expect(TokenType::Semicolon)?; + return Ok(Statement::Expression { expr }); + } + + self.advance(); // advance past identifier + + // assignment expression + if matches!(self.current().token_type, TokenType::Assign) { + self.advance(); + let expr = self.parse_expression()?; + + self.expect(TokenType::Semicolon)?; + Ok(Statement::Assign { + varname: name, + value: expr, + }) + } + // var expression + else { + self.expect(TokenType::Semicolon)?; + Ok(Statement::Expression { + expr: Expression::Variable { + name: Name { + name, + namespace: None, + }, + expr_type: None, + }, + }) + } + } + TokenType::Int => { + // Local variable declaration + self.advance(); + let name = match &self.current().token_type { + TokenType::Identifier(s) => s.clone(), + _ => return Err(self.error("Expected variable name")), + }; + self.advance(); + + let init = if matches!(self.current().token_type, TokenType::Assign) { + self.advance(); + Some(self.parse_expression()?) + } else { + None + }; + + self.expect(TokenType::Semicolon)?; + + // Convert to assignment expression statement + let expr = if let Some(init_expr) = init { + Statement::Assign { + varname: name, + value: init_expr, + } + } else { + Statement::Assign { + varname: name, + value: Expression::Empty, + } + }; + + Ok(expr) + } + _ => { + let expr = if matches!(self.current().token_type, TokenType::Semicolon) { + Expression::Empty + } else { + self.parse_expression()? + }; + + self.expect(TokenType::Semicolon)?; + Ok(Statement::Expression { expr }) + } + } + } + + fn parse_if_stmt(&mut self) -> Result { + self.expect(TokenType::If)?; + self.expect(TokenType::LParen)?; + let condition = self.parse_expression()?; + self.expect(TokenType::RParen)?; + let then_stmt = self.parse_block()?; + + let else_stmt = if matches!(self.current().token_type, TokenType::Else) { + self.advance(); + self.parse_block()? + } else { + Vec::new() + }; + + Ok(Statement::If { + condition, + then_stmt, + else_stmt, + }) + } + + fn parse_while_stmt(&mut self) -> Result { + self.expect(TokenType::While)?; + self.expect(TokenType::LParen)?; + let condition = self.parse_expression()?; + self.expect(TokenType::RParen)?; + let body = self.parse_block()?; + + Ok(Statement::While { condition, body }) + } + + fn parse_return_stmt(&mut self) -> Result { + self.expect(TokenType::Return)?; + + let expr = if matches!(self.current().token_type, TokenType::Semicolon) { + None + } else { + Some(self.parse_expression()?) + }; + + self.expect(TokenType::Semicolon)?; + Ok(Statement::Return(expr)) + } + + fn parse_expression(&mut self) -> Result { + self.parse_comparison() + } + + fn parse_comparison(&mut self) -> Result { + let mut expr = self.parse_additive()?; + + while let Some(op) = match &self.current().token_type { + TokenType::Eq => Some(BinaryOperator::Eq), + TokenType::Ne => Some(BinaryOperator::Ne), + TokenType::Lt => Some(BinaryOperator::Lt), + TokenType::Gt => Some(BinaryOperator::Gt), + TokenType::Le => Some(BinaryOperator::Le), + TokenType::Ge => Some(BinaryOperator::Ge), + _ => None, + } { + self.advance(); + let right = Box::new(self.parse_additive()?); + expr = Expression::Binary { + op, + left: Box::new(expr), + right, + }; + } + + Ok(expr) + } + + fn parse_additive(&mut self) -> Result { + let mut expr = self.parse_multiplicative()?; + + while let Some(op) = match &self.current().token_type { + TokenType::Plus => Some(BinaryOperator::Add), + TokenType::Minus => Some(BinaryOperator::Sub), + _ => None, + } { + self.advance(); + let right = Box::new(self.parse_multiplicative()?); + expr = Expression::Binary { + op, + left: Box::new(expr), + right, + }; + } + + Ok(expr) + } + + fn parse_multiplicative(&mut self) -> Result { + let mut expr = self.parse_unary()?; + + while let Some(op) = match &self.current().token_type { + TokenType::Star => Some(BinaryOperator::Mul), + TokenType::Slash => Some(BinaryOperator::Div), + _ => None, + } { + self.advance(); + let right = Box::new(self.parse_unary()?); + expr = Expression::Binary { + op, + left: Box::new(expr), + right, + }; + } + + Ok(expr) + } + + fn parse_unary(&mut self) -> Result { + let op = match &self.current().token_type { + TokenType::Plus => Some(UnaryOperator::Plus), + TokenType::Minus => Some(UnaryOperator::Minus), + _ => None, + }; + + if let Some(op) = op { + self.advance(); + let operand = Box::new(self.parse_unary()?); + return Ok(Expression::Unary { op, operand }); + } + + self.parse_primary() + } + + fn parse_primary(&mut self) -> Result { + match &self.current().token_type.clone() { + TokenType::Number(n) => { + let value = *n; + self.advance(); + Ok(Expression::Number(value as isize)) + } + TokenType::Identifier(name) => { + let name = name.clone(); + self.advance(); + + if matches!(self.current().token_type, TokenType::LParen) { + // Function call + self.advance(); + let mut args = Vec::new(); + + if !matches!(self.current().token_type, TokenType::RParen) { + args.push(self.parse_expression()?); + + while matches!(self.current().token_type, TokenType::Comma) { + self.advance(); + args.push(self.parse_expression()?); + } + } + + self.expect(TokenType::RParen)?; + Ok(Expression::Call { + name: Name { + name, + namespace: None, + }, + args, + }) + } else { + Ok(Expression::Variable { + name: Name { + name, + namespace: None, + }, + expr_type: None, + }) + } + } + TokenType::LParen => { + self.advance(); + let expr = self.parse_expression()?; + self.expect(TokenType::RParen)?; + Ok(expr) + } + _ => Err(self.error(&format!( + "Unexpected token: {:?}", + self.current().token_type + ))), + } + } +}