Files
damn_simple_architecture/c_compiler/src/lexer.rs
T

336 lines
8.8 KiB
Rust

// ============================================================================
// Token Types
// ============================================================================
/// Kind of a lexed token; identifier and literal variants carry their value.
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
// Keywords
/// The word `int`.
Int,
/// The word `if`.
If,
/// The word `else`.
Else,
/// The word `while`.
While,
/// The word `return`.
Return,
/// The word `include`.
Include,
// Identifiers and literals
/// Any other word made of alphanumeric characters and `_` that is not a keyword.
Identifier(String),
/// Integer literal written as a run of ASCII digits.
Number(i32),
/// Text found between the double quotes of a string literal.
String(String),
/// Character found between the single quotes of a character literal.
Char(char),
// Operators
/// `+`
Plus,
/// `-`
Minus,
/// `*`
Star,
/// `/`
Slash,
/// `=`
Assign,
/// `==`
Eq,
/// `!=`
Ne,
/// `<`
Lt,
/// `>`
Gt,
/// `<=`
Le,
/// `>=`
Ge,
// Delimiters
/// `(`
LParen,
/// `)`
RParen,
/// `{`
LBrace,
/// `}`
RBrace,
/// `;`
Semicolon,
/// `,`
Comma,
/// `:`
Colon,
/// `::`
Namespace,
/// End-of-input marker; the lexer pushes exactly one as the final token.
Eof,
}
/// Names of scalar types.
///
/// Nothing in the lexer refers to this enum; it is only declared here.
/// NOTE(review): presumably consumed by the parser or code generator —
/// confirm the intended width/signedness of each variant against its users.
///
/// The derives make the type printable, comparable, and cheap to pass by
/// value, matching the other public types in this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Type {
    Int32,
    Int16,
    Int8,
    Uint32,
    Uint16,
    Uint8,
    Char,
}
/// A token together with the source position recorded for it.
#[derive(Debug, Clone)]
pub struct Token {
/// What kind of token this is, including any literal payload.
pub token_type: TokenType,
/// Line recorded by the lexer just before reading the token (lines start at 1).
pub line: usize,
/// Column recorded by the lexer just before reading the token (columns start
/// at 1 and advance by one per `char`).
pub col: usize,
}
impl Token {
pub fn new(token_type: TokenType, line: usize, col: usize) -> Self {
Self {
token_type,
line,
col,
}
}
}
// ============================================================================
// Lexer
// ============================================================================
/// Cursor over the source text that produces [`Token`]s.
pub struct Lexer {
// Whole input, split into `char`s so it can be indexed by character.
source: Vec<char>,
// Index into `source` of the next character to be read.
pos: usize,
// Line of the character at `pos`; starts at 1 and grows on every '\n' consumed.
line: usize,
// Column of the character at `pos`; starts at 1 and resets to 1 after a '\n'.
col: usize,
}
impl Lexer {
pub fn new(source: &str) -> Self {
Self {
source: source.chars().collect(),
pos: 0,
line: 1,
col: 1,
}
}
fn error(&self, msg: &str) -> String {
format!(
"Lexer error at line {}, col {}: {}",
self.line, self.col, msg
)
}
fn peek(&self, offset: usize) -> Option<char> {
self.source.get(self.pos + offset).copied()
}
fn advance(&mut self) -> Option<char> {
if self.pos >= self.source.len() {
return None;
}
let ch = self.source[self.pos];
self.pos += 1;
if ch == '\n' {
self.line += 1;
self.col = 1;
} else {
self.col += 1;
}
Some(ch)
}
fn skip_whitespace(&mut self) {
while let Some(ch) = self.peek(0) {
if ch.is_whitespace() {
self.advance();
} else {
break;
}
}
}
fn skip_comment(&mut self) {
if self.peek(0) == Some('/') && self.peek(1) == Some('/') {
while let Some(ch) = self.peek(0) {
if ch == '\n' {
break;
}
self.advance();
}
}
}
fn read_number(&mut self) -> i32 {
let mut num_str = String::new();
while let Some(ch) = self.peek(0) {
if ch.is_ascii_digit() {
num_str.push(ch);
self.advance();
} else {
break;
}
}
num_str.parse().unwrap_or(0)
}
fn read_identifier(&mut self) -> String {
let mut ident = String::new();
while let Some(ch) = self.peek(0) {
if ch.is_alphanumeric() || ch == '_' {
ident.push(ch);
self.advance();
} else {
break;
}
}
ident
}
fn read_string(&mut self) -> Result<String, String> {
let mut string = String::new();
self.advance(); // Consume the opening quote
while let Some(ch) = self.peek(0) {
if ch == '"' {
self.advance(); // Consume the closing quote
return Ok(string);
} else if ch == '\\' {
self.advance(); // Consume the backslash
if let Some(escaped_char) = self.peek(0) {
string.push(escaped_char);
self.advance();
}
} else {
string.push(ch);
self.advance();
}
}
Err(String::from("Unexpected EOF"))
}
fn read_char(&mut self) -> Result<char, String> {
self.advance(); // Consume the opening quote
if let Some(ch) = self.peek(0) {
self.advance();
if self.peek(0) == Some('\'') {
self.advance();
return Ok(ch);
} else {
Err(String::from("expected closing quote"))
}
} else {
Err(String::from("expected character"))
}
}
pub fn tokenize(&mut self) -> Result<Vec<Token>, String> {
let mut tokens = Vec::new();
loop {
self.skip_whitespace();
self.skip_comment();
if self.pos >= self.source.len() {
break;
}
let line = self.line;
let col = self.col;
let ch = self.peek(0).unwrap();
let token_type = if ch.is_ascii_digit() {
let num = self.read_number();
TokenType::Number(num)
} else if ch == '"' {
let string = self.read_string()?;
TokenType::String(string)
} else if ch == '\'' {
let char = self.read_char()?;
TokenType::Char(char)
} else if ch.is_alphabetic() || ch == '_' {
let ident = self.read_identifier();
match ident.as_str() {
"int" => TokenType::Int,
"if" => TokenType::If,
"else" => TokenType::Else,
"while" => TokenType::While,
"return" => TokenType::Return,
"include" => TokenType::Include,
_ => TokenType::Identifier(ident),
}
} else {
match ch {
':' if self.peek(1) == Some(':') => {
self.advance();
self.advance();
TokenType::Namespace
}
':' => {
self.advance();
TokenType::Colon
}
'=' if self.peek(1) == Some('=') => {
self.advance();
self.advance();
TokenType::Eq
}
'!' if self.peek(1) == Some('=') => {
self.advance();
self.advance();
TokenType::Ne
}
'<' if self.peek(1) == Some('=') => {
self.advance();
self.advance();
TokenType::Le
}
'>' if self.peek(1) == Some('=') => {
self.advance();
self.advance();
TokenType::Ge
}
'+' => {
self.advance();
TokenType::Plus
}
'-' => {
self.advance();
TokenType::Minus
}
'*' => {
self.advance();
TokenType::Star
}
'/' => {
self.advance();
TokenType::Slash
}
'=' => {
self.advance();
TokenType::Assign
}
'<' => {
self.advance();
TokenType::Lt
}
'>' => {
self.advance();
TokenType::Gt
}
'(' => {
self.advance();
TokenType::LParen
}
')' => {
self.advance();
TokenType::RParen
}
'{' => {
self.advance();
TokenType::LBrace
}
'}' => {
self.advance();
TokenType::RBrace
}
';' => {
self.advance();
TokenType::Semicolon
}
',' => {
self.advance();
TokenType::Comma
}
_ => return Err(self.error(&format!("Unexpected character: {}", ch))),
}
};
tokens.push(Token::new(token_type, line, col));
}
tokens.push(Token::new(TokenType::Eof, self.line, self.col));
Ok(tokens)
}
}