Review notes (text before the first `diff --git` header is ignored by `git apply`).

* Reflowed to one patch line per source line and restored the generic parameters
  that had been lost (`Arc<Module>`, `Vec<Token>`, `parse::<u32>()`,
  `Result<String, AssembleError>`).
* `tokenise_line` (previously `tokenize_line`; renamed to match
  `Tokeniser::tokenise`) now advances the span start for every token. Before, every
  token on a line was reported at `line_span.start_char`.
* The last tokeniser.rs hunk header was recomputed for the amended content; the
  post-image hash on that file's `index` line predates the amendment.
* NOTE(review): the end-of-line `Newline` token uses a line-relative span while the
  other tokens are offset by `line_span.start_char` - confirm which is intended.
* NOTE(review): the `Eof` token is created with line number 0 - confirm.

diff --git a/assembler/src/model/module.rs b/assembler/src/model/module.rs
index d8ec547..3b45c89 100644
--- a/assembler/src/model/module.rs
+++ b/assembler/src/model/module.rs
@@ -4,7 +4,10 @@
 //!
 //! They have unique identifiers in the form of UUIDs.
 
-use std::path::{Path, PathBuf};
+use std::{
+    path::{Path, PathBuf},
+    sync::Arc,
+};
 
 use uuid::Uuid;
 
@@ -22,7 +25,7 @@ impl ModuleId {
 
     /// Convenience method to get the [`Module`] from a [`ModuleId`].
     #[must_use]
-    pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Module> {
+    pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Arc<Module>> {
         registry.get(self)
     }
 
diff --git a/assembler/src/source/token.rs b/assembler/src/source/token.rs
index 360ac64..4314128 100644
--- a/assembler/src/source/token.rs
+++ b/assembler/src/source/token.rs
@@ -23,8 +23,10 @@ pub enum TokenType {
     Instruction(InstructionToken),
     /// Label definition (e.g., `loop_start:`).
     Label(LabelToken),
-    /// Assembler directive (e.g., `.global`, `.section`, `.dw`).
+    /// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
     Directive(DirectiveToken),
+    /// Comma separator.
+    Comma,
     /// End of line.
     Newline,
     /// End of file.
diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs
index b996c88..b68c0d8 100644
--- a/assembler/src/source/tokeniser.rs
+++ b/assembler/src/source/tokeniser.rs
@@ -17,6 +17,9 @@ use crate::{
         load_source_bytes,
         source_info::SourceInfo,
         token::{Token, TokenType},
+        token_info::{
+            DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken,
+        },
     },
 };
 
@@ -76,7 +79,6 @@ impl Tokeniser {
     // Note that modules are tokenised in their own threads, possibly in parallel.
     pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
         let module_name = self.extract_module_name()?;
-        let _file_path = self.path.to_string_lossy().to_string();
 
         // Create a module for the source file being processed.
         let module = Arc::new(Module::new(module_name, &self.path));
@@ -89,15 +91,12 @@ impl Tokeniser {
         let mut token_stream = Vec::new();
         let lines = lines_with_spans(&self.data);
 
-        // Technically ignores newlines since line will be trimmed. We just append a
-        // Newline token for each line.
+        // Process each line
         for line_result in lines {
             let line_span = line_result?;
-
-            // Skip empty lines and comments
             let trimmed = line_span.content.trim();
 
-            // Add newline token on blank lines.
+            // Skip empty lines and add newline tokens
             if trimmed.is_empty() {
                 token_stream.push(Token::new(
                     TokenType::Newline,
@@ -106,17 +105,209 @@ impl Tokeniser {
                 continue;
             }
 
-            eprintln!("{}", line_span.line_number);
+            // Tokenise the line content.
+            let line_tokens = self.tokenise_line(&line_span, &module)?;
+            token_stream.extend(line_tokens);
+
+            // Add newline token at end of line
+            token_stream.push(Token::new(
+                TokenType::Newline,
+                SourceInfo::new(
+                    line_span.line_number,
+                    module.clone(),
+                    line_span.content.len()..line_span.content.len(),
+                ),
+            ));
         }
 
+        // Add EOF token
+        token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0)));
+
         Ok(token_stream)
     }
 
+    /// Tokenises a single non-blank source line.
+    ///
+    /// Each token's span starts at `line_span.start_char` plus the token's byte
+    /// offset within the line, so leading indentation and the whitespace between
+    /// tokens are reflected in the reported position.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from [`Self::match_token`] when the remaining input
+    /// does not start with a recognised token.
+    fn tokenise_line(
+        &self,
+        line_span: &crate::source::lines::LineSpan,
+        module: &Arc<Module>,
+    ) -> Result<Vec<Token>, AssembleError> {
+        let mut tokens = Vec::new();
+        // Only trailing whitespace is dropped here, so `content.len()` minus the
+        // length of what is left to scan is the offset of the next token.
+        let content = line_span.content.trim_end();
+        let mut remaining = content.trim_start();
+
+        while !remaining.is_empty() {
+            let start = line_span.start_char + (content.len() - remaining.len());
+            let (token_type, consumed) = self.match_token(remaining)?;
+
+            tokens.push(Token::new(
+                token_type,
+                SourceInfo::new(
+                    line_span.line_number,
+                    module.clone(),
+                    start..start + consumed,
+                ),
+            ));
+
+            // Skip the token and any whitespace separating it from the next one.
+            remaining = remaining[consumed..].trim_start();
+        }
+
+        Ok(tokens)
+    }
+
+    /// Matches `label_regex` against `input`; capture group 1 is the label name.
+    fn try_match_label(&self, input: &str) -> Option<(TokenType, usize)> {
+        let caps = self.label_regex.captures(input)?;
+        let name = caps.get(1)?.as_str().to_string();
+        let len = caps.get(0)?.len();
+
+        Some((TokenType::Label(LabelToken { name }), len))
+    }
+
+    /// Matches `register_regex` against `input`; capture group 1 is the register name.
+    fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
+        let caps = self.register_regex.captures(input)?;
+        let name = caps.get(1)?.as_str().to_string();
+        let len = caps.get(0)?.len();
+
+        Some((TokenType::Register(RegisterToken { name }), len))
+    }
+
+    /// Matches `immediate_regex` against `input` and parses capture group 1 as a
+    /// `u32`, honouring the `0x` (hex), `0b` (binary) and `0o` (octal) prefixes.
+    ///
+    /// Returns `None` when the regex does not match or the digits fail to parse as
+    /// a `u32` (e.g. on overflow); `match_token` then tries the remaining matchers.
+    fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> {
+        let caps = self.immediate_regex.captures(input)?;
+        let value_str = caps.get(1)?.as_str();
+        let len = caps.get(0)?.len();
+
+        let value = if let Some(hex_part) = value_str.strip_prefix("0x") {
+            u32::from_str_radix(hex_part, 16).ok()?
+        } else if let Some(bin_part) = value_str.strip_prefix("0b") {
+            u32::from_str_radix(bin_part, 2).ok()?
+        } else if let Some(oct_part) = value_str.strip_prefix("0o") {
+            u32::from_str_radix(oct_part, 8).ok()?
+        } else {
+            value_str.parse::<u32>().ok()?
+        };
+
+        Some((TokenType::Immediate(value), len))
+    }
+
+    /// Matches `directive_regex` against `input`; capture group 1 is the directive.
+    fn try_match_directive(&self, input: &str) -> Option<(TokenType, usize)> {
+        let caps = self.directive_regex.captures(input)?;
+        let directive = caps.get(1)?.as_str().to_string();
+        let len = caps.get(0)?.len();
+
+        Some((TokenType::Directive(DirectiveToken { directive }), len))
+    }
+
+    /// Matches `instruction_regex` against `input`; capture group 1 is the mnemonic.
+    fn try_match_instruction(&self, input: &str) -> Option<(TokenType, usize)> {
+        let caps = self.instruction_regex.captures(input)?;
+        let mnemonic = caps.get(1)?.as_str().to_string();
+        let len = caps.get(0)?.len();
+
+        Some((TokenType::Instruction(InstructionToken { mnemonic }), len))
+    }
+
+    /// Matches `symbol_regex` against `input`; capture group 1 is the symbol name.
+    fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
+        let caps = self.symbol_regex.captures(input)?;
+        let name = caps.get(1)?.as_str().to_string();
+        let len = caps.get(0)?.len();
+
+        Some((TokenType::Symbol(SymbolToken { name }), len))
+    }
+
+    /// Matches `string_regex` against `input`; capture group 1 is the string content.
+    fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
+        let caps = self.string_regex.captures(input)?;
+        let content = caps.get(1)?.as_str().to_string();
+        let len = caps.get(0)?.len();
+
+        Some((TokenType::String(content), len))
+    }
+
+    /// Matches the next token at the start of `input`.
+    ///
+    /// Matchers run in a fixed order (label, register, immediate, directive,
+    /// instruction, string, symbol) and the first success wins; a leading `,` becomes
+    /// [`TokenType::Comma`]. Returns the token and the number of bytes it spans.
+    ///
+    /// NOTE(review): assumes every regex is anchored at the start of `input`, since
+    /// the caller slices the input by the returned length - confirm.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `InvalidData` error when nothing matches.
+    fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
+        if let Some(m) = self.try_match_label(input) {
+            return Ok(m);
+        }
+
+        if let Some(m) = self.try_match_register(input) {
+            return Ok(m);
+        }
+
+        if let Some(m) = self.try_match_immediate(input) {
+            return Ok(m);
+        }
+
+        if let Some(m) = self.try_match_directive(input) {
+            return Ok(m);
+        }
+
+        if let Some(m) = self.try_match_instruction(input) {
+            return Ok(m);
+        }
+
+        if let Some(m) = self.try_match_string(input) {
+            return Ok(m);
+        }
+
+        if let Some(m) = self.try_match_symbol(input) {
+            return Ok(m);
+        }
+
+        // Handle miscellaneous characters.
+        match input.chars().next() {
+            Some(',') => Ok((TokenType::Comma, 1)),
+            Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
+                IoError::new(
+                    IoErrorKind::InvalidData,
+                    Some(format!("Unexpected character: '{c}'")),
+                ),
+            ))),
+            None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
+                IoError::new(
+                    IoErrorKind::InvalidData,
+                    Some("Unexpected end of input".to_string()),
+                ),
+            ))),
+        }
+    }
+
     fn extract_module_name(&self) -> Result<String, AssembleError> {
         let module_name = self
             .path
             .file_name()
-            .and_then(|f| Some(f.to_string_lossy().to_string()))
+            .map(|f| f.to_string_lossy().to_string())
             .ok_or_else(|| {
                 AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
                     IoErrorKind::InvalidData,