From 9b9e153500fc8e51a9efe9ed0a43c3ed95e65a1b Mon Sep 17 00:00:00 2001 From: "J. Hinchliffe" Date: Wed, 25 Jun 2025 17:35:03 +0100 Subject: [PATCH] assembler: wrap Modules in Arc and update Tokeniser (still WIP) Implements complete tokenizer with Arc-wrapped modules Enhances module handling by wrapping Module instances in Arc for thread-safe sharing across the assembler pipeline. Implements full tokenization logic with pattern matching for all token types including labels, registers, immediates, directives, instructions, symbols, and strings. Adds comma token support and proper EOF handling to complete the lexical analysis phase. Generated AI slop commit message; it may not be super accurate, or it may be a bit too serious lol. --- assembler/src/model/module.rs | 7 +- assembler/src/source/token.rs | 4 +- assembler/src/source/tokeniser.rs | 172 ++++++++++++++++++++++++++++-- 3 files changed, 172 insertions(+), 11 deletions(-) diff --git a/assembler/src/model/module.rs b/assembler/src/model/module.rs index d8ec547..3b45c89 100644 --- a/assembler/src/model/module.rs +++ b/assembler/src/model/module.rs @@ -4,7 +4,10 @@ //! //! They have unique identifiers in the form of UUIDs. -use std::path::{Path, PathBuf}; +use std::{ + path::{Path, PathBuf}, + sync::Arc, +}; use uuid::Uuid; @@ -22,7 +25,7 @@ impl ModuleId { /// Convenience method to get the [`Module`] from a [`ModuleId`]. #[must_use] - pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Module> { + pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Arc> { registry.get(self) } diff --git a/assembler/src/source/token.rs b/assembler/src/source/token.rs index 360ac64..4314128 100644 --- a/assembler/src/source/token.rs +++ b/assembler/src/source/token.rs @@ -23,8 +23,10 @@ pub enum TokenType { Instruction(InstructionToken), /// Label definition (e.g., `loop_start:`). Label(LabelToken), - /// Assembler directive (e.g., `.global`, `.section`, `.dw`). 
+ /// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`). Directive(DirectiveToken), + /// Comma separator. + Comma, /// End of line. Newline, /// End of file. diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs index b996c88..b68c0d8 100644 --- a/assembler/src/source/tokeniser.rs +++ b/assembler/src/source/tokeniser.rs @@ -17,6 +17,9 @@ use crate::{ load_source_bytes, source_info::SourceInfo, token::{Token, TokenType}, + token_info::{ + DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken, + }, }, }; @@ -76,7 +79,6 @@ impl Tokeniser { // Note that modules are tokenised in their own threads, possibly in parallel. pub fn tokenise(self, ctx: &AssemblerContext) -> Result, AssembleError> { let module_name = self.extract_module_name()?; - let _file_path = self.path.to_string_lossy().to_string(); // Create a module for the source file being processed. let module = Arc::new(Module::new(module_name, &self.path)); @@ -89,15 +91,12 @@ impl Tokeniser { let mut token_stream = Vec::new(); let lines = lines_with_spans(&self.data); - // Technically ignores newlines since line will be trimmed. We just append a - // Newline token for each line. + // Process each line for line_result in lines { let line_span = line_result?; - - // Skip empty lines and comments let trimmed = line_span.content.trim(); - // Add newline token on blank lines. 
+ // Skip empty lines and add newline tokens if trimmed.is_empty() { token_stream.push(Token::new( TokenType::Newline, @@ -106,17 +105,174 @@ impl Tokeniser { continue; } - eprintln!("{}", line_span.line_number); + // Actually tokenize the line content + let line_tokens = self.tokenize_line(&line_span, &module)?; + token_stream.extend(line_tokens); + + // Add newline token at end of line + token_stream.push(Token::new( + TokenType::Newline, + SourceInfo::new( + line_span.line_number, + module.clone(), + line_span.content.len()..line_span.content.len(), + ), + )); } + // Add EOF token + token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0))); + Ok(token_stream) } + fn tokenize_line( + &self, + line_span: &crate::source::lines::LineSpan, + module: &Arc, + ) -> Result, AssembleError> { + let mut tokens = Vec::new(); + let mut remaining = line_span.content.trim(); + let start_column = line_span.start_char; + + while !remaining.is_empty() { + // Try to match a token. + let (token_type, consumed) = self.match_token(remaining)?; + + tokens.push(Token::new( + token_type, + SourceInfo::new( + line_span.line_number, + module.clone(), + start_column..start_column + consumed, + ), + )); + + // Advance position. 
+ remaining = remaining[consumed..].trim_start(); + } + + Ok(tokens) + } + + fn try_match_label(&self, input: &str) -> Option<(TokenType, usize)> { + let caps = self.label_regex.captures(input)?; + let name = caps.get(1)?.as_str().to_string(); + let len = caps.get(0)?.len(); + + Some((TokenType::Label(LabelToken { name }), len)) + } + + fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> { + let caps = self.register_regex.captures(input)?; + let name = caps.get(1)?.as_str().to_string(); + let len = caps.get(0)?.len(); + + Some((TokenType::Register(RegisterToken { name }), len)) + } + + fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> { + let caps = self.immediate_regex.captures(input)?; + let value_str = caps.get(1)?.as_str(); + let len = caps.get(0)?.len(); + + let value = if let Some(hex_part) = value_str.strip_prefix("0x") { + u32::from_str_radix(hex_part, 16).ok()? + } else if let Some(bin_part) = value_str.strip_prefix("0b") { + u32::from_str_radix(bin_part, 2).ok()? + } else if let Some(oct_part) = value_str.strip_prefix("0o") { + u32::from_str_radix(oct_part, 8).ok()? + } else { + value_str.parse::().ok()? 
+ }; + + Some((TokenType::Immediate(value), len)) + } + + fn try_match_directive(&self, input: &str) -> Option<(TokenType, usize)> { + let caps = self.directive_regex.captures(input)?; + let directive = caps.get(1)?.as_str().to_string(); + let len = caps.get(0)?.len(); + + Some((TokenType::Directive(DirectiveToken { directive }), len)) + } + + fn try_match_instruction(&self, input: &str) -> Option<(TokenType, usize)> { + let caps = self.instruction_regex.captures(input)?; + let mnemonic = caps.get(1)?.as_str().to_string(); + let len = caps.get(0)?.len(); + + Some((TokenType::Instruction(InstructionToken { mnemonic }), len)) + } + + fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> { + let caps = self.symbol_regex.captures(input)?; + let name = caps.get(1)?.as_str().to_string(); + let len = caps.get(0)?.len(); + + Some((TokenType::Symbol(SymbolToken { name }), len)) + } + + fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> { + let caps = self.string_regex.captures(input)?; + let content = caps.get(1)?.as_str().to_string(); + let len = caps.get(0)?.len(); + + Some((TokenType::String(content), len)) + } + + fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> { + if let Some(m) = self.try_match_label(input) { + return Ok(m); + } + + if let Some(m) = self.try_match_register(input) { + return Ok(m); + } + + if let Some(m) = self.try_match_immediate(input) { + return Ok(m); + } + + if let Some(m) = self.try_match_directive(input) { + return Ok(m); + } + + if let Some(m) = self.try_match_instruction(input) { + return Ok(m); + } + + if let Some(m) = self.try_match_string(input) { + return Ok(m); + } + + if let Some(m) = self.try_match_symbol(input) { + return Ok(m); + } + + // Handle miscellaneous characters. 
+ match input.chars().next() { + Some(',') => Ok((TokenType::Comma, 1)), + Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io( + IoError::new( + IoErrorKind::InvalidData, + Some(format!("Unexpected character: '{c}'")), + ), + ))), + None => Err(AssembleError::new_other_error(AssembleErrorKind::Io( + IoError::new( + IoErrorKind::InvalidData, + Some("Unexpected end of input".to_string()), + ), + ))), + } + } + fn extract_module_name(&self) -> Result { let module_name = self .path .file_name() - .and_then(|f| Some(f.to_string_lossy().to_string())) + .map(|f| f.to_string_lossy().to_string()) .ok_or_else(|| { AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new( IoErrorKind::InvalidData,