assembler: wrap Modules in Arc and update Tokeniser (still WIP)

Implements a complete tokeniser with Arc-wrapped modules

Enhances module handling by wrapping Module instances in Arc for thread-safe sharing across the assembler pipeline.

Implements full tokenization logic with pattern matching for all token types including labels, registers, immediates, directives, instructions, symbols, and strings.

Adds comma token support and proper EOF handling to complete the lexical analysis phase.

Generated AI slop commit message, may not be super accurate or it may be a bit too serious lol.
This commit is contained in:
2025-06-25 17:35:03 +01:00
parent 27267e3daa
commit 9b9e153500
3 changed files with 172 additions and 11 deletions
+5 -2
View File
@@ -4,7 +4,10 @@
//!
//! They have unique identifiers in the form of UUIDs.
use std::path::{Path, PathBuf};
use std::{
path::{Path, PathBuf},
sync::Arc,
};
use uuid::Uuid;
@@ -22,7 +25,7 @@ impl ModuleId {
/// Convenience method to get the [`Module`] from a [`ModuleId`].
#[must_use]
pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Module> {
pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Arc<Module>> {
registry.get(self)
}
+3 -1
View File
@@ -23,8 +23,10 @@ pub enum TokenType {
Instruction(InstructionToken),
/// Label definition (e.g., `loop_start:`).
Label(LabelToken),
/// Assembler directive (e.g., `.global`, `.section`, `.dw`).
/// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
Directive(DirectiveToken),
/// Comma separator.
Comma,
/// End of line.
Newline,
/// End of file.
+164 -8
View File
@@ -17,6 +17,9 @@ use crate::{
load_source_bytes,
source_info::SourceInfo,
token::{Token, TokenType},
token_info::{
DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken,
},
},
};
@@ -76,7 +79,6 @@ impl Tokeniser {
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
let module_name = self.extract_module_name()?;
let _file_path = self.path.to_string_lossy().to_string();
// Create a module for the source file being processed.
let module = Arc::new(Module::new(module_name, &self.path));
@@ -89,15 +91,12 @@ impl Tokeniser {
let mut token_stream = Vec::new();
let lines = lines_with_spans(&self.data);
// Technically ignores newlines since line will be trimmed. We just append a
// Newline token for each line.
// Process each line
for line_result in lines {
let line_span = line_result?;
// Skip empty lines and comments
let trimmed = line_span.content.trim();
// Add newline token on blank lines.
// Skip empty lines and add newline tokens
if trimmed.is_empty() {
token_stream.push(Token::new(
TokenType::Newline,
@@ -106,17 +105,174 @@ impl Tokeniser {
continue;
}
eprintln!("{}", line_span.line_number);
// Actually tokenize the line content
let line_tokens = self.tokenize_line(&line_span, &module)?;
token_stream.extend(line_tokens);
// Add newline token at end of line
token_stream.push(Token::new(
TokenType::Newline,
SourceInfo::new(
line_span.line_number,
module.clone(),
line_span.content.len()..line_span.content.len(),
),
));
}
// Add EOF token
token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0)));
Ok(token_stream)
}
/// Tokenises the content of a single source line.
///
/// Leading and trailing whitespace is ignored and whitespace between tokens
/// is skipped. Every token is produced by `match_token` and tagged with a
/// [`SourceInfo`] holding the line number, the owning module and the token's
/// span.
///
/// Each span starts at `line_span.start_char` plus the token's byte offset
/// within `line_span.content`, so successive tokens on a line receive
/// distinct, increasing spans instead of all starting at the same position.
///
/// NOTE(review): this assumes `start_char` is the position of the first
/// character of the untrimmed `content` — confirm against `LineSpan`. Offsets
/// are counted in bytes (as the regex match lengths are), which differs from
/// a character count for non-ASCII input.
///
/// # Errors
///
/// Propagates the error returned by `match_token` when nothing matches the
/// remaining input.
fn tokenize_line(
    &self,
    line_span: &crate::source::lines::LineSpan,
    module: &Arc<Module>,
) -> Result<Vec<Token>, AssembleError> {
    // Keep the end-trimmed line around so offsets can be derived from it.
    let content = line_span.content.trim_end();
    let mut tokens = Vec::new();
    let mut remaining = content.trim_start();

    while !remaining.is_empty() {
        // `remaining` is always a suffix of `content`, so the length
        // difference is the offset of the next token within the line.
        let offset = content.len() - remaining.len();
        let start = line_span.start_char + offset;

        // Try to match a token at the current position.
        let (token_type, consumed) = self.match_token(remaining)?;

        tokens.push(Token::new(
            token_type,
            SourceInfo::new(
                line_span.line_number,
                Arc::clone(module),
                start..start + consumed,
            ),
        ));

        // Advance past the token and any whitespace that follows it.
        remaining = remaining[consumed..].trim_start();
    }

    Ok(tokens)
}
/// Tries to recognise a label in `input` using `label_regex`.
///
/// On success returns a [`TokenType::Label`] whose name is capture group 1,
/// paired with the length of the whole match. Returns `None` when the regex
/// does not match or group 1 did not participate in the match.
fn try_match_label(&self, input: &str) -> Option<(TokenType, usize)> {
    self.label_regex.captures(input).and_then(|captures| {
        let label_name = captures.get(1)?.as_str().to_owned();
        let consumed = captures.get(0)?.len();
        Some((TokenType::Label(LabelToken { name: label_name }), consumed))
    })
}
/// Tries to recognise a register in `input` using `register_regex`.
///
/// On success returns a [`TokenType::Register`] named after capture group 1,
/// together with the length of the whole match; otherwise `None`.
fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
    let captures = self.register_regex.captures(input)?;
    let whole = captures.get(0)?;
    let register = RegisterToken {
        name: String::from(captures.get(1)?.as_str()),
    };
    Some((TokenType::Register(register), whole.len()))
}
/// Tries to recognise an immediate value in `input` using `immediate_regex`.
///
/// Capture group 1 is parsed as a `u32`. The prefixes `0x`, `0b` and `0o`
/// select hexadecimal, binary and octal respectively; anything else is read
/// as decimal.
///
/// Returns the [`TokenType::Immediate`] and the length of the whole match, or
/// `None` when the regex does not match or the digits do not parse as a
/// `u32` (for example on overflow).
fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> {
    let captures = self.immediate_regex.captures(input)?;
    let literal = captures.get(1)?.as_str();
    let consumed = captures.get(0)?.len();

    // Pick the digits and the radix from the literal's prefix, if any.
    let (digits, radix) = if let Some(hex) = literal.strip_prefix("0x") {
        (hex, 16)
    } else if let Some(bin) = literal.strip_prefix("0b") {
        (bin, 2)
    } else if let Some(oct) = literal.strip_prefix("0o") {
        (oct, 8)
    } else {
        (literal, 10)
    };

    let value = u32::from_str_radix(digits, radix).ok()?;
    Some((TokenType::Immediate(value), consumed))
}
/// Tries to recognise an assembler directive in `input` using
/// `directive_regex`.
///
/// On success returns a [`TokenType::Directive`] built from capture group 1,
/// together with the length of the whole match; otherwise `None`.
fn try_match_directive(&self, input: &str) -> Option<(TokenType, usize)> {
    self.directive_regex.captures(input).and_then(|captures| {
        let text = captures.get(1)?.as_str();
        let consumed = captures.get(0)?.len();
        let token = DirectiveToken {
            directive: text.to_owned(),
        };
        Some((TokenType::Directive(token), consumed))
    })
}
/// Tries to recognise an instruction mnemonic in `input` using
/// `instruction_regex`.
///
/// On success returns a [`TokenType::Instruction`] whose mnemonic is capture
/// group 1, together with the length of the whole match; otherwise `None`.
fn try_match_instruction(&self, input: &str) -> Option<(TokenType, usize)> {
    let captures = self.instruction_regex.captures(input)?;
    let whole = captures.get(0)?;
    let opcode = captures.get(1)?.as_str();
    let token = InstructionToken {
        mnemonic: opcode.to_owned(),
    };
    Some((TokenType::Instruction(token), whole.len()))
}
/// Tries to recognise a symbol reference in `input` using `symbol_regex`.
///
/// On success returns a [`TokenType::Symbol`] named after capture group 1,
/// together with the length of the whole match; otherwise `None`.
fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
    self.symbol_regex.captures(input).and_then(|captures| {
        let symbol = SymbolToken {
            name: String::from(captures.get(1)?.as_str()),
        };
        let consumed = captures.get(0)?.len();
        Some((TokenType::Symbol(symbol), consumed))
    })
}
/// Tries to recognise a string literal in `input` using `string_regex`.
///
/// On success returns a [`TokenType::String`] holding capture group 1 as
/// written in the source, together with the length of the whole match;
/// otherwise `None`.
fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
    let captures = self.string_regex.captures(input)?;
    let whole = captures.get(0)?;
    let body = captures.get(1)?.as_str();
    Some((TokenType::String(body.to_owned()), whole.len()))
}
/// Matches a single token at the start of `input`.
///
/// The matchers are tried in a fixed order — label, register, immediate,
/// directive, instruction, string, symbol — and the first one that succeeds
/// wins. If none of them match, a leading `,` is accepted as
/// [`TokenType::Comma`] with a length of 1.
///
/// # Errors
///
/// Returns an `InvalidData` I/O-kind [`AssembleError`] when `input` starts
/// with a character that no matcher accepts, or when `input` is empty.
fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
    // `or_else` keeps evaluation lazy: later matchers only run when every
    // earlier one has returned `None`.
    let matched = self
        .try_match_label(input)
        .or_else(|| self.try_match_register(input))
        .or_else(|| self.try_match_immediate(input))
        .or_else(|| self.try_match_directive(input))
        .or_else(|| self.try_match_instruction(input))
        .or_else(|| self.try_match_string(input))
        .or_else(|| self.try_match_symbol(input));

    if let Some(hit) = matched {
        return Ok(hit);
    }

    // Handle miscellaneous characters; anything but a comma is an error.
    let message = match input.chars().next() {
        Some(',') => return Ok((TokenType::Comma, 1)),
        Some(c) => format!("Unexpected character: '{c}'"),
        None => "Unexpected end of input".to_string(),
    };

    Err(AssembleError::new_other_error(AssembleErrorKind::Io(
        IoError::new(IoErrorKind::InvalidData, Some(message)),
    )))
}
fn extract_module_name(&self) -> Result<String, AssembleError> {
let module_name = self
.path
.file_name()
.and_then(|f| Some(f.to_string_lossy().to_string()))
.map(|f| f.to_string_lossy().to_string())
.ok_or_else(|| {
AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
IoErrorKind::InvalidData,