tokeniser: refactor to store Module directly in Tokeniser

This avoids making extra copies of the `PathBuf`.

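- Also updated tests to match the new API: `Tokeniser::from_data` now takes an `Arc<Module>` instead of a `PathBuf`, `Tokeniser::new` now takes the `AssemblerContext`, and `tokenise()` no longer does.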
This commit is contained in:
2025-06-28 23:13:44 +01:00
parent a65dca6c5c
commit d15e00c272
2 changed files with 54 additions and 69 deletions
+47 -63
View File
@@ -1,11 +1,7 @@
//! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and //! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
//! outputs a [`Vec<Token>`]. //! outputs a [`Vec<Token>`].
use std::{ use std::{path::Path, str::FromStr, sync::Arc};
path::{Path, PathBuf},
str::FromStr,
sync::Arc,
};
use regex::Regex; use regex::Regex;
@@ -33,8 +29,8 @@ mod tests;
pub struct Tokeniser { pub struct Tokeniser {
/// The data in the file. /// The data in the file.
pub data: Vec<u8>, pub data: Vec<u8>,
/// The path to the file. /// A copy of the Module in which the file is situated.
pub path: PathBuf, pub module: Arc<Module>,
// Pre-compiled regex patterns // Pre-compiled regex patterns
label_regex: Regex, label_regex: Regex,
@@ -54,11 +50,10 @@ pub struct Tokeniser {
impl Tokeniser { impl Tokeniser {
#[must_use] #[must_use]
pub fn from_data(data: Vec<u8>, path: PathBuf) -> Self { pub fn from_data(data: Vec<u8>, module: Arc<Module>) -> Self {
Self { Self {
data, data,
path, module,
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):") label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
.expect("Failed to compile label regex pattern"), .expect("Failed to compile label regex pattern"),
register_regex: Regex::new( register_regex: Regex::new(
@@ -88,31 +83,29 @@ impl Tokeniser {
} }
} }
/// Creates a [`Tokeniser`] from a file path. /// Creates a [`Tokeniser`] from a file path. Also creates the underlying [`Module`]
pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, AssembleError> { /// for you.
pub fn new<P: AsRef<Path>>(
path: P,
ctx: &AssemblerContext,
) -> Result<Self, AssembleError> {
let path = path.as_ref().to_path_buf(); let path = path.as_ref().to_path_buf();
let data = load_source_bytes(&path)?; let data = load_source_bytes(&path)?;
let module = Arc::new(Module::new(path)?);
Ok(Self::from_data(data, path))
}
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(
mut self,
ctx: &AssemblerContext,
) -> Result<Vec<Token>, AssembleError> {
let module_name = self.extract_module_name()?;
// Create a module for the source file being processed.
let module = Arc::new(Module::new(module_name, &self.path));
{ {
let mut module_registry = ctx.module_registry.write()?; let mut module_registry = ctx.module_registry.write()?;
module_registry.add(module.clone()); module_registry.add(module.clone());
} }
Ok(Self::from_data(data, module))
}
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(mut self) -> Result<Vec<Token>, AssembleError> {
let mut token_stream = Vec::new(); let mut token_stream = Vec::new();
let lines = lines_with_spans(&self.data); let data = self.data.clone();
let lines = lines_with_spans(&data);
// Process each line // Process each line
for line_result in lines { for line_result in lines {
@@ -123,13 +116,13 @@ impl Tokeniser {
if trimmed.is_empty() { if trimmed.is_empty() {
token_stream.push(Token::new( token_stream.push(Token::new(
TokenType::Newline, TokenType::Newline,
SourceInfo::new(line_span.line_number, module.clone(), 0..1), SourceInfo::new(line_span.line_number, self.module.clone(), 0..1),
)); ));
continue; continue;
} }
// Actually tokenise the line content // Actually tokenise the line content
let line_tokens = self.tokenise_line(&line_span, &module)?; let line_tokens = self.tokenise_line(&line_span)?;
token_stream.extend(line_tokens); token_stream.extend(line_tokens);
// Add newline token at end of line // Add newline token at end of line
@@ -137,22 +130,24 @@ impl Tokeniser {
TokenType::Newline, TokenType::Newline,
SourceInfo::new( SourceInfo::new(
line_span.line_number, line_span.line_number,
module.clone(), self.module.clone(),
line_span.content.len()..line_span.content.len(), line_span.content.len()..line_span.content.len(),
), ),
)); ));
} }
// Add EOF token // Add EOF token
token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0))); token_stream.push(Token::new(
TokenType::Eof,
SourceInfo::new(0, self.module.clone(), 0..0),
));
Ok(token_stream) Ok(token_stream)
} }
fn tokenise_line( fn tokenise_line(
&mut self, // Changed to &mut self &mut self,
line_span: &LineSpan, line_span: &LineSpan,
module: &Arc<Module>,
) -> Result<Vec<Token>, AssembleError> { ) -> Result<Vec<Token>, AssembleError> {
let mut tokens = Vec::new(); let mut tokens = Vec::new();
let mut remaining = line_span.content.as_str(); let mut remaining = line_span.content.as_str();
@@ -168,7 +163,7 @@ impl Tokeniser {
// Try to match a token // Try to match a token
let (token_type, consumed) = let (token_type, consumed) =
self.match_token(&remaining, line_span.line_number, column)?; self.match_token(remaining, line_span.line_number, column)?;
// Filter out string continuation tokens and comments // Filter out string continuation tokens and comments
match token_type { match token_type {
@@ -184,7 +179,7 @@ impl Tokeniser {
token_type, token_type,
SourceInfo::new( SourceInfo::new(
line_span.line_number, line_span.line_number,
module.clone(), self.module.clone(),
start_column..start_column + consumed, start_column..start_column + consumed,
), ),
)); ));
@@ -292,7 +287,7 @@ impl Tokeniser {
) -> Option<(TokenType, usize)> { ) -> Option<(TokenType, usize)> {
if self.in_string { if self.in_string {
// We're continuing a multiline string // We're continuing a multiline string
self.handle_string_continuation(input, line_number, column) Some(self.handle_string_continuation(input, line_number, column))
} else { } else {
// Look for the start of a new string // Look for the start of a new string
self.handle_string_start(input, line_number, column) self.handle_string_start(input, line_number, column)
@@ -333,7 +328,7 @@ impl Tokeniser {
input: &str, input: &str,
_line_number: usize, _line_number: usize,
_column: usize, _column: usize,
) -> Option<(TokenType, usize)> { ) -> (TokenType, usize) {
// Look for closing quote // Look for closing quote
if let Some(end_pos) = input.find('"') { if let Some(end_pos) = input.find('"') {
// End of multiline string found // End of multiline string found
@@ -343,17 +338,18 @@ impl Tokeniser {
let content = std::mem::take(&mut self.string_buffer); let content = std::mem::take(&mut self.string_buffer);
let len = end_pos + 1; // +1 for the closing quote let len = end_pos + 1; // +1 for the closing quote
Some((TokenType::String(content), len)) (TokenType::String(content), len)
} else { } else {
// Continue multiline string // Continue multiline string
self.string_buffer.push_str(input); self.string_buffer.push_str(input);
self.string_buffer.push('\n'); // Add newline self.string_buffer.push('\n'); // Add newline
// Consume the entire line // Consume the entire line
Some((TokenType::StringContinuation, input.len())) (TokenType::StringContinuation, input.len())
} }
} }
#[expect(clippy::range_plus_one, reason = "RangeInclusive is a different type!")]
fn match_token( fn match_token(
&mut self, &mut self,
input: &str, input: &str,
@@ -394,14 +390,19 @@ impl Tokeniser {
} }
// Handle miscellaneous characters. // Handle miscellaneous characters.
match input.chars().next() { match input.chars().enumerate().next() {
Some(',') => Ok((TokenType::Comma, 1)), Some((_, ',')) => Ok((TokenType::Comma, 1)),
Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io( Some((idx, c)) => {
IoError::new( let source =
IoErrorKind::InvalidData, SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
Some(format!("Unexpected character: '{c}'")),
), Err(AssembleError::new_source_error(
))), source,
AssembleErrorKind::Tokeniser(error::TokeniserError::UnexpectedChar(
c,
)),
))
}
None => Err(AssembleError::new_other_error(AssembleErrorKind::Io( None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
IoError::new( IoError::new(
IoErrorKind::InvalidData, IoErrorKind::InvalidData,
@@ -410,21 +411,4 @@ impl Tokeniser {
))), ))),
} }
} }
fn extract_module_name(&self) -> Result<String, AssembleError> {
let module_name = self
.path
.file_name()
.map(|f| f.to_string_lossy().to_string())
.ok_or_else(|| {
AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
IoErrorKind::InvalidData,
Some(
"filename couldn't be extracted, is it valid UTF-8?".to_string(),
),
)))
})?;
Ok(module_name)
}
} }
+7 -6
View File
@@ -3,7 +3,7 @@
use common::prelude::Register; use common::prelude::Register;
use crate::{ use crate::{
context::AssemblerContext, model::module::Module,
source::{ source::{
opcode::Opcode, opcode::Opcode,
token::{Token, TokenType}, token::{Token, TokenType},
@@ -11,20 +11,21 @@ use crate::{
tokeniser::Tokeniser, tokeniser::Tokeniser,
}, },
}; };
use std::path::PathBuf; use std::{path::PathBuf, sync::Arc};
/// Helper function to create a tokenizer from source text /// Helper function to create a tokenizer from source text
fn create_tokenizer_from_source(source: &str) -> Tokeniser { fn create_tokenizer_from_source(source: &str) -> Tokeniser {
let data = source.as_bytes().to_vec();
let path = PathBuf::from("test.dsa"); let path = PathBuf::from("test.dsa");
Tokeniser::from_data(data, path) let module = Module::new(path).expect("Cannot create module!");
Tokeniser::from_data(source.as_bytes().to_vec(), Arc::new(module))
} }
/// Helper function to tokenize source and return tokens /// Helper function to tokenize source and return tokens
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> { fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
let tokenizer = create_tokenizer_from_source(source); let tokenizer = create_tokenizer_from_source(source);
let context = AssemblerContext::new();
tokenizer.tokenise(&context) tokenizer.tokenise()
} }
/// Helper function to extract token types from a token vector /// Helper function to extract token types from a token vector