tokeniser: refactor to store Module directly in Tokeniser

Storing the shared `Arc<Module>` in the `Tokeniser` avoids making extra copies of the `PathBuf`.

- Also updated tests to match the new API
This commit is contained in:
2025-06-28 23:13:44 +01:00
parent a65dca6c5c
commit d15e00c272
2 changed files with 54 additions and 69 deletions
+47 -63
View File
@@ -1,11 +1,7 @@
//! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
//! outputs a [`Vec<Token>`].
use std::{
path::{Path, PathBuf},
str::FromStr,
sync::Arc,
};
use std::{path::Path, str::FromStr, sync::Arc};
use regex::Regex;
@@ -33,8 +29,8 @@ mod tests;
pub struct Tokeniser {
/// The data in the file.
pub data: Vec<u8>,
/// The path to the file.
pub path: PathBuf,
/// A copy of the Module in which the file is situated.
pub module: Arc<Module>,
// Pre-compiled regex patterns
label_regex: Regex,
@@ -54,11 +50,10 @@ pub struct Tokeniser {
impl Tokeniser {
#[must_use]
pub fn from_data(data: Vec<u8>, path: PathBuf) -> Self {
pub fn from_data(data: Vec<u8>, module: Arc<Module>) -> Self {
Self {
data,
path,
module,
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
.expect("Failed to compile label regex pattern"),
register_regex: Regex::new(
@@ -88,31 +83,29 @@ impl Tokeniser {
}
}
/// Creates a [`Tokeniser`] from a file path.
pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, AssembleError> {
/// Creates a [`Tokeniser`] from a file path. Also creates the underlying [`Module`]
/// for you.
pub fn new<P: AsRef<Path>>(
path: P,
ctx: &AssemblerContext,
) -> Result<Self, AssembleError> {
let path = path.as_ref().to_path_buf();
let data = load_source_bytes(&path)?;
Ok(Self::from_data(data, path))
}
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(
mut self,
ctx: &AssemblerContext,
) -> Result<Vec<Token>, AssembleError> {
let module_name = self.extract_module_name()?;
// Create a module for the source file being processed.
let module = Arc::new(Module::new(module_name, &self.path));
let module = Arc::new(Module::new(path)?);
{
let mut module_registry = ctx.module_registry.write()?;
module_registry.add(module.clone());
}
Ok(Self::from_data(data, module))
}
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(mut self) -> Result<Vec<Token>, AssembleError> {
let mut token_stream = Vec::new();
let lines = lines_with_spans(&self.data);
let data = self.data.clone();
let lines = lines_with_spans(&data);
// Process each line
for line_result in lines {
@@ -123,13 +116,13 @@ impl Tokeniser {
if trimmed.is_empty() {
token_stream.push(Token::new(
TokenType::Newline,
SourceInfo::new(line_span.line_number, module.clone(), 0..1),
SourceInfo::new(line_span.line_number, self.module.clone(), 0..1),
));
continue;
}
// Actually tokenise the line content
let line_tokens = self.tokenise_line(&line_span, &module)?;
let line_tokens = self.tokenise_line(&line_span)?;
token_stream.extend(line_tokens);
// Add newline token at end of line
@@ -137,22 +130,24 @@ impl Tokeniser {
TokenType::Newline,
SourceInfo::new(
line_span.line_number,
module.clone(),
self.module.clone(),
line_span.content.len()..line_span.content.len(),
),
));
}
// Add EOF token
token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0)));
token_stream.push(Token::new(
TokenType::Eof,
SourceInfo::new(0, self.module.clone(), 0..0),
));
Ok(token_stream)
}
fn tokenise_line(
&mut self, // Changed to &mut self
&mut self,
line_span: &LineSpan,
module: &Arc<Module>,
) -> Result<Vec<Token>, AssembleError> {
let mut tokens = Vec::new();
let mut remaining = line_span.content.as_str();
@@ -168,7 +163,7 @@ impl Tokeniser {
// Try to match a token
let (token_type, consumed) =
self.match_token(&remaining, line_span.line_number, column)?;
self.match_token(remaining, line_span.line_number, column)?;
// Filter out string continuation tokens and comments
match token_type {
@@ -184,7 +179,7 @@ impl Tokeniser {
token_type,
SourceInfo::new(
line_span.line_number,
module.clone(),
self.module.clone(),
start_column..start_column + consumed,
),
));
@@ -292,7 +287,7 @@ impl Tokeniser {
) -> Option<(TokenType, usize)> {
if self.in_string {
// We're continuing a multiline string
self.handle_string_continuation(input, line_number, column)
Some(self.handle_string_continuation(input, line_number, column))
} else {
// Look for the start of a new string
self.handle_string_start(input, line_number, column)
@@ -333,7 +328,7 @@ impl Tokeniser {
input: &str,
_line_number: usize,
_column: usize,
) -> Option<(TokenType, usize)> {
) -> (TokenType, usize) {
// Look for closing quote
if let Some(end_pos) = input.find('"') {
// End of multiline string found
@@ -343,17 +338,18 @@ impl Tokeniser {
let content = std::mem::take(&mut self.string_buffer);
let len = end_pos + 1; // +1 for the closing quote
Some((TokenType::String(content), len))
(TokenType::String(content), len)
} else {
// Continue multiline string
self.string_buffer.push_str(input);
self.string_buffer.push('\n'); // Add newline
// Consume the entire line
Some((TokenType::StringContinuation, input.len()))
(TokenType::StringContinuation, input.len())
}
}
#[expect(clippy::range_plus_one, reason = "RangeInclusive is a different type!")]
fn match_token(
&mut self,
input: &str,
@@ -394,14 +390,19 @@ impl Tokeniser {
}
// Handle miscellaneous characters.
match input.chars().next() {
Some(',') => Ok((TokenType::Comma, 1)),
Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
IoError::new(
IoErrorKind::InvalidData,
Some(format!("Unexpected character: '{c}'")),
),
))),
match input.chars().enumerate().next() {
Some((_, ',')) => Ok((TokenType::Comma, 1)),
Some((idx, c)) => {
let source =
SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
Err(AssembleError::new_source_error(
source,
AssembleErrorKind::Tokeniser(error::TokeniserError::UnexpectedChar(
c,
)),
))
}
None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
IoError::new(
IoErrorKind::InvalidData,
@@ -410,21 +411,4 @@ impl Tokeniser {
))),
}
}
fn extract_module_name(&self) -> Result<String, AssembleError> {
let module_name = self
.path
.file_name()
.map(|f| f.to_string_lossy().to_string())
.ok_or_else(|| {
AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
IoErrorKind::InvalidData,
Some(
"filename couldn't be extracted, is it valid UTF-8?".to_string(),
),
)))
})?;
Ok(module_name)
}
}
+7 -6
View File
@@ -3,7 +3,7 @@
use common::prelude::Register;
use crate::{
context::AssemblerContext,
model::module::Module,
source::{
opcode::Opcode,
token::{Token, TokenType},
@@ -11,20 +11,21 @@ use crate::{
tokeniser::Tokeniser,
},
};
use std::path::PathBuf;
use std::{path::PathBuf, sync::Arc};
/// Helper function to create a tokenizer from source text
fn create_tokenizer_from_source(source: &str) -> Tokeniser {
let data = source.as_bytes().to_vec();
let path = PathBuf::from("test.dsa");
Tokeniser::from_data(data, path)
let module = Module::new(path).expect("Cannot create module!");
Tokeniser::from_data(source.as_bytes().to_vec(), Arc::new(module))
}
/// Helper function to tokenize source and return tokens
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
let tokenizer = create_tokenizer_from_source(source);
let context = AssemblerContext::new();
tokenizer.tokenise(&context)
tokenizer.tokenise()
}
/// Helper function to extract token types from a token vector