tokeniser: refactor to store Module directly in Tokeniser
This avoids making extra copies of the PathBuf. Also updates the tests to match the new API.
This commit is contained in:
@@ -1,11 +1,7 @@
|
||||
//! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
|
||||
//! outputs a [`Vec<Token>`].
|
||||
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
};
|
||||
use std::{path::Path, str::FromStr, sync::Arc};
|
||||
|
||||
use regex::Regex;
|
||||
|
||||
@@ -33,8 +29,8 @@ mod tests;
|
||||
pub struct Tokeniser {
|
||||
/// The data in the file.
|
||||
pub data: Vec<u8>,
|
||||
/// The path to the file.
|
||||
pub path: PathBuf,
|
||||
/// A copy of the Module in which the file is situated.
|
||||
pub module: Arc<Module>,
|
||||
|
||||
// Pre-compiled regex patterns
|
||||
label_regex: Regex,
|
||||
@@ -54,11 +50,10 @@ pub struct Tokeniser {
|
||||
|
||||
impl Tokeniser {
|
||||
#[must_use]
|
||||
pub fn from_data(data: Vec<u8>, path: PathBuf) -> Self {
|
||||
pub fn from_data(data: Vec<u8>, module: Arc<Module>) -> Self {
|
||||
Self {
|
||||
data,
|
||||
path,
|
||||
|
||||
module,
|
||||
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
|
||||
.expect("Failed to compile label regex pattern"),
|
||||
register_regex: Regex::new(
|
||||
@@ -88,31 +83,29 @@ impl Tokeniser {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a [`Tokeniser`] from a file path.
|
||||
pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, AssembleError> {
|
||||
/// Creates a [`Tokeniser`] from a file path. Also creates the underlying [`Module`]
|
||||
/// for you.
|
||||
pub fn new<P: AsRef<Path>>(
|
||||
path: P,
|
||||
ctx: &AssemblerContext,
|
||||
) -> Result<Self, AssembleError> {
|
||||
let path = path.as_ref().to_path_buf();
|
||||
let data = load_source_bytes(&path)?;
|
||||
|
||||
Ok(Self::from_data(data, path))
|
||||
}
|
||||
|
||||
// Note that modules are tokenised in their own threads, possibly in parallel.
|
||||
pub fn tokenise(
|
||||
mut self,
|
||||
ctx: &AssemblerContext,
|
||||
) -> Result<Vec<Token>, AssembleError> {
|
||||
let module_name = self.extract_module_name()?;
|
||||
|
||||
// Create a module for the source file being processed.
|
||||
let module = Arc::new(Module::new(module_name, &self.path));
|
||||
let module = Arc::new(Module::new(path)?);
|
||||
|
||||
{
|
||||
let mut module_registry = ctx.module_registry.write()?;
|
||||
module_registry.add(module.clone());
|
||||
}
|
||||
|
||||
Ok(Self::from_data(data, module))
|
||||
}
|
||||
|
||||
// Note that modules are tokenised in their own threads, possibly in parallel.
|
||||
pub fn tokenise(mut self) -> Result<Vec<Token>, AssembleError> {
|
||||
let mut token_stream = Vec::new();
|
||||
let lines = lines_with_spans(&self.data);
|
||||
let data = self.data.clone();
|
||||
let lines = lines_with_spans(&data);
|
||||
|
||||
// Process each line
|
||||
for line_result in lines {
|
||||
@@ -123,13 +116,13 @@ impl Tokeniser {
|
||||
if trimmed.is_empty() {
|
||||
token_stream.push(Token::new(
|
||||
TokenType::Newline,
|
||||
SourceInfo::new(line_span.line_number, module.clone(), 0..1),
|
||||
SourceInfo::new(line_span.line_number, self.module.clone(), 0..1),
|
||||
));
|
||||
continue;
|
||||
}
|
||||
|
||||
// Actually tokenise the line content
|
||||
let line_tokens = self.tokenise_line(&line_span, &module)?;
|
||||
let line_tokens = self.tokenise_line(&line_span)?;
|
||||
token_stream.extend(line_tokens);
|
||||
|
||||
// Add newline token at end of line
|
||||
@@ -137,22 +130,24 @@ impl Tokeniser {
|
||||
TokenType::Newline,
|
||||
SourceInfo::new(
|
||||
line_span.line_number,
|
||||
module.clone(),
|
||||
self.module.clone(),
|
||||
line_span.content.len()..line_span.content.len(),
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
// Add EOF token
|
||||
token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0)));
|
||||
token_stream.push(Token::new(
|
||||
TokenType::Eof,
|
||||
SourceInfo::new(0, self.module.clone(), 0..0),
|
||||
));
|
||||
|
||||
Ok(token_stream)
|
||||
}
|
||||
|
||||
fn tokenise_line(
|
||||
&mut self, // Changed to &mut self
|
||||
&mut self,
|
||||
line_span: &LineSpan,
|
||||
module: &Arc<Module>,
|
||||
) -> Result<Vec<Token>, AssembleError> {
|
||||
let mut tokens = Vec::new();
|
||||
let mut remaining = line_span.content.as_str();
|
||||
@@ -168,7 +163,7 @@ impl Tokeniser {
|
||||
|
||||
// Try to match a token
|
||||
let (token_type, consumed) =
|
||||
self.match_token(&remaining, line_span.line_number, column)?;
|
||||
self.match_token(remaining, line_span.line_number, column)?;
|
||||
|
||||
// Filter out string continuation tokens and comments
|
||||
match token_type {
|
||||
@@ -184,7 +179,7 @@ impl Tokeniser {
|
||||
token_type,
|
||||
SourceInfo::new(
|
||||
line_span.line_number,
|
||||
module.clone(),
|
||||
self.module.clone(),
|
||||
start_column..start_column + consumed,
|
||||
),
|
||||
));
|
||||
@@ -292,7 +287,7 @@ impl Tokeniser {
|
||||
) -> Option<(TokenType, usize)> {
|
||||
if self.in_string {
|
||||
// We're continuing a multiline string
|
||||
self.handle_string_continuation(input, line_number, column)
|
||||
Some(self.handle_string_continuation(input, line_number, column))
|
||||
} else {
|
||||
// Look for the start of a new string
|
||||
self.handle_string_start(input, line_number, column)
|
||||
@@ -333,7 +328,7 @@ impl Tokeniser {
|
||||
input: &str,
|
||||
_line_number: usize,
|
||||
_column: usize,
|
||||
) -> Option<(TokenType, usize)> {
|
||||
) -> (TokenType, usize) {
|
||||
// Look for closing quote
|
||||
if let Some(end_pos) = input.find('"') {
|
||||
// End of multiline string found
|
||||
@@ -343,17 +338,18 @@ impl Tokeniser {
|
||||
let content = std::mem::take(&mut self.string_buffer);
|
||||
let len = end_pos + 1; // +1 for the closing quote
|
||||
|
||||
Some((TokenType::String(content), len))
|
||||
(TokenType::String(content), len)
|
||||
} else {
|
||||
// Continue multiline string
|
||||
self.string_buffer.push_str(input);
|
||||
self.string_buffer.push('\n'); // Add newline
|
||||
|
||||
// Consume the entire line
|
||||
Some((TokenType::StringContinuation, input.len()))
|
||||
(TokenType::StringContinuation, input.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[expect(clippy::range_plus_one, reason = "RangeInclusive is a different type!")]
|
||||
fn match_token(
|
||||
&mut self,
|
||||
input: &str,
|
||||
@@ -394,14 +390,19 @@ impl Tokeniser {
|
||||
}
|
||||
|
||||
// Handle miscellaneous characters.
|
||||
match input.chars().next() {
|
||||
Some(',') => Ok((TokenType::Comma, 1)),
|
||||
Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
|
||||
IoError::new(
|
||||
IoErrorKind::InvalidData,
|
||||
Some(format!("Unexpected character: '{c}'")),
|
||||
),
|
||||
))),
|
||||
match input.chars().enumerate().next() {
|
||||
Some((_, ',')) => Ok((TokenType::Comma, 1)),
|
||||
Some((idx, c)) => {
|
||||
let source =
|
||||
SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
|
||||
|
||||
Err(AssembleError::new_source_error(
|
||||
source,
|
||||
AssembleErrorKind::Tokeniser(error::TokeniserError::UnexpectedChar(
|
||||
c,
|
||||
)),
|
||||
))
|
||||
}
|
||||
None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
|
||||
IoError::new(
|
||||
IoErrorKind::InvalidData,
|
||||
@@ -410,21 +411,4 @@ impl Tokeniser {
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_module_name(&self) -> Result<String, AssembleError> {
|
||||
let module_name = self
|
||||
.path
|
||||
.file_name()
|
||||
.map(|f| f.to_string_lossy().to_string())
|
||||
.ok_or_else(|| {
|
||||
AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
|
||||
IoErrorKind::InvalidData,
|
||||
Some(
|
||||
"filename couldn't be extracted, is it valid UTF-8?".to_string(),
|
||||
),
|
||||
)))
|
||||
})?;
|
||||
|
||||
Ok(module_name)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
use common::prelude::Register;
|
||||
|
||||
use crate::{
|
||||
context::AssemblerContext,
|
||||
model::module::Module,
|
||||
source::{
|
||||
opcode::Opcode,
|
||||
token::{Token, TokenType},
|
||||
@@ -11,20 +11,21 @@ use crate::{
|
||||
tokeniser::Tokeniser,
|
||||
},
|
||||
};
|
||||
use std::path::PathBuf;
|
||||
use std::{path::PathBuf, sync::Arc};
|
||||
|
||||
/// Helper function to create a tokenizer from source text
|
||||
fn create_tokenizer_from_source(source: &str) -> Tokeniser {
|
||||
let data = source.as_bytes().to_vec();
|
||||
let path = PathBuf::from("test.dsa");
|
||||
Tokeniser::from_data(data, path)
|
||||
let module = Module::new(path).expect("Cannot create module!");
|
||||
|
||||
Tokeniser::from_data(source.as_bytes().to_vec(), Arc::new(module))
|
||||
}
|
||||
|
||||
/// Helper function to tokenize source and return tokens
|
||||
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
|
||||
let tokenizer = create_tokenizer_from_source(source);
|
||||
let context = AssemblerContext::new();
|
||||
tokenizer.tokenise(&context)
|
||||
|
||||
tokenizer.tokenise()
|
||||
}
|
||||
|
||||
/// Helper function to extract token types from a token vector
|
||||
|
||||
Reference in New Issue
Block a user