tokeniser: refactor to store Module directly in Tokeniser
This avoids making extra copies of the PathBuf. The tests have also been updated to match the new API.
This commit is contained in:
@@ -1,11 +1,7 @@
|
|||||||
//! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
|
//! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
|
||||||
//! outputs a [`Vec<Token>`].
|
//! outputs a [`Vec<Token>`].
|
||||||
|
|
||||||
use std::{
|
use std::{path::Path, str::FromStr, sync::Arc};
|
||||||
path::{Path, PathBuf},
|
|
||||||
str::FromStr,
|
|
||||||
sync::Arc,
|
|
||||||
};
|
|
||||||
|
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
|
||||||
@@ -33,8 +29,8 @@ mod tests;
|
|||||||
pub struct Tokeniser {
|
pub struct Tokeniser {
|
||||||
/// The data in the file.
|
/// The data in the file.
|
||||||
pub data: Vec<u8>,
|
pub data: Vec<u8>,
|
||||||
/// The path to the file.
|
/// A shared handle to the [`Module`] in which the file is situated.
|
||||||
pub path: PathBuf,
|
pub module: Arc<Module>,
|
||||||
|
|
||||||
// Pre-compiled regex patterns
|
// Pre-compiled regex patterns
|
||||||
label_regex: Regex,
|
label_regex: Regex,
|
||||||
@@ -54,11 +50,10 @@ pub struct Tokeniser {
|
|||||||
|
|
||||||
impl Tokeniser {
|
impl Tokeniser {
|
||||||
#[must_use]
|
#[must_use]
|
||||||
pub fn from_data(data: Vec<u8>, path: PathBuf) -> Self {
|
pub fn from_data(data: Vec<u8>, module: Arc<Module>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
data,
|
data,
|
||||||
path,
|
module,
|
||||||
|
|
||||||
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
|
label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
|
||||||
.expect("Failed to compile label regex pattern"),
|
.expect("Failed to compile label regex pattern"),
|
||||||
register_regex: Regex::new(
|
register_regex: Regex::new(
|
||||||
@@ -88,31 +83,29 @@ impl Tokeniser {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a [`Tokeniser`] from a file path.
|
/// Creates a [`Tokeniser`] from a file path. Also creates the underlying [`Module`]
|
||||||
pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, AssembleError> {
|
/// for you.
|
||||||
|
pub fn new<P: AsRef<Path>>(
|
||||||
|
path: P,
|
||||||
|
ctx: &AssemblerContext,
|
||||||
|
) -> Result<Self, AssembleError> {
|
||||||
let path = path.as_ref().to_path_buf();
|
let path = path.as_ref().to_path_buf();
|
||||||
let data = load_source_bytes(&path)?;
|
let data = load_source_bytes(&path)?;
|
||||||
|
let module = Arc::new(Module::new(path)?);
|
||||||
Ok(Self::from_data(data, path))
|
|
||||||
}
|
|
||||||
|
|
||||||
// Note that modules are tokenised in their own threads, possibly in parallel.
|
|
||||||
pub fn tokenise(
|
|
||||||
mut self,
|
|
||||||
ctx: &AssemblerContext,
|
|
||||||
) -> Result<Vec<Token>, AssembleError> {
|
|
||||||
let module_name = self.extract_module_name()?;
|
|
||||||
|
|
||||||
// Create a module for the source file being processed.
|
|
||||||
let module = Arc::new(Module::new(module_name, &self.path));
|
|
||||||
|
|
||||||
{
|
{
|
||||||
let mut module_registry = ctx.module_registry.write()?;
|
let mut module_registry = ctx.module_registry.write()?;
|
||||||
module_registry.add(module.clone());
|
module_registry.add(module.clone());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(Self::from_data(data, module))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note that modules are tokenised in their own threads, possibly in parallel.
|
||||||
|
pub fn tokenise(mut self) -> Result<Vec<Token>, AssembleError> {
|
||||||
let mut token_stream = Vec::new();
|
let mut token_stream = Vec::new();
|
||||||
let lines = lines_with_spans(&self.data);
|
let data = self.data.clone();
|
||||||
|
let lines = lines_with_spans(&data);
|
||||||
|
|
||||||
// Process each line
|
// Process each line
|
||||||
for line_result in lines {
|
for line_result in lines {
|
||||||
@@ -123,13 +116,13 @@ impl Tokeniser {
|
|||||||
if trimmed.is_empty() {
|
if trimmed.is_empty() {
|
||||||
token_stream.push(Token::new(
|
token_stream.push(Token::new(
|
||||||
TokenType::Newline,
|
TokenType::Newline,
|
||||||
SourceInfo::new(line_span.line_number, module.clone(), 0..1),
|
SourceInfo::new(line_span.line_number, self.module.clone(), 0..1),
|
||||||
));
|
));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Actually tokenise the line content
|
// Actually tokenise the line content
|
||||||
let line_tokens = self.tokenise_line(&line_span, &module)?;
|
let line_tokens = self.tokenise_line(&line_span)?;
|
||||||
token_stream.extend(line_tokens);
|
token_stream.extend(line_tokens);
|
||||||
|
|
||||||
// Add newline token at end of line
|
// Add newline token at end of line
|
||||||
@@ -137,22 +130,24 @@ impl Tokeniser {
|
|||||||
TokenType::Newline,
|
TokenType::Newline,
|
||||||
SourceInfo::new(
|
SourceInfo::new(
|
||||||
line_span.line_number,
|
line_span.line_number,
|
||||||
module.clone(),
|
self.module.clone(),
|
||||||
line_span.content.len()..line_span.content.len(),
|
line_span.content.len()..line_span.content.len(),
|
||||||
),
|
),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add EOF token
|
// Add EOF token
|
||||||
token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0)));
|
token_stream.push(Token::new(
|
||||||
|
TokenType::Eof,
|
||||||
|
SourceInfo::new(0, self.module.clone(), 0..0),
|
||||||
|
));
|
||||||
|
|
||||||
Ok(token_stream)
|
Ok(token_stream)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tokenise_line(
|
fn tokenise_line(
|
||||||
&mut self, // Changed to &mut self
|
&mut self,
|
||||||
line_span: &LineSpan,
|
line_span: &LineSpan,
|
||||||
module: &Arc<Module>,
|
|
||||||
) -> Result<Vec<Token>, AssembleError> {
|
) -> Result<Vec<Token>, AssembleError> {
|
||||||
let mut tokens = Vec::new();
|
let mut tokens = Vec::new();
|
||||||
let mut remaining = line_span.content.as_str();
|
let mut remaining = line_span.content.as_str();
|
||||||
@@ -168,7 +163,7 @@ impl Tokeniser {
|
|||||||
|
|
||||||
// Try to match a token
|
// Try to match a token
|
||||||
let (token_type, consumed) =
|
let (token_type, consumed) =
|
||||||
self.match_token(&remaining, line_span.line_number, column)?;
|
self.match_token(remaining, line_span.line_number, column)?;
|
||||||
|
|
||||||
// Filter out string continuation tokens and comments
|
// Filter out string continuation tokens and comments
|
||||||
match token_type {
|
match token_type {
|
||||||
@@ -184,7 +179,7 @@ impl Tokeniser {
|
|||||||
token_type,
|
token_type,
|
||||||
SourceInfo::new(
|
SourceInfo::new(
|
||||||
line_span.line_number,
|
line_span.line_number,
|
||||||
module.clone(),
|
self.module.clone(),
|
||||||
start_column..start_column + consumed,
|
start_column..start_column + consumed,
|
||||||
),
|
),
|
||||||
));
|
));
|
||||||
@@ -292,7 +287,7 @@ impl Tokeniser {
|
|||||||
) -> Option<(TokenType, usize)> {
|
) -> Option<(TokenType, usize)> {
|
||||||
if self.in_string {
|
if self.in_string {
|
||||||
// We're continuing a multiline string
|
// We're continuing a multiline string
|
||||||
self.handle_string_continuation(input, line_number, column)
|
Some(self.handle_string_continuation(input, line_number, column))
|
||||||
} else {
|
} else {
|
||||||
// Look for the start of a new string
|
// Look for the start of a new string
|
||||||
self.handle_string_start(input, line_number, column)
|
self.handle_string_start(input, line_number, column)
|
||||||
@@ -333,7 +328,7 @@ impl Tokeniser {
|
|||||||
input: &str,
|
input: &str,
|
||||||
_line_number: usize,
|
_line_number: usize,
|
||||||
_column: usize,
|
_column: usize,
|
||||||
) -> Option<(TokenType, usize)> {
|
) -> (TokenType, usize) {
|
||||||
// Look for closing quote
|
// Look for closing quote
|
||||||
if let Some(end_pos) = input.find('"') {
|
if let Some(end_pos) = input.find('"') {
|
||||||
// End of multiline string found
|
// End of multiline string found
|
||||||
@@ -343,17 +338,18 @@ impl Tokeniser {
|
|||||||
let content = std::mem::take(&mut self.string_buffer);
|
let content = std::mem::take(&mut self.string_buffer);
|
||||||
let len = end_pos + 1; // +1 for the closing quote
|
let len = end_pos + 1; // +1 for the closing quote
|
||||||
|
|
||||||
Some((TokenType::String(content), len))
|
(TokenType::String(content), len)
|
||||||
} else {
|
} else {
|
||||||
// Continue multiline string
|
// Continue multiline string
|
||||||
self.string_buffer.push_str(input);
|
self.string_buffer.push_str(input);
|
||||||
self.string_buffer.push('\n'); // Add newline
|
self.string_buffer.push('\n'); // Add newline
|
||||||
|
|
||||||
// Consume the entire line
|
// Consume the entire line
|
||||||
Some((TokenType::StringContinuation, input.len()))
|
(TokenType::StringContinuation, input.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[expect(clippy::range_plus_one, reason = "RangeInclusive is a different type!")]
|
||||||
fn match_token(
|
fn match_token(
|
||||||
&mut self,
|
&mut self,
|
||||||
input: &str,
|
input: &str,
|
||||||
@@ -394,14 +390,19 @@ impl Tokeniser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Handle miscellaneous characters.
|
// Handle miscellaneous characters.
|
||||||
match input.chars().next() {
|
match input.chars().enumerate().next() {
|
||||||
Some(',') => Ok((TokenType::Comma, 1)),
|
Some((_, ',')) => Ok((TokenType::Comma, 1)),
|
||||||
Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
|
Some((idx, c)) => {
|
||||||
IoError::new(
|
let source =
|
||||||
IoErrorKind::InvalidData,
|
SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
|
||||||
Some(format!("Unexpected character: '{c}'")),
|
|
||||||
),
|
Err(AssembleError::new_source_error(
|
||||||
))),
|
source,
|
||||||
|
AssembleErrorKind::Tokeniser(error::TokeniserError::UnexpectedChar(
|
||||||
|
c,
|
||||||
|
)),
|
||||||
|
))
|
||||||
|
}
|
||||||
None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
|
None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
|
||||||
IoError::new(
|
IoError::new(
|
||||||
IoErrorKind::InvalidData,
|
IoErrorKind::InvalidData,
|
||||||
@@ -410,21 +411,4 @@ impl Tokeniser {
|
|||||||
))),
|
))),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_module_name(&self) -> Result<String, AssembleError> {
|
|
||||||
let module_name = self
|
|
||||||
.path
|
|
||||||
.file_name()
|
|
||||||
.map(|f| f.to_string_lossy().to_string())
|
|
||||||
.ok_or_else(|| {
|
|
||||||
AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
|
|
||||||
IoErrorKind::InvalidData,
|
|
||||||
Some(
|
|
||||||
"filename couldn't be extracted, is it valid UTF-8?".to_string(),
|
|
||||||
),
|
|
||||||
)))
|
|
||||||
})?;
|
|
||||||
|
|
||||||
Ok(module_name)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
use common::prelude::Register;
|
use common::prelude::Register;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
context::AssemblerContext,
|
model::module::Module,
|
||||||
source::{
|
source::{
|
||||||
opcode::Opcode,
|
opcode::Opcode,
|
||||||
token::{Token, TokenType},
|
token::{Token, TokenType},
|
||||||
@@ -11,20 +11,21 @@ use crate::{
|
|||||||
tokeniser::Tokeniser,
|
tokeniser::Tokeniser,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
use std::path::PathBuf;
|
use std::{path::PathBuf, sync::Arc};
|
||||||
|
|
||||||
/// Helper function to create a tokenizer from source text
|
/// Helper function to create a tokenizer from source text
|
||||||
fn create_tokenizer_from_source(source: &str) -> Tokeniser {
|
fn create_tokenizer_from_source(source: &str) -> Tokeniser {
|
||||||
let data = source.as_bytes().to_vec();
|
|
||||||
let path = PathBuf::from("test.dsa");
|
let path = PathBuf::from("test.dsa");
|
||||||
Tokeniser::from_data(data, path)
|
let module = Module::new(path).expect("Cannot create module!");
|
||||||
|
|
||||||
|
Tokeniser::from_data(source.as_bytes().to_vec(), Arc::new(module))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper function to tokenize source and return tokens
|
/// Helper function to tokenize source and return tokens
|
||||||
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
|
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
|
||||||
let tokenizer = create_tokenizer_from_source(source);
|
let tokenizer = create_tokenizer_from_source(source);
|
||||||
let context = AssemblerContext::new();
|
|
||||||
tokenizer.tokenise(&context)
|
tokenizer.tokenise()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Helper function to extract token types from a token vector
|
/// Helper function to extract token types from a token vector
|
||||||
|
|||||||
Reference in New Issue
Block a user