diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs index c1ecd5a..49cb4b5 100644 --- a/assembler/src/source/tokeniser.rs +++ b/assembler/src/source/tokeniser.rs @@ -1,11 +1,7 @@ //! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and //! outputs a [`Vec`]. -use std::{ - path::{Path, PathBuf}, - str::FromStr, - sync::Arc, -}; +use std::{path::Path, str::FromStr, sync::Arc}; use regex::Regex; @@ -33,8 +29,8 @@ mod tests; pub struct Tokeniser { /// The data in the file. pub data: Vec, - /// The path to the file. - pub path: PathBuf, + /// A copy of the Module in which the file is situated. + pub module: Arc, // Pre-compiled regex patterns label_regex: Regex, @@ -54,11 +50,10 @@ pub struct Tokeniser { impl Tokeniser { #[must_use] - pub fn from_data(data: Vec, path: PathBuf) -> Self { + pub fn from_data(data: Vec, module: Arc) -> Self { Self { data, - path, - + module, label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):") .expect("Failed to compile label regex pattern"), register_regex: Regex::new( @@ -88,31 +83,29 @@ impl Tokeniser { } } - /// Creates a [`Tokeniser`] from a file path. - pub fn new>(path: P) -> Result { + /// Creates a [`Tokeniser`] from a file path. Also creates the underlying [`Module`] + /// for you. + pub fn new>( + path: P, + ctx: &AssemblerContext, + ) -> Result { let path = path.as_ref().to_path_buf(); let data = load_source_bytes(&path)?; - - Ok(Self::from_data(data, path)) - } - - // Note that modules are tokenised in their own threads, possibly in parallel. - pub fn tokenise( - mut self, - ctx: &AssemblerContext, - ) -> Result, AssembleError> { - let module_name = self.extract_module_name()?; - - // Create a module for the source file being processed. - let module = Arc::new(Module::new(module_name, &self.path)); + let module = Arc::new(Module::new(path)?); { let mut module_registry = ctx.module_registry.write()?; module_registry.add(module.clone()); } + Ok(Self::from_data(data, module)) + } + + // Note that modules are tokenised in their own threads, possibly in parallel. + pub fn tokenise(mut self) -> Result, AssembleError> { let mut token_stream = Vec::new(); - let lines = lines_with_spans(&self.data); + let data = self.data.clone(); + let lines = lines_with_spans(&data); // Process each line for line_result in lines { @@ -123,13 +116,13 @@ impl Tokeniser { if trimmed.is_empty() { token_stream.push(Token::new( TokenType::Newline, - SourceInfo::new(line_span.line_number, module.clone(), 0..1), + SourceInfo::new(line_span.line_number, self.module.clone(), 0..1), )); continue; } // Actually tokenise the line content - let line_tokens = self.tokenise_line(&line_span, &module)?; + let line_tokens = self.tokenise_line(&line_span)?; token_stream.extend(line_tokens); // Add newline token at end of line @@ -137,22 +130,24 @@ impl Tokeniser { TokenType::Newline, SourceInfo::new( line_span.line_number, - module.clone(), + self.module.clone(), line_span.content.len()..line_span.content.len(), ), )); } // Add EOF token - token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0))); + token_stream.push(Token::new( + TokenType::Eof, + SourceInfo::new(0, self.module.clone(), 0..0), + )); Ok(token_stream) } fn tokenise_line( - &mut self, // Changed to &mut self + &mut self, line_span: &LineSpan, - module: &Arc, ) -> Result, AssembleError> { let mut tokens = Vec::new(); let mut remaining = line_span.content.as_str(); @@ -168,7 +163,7 @@ impl Tokeniser { // Try to match a token let (token_type, consumed) = - self.match_token(&remaining, line_span.line_number, column)?; + self.match_token(remaining, line_span.line_number, column)?; // Filter out string continuation tokens and comments match token_type { @@ -184,7 +179,7 @@ impl Tokeniser { token_type, SourceInfo::new( line_span.line_number, - module.clone(), + self.module.clone(), start_column..start_column + consumed, ), )); @@ -292,7 +287,7 @@ impl Tokeniser { ) -> Option<(TokenType, usize)> { if self.in_string { // We're continuing a multiline string - self.handle_string_continuation(input, line_number, column) + Some(self.handle_string_continuation(input, line_number, column)) } else { // Look for the start of a new string self.handle_string_start(input, line_number, column) @@ -333,7 +328,7 @@ impl Tokeniser { input: &str, _line_number: usize, _column: usize, - ) -> Option<(TokenType, usize)> { + ) -> (TokenType, usize) { // Look for closing quote if let Some(end_pos) = input.find('"') { // End of multiline string found @@ -343,17 +338,18 @@ impl Tokeniser { let content = std::mem::take(&mut self.string_buffer); let len = end_pos + 1; // +1 for the closing quote - Some((TokenType::String(content), len)) + (TokenType::String(content), len) } else { // Continue multiline string self.string_buffer.push_str(input); self.string_buffer.push('\n'); // Add newline // Consume the entire line - Some((TokenType::StringContinuation, input.len())) + (TokenType::StringContinuation, input.len()) } } + #[expect(clippy::range_plus_one, reason = "RangeInclusive is a different type!")] fn match_token( &mut self, input: &str, @@ -394,14 +390,19 @@ impl Tokeniser { } // Handle miscellaneous characters. - match input.chars().next() { - Some(',') => Ok((TokenType::Comma, 1)), - Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io( - IoError::new( - IoErrorKind::InvalidData, - Some(format!("Unexpected character: '{c}'")), - ), - ))), + match input.chars().enumerate().next() { + Some((_, ',')) => Ok((TokenType::Comma, 1)), + Some((idx, c)) => { + let source = + SourceInfo::new(line_number, self.module.clone(), idx..idx + 1); + + Err(AssembleError::new_source_error( + source, + AssembleErrorKind::Tokeniser(error::TokeniserError::UnexpectedChar( + c, + )), + )) + } None => Err(AssembleError::new_other_error(AssembleErrorKind::Io( IoError::new( IoErrorKind::InvalidData, @@ -410,21 +411,4 @@ impl Tokeniser { ))), } } - - fn extract_module_name(&self) -> Result { - let module_name = self - .path - .file_name() - .map(|f| f.to_string_lossy().to_string()) - .ok_or_else(|| { - AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new( - IoErrorKind::InvalidData, - Some( - "filename couldn't be extracted, is it valid UTF-8?".to_string(), - ), - ))) - })?; - - Ok(module_name) - } } diff --git a/assembler/src/source/tokeniser/tests.rs b/assembler/src/source/tokeniser/tests.rs index f9fe225..392d8f5 100644 --- a/assembler/src/source/tokeniser/tests.rs +++ b/assembler/src/source/tokeniser/tests.rs @@ -3,7 +3,7 @@ use common::prelude::Register; use crate::{ - context::AssemblerContext, + model::module::Module, source::{ opcode::Opcode, token::{Token, TokenType}, @@ -11,20 +11,21 @@ use crate::{ tokeniser::Tokeniser, }, }; -use std::path::PathBuf; +use std::{path::PathBuf, sync::Arc}; /// Helper function to create a tokenizer from source text fn create_tokenizer_from_source(source: &str) -> Tokeniser { - let data = source.as_bytes().to_vec(); let path = PathBuf::from("test.dsa"); - Tokeniser::from_data(data, path) + let module = Module::new(path).expect("Cannot create module!"); + + Tokeniser::from_data(source.as_bytes().to_vec(), Arc::new(module)) } /// Helper function to tokenize source and return tokens fn tokenize_source(source: &str) -> Result, crate::error::AssembleError> { let tokenizer = create_tokenizer_from_source(source); - let context = AssemblerContext::new(); - tokenizer.tokenise(&context) + + tokenizer.tokenise() } /// Helper function to extract token types from a token vector