From fb84a6d3c37c3bb043db06b9ffd31a16426a6223 Mon Sep 17 00:00:00 2001 From: "J. Hinchliffe" Date: Wed, 25 Jun 2025 16:50:17 +0100 Subject: [PATCH] assembler: clippy lints, better error formatting Adds regex dependency and enhances error handling system Introduces comprehensive error type hierarchy with specific variants for parser, symbol, codegen, threading, and IO errors to improve error reporting and debugging capabilities. Adds regex crate for pattern matching in tokenizer implementation with pre-compiled patterns for labels, registers, immediates, directives, instructions, and symbols. Enhances source info functionality with context printing and error underlining similar to compiler diagnostics. Implements better error conversions and threading error handling for lock failures and panics. --- Cargo.lock | 39 ++++++ assembler/Cargo.toml | 1 + assembler/src/error.rs | 169 ++++++++++++++++++++++-- assembler/src/error/conversions.rs | 64 ++++++++- assembler/src/model/module.rs | 2 +- assembler/src/model/module_registry.rs | 6 +- assembler/src/source.rs | 11 +- assembler/src/source/lines.rs | 76 +++++++++++ assembler/src/source/source_info.rs | 86 +++++++++++- assembler/src/source/tokeniser.rs | 79 ++++++++++- assembler/src/source/tokeniser/error.rs | 2 +- assembler/src/symtab.rs | 33 ++--- 12 files changed, 519 insertions(+), 49 deletions(-) create mode 100644 assembler/src/source/lines.rs diff --git a/Cargo.lock b/Cargo.lock index b3d79d8..916ae54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -129,6 +129,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "android-activity" version = "0.6.0" @@ -269,6 +278,7 @@ dependencies = [ "clap", "common", "num_cpus", + "regex", "threadpool", "uuid", ] @@ -2691,6 +2701,35 @@ dependencies = [ "thiserror 2.0.12", ] +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + [[package]] name = "renderdoc-sys" version = "1.1.0" diff --git a/assembler/Cargo.toml b/assembler/Cargo.toml index a39add2..704941c 100644 --- a/assembler/Cargo.toml +++ b/assembler/Cargo.toml @@ -16,5 +16,6 @@ path = "src/lib.rs" clap = { version = "4.5.40", features = ["derive"] } common = { path = "../common" } num_cpus = "1.17.0" +regex = "1.11.1" threadpool = "1.8.1" uuid = { version = "1.17.0", features = ["v4"] } diff --git a/assembler/src/error.rs b/assembler/src/error.rs index c26a30d..c5ad53b 100644 --- a/assembler/src/error.rs +++ b/assembler/src/error.rs @@ -51,26 +51,173 @@ impl Display for AssembleError { /// Marker trait. impl std::error::Error for AssembleError {} -/// Different types of errors that may occur when assembling a set of input source files. +#[derive(Debug, Clone)] #[non_exhaustive] -#[derive(Debug)] pub enum AssembleErrorKind { /// Usually unexpected I/O errors. Not normally recoverable. - IO(std::io::Error), + Io(IoError), /// Errors emitted from the [`Tokeniser`]. - Tokenise(TokeniserError), + Tokeniser(TokeniserError), + Parser(ParserError), + Symbol(SymbolError), + Codegen(CodegenError), + Threading(ThreadingError), /// Returned for code where the functionality has not yet been implemented but we /// don't want the program to panic. - Unimplemented(String), + Unimplemented(&'static str), +} + +#[derive(Debug, Clone)] +pub struct ParserError { + error_type: ParserErrorType, + source_info: SourceInfo, +} + +#[derive(Debug, Clone)] +pub enum ParserErrorType { + UnexpectedToken, + MissingOperand, + InvalidInstruction, + MissingLabel, + DuplicateLabel, +} + +impl Display for ParserErrorType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::UnexpectedToken => write!(f, "unexpected token"), + Self::MissingOperand => write!(f, "missing operand"), + Self::InvalidInstruction => write!(f, "invalid instruction"), + Self::MissingLabel => write!(f, "missing label"), + Self::DuplicateLabel => write!(f, "duplicate label"), + } + } +} + +impl Display for ParserError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // TODO: Print the path/to/filename.dsa:line_no, column col_no. + write!( + f, + "Parser error, {} at {}", + self.error_type, self.source_info + )?; + + // Prints out the context for our error. + self.source_info + .print_context_with_underline() + .map_err(|e| { + _ = writeln!(f, "Print context error: {e}"); + + std::fmt::Error {} + })?; + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub enum SymbolError { + Undefined, + Duplicate, + CircularDependency, + InvalidReference, +} + +impl Display for SymbolError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Undefined => write!(f, "undefined symbol"), + Self::Duplicate => write!(f, "duplicate symbol"), + Self::CircularDependency => write!(f, "circular dependency"), + Self::InvalidReference => write!(f, "invalid reference"), + } + } +} + +#[derive(Debug, Clone)] +pub enum CodegenError { + InvalidOperand, + OutOfRange, + UnsupportedInstruction, +} + +impl Display for CodegenError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::InvalidOperand => write!(f, "invalid operand"), + Self::OutOfRange => write!(f, "out of range"), + Self::UnsupportedInstruction => write!(f, "unsupported instruction"), + } + } +} + +#[derive(Debug, Clone)] +pub enum ThreadingError { + LockFailed, + ThreadPanic, +} + +impl Display for ThreadingError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::LockFailed => write!(f, "lock failed"), + Self::ThreadPanic => write!(f, "thread panic"), + } + } +} + +#[derive(Debug, Clone)] +pub struct IoError { + msg: Option, + kind: IoErrorKind, +} + +impl IoError { + #[must_use] + pub const fn new(kind: IoErrorKind, msg: Option) -> Self { + Self { msg, kind } + } +} + +#[derive(Debug, Clone)] +pub enum IoErrorKind { + NotFound, + PermissionDenied, + InvalidData, + Other, +} + +impl std::fmt::Display for IoErrorKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "file not found"), + Self::PermissionDenied => write!(f, "permission denied"), + Self::InvalidData => write!(f, "invalid data"), + Self::Other => write!(f, "other I/O error"), + } + } +} + +impl std::fmt::Display for IoError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.kind)?; + + if let Some(msg) = &self.msg { + write!(f, ", \"{msg}\"")?; + } + + Ok(()) + } } impl Display for AssembleErrorKind { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Tokenise(why) => write!(f, "tokeniser error: {why}"), + Self::Tokeniser(why) => write!(f, "tokeniser error: {why}"), Self::Unimplemented(why) => write!(f, "used unimplemented feature: {why}"), - Self::IO(why) => write!(f, "problem occurred with I/O: {why}"), - #[expect(unreachable_patterns)] + Self::Io(why) => write!(f, "problem occurred with I/O: {why}"), + #[allow(unreachable_patterns)] _ => write!( f, "unhandled error type in Display implementation! See error.rs!" @@ -79,10 +226,4 @@ impl Display for AssembleErrorKind { } } -impl From for AssembleErrorKind { - fn from(err: std::io::Error) -> Self { - Self::IO(err) - } -} - pub mod conversions; diff --git a/assembler/src/error/conversions.rs b/assembler/src/error/conversions.rs index 8bb4731..75af112 100644 --- a/assembler/src/error/conversions.rs +++ b/assembler/src/error/conversions.rs @@ -1,7 +1,67 @@ -use crate::error::AssembleError; +use std::{ + io::ErrorKind, + sync::{PoisonError, RwLockReadGuard, RwLockWriteGuard}, +}; + +use crate::error::{AssembleError, IoError, IoErrorKind}; + +use super::{AssembleErrorKind, ThreadingError}; + +impl From for IoError { + fn from(err: std::io::Error) -> Self { + let kind = match err.kind() { + ErrorKind::NotFound => IoErrorKind::NotFound, + ErrorKind::PermissionDenied => IoErrorKind::PermissionDenied, + ErrorKind::InvalidData => IoErrorKind::InvalidData, + _ => IoErrorKind::Other, + }; + + let msg = err.to_string(); + + Self::new(kind, Some(msg)) + } +} impl From for AssembleError { fn from(err: std::io::Error) -> Self { - Self::new_other_error(err.into()) + Self::new_other_error(AssembleErrorKind::Io(err.into())) + } +} + +// TODO: Maybe attempt recovery? To be honest we don't want any threads to panic at all, +// or we want them all to panic spectacularly. +impl From>> for AssembleError { + fn from(err: PoisonError>) -> Self { + Self::new_other_error(AssembleErrorKind::Threading(err.into())) + } +} + +impl From>> for ThreadingError { + fn from(_err: PoisonError>) -> Self { + Self::LockFailed + } +} + +impl From>> for AssembleError { + fn from(err: PoisonError>) -> Self { + Self::new_other_error(AssembleErrorKind::Threading(err.into())) + } +} + +impl From>> for ThreadingError { + fn from(_err: PoisonError>) -> Self { + Self::LockFailed + } +} + +impl From for AssembleError { + fn from(err: std::fmt::Error) -> Self { + IoError::new(IoErrorKind::Other, Some(err.to_string())).into() + } +} + +impl From for AssembleError { + fn from(err: IoError) -> Self { + Self::new_other_error(AssembleErrorKind::Io(err)) } } diff --git a/assembler/src/model/module.rs b/assembler/src/model/module.rs index 6b722d8..d8ec547 100644 --- a/assembler/src/model/module.rs +++ b/assembler/src/model/module.rs @@ -40,7 +40,7 @@ impl std::fmt::Display for ModuleId { } /// A single source file or compilation unit. Stores its own symbol table. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Module { /// The name of the module. This is typically the name of the file, less the `.dsa` /// extension. diff --git a/assembler/src/model/module_registry.rs b/assembler/src/model/module_registry.rs index b91f4bf..72ed50a 100644 --- a/assembler/src/model/module_registry.rs +++ b/assembler/src/model/module_registry.rs @@ -17,14 +17,16 @@ impl Default for ModuleRegistry { } impl ModuleRegistry { - #[must_use] pub fn new() -> Self { + #[must_use] + pub fn new() -> Self { Self { modules: HashMap::new(), } } /// Gets a [`Module`] by ID. - #[must_use] pub fn get(&self, module_id: &ModuleId) -> Option<&Module> { + #[must_use] + pub fn get(&self, module_id: &ModuleId) -> Option<&Module> { self.modules.get(module_id) } diff --git a/assembler/src/source.rs b/assembler/src/source.rs index 93ec85d..96a054e 100644 --- a/assembler/src/source.rs +++ b/assembler/src/source.rs @@ -1,10 +1,14 @@ //! This module contains anything within the first stage of assembly, i.e. the //! tokenisation stage, or utility functions for reading input files. -use std::path::Path; +use std::{ + io::{BufRead, Lines}, + path::Path, +}; use crate::error::AssembleError; +pub mod lines; pub mod source_info; pub mod token; pub mod token_info; @@ -17,3 +21,8 @@ pub fn load_source_bytes>(p: P) -> Result, AssembleError> Ok(std::fs::read(path)?) } + +/// Get the lines from a [`BufReader`]. +pub fn reader_lines(rdr: R) -> Lines { + rdr.lines() +} diff --git a/assembler/src/source/lines.rs b/assembler/src/source/lines.rs new file mode 100644 index 0000000..4087c35 --- /dev/null +++ b/assembler/src/source/lines.rs @@ -0,0 +1,76 @@ +//! Enhanced lines iterator that tracks line numbers and character positions. + +use std::io::{BufRead, BufReader, Cursor}; + +use crate::error::AssembleError; + +/// Iterator that yields lines with their line numbers and character spans. +pub struct LinesWithSpans { + reader: R, + line_number: usize, + total_chars: usize, + buffer: String, +} + +#[derive(Debug, Clone)] +pub struct LineSpan { + /// The line number. + pub line_number: usize, + /// The contents of the line. + pub content: String, + /// Character offset from start of file. + pub start_char: usize, + /// End character offset (exclusive). + pub end_char: usize, +} + +impl LinesWithSpans { + pub const fn new(reader: R) -> Self { + Self { + reader, + line_number: 0, + total_chars: 0, + buffer: String::new(), + } + } +} + +impl Iterator for LinesWithSpans { + type Item = Result; + + fn next(&mut self) -> Option { + self.buffer.clear(); + + match self.reader.read_line(&mut self.buffer) { + Ok(0) => None, // EOF + Ok(bytes_read) => { + self.line_number += 1; + let start_char = self.total_chars; + self.total_chars += bytes_read; + + // Remove trailing newline for cleaner processing + let content = if self.buffer.ends_with('\n') { + self.buffer[..self.buffer.len() - 1].to_string() + } else { + self.buffer.clone() + }; + + Some(Ok(LineSpan { + line_number: self.line_number, + content, + start_char, + end_char: self.total_chars, + })) + } + Err(e) => Some(Err(e.into())), + } + } +} + +/// Helper function to create lines iterator from data. +#[must_use] +pub fn lines_with_spans(data: &[u8]) -> LinesWithSpans>> { + let cursor = Cursor::new(data); + let reader = BufReader::new(cursor); + LinesWithSpans::new(reader) +} diff --git a/assembler/src/source/source_info.rs b/assembler/src/source/source_info.rs index ff43b4a..54f6955 100644 --- a/assembler/src/source/source_info.rs +++ b/assembler/src/source/source_info.rs @@ -4,22 +4,98 @@ //! This will likely be attached to a [`Token`] which will in turn be attached to an AST //! [`Node`]. -use std::fmt::Display; +use std::{ + fmt::{Display, Write}, + fs::File, + io::BufReader, + sync::Arc, +}; -use crate::model::module::Module; +use crate::{ + error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind}, + model::module::Module, + source::lines::LinesWithSpans, +}; /// Information on where the token is within the source. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct SourceInfo { /// The line number within the source file underpinned by `module_id`. pub line_no: usize, - pub module: Module, + pub module: Arc, /// The indexes where this token may be found (line-local). pub span: std::ops::Range, } impl Display for SourceInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.module.name) + write!( + f, + "{}:{}, column {}", + self.module.path.display(), + self.line_no, + self.span.start + ) + } +} + +impl SourceInfo { + #[must_use] + pub const fn new( + line_no: usize, + module: Arc, + span: std::ops::Range, + ) -> Self { + Self { + line_no, + module, + span, + } + } + + /// Prints out where in the source code the error originated with an underline similar + /// to what rustc does. + pub fn print_context_with_underline(&self) -> Result<(), AssembleError> { + let f = File::open(&self.module.path)?; + let rdr = BufReader::new(f); + + let mut lines = LinesWithSpans::new(rdr); + + let Some(line_result) = lines.nth(self.line_no - 1) else { + // Handle a line not existing. + return Err(AssembleError::new_source_error( + self.clone(), + AssembleErrorKind::Io(IoError::new( + IoErrorKind::Other, + Some(format!( + "the line {} does not exist in input file `{}` but source info suggested otherwise!.", + self.line_no, + self.module.path.display() + )), + )), + )); + }; + + let line_span = line_result?; + + // Print the line number and line content. + println!("{:>4} | {}", self.line_no, line_span.content); + + let mut underline = String::new(); + write!(underline, "{:>4} | ", "")?; + + for _ in 0..self.span.start { + underline.push(' '); + } + + for _ in self.span.start..self.span.end.min(line_span.content.len()) { + underline.push('^'); + } + + // Print the underline in red and bold. + // TODO: Use a crate to make this extra portable. + println!("\x1b[1;31m{underline}\x1b[0m"); + + Ok(()) } } diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs index d7c0c6b..1f3c588 100644 --- a/assembler/src/source/tokeniser.rs +++ b/assembler/src/source/tokeniser.rs @@ -3,9 +3,15 @@ use std::path::{Path, PathBuf}; +use regex::Regex; + use crate::{ - error::{AssembleError, AssembleErrorKind}, - source::{load_source_bytes, token::Token}, + context::AssemblerContext, + error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind}, + model::module::Module, + source::{ + lines::lines_with_spans, load_source_bytes, token::Token, + }, }; pub mod error; @@ -16,12 +22,41 @@ pub struct Tokeniser { pub data: Vec, /// The path to the file. pub path: PathBuf, + + // Pre-compiled regex patterns + label_regex: Regex, + register_regex: Regex, + immediate_regex: Regex, + directive_regex: Regex, + instruction_regex: Regex, + symbol_regex: Regex, + string_regex: Regex, } impl Tokeniser { #[must_use] - pub const fn from_data(data: Vec, path: PathBuf) -> Self { - Self { data, path } + pub fn from_data(data: Vec, path: PathBuf) -> Self { + Self { + data, + path, + + label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):") + .expect("Failed to compile label regex pattern"), + register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)") + .expect("Failed to compile register regex pattern"), + immediate_regex: Regex::new(r"^(0x[0-9a-fA-F]+|[0-9]+)") + .expect("Failed to compile immediate regex pattern"), + directive_regex: Regex::new(r"^\.([a-zA-Z]+)") + .expect("Failed to compile directive regex pattern"), + instruction_regex: Regex::new( + r"^(add|sub|mul|div|jmp|call|ret|lli|nop|halt)", + ) + .expect("Failed to compile instruction regex pattern"), + symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)") + .expect("Failed to compile symbol regex pattern"), + string_regex: Regex::new(r#"^"([^"]*)"#) + .expect("Failed to compile string regex pattern"), + } } /// Creates a [`Tokeniser`] from a file path. @@ -29,12 +64,42 @@ impl Tokeniser { let path = path.as_ref().to_path_buf(); let data = load_source_bytes(&path)?; - Ok(Self { data, path }) + Ok(Self::from_data(data, path)) } - pub fn tokenise(self) -> Result, AssembleError> { + // Note that modules are tokenised in their own threads, possibly in parallel. + pub fn tokenise(self, ctx: &AssemblerContext) -> Result, AssembleError> { + let lines = lines_with_spans(&self.data); + + let Some(module_name) = self.path.file_name().and_then(|f| f.to_str()) else { + return Err(AssembleError::new_other_error(AssembleErrorKind::Io( + IoError::new( + IoErrorKind::InvalidData, + Some( + "filename couldn't be extracted, is it valid UTF-8?".to_string(), + ), + ), + ))); + }; + + // Create a module for the source file being processed. + let module = Module::new(module_name.to_string(), &self.path); + + { + let mut module_registry = ctx.module_registry.write()?; + module_registry.add(module); + } + + // Technically ignores newlines since line will be trimmed. We just append a + // Newline token for each line. + for line_result in lines { + let line = line_result?; + + eprintln!("{}", line.line_number); + } + Err(AssembleError::new_other_error( - AssembleErrorKind::Unimplemented("tokeniser not written yet!".to_string()), + AssembleErrorKind::Unimplemented("tokeniser not written yet!"), )) } } diff --git a/assembler/src/source/tokeniser/error.rs b/assembler/src/source/tokeniser/error.rs index 0af204a..84127ed 100644 --- a/assembler/src/source/tokeniser/error.rs +++ b/assembler/src/source/tokeniser/error.rs @@ -1,6 +1,6 @@ //! This module contains the error types for the tokeniser. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub enum TokeniserError {} impl std::fmt::Display for TokeniserError { diff --git a/assembler/src/symtab.rs b/assembler/src/symtab.rs index 3c64309..2a6977b 100644 --- a/assembler/src/symtab.rs +++ b/assembler/src/symtab.rs @@ -46,12 +46,11 @@ impl SymbolTable { && let Some(existing) = self.symbols.get(&existing_id) && existing.module_id == module_id { - return Err(AssembleError::new_other_error( - crate::error::AssembleErrorKind::IO(std::io::Error::new( - std::io::ErrorKind::AlreadyExists, - format!("Symbol '{name}' already defined in module"), - )), - )); + return Err(std::io::Error::new( + std::io::ErrorKind::AlreadyExists, + format!("Symbol '{name}' already defined in module"), + ) + .into()); } // Add to all mappings @@ -63,19 +62,22 @@ impl SymbolTable { } /// Gets the [`Symbol`] by its [`SymbolId`]. - #[must_use] pub fn get(&self, id: &SymbolId) -> Option<&Symbol> { + #[must_use] + pub fn get(&self, id: &SymbolId) -> Option<&Symbol> { self.symbols.get(id) } /// Gets the [`Symbol`] by its name. - #[must_use] pub fn get_by_name(&self, name: &str) -> Option<&Symbol> { + #[must_use] + pub fn get_by_name(&self, name: &str) -> Option<&Symbol> { self.name_to_id .get(name) .and_then(|id| self.symbols.get(id)) } /// Gets all [`Symbol`]s in a module. - #[must_use] pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> { + #[must_use] + pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> { self.module_symbols .get(module_id) .map(|ids| ids.iter().filter_map(|id| self.symbols.get(id)).collect()) @@ -83,7 +85,8 @@ impl SymbolTable { } /// Gets all the public symbols. - #[must_use] pub fn get_public_symbols(&self) -> Vec<&Symbol> { + #[must_use] + pub fn get_public_symbols(&self) -> Vec<&Symbol> { self.symbols .values() .filter(|sym| matches!(sym.visibility, Visibility::Public)) @@ -104,12 +107,10 @@ impl SymbolTable { } Ok(()) } else { - Err(AssembleError::new_other_error( - crate::error::AssembleErrorKind::IO(std::io::Error::new( - std::io::ErrorKind::NotFound, - "Symbol not found", - )), - )) + Err( + std::io::Error::new(std::io::ErrorKind::NotFound, "Symbol not found") + .into(), + ) } } }