diff --git a/Cargo.lock b/Cargo.lock index b3d79d8..916ae54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -129,6 +129,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + [[package]] name = "android-activity" version = "0.6.0" @@ -269,6 +278,7 @@ dependencies = [ "clap", "common", "num_cpus", + "regex", "threadpool", "uuid", ] @@ -2691,6 +2701,35 @@ dependencies = [ "thiserror 2.0.12", ] +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + [[package]] name = "renderdoc-sys" version = "1.1.0" diff --git a/assembler/Cargo.toml b/assembler/Cargo.toml index a39add2..704941c 100644 --- a/assembler/Cargo.toml +++ b/assembler/Cargo.toml @@ -16,5 +16,6 @@ path = "src/lib.rs" clap = { version = "4.5.40", features = ["derive"] } common = { path = "../common" } num_cpus = "1.17.0" +regex = "1.11.1" threadpool = "1.8.1" uuid = { version = "1.17.0", features = ["v4"] } diff --git a/assembler/src/error.rs b/assembler/src/error.rs index c26a30d..c5ad53b 100644 --- a/assembler/src/error.rs +++ b/assembler/src/error.rs @@ -51,26 +51,173 @@ impl Display for AssembleError { /// Marker trait. impl std::error::Error for AssembleError {} -/// Different types of errors that may occur when assembling a set of input source files. +#[derive(Debug, Clone)] #[non_exhaustive] -#[derive(Debug)] pub enum AssembleErrorKind { /// Usually unexpected I/O errors. Not normally recoverable. - IO(std::io::Error), + Io(IoError), /// Errors emitted from the [`Tokeniser`]. - Tokenise(TokeniserError), + Tokeniser(TokeniserError), + Parser(ParserError), + Symbol(SymbolError), + Codegen(CodegenError), + Threading(ThreadingError), /// Returned for code where the functionality has not yet been implemented but we /// don't want the program to panic. - Unimplemented(String), + Unimplemented(&'static str), +} + +#[derive(Debug, Clone)] +pub struct ParserError { + error_type: ParserErrorType, + source_info: SourceInfo, +} + +#[derive(Debug, Clone)] +pub enum ParserErrorType { + UnexpectedToken, + MissingOperand, + InvalidInstruction, + MissingLabel, + DuplicateLabel, +} + +impl Display for ParserErrorType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::UnexpectedToken => write!(f, "unexpected token"), + Self::MissingOperand => write!(f, "missing operand"), + Self::InvalidInstruction => write!(f, "invalid instruction"), + Self::MissingLabel => write!(f, "missing label"), + Self::DuplicateLabel => write!(f, "duplicate label"), + } + } +} + +impl Display for ParserError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // TODO: Print the path/to/filename.dsa:line_no, column col_no. + write!( + f, + "Parser error, {} at {}", + self.error_type, self.source_info + )?; + + // Prints out the context for our error. + self.source_info + .print_context_with_underline() + .map_err(|e| { + _ = writeln!(f, "Print context error: {e}"); + + std::fmt::Error {} + })?; + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub enum SymbolError { + Undefined, + Duplicate, + CircularDependency, + InvalidReference, +} + +impl Display for SymbolError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Undefined => write!(f, "undefined symbol"), + Self::Duplicate => write!(f, "duplicate symbol"), + Self::CircularDependency => write!(f, "circular dependency"), + Self::InvalidReference => write!(f, "invalid reference"), + } + } +} + +#[derive(Debug, Clone)] +pub enum CodegenError { + InvalidOperand, + OutOfRange, + UnsupportedInstruction, +} + +impl Display for CodegenError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::InvalidOperand => write!(f, "invalid operand"), + Self::OutOfRange => write!(f, "out of range"), + Self::UnsupportedInstruction => write!(f, "unsupported instruction"), + } + } +} + +#[derive(Debug, Clone)] +pub enum ThreadingError { + LockFailed, + ThreadPanic, +} + +impl Display for ThreadingError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::LockFailed => write!(f, "lock failed"), + Self::ThreadPanic => write!(f, "thread panic"), + } + } +} + +#[derive(Debug, Clone)] +pub struct IoError { + msg: Option, + kind: IoErrorKind, +} + +impl IoError { + #[must_use] + pub const fn new(kind: IoErrorKind, msg: Option) -> Self { + Self { msg, kind } + } +} + +#[derive(Debug, Clone)] +pub enum IoErrorKind { + NotFound, + PermissionDenied, + InvalidData, + Other, +} + +impl std::fmt::Display for IoErrorKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::NotFound => write!(f, "file not found"), + Self::PermissionDenied => write!(f, "permission denied"), + Self::InvalidData => write!(f, "invalid data"), + Self::Other => write!(f, "other I/O error"), + } + } +} + +impl std::fmt::Display for IoError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.kind)?; + + if let Some(msg) = &self.msg { + write!(f, ", \"{msg}\"")?; + } + + Ok(()) + } } impl Display for AssembleErrorKind { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Self::Tokenise(why) => write!(f, "tokeniser error: {why}"), + Self::Tokeniser(why) => write!(f, "tokeniser error: {why}"), Self::Unimplemented(why) => write!(f, "used unimplemented feature: {why}"), - Self::IO(why) => write!(f, "problem occurred with I/O: {why}"), - #[expect(unreachable_patterns)] + Self::Io(why) => write!(f, "problem occurred with I/O: {why}"), + #[allow(unreachable_patterns)] _ => write!( f, "unhandled error type in Display implementation! See error.rs!" @@ -79,10 +226,4 @@ impl Display for AssembleErrorKind { } } -impl From for AssembleErrorKind { - fn from(err: std::io::Error) -> Self { - Self::IO(err) - } -} - pub mod conversions; diff --git a/assembler/src/error/conversions.rs b/assembler/src/error/conversions.rs index 8bb4731..75af112 100644 --- a/assembler/src/error/conversions.rs +++ b/assembler/src/error/conversions.rs @@ -1,7 +1,67 @@ -use crate::error::AssembleError; +use std::{ + io::ErrorKind, + sync::{PoisonError, RwLockReadGuard, RwLockWriteGuard}, +}; + +use crate::error::{AssembleError, IoError, IoErrorKind}; + +use super::{AssembleErrorKind, ThreadingError}; + +impl From for IoError { + fn from(err: std::io::Error) -> Self { + let kind = match err.kind() { + ErrorKind::NotFound => IoErrorKind::NotFound, + ErrorKind::PermissionDenied => IoErrorKind::PermissionDenied, + ErrorKind::InvalidData => IoErrorKind::InvalidData, + _ => IoErrorKind::Other, + }; + + let msg = err.to_string(); + + Self::new(kind, Some(msg)) + } +} impl From for AssembleError { fn from(err: std::io::Error) -> Self { - Self::new_other_error(err.into()) + Self::new_other_error(AssembleErrorKind::Io(err.into())) + } +} + +// TODO: Maybe attempt recovery? To be honest we don't want any threads to panic at all, +// or we want them all to panic spectacularly. +impl From>> for AssembleError { + fn from(err: PoisonError>) -> Self { + Self::new_other_error(AssembleErrorKind::Threading(err.into())) + } +} + +impl From>> for ThreadingError { + fn from(_err: PoisonError>) -> Self { + Self::LockFailed + } +} + +impl From>> for AssembleError { + fn from(err: PoisonError>) -> Self { + Self::new_other_error(AssembleErrorKind::Threading(err.into())) + } +} + +impl From>> for ThreadingError { + fn from(_err: PoisonError>) -> Self { + Self::LockFailed + } +} + +impl From for AssembleError { + fn from(err: std::fmt::Error) -> Self { + IoError::new(IoErrorKind::Other, Some(err.to_string())).into() + } +} + +impl From for AssembleError { + fn from(err: IoError) -> Self { + Self::new_other_error(AssembleErrorKind::Io(err)) } } diff --git a/assembler/src/model/module.rs b/assembler/src/model/module.rs index 6b722d8..d8ec547 100644 --- a/assembler/src/model/module.rs +++ b/assembler/src/model/module.rs @@ -40,7 +40,7 @@ impl std::fmt::Display for ModuleId { } /// A single source file or compilation unit. Stores its own symbol table. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Module { /// The name of the module. This is typically the name of the file, less the `.dsa` /// extension. diff --git a/assembler/src/model/module_registry.rs b/assembler/src/model/module_registry.rs index b91f4bf..72ed50a 100644 --- a/assembler/src/model/module_registry.rs +++ b/assembler/src/model/module_registry.rs @@ -17,14 +17,16 @@ impl Default for ModuleRegistry { } impl ModuleRegistry { - #[must_use] pub fn new() -> Self { + #[must_use] + pub fn new() -> Self { Self { modules: HashMap::new(), } } /// Gets a [`Module`] by ID. - #[must_use] pub fn get(&self, module_id: &ModuleId) -> Option<&Module> { + #[must_use] + pub fn get(&self, module_id: &ModuleId) -> Option<&Module> { self.modules.get(module_id) } diff --git a/assembler/src/source.rs b/assembler/src/source.rs index 93ec85d..96a054e 100644 --- a/assembler/src/source.rs +++ b/assembler/src/source.rs @@ -1,10 +1,14 @@ //! This module contains anything within the first stage of assembly, i.e. the //! tokenisation stage, or utility functions for reading input files. -use std::path::Path; +use std::{ + io::{BufRead, Lines}, + path::Path, +}; use crate::error::AssembleError; +pub mod lines; pub mod source_info; pub mod token; pub mod token_info; @@ -17,3 +21,8 @@ pub fn load_source_bytes>(p: P) -> Result, AssembleError> Ok(std::fs::read(path)?) } + +/// Get the lines from a [`BufReader`]. +pub fn reader_lines(rdr: R) -> Lines { + rdr.lines() +} diff --git a/assembler/src/source/lines.rs b/assembler/src/source/lines.rs new file mode 100644 index 0000000..4087c35 --- /dev/null +++ b/assembler/src/source/lines.rs @@ -0,0 +1,76 @@ +//! Enhanced lines iterator that tracks line numbers and character positions. + +use std::io::{BufRead, BufReader, Cursor}; + +use crate::error::AssembleError; + +/// Iterator that yields lines with their line numbers and character spans. +pub struct LinesWithSpans { + reader: R, + line_number: usize, + total_chars: usize, + buffer: String, +} + +#[derive(Debug, Clone)] +pub struct LineSpan { + /// The line number. + pub line_number: usize, + /// The contents of the line. + pub content: String, + /// Character offset from start of file. + pub start_char: usize, + /// End character offset (exclusive). + pub end_char: usize, +} + +impl LinesWithSpans { + pub const fn new(reader: R) -> Self { + Self { + reader, + line_number: 0, + total_chars: 0, + buffer: String::new(), + } + } +} + +impl Iterator for LinesWithSpans { + type Item = Result; + + fn next(&mut self) -> Option { + self.buffer.clear(); + + match self.reader.read_line(&mut self.buffer) { + Ok(0) => None, // EOF + Ok(bytes_read) => { + self.line_number += 1; + let start_char = self.total_chars; + self.total_chars += bytes_read; + + // Remove trailing newline for cleaner processing + let content = if self.buffer.ends_with('\n') { + self.buffer[..self.buffer.len() - 1].to_string() + } else { + self.buffer.clone() + }; + + Some(Ok(LineSpan { + line_number: self.line_number, + content, + start_char, + end_char: self.total_chars, + })) + } + Err(e) => Some(Err(e.into())), + } + } +} + +/// Helper function to create lines iterator from data. +#[must_use] +pub fn lines_with_spans(data: &[u8]) -> LinesWithSpans>> { + let cursor = Cursor::new(data); + let reader = BufReader::new(cursor); + LinesWithSpans::new(reader) +} diff --git a/assembler/src/source/source_info.rs b/assembler/src/source/source_info.rs index ff43b4a..54f6955 100644 --- a/assembler/src/source/source_info.rs +++ b/assembler/src/source/source_info.rs @@ -4,22 +4,98 @@ //! This will likely be attached to a [`Token`] which will in turn be attached to an AST //! [`Node`]. -use std::fmt::Display; +use std::{ + fmt::{Display, Write}, + fs::File, + io::BufReader, + sync::Arc, +}; -use crate::model::module::Module; +use crate::{ + error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind}, + model::module::Module, + source::lines::LinesWithSpans, +}; /// Information on where the token is within the source. -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct SourceInfo { /// The line number within the source file underpinned by `module_id`. pub line_no: usize, - pub module: Module, + pub module: Arc, /// The indexes where this token may be found (line-local). pub span: std::ops::Range, } impl Display for SourceInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.module.name) + write!( + f, + "{}:{}, column {}", + self.module.path.display(), + self.line_no, + self.span.start + ) + } +} + +impl SourceInfo { + #[must_use] + pub const fn new( + line_no: usize, + module: Arc, + span: std::ops::Range, + ) -> Self { + Self { + line_no, + module, + span, + } + } + + /// Prints out where in the source code the error originated with an underline similar + /// to what rustc does. + pub fn print_context_with_underline(&self) -> Result<(), AssembleError> { + let f = File::open(&self.module.path)?; + let rdr = BufReader::new(f); + + let mut lines = LinesWithSpans::new(rdr); + + let Some(line_result) = lines.nth(self.line_no - 1) else { + // Handle a line not existing. + return Err(AssembleError::new_source_error( + self.clone(), + AssembleErrorKind::Io(IoError::new( + IoErrorKind::Other, + Some(format!( + "the line {} does not exist in input file `{}` but source info suggested otherwise!.", + self.line_no, + self.module.path.display() + )), + )), + )); + }; + + let line_span = line_result?; + + // Print the line number and line content. + println!("{:>4} | {}", self.line_no, line_span.content); + + let mut underline = String::new(); + write!(underline, "{:>4} | ", "")?; + + for _ in 0..self.span.start { + underline.push(' '); + } + + for _ in self.span.start..self.span.end.min(line_span.content.len()) { + underline.push('^'); + } + + // Print the underline in red and bold. + // TODO: Use a crate to make this extra portable. + println!("\x1b[1;31m{underline}\x1b[0m"); + + Ok(()) } } diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs index d7c0c6b..1f3c588 100644 --- a/assembler/src/source/tokeniser.rs +++ b/assembler/src/source/tokeniser.rs @@ -3,9 +3,15 @@ use std::path::{Path, PathBuf}; +use regex::Regex; + use crate::{ - error::{AssembleError, AssembleErrorKind}, - source::{load_source_bytes, token::Token}, + context::AssemblerContext, + error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind}, + model::module::Module, + source::{ + lines::lines_with_spans, load_source_bytes, token::Token, + }, }; pub mod error; @@ -16,12 +22,41 @@ pub struct Tokeniser { pub data: Vec, /// The path to the file. pub path: PathBuf, + + // Pre-compiled regex patterns + label_regex: Regex, + register_regex: Regex, + immediate_regex: Regex, + directive_regex: Regex, + instruction_regex: Regex, + symbol_regex: Regex, + string_regex: Regex, } impl Tokeniser { #[must_use] - pub const fn from_data(data: Vec, path: PathBuf) -> Self { - Self { data, path } + pub fn from_data(data: Vec, path: PathBuf) -> Self { + Self { + data, + path, + + label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):") + .expect("Failed to compile label regex pattern"), + register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)") + .expect("Failed to compile register regex pattern"), + immediate_regex: Regex::new(r"^(0x[0-9a-fA-F]+|[0-9]+)") + .expect("Failed to compile immediate regex pattern"), + directive_regex: Regex::new(r"^\.([a-zA-Z]+)") + .expect("Failed to compile directive regex pattern"), + instruction_regex: Regex::new( + r"^(add|sub|mul|div|jmp|call|ret|lli|nop|halt)", + ) + .expect("Failed to compile instruction regex pattern"), + symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)") + .expect("Failed to compile symbol regex pattern"), + string_regex: Regex::new(r#"^"([^"]*)"#) + .expect("Failed to compile string regex pattern"), + } } /// Creates a [`Tokeniser`] from a file path. @@ -29,12 +64,42 @@ impl Tokeniser { let path = path.as_ref().to_path_buf(); let data = load_source_bytes(&path)?; - Ok(Self { data, path }) + Ok(Self::from_data(data, path)) } - pub fn tokenise(self) -> Result, AssembleError> { + // Note that modules are tokenised in their own threads, possibly in parallel. + pub fn tokenise(self, ctx: &AssemblerContext) -> Result, AssembleError> { + let lines = lines_with_spans(&self.data); + + let Some(module_name) = self.path.file_name().and_then(|f| f.to_str()) else { + return Err(AssembleError::new_other_error(AssembleErrorKind::Io( + IoError::new( + IoErrorKind::InvalidData, + Some( + "filename couldn't be extracted, is it valid UTF-8?".to_string(), + ), + ), + ))); + }; + + // Create a module for the source file being processed. + let module = Module::new(module_name.to_string(), &self.path); + + { + let mut module_registry = ctx.module_registry.write()?; + module_registry.add(module); + } + + // Technically ignores newlines since line will be trimmed. We just append a + // Newline token for each line. + for line_result in lines { + let line = line_result?; + + eprintln!("{}", line.line_number); + } + Err(AssembleError::new_other_error( - AssembleErrorKind::Unimplemented("tokeniser not written yet!".to_string()), + AssembleErrorKind::Unimplemented("tokeniser not written yet!"), )) } } diff --git a/assembler/src/source/tokeniser/error.rs b/assembler/src/source/tokeniser/error.rs index 0af204a..84127ed 100644 --- a/assembler/src/source/tokeniser/error.rs +++ b/assembler/src/source/tokeniser/error.rs @@ -1,6 +1,6 @@ //! This module contains the error types for the tokeniser. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub enum TokeniserError {} impl std::fmt::Display for TokeniserError { diff --git a/assembler/src/symtab.rs b/assembler/src/symtab.rs index 3c64309..2a6977b 100644 --- a/assembler/src/symtab.rs +++ b/assembler/src/symtab.rs @@ -46,12 +46,11 @@ impl SymbolTable { && let Some(existing) = self.symbols.get(&existing_id) && existing.module_id == module_id { - return Err(AssembleError::new_other_error( - crate::error::AssembleErrorKind::IO(std::io::Error::new( - std::io::ErrorKind::AlreadyExists, - format!("Symbol '{name}' already defined in module"), - )), - )); + return Err(std::io::Error::new( + std::io::ErrorKind::AlreadyExists, + format!("Symbol '{name}' already defined in module"), + ) + .into()); } // Add to all mappings @@ -63,19 +62,22 @@ impl SymbolTable { } /// Gets the [`Symbol`] by its [`SymbolId`]. - #[must_use] pub fn get(&self, id: &SymbolId) -> Option<&Symbol> { + #[must_use] + pub fn get(&self, id: &SymbolId) -> Option<&Symbol> { self.symbols.get(id) } /// Gets the [`Symbol`] by its name. - #[must_use] pub fn get_by_name(&self, name: &str) -> Option<&Symbol> { + #[must_use] + pub fn get_by_name(&self, name: &str) -> Option<&Symbol> { self.name_to_id .get(name) .and_then(|id| self.symbols.get(id)) } /// Gets all [`Symbol`]s in a module. - #[must_use] pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> { + #[must_use] + pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> { self.module_symbols .get(module_id) .map(|ids| ids.iter().filter_map(|id| self.symbols.get(id)).collect()) @@ -83,7 +85,8 @@ impl SymbolTable { } /// Gets all the public symbols. - #[must_use] pub fn get_public_symbols(&self) -> Vec<&Symbol> { + #[must_use] + pub fn get_public_symbols(&self) -> Vec<&Symbol> { self.symbols .values() .filter(|sym| matches!(sym.visibility, Visibility::Public)) @@ -104,12 +107,10 @@ impl SymbolTable { } Ok(()) } else { - Err(AssembleError::new_other_error( - crate::error::AssembleErrorKind::IO(std::io::Error::new( - std::io::ErrorKind::NotFound, - "Symbol not found", - )), - )) + Err( + std::io::Error::new(std::io::ErrorKind::NotFound, "Symbol not found") + .into(), + ) } } }