diff --git a/assembler/src/context.rs b/assembler/src/context.rs new file mode 100644 index 0000000..dce4911 --- /dev/null +++ b/assembler/src/context.rs @@ -0,0 +1,21 @@ +//! This module contains the global assembler context to be passed to functions that need +//! it. + +use std::sync::RwLock; + +use crate::{model::module_registry::ModuleRegistry, symtab::SymbolTable}; + +/// Global state to be passed around. +pub struct AssemblerContext { + pub symbol_table: RwLock, + pub module_registry: RwLock, +} + +impl AssemblerContext { + pub fn new() -> Self { + Self { + symbol_table: RwLock::new(SymbolTable::new()), + module_registry: RwLock::new(ModuleRegistry::new()), + } + } +} diff --git a/assembler/src/error.rs b/assembler/src/error.rs index a856590..483b68d 100644 --- a/assembler/src/error.rs +++ b/assembler/src/error.rs @@ -3,7 +3,7 @@ use std::fmt::{Debug, Display}; -use crate::source::source_info::SourceInfo; +use crate::source::{source_info::SourceInfo, tokeniser::error::TokeniserError}; /// An error that may occur during the assembly of a set of source files. #[derive(Debug)] @@ -13,26 +13,38 @@ pub struct AssembleError { source_info: Option, /// The type of assembly error that occurred. kind: AssembleErrorKind, - /// The formatter to handle printing the error. 
- formatter: Box, } impl AssembleError { - pub fn new_source_error(source_info: SourceInfo, kind: AssembleErrorKind) -> Self { + #[must_use] + pub const fn new_source_error( + source_info: SourceInfo, + kind: AssembleErrorKind, + ) -> Self { Self { source_info: Some(source_info), kind, - formatter, } } - pub fn new_other_error(kind: AssembleErrorKind) {} + #[must_use] + pub const fn new_other_error(kind: AssembleErrorKind) -> Self { + Self { + source_info: None, + kind, + } + } } impl Display for AssembleError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.formatter - .write(f, self.source_info.as_ref(), &self.kind) + if let Some(info) = &self.source_info { + write!(f, "at {info}")?; + } + + write!(f, "{}", self.kind)?; + + Ok(()) } } @@ -42,7 +54,29 @@ impl std::error::Error for AssembleError {} /// Different types of errors that may occur when assembling a set of input source files. #[non_exhaustive] #[derive(Debug)] -pub enum AssembleErrorKind {} +pub enum AssembleErrorKind { + /// Usually unexpected I/O errors. Not normally recoverable. + IO(std::io::Error), + /// Errors emitted from the [`Tokeniser`]. + Tokenise(TokeniserError), +} + +impl Display for AssembleErrorKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Tokenise(why) => write!(f, "tokeniser error: {why}"), + _ => write!( + f, + "unhandled error type in Display implementation! See error.rs!" 
+ ), + } + } +} + +impl From for AssembleErrorKind { + fn from(err: std::io::Error) -> Self { + Self::IO(err) + } +} pub mod conversions; -pub mod formatters; diff --git a/assembler/src/error/conversions.rs b/assembler/src/error/conversions.rs index 2a89c19..8bb4731 100644 --- a/assembler/src/error/conversions.rs +++ b/assembler/src/error/conversions.rs @@ -2,6 +2,6 @@ use crate::error::AssembleError; impl From for AssembleError { fn from(err: std::io::Error) -> Self { - + Self::new_other_error(err.into()) } } diff --git a/assembler/src/lib.rs b/assembler/src/lib.rs index c9433f2..22be5b9 100644 --- a/assembler/src/lib.rs +++ b/assembler/src/lib.rs @@ -15,6 +15,7 @@ pub mod args; pub mod image_builder; // pub mod tooling; +pub mod context; pub mod error; pub mod model; pub mod source; diff --git a/assembler/src/model.rs b/assembler/src/model.rs index 8969da3..75e8bc0 100644 --- a/assembler/src/model.rs +++ b/assembler/src/model.rs @@ -1,3 +1,5 @@ //! This module contains the underlying data models and enums used by the Assembler. +pub mod module; +pub mod module_registry; pub mod symbol; diff --git a/assembler/src/model/module.rs b/assembler/src/model/module.rs new file mode 100644 index 0000000..620d5ce --- /dev/null +++ b/assembler/src/model/module.rs @@ -0,0 +1,69 @@ +//! This module contains the [`Module`] type and associated types. Each compilation unit +//! (file) is represented by a module which is used to namespace "function" calls and +//! accesses to global variables. +//! +//! They have unique identifiers in the form of UUIDs. + +use std::path::{Path, PathBuf}; + +use uuid::Uuid; + +use crate::model::module_registry::ModuleRegistry; + +/// The ID for a module. A tuple struct for type safety. +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +pub struct ModuleId(Uuid); + +impl ModuleId { + pub fn from_module(module: Module) -> Self { + module.id + } + + /// Convenience method to get the [`Module`] from a [`ModuleId`]. 
+ pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Module> { + registry.get(self) + } + + /// Convenience method to get the [`Module`] name from a [`ModuleId`]. + pub fn to_module_name<'m>(self, registry: &'m ModuleRegistry) -> Option<&'m str> { + if let Some(module) = self.to_module(&registry) { + Some(module.name.as_str()) + } else { + None + } + } +} + +impl std::fmt::Display for ModuleId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +/// A single source file or compilation unit. Symbols are stored in the global symbol table. +#[derive(Debug)] +pub struct Module { + /// The name of the module. This is typically the name of the file, less the `.dsa` + /// extension. + pub name: String, + /// The file path to the module. This is an absolute path. + pub path: PathBuf, + /// A unique ID for this module. + pub id: ModuleId, +} + +impl std::hash::Hash for Module { + fn hash(&self, state: &mut H) { + self.id.0.hash(state); + } +} + +impl Module { + pub fn new>(name: String, path: P) -> Self { + Self { + name, + path: path.as_ref().to_path_buf(), + id: ModuleId(Uuid::new_v4()), + } + } +} diff --git a/assembler/src/model/module_registry.rs b/assembler/src/model/module_registry.rs new file mode 100644 index 0000000..0cd4054 --- /dev/null +++ b/assembler/src/model/module_registry.rs @@ -0,0 +1,36 @@ +//! This module contains the code for the module registry. This is a singleton storing all +//! the modules being assembled. + +use std::collections::HashMap; + +use super::module::{Module, ModuleId}; + +/// Stores all the [`Module`]'s to be assembled. +pub struct ModuleRegistry { + modules: HashMap, +} + +impl ModuleRegistry { + pub fn new() -> Self { + Self { + modules: HashMap::new(), + } + } + + /// Gets a [`Module`] by ID. + pub fn get(&self, module_id: &ModuleId) -> Option<&Module> { + self.modules.get(module_id) + } + + /// Adds a [`Module`] and returns its [`ModuleId`]. 
+ pub fn add(&mut self, module: Module) -> ModuleId { + let id = module.id; + self.modules.insert(id, module); + id + } + + /// Returns an iterator of modules. + pub fn modules(&self) -> impl Iterator { + self.modules.values() + } +} diff --git a/assembler/src/model/symbol.rs b/assembler/src/model/symbol.rs index 24fffc7..ea9b5cb 100644 --- a/assembler/src/model/symbol.rs +++ b/assembler/src/model/symbol.rs @@ -1,17 +1,58 @@ //! This module contains the definitions for a Symbol. +use std::collections::HashSet; + use uuid::Uuid; +use crate::{model::module::ModuleId, symtab::SymbolTable}; + +/// Tuple struct for type safety. Has methods for fetching symbols by ID. +#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)] +pub struct SymbolId(Uuid); + +impl From for SymbolId { + fn from(sym: Symbol) -> Self { + sym.id + } +} + +impl SymbolId { + pub fn new() -> Self { + Self(Uuid::new_v4()) + } + + /// Convenience method to get the [`Symbol`] from a [`SymbolId`]. + pub fn to_module<'s>(&self, registry: &'s SymbolTable) -> Option<&'s Symbol> { + registry.get(self) + } + + /// Convenience method to get the [`Symbol`] name from a [`SymbolId`]. + pub fn to_module_name<'m>(self, registry: &'m SymbolTable) -> Option<&'m str> { + if let Some(module) = self.to_module(&registry) { + Some(module.name.as_str()) + } else { + None + } + } +} + /// A symbol is a named reference that may be resolved later to an address by a linker. +#[derive(Debug)] pub struct Symbol { /// Stored cheaply instead of the name. Shall be stored in the symbol table under /// this key. - pub id: Uuid, + pub id: SymbolId, + + /// The human-readable name for the symbol. + pub name: String, + pub visibility: Visibility, + pub symbol_type: SymbolType, - /// The id of the module the symbol is defined in. - module_id: Uuid, + /// The id of the module the symbol is defined in. This will be different for symbols + /// in different objects. + pub module_id: ModuleId, /// Whether or not the symbol requires relocating. 
pub needs_relocation: bool, @@ -30,21 +71,68 @@ pub struct Symbol { /// ``` /// /// Where `main` depends on `another_func`. - pub dependencies: Vec, + pub dependencies: HashSet, + + /// The address of the symbol. + pub address: Option, + /// The section the symbol is in. + /// TODO: Perhaps make this a proper type? + pub section: Option, + pub size: Option, } impl Symbol { pub fn new( - id: Uuid, + name: String, + module_id: ModuleId, visibility: Visibility, symbol_type: SymbolType, - module_id: Uuid, ) -> Self { Self { - id, + id: SymbolId::new(), + name, + module_id, + address: None, + section: None, + size: None, visibility, symbol_type, - module_id, + needs_relocation: false, + dependencies: HashSet::new(), + } + } + + /// Adds a dependency on another [`Symbol`]. + pub fn add_dependency(&mut self, dep: SymbolId) { + if self.id == dep { + return; + } + + // We can resolve a lot of addresses at assembly time, but not really foreign + // ones, since we aren't certain of their position. + // + /* TODO: Handle this for flat binary case i.e. no linker required. This may be + * done using a similar method to before, such as just concatenating all + * of the files together and handling jumps and halts. + * + * > Ask Harry or read the initial code. + */ + if self.dependencies.insert(dep) { + self.needs_relocation = true; + } + } + + /// Returns whether a [`Symbol`] depends on `symbol_id`. + pub fn depends_on(&self, symbol_id: &SymbolId) -> bool { + self.dependencies.contains(symbol_id) + } + + /// Removes a [`Symbol`] from the dependency set. 
+ pub fn remove_dependency(&mut self, symbol_id: &SymbolId) { + self.dependencies.remove(symbol_id); + + if self.dependencies.is_empty() { + self.needs_relocation = false; } } } @@ -63,7 +151,8 @@ pub enum Visibility { Weak, } +#[derive(Debug)] pub enum SymbolType { - Function, - Label, + LabelOrFunction, + Variable, } diff --git a/assembler/src/source.rs b/assembler/src/source.rs index 5190d73..1146508 100644 --- a/assembler/src/source.rs +++ b/assembler/src/source.rs @@ -14,7 +14,5 @@ pub mod tokeniser; pub fn load_source_bytes>(p: P) -> Result, AssembleError> { let path = p.as_ref(); - let bytes = std::fs::read(path)?; - - Ok(vec![]) + Ok(std::fs::read(path)?) } diff --git a/assembler/src/source/source_info.rs b/assembler/src/source/source_info.rs index 5347a29..ff43b4a 100644 --- a/assembler/src/source/source_info.rs +++ b/assembler/src/source/source_info.rs @@ -1,17 +1,25 @@ //! This file contains information on where a [`Token`] or [`Node`] is within the source -//! code for more informative errors. This will likely be attached to a [`Token`] which -//! will in turn be attached to an AST [`Node`]. +//! code for more informative errors. +//! +//! This will likely be attached to a [`Token`] which will in turn be attached to an AST +//! [`Node`]. -use uuid::Uuid; +use std::fmt::Display; + +use crate::model::module::Module; /// Information on where the token is within the source. #[derive(Debug)] pub struct SourceInfo { /// The line number within the source file underpinned by `module_id`. pub line_no: usize, - /// The ID of the module containing this token. This will be looked up in the global - /// hashmap of [`Module`]'s. - pub module_id: Uuid, + pub module: Module, /// The indexes where this token may be found (line-local). 
pub span: std::ops::Range, } + +impl Display for SourceInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.module.name) + } +} diff --git a/assembler/src/source/token.rs b/assembler/src/source/token.rs index 094b078..ba3c8d1 100644 --- a/assembler/src/source/token.rs +++ b/assembler/src/source/token.rs @@ -2,16 +2,105 @@ //! easier to build from scratch and edit his code than it would be to try and wrangle it //! into shape. +use crate::source::source_info::SourceInfo; + #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum TokenType { - Symbol(Symbol), - Register(Register), + /// Symbol reference (e.g., `loop_start`, `my_data`). + Symbol(SymbolToken), + /// CPU register (e.g., `r1`, `r2`, `sp`). + Register(RegisterToken), + /// Immediate value (e.g., `42`, `0xFF`). Immediate(u32), - StringLit(String), - Opcode(Opcode), + /// String literal (e.g., `"hello world"`). + String(String), + /// Assembly instruction (e.g., `add`, `jmp`, `nop`). + Instruction(InstructionToken), + /// Label definition (e.g., `loop_start:`). + Label(LabelToken), + /// Assembler directive (e.g., `.global`, `.section`, `.dw`). + Directive(DirectiveToken), + /// End of line. + Newline, + /// End of file. + Eof, } +#[derive(Debug)] pub struct Token { + /// The type of the token. token_type: TokenType, + /// Where in the source code is this [`Token`]? 
source_info: SourceInfo, } + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SymbolToken { + pub name: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct LabelToken { + pub name: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct DirectiveToken { + pub directive: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct RegisterToken { + pub name: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct InstructionToken { + pub mnemonic: String, +} + +impl Token { + #[must_use] + pub const fn new(token_type: TokenType, source_info: SourceInfo) -> Self { + Self { + token_type, + source_info, + } + } + + #[must_use] + pub const fn symbol(name: String, source_info: SourceInfo) -> Self { + Self::new(TokenType::Symbol(SymbolToken { name }), source_info) + } + + #[must_use] + pub const fn label(name: String, source_info: SourceInfo) -> Self { + Self::new(TokenType::Label(LabelToken { name }), source_info) + } + + #[must_use] + pub const fn instruction(mnemonic: String, source_info: SourceInfo) -> Self { + Self::new( + TokenType::Instruction(InstructionToken { mnemonic }), + source_info, + ) + } + + #[must_use] + pub const fn register(name: String, source_info: SourceInfo) -> Self { + Self::new(TokenType::Register(RegisterToken { name }), source_info) + } + + #[must_use] + pub const fn immediate(value: u32, source_info: SourceInfo) -> Self { + Self::new(TokenType::Immediate(value), source_info) + } + + #[must_use] + pub const fn directive(directive: String, source_info: SourceInfo) -> Self { + Self::new( + TokenType::Directive(DirectiveToken { directive }), + source_info, + ) + } +} diff --git a/assembler/src/source/tokeniser.rs b/assembler/src/source/tokeniser.rs index 8bbe489..902fc68 100644 --- a/assembler/src/source/tokeniser.rs +++ b/assembler/src/source/tokeniser.rs @@ -3,3 +3,5 @@ /// Consumes a [`Vec`] and outputs a [`Vec`] of [Token]'s. 
pub struct Tokeniser {} + +pub mod error; diff --git a/assembler/src/source/tokeniser/error.rs b/assembler/src/source/tokeniser/error.rs new file mode 100644 index 0000000..0af204a --- /dev/null +++ b/assembler/src/source/tokeniser/error.rs @@ -0,0 +1,10 @@ +//! This module contains the error types for the tokeniser. + +#[derive(Debug)] +pub enum TokeniserError {} + +impl std::fmt::Display for TokeniserError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "TODO!!!!!!") + } +} diff --git a/assembler/src/symtab.rs b/assembler/src/symtab.rs index 45d913f..7fe473e 100644 --- a/assembler/src/symtab.rs +++ b/assembler/src/symtab.rs @@ -4,7 +4,118 @@ //! It is also required for detection of duplicate symbols, and resolution in the flat //! binary output type. -/// Stored for each compilation unit (called a [`Module`]). -/// -/// One hashmap maps [`Symbol`] ID's to their corresponding structs, and -pub struct SymbolTable +use crate::{ + error::AssembleError, + model::{ + module::ModuleId, + symbol::{Symbol, SymbolId, Visibility}, + }, +}; +use std::collections::HashMap; + +/// Global symbol table - single source of truth for all symbols. +/// Much simpler than per-module tables. 
+#[derive(Debug)] +pub struct SymbolTable { + /// All symbols by their ID - O(1) lookup + symbols: HashMap, + /// Name to ID mapping for human-readable lookups - O(1) lookup + name_to_id: HashMap, + /// Module to symbols mapping for module-specific queries + module_symbols: HashMap>, +} + +impl SymbolTable { + #[must_use] + pub fn new() -> Self { + Self { + symbols: HashMap::new(), + name_to_id: HashMap::new(), + module_symbols: HashMap::new(), + } + } + + /// Adds a symbol to the global table + pub fn add_symbol(&mut self, symbol: Symbol) -> Result { + let id = symbol.id; + let module_id = symbol.module_id; + let name = symbol.name.clone(); + + // Check for duplicate names in the same module + if let Some(&existing_id) = self.name_to_id.get(&name) + && let Some(existing) = self.symbols.get(&existing_id) + && existing.module_id == module_id + { + return Err(AssembleError::new_other_error( + crate::error::AssembleErrorKind::IO(std::io::Error::new( + std::io::ErrorKind::AlreadyExists, + format!("Symbol '{name}' already defined in module"), + )), + )); + } + + // Add to all mappings + self.name_to_id.insert(name, id); + self.symbols.insert(id, symbol); + self.module_symbols.entry(module_id).or_default().push(id); + + Ok(id) + } + + /// Gets the [`Symbol`] by its [`SymbolId`]. + pub fn get(&self, id: &SymbolId) -> Option<&Symbol> { + self.symbols.get(id) + } + + /// Gets the [`Symbol`] by its name. + pub fn get_by_name(&self, name: &str) -> Option<&Symbol> { + self.name_to_id + .get(name) + .and_then(|id| self.symbols.get(id)) + } + + /// Gets all [`Symbol`]s in a module. + pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> { + self.module_symbols + .get(module_id) + .map(|ids| ids.iter().filter_map(|id| self.symbols.get(id)).collect()) + .unwrap_or_default() + } + + /// Gets all the public symbols. 
+ pub fn get_public_symbols(&self) -> Vec<&Symbol> { + self.symbols + .values() + .filter(|sym| matches!(sym.visibility, Visibility::Public)) + .collect() + } + + /// Updates symbol address (during resolution). Used for flat binaries or symbols with + /// no relocations. + pub fn update_symbol_address( + &mut self, + id: &SymbolId, + address: u32, + ) -> Result<(), AssembleError> { + if let Some(symbol) = self.symbols.get_mut(id) { + symbol.address = Some(address); + if symbol.dependencies.is_empty() { + symbol.needs_relocation = false; + } + Ok(()) + } else { + Err(AssembleError::new_other_error( + crate::error::AssembleErrorKind::IO(std::io::Error::new( + std::io::ErrorKind::NotFound, + "Symbol not found", + )), + )) + } + } +} + +impl Default for SymbolTable { + fn default() -> Self { + Self::new() + } +}