assembler: we failing DSA with this one

This commit is contained in:
2025-06-25 14:31:53 +01:00
parent 9232f2ccab
commit 20a7d42adb
14 changed files with 508 additions and 38 deletions
+21
View File
@@ -0,0 +1,21 @@
//! This module contains the global assembler context to be passed to functions that need
//! it.
use std::sync::RwLock;
use crate::{model::module_registry::ModuleRegistry, symtab::SymbolTable};
/// Global state to be passed around.
///
/// Bundles the symbol table and the module registry, each behind its own [`RwLock`].
pub struct AssemblerContext {
/// The [`SymbolTable`], guarded by a [`RwLock`].
pub symbol_table: RwLock<SymbolTable>,
/// The [`ModuleRegistry`], guarded by a [`RwLock`].
pub module_registry: RwLock<ModuleRegistry>,
}
impl AssemblerContext {
    /// Creates a context holding an empty symbol table and an empty module registry.
    #[must_use]
    pub fn new() -> Self {
        Self {
            symbol_table: RwLock::new(SymbolTable::new()),
            module_registry: RwLock::new(ModuleRegistry::new()),
        }
    }
}

impl Default for AssemblerContext {
    /// Equivalent to [`AssemblerContext::new`].
    fn default() -> Self {
        Self::new()
    }
}
+44 -10
View File
@@ -3,7 +3,7 @@
use std::fmt::{Debug, Display};
use crate::source::source_info::SourceInfo;
use crate::source::{source_info::SourceInfo, tokeniser::error::TokeniserError};
/// An error that may occur during the assembly of a set of source files.
#[derive(Debug)]
@@ -13,26 +13,38 @@ pub struct AssembleError {
source_info: Option<SourceInfo>,
/// The type of assembly error that occurred.
kind: AssembleErrorKind,
/// The formatter to handle printing the error.
formatter: Box<dyn ErrorFormatter>,
}
impl AssembleError {
pub fn new_source_error(source_info: SourceInfo, kind: AssembleErrorKind) -> Self {
#[must_use]
pub const fn new_source_error(
source_info: SourceInfo,
kind: AssembleErrorKind,
) -> Self {
Self {
source_info: Some(source_info),
kind,
formatter,
}
}
pub fn new_other_error(kind: AssembleErrorKind) {}
#[must_use]
pub const fn new_other_error(kind: AssembleErrorKind) -> Self {
Self {
source_info: None,
kind,
}
}
}
impl Display for AssembleError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.formatter
.write(f, self.source_info.as_ref(), &self.kind)
if let Some(info) = &self.source_info {
write!(f, "at {info}")?;
}
write!(f, "{}", self.kind)?;
Ok(())
}
}
@@ -42,7 +54,29 @@ impl std::error::Error for AssembleError {}
/// Different types of errors that may occur when assembling a set of input source files.
#[non_exhaustive]
#[derive(Debug)]
pub enum AssembleErrorKind {}
pub enum AssembleErrorKind {
/// Usually unexpected I/O errors. Not normally recoverable.
IO(std::io::Error),
/// Errors emitted from the [`Tokeniser`].
Tokenise(TokeniserError),
}
impl Display for AssembleErrorKind {
    /// Writes a short category prefix followed by the wrapped error's own message.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Deliberately no `_` arm: a newly added variant must fail to compile here rather
        // than fall back to a placeholder message at runtime.
        match self {
            Self::IO(why) => write!(f, "I/O error: {why}"),
            Self::Tokenise(why) => write!(f, "tokeniser error: {why}"),
        }
    }
}
impl From<std::io::Error> for AssembleErrorKind {
/// Wraps an I/O error in the [`AssembleErrorKind::IO`] variant.
fn from(err: std::io::Error) -> Self {
Self::IO(err)
}
}
pub mod conversions;
pub mod formatters;
+1 -1
View File
@@ -2,6 +2,6 @@ use crate::error::AssembleError;
impl From<std::io::Error> for AssembleError {
/// Converts an I/O error into an [`AssembleError`] through
/// [`AssembleError::new_other_error`], which lets `?` be used on I/O results in
/// functions returning `Result<_, AssembleError>`.
fn from(err: std::io::Error) -> Self {
// `err.into()` turns the I/O error into the error kind expected by the constructor.
Self::new_other_error(err.into())
}
}
+1
View File
@@ -15,6 +15,7 @@
pub mod args;
pub mod image_builder;
// pub mod tooling;
pub mod context;
pub mod error;
pub mod model;
pub mod source;
+2
View File
@@ -1,3 +1,5 @@
//! This module contains the underlying data models and enums used by the Assembler.
pub mod module;
pub mod module_registry;
pub mod symbol;
+69
View File
@@ -0,0 +1,69 @@
//! This module contains the [`Module`] type and associated types. Each compilation unit
//! (file) is represented by a module which is used to namespace "function" calls and
//! accesses to global variables.
//!
//! They have unique identifiers in the form of UUIDs.
use std::path::{Path, PathBuf};
use uuid::Uuid;
use crate::model::module_registry::ModuleRegistry;
/// The ID for a module. A tuple struct for type safety.
///
/// Wraps a [`Uuid`]; the inner value is not public.
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
pub struct ModuleId(Uuid);
impl ModuleId {
    /// Returns the [`ModuleId`] of `module`, consuming the module in the process.
    pub fn from_module(module: Module) -> Self {
        module.id
    }

    /// Convenience method to get the [`Module`] from a [`ModuleId`].
    ///
    /// Returns `None` when `registry` holds no module under this ID.
    pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Module> {
        registry.get(self)
    }

    /// Convenience method to get the [`Module`] name from a [`ModuleId`].
    ///
    /// Returns `None` when `registry` holds no module under this ID.
    pub fn to_module_name<'m>(self, registry: &'m ModuleRegistry) -> Option<&'m str> {
        // `registry` is already a reference; pass it through as-is.
        self.to_module(registry).map(|module| module.name.as_str())
    }
}
impl std::fmt::Display for ModuleId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.0)
}
}
/// A single source file or compilation unit.
///
/// Holds the module's name, its path and its unique ID.
#[derive(Debug)]
pub struct Module {
/// The name of the module. This is typically the name of the file, less the `.dsa`
/// extension.
pub name: String,
/// The file path to the module. This is expected to be an absolute path.
///
/// NOTE(review): [`Module::new`] stores the path it is given without checking or
/// canonicalising it — confirm callers pass an absolute path.
pub path: PathBuf,
/// A unique ID for this module.
pub id: ModuleId,
}
impl std::hash::Hash for Module {
    /// Hashes only the module's ID; `name` and `path` do not contribute.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        // Call through the trait path: the `Hash` trait is not part of the prelude, so
        // the method-call form `self.id.0.hash(state)` only resolves when the trait has
        // been imported, which this file does not do.
        std::hash::Hash::hash(&self.id.0, state);
    }
}
impl Module {
pub fn new<P: AsRef<Path>>(name: String, path: P) -> Self {
Self {
name,
path: path.as_ref().to_path_buf(),
id: ModuleId(Uuid::new_v4()),
}
}
}
+36
View File
@@ -0,0 +1,36 @@
//! This module contains the code for the module registry. This is a singleton storing all
//! the modules being assembled.
use std::collections::HashMap;
use super::module::{Module, ModuleId};
/// Stores all the [`Module`]s to be assembled, keyed by their [`ModuleId`].
#[derive(Debug, Default)]
pub struct ModuleRegistry {
    /// Every registered module, indexed by its ID.
    modules: HashMap<ModuleId, Module>,
}
impl ModuleRegistry {
    /// Creates an empty registry.
    #[must_use]
    pub fn new() -> Self {
        Self {
            modules: HashMap::new(),
        }
    }

    /// Gets a [`Module`] by ID, or `None` if nothing is registered under `module_id`.
    #[must_use]
    pub fn get(&self, module_id: &ModuleId) -> Option<&Module> {
        self.modules.get(module_id)
    }

    /// Adds a [`Module`] and returns its [`ModuleId`].
    ///
    /// A module already stored under the same ID is replaced.
    pub fn add(&mut self, module: Module) -> ModuleId {
        let id = module.id;
        self.modules.insert(id, module);
        id
    }

    /// Returns an iterator over all registered modules, in no particular order.
    pub fn modules(&self) -> impl Iterator<Item = &Module> {
        self.modules.values()
    }
}
+99 -10
View File
@@ -1,17 +1,58 @@
//! This module contains the definitions for a Symbol.
use std::collections::HashSet;
use uuid::Uuid;
use crate::{model::module::ModuleId, symtab::SymbolTable};
/// Tuple struct for type safety. Has methods for fetching symbols by ID.
#[derive(Debug, PartialEq, Eq, Hash, Copy, Clone)]
pub struct SymbolId(Uuid);
impl From<Symbol> for SymbolId {
fn from(sym: Symbol) -> Self {
sym.id
}
}
impl SymbolId {
    /// Generates a fresh ID backed by a version-4 UUID.
    #[must_use]
    pub fn new() -> Self {
        Self(Uuid::new_v4())
    }

    /// Convenience method to get the [`Symbol`] from a [`SymbolId`].
    ///
    /// Despite its name, this looks the ID up in a [`SymbolTable`] and yields a
    /// [`Symbol`], not a module. Returns `None` when the table has no symbol under this
    /// ID.
    pub fn to_module<'s>(&self, registry: &'s SymbolTable) -> Option<&'s Symbol> {
        registry.get(self)
    }

    /// Convenience method to get the [`Symbol`] name from a [`SymbolId`].
    ///
    /// Despite its name, the returned string is the symbol's name, not a module's.
    /// Returns `None` when the table has no symbol under this ID.
    pub fn to_module_name<'m>(self, registry: &'m SymbolTable) -> Option<&'m str> {
        // `registry` is already a reference; pass it through as-is.
        self.to_module(registry).map(|symbol| symbol.name.as_str())
    }
}
/// A symbol is a named reference that may be resolved later to an address by a linker.
#[derive(Debug)]
pub struct Symbol {
/// Stored cheaply instead of the name. Shall be stored in the symbol table under
/// this key.
pub id: Uuid,
pub id: SymbolId,
/// The human-readable name for the symbol.
pub name: String,
pub visibility: Visibility,
pub symbol_type: SymbolType,
/// The id of the module the symbol is defined in.
module_id: Uuid,
/// The id of the module the symbol is defined in. This will be different for symbols
/// in different objects.
pub module_id: ModuleId,
/// Whether or not the symbol requires relocating.
pub needs_relocation: bool,
@@ -30,21 +71,68 @@ pub struct Symbol {
/// ```
///
/// Where `main` depends on `another_func`.
pub dependencies: Vec<Uuid>,
pub dependencies: HashSet<SymbolId>,
/// The address of the symbol.
pub address: Option<u32>,
/// The section the symbol is in.
/// TODO: Perhaps make this a proper type?
pub section: Option<String>,
pub size: Option<u32>,
}
impl Symbol {
pub fn new(
id: Uuid,
name: String,
module_id: ModuleId,
visibility: Visibility,
symbol_type: SymbolType,
module_id: Uuid,
) -> Self {
Self {
id,
id: SymbolId::new(),
name,
module_id,
address: None,
section: None,
size: None,
visibility,
symbol_type,
module_id,
needs_relocation: false,
dependencies: HashSet::new(),
}
}
/// Adds a dependency on another [`Symbol`].
///
/// Passing the symbol's own ID is a no-op. Recording a dependency that was not already
/// present marks the symbol as needing relocation.
pub fn add_dependency(&mut self, dep: SymbolId) {
    let refers_to_self = dep == self.id;

    // We can resolve a lot of addresses at assembly time, but not really foreign
    // ones, since we aren't certain of their position.
    //
    /* TODO: Handle this for flat binary case i.e. no linker required. This may be
     *       done using a similar method to before, such as just concatenating all
     *       of the files together and handling jumps and halts.
     *
     *       > Ask Harry or read the initial code.
     */
    // Short-circuiting keeps a self-reference out of the set entirely.
    let newly_recorded = !refers_to_self && self.dependencies.insert(dep);
    if newly_recorded {
        self.needs_relocation = true;
    }
}
/// Reports whether `symbol_id` is currently in this [`Symbol`]'s dependency set.
pub fn depends_on(&self, symbol_id: &SymbolId) -> bool {
    self.dependencies.get(symbol_id).is_some()
}
/// Drops `symbol_id` from the dependency set.
///
/// Whenever the set is empty afterwards, the relocation flag is cleared.
pub fn remove_dependency(&mut self, symbol_id: &SymbolId) {
    let _was_present = self.dependencies.remove(symbol_id);
    // `&= true` leaves the flag untouched; `&= false` clears it.
    self.needs_relocation &= !self.dependencies.is_empty();
}
}
@@ -63,7 +151,8 @@ pub enum Visibility {
Weak,
}
#[derive(Debug)]
pub enum SymbolType {
Function,
Label,
LabelOrFunction,
Variable,
}
+1 -3
View File
@@ -14,7 +14,5 @@ pub mod tokeniser;
pub fn load_source_bytes<P: AsRef<Path>>(p: P) -> Result<Vec<u8>, AssembleError> {
let path = p.as_ref();
let bytes = std::fs::read(path)?;
Ok(vec![])
Ok(std::fs::read(path)?)
}
+14 -6
View File
@@ -1,17 +1,25 @@
//! This file contains information on where a [`Token`] or [`Node`] is within the source
//! code for more informative errors. This will likely be attached to a [`Token`] which
//! will in turn be attached to an AST [`Node`].
//! code for more informative errors.
//!
//! This will likely be attached to a [`Token`] which will in turn be attached to an AST
//! [`Node`].
use uuid::Uuid;
use std::fmt::Display;
use crate::model::module::Module;
/// Information on where the token is within the source.
#[derive(Debug)]
pub struct SourceInfo {
/// The line number within the source file underpinned by `module_id`.
pub line_no: usize,
/// The ID of the module containing this token. This will be looked up in the global
/// hashmap of [`Module`]'s.
pub module_id: Uuid,
pub module: Module,
/// The indexes where this token may be found (line-local).
pub span: std::ops::Range<usize>,
}
impl Display for SourceInfo {
    /// Writes only the name of the module this location belongs to.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.module.name)
    }
}
+93 -4
View File
@@ -2,16 +2,105 @@
//! easier to build from scratch and edit his code than it would be to try and wrangle it
//! into shape.
use crate::source::source_info::SourceInfo;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType {
Symbol(Symbol),
Register(Register),
/// Symbol reference (e.g., `loop_start`, `my_data`).
Symbol(SymbolToken),
/// CPU register (e.g., `r1`, `r2`, `sp`).
Register(RegisterToken),
/// Immediate value (e.g., `42`, `0xFF`).
Immediate(u32),
StringLit(String),
Opcode(Opcode),
/// String literal (e.g., `"hello world"`).
String(String),
/// Assembly instruction (e.g., `add`, `jmp`, `nop`).
Instruction(InstructionToken),
/// Label definition (e.g., `loop_start:`).
Label(LabelToken),
/// Assembler directive (e.g., `.global`, `.section`, `.dw`).
Directive(DirectiveToken),
/// End of line.
Newline,
/// End of file.
Eof,
}
#[derive(Debug)]
pub struct Token {
/// The type of the token.
token_type: TokenType,
/// Where in the source code is this [`Token`]?
source_info: SourceInfo,
}
/// Payload of [`TokenType::Symbol`]: a reference to a symbol by name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SymbolToken {
/// The referenced symbol's name (e.g. `loop_start`).
pub name: String,
}
/// Payload of [`TokenType::Label`]: a label definition.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LabelToken {
/// The label's name.
///
/// NOTE(review): presumably stored without the trailing `:` — confirm against the
/// tokeniser once it exists.
pub name: String,
}
/// Payload of [`TokenType::Directive`]: an assembler directive.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DirectiveToken {
/// The directive text (e.g. `.global`).
///
/// NOTE(review): whether the leading `.` is kept is not visible here — confirm
/// against the tokeniser once it exists.
pub directive: String,
}
/// Payload of [`TokenType::Register`]: a CPU register operand.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct RegisterToken {
/// The register's name (e.g. `r1`, `sp`).
pub name: String,
}
/// Payload of [`TokenType::Instruction`]: an assembly instruction.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct InstructionToken {
/// The instruction mnemonic (e.g. `add`, `jmp`).
pub mnemonic: String,
}
impl Token {
    /// Creates a token of the given type located at `source_info`.
    #[must_use]
    pub const fn new(token_type: TokenType, source_info: SourceInfo) -> Self {
        Self {
            token_type,
            source_info,
        }
    }

    /// Returns the type of this token.
    ///
    /// The field itself is private, so this is how code outside this module inspects it.
    #[must_use]
    pub const fn token_type(&self) -> &TokenType {
        &self.token_type
    }

    /// Returns where in the source this token was found.
    #[must_use]
    pub const fn source_info(&self) -> &SourceInfo {
        &self.source_info
    }

    /// Creates a [`TokenType::Symbol`] token referring to `name`.
    #[must_use]
    pub const fn symbol(name: String, source_info: SourceInfo) -> Self {
        Self::new(TokenType::Symbol(SymbolToken { name }), source_info)
    }

    /// Creates a [`TokenType::Label`] token defining `name`.
    #[must_use]
    pub const fn label(name: String, source_info: SourceInfo) -> Self {
        Self::new(TokenType::Label(LabelToken { name }), source_info)
    }

    /// Creates a [`TokenType::Instruction`] token for `mnemonic`.
    #[must_use]
    pub const fn instruction(mnemonic: String, source_info: SourceInfo) -> Self {
        Self::new(
            TokenType::Instruction(InstructionToken { mnemonic }),
            source_info,
        )
    }

    /// Creates a [`TokenType::Register`] token for the register called `name`.
    #[must_use]
    pub const fn register(name: String, source_info: SourceInfo) -> Self {
        Self::new(TokenType::Register(RegisterToken { name }), source_info)
    }

    /// Creates a [`TokenType::Immediate`] token carrying `value`.
    #[must_use]
    pub const fn immediate(value: u32, source_info: SourceInfo) -> Self {
        Self::new(TokenType::Immediate(value), source_info)
    }

    /// Creates a [`TokenType::String`] token carrying `value`.
    #[must_use]
    pub const fn string(value: String, source_info: SourceInfo) -> Self {
        Self::new(TokenType::String(value), source_info)
    }

    /// Creates a [`TokenType::Directive`] token for `directive`.
    #[must_use]
    pub const fn directive(directive: String, source_info: SourceInfo) -> Self {
        Self::new(
            TokenType::Directive(DirectiveToken { directive }),
            source_info,
        )
    }

    /// Creates a [`TokenType::Newline`] token.
    #[must_use]
    pub const fn newline(source_info: SourceInfo) -> Self {
        Self::new(TokenType::Newline, source_info)
    }

    /// Creates a [`TokenType::Eof`] token.
    #[must_use]
    pub const fn eof(source_info: SourceInfo) -> Self {
        Self::new(TokenType::Eof, source_info)
    }
}
+2
View File
@@ -3,3 +3,5 @@
/// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s.
pub struct Tokeniser {}
pub mod error;
+10
View File
@@ -0,0 +1,10 @@
//! This module contains the error types for the tokeniser.
/// Errors the tokeniser can report.
///
/// No variants are defined yet, so a value of this type cannot currently be constructed.
#[derive(Debug)]
pub enum TokeniserError {}

impl std::fmt::Display for TokeniserError {
    fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // An empty match is exhaustive while the enum has no variants. As soon as a
        // variant is added this stops compiling, which forces a real message to be
        // written here instead of a placeholder shipping unnoticed.
        match *self {}
    }
}
+115 -4
View File
@@ -4,7 +4,118 @@
//! It is also required for detection of duplicate symbols, and resolution in the flat
//! binary output type.
/// Stored for each compilation unit (called a [`Module`]).
///
/// One hashmap maps [`Symbol`] ID's to their corresponding structs, and
pub struct SymbolTable
use crate::{
error::AssembleError,
model::{
module::ModuleId,
symbol::{Symbol, SymbolId, Visibility},
},
};
use std::collections::HashMap;
/// Global symbol table - single source of truth for all symbols.
/// Much simpler than per-module tables.
#[derive(Debug)]
pub struct SymbolTable {
/// All symbols by their ID (hash-map lookup).
symbols: HashMap<SymbolId, Symbol>,
/// Name to ID mapping for human-readable lookups (hash-map lookup).
///
/// NOTE(review): keyed by the bare name only, so adding a symbol whose name is
/// already present (for example from another module) replaces the earlier entry.
name_to_id: HashMap<String, SymbolId>,
/// Module to symbols mapping for module-specific queries. IDs are appended in the
/// order the symbols were added.
module_symbols: HashMap<ModuleId, Vec<SymbolId>>,
}
impl SymbolTable {
    /// Creates an empty symbol table.
    #[must_use]
    pub fn new() -> Self {
        Self {
            symbols: HashMap::new(),
            name_to_id: HashMap::new(),
            module_symbols: HashMap::new(),
        }
    }

    /// Adds a symbol to the global table and returns its [`SymbolId`].
    ///
    /// The name index is global: when several modules define a symbol with the same
    /// name, [`SymbolTable::get_by_name`] resolves to the one added most recently.
    ///
    /// # Errors
    ///
    /// Returns an error when the symbol's module already contains a symbol with the
    /// same name.
    pub fn add_symbol(&mut self, symbol: Symbol) -> Result<SymbolId, AssembleError> {
        let id = symbol.id;
        let module_id = symbol.module_id;

        // Check the module's own symbol list rather than the global name index. That
        // index keeps a single entry per name, so it only remembers whichever module
        // defined the name last and would miss a duplicate once another module has
        // reused the name in between.
        let already_defined = self
            .module_symbols
            .get(&module_id)
            .into_iter()
            .flatten()
            .filter_map(|existing_id| self.symbols.get(existing_id))
            .any(|existing| existing.name == symbol.name);
        if already_defined {
            return Err(AssembleError::new_other_error(
                crate::error::AssembleErrorKind::IO(std::io::Error::new(
                    std::io::ErrorKind::AlreadyExists,
                    format!("Symbol '{}' already defined in module", symbol.name),
                )),
            ));
        }

        // Add to all mappings.
        self.name_to_id.insert(symbol.name.clone(), id);
        self.symbols.insert(id, symbol);
        self.module_symbols.entry(module_id).or_default().push(id);
        Ok(id)
    }

    /// Gets the [`Symbol`] by its [`SymbolId`], or `None` if the ID is unknown.
    pub fn get(&self, id: &SymbolId) -> Option<&Symbol> {
        self.symbols.get(id)
    }

    /// Gets the [`Symbol`] by its name, or `None` if no symbol has that name.
    pub fn get_by_name(&self, name: &str) -> Option<&Symbol> {
        let id = self.name_to_id.get(name)?;
        self.symbols.get(id)
    }

    /// Gets all [`Symbol`]s in a module, in the order they were added.
    ///
    /// Returns an empty vector for a module with no recorded symbols.
    pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> {
        self.module_symbols
            .get(module_id)
            .into_iter()
            .flatten()
            .filter_map(|id| self.symbols.get(id))
            .collect()
    }

    /// Gets all the public symbols, in no particular order.
    pub fn get_public_symbols(&self) -> Vec<&Symbol> {
        self.symbols
            .values()
            .filter(|sym| matches!(sym.visibility, Visibility::Public))
            .collect()
    }

    /// Updates symbol address (during resolution). Used for flat binaries or symbols with
    /// no relocations.
    ///
    /// A symbol without dependencies also has its relocation flag cleared.
    ///
    /// # Errors
    ///
    /// Returns an error when no symbol is stored under `id`.
    pub fn update_symbol_address(
        &mut self,
        id: &SymbolId,
        address: u32,
    ) -> Result<(), AssembleError> {
        let Some(symbol) = self.symbols.get_mut(id) else {
            return Err(AssembleError::new_other_error(
                crate::error::AssembleErrorKind::IO(std::io::Error::new(
                    std::io::ErrorKind::NotFound,
                    "Symbol not found",
                )),
            ));
        };

        symbol.address = Some(address);
        if symbol.dependencies.is_empty() {
            symbol.needs_relocation = false;
        }
        Ok(())
    }
}
impl Default for SymbolTable {
/// Equivalent to [`SymbolTable::new`]: an empty table.
fn default() -> Self {
Self::new()
}
}