misc: update release profile for optimised builds

misc: add 'profiling' profile.
assembler: small misc updates, I am tired
2025-06-29 04:33:24 +01:00 · 2025-06-29 04:10:54 +01:00 · 2025-06-29 03:52:53 +01:00 · 2025-06-29 01:43:31 +01:00 · 2025-06-29 00:22:10 +01:00 · 2025-06-29 00:11:36 +01:00
22 changed files with 1299 additions and 230 deletions
@@ -5,3 +5,7 @@ rustc-wrapper = "sccache"
 [future-incompat-report]
 frequency = "always"
 [profile.profiling]
 inherits = "release"
 debug = true
@@ -15,3 +15,7 @@ panic = "abort" # Cranelift does not support stack unwinds.
 lto = false
 debug = true
 incremental = false           # sccache does not support caching incremental crates.
 [profile.release]
 incremental = true
 lto = "fat"
@@ -5,7 +5,8 @@ pub struct Args {
    /// The output format to assemble to. Currently just ELF or a flat binary.
    #[arg(value_enum)]
    output_format: Option<OutputFormat>,
-    /// Whether the relocatable object files should be statically linked into a single executable or library.
+    /// Whether the relocatable object files should be statically linked into a single
    /// executable or library.
    link: bool,
 }
@@ -6,6 +6,7 @@ use std::path::Path;
 use std::sync::mpsc;
 use std::thread;
 use crate::error::{AssembleErrorKind, IoErrorKind};
 use crate::{
    context::AssemblerContext,
    error::AssembleError,
@@ -176,8 +177,8 @@ impl CompilerEngine {
        // Verify file exists
        if !path.exists() {
            return Err(EngineError::Assembly(AssembleError::new_other_error(
-                crate::error::AssembleErrorKind::Io(crate::error::IoError::new(
+                AssembleErrorKind::Io(crate::error::IoError::new(
-                    crate::error::IoErrorKind::NotFound,
+                    IoErrorKind::NotFound,
                    Some(format!("Source file not found: {}", path.display())),
                )),
            )));
@@ -218,12 +219,12 @@ impl Default for CompilerEngine {
 fn assemble(src: &Path) -> Result<Vec<Instruction>, AssembleError> {
    // Verify the file exists
    if !src.exists() {
-        return Err(AssembleError::new_other_error(
+        return Err(AssembleError::new_other_error(AssembleErrorKind::Io(
-            crate::error::AssembleErrorKind::Io(crate::error::IoError::new(
+            crate::error::IoError::new(
-                crate::error::IoErrorKind::NotFound,
+                IoErrorKind::NotFound,
                Some(format!("Source file not found: {}", src.display())),
-            )),
+            ),
-        ));
+        )));
    }
    let mut modules = HashSet::new();
@@ -267,12 +268,10 @@ fn prepare_dependency(
    context: &AssemblerContext,
 ) -> Result<(), AssembleError> {
    let filename = path.file_name().and_then(|n| n.to_str()).ok_or_else(|| {
-        AssembleError::new_other_error(crate::error::AssembleErrorKind::Io(
+        AssembleError::new_other_error(AssembleErrorKind::Io(crate::error::IoError::new(
-            crate::error::IoError::new(
+            IoErrorKind::InvalidData,
                crate::error::IoErrorKind::InvalidData,
            Some("Failed to get file name from path".to_string()),
-            ),
+        )))
        ))
    })?;
    // Calculate a simple hash for the file (similar to quick_hash)
@@ -290,8 +289,8 @@ fn prepare_dependency(
    // Phase 1: Tokenize the file
    println!("Tokenising {filename}");
-    let tokeniser = Tokeniser::new(path)?;
+    let tokeniser = Tokeniser::new(path, context)?;
-    let tokens = tokeniser.tokenise(context)?;
+    let tokens = tokeniser.tokenise()?;
    // Get the module ID that was registered during tokenization
    let module_id = get_module_id_for_file(path, context)?;
@@ -328,7 +327,7 @@ fn get_module_id_for_file(
    {
        let registry = context.module_registry.read()?;
-        // Find module by path
+        // Find module by path.
        for module in registry.modules() {
            if module.path == file_path {
                return Ok(module.id);
@@ -336,15 +335,15 @@ fn get_module_id_for_file(
        }
    }
-    Err(AssembleError::new_other_error(
+    Err(AssembleError::new_other_error(AssembleErrorKind::Io(
-        crate::error::AssembleErrorKind::Io(crate::error::IoError::new(
+        crate::error::IoError::new(
-            crate::error::IoErrorKind::NotFound,
+            IoErrorKind::NotFound,
            Some(format!(
                "Module not found for file: {}",
                file_path.display()
            )),
-        )),
+        ),
-    ))
+    )))
 }
 /// Result of compilation. This is useless at present but compiles.
@@ -18,7 +18,8 @@ impl Default for AssemblerContext {
 }
 impl AssemblerContext {
-    #[must_use] pub fn new() -> Self {
+    #[must_use]
    pub fn new() -> Self {
        Self {
            symbol_table: RwLock::new(SymbolTable::new()),
            module_registry: RwLock::new(ModuleRegistry::new()),
@@ -13,6 +13,9 @@ pub struct AssembleError {
    source_info: Option<SourceInfo>,
    /// The type of assembly error that occurred.
    kind: AssembleErrorKind,
    /// Whether context should be added to errors being printed. This might get changed
    /// to Verbosity in the future.
    display_quietly: bool,
 }
 impl AssembleError {
@@ -24,6 +27,7 @@ impl AssembleError {
        Self {
            source_info: Some(source_info),
            kind,
            display_quietly: false,
        }
    }
@@ -32,16 +36,86 @@ impl AssembleError {
        Self {
            source_info: None,
            kind,
            display_quietly: true,
        }
    }
    /// Writes a parser error into `f`, followed by the offending source context.
    ///
    /// When the error carries no source information only a one-line fallback
    /// message is written. Otherwise the error type is written and, unless
    /// `display_quietly` is set, the source line is printed with the span underlined.
    ///
    /// # Errors
    ///
    /// Returns [`std::fmt::Error`] if writing to `f` fails or if the source
    /// context could not be printed.
    fn print_parser_error(
        &self,
        f: &mut std::fmt::Formatter<'_>,
        parse_error: &ParserError,
    ) -> std::fmt::Result {
        match &self.source_info {
            None => write!(
                f,
                "parser error thrown with no source information. Error: {parse_error}"
            ),
            Some(location) => {
                writeln!(f, "parser error of type `{parse_error}`.\n")?;

                // Quiet errors stop after the headline; no source context is shown.
                if self.display_quietly {
                    return Ok(());
                }

                // The context printer reports its own error type, so note the cause
                // in the output (best effort) before collapsing it to `fmt::Error`.
                if let Err(e) = location.print_context_with_underline() {
                    _ = writeln!(f, "print context error: {e}");
                    return Err(std::fmt::Error {});
                }
                Ok(())
            }
        }
    }
    /// Writes a tokeniser error into `f`, followed by the offending source context.
    ///
    /// When the error carries no source information only a one-line fallback
    /// message is written. Otherwise the error type is written and, unless
    /// `display_quietly` is set, the source line is printed with the span
    /// underlined. This mirrors how parser errors are printed.
    ///
    /// # Errors
    ///
    /// Returns [`std::fmt::Error`] if writing to `f` fails or if the source
    /// context could not be printed.
    fn print_tokeniser_error(
        &self,
        f: &mut std::fmt::Formatter<'_>,
        err: &TokeniserError,
    ) -> std::fmt::Result {
        let Some(source_info) = &self.source_info else {
            return write!(
                f,
                "Tokeniser error thrown with no source information. Error: {err}"
            );
        };
        writeln!(f, "tokeniser error of type `{err}`.\n")?;
        // Prints out the context for our error, unless the error was marked quiet.
        if !self.display_quietly {
            source_info.print_context_with_underline().map_err(|e| {
                // Best effort: surface the cause before collapsing to `fmt::Error`.
                _ = writeln!(f, "Print context error: {e}");
                std::fmt::Error {}
            })?;
        }
        Ok(())
    }
 }
 impl Display for AssembleError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if let Some(info) = &self.source_info {
-            write!(f, "at {info}")?;
+            write!(f, "At {info}, got ")?;
            match &self.kind {
                AssembleErrorKind::Parser(err) => self.print_parser_error(f, err)?,
                AssembleErrorKind::Tokeniser(err) => {
                    self.print_tokeniser_error(f, err)?;
                }
                _ => write!(f, "{}", self.kind)?,
            }
            writeln!(f)?;
            return Ok(());
        }
        // Handle errors without SourceInfo.
        write!(f, "{}", self.kind)?;
        Ok(())
@@ -68,13 +142,7 @@ pub enum AssembleErrorKind {
 }
 #[derive(Debug, Clone)]
-pub struct ParserError {
+pub enum ParserError {
    error_type: ParserErrorType,
    source_info: SourceInfo,
 }
 #[derive(Debug, Clone)]
 pub enum ParserErrorType {
    UnexpectedToken,
    MissingOperand,
    InvalidInstruction,
@@ -82,7 +150,7 @@ pub enum ParserErrorType {
    DuplicateLabel,
 }
-impl Display for ParserErrorType {
+impl Display for ParserError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnexpectedToken => write!(f, "unexpected token"),
@@ -94,28 +162,6 @@ impl Display for ParserErrorType {
    }
 }
 impl Display for ParserError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // TODO: Print the path/to/filename.dsa:line_no, column col_no.
        write!(
            f,
            "Parser error, {} at {}",
            self.error_type, self.source_info
        )?;
        // Prints out the context for our error.
        self.source_info
            .print_context_with_underline()
            .map_err(|e| {
                _ = writeln!(f, "Print context error: {e}");
                std::fmt::Error {}
            })?;
        Ok(())
    }
 }
 #[derive(Debug, Clone)]
 pub enum SymbolError {
    Undefined,
@@ -1,3 +1,10 @@
 use std::sync::Arc;
 use assembler::{
    error::{AssembleError, AssembleErrorKind, ParserError},
    model::module::Module,
    source::{source_info::SourceInfo, token::TokenType, tokeniser::Tokeniser},
 };
 use common as _;
 use num_cpus as _;
 use threadpool as _;
@@ -5,9 +12,31 @@ use threadpool as _;
 // use clap::Parser;
 // use std::{fs, io::Write, path::PathBuf};
-fn main() {
+fn main() -> Result<(), AssembleError> {
    // // Parse command line arguments
    // let args: Vec<String> = std::env::args().collect();
    let contents = include_bytes!("../../resources/dsa/bf.dsa").to_vec();
    let module = Arc::new(Module::new("resources/dsa/bf.dsa")?);
    let tok = Tokeniser::from_data(contents, module.clone());
    let ts = tok
        .tokenise()?
        .into_iter()
        .filter(|t| !matches!(t.token_type, TokenType::Eof | TokenType::Newline));
    for t in ts {
        t.source_info.print_context_with_underline()?;
    }
    let test_error: AssembleError = AssembleError::new_source_error(
        SourceInfo::new(45, module.clone(), 4..7),
        AssembleErrorKind::Parser(ParserError::InvalidInstruction),
    );
    eprintln!("\n\n{test_error}");
    Ok(())
    // let _clap_args = assembler::args::Args::parse();
@@ -9,15 +9,24 @@ use std::{
    sync::Arc,
 };
 use regex::Regex;
 use uuid::Uuid;
-use crate::model::module_registry::ModuleRegistry;
+use crate::{
    error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
    model::module_registry::ModuleRegistry,
 };
 /// The ID for a module. A tuple struct for type safety.
 #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
 pub struct ModuleId(Uuid);
 impl ModuleId {
    #[must_use]
    pub fn new() -> Self {
        Self(Uuid::new_v4())
    }
    #[must_use]
    pub const fn from_module(module: &Module) -> Self {
        module.id
@@ -36,6 +45,12 @@ impl ModuleId {
    }
 }
 impl Default for ModuleId {
    /// Returns a fresh identifier by delegating to [`ModuleId::new`], so every
    /// call yields a newly generated ID rather than a fixed "zero" value.
    fn default() -> Self {
        Self::new()
    }
 }
 impl std::fmt::Display for ModuleId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
@@ -61,11 +76,35 @@ impl std::hash::Hash for Module {
 }
 impl Module {
-    pub fn new<P: AsRef<Path>>(name: String, path: P) -> Self {
+    pub fn new<P: AsRef<Path>>(p: P) -> Result<Self, AssembleError> {
-        Self {
+        let path = p.as_ref().to_path_buf();
-            name,
+        let name = Self::extract_module_name(&path)?;
-            path: path.as_ref().to_path_buf(),
+        let id = ModuleId::new();
-            id: ModuleId(Uuid::new_v4()),
+
        Ok(Self { name, path, id })
    }
    /// Derives a module name from the final component of `path`.
    ///
    /// A trailing `.dsa`, `.S` or `.asm` extension is removed; any other file
    /// name is returned unchanged. Non-UTF-8 bytes in the file name are
    /// replaced lossily rather than rejected.
    ///
    /// # Errors
    ///
    /// Returns an [`IoErrorKind::InvalidData`] error when `path` has no final
    /// file-name component (for example when it ends in `..`).
    fn extract_module_name<P: AsRef<Path>>(path: P) -> Result<String, AssembleError> {
        // The dot must be escaped: a bare `.` matches any character, which would
        // also eat the last letter of names such as `foodsa` or turn `xasm` into
        // an empty module name.
        let extensions_regex = Regex::new(r"\.(dsa|S|asm)$")
            .expect("For some reason the regular expression failed to compile!");
        let module_name = path
            .as_ref()
            .file_name()
            .map(|f| f.to_string_lossy())
            .ok_or_else(|| {
                AssembleError::new_other_error(AssembleErrorKind::Io(IoError::new(
                    IoErrorKind::InvalidData,
                    Some(
                        "the filename couldn't be extracted, is it valid UTF-8?"
                            .to_string(),
                    ),
                )))
            })?;
        // Strip any file extensions given. We don't care for now.
        let out = extensions_regex.replace(&module_name, "");
        Ok(out.to_string())
    }
 }
@@ -9,6 +9,7 @@ use std::{
 use crate::error::AssembleError;
 pub mod lines;
 pub mod opcode;
 pub mod source_info;
 pub mod token;
 pub mod token_info;
@@ -0,0 +1,349 @@
 //! This module contains instructions for tokenisation.
 use std::{fmt, str::FromStr};
 use common::prelude::{ITypeArgs, Instruction, Interrupt, RTypeArgs};
 use crate::{
    error::{AssembleError, AssembleErrorKind},
    source::source_info::SourceInfo,
 };
 /// Every mnemonic the assembler knows about.
 ///
 /// The variants fall into three groups, in declaration order: real
 /// instructions, pseudo-instructions, and meta instructions.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum Opcode {
    // Real instructions: `to_opcode_value` assigns these 0x00 (`Nop`) through
    // 0x26 (`SubI`), in this order.
    Nop,
    Mov,
    Movs,
    Ldb,
    Ldbs,
    Ldh,
    Ldhs,
    Ldw,
    Stb,
    Sth,
    Stw,
    Lli,
    Lui,
    Jmp,
    Jeq,
    Jne,
    Jgt,
    Jge,
    Jlt,
    Jle,
    Cmp,
    Inc,
    Dec,
    Shl,
    Shr,
    Add,
    Sub,
    And,
    Or,
    Not,
    Xor,
    Nand,
    Nor,
    Xnor,
    Int,
    Irt,
    Hlt,
    AddI,
    SubI,
    // Pseudo-instructions
    Db,
    Dh,
    Dw,
    Resb,
    Resh,
    Resw,
    Push,
    Pop,
    Pusha,
    Popa,
    Lwi,
    Call,
    Return,
    // Meta instructions (these aren't present in the binary as instructions)
    Include,
    Data,
    // NOTE(review): `Segment` is the one meta variant that `to_opcode_value`
    // maps to a byte (0x27) and that displays as `[SEGMENT]`.
    Segment,
 }
 /// Reasons a piece of source text could not be turned into an opcode or register.
 #[derive(Debug)]
 pub enum OpcodeFromStrError {
    /// The named register does not exist; the payload is the offending name.
    InvalidRegister(&'static str),
    /// The named instruction does not exist; the payload is the offending mnemonic.
    InvalidOpcode(String),
 }

 impl std::fmt::Display for OpcodeFromStrError {
    /// Renders a short, lowercase, single-line description of the failure.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidRegister(reg) => {
                f.write_fmt(format_args!("register does not exist: {reg}"))
            }
            Self::InvalidOpcode(op) => {
                f.write_fmt(format_args!("instruction does not exist: {op}"))
            }
        }
    }
 }

 impl std::error::Error for OpcodeFromStrError {}
 impl Opcode {
    /// Lowercase mnemonics, grouped as real, pseudo and meta instructions.
    ///
    /// NOTE(review): `data` and the `[SEGMENT]` marker are not listed here
    /// and are not accepted by `FromStr` either — confirm that is intended.
    pub const OPCODES: &[&str] = &[
        // Real instructions (0x00-0x26)
        "nop", "mov", "movs", "ldb", "ldbs", "ldh", "ldhs", "ldw", "stb", "sth", "stw",
        "lli", "lui", "jmp", "jeq", "jne", "jgt", "jge", "jlt", "jle", "cmp", "inc",
        "dec", "shl", "shr", "add", "sub", "and", "or", "not", "xor", "nand", "nor",
        "xnor", "int", "irt", "hlt", "addi", "subi",
        // Pseudo-instructions
        "db", "dh", "dw", "resb", "resh", "resw", "push", "pop", "lwi", "call", "return",
        "pusha", "popa",
        // Meta instructions
        "include",
    ];

    /// Builds the [`Instruction`] for this opcode with default arguments.
    ///
    /// `Segment` becomes `Instruction::Segment(0)`; every real instruction
    /// becomes its matching variant carrying `Default::default()` arguments.
    ///
    /// # Errors
    ///
    /// Returns an [`AssembleErrorKind::Unimplemented`] error located at
    /// `source_info` for pseudo-instructions and for the `Include` and `Data`
    /// meta instructions, none of which this function maps to an `Instruction`.
    pub fn to_instruction(
        &self,
        source_info: SourceInfo,
    ) -> Result<Instruction, AssembleError> {
        let instruction = match self {
            Self::Nop => Instruction::Nop,
            Self::Mov => Instruction::Mov(RTypeArgs::default()),
            Self::Movs => Instruction::MovSigned(RTypeArgs::default()),
            Self::Ldb => Instruction::LoadByte(ITypeArgs::default()),
            Self::Ldbs => Instruction::LoadByteSigned(ITypeArgs::default()),
            Self::Ldh => Instruction::LoadHalfword(ITypeArgs::default()),
            Self::Ldhs => Instruction::LoadHalfwordSigned(ITypeArgs::default()),
            Self::Ldw => Instruction::LoadWord(ITypeArgs::default()),
            Self::Stb => Instruction::StoreByte(ITypeArgs::default()),
            Self::Sth => Instruction::StoreHalfword(ITypeArgs::default()),
            Self::Stw => Instruction::StoreWord(ITypeArgs::default()),
            Self::Lli => Instruction::LoadLowerImmediate(ITypeArgs::default()),
            Self::Lui => Instruction::LoadUpperImmediate(ITypeArgs::default()),
            Self::Jmp => Instruction::Jump(ITypeArgs::default()),
            Self::Jeq => Instruction::JumpEq(ITypeArgs::default()),
            Self::Jne => Instruction::JumpNeq(ITypeArgs::default()),
            Self::Jgt => Instruction::JumpGt(ITypeArgs::default()),
            Self::Jge => Instruction::JumpGe(ITypeArgs::default()),
            Self::Jlt => Instruction::JumpLt(ITypeArgs::default()),
            Self::Jle => Instruction::JumpLe(ITypeArgs::default()),
            Self::Cmp => Instruction::Compare(RTypeArgs::default()),
            Self::Inc => Instruction::Increment(RTypeArgs::default()),
            Self::Dec => Instruction::Decrement(RTypeArgs::default()),
            Self::Shl => Instruction::ShiftLeft(RTypeArgs::default()),
            Self::Shr => Instruction::ShiftRight(RTypeArgs::default()),
            Self::Add => Instruction::Add(RTypeArgs::default()),
            Self::Sub => Instruction::Sub(RTypeArgs::default()),
            Self::And => Instruction::And(RTypeArgs::default()),
            Self::Or => Instruction::Or(RTypeArgs::default()),
            Self::Not => Instruction::Not(RTypeArgs::default()),
            Self::Xor => Instruction::Xor(RTypeArgs::default()),
            Self::Nand => Instruction::Nand(RTypeArgs::default()),
            Self::Nor => Instruction::Nor(RTypeArgs::default()),
            Self::Xnor => Instruction::Xnor(RTypeArgs::default()),
            Self::Int => Instruction::Interrupt(Interrupt::default()),
            Self::Irt => Instruction::IntReturn,
            Self::Hlt => Instruction::Halt,
            Self::AddI => Instruction::AddImmediate(ITypeArgs::default()),
            Self::SubI => Instruction::SubImmediate(ITypeArgs::default()),
            Self::Segment => Instruction::Segment(0),
            // Everything else has no `Instruction` mapping in this function.
            _ => {
                return Err(AssembleError::new_source_error(
                    source_info,
                    AssembleErrorKind::Unimplemented(
                        "Opcode::to_instruction called on an instruction that does not exist in common.",
                    ),
                ));
            }
        };
        Ok(instruction)
    }

    /// Returns the opcode byte for this mnemonic, if it has one.
    ///
    /// Real instructions map to `0x00..=0x26` and `Segment` maps to `0x27`;
    /// every other variant yields `None`.
    #[must_use]
    pub const fn to_opcode_value(&self) -> Option<u8> {
        match self {
            Self::Nop => Some(0x00),
            Self::Mov => Some(0x01),
            Self::Movs => Some(0x02),
            Self::Ldb => Some(0x03),
            Self::Ldbs => Some(0x04),
            Self::Ldh => Some(0x05),
            Self::Ldhs => Some(0x06),
            Self::Ldw => Some(0x07),
            Self::Stb => Some(0x08),
            Self::Sth => Some(0x09),
            Self::Stw => Some(0x0A),
            Self::Lli => Some(0x0B),
            Self::Lui => Some(0x0C),
            Self::Jmp => Some(0x0D),
            Self::Jeq => Some(0x0E),
            Self::Jne => Some(0x0F),
            Self::Jgt => Some(0x10),
            Self::Jge => Some(0x11),
            Self::Jlt => Some(0x12),
            Self::Jle => Some(0x13),
            Self::Cmp => Some(0x14),
            Self::Inc => Some(0x15),
            Self::Dec => Some(0x16),
            Self::Shl => Some(0x17),
            Self::Shr => Some(0x18),
            Self::Add => Some(0x19),
            Self::Sub => Some(0x1A),
            Self::And => Some(0x1B),
            Self::Or => Some(0x1C),
            Self::Not => Some(0x1D),
            Self::Xor => Some(0x1E),
            Self::Nand => Some(0x1F),
            Self::Nor => Some(0x20),
            Self::Xnor => Some(0x21),
            Self::Int => Some(0x22),
            Self::Irt => Some(0x23),
            Self::Hlt => Some(0x24),
            Self::AddI => Some(0x25),
            Self::SubI => Some(0x26),
            // TODO: Maybe recombine pseudos?
            Self::Segment => Some(0x27),
            // Pseudo-instructions don't have opcode values
            _ => None,
        }
    }

    /// Returns `true` for every variant declared in the pseudo-instruction
    /// group of [`Opcode`].
    ///
    /// `Pusha`, `Popa`, `Call` and `Return` are included: they sit in the
    /// pseudo-instruction group of both the enum and [`Self::OPCODES`] and have
    /// no opcode value, so leaving them out made this predicate disagree with
    /// the rest of the type.
    #[must_use]
    pub const fn is_pseudo_instruction(&self) -> bool {
        matches!(
            self,
            Self::Db
                | Self::Dh
                | Self::Dw
                | Self::Resb
                | Self::Resh
                | Self::Resw
                | Self::Push
                | Self::Pop
                | Self::Pusha
                | Self::Popa
                | Self::Lwi
                | Self::Call
                | Self::Return
        )
    }
 }
 impl FromStr for Opcode {
    type Err = OpcodeFromStrError;

    /// Parses a mnemonic, ignoring letter case.
    ///
    /// # Errors
    ///
    /// Returns [`OpcodeFromStrError::InvalidOpcode`] carrying the input exactly
    /// as given (original casing) when it is not a known mnemonic.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let lowered = s.to_lowercase();
        let opcode = match lowered.as_str() {
            "nop" => Self::Nop,
            "mov" => Self::Mov,
            "movs" => Self::Movs,
            "ldb" => Self::Ldb,
            "ldbs" => Self::Ldbs,
            "ldh" => Self::Ldh,
            "ldhs" => Self::Ldhs,
            "ldw" => Self::Ldw,
            "stb" => Self::Stb,
            "sth" => Self::Sth,
            "stw" => Self::Stw,
            "lli" => Self::Lli,
            "lui" => Self::Lui,
            "jmp" => Self::Jmp,
            "jeq" => Self::Jeq,
            "jne" => Self::Jne,
            "jgt" => Self::Jgt,
            "jge" => Self::Jge,
            "jlt" => Self::Jlt,
            "jle" => Self::Jle,
            "cmp" => Self::Cmp,
            "inc" => Self::Inc,
            "dec" => Self::Dec,
            "shl" => Self::Shl,
            "shr" => Self::Shr,
            "add" => Self::Add,
            "sub" => Self::Sub,
            "and" => Self::And,
            "or" => Self::Or,
            "not" => Self::Not,
            "xor" => Self::Xor,
            "nand" => Self::Nand,
            "nor" => Self::Nor,
            "xnor" => Self::Xnor,
            "int" => Self::Int,
            "irt" => Self::Irt,
            "hlt" => Self::Hlt,
            "addi" => Self::AddI,
            "subi" => Self::SubI,
            "db" => Self::Db,
            "dh" => Self::Dh,
            "dw" => Self::Dw,
            "resb" => Self::Resb,
            "resh" => Self::Resh,
            "resw" => Self::Resw,
            "push" => Self::Push,
            "pop" => Self::Pop,
            "lwi" => Self::Lwi,
            "include" => Self::Include,
            "call" => Self::Call,
            "return" => Self::Return,
            "pusha" => Self::Pusha,
            "popa" => Self::Popa,
            // Unknown mnemonic: report it exactly as the caller wrote it.
            _ => return Err(OpcodeFromStrError::InvalidOpcode(s.to_string())),
        };
        Ok(opcode)
    }
 }
 impl fmt::Display for Opcode {
    /// Writes the lowercase mnemonic for this opcode.
    ///
    /// `Segment` has no textual mnemonic and is rendered as `[SEGMENT]`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mnemonic = match self {
            Self::Nop => "nop",
            Self::Mov => "mov",
            Self::Movs => "movs",
            Self::Ldb => "ldb",
            Self::Ldbs => "ldbs",
            Self::Ldh => "ldh",
            Self::Ldhs => "ldhs",
            Self::Ldw => "ldw",
            Self::Stb => "stb",
            Self::Sth => "sth",
            Self::Stw => "stw",
            Self::Lli => "lli",
            Self::Lui => "lui",
            Self::Jmp => "jmp",
            Self::Jeq => "jeq",
            Self::Jne => "jne",
            Self::Jgt => "jgt",
            Self::Jge => "jge",
            Self::Jlt => "jlt",
            Self::Jle => "jle",
            Self::Cmp => "cmp",
            Self::Inc => "inc",
            Self::Dec => "dec",
            Self::Shl => "shl",
            Self::Shr => "shr",
            Self::Add => "add",
            Self::Sub => "sub",
            Self::And => "and",
            Self::Or => "or",
            Self::Not => "not",
            Self::Xor => "xor",
            Self::Nand => "nand",
            Self::Nor => "nor",
            Self::Xnor => "xnor",
            Self::Int => "int",
            Self::Irt => "irt",
            Self::Hlt => "hlt",
            Self::AddI => "addi",
            Self::SubI => "subi",
            Self::Db => "db",
            Self::Dh => "dh",
            Self::Dw => "dw",
            Self::Resb => "resb",
            Self::Resh => "resh",
            Self::Resw => "resw",
            Self::Push => "push",
            Self::Pop => "pop",
            Self::Lwi => "lwi",
            Self::Call => "call",
            Self::Return => "return",
            Self::Pusha => "pusha",
            Self::Popa => "popa",
            // meta instructions
            Self::Include => "include",
            Self::Data => "data",
            Self::Segment => "[SEGMENT]",
        };
        f.write_str(mnemonic)
    }
 }
@@ -0,0 +1,4 @@
 //! This module contains code for handling pseudo opcodes.
 /// Pseudo instructions that cannot simply be lowered to ISA instructions.
 pub enum PseudoOpcode {}
@@ -21,7 +21,8 @@ use crate::{
 #[derive(Debug, Clone)]
 pub struct SourceInfo {
    /// The line number within the source file underpinned by `module_id`.
-    pub line_no: usize,
+    pub line_number: usize,
    /// The [`Module`] the source code is associated with.
    pub module: Arc<Module>,
    /// The indexes where this token may be found (line-local).
    pub span: std::ops::Range<usize>,
@@ -31,10 +32,10 @@ impl Display for SourceInfo {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
-            "{}:{}, column {}",
+            "{}:{}:{}",
            self.module.path.display(),
-            self.line_no,
+            self.line_number,
-            self.span.start
+            self.span.start + 1
        )
    }
 }
@@ -47,7 +48,7 @@ impl SourceInfo {
        span: std::ops::Range<usize>,
    ) -> Self {
        Self {
-            line_no,
+            line_number: line_no,
            module,
            span,
        }
@@ -61,7 +62,7 @@ impl SourceInfo {
        let mut lines = LinesWithSpans::new(rdr);
-        let Some(line_result) = lines.nth(self.line_no - 1) else {
+        let Some(line_result) = lines.nth(self.line_number - 1) else {
            // Handle a line not existing.
            return Err(AssembleError::new_source_error(
                self.clone(),
@@ -69,7 +70,7 @@ impl SourceInfo {
                    IoErrorKind::Other,
                    Some(format!(
                        "the line {} does not exist in input file `{}` but source info suggested otherwise!.",
-                        self.line_no,
+                        self.line_number,
                        self.module.path.display()
                    )),
                )),
@@ -79,13 +80,15 @@ impl SourceInfo {
        let line_span = line_result?;
        // Print the line number and line content.
-        println!("{:>4} | {}", self.line_no, line_span.content);
+        println!("{:>4} | {}", self.line_number, line_span.content);
        let mut pad_left = String::new();
        write!(pad_left, "{:>4}   ", "")?;
        let mut underline = String::new();
        write!(underline, "{:>4} | ", "")?;
        for _ in 0..self.span.start {
-            underline.push(' ');
+            pad_left.push(' ');
        }
        for _ in self.span.start..self.span.end.min(line_span.content.len()) {
@@ -94,7 +97,7 @@ impl SourceInfo {
        // Print the underline in red and bold.
        // TODO: Use a crate to make this extra portable.
-        println!("\x1b[1;31m{underline}\x1b[0m");
+        println!("{pad_left}\x1b[1;31m{underline}\x1b[0m");
        Ok(())
    }
@@ -2,13 +2,15 @@
 //! easier to build from scratch and edit his code than it would be to try and wrangle it
 //! into shape.
 use common::prelude::*;
 use crate::source::{
    opcode::Opcode,
    source_info::SourceInfo,
-    token_info::{
+    token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
        DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken,
    },
 };
 /// Represents the different types of tokens that can be produced by the tokeniser.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub enum TokenType {
    /// Symbol reference (e.g., `loop_start`, `my_data`).
@@ -19,20 +21,22 @@ pub enum TokenType {
    Immediate(u32),
    /// String literal (e.g., `"hello world"`).
    String(String),
    /// Intermediate token for multiline strings (filtered out in final output)
    StringContinuation,
    /// Assembly instruction (e.g., `add`, `jmp`, `nop`).
-    Instruction(InstructionToken),
+    Instruction(Opcode),
    /// Label definition (e.g., `loop_start:`).
    Label(LabelToken),
-    /// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
+    /// Assembler directive (e.g., `.global`, `.section`, `.dw`).
    Directive(DirectiveToken),
    /// Comment (e.g., `// this is a comment`).
    Comment,
    /// Comma separator.
    Comma,
    /// End of line.
    Newline,
    /// End of file.
    Eof,
    /// A line comment. This is to be filtered out of the token stream.
    Comment,
 }
 #[derive(Debug)]
@@ -63,16 +67,13 @@ impl Token {
    }
    #[must_use]
-    pub const fn instruction(mnemonic: String, source_info: SourceInfo) -> Self {
+    pub const fn instruction(op: Opcode, source_info: SourceInfo) -> Self {
-        Self::new(
+        Self::new(TokenType::Instruction(op), source_info)
            TokenType::Instruction(InstructionToken { mnemonic }),
            source_info,
        )
    }
    #[must_use]
-    pub const fn register(name: String, source_info: SourceInfo) -> Self {
+    pub const fn register(reg: Register, source_info: SourceInfo) -> Self {
-        Self::new(TokenType::Register(RegisterToken { name }), source_info)
+        Self::new(TokenType::Register(RegisterToken { reg }), source_info)
    }
    #[must_use]
@@ -1,3 +1,5 @@
 use common::prelude::Register;
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct SymbolToken {
    pub name: String,
@@ -15,10 +17,18 @@ pub struct DirectiveToken {
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 pub struct RegisterToken {
-    pub name: String,
+    pub reg: Register,
 }
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+impl RegisterToken {
-pub struct InstructionToken {
+    #[must_use]
-    pub mnemonic: String,
+    pub const fn new(reg: Register) -> Self {
        Self { reg }
    }
    /// Returns the name of a valid [`Register`]
    #[must_use]
    pub fn name(&self) -> String {
        self.reg.to_string()
    }
 }
@@ -1,36 +1,37 @@
 //! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
 //! outputs a [`Vec<Token>`].
-use std::{
+use std::{path::Path, str::FromStr, sync::Arc};
    path::{Path, PathBuf},
    sync::Arc,
 };
 use regex::Regex;
 use common::prelude::*;
 use crate::{
    context::AssemblerContext,
-    error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
+    error::{AssembleError, AssembleErrorKind},
    model::module::Module,
    source::{
-        lines::lines_with_spans,
+        lines::{LineSpan, lines_with_spans},
        load_source_bytes,
        opcode::Opcode,
        source_info::SourceInfo,
        token::{Token, TokenType},
-        token_info::{
+        token_info::{DirectiveToken, LabelToken, RegisterToken, SymbolToken},
-            DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken,
+        tokeniser::error::TokeniserError,
        },
    },
 };
 pub mod error;
 #[cfg(test)]
 mod tests;
 /// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s.
 pub struct Tokeniser {
    /// The data in the file.
    pub data: Vec<u8>,
-    /// The path to the file.
+    /// A copy of the Module in which the file is situated.
-    pub path: PathBuf,
+    pub module: Arc<Module>,
    // Pre-compiled regex patterns
    label_regex: Regex,
@@ -39,60 +40,73 @@ pub struct Tokeniser {
    directive_regex: Regex,
    instruction_regex: Regex,
    symbol_regex: Regex,
    string_regex: Regex,
    comment_regex: Regex,
    // String parsing state
    in_string: bool,
    string_buffer: String,
    string_start_line: usize,
    string_start_column: usize,
 }
 impl Tokeniser {
    /// Builds a [`Tokeniser`] over in-memory source bytes.
    ///
    /// Every token pattern is compiled here, once, and the multiline-string state
    /// starts out empty.
    ///
    /// # Panics
    ///
    /// Panics if any of the built-in regex patterns fails to compile.
    #[must_use]
-    pub fn from_data(data: Vec<u8>, path: PathBuf) -> Self {
+    pub fn from_data(data: Vec<u8>, module: Arc<Module>) -> Self {
        Self {
            data,
-            path,
+            module,
            label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
                .expect("Failed to compile label regex pattern"),
-            register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)")
+            register_regex: Regex::new(
                r"^(rg[0-9a-f]+|acc|spr|bpr|ret|idr|mmr|zero|noreg|pcx)\b",
            )
            .expect("Failed to compile register regex pattern"),
            // Underscores are accepted inside literals as digit separators.
-            immediate_regex: Regex::new(r"^(0x[0-9a-fA-F]+|[0-9]+)")
+            immediate_regex: Regex::new(
                r"^(0x[0-9a-fA-F_]+|0b[0-1_]+|0o[0-7_]+|[0-9_]+)",
            )
            .expect("Failed to compile immediate regex pattern"),
-            directive_regex: Regex::new(r"^\.([a-zA-Z]+)")
+            directive_regex: Regex::new(r"^(res[bwh]|d[bwh]|include|section|global|local)\b")
                .expect("Failed to compile directive regex pattern"),
            instruction_regex: Regex::new(
-                r"^(add|sub|mul|div|jmp|call|ret|lli|nop|halt)",
+                r"^(nop|movs?|ld[bhw]s?|st[bhw]|l[lu]i|j(mp|[egl][qte])|cmp|[id]nc|sh[lr]|add[i]?|sub[i]?|x?n?or|and|not|i[rd]t|hlt|lhwmm|lidt|push[a]?|pop[a]?|lwi|return|call)\b",
            )
            .expect("Failed to compile instruction regex pattern"),
            // NOTE(review): in the added pattern `::{2}` parses as `:` followed by
            // `:{2}`, i.e. three colons, and the leading `^` anchors only the first
            // alternative, so the plain-identifier alternative can match mid-input.
            // Confirm against the `regex` syntax docs before relying on scoped symbols.
-            symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
+            symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)::{2}([a-zA-Z0-9_]*)|([a-zA-Z_][a-zA-Z0-9_]*)")
                .expect("Failed to compile symbol regex pattern"),
            // NOTE(review): the pair below swaps `string_regex` for a second
            // `comment_regex` while keeping the "string regex" message; this looks
            // like a diff-rendering artefact, confirm the real initialiser.
-            string_regex: Regex::new(r#"^"([^"]*)"#)
+            comment_regex: Regex::new("^//.*")
                .expect("Failed to compile string regex pattern"),
            comment_regex: Regex::new("//.*")
                .expect("Failed to compile comment regex pattern"),
            // Initialize string parsing state
            in_string: false,
            string_buffer: String::new(),
            string_start_line: 0,
            string_start_column: 0,
        }
    }
-    /// Creates a [`Tokeniser`] from a file path.
+    /// Creates a [`Tokeniser`] from a file path. Also creates the underlying [`Module`]
-    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, AssembleError> {
+    /// for you.
    pub fn new<P: AsRef<Path>>(
        path: P,
        ctx: &AssemblerContext,
    ) -> Result<Self, AssembleError> {
        let path = path.as_ref().to_path_buf();
        let data = load_source_bytes(&path)?;
-
+        let module = Arc::new(Module::new(path)?);
        Ok(Self::from_data(data, path))
    }
    // Note that modules are tokenised in their own threads, possibly in parallel.
    pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
        let module_name = self.extract_module_name()?;
        // Create a module for the source file being processed.
        let module = Arc::new(Module::new(module_name, &self.path));
        {
            let mut module_registry = ctx.module_registry.write()?;
            module_registry.add(module.clone());
        }
        Ok(Self::from_data(data, module))
    }
    // Note that modules are tokenised in their own threads, possibly in parallel.
    pub fn tokenise(mut self) -> Result<Vec<Token>, AssembleError> {
        let mut token_stream = Vec::new();
-        let lines = lines_with_spans(&self.data);
+        let data = self.data.clone();
        let lines = lines_with_spans(&data);
        // Process each line
        for line_result in lines {
@@ -103,13 +117,13 @@ impl Tokeniser {
            if trimmed.is_empty() {
                token_stream.push(Token::new(
                    TokenType::Newline,
-                    SourceInfo::new(line_span.line_number, module.clone(), 0..1),
+                    SourceInfo::new(line_span.line_number, self.module.clone(), 0..1),
                ));
                continue;
            }
-            // Actually tokenize the line content
+            // Actually tokenise the line content
-            let line_tokens = self.tokenize_line(&line_span, &module)?;
+            let line_tokens = self.tokenise_line(&line_span)?;
            token_stream.extend(line_tokens);
            // Add newline token at end of line
@@ -117,42 +131,70 @@ impl Tokeniser {
                TokenType::Newline,
                SourceInfo::new(
                    line_span.line_number,
-                    module.clone(),
+                    self.module.clone(),
                    line_span.content.len()..line_span.content.len(),
                ),
            ));
        }
        // Add EOF token
-        token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0)));
+        token_stream.push(Token::new(
            TokenType::Eof,
            SourceInfo::new(0, self.module.clone(), 0..0),
        ));
        Ok(token_stream)
    }
-    fn tokenize_line(
+    fn tokenise_line(
-        &self,
+        &mut self,
-        line_span: &crate::source::lines::LineSpan,
+        line_span: &LineSpan,
        module: &Arc<Module>,
    ) -> Result<Vec<Token>, AssembleError> {
        let mut tokens = Vec::new();
-        let mut remaining = line_span.content.trim();
+        let mut remaining = line_span.content.as_str();
-        let start_column = line_span.start_char;
+        let mut column = 0;
        // Skip leading whitespace
        let trimmed_start = remaining.trim_start();
        column += remaining.len() - trimmed_start.len();
        remaining = trimmed_start;
        while !remaining.is_empty() {
-            // Try to match a token.
+            let start_column = column;
            let (token_type, consumed) = self.match_token(remaining)?;
            // Try to match a token.
            let (token_type, consumed) =
                self.match_token(remaining, line_span.line_number, column)?;
            // Filter out string continuation tokens and comments.
            match token_type {
                TokenType::StringContinuation => {
                    // Don't add to token stream, just consume input
                }
                TokenType::Comment => {
                    // Don't add to token stream, consume rest of line
                    break;
                }
                _ => {
                    tokens.push(Token::new(
                        token_type,
                        SourceInfo::new(
                            line_span.line_number,
-                    module.clone(),
+                            self.module.clone(),
                            start_column..start_column + consumed,
                        ),
                    ));
                }
            }
            // Advance position.
-            remaining = remaining[consumed..].trim_start();
+            remaining = &remaining[consumed..];
            column += consumed;
            // Skip whitespace.
            let before_trim = remaining.len();
            remaining = remaining.trim_start();
            column += before_trim - remaining.len();
        }
        Ok(tokens)
@@ -175,10 +217,13 @@ impl Tokeniser {
    /// Tries to match a register name at the start of `input`.
    ///
    /// Returns the register token together with the length of the whole regex
    /// match, or `None` when the regex does not match or the captured text is
    /// rejected by `Register::try_from`.
    fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
        let caps = self.register_regex.captures(input)?;
-        let name = caps.get(1)?.as_str().to_string();
+
        let captured_group = caps.get(1)?.as_str();
        let len = caps.get(0)?.len();
-        Some((TokenType::Register(RegisterToken { name }), len))
        // A regex hit is not sufficient: the text must also convert to a `Register`.
+        let reg = Register::try_from(captured_group).ok()?;
        Some((TokenType::Register(RegisterToken { reg }), len))
    }
    fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> {
@@ -186,6 +231,9 @@ impl Tokeniser {
        let value_str = caps.get(1)?.as_str();
        let len = caps.get(0)?.len();
        // Remove any underscores that were inserted for readability.
        let value_str = value_str.replace('_', "");
        let value = if let Some(hex_part) = value_str.strip_prefix("0x") {
            u32::from_str_radix(hex_part, 16).ok()?
        } else if let Some(bin_part) = value_str.strip_prefix("0b") {
@@ -212,26 +260,122 @@ impl Tokeniser {
        let mnemonic = caps.get(1)?.as_str().to_string();
        let len = caps.get(0)?.len();
-        Some((TokenType::Instruction(InstructionToken { mnemonic }), len))
+        let op = Opcode::from_str(&mnemonic).ok()?;
        Some((TokenType::Instruction(op), len))
    }
    /// Tries to match a symbol reference in `input`.
    ///
    /// `symbol_regex` recognises two spellings: a scoped symbol (capture groups 1
    /// and 2, re-joined as `first::second`) and a plain identifier (capture group
    /// 3). Returns the symbol token and the length of the whole match, or `None`
    /// when nothing matched.
    fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
        let caps = self.symbol_regex.captures(input)?;
        let len = caps.get(0)?.len();

        // Group 1 only participates in the scoped alternative, so it must not be
        // unwrapped with `?` up front: that would reject every plain identifier.
        // NOTE(review): the plain alternative of `symbol_regex` is not anchored with
        // `^`, so a match may begin after offset 0 while `len` is consumed as a
        // prefix length; confirm the pattern built in `from_data`.
        let name = match (caps.get(1), caps.get(3)) {
            // Matched the scoped symbol pattern (name::scope).
            (Some(scoped_name), _) => {
                format!("{}::{}", scoped_name.as_str(), caps.get(2)?.as_str())
            }
            (None, Some(simple_name)) => simple_name.as_str().to_string(),
            (None, None) => return None,
        };

        Some((TokenType::Symbol(SymbolToken { name }), len))
    }
    /// Matches string-literal input, dispatching on the multiline-string state.
    ///
    /// While `in_string` is set, `input` is handed to `handle_string_continuation`;
    /// otherwise `handle_string_start` looks for an opening quote and yields `None`
    /// when there is none.
-    fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
+    fn try_match_string(
-        let caps = self.string_regex.captures(input)?;
+        &mut self,
-        let content = caps.get(1)?.as_str().to_string();
+        input: &str,
-        let len = caps.get(0)?.len();
+        line_number: usize,
-
+        column: usize,
-        Some((TokenType::String(content), len))
+    ) -> Option<(TokenType, usize)> {
        if self.in_string {
            // We're continuing a multiline string
            Some(self.handle_string_continuation(input, line_number, column))
        } else {
            // Look for the start of a new string
            self.handle_string_start(input, line_number, column)
        }
    }
    /// Handles input that may begin a string literal.
    ///
    /// Returns `None` unless `input` starts with a double quote. If the closing
    /// quote is on the same line, the text between the quotes becomes a
    /// [`TokenType::String`] and both quotes are counted as consumed. Otherwise the
    /// tokeniser enters multiline-string mode: the start position is recorded, the
    /// rest of the line plus a newline is buffered, and the whole input is consumed
    /// as a [`TokenType::StringContinuation`].
    fn handle_string_start(
        &mut self,
        input: &str,
        line_number: usize,
        column: usize,
    ) -> Option<(TokenType, usize)> {
        let after_quote = input.strip_prefix('"')?;

        match after_quote.find('"') {
            Some(close) => {
                // Literal opens and closes on this line; `close` excludes the
                // opening quote, hence +2 to cover both delimiters.
                let literal = after_quote[..close].to_string();
                Some((TokenType::String(literal), close + 2))
            }
            None => {
                // No terminator yet: remember where the literal began and stash
                // the remainder of the line, newline included.
                self.in_string = true;
                self.string_start_line = line_number;
                self.string_start_column = column;
                let mut pending = after_quote.to_string();
                pending.push('\n');
                self.string_buffer = pending;
                Some((TokenType::StringContinuation, input.len()))
            }
        }
    }
    /// Handles a line encountered while a multiline string is still open.
    ///
    /// If `input` contains a double quote, the text before it is appended to the
    /// buffer, multiline mode ends, and the buffered text is returned as a
    /// [`TokenType::String`], consuming up to and including the quote. Otherwise
    /// the whole line plus a newline is buffered and consumed as a
    /// [`TokenType::StringContinuation`].
    fn handle_string_continuation(
        &mut self,
        input: &str,
        _line_number: usize,
        _column: usize,
    ) -> (TokenType, usize) {
        let Some(close) = input.find('"') else {
            // Still inside the literal: keep the full line and its line break.
            self.string_buffer.push_str(input);
            self.string_buffer.push('\n');
            return (TokenType::StringContinuation, input.len());
        };

        // Terminator found: finish the literal and hand the buffer over, leaving
        // an empty buffer behind for the next string.
        self.string_buffer.push_str(&input[..close]);
        self.in_string = false;
        let literal = std::mem::take(&mut self.string_buffer);
        (TokenType::String(literal), close + 1)
    }
    #[expect(clippy::range_plus_one, reason = "RangeInclusive is a different type!")]
    fn match_token(
        &mut self,
        input: &str,
        line_number: usize,
        column: usize,
    ) -> Result<(TokenType, usize), AssembleError> {
        if input.starts_with(',') {
            return Ok((TokenType::Comma, 1));
        }
        // Check for string first (including multiline continuations).
        if let Some(m) = self.try_match_string(input, line_number, column) {
            return Ok(m);
        }
        if let Some(m) = self.try_match_directive(input) {
            return Ok(m);
        }
        if let Some(m) = self.try_match_instruction(input) {
            return Ok(m);
        }
    fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
        if let Some(m) = self.try_match_comment(input) {
            return Ok(m);
        }
@@ -248,54 +392,30 @@ impl Tokeniser {
            return Ok(m);
        }
        if let Some(m) = self.try_match_directive(input) {
            return Ok(m);
        }
        if let Some(m) = self.try_match_instruction(input) {
            return Ok(m);
        }
        if let Some(m) = self.try_match_string(input) {
            return Ok(m);
        }
        if let Some(m) = self.try_match_symbol(input) {
            return Ok(m);
        }
        let mut idx_iter = (column + 1)..;
        let Some(idx) = idx_iter.next() else {
            unreachable!()
        };
        let source = SourceInfo::new(line_number, self.module.clone(), idx..idx + 1);
        // Handle miscellaneous characters.
-        match input.chars().next() {
+        if let Some(c) = input.chars().next() {
-            Some(',') => Ok((TokenType::Comma, 1)),
+            Err(AssembleError::new_source_error(
-            Some(c) => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
+                source,
-                IoError::new(
+                AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedChar(c)),
-                    IoErrorKind::InvalidData,
+            ))
-                    Some(format!("Unexpected character: '{c}'")),
+        } else {
-                ),
+            Err(AssembleError::new_source_error(
-            ))),
+                source,
-            None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
+                AssembleErrorKind::Tokeniser(TokeniserError::UnexpectedEndOfInput(
-                IoError::new(
+                    input.len(),
-                    IoErrorKind::InvalidData,
+                )),
-                    Some("Unexpected end of input".to_string()),
+            ))
                ),
            ))),
        }
    }
    /// Derives the module name from the final component of `self.path`.
    ///
    /// Returns an I/O `InvalidData` error when the path has no file-name
    /// component; the name itself is converted lossily.
    fn extract_module_name(&self) -> Result<String, AssembleError> {
        match self.path.file_name() {
            Some(file_name) => Ok(file_name.to_string_lossy().to_string()),
            None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
                IoError::new(
                    IoErrorKind::InvalidData,
                    Some(
                        "filename couldn't be extracted, is it valid UTF-8?".to_string(),
                    ),
                ),
            ))),
        }
    }
 }
@@ -1,10 +1,41 @@
 //! This module contains the error types for the tokeniser.
 /// Types of errors that may be returned during tokenisation.
 #[derive(Debug, Clone, Copy)]
 pub enum TokeniserError {
    /// An unexpected character was found in the source code.
    UnexpectedChar(char),
    /// An unterminated string literal was found. [`SourceInfo`] will be attached if this
    /// was returned.
    UnterminatedString,
    /// An invalid number format was encountered when parsing a literal value
    /// ([`TokenType::Immediate`]). The field is the offending literal text.
    InvalidNumber(&'static str),
    /// An unrecognised token was encountered.
    UnrecognisedToken,
    /// Returned when the tokeniser was asked to match a token but no input was left.
    /// This is a sign you will need to debug some [`Tokeniser`] code to ensure that
    /// [`Tokeniser::match_token`] is working as intended.
    ///
    /// The field is the length of the input that was handed to the matcher.
    UnexpectedEndOfInput(usize),
 }

 impl std::fmt::Display for TokeniserError {
    /// Writes a short, lowercase, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::UnexpectedChar(c) => {
                write!(f, "unexpected char '{c}' found in input")
            }
            Self::InvalidNumber(lit) => {
                write!(f, "invalid integer literal \"{lit}\" found in input")
            }
            Self::UnrecognisedToken => write!(f, "unrecognised token found in input"),
            Self::UnterminatedString => write!(f, "unterminated string literal"),
            Self::UnexpectedEndOfInput(line_length) => {
                write!(f, "unexpected end of input, input length: {line_length}")
            }
        }
    }
 }
@@ -0,0 +1,418 @@
 //! Unit tests for the tokenizer
 use common::prelude::Register;
 use crate::{
    model::module::Module,
    source::{
        opcode::Opcode,
        token::{Token, TokenType},
        token_info::RegisterToken,
        tokeniser::Tokeniser,
    },
 };
 use std::{path::PathBuf, sync::Arc};
 /// Builds a [`Tokeniser`] over `source`, backed by a module created for the
 /// placeholder path `test.dsa`.
 fn create_tokenizer_from_source(source: &str) -> Tokeniser {
    let module = Arc::new(
        Module::new(PathBuf::from("test.dsa")).expect("Cannot create module!"),
    );
    let bytes = source.as_bytes().to_vec();
    Tokeniser::from_data(bytes, module)
 }
 /// Runs the tokeniser over `source` and returns the resulting token stream.
 fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
    create_tokenizer_from_source(source).tokenise()
 }
 /// Borrows the [`TokenType`] of every token, preserving order.
 fn extract_token_types(tokens: &[Token]) -> Vec<&TokenType> {
    let mut kinds = Vec::with_capacity(tokens.len());
    for token in tokens {
        kinds.push(&token.token_type);
    }
    kinds
 }
 #[test]
 fn test_empty_source() {
    let tokens = tokenize_source("").expect("Failed to tokenize empty source");
    // Even empty input must yield a stream, and that stream must end in EOF.
    assert!(!tokens.is_empty());
    let last = tokens.last().expect("Expected at least one token");
    assert!(matches!(last.token_type, TokenType::Eof));
 }
 #[test]
 fn test_whitespace_only() {
    let tokens = tokenize_source("   \n  \n  ").expect("Failed to tokenize whitespace");
    // Blank lines still produce newline tokens, followed by EOF.
    let saw_newline = tokens
        .iter()
        .any(|t| matches!(t.token_type, TokenType::Newline));
    let saw_eof = tokens.iter().any(|t| matches!(t.token_type, TokenType::Eof));
    assert!(saw_newline);
    assert!(saw_eof);
 }
 #[test]
 fn test_single_instruction() {
    let tokens = tokenize_source("add").expect("Failed to tokenize instruction");
    // The stream must contain an instruction token somewhere...
    assert!(
        tokens
            .iter()
            .any(|t| matches!(t.token_type, TokenType::Instruction(_)))
    );
    // ...and specifically the first token must be the `add` instruction.
    let TokenType::Instruction(instr) = &tokens[0].token_type else {
        panic!("Expected instruction token");
    };
    assert_eq!(instr.to_string(), "add");
 }
 #[test]
 fn test_all_instructions() {
    // Each mnemonic must round-trip through the tokeniser unchanged.
    for instr in ["add", "sub", "jmp", "call", "return", "lli", "nop", "hlt"] {
        let tokens = tokenize_source(instr).expect("Failed to tokenize instruction");
        match &tokens[0].token_type {
            TokenType::Instruction(parsed_instr) => {
                assert_eq!(parsed_instr.to_string(), instr);
            }
            _ => panic!("Expected instruction token for {instr}"),
        }
    }
 }
 #[test]
 fn test_registers() {
    let cases = [("rg0", "rg0"), ("rgf", "rgf"), ("pcx", "pcx")];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize register");
        match &tokens[0].token_type {
            TokenType::Register(reg) => assert_eq!(reg.reg.to_string(), expected),
            _ => panic!("Expected register token for {input}"),
        }
    }
 }
 #[test]
 fn test_immediates() {
    // Decimal, hexadecimal, octal and binary spellings.
    let cases = [
        ("42", 42),
        ("0", 0),
        ("0xFF", 255),
        ("0x1234", 0x1234),
        ("0xDEADBEEF", 0xDEAD_BEEF),
        ("0o12", 0o12),
        ("0b101", 0b101),
    ];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize immediate");
        match &tokens[0].token_type {
            TokenType::Immediate(value) => assert_eq!(*value, expected),
            _ => panic!("Expected immediate token for {input}"),
        }
    }
 }
 #[test]
 fn test_labels() {
    // The trailing colon is part of the syntax, not of the label name.
    let cases = [
        ("loop_start:", "loop_start"),
        ("main:", "main"),
        ("_private_label:", "_private_label"),
        ("Label123:", "Label123"),
    ];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize label");
        match &tokens[0].token_type {
            TokenType::Label(label) => assert_eq!(label.name, expected),
            _ => panic!("Expected label token for {input}"),
        }
    }
 }
 #[test]
 fn test_directives() {
    let cases = [
        ("global", "global"),
        ("section", "section"),
        ("local", "local"),
    ];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize directive");
        match &tokens[0].token_type {
            TokenType::Directive(directive) => {
                assert_eq!(directive.directive, expected);
            }
            _ => panic!("Expected directive token for {input}"),
        }
    }
 }
 #[test]
 fn test_symbols() {
    // Plain identifiers that are neither mnemonics, registers nor directives.
    let cases = [
        ("my_symbol", "my_symbol"),
        ("_private", "_private"),
        ("Symbol123", "Symbol123"),
        ("camelCase", "camelCase"),
    ];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize symbol");
        match &tokens[0].token_type {
            TokenType::Symbol(symbol) => assert_eq!(symbol.name, expected),
            _ => panic!("Expected symbol token for {input}"),
        }
    }
 }
 #[test]
 fn test_complex_instruction_line() {
    let tokens = tokenize_source("addi rg1, rg2, 0xFF")
        .expect("Failed to tokenise complex instruction");
    // Should have: instruction, register, comma, register, comma, immediate, newline, EOF
    assert!(tokens.len() >= 6);
    let kind = |index: usize| &tokens[index].token_type;
    assert!(matches!(kind(0), TokenType::Instruction(_)));
    assert!(matches!(kind(1), TokenType::Register(_)));
    assert!(matches!(kind(2), TokenType::Comma));
    assert!(matches!(kind(3), TokenType::Register(_)));
    assert!(matches!(kind(4), TokenType::Comma));
    assert!(matches!(kind(5), TokenType::Immediate(_)));
 }
 #[test]
 fn test_multiline_with_comments() {
    // The trailing comment on the first line must vanish; every line, including
    // the last one, is closed by a newline token before EOF.
    const EXPECTED_TOKEN_TYPES: [TokenType; 11] = [
        TokenType::Instruction(Opcode::Add),
        TokenType::Register(RegisterToken::new(Register::Rg0)),
        TokenType::Comma,
        TokenType::Register(RegisterToken::new(Register::Rg1)),
        TokenType::Newline,
        TokenType::Instruction(Opcode::SubI),
        TokenType::Register(RegisterToken::new(Register::Rg2)),
        TokenType::Comma,
        TokenType::Immediate(10),
        TokenType::Newline,
        TokenType::Eof,
    ];
    const SOURCE: &str = "add rg0, rg1 // Another comment\n        subi rg2, 10";
    let tokens =
        tokenize_source(SOURCE).expect("Failed to tokenise source with comments");
    let token_types = extract_token_types(&tokens);
    assert_eq!(
        token_types.len(),
        EXPECTED_TOKEN_TYPES.len(),
        "{token_types:#?}"
    );
    // Lengths are equal at this point, so `zip` compares every position.
    for (expected, got) in EXPECTED_TOKEN_TYPES.iter().zip(token_types.iter()) {
        assert_eq!(expected, *got, "Expected {expected:?}, got {got:?}");
    }
 }
 #[test]
 fn test_tokenise_brainf_interpreter() {
    // Smoke test over a real program: tokenisation must succeed and the stream
    // must be terminated by EOF.
    const SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");
    let tokens =
        tokenize_source(SOURCE).expect("Failed to tokenise the brainfuck compiler!");
    let last = tokens.last().expect("Expected at least one token");
    assert!(matches!(last.token_type, TokenType::Eof));
 }
 #[test]
 fn test_string_literals() {
    // The surrounding quotes are stripped; the empty literal is allowed.
    let cases = [
        (r#""hello world""#, "hello world"),
        (
            r#""++++++++++++++++++++++++++++++++++++++++++++""#,
            "++++++++++++++++++++++++++++++++++++++++++++",
        ),
        (r#""Invalid Instruction!""#, "Invalid Instruction!"),
        (r#""""#, ""),
    ];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize string literal");
        match &tokens[0].token_type {
            TokenType::String(value) => assert_eq!(value, expected),
            _ => panic!("Expected string token for {input}"),
        }
    }
 }
 #[test]
 fn test_data_directives() {
    let cases = [("db", "db"), ("dw", "dw"), ("resb", "resb")];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize data declaration");
        match &tokens[0].token_type {
            TokenType::Directive(decl) => assert_eq!(decl.directive, expected),
            _ => panic!("Expected data declaration token for {input}"),
        }
    }
 }
 #[test]
 fn test_include_directive() {
    let tokens = tokenize_source(r#"include print "./lib/print.dsa""#)
        .expect("Failed to tokenize include directive");
    // Expected shape: directive, module symbol, path string.
    assert!(tokens.len() >= 3);
    let kind = |index: usize| &tokens[index].token_type;
    assert!(matches!(kind(0), TokenType::Directive(_)));
    assert!(matches!(kind(1), TokenType::Symbol(_)));
    assert!(matches!(kind(2), TokenType::String(_)));
 }
 #[test]
 fn test_hex_addresses() {
    let cases = [("0x10000", 0x10000), ("0x30000", 0x30000)];
    for &(input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize hex address");
        match &tokens[0].token_type {
            TokenType::Immediate(value) => assert_eq!(*value, expected),
            _ => panic!("Expected immediate token for {input}"),
        }
    }
 }
 #[test]
 fn test_memory_operations() {
    let tokens =
        tokenize_source("ldw rg1, rg2").expect("Failed to tokenize memory operation");
    // Expected shape: instruction, register, comma, register.
    assert!(tokens.len() >= 4);
    let kind = |index: usize| &tokens[index].token_type;
    assert!(matches!(kind(0), TokenType::Instruction(_)));
    assert!(matches!(kind(1), TokenType::Register(_)));
    assert!(matches!(kind(2), TokenType::Comma));
    assert!(matches!(kind(3), TokenType::Register(_)));
 }
 #[test]
 fn test_function_calls() {
    let tokens =
        tokenize_source("call print::print").expect("Failed to tokenize function call");
    assert!(tokens.len() >= 2);
    assert!(matches!(tokens[0].token_type, TokenType::Instruction(_)));
    // How `::` is split is deliberately left unspecified here; only the basic
    // structure (some symbol token is present) is checked.
    let symbol_count = tokens
        .iter()
        .filter(|t| matches!(t.token_type, TokenType::Symbol(_)))
        .count();
    assert!(symbol_count > 0);
 }
 #[test]
 fn test_comments_are_ignored() {
    let tokens = tokenize_source("add rg0, rg1 // this is a comment\nsub rg2, rg3")
        .expect("Failed to tokenize with comments");
    // The comment text must not contribute tokens: exactly one instruction per line.
    let mut instruction_count = 0;
    for token in &tokens {
        if matches!(token.token_type, TokenType::Instruction(_)) {
            instruction_count += 1;
        }
    }
    assert_eq!(instruction_count, 2);
 }
 #[test]
 fn test_newline_always_present() {
    // The input has no trailing line break; the tokeniser must add a newline token.
    let tokens =
        tokenize_source("add rg0, rg1").expect("Failed to tokenize without newline");
    let newline_count = tokens
        .iter()
        .filter(|t| matches!(t.token_type, TokenType::Newline))
        .count();
    assert!(
        newline_count > 0,
        "Expected newline to be added even when missing from input"
    );
    // EOF should be last.
    let last = tokens.last().expect("Expected at least one token");
    assert!(matches!(last.token_type, TokenType::Eof));
 }
 #[test]
 fn test_complex_branching_code() {
    let source =
        "\n    cmp rg3, rg8\n    jeq increment\n    cmp rg3, rg9\n    jeq decrement";
    let tokens = tokenize_source(source).expect("Failed to tokenize branching code");
    // Tally instructions and symbols in a single pass over the stream.
    let (mut instruction_count, mut symbol_count) = (0, 0);
    for token in &tokens {
        if matches!(token.token_type, TokenType::Instruction(_)) {
            instruction_count += 1;
        } else if matches!(token.token_type, TokenType::Symbol(_)) {
            symbol_count += 1;
        }
    }
    assert_eq!(instruction_count, 4);
    assert_eq!(symbol_count, 2); // increment and decrement labels
 }
 #[test]
 fn test_stack_operations() {
    let tokens = tokenize_source("push rg2\npop zero\npusha 2\npopa 2")
        .expect("Failed to tokenize stack operations");
    // One instruction per line: push, pop, pusha, popa.
    let mut instruction_count = 0;
    for token in &tokens {
        if matches!(token.token_type, TokenType::Instruction(_)) {
            instruction_count += 1;
        }
    }
    assert_eq!(instruction_count, 4);
 }
@@ -1,9 +1,10 @@
 use crate::{instructions::encode::Encode, prelude::*};
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Default)]
 /// Kinds of interrupt; [`Interrupt::HardFault`] is the [`Default`] variant.
 pub enum Interrupt {
    Software(u8),
    Breakpoint,
    #[default]
    HardFault,
 }
@@ -1,4 +1,5 @@
-//! Various types of arguments that instructions can take, alongside encoding and decoding logic.
+//! Various types of arguments that instructions can take, alongside encoding and decoding
 //! logic.
 use crate::{
    instructions::{RegisterParseError, encode::Encode},
@@ -35,18 +36,20 @@ impl std::fmt::Display for ArgsDecodeError {
 impl std::error::Error for ArgsDecodeError {}
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
 /// Used by instructions with 2 registers and an immediate argument.
 pub struct ITypeArgs {
    pub immediate: u16,
    pub r1: Register,
-    /// May not actually be used by some instructions taking an immediate e.g. LUI. This is solved by making the constructor take Options.
+    /// May not actually be used by some instructions taking an immediate e.g. LUI. This
    /// is solved by making the constructor take Options.
    pub r2: Register,
 }
 impl ITypeArgs {
    #[must_use]
-    /// Creates a new [`ITypeArgs`]. If r1 or r2 is unset, they will be replaced with [`Register::NoReg`].
+    /// Creates a new [`ITypeArgs`]. If r1 or r2 is unset, they will be replaced with
    /// [`Register::NoReg`].
    pub fn new(immediate: u16, r1: Option<Register>, r2: Option<Register>) -> Self {
        let r1 = r1.unwrap_or_default();
        let r2 = r2.unwrap_or_default();
@@ -56,8 +59,8 @@ impl ITypeArgs {
 }
 impl Encode for ITypeArgs {
-    /// Encodes an I-type instruction from its fields. These must have some unused high-order
+    /// Encodes an I-type instruction from its fields. These must have some unused
-    /// bits set to 0 else the bit shifting logic gets fucked.
+    /// high-order bits set to 0 else the bit shifting logic gets fucked.
    fn encode(self, opcode: u8) -> u32 {
        let opcode = u32::from(opcode);
        let r1 = self.r1 as u32;
@@ -84,7 +87,7 @@ impl TryFrom<u32> for ITypeArgs {
 }
 /// Used by instructions not using immediates (besides 5 bit shift values).
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
 pub struct RTypeArgs {
    pub sr1: Register,
    pub sr2: Register,
@@ -95,7 +98,8 @@ pub struct RTypeArgs {
 impl RTypeArgs {
    #[must_use]
-    /// Creates a new [`RTypeArgs`]. If any registers are unset, they will be replaced with [`Register::NoReg`]. If `shamt` is unset, it will be set to 0.
+    /// Creates a new [`RTypeArgs`]. If any registers are unset, they will be replaced
    /// with [`Register::NoReg`]. If `shamt` is unset, it will be set to 0.
    pub fn new(
        sr1: Option<Register>,
        sr2: Option<Register>,
@@ -122,7 +126,8 @@ impl Encode for RTypeArgs {
    ///
    /// # Arguments
    ///
-    /// - `shamt`: The amount to shift value (used only in shift instructions, otherwise 0).
+    /// - `shamt`: The amount to shift value (used only in shift instructions, otherwise
    ///   0).
    fn encode(self, opcode: u8) -> u32 {
        let opcode = u32::from(opcode);
        let sr1 = self.sr1 as u32;
@@ -39,7 +39,9 @@ impl std::fmt::Display for InstructionDecodeError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidOpcode(code) => write!(f, "invalid opcode, got {code:x}")?,
-            Self::InvalidArgument(err) => write!(f, "invalid arguments, got an error {err}")?,
+            Self::InvalidArgument(err) => {
                write!(f, "invalid arguments, got an error {err}")?;
            }
        }
        Ok(())
@@ -160,11 +160,12 @@ impl CodeEditor {
    /// Stick to bottom
    /// The scroll handle will stick to the bottom position even while the content size
-    /// changes dynamically. This can be useful to simulate terminal UIs or log/info scrollers.
+    /// changes dynamically. This can be useful to simulate terminal UIs or log/info
-    /// The scroll handle remains stuck until user manually changes position. Once "unstuck"
+    /// scrollers. The scroll handle remains stuck until user manually changes
-    /// it will remain focused on whatever content viewport the user left it on. If the scroll
+    /// position. Once "unstuck" it will remain focused on whatever content viewport
-    /// handle is dragged to the bottom it will again become stuck and remain there until manually
+    /// the user left it on. If the scroll handle is dragged to the bottom it will
-    /// pulled from the end position.
+    /// again become stuck and remain there until manually pulled from the end
    /// position.
    ///
    /// **Default: false**
    pub fn stick_to_bottom(self, stick_to_bottom: bool) -> Self {
@@ -5,7 +5,7 @@
 include print "./lib/print.dsa"
 // "print hello world"
-db program: "++++++++++++++++++++++++++++++++++++++++++++
+db program "++++++++++++++++++++++++++++++++++++++++++++
 >++++++++++++++++++++++++++++++++
 >++++++++++++++++
 >
@@ -35,10 +35,10 @@ db program: "++++++++++++++++++++++++++++++++++++++++++++
 ]
 <<++..."
-db error: "Invalid Instruction!"
+db error "Invalid Instruction!"
-dw stack: 0x10000
+dw stack 0x10000
-dw input: 0x30000
+dw input 0x30000
-resb data: 1024
+resb data 1024
 // set up a stack so we can call functions
 _init_stack:
Author	SHA1	Message	Date
nullndvoid	b91207bfde	misc: update release profile for optimised builds	2025-06-29 04:33:24 +01:00
nullndvoid	4ac630ba02	misc: add 'profiling' profile.	2025-06-29 04:10:54 +01:00
nullndvoid	85e3d443cc	assembler: small misc updates, I am tired	2025-06-29 03:52:53 +01:00
nullndvoid	0528768947	fmt: ran 'cargo fmt'.	2025-06-29 01:43:31 +01:00
nullndvoid	21582f1297	tokeniser/syntax: (db varname: -> db varname) dropped colon, updated tests.	2025-06-29 00:22:10 +01:00
nullndvoid	6ceb35d439	tokeniser: bugfixes to comma handling, regexes TODO: Verify output is as expected, perhaps I can dump to file and compare token stream with known valid one? Will add some extra tests of course!	2025-06-29 00:11:36 +01:00
nullndvoid	8bb252e941	tokeniser: return TokeniserErrors where relevant. The UnexpectedEndOfInput case is a little vague.	2025-06-28 23:35:55 +01:00
nullndvoid	5317988fdd	assembler: SourceInfo doc comment added to self.module.	2025-06-28 23:14:30 +01:00
nullndvoid	d15e00c272	tokeniser: refactor to store Module directly in Tokeniser We hereby avoid making extra copies of the PathBuf. - Also updated tests to match the new API	2025-06-28 23:13:44 +01:00
nullndvoid	a65dca6c5c	tokeniser: errors now print with SourceInfo if added	2025-06-28 23:11:24 +01:00
nullndvoid	b8be1bd95f	tokeniser: add some actual tokeniser errors TODO: Return these lol	2025-06-28 23:05:07 +01:00
nullndvoid	f42c6d4095	assembler: refactor error handling and use ModuleId::new constructor	2025-06-28 23:03:13 +01:00
nullndvoid	eebea82c4a	assembler: start tokenising multiline strings (WIP)	2025-06-26 17:42:48 +01:00
nullndvoid	ed4fcc8495	assembler: enhance error handling and tokenization logic	2025-06-26 17:00:14 +01:00
nullndvoid	40f8b1d57b	assembler: fix clippy warnings	2025-06-25 19:49:20 +01:00
nullndvoid	68e459f32b	assembler: use common to match registers	2025-06-25 19:29:56 +01:00
nullndvoid	d9807b5b36	assembler: update tokeniser to allow extra prefixes and separators (0xDEAD_BEEF)	2025-06-25 19:15:51 +01:00