Compare commits

6 Commits

Author SHA1 Message Date
nullndvoid 7cb7525484 assembler: remove some current dead code 2025-06-25 17:56:45 +01:00
nullndvoid 7565374d5b assembler: Tokeniser updates, Compiler Engine is back finally 2025-06-25 17:55:34 +01:00
nullndvoid 9b9e153500 assembler: wrap Module's with Arc and update Tokeniser (still WIP)
Implements complete tokenizer with Arc-wrapped modules

Enhances module handling by wrapping Module instances in Arc for thread-safe sharing across the assembler pipeline.

Implements full tokenization logic with pattern matching for all token types including labels, registers, immediates, directives, instructions, symbols, and strings.

Adds comma token support and proper EOF handling to complete the lexical analysis phase.

Generated AI slop commit message, may not be super accurate or it may be a bit too serious lol.
2025-06-25 17:35:03 +01:00
nullndvoid 27267e3daa assembler: use smart pointer for modules since sourceinfo gets copy 2025-06-25 17:03:48 +01:00
nullndvoid fb84a6d3c3 assembler: clippy lints, better error formatting
Adds regex dependency and enhances error handling system

Introduces comprehensive error type hierarchy with specific variants for parser, symbol, codegen, threading, and IO errors to improve error reporting and debugging capabilities.

Adds regex crate for pattern matching in tokenizer implementation with pre-compiled patterns for labels, registers, immediates, directives, instructions, and symbols.

Enhances source info functionality with context printing and error underlining similar to compiler diagnostics.

Implements better error conversions and threading error handling for lock failures and panics.
2025-06-25 16:50:17 +01:00
nullndvoid 4e5db58a84 assembler: start refactoring/rewriting tokeniser 2025-06-25 14:48:45 +01:00
19 changed files with 1172 additions and 83 deletions
Generated
+39
View File
@@ -129,6 +129,15 @@ dependencies = [
"zerocopy", "zerocopy",
] ]
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "android-activity" name = "android-activity"
version = "0.6.0" version = "0.6.0"
@@ -269,6 +278,7 @@ dependencies = [
"clap", "clap",
"common", "common",
"num_cpus", "num_cpus",
"regex",
"threadpool", "threadpool",
"uuid", "uuid",
] ]
@@ -2691,6 +2701,35 @@ dependencies = [
"thiserror 2.0.12", "thiserror 2.0.12",
] ]
[[package]]
name = "regex"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]] [[package]]
name = "renderdoc-sys" name = "renderdoc-sys"
version = "1.1.0" version = "1.1.0"
+1
View File
@@ -16,5 +16,6 @@ path = "src/lib.rs"
clap = { version = "4.5.40", features = ["derive"] } clap = { version = "4.5.40", features = ["derive"] }
common = { path = "../common" } common = { path = "../common" }
num_cpus = "1.17.0" num_cpus = "1.17.0"
regex = "1.11.1"
threadpool = "1.8.1" threadpool = "1.8.1"
uuid = { version = "1.17.0", features = ["v4"] } uuid = { version = "1.17.0", features = ["v4"] }
+375
View File
@@ -0,0 +1,375 @@
//! Simple compiler engine that orchestrates the entire compilation process.
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::path::Path;
use std::sync::mpsc;
use std::thread;
use crate::{
context::AssemblerContext,
error::AssembleError,
model::module::ModuleId,
source::{token::Token, tokeniser::Tokeniser},
};
use common::instructions::Instruction;
/// Error type for the `CompilerEngine`.
///
/// Every failure surfaced by the engine's public API is reported through one of
/// these variants.
#[derive(Debug)]
pub enum EngineError {
    /// Assembly error during compilation; wraps the underlying [`AssembleError`].
    Assembly(AssembleError),
    /// Channel communication error, carried as a human-readable message.
    Channel(String),
    /// Other generic error, carried as a human-readable message.
    Other(String),
}
/// Renders the error as `<category>: <detail>`.
impl fmt::Display for EngineError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Pick the category prefix first...
        let label = match self {
            Self::Assembly(_) => "Assembly error",
            Self::Channel(_) => "Channel error",
            Self::Other(_) => "Engine error",
        };
        // ...then the payload, which is displayed verbatim after the prefix.
        let detail: &dyn fmt::Display = match self {
            Self::Assembly(inner) => inner,
            Self::Channel(text) | Self::Other(text) => text,
        };
        write!(f, "{label}: {detail}")
    }
}
impl std::error::Error for EngineError {
    /// Only the `Assembly` variant wraps another error; the message-only variants
    /// have no underlying source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        if let Self::Assembly(inner) = self {
            Some(inner)
        } else {
            None
        }
    }
}
// Convert from AssembleError
impl From<AssembleError> for EngineError {
fn from(error: AssembleError) -> Self {
Self::Assembly(error)
}
}
// Convert from mpsc::SendError
impl<T> From<mpsc::SendError<T>> for EngineError {
fn from(error: mpsc::SendError<T>) -> Self {
Self::Channel(format!("Send error: {error}"))
}
}
// Convert from mpsc::RecvError
impl From<mpsc::RecvError> for EngineError {
fn from(error: mpsc::RecvError) -> Self {
Self::Channel(format!("Receive error: {error}"))
}
}
// Convert from mpsc::TryRecvError
impl From<mpsc::TryRecvError> for EngineError {
fn from(error: mpsc::TryRecvError) -> Self {
Self::Channel(format!("Try receive error: {error}"))
}
}
// Convert from String for generic errors
impl From<String> for EngineError {
fn from(error: String) -> Self {
Self::Other(error)
}
}
// Convert from &str for convenience
impl From<&str> for EngineError {
fn from(error: &str) -> Self {
Self::Other(error.to_string())
}
}
/// Simple compiler engine that orchestrates the entire compilation process.
///
/// Compilation runs on a background thread; the outcome is handed back over an
/// `mpsc` channel owned by the engine.
pub struct CompilerEngine {
    /// Sending half of the result channel; cloned into each compilation thread.
    result_tx: mpsc::Sender<Result<Vec<Instruction>, EngineError>>,
    /// Receiving half of the result channel, populated by `new`.
    result_rx: Option<mpsc::Receiver<Result<Vec<Instruction>, EngineError>>>,
    /// `true` between `start_compilation` and the moment a result is collected.
    is_running: bool,
}
impl CompilerEngine {
    /// Create a new compiler engine with a fresh result channel and no compilation
    /// in progress.
    #[must_use]
    pub fn new() -> Self {
        let (tx, rx) = mpsc::channel();
        Self {
            result_tx: tx,
            result_rx: Some(rx),
            is_running: false,
        }
    }

    /// Start the compilation process in a separate thread.
    ///
    /// Does nothing if a compilation is already running. The outcome is collected
    /// with [`Self::try_get_result`] or [`Self::wait_for_result`].
    pub fn start_compilation<P: AsRef<Path>>(&mut self, src: P) {
        if self.is_running {
            return;
        }

        let src = src.as_ref().to_path_buf();
        let tx = self.result_tx.clone();
        thread::spawn(move || {
            let result = assemble(&src).map_err(EngineError::from);
            let _ = tx.send(result); // Ignore send errors if receiver is dropped
        });
        self.is_running = true;
    }

    /// Check if compilation is complete and get the result.
    ///
    /// Returns `None` when no compilation is running or when the result is not
    /// ready yet; otherwise returns the finished result and marks the engine idle.
    ///
    /// # Panics
    ///
    /// Panics if the receiver is missing, which would be an internal invariant
    /// violation: it is created in [`Self::new`] and never removed.
    pub fn try_get_result(&mut self) -> Option<Result<Vec<Instruction>, EngineError>> {
        if !self.is_running {
            return None;
        }

        match self
            .result_rx
            .as_ref()
            .expect("result_rx should be Some while compilation is running")
            .try_recv()
        {
            Ok(result) => {
                self.is_running = false;
                Some(result)
            }
            Err(mpsc::TryRecvError::Empty) => None,
            Err(mpsc::TryRecvError::Disconnected) => {
                self.is_running = false;
                Some(Err(EngineError::Channel(
                    "Compilation thread disconnected".to_string(),
                )))
            }
        }
    }

    /// Block until compilation is complete and return the result.
    ///
    /// # Errors
    ///
    /// Returns [`EngineError::Other`] when no compilation is in progress,
    /// [`EngineError::Channel`] when the channel fails, or the compilation's own
    /// error.
    ///
    /// # Panics
    ///
    /// Panics if the receiver is missing (internal invariant violation).
    pub fn wait_for_result(&mut self) -> Result<Vec<Instruction>, EngineError> {
        if !self.is_running {
            return Err(EngineError::Other("No compilation in progress".to_string()));
        }

        // Borrow the receiver rather than taking it: the engine must stay usable
        // for later compilations and for `try_get_result`.
        let received = self
            .result_rx
            .as_ref()
            .expect("result_rx should be Some while waiting for compilation result")
            .recv();

        // This compilation attempt is over whether or not a result arrived.
        self.is_running = false;

        // NOTE(review): the engine keeps its own `Sender`, so the channel never
        // disconnects while the engine is alive; if the worker thread panics before
        // sending, `recv` above blocks indefinitely.
        received.map_err(EngineError::from)?
    }

    /// Add a source file to be compiled (for compatibility with old interface).
    ///
    /// Currently this only checks that the file exists; nothing is stored.
    ///
    /// # Errors
    ///
    /// Returns an I/O "not found" assembly error when `path` does not exist.
    pub fn add_source_file<P: AsRef<Path>>(
        &mut self,
        path: P,
    ) -> Result<(), EngineError> {
        let path = path.as_ref().to_path_buf();

        // Verify file exists
        if !path.exists() {
            return Err(EngineError::Assembly(AssembleError::new_other_error(
                crate::error::AssembleErrorKind::Io(crate::error::IoError::new(
                    crate::error::IoErrorKind::NotFound,
                    Some(format!("Source file not found: {}", path.display())),
                )),
            )));
        }

        // For now, just validate the file exists
        // TODO: Could store multiple files for batch compilation
        Ok(())
    }

    /// Compile all added source files (synchronous version).
    ///
    /// Placeholder kept for the old interface: always succeeds with an empty
    /// [`CompileResult`].
    ///
    /// # Errors
    ///
    /// Never fails at present.
    pub fn compile(&mut self) -> Result<CompileResult, EngineError> {
        // This is a placeholder that matches the old interface
        // For now, return empty result since we don't have a specific file to compile
        Ok(CompileResult {
            modules: Vec::new(),
            tokens: HashMap::new(),
        })
    }

    /// Get access to the assembler context (placeholder).
    ///
    /// # Errors
    ///
    /// Always returns [`EngineError::Other`]: the context lives on the compilation
    /// thread and is not exposed yet.
    pub fn context(&self) -> Result<&AssemblerContext, EngineError> {
        // For now, return an error since we're using the threaded approach
        // TODO: Integrate context properly when we have more compilation phases
        Err(EngineError::Other(
            "Context not available in threaded mode".to_string(),
        ))
    }
}
impl Default for CompilerEngine {
fn default() -> Self {
Self::new()
}
}
/// Main assembly function that orchestrates the compilation process
fn assemble(src: &Path) -> Result<Vec<Instruction>, AssembleError> {
// Verify the file exists
if !src.exists() {
return Err(AssembleError::new_other_error(
crate::error::AssembleErrorKind::Io(crate::error::IoError::new(
crate::error::IoErrorKind::NotFound,
Some(format!("Source file not found: {}", src.display())),
)),
));
}
let mut modules = HashSet::new();
let mut all_tokens = HashMap::new();
let mut module_ids = Vec::new();
// Create a new assembler context for this compilation
let context = AssemblerContext::new();
// Process the main file and its dependencies
prepare_dependency(
src,
&mut modules,
&mut all_tokens,
&mut module_ids,
&context,
)?;
// Phase 2: Parse tokens into AST (placeholder for now)
// TODO: Add parser here when implemented
println!("Phase 2: Parsing {} modules...", module_ids.len());
// Phase 3: Symbol resolution (placeholder for now)
// TODO: Add symbol resolution here when implemented
println!("Phase 3: Resolving symbols...");
// Phase 4: Code generation (placeholder for now)
// TODO: Add code generation here when implemented
println!("Phase 4: Generating code...");
// For now, return empty instructions since we don't have the full pipeline yet
Ok(Vec::new())
}
/// Prepare a dependency (file) for compilation
fn prepare_dependency(
path: &Path,
modules: &mut HashSet<u64>,
all_tokens: &mut HashMap<ModuleId, Vec<Token>>,
module_ids: &mut Vec<ModuleId>,
context: &AssemblerContext,
) -> Result<(), AssembleError> {
let filename = path.file_name().and_then(|n| n.to_str()).ok_or_else(|| {
AssembleError::new_other_error(crate::error::AssembleErrorKind::Io(
crate::error::IoError::new(
crate::error::IoErrorKind::InvalidData,
Some("Failed to get file name from path".to_string()),
),
))
})?;
// Calculate a simple hash for the file (similar to quick_hash)
let file_hash = calculate_file_hash(path);
// Skip if we've already processed this module
if modules.contains(&file_hash) {
return Ok(());
}
modules.insert(file_hash);
if let Ok(canonical_path) = path.canonicalize() {
println!("Building {} [{}]", filename, canonical_path.display());
}
// Phase 1: Tokenize the file
println!("Tokenising {filename}");
let tokeniser = Tokeniser::new(path)?;
let tokens = tokeniser.tokenise(context)?;
// Get the module ID that was registered during tokenization
let module_id = get_module_id_for_file(path, context)?;
all_tokens.insert(module_id, tokens);
module_ids.push(module_id);
// TODO: Parse tokens to find dependencies (.include directives, etc.)
// For now, we'll just process the single file
println!("Resolving dependencies for {filename}");
Ok(())
}
/// Calculate a simple hash for a file path (similar to the old `quick_hash`).
///
/// The canonical form of `path` is hashed when it can be resolved; otherwise the
/// path is hashed exactly as given.
fn calculate_file_hash(path: &Path) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    // Fall back to the raw path when canonicalisation fails (e.g. missing file).
    let key = path
        .canonicalize()
        .unwrap_or_else(|_| path.to_path_buf());

    let mut state = DefaultHasher::new();
    key.hash(&mut state);
    state.finish()
}
/// Get the module ID for a given source file
fn get_module_id_for_file(
file_path: &Path,
context: &AssemblerContext,
) -> Result<ModuleId, AssembleError> {
{
let registry = context.module_registry.read()?;
// Find module by path
for module in registry.modules() {
if module.path == file_path {
return Ok(module.id);
}
}
}
Err(AssembleError::new_other_error(
crate::error::AssembleErrorKind::Io(crate::error::IoError::new(
crate::error::IoErrorKind::NotFound,
Some(format!(
"Module not found for file: {}",
file_path.display()
)),
)),
))
}
/// Result of compilation. This is useless at present but compiles.
///
/// `CompilerEngine::compile` currently always builds it with both collections
/// empty.
#[derive(Debug)]
pub struct CompileResult {
    /// IDs of the modules that were compiled.
    pub modules: Vec<ModuleId>,
    /// Token stream of each module, keyed by module ID.
    pub tokens: HashMap<ModuleId, Vec<Token>>,
}
impl CompileResult {
    /// Look up the token stream recorded for `module_id`, if there is one.
    #[must_use]
    pub fn get_tokens(&self, module_id: &ModuleId) -> Option<&Vec<Token>> {
        let by_module = &self.tokens;
        by_module.get(module_id)
    }

    /// Borrow the IDs of all modules in this result.
    #[must_use]
    pub fn module_ids(&self) -> &[ModuleId] {
        self.modules.as_slice()
    }

    /// Count the tokens of every module combined.
    #[must_use]
    pub fn total_tokens(&self) -> usize {
        self.tokens.values().map(|stream| stream.len()).sum()
    }
}
+158 -11
View File
@@ -51,20 +51,173 @@ impl Display for AssembleError {
/// Marker trait. /// Marker trait.
impl std::error::Error for AssembleError {} impl std::error::Error for AssembleError {}
/// Different types of errors that may occur when assembling a set of input source files. #[derive(Debug, Clone)]
#[non_exhaustive] #[non_exhaustive]
#[derive(Debug)]
pub enum AssembleErrorKind { pub enum AssembleErrorKind {
/// Usually unexpected I/O errors. Not normally recoverable. /// Usually unexpected I/O errors. Not normally recoverable.
IO(std::io::Error), Io(IoError),
/// Errors emitted from the [`Tokeniser`]. /// Errors emitted from the [`Tokeniser`].
Tokenise(TokeniserError), Tokeniser(TokeniserError),
Parser(ParserError),
Symbol(SymbolError),
Codegen(CodegenError),
Threading(ThreadingError),
/// Returned for code where the functionality has not yet been implemented but we
/// don't want the program to panic.
Unimplemented(&'static str),
}
/// A parser failure paired with the source location it refers to.
#[derive(Debug, Clone)]
pub struct ParserError {
    /// What went wrong.
    error_type: ParserErrorType,
    /// Where in the source the problem was found.
    source_info: SourceInfo,
}
/// The categories of failure the parser can report.
#[derive(Debug, Clone)]
pub enum ParserErrorType {
    UnexpectedToken,
    MissingOperand,
    InvalidInstruction,
    MissingLabel,
    DuplicateLabel,
}

/// Renders each category as a short lowercase phrase.
impl Display for ParserErrorType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let phrase = match self {
            Self::UnexpectedToken => "unexpected token",
            Self::MissingOperand => "missing operand",
            Self::InvalidInstruction => "invalid instruction",
            Self::MissingLabel => "missing label",
            Self::DuplicateLabel => "duplicate label",
        };
        f.write_str(phrase)
    }
}
/// Writes a one-line summary to the formatter, then asks the source info to
/// print the offending line with an underline.
impl Display for ParserError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            error_type,
            source_info,
        } = self;

        // TODO: Print the path/to/filename.dsa:line_no, column col_no.
        write!(f, "Parser error, {} at {}", error_type, source_info)?;

        // Prints out the context for our error. A failure here is reported
        // best-effort and then surfaced as a formatting error.
        if let Err(e) = source_info.print_context_with_underline() {
            _ = writeln!(f, "Print context error: {e}");
            return Err(std::fmt::Error);
        }

        Ok(())
    }
}
/// Failures that can occur while working with symbols.
#[derive(Debug, Clone)]
pub enum SymbolError {
    Undefined,
    Duplicate,
    CircularDependency,
    InvalidReference,
}

/// Renders each failure as a short lowercase phrase.
impl Display for SymbolError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let phrase = match self {
            Self::Undefined => "undefined symbol",
            Self::Duplicate => "duplicate symbol",
            Self::CircularDependency => "circular dependency",
            Self::InvalidReference => "invalid reference",
        };
        f.write_str(phrase)
    }
}
/// Failures that can occur during code generation.
#[derive(Debug, Clone)]
pub enum CodegenError {
    InvalidOperand,
    OutOfRange,
    UnsupportedInstruction,
}

/// Renders each failure as a short lowercase phrase.
impl Display for CodegenError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let phrase = match self {
            Self::InvalidOperand => "invalid operand",
            Self::OutOfRange => "out of range",
            Self::UnsupportedInstruction => "unsupported instruction",
        };
        f.write_str(phrase)
    }
}
/// Failures related to locks and worker threads.
#[derive(Debug, Clone)]
pub enum ThreadingError {
    LockFailed,
    ThreadPanic,
}

/// Renders each failure as a short lowercase phrase.
impl Display for ThreadingError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let phrase = match self {
            Self::LockFailed => "lock failed",
            Self::ThreadPanic => "thread panic",
        };
        f.write_str(phrase)
    }
}
/// A cloneable description of an I/O failure: a coarse category plus an optional
/// free-form message.
#[derive(Debug, Clone)]
pub struct IoError {
    /// Optional extra detail about the failure.
    msg: Option<String>,
    /// Coarse category of the failure.
    kind: IoErrorKind,
}

impl IoError {
    /// Builds an [`IoError`] from its category and optional message.
    #[must_use]
    pub const fn new(kind: IoErrorKind, msg: Option<String>) -> Self {
        Self { msg, kind }
    }
}
/// Coarse categories of I/O failure.
#[derive(Debug, Clone)]
pub enum IoErrorKind {
    NotFound,
    PermissionDenied,
    InvalidData,
    Other,
}

/// Renders each category as a short lowercase phrase.
impl std::fmt::Display for IoErrorKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let phrase = match self {
            Self::NotFound => "file not found",
            Self::PermissionDenied => "permission denied",
            Self::InvalidData => "invalid data",
            Self::Other => "other I/O error",
        };
        f.write_str(phrase)
    }
}
impl std::fmt::Display for IoError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.kind)?;
if let Some(msg) = &self.msg {
write!(f, ", \"{msg}\"")?;
}
Ok(())
}
} }
impl Display for AssembleErrorKind { impl Display for AssembleErrorKind {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self { match self {
Self::Tokenise(why) => write!(f, "tokeniser error: {why}"), Self::Tokeniser(why) => write!(f, "tokeniser error: {why}"),
Self::Unimplemented(why) => write!(f, "used unimplemented feature: {why}"),
Self::Io(why) => write!(f, "problem occurred with I/O: {why}"),
#[allow(unreachable_patterns)]
_ => write!( _ => write!(
f, f,
"unhandled error type in Display implementation! See error.rs!" "unhandled error type in Display implementation! See error.rs!"
@@ -73,10 +226,4 @@ impl Display for AssembleErrorKind {
} }
} }
impl From<std::io::Error> for AssembleErrorKind {
fn from(err: std::io::Error) -> Self {
Self::IO(err)
}
}
pub mod conversions; pub mod conversions;
+62 -2
View File
@@ -1,7 +1,67 @@
use crate::error::AssembleError; use std::{
io::ErrorKind,
sync::{PoisonError, RwLockReadGuard, RwLockWriteGuard},
};
use crate::error::{AssembleError, IoError, IoErrorKind};
use super::{AssembleErrorKind, ThreadingError};
/// Maps a standard I/O error onto the crate's [`IoError`], keeping its message
/// and collapsing unrecognised kinds into [`IoErrorKind::Other`].
impl From<std::io::Error> for IoError {
    fn from(err: std::io::Error) -> Self {
        let description = err.to_string();
        let category = match err.kind() {
            ErrorKind::NotFound => IoErrorKind::NotFound,
            ErrorKind::PermissionDenied => IoErrorKind::PermissionDenied,
            ErrorKind::InvalidData => IoErrorKind::InvalidData,
            _ => IoErrorKind::Other,
        };
        Self::new(category, Some(description))
    }
}
impl From<std::io::Error> for AssembleError { impl From<std::io::Error> for AssembleError {
fn from(err: std::io::Error) -> Self { fn from(err: std::io::Error) -> Self {
Self::new_other_error(err.into()) Self::new_other_error(AssembleErrorKind::Io(err.into()))
}
}
// TODO: Maybe attempt recovery? To be honest we don't want any threads to panic at all,
// or we want them all to panic spectacularly.
impl<T> From<PoisonError<RwLockReadGuard<'_, T>>> for AssembleError {
fn from(err: PoisonError<RwLockReadGuard<'_, T>>) -> Self {
Self::new_other_error(AssembleErrorKind::Threading(err.into()))
}
}
impl<T> From<PoisonError<RwLockReadGuard<'_, T>>> for ThreadingError {
fn from(_err: PoisonError<RwLockReadGuard<'_, T>>) -> Self {
Self::LockFailed
}
}
impl<T> From<PoisonError<RwLockWriteGuard<'_, T>>> for AssembleError {
fn from(err: PoisonError<RwLockWriteGuard<'_, T>>) -> Self {
Self::new_other_error(AssembleErrorKind::Threading(err.into()))
}
}
impl<T> From<PoisonError<RwLockWriteGuard<'_, T>>> for ThreadingError {
fn from(_err: PoisonError<RwLockWriteGuard<'_, T>>) -> Self {
Self::LockFailed
}
}
/// A formatting failure is reported as an I/O error of kind `Other` that carries
/// the formatter's message.
impl From<std::fmt::Error> for AssembleError {
    fn from(err: std::fmt::Error) -> Self {
        let io_error = IoError::new(IoErrorKind::Other, Some(err.to_string()));
        Self::from(io_error)
    }
}
impl From<IoError> for AssembleError {
fn from(err: IoError) -> Self {
Self::new_other_error(AssembleErrorKind::Io(err))
} }
} }
View File
+2 -6
View File
@@ -13,8 +13,8 @@
)] )]
pub mod args; pub mod args;
pub mod image_builder;
// pub mod tooling; // pub mod tooling;
pub mod compiler_engine;
pub mod context; pub mod context;
pub mod error; pub mod error;
pub mod model; pub mod model;
@@ -23,11 +23,7 @@ pub mod symtab;
mod util; mod util;
pub mod prelude { // pub mod prelude {}
pub use crate::image_builder;
// pub use crate::tooling::brainf;
// pub use crate::tooling::project;
}
use num_cpus as _; use num_cpus as _;
use threadpool as _; use threadpool as _;
+6 -3
View File
@@ -4,7 +4,10 @@
//! //!
//! They have unique identifiers in the form of UUIDs. //! They have unique identifiers in the form of UUIDs.
use std::path::{Path, PathBuf}; use std::{
path::{Path, PathBuf},
sync::Arc,
};
use uuid::Uuid; use uuid::Uuid;
@@ -22,7 +25,7 @@ impl ModuleId {
/// Convenience method to get the [`Module`] from a [`ModuleId`]. /// Convenience method to get the [`Module`] from a [`ModuleId`].
#[must_use] #[must_use]
pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Module> { pub fn to_module<'m>(&self, registry: &'m ModuleRegistry) -> Option<&'m Arc<Module>> {
registry.get(self) registry.get(self)
} }
@@ -40,7 +43,7 @@ impl std::fmt::Display for ModuleId {
} }
/// A single source file or compilation unit. Stores its own symbol table. /// A single source file or compilation unit. Stores its own symbol table.
#[derive(Debug)] #[derive(Debug, Clone)]
pub struct Module { pub struct Module {
/// The name of the module. This is typically the name of the file, less the `.dsa` /// The name of the module. This is typically the name of the file, less the `.dsa`
/// extension. /// extension.
+8 -6
View File
@@ -1,13 +1,13 @@
//! This module contains the code for the module registry. This is a singleton storing all //! This module contains the code for the module registry. This is a singleton storing all
//! the modules being assembled. //! the modules being assembled.
use std::collections::HashMap; use std::{collections::HashMap, sync::Arc};
use super::module::{Module, ModuleId}; use super::module::{Module, ModuleId};
/// Stores all the [`Module`]'s to be assembled. /// Stores all the [`Module`]'s to be assembled.
pub struct ModuleRegistry { pub struct ModuleRegistry {
modules: HashMap<ModuleId, Module>, modules: HashMap<ModuleId, Arc<Module>>,
} }
impl Default for ModuleRegistry { impl Default for ModuleRegistry {
@@ -17,26 +17,28 @@ impl Default for ModuleRegistry {
} }
impl ModuleRegistry { impl ModuleRegistry {
#[must_use] pub fn new() -> Self { #[must_use]
pub fn new() -> Self {
Self { Self {
modules: HashMap::new(), modules: HashMap::new(),
} }
} }
/// Gets a [`Module`] by ID. /// Gets a [`Module`] by ID.
#[must_use] pub fn get(&self, module_id: &ModuleId) -> Option<&Module> { #[must_use]
pub fn get(&self, module_id: &ModuleId) -> Option<&Arc<Module>> {
self.modules.get(module_id) self.modules.get(module_id)
} }
/// Adds a [`Module`] and returns its [`ModuleId`]. /// Adds a [`Module`] and returns its [`ModuleId`].
pub fn add(&mut self, module: Module) -> ModuleId { pub fn add(&mut self, module: Arc<Module>) -> ModuleId {
let id = module.id; let id = module.id;
self.modules.insert(id, module); self.modules.insert(id, module);
id id
} }
/// Returns an iterator of modules. /// Returns an iterator of modules.
pub fn modules(&self) -> impl Iterator<Item = &Module> { pub fn modules(&self) -> impl Iterator<Item = &Arc<Module>> {
self.modules.values() self.modules.values()
} }
} }
+11 -1
View File
@@ -1,12 +1,17 @@
//! This module contains anything within the first stage of assembly, i.e. the //! This module contains anything within the first stage of assembly, i.e. the
//! tokenisation stage, or utility functions for reading input files. //! tokenisation stage, or utility functions for reading input files.
use std::path::Path; use std::{
io::{BufRead, Lines},
path::Path,
};
use crate::error::AssembleError; use crate::error::AssembleError;
pub mod lines;
pub mod source_info; pub mod source_info;
pub mod token; pub mod token;
pub mod token_info;
pub mod tokeniser; pub mod tokeniser;
/// Attempts to load and open a source file, returning a [`Vec<u8>`] or an /// Attempts to load and open a source file, returning a [`Vec<u8>`] or an
@@ -16,3 +21,8 @@ pub fn load_source_bytes<P: AsRef<Path>>(p: P) -> Result<Vec<u8>, AssembleError>
Ok(std::fs::read(path)?) Ok(std::fs::read(path)?)
} }
/// Get a line iterator over any buffered reader (anything implementing
/// [`BufRead`]).
pub fn reader_lines<R: BufRead>(rdr: R) -> Lines<R> {
    BufRead::lines(rdr)
}
+76
View File
@@ -0,0 +1,76 @@
//! Enhanced lines iterator that tracks line numbers and character positions.
use std::io::{BufRead, BufReader, Cursor};
use crate::error::AssembleError;
/// Iterator that yields lines with their line numbers and character spans.
pub struct LinesWithSpans<R: BufRead> {
    /// The underlying buffered reader the lines are pulled from.
    reader: R,
    /// Number of lines yielded so far; the next line gets this value plus one.
    line_number: usize,
    /// Running total of the byte counts reported by `read_line` so far.
    total_chars: usize,
    /// Scratch buffer that is cleared and refilled for every line.
    buffer: String,
}
/// One line of input together with its position in the file.
#[derive(Debug, Clone)]
pub struct LineSpan {
    /// The line number.
    pub line_number: usize,
    /// The contents of the line.
    pub content: String,
    /// Character offset from start of file.
    ///
    /// NOTE(review): `LinesWithSpans` accumulates the byte count returned by
    /// `read_line`, so for non-ASCII input this is a byte offset, not a count of
    /// characters.
    pub start_char: usize,
    /// End character offset (exclusive).
    pub end_char: usize,
}
impl<R: BufRead> LinesWithSpans<R> {
    /// Wraps `reader`, starting with no lines read and a zero offset.
    pub const fn new(reader: R) -> Self {
        Self {
            reader,
            line_number: 0,
            total_chars: 0,
            buffer: String::new(),
        }
    }
}
/// Yields one [`LineSpan`] per input line until the reader reports end of file.
///
/// Line numbers start at 1. Offsets are accumulated from the byte counts returned
/// by `read_line`, so they include the line terminator of every preceding line.
impl<R: BufRead> Iterator for LinesWithSpans<R> {
    type Item = Result<LineSpan, AssembleError>;

    fn next(&mut self) -> Option<Self::Item> {
        self.buffer.clear();

        match self.reader.read_line(&mut self.buffer) {
            Ok(0) => None, // EOF
            Ok(bytes_read) => {
                self.line_number += 1;
                let start_char = self.total_chars;
                self.total_chars += bytes_read;

                // Strip the line terminator for cleaner processing. Both `\n` and
                // `\r\n` are handled so that files with Windows line endings do
                // not leave a stray `\r` at the end of `content`.
                let content = match self.buffer.strip_suffix('\n') {
                    Some(line) => line.strip_suffix('\r').unwrap_or(line),
                    None => self.buffer.as_str(),
                }
                .to_owned();

                Some(Ok(LineSpan {
                    line_number: self.line_number,
                    content,
                    start_char,
                    end_char: self.total_chars,
                }))
            }
            // Read failures are converted into the crate's error type.
            Err(e) => Some(Err(e.into())),
        }
    }
}
/// Helper function to create a lines iterator over an in-memory byte slice.
#[must_use]
pub fn lines_with_spans(data: &[u8]) -> LinesWithSpans<BufReader<Cursor<&[u8]>>> {
    LinesWithSpans::new(BufReader::new(Cursor::new(data)))
}
+81 -5
View File
@@ -4,22 +4,98 @@
//! This will likely be attached to a [`Token`] which will in turn be attached to an AST //! This will likely be attached to a [`Token`] which will in turn be attached to an AST
//! [`Node`]. //! [`Node`].
use std::fmt::Display; use std::{
fmt::{Display, Write},
fs::File,
io::BufReader,
sync::Arc,
};
use crate::model::module::Module; use crate::{
error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
model::module::Module,
source::lines::LinesWithSpans,
};
/// Information on where the token is within the source. /// Information on where the token is within the source.
#[derive(Debug)] #[derive(Debug, Clone)]
pub struct SourceInfo { pub struct SourceInfo {
/// The line number within the source file underpinned by `module_id`. /// The line number within the source file underpinned by `module_id`.
pub line_no: usize, pub line_no: usize,
pub module: Module, pub module: Arc<Module>,
/// The indexes where this token may be found (line-local). /// The indexes where this token may be found (line-local).
pub span: std::ops::Range<usize>, pub span: std::ops::Range<usize>,
} }
impl Display for SourceInfo { impl Display for SourceInfo {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.module.name) write!(
f,
"{}:{}, column {}",
self.module.path.display(),
self.line_no,
self.span.start
)
}
}
impl SourceInfo {
#[must_use]
pub const fn new(
line_no: usize,
module: Arc<Module>,
span: std::ops::Range<usize>,
) -> Self {
Self {
line_no,
module,
span,
}
}
/// Prints out where in the source code the error originated with an underline similar
/// to what rustc does.
pub fn print_context_with_underline(&self) -> Result<(), AssembleError> {
let f = File::open(&self.module.path)?;
let rdr = BufReader::new(f);
let mut lines = LinesWithSpans::new(rdr);
let Some(line_result) = lines.nth(self.line_no - 1) else {
// Handle a line not existing.
return Err(AssembleError::new_source_error(
self.clone(),
AssembleErrorKind::Io(IoError::new(
IoErrorKind::Other,
Some(format!(
"the line {} does not exist in input file `{}` but source info suggested otherwise!.",
self.line_no,
self.module.path.display()
)),
)),
));
};
let line_span = line_result?;
// Print the line number and line content.
println!("{:>4} | {}", self.line_no, line_span.content);
let mut underline = String::new();
write!(underline, "{:>4} | ", "")?;
for _ in 0..self.span.start {
underline.push(' ');
}
for _ in self.span.start..self.span.end.min(line_span.content.len()) {
underline.push('^');
}
// Print the underline in red and bold.
// TODO: Use a crate to make this extra portable.
println!("\x1b[1;31m{underline}\x1b[0m");
Ok(())
} }
} }
+13 -29
View File
@@ -2,7 +2,12 @@
//! easier to build from scratch and edit his code than it would be to try and wrangle it //! easier to build from scratch and edit his code than it would be to try and wrangle it
//! into shape. //! into shape.
use crate::source::source_info::SourceInfo; use crate::source::{
source_info::SourceInfo,
token_info::{
DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken,
},
};
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum TokenType { pub enum TokenType {
@@ -18,45 +23,24 @@ pub enum TokenType {
Instruction(InstructionToken), Instruction(InstructionToken),
/// Label definition (e.g., `loop_start:`). /// Label definition (e.g., `loop_start:`).
Label(LabelToken), Label(LabelToken),
/// Assembler directive (e.g., `.global`, `.section`, `.dw`). /// Assembler directive (e.g., `.global`, `.section`, `.dw`, `.resb`).
Directive(DirectiveToken), Directive(DirectiveToken),
/// Comma separator.
Comma,
/// End of line. /// End of line.
Newline, Newline,
/// End of file. /// End of file.
Eof, Eof,
/// A line comment. This is to be filtered out of the token stream.
Comment,
} }
#[derive(Debug)] #[derive(Debug)]
pub struct Token { pub struct Token {
/// The type of the token. /// The type of the token.
token_type: TokenType, pub token_type: TokenType,
/// Where in the source code is this [`Token`]? /// Where in the source code is this [`Token`]?
source_info: SourceInfo, pub source_info: SourceInfo,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SymbolToken {
pub name: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LabelToken {
pub name: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DirectiveToken {
pub directive: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct RegisterToken {
pub name: String,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct InstructionToken {
pub mnemonic: String,
} }
impl Token { impl Token {
+24
View File
@@ -0,0 +1,24 @@
/// Payload of a symbol token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SymbolToken {
    /// The symbol's name as text.
    pub name: String,
}
/// Payload of a label-definition token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct LabelToken {
    /// The label's name as text.
    // NOTE(review): presumably stored without the trailing `:` — confirm against
    // the tokeniser.
    pub name: String,
}
/// Payload of an assembler-directive token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DirectiveToken {
    /// The directive as text.
    // NOTE(review): presumably stored without the leading `.` — confirm against
    // the tokeniser.
    pub directive: String,
}
/// Payload of a register token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct RegisterToken {
    /// The register's name as text.
    pub name: String,
}
/// Payload of an instruction token.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct InstructionToken {
    /// The instruction mnemonic as text.
    pub mnemonic: String,
}
+296 -2
View File
@@ -1,7 +1,301 @@
//! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and //! This file contains the [`Tokeniser`], which consumes a [`Vec`] of input bytes and
//! outputs a [`Vec<Token>`]. //! outputs a [`Vec<Token>`].
/// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [Token]'s. use std::{
pub struct Tokeniser {} path::{Path, PathBuf},
sync::Arc,
};
use regex::Regex;
use crate::{
context::AssemblerContext,
error::{AssembleError, AssembleErrorKind, IoError, IoErrorKind},
model::module::Module,
source::{
lines::lines_with_spans,
load_source_bytes,
source_info::SourceInfo,
token::{Token, TokenType},
token_info::{
DirectiveToken, InstructionToken, LabelToken, RegisterToken, SymbolToken,
},
},
};
pub mod error; pub mod error;
/// Consumes a [`Vec<u8>`] and outputs a [`Vec`] of [`Token`]s.
///
/// Holds the raw bytes of one source file, the path they were read from, and
/// one pre-compiled [`Regex`] per token kind. The patterns themselves are
/// defined in [`Tokeniser::from_data`].
pub struct Tokeniser {
    /// The data in the file.
    pub data: Vec<u8>,
    /// The path to the file.
    pub path: PathBuf,
    // Pre-compiled regex patterns
    /// Recognises a label definition.
    label_regex: Regex,
    /// Recognises a register name.
    register_regex: Regex,
    /// Recognises a numeric immediate.
    immediate_regex: Regex,
    /// Recognises a directive.
    directive_regex: Regex,
    /// Recognises an instruction mnemonic.
    instruction_regex: Regex,
    /// Recognises a bare symbol (identifier).
    symbol_regex: Regex,
    /// Recognises a string literal.
    string_regex: Regex,
    /// Recognises a `//` line comment.
    comment_regex: Regex,
}
impl Tokeniser {
/// Builds a [`Tokeniser`] over in-memory source bytes.
///
/// `path` is stored as given (it is later used to derive the module name);
/// this constructor does not read from the file system.
///
/// Every pattern is anchored with `^`: the matchers report the length of the
/// match as the number of bytes to cut from the *front* of the remaining
/// input, so a match anywhere else would corrupt the token stream.
///
/// # Panics
///
/// Panics if one of the built-in patterns fails to compile, which would be a
/// programming error in this function.
#[must_use]
pub fn from_data(data: Vec<u8>, path: PathBuf) -> Self {
    Self {
        data,
        path,
        label_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*):")
            .expect("Failed to compile label regex pattern"),
        // `\b` keeps a keyword from matching the head of a longer identifier,
        // e.g. `spill` must not lex as register `sp` followed by `ill`.
        register_regex: Regex::new(r"^(r[0-9]+|sp|fp|pc)\b")
            .expect("Failed to compile register regex pattern"),
        // Accept the `0b` and `0o` prefixes that `try_match_immediate` already
        // knows how to parse; `\b` rejects malformed literals such as `12ab`
        // instead of silently splitting them in two.
        immediate_regex: Regex::new(r"^(0x[0-9a-fA-F]+|0b[01]+|0o[0-7]+|[0-9]+)\b")
            .expect("Failed to compile immediate regex pattern"),
        directive_regex: Regex::new(r"^\.([a-zA-Z]+)")
            .expect("Failed to compile directive regex pattern"),
        instruction_regex: Regex::new(
            r"^(add|sub|mul|div|jmp|call|ret|lli|nop|halt)\b",
        )
        .expect("Failed to compile instruction regex pattern"),
        symbol_regex: Regex::new(r"^([a-zA-Z_][a-zA-Z0-9_]*)")
            .expect("Failed to compile symbol regex pattern"),
        // The closing quote is part of the pattern so that it is consumed
        // together with the literal and unterminated strings do not match.
        string_regex: Regex::new(r#"^"([^"]*)""#)
            .expect("Failed to compile string regex pattern"),
        comment_regex: Regex::new("^//.*")
            .expect("Failed to compile comment regex pattern"),
    }
}
/// Creates a [`Tokeniser`] from a file path.
pub fn new<P: AsRef<Path>>(path: P) -> Result<Self, AssembleError> {
let path = path.as_ref().to_path_buf();
let data = load_source_bytes(&path)?;
Ok(Self::from_data(data, path))
}
/// Tokenises the whole file, consuming the [`Tokeniser`].
///
/// A [`Module`] named after the file is created and added to the module
/// registry in `ctx` before any token is produced. The returned stream holds,
/// for every line, that line's tokens followed by a `Newline` token (a blank
/// line contributes only the `Newline`), and ends with a single `Eof` token.
///
/// # Errors
///
/// Fails if the module name cannot be derived from the path, if the registry
/// lock cannot be acquired, if a line cannot be read from the data, or if a
/// line contains text that no token pattern accepts.
// Note that modules are tokenised in their own threads, possibly in parallel.
pub fn tokenise(self, ctx: &AssemblerContext) -> Result<Vec<Token>, AssembleError> {
    // Create a module for the source file being processed.
    let module = Arc::new(Module::new(self.extract_module_name()?, &self.path));

    // The write guard is a temporary, so the lock is released as soon as the
    // module has been registered.
    ctx.module_registry.write()?.add(Arc::clone(&module));

    let mut token_stream = Vec::new();

    for line_result in lines_with_spans(&self.data) {
        let line_span = line_result?;
        let line_number = line_span.line_number;

        if line_span.content.trim().is_empty() {
            // Blank line: nothing to tokenise, only the line break is recorded.
            token_stream.push(Token::new(
                TokenType::Newline,
                SourceInfo::new(line_number, Arc::clone(&module), 0..1),
            ));
        } else {
            token_stream.extend(self.tokenize_line(&line_span, &module)?);

            // The line break sits just past the last byte of the line.
            let line_end = line_span.content.len();
            token_stream.push(Token::new(
                TokenType::Newline,
                SourceInfo::new(line_number, Arc::clone(&module), line_end..line_end),
            ));
        }
    }

    // The end-of-file marker is not tied to a real line; it uses line 0 and an
    // empty range.
    token_stream.push(Token::new(TokenType::Eof, SourceInfo::new(0, module, 0..0)));

    Ok(token_stream)
}
/// Splits a single source line into tokens.
///
/// Leading and trailing whitespace is ignored and whitespace between tokens is
/// skipped. Each token's [`SourceInfo`] range starts at
/// `line_span.start_char` plus the token's byte offset inside the line and
/// spans the number of bytes the token consumed.
///
/// # Errors
///
/// Propagates the error from [`Self::match_token`] when the text at the
/// current position is not accepted by any token pattern.
fn tokenize_line(
    &self,
    line_span: &crate::source::lines::LineSpan,
    module: &Arc<Module>,
) -> Result<Vec<Token>, AssembleError> {
    let mut tokens = Vec::new();
    let mut remaining = line_span.content.trim();

    // Byte offset of `remaining` inside the untrimmed line. It begins after the
    // indentation and is advanced past every token and the whitespace that
    // follows it, so that each token reports its own position instead of the
    // position of the first token on the line.
    // NOTE(review): assumes `start_char` is the position of the line's first
    // character (indentation included) and that byte offsets are acceptable
    // for it; the two agree for ASCII sources. TODO confirm against `LineSpan`.
    let mut offset = line_span.content.len() - line_span.content.trim_start().len();

    while !remaining.is_empty() {
        // Try to match a token.
        let (token_type, consumed) = self.match_token(remaining)?;

        let start = line_span.start_char + offset;
        tokens.push(Token::new(
            token_type,
            SourceInfo::new(
                line_span.line_number,
                Arc::clone(module),
                start..start + consumed,
            ),
        ));

        // Advance past the token and any whitespace separating it from the
        // next one.
        let after_token = &remaining[consumed..];
        let next = after_token.trim_start();
        offset += consumed + (after_token.len() - next.len());
        remaining = next;
    }

    Ok(tokens)
}
/// Tries to match a line comment at the very start of `input`.
///
/// Returns [`TokenType::Comment`] together with the byte length of the
/// comment, or `None` if `input` does not begin with one.
fn try_match_comment(&self, input: &str) -> Option<(TokenType, usize)> {
    let found = self.comment_regex.find(input)?;
    // The caller cuts the returned number of bytes off the front of `input`,
    // so a comment that only starts later in the line must not be reported
    // here; the tokens in front of it have to be consumed first.
    (found.start() == 0).then(|| (TokenType::Comment, found.len()))
}
/// Tries to match a label definition with `label_regex`.
///
/// On success returns a [`TokenType::Label`] whose name is capture group 1,
/// plus the byte length of the whole match.
fn try_match_label(&self, input: &str) -> Option<(TokenType, usize)> {
    self.label_regex.captures(input).and_then(|caps| {
        let consumed = caps.get(0)?.len();
        let name = caps.get(1)?.as_str().to_owned();
        Some((TokenType::Label(LabelToken { name }), consumed))
    })
}
/// Tries to match a register name with `register_regex`.
///
/// On success returns a [`TokenType::Register`] whose name is capture group 1,
/// plus the byte length of the whole match.
fn try_match_register(&self, input: &str) -> Option<(TokenType, usize)> {
    self.register_regex.captures(input).and_then(|caps| {
        let consumed = caps.get(0)?.len();
        let name = caps.get(1)?.as_str().to_owned();
        Some((TokenType::Register(RegisterToken { name }), consumed))
    })
}
/// Tries to match a numeric immediate with `immediate_regex`.
///
/// The radix is chosen from the literal's prefix: `0x` is hexadecimal, `0b`
/// binary, `0o` octal, anything else decimal. Which of these forms can
/// actually reach this point depends on what `immediate_regex` accepts.
///
/// Returns `None` when the pattern does not match or when the digits do not
/// fit in a `u32`.
fn try_match_immediate(&self, input: &str) -> Option<(TokenType, usize)> {
    let caps = self.immediate_regex.captures(input)?;
    let consumed = caps.get(0)?.len();
    let literal = caps.get(1)?.as_str();

    let (digits, radix) = if let Some(hex) = literal.strip_prefix("0x") {
        (hex, 16)
    } else if let Some(bin) = literal.strip_prefix("0b") {
        (bin, 2)
    } else if let Some(oct) = literal.strip_prefix("0o") {
        (oct, 8)
    } else {
        (literal, 10)
    };

    let value = u32::from_str_radix(digits, radix).ok()?;
    Some((TokenType::Immediate(value), consumed))
}
/// Tries to match a directive with `directive_regex`.
///
/// On success returns a [`TokenType::Directive`] whose name is capture group
/// 1, plus the byte length of the whole match.
fn try_match_directive(&self, input: &str) -> Option<(TokenType, usize)> {
    self.directive_regex.captures(input).and_then(|caps| {
        let consumed = caps.get(0)?.len();
        let directive = caps.get(1)?.as_str().to_owned();
        Some((TokenType::Directive(DirectiveToken { directive }), consumed))
    })
}
/// Tries to match an instruction mnemonic with `instruction_regex`.
///
/// On success returns a [`TokenType::Instruction`] whose mnemonic is capture
/// group 1, plus the byte length of the whole match.
fn try_match_instruction(&self, input: &str) -> Option<(TokenType, usize)> {
    self.instruction_regex.captures(input).and_then(|caps| {
        let consumed = caps.get(0)?.len();
        let mnemonic = caps.get(1)?.as_str().to_owned();
        Some((TokenType::Instruction(InstructionToken { mnemonic }), consumed))
    })
}
/// Tries to match a bare symbol with `symbol_regex`.
///
/// On success returns a [`TokenType::Symbol`] whose name is capture group 1,
/// plus the byte length of the whole match.
fn try_match_symbol(&self, input: &str) -> Option<(TokenType, usize)> {
    self.symbol_regex.captures(input).and_then(|caps| {
        let consumed = caps.get(0)?.len();
        let name = caps.get(1)?.as_str().to_owned();
        Some((TokenType::Symbol(SymbolToken { name }), consumed))
    })
}
/// Tries to match a string literal with `string_regex`.
///
/// On success returns a [`TokenType::String`] holding the text of capture
/// group 1, plus the byte length of the whole match.
fn try_match_string(&self, input: &str) -> Option<(TokenType, usize)> {
    self.string_regex.captures(input).and_then(|caps| {
        let consumed = caps.get(0)?.len();
        let content = caps.get(1)?.as_str().to_owned();
        Some((TokenType::String(content), consumed))
    })
}
/// Identifies the token at the start of `input`.
///
/// The matchers are tried in a fixed order and the first one that succeeds
/// wins: comment, label, register, immediate, directive, instruction, string,
/// symbol. The symbol matcher comes last because its identifier pattern would
/// otherwise also accept register names and mnemonics. If none of them
/// matches, a leading `,` yields a one-byte `Comma` token.
///
/// Returns the token type and the number of bytes it occupies.
///
/// # Errors
///
/// Returns an `InvalidData` error when `input` is empty or starts with a
/// character that no token can begin with.
fn match_token(&self, input: &str) -> Result<(TokenType, usize), AssembleError> {
    let matched = self
        .try_match_comment(input)
        .or_else(|| self.try_match_label(input))
        .or_else(|| self.try_match_register(input))
        .or_else(|| self.try_match_immediate(input))
        .or_else(|| self.try_match_directive(input))
        .or_else(|| self.try_match_instruction(input))
        .or_else(|| self.try_match_string(input))
        .or_else(|| self.try_match_symbol(input));

    if let Some(hit) = matched {
        return Ok(hit);
    }

    // Handle miscellaneous characters.
    let message = match input.chars().next() {
        Some(',') => return Ok((TokenType::Comma, 1)),
        Some(c) => format!("Unexpected character: '{c}'"),
        None => "Unexpected end of input".to_string(),
    };

    Err(AssembleError::new_other_error(AssembleErrorKind::Io(
        IoError::new(IoErrorKind::InvalidData, Some(message)),
    )))
}
/// Derives the module name from the final component of `self.path`.
///
/// The component is converted with `to_string_lossy`, so bytes that are not
/// valid UTF-8 are replaced rather than rejected.
///
/// # Errors
///
/// Returns an `InvalidData` error when the path has no final component
/// (`Path::file_name` returns `None`).
fn extract_module_name(&self) -> Result<String, AssembleError> {
    match self.path.file_name() {
        Some(file_name) => Ok(file_name.to_string_lossy().to_string()),
        None => Err(AssembleError::new_other_error(AssembleErrorKind::Io(
            IoError::new(
                IoErrorKind::InvalidData,
                Some("filename couldn't be extracted, is it valid UTF-8?".to_string()),
            ),
        ))),
    }
}
}
+1 -1
View File
@@ -1,6 +1,6 @@
//! This module contains the error types for the tokeniser. //! This module contains the error types for the tokeniser.
#[derive(Debug)] #[derive(Debug, Clone, Copy)]
pub enum TokeniserError {} pub enum TokeniserError {}
impl std::fmt::Display for TokeniserError { impl std::fmt::Display for TokeniserError {
+17 -16
View File
@@ -46,12 +46,11 @@ impl SymbolTable {
&& let Some(existing) = self.symbols.get(&existing_id) && let Some(existing) = self.symbols.get(&existing_id)
&& existing.module_id == module_id && existing.module_id == module_id
{ {
return Err(AssembleError::new_other_error( return Err(std::io::Error::new(
crate::error::AssembleErrorKind::IO(std::io::Error::new( std::io::ErrorKind::AlreadyExists,
std::io::ErrorKind::AlreadyExists, format!("Symbol '{name}' already defined in module"),
format!("Symbol '{name}' already defined in module"), )
)), .into());
));
} }
// Add to all mappings // Add to all mappings
@@ -63,19 +62,22 @@ impl SymbolTable {
} }
/// Gets the [`Symbol`] by its [`SymbolId`]. /// Gets the [`Symbol`] by its [`SymbolId`].
#[must_use] pub fn get(&self, id: &SymbolId) -> Option<&Symbol> { #[must_use]
pub fn get(&self, id: &SymbolId) -> Option<&Symbol> {
self.symbols.get(id) self.symbols.get(id)
} }
/// Gets the [`Symbol`] by its name. /// Gets the [`Symbol`] by its name.
#[must_use] pub fn get_by_name(&self, name: &str) -> Option<&Symbol> { #[must_use]
pub fn get_by_name(&self, name: &str) -> Option<&Symbol> {
self.name_to_id self.name_to_id
.get(name) .get(name)
.and_then(|id| self.symbols.get(id)) .and_then(|id| self.symbols.get(id))
} }
/// Gets all [`Symbol`]s in a module. /// Gets all [`Symbol`]s in a module.
#[must_use] pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> { #[must_use]
pub fn get_module_symbols(&self, module_id: &ModuleId) -> Vec<&Symbol> {
self.module_symbols self.module_symbols
.get(module_id) .get(module_id)
.map(|ids| ids.iter().filter_map(|id| self.symbols.get(id)).collect()) .map(|ids| ids.iter().filter_map(|id| self.symbols.get(id)).collect())
@@ -83,7 +85,8 @@ impl SymbolTable {
} }
/// Gets all the public symbols. /// Gets all the public symbols.
#[must_use] pub fn get_public_symbols(&self) -> Vec<&Symbol> { #[must_use]
pub fn get_public_symbols(&self) -> Vec<&Symbol> {
self.symbols self.symbols
.values() .values()
.filter(|sym| matches!(sym.visibility, Visibility::Public)) .filter(|sym| matches!(sym.visibility, Visibility::Public))
@@ -104,12 +107,10 @@ impl SymbolTable {
} }
Ok(()) Ok(())
} else { } else {
Err(AssembleError::new_other_error( Err(
crate::error::AssembleErrorKind::IO(std::io::Error::new( std::io::Error::new(std::io::ErrorKind::NotFound, "Symbol not found")
std::io::ErrorKind::NotFound, .into(),
"Symbol not found", )
)),
))
} }
} }
} }
+1 -1
View File
@@ -2,7 +2,7 @@ pub mod logging;
use std::io::Write; use std::io::Write;
pub fn input(prompt: &str) -> String { pub fn _input(prompt: &str) -> String {
print!("{prompt}\n > "); print!("{prompt}\n > ");
std::io::stdout().flush().expect("Failed to flush stdout"); std::io::stdout().flush().expect("Failed to flush stdout");
let mut input = String::new(); let mut input = String::new();
+1
View File
@@ -5,6 +5,7 @@ use std::{
path::{Path, PathBuf}, path::{Path, PathBuf},
}; };
use assembler::compiler_engine::CompilerEngine;
use common::prelude::Instruction; use common::prelude::Instruction;
use egui::{Align, Context, Key, Layout, Ui}; use egui::{Align, Context, Key, Layout, Ui};