damn_simple_architecture/assembler/src/source/tokeniser/tests.rs

//! Unit tests for the tokeniser

use common::prelude::Register;

use crate::{
    model::module::Module,
    source::{
        opcode::Opcode,
        token::{Token, TokenType},
        token_info::RegisterToken,
        tokeniser::Tokeniser,
    },
};
use std::{path::PathBuf, sync::Arc};

/// Builds a [`Tokeniser`] over the bytes of `source`, attaching a module
/// created from the path `test.dsa`.
///
/// # Panics
///
/// Panics with "Cannot create module!" if the module cannot be constructed.
fn create_tokenizer_from_source(source: &str) -> Tokeniser {
    let module = Module::new(PathBuf::from("test.dsa")).expect("Cannot create module!");
    let bytes = Vec::from(source.as_bytes());

    Tokeniser::from_data(bytes, Arc::new(module))
}

/// Tokenises `source` in one step.
///
/// Returns the token list on success, or the error reported by the
/// tokeniser otherwise.
fn tokenize_source(source: &str) -> Result<Vec<Token>, crate::error::AssembleError> {
    create_tokenizer_from_source(source).tokenise()
}

/// Collects a reference to the `token_type` of every token, preserving order.
fn extract_token_types(tokens: &[Token]) -> Vec<&TokenType> {
    let mut kinds = Vec::with_capacity(tokens.len());
    for token in tokens {
        kinds.push(&token.token_type);
    }

    kinds
}

/// An empty input must still produce a token stream that ends with EOF.
#[test]
fn test_empty_source() {
    let tokens = tokenize_source("").expect("Failed to tokenize empty source");

    // At minimum the EOF marker has to be present.
    assert!(!tokens.is_empty());

    let final_token = tokens.last().expect("Expected at least one token");
    assert!(matches!(final_token.token_type, TokenType::Eof));
}

/// Input consisting only of spaces and line breaks yields newline tokens and EOF.
#[test]
fn test_whitespace_only() {
    let tokens = tokenize_source("   \n  \n  ").expect("Failed to tokenize whitespace");

    // At least one newline must be reported...
    assert!(
        tokens
            .iter()
            .any(|token| matches!(token.token_type, TokenType::Newline))
    );
    // ...and the stream must contain the EOF marker.
    assert!(
        tokens
            .iter()
            .any(|token| matches!(token.token_type, TokenType::Eof))
    );
}

/// A lone mnemonic is tokenised as an instruction that prints back as `add`.
#[test]
fn test_single_instruction() {
    let tokens = tokenize_source("add").expect("Failed to tokenize instruction");

    // Only the instruction is verified here; any trailing tokens are not inspected.
    let has_instruction = tokens
        .iter()
        .any(|token| matches!(token.token_type, TokenType::Instruction(_)));
    assert!(has_instruction);

    let TokenType::Instruction(opcode) = &tokens[0].token_type else {
        panic!("Expected instruction token");
    };
    assert_eq!(opcode.to_string(), "add");
}

/// Each listed mnemonic tokenises to an instruction whose display form
/// round-trips to the original text.
#[test]
fn test_all_instructions() {
    for mnemonic in ["add", "sub", "jmp", "call", "return", "lli", "nop", "hlt"] {
        let tokens = tokenize_source(mnemonic).expect("Failed to tokenize instruction");

        let TokenType::Instruction(opcode) = &tokens[0].token_type else {
            panic!("Expected instruction token for {mnemonic}");
        };
        assert_eq!(opcode.to_string(), mnemonic);
    }
}

/// Register names tokenise to register tokens that print back unchanged.
#[test]
fn test_registers() {
    let cases = [("rg0", "rg0"), ("rgf", "rgf"), ("pcx", "pcx")];

    for (input, expected) in cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize register");

        let TokenType::Register(register) = &tokens[0].token_type else {
            panic!("Expected register token for {input}");
        };
        assert_eq!(register.reg.to_string(), expected);
    }
}

/// Decimal, hexadecimal, octal and binary literals tokenise to immediates
/// carrying the expected numeric value.
#[test]
fn test_immediates() {
    let cases = [
        ("42", 42),
        ("0", 0),
        ("0xFF", 255),
        ("0x1234", 0x1234),
        ("0xDEADBEEF", 0xDEAD_BEEF),
        ("0o12", 0o12),
        ("0b101", 0b101),
    ];

    for (input, expected) in cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize immediate");

        let TokenType::Immediate(value) = &tokens[0].token_type else {
            panic!("Expected immediate token for {input}");
        };
        assert_eq!(*value, expected);
    }
}

/// An identifier followed by `:` tokenises to a label whose name excludes the colon.
#[test]
fn test_labels() {
    let cases = [
        ("loop_start:", "loop_start"),
        ("main:", "main"),
        ("_private_label:", "_private_label"),
        ("Label123:", "Label123"),
    ];

    for (input, expected) in cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize label");

        let TokenType::Label(label) = &tokens[0].token_type else {
            panic!("Expected label token for {input}");
        };
        assert_eq!(label.name, expected);
    }
}

/// The `global`, `section` and `local` keywords tokenise to directive tokens.
#[test]
fn test_directives() {
    let cases = [
        ("global", "global"),
        ("section", "section"),
        ("local", "local"),
    ];

    for (input, expected) in cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize directive");

        let TokenType::Directive(token) = &tokens[0].token_type else {
            panic!("Expected directive token for {input}");
        };
        assert_eq!(token.directive, expected);
    }
}

/// Plain identifiers that are not keywords tokenise to symbols with the same name.
#[test]
fn test_symbols() {
    let cases = [
        ("my_symbol", "my_symbol"),
        ("_private", "_private"),
        ("Symbol123", "Symbol123"),
        ("camelCase", "camelCase"),
    ];

    for (input, expected) in cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize symbol");

        let TokenType::Symbol(symbol) = &tokens[0].token_type else {
            panic!("Expected symbol token for {input}");
        };
        assert_eq!(symbol.name, expected);
    }
}

/// A three-operand instruction line produces the expected sequence of token kinds.
#[test]
fn test_complex_instruction_line() {
    let tokens = tokenize_source("addi rg1, rg2, 0xFF")
        .expect("Failed to tokenise complex instruction");

    // Expected layout: instruction, register, comma, register, comma, immediate.
    // Anything after the sixth token is not inspected by this test.
    assert!(tokens.len() >= 6);

    let kind_at = |index: usize| &tokens[index].token_type;
    assert!(matches!(kind_at(0), TokenType::Instruction(_)));
    assert!(matches!(kind_at(1), TokenType::Register(_)));
    assert!(matches!(kind_at(2), TokenType::Comma));
    assert!(matches!(kind_at(3), TokenType::Register(_)));
    assert!(matches!(kind_at(4), TokenType::Comma));
    assert!(matches!(kind_at(5), TokenType::Immediate(_)));
}

/// Tokenises two instruction lines, the first carrying a trailing `//` comment,
/// and checks the exact token stream.
///
/// The expected stream contains no token for the comment or for the indentation
/// of the second line, and ends with a newline followed by EOF even though the
/// source has no trailing line break.
#[test]
fn test_multiline_with_comments() {
    const EXPECTED_TOKEN_TYPES: [TokenType; 11] = [
        TokenType::Instruction(Opcode::Add),
        TokenType::Register(RegisterToken::new(Register::Rg0)),
        TokenType::Comma,
        TokenType::Register(RegisterToken::new(Register::Rg1)),
        TokenType::Newline,
        TokenType::Instruction(Opcode::SubI),
        TokenType::Register(RegisterToken::new(Register::Rg2)),
        TokenType::Comma,
        TokenType::Immediate(10),
        TokenType::Newline,
        TokenType::Eof,
    ];

    const SOURCE: &str = r"add rg0, rg1 // Another comment
        subi rg2, 10";

    let tokens =
        tokenize_source(SOURCE).expect("Failed to tokenise source with comments");
    let token_types = extract_token_types(&tokens);

    // Compare lengths first: `zip` below would silently stop at the shorter side.
    assert_eq!(
        token_types.len(),
        EXPECTED_TOKEN_TYPES.len(),
        "{token_types:#?}"
    );

    // `assert_eq!` prints both sides on failure; the index pinpoints the mismatch.
    let pairs = EXPECTED_TOKEN_TYPES.iter().zip(token_types.iter());
    for (index, (expected, got)) in pairs.enumerate() {
        assert_eq!(*got, expected, "Unexpected token type at index {index}");
    }
}

/// Smoke test: the bundled `bf.dsa` program must tokenise without error and
/// produce a stream terminated by EOF.
#[test]
fn test_tokenise_brainf_interpreter() {
    const SOURCE: &str = include_str!("../../../../resources/dsa/bf.dsa");

    let tokens =
        tokenize_source(SOURCE).expect("Failed to tokenise the brainfuck compiler!");

    // Assert on the result instead of dumping the whole token list with `dbg!`.
    let final_token = tokens.last().expect("Expected at least one token");
    assert!(matches!(final_token.token_type, TokenType::Eof));
}

/// Double-quoted literals tokenise to string tokens holding the text between
/// the quotes, including the empty string.
#[test]
fn test_string_literals() {
    let cases = [
        (r#""hello world""#, "hello world"),
        (
            r#""++++++++++++++++++++++++++++++++++++++++++++""#,
            "++++++++++++++++++++++++++++++++++++++++++++",
        ),
        (r#""Invalid Instruction!""#, "Invalid Instruction!"),
        (r#""""#, ""),
    ];

    for (input, expected) in &cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize string literal");

        let TokenType::String(value) = &tokens[0].token_type else {
            panic!("Expected string token for {input}");
        };
        assert_eq!(*value, *expected);
    }
}

/// The data keywords `db`, `dw` and `resb` tokenise to directive tokens.
#[test]
fn test_data_directives() {
    let cases = [("db", "db"), ("dw", "dw"), ("resb", "resb")];

    for (input, expected) in cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize data declaration");

        let TokenType::Directive(declaration) = &tokens[0].token_type else {
            panic!("Expected data declaration token for {input}");
        };
        assert_eq!(declaration.directive, expected);
    }
}

/// An `include` line tokenises to a directive, a symbol and a string, in that order.
#[test]
fn test_include_directive() {
    let tokens = tokenize_source(r#"include print "./lib/print.dsa""#)
        .expect("Failed to tokenize include directive");

    assert!(tokens.len() >= 3);

    // Walk the leading tokens in order; the length check above guarantees three.
    let mut kinds = tokens.iter().map(|token| &token.token_type);
    assert!(matches!(kinds.next(), Some(TokenType::Directive(_))));
    assert!(matches!(kinds.next(), Some(TokenType::Symbol(_))));
    assert!(matches!(kinds.next(), Some(TokenType::String(_))));
}

/// Hexadecimal literals wider than 16 bits tokenise to immediates with the full value.
#[test]
fn test_hex_addresses() {
    let cases = [("0x10000", 0x10000), ("0x30000", 0x30000)];

    for (input, expected) in cases {
        let tokens = tokenize_source(input).expect("Failed to tokenize hex address");

        let TokenType::Immediate(value) = &tokens[0].token_type else {
            panic!("Expected immediate token for {input}");
        };
        assert_eq!(*value, expected);
    }
}

/// A two-register load tokenises to instruction, register, comma, register.
#[test]
fn test_memory_operations() {
    let tokens =
        tokenize_source("ldw rg1, rg2").expect("Failed to tokenize memory operation");

    assert!(tokens.len() >= 4);

    // The length check above guarantees the first four items exist.
    let mut kinds = tokens.iter().map(|token| &token.token_type);
    assert!(matches!(kinds.next(), Some(TokenType::Instruction(_))));
    assert!(matches!(kinds.next(), Some(TokenType::Register(_))));
    assert!(matches!(kinds.next(), Some(TokenType::Comma)));
    assert!(matches!(kinds.next(), Some(TokenType::Register(_))));
}

/// A `call` with a `::`-qualified target starts with an instruction and
/// contains at least one symbol token.
#[test]
fn test_function_calls() {
    let tokens =
        tokenize_source("call print::print").expect("Failed to tokenize function call");

    assert!(tokens.len() >= 2);
    assert!(matches!(tokens[0].token_type, TokenType::Instruction(_)));

    // How `::` splits the target is deliberately left open; only the presence
    // of some symbol token is required.
    let has_symbol = tokens
        .iter()
        .any(|token| matches!(token.token_type, TokenType::Symbol(_)));
    assert!(has_symbol);
}

/// Words inside a `//` comment must not be tokenised as instructions.
#[test]
fn test_comments_are_ignored() {
    let tokens = tokenize_source("add rg0, rg1 // this is a comment\nsub rg2, rg3")
        .expect("Failed to tokenize with comments");

    // Exactly the two real mnemonics (`add`, `sub`) should count as instructions.
    let mut instruction_count = 0_usize;
    for token in &tokens {
        if matches!(token.token_type, TokenType::Instruction(_)) {
            instruction_count += 1;
        }
    }

    assert_eq!(instruction_count, 2);
}

/// Source without a trailing line break still yields a newline token, and the
/// stream ends with EOF.
#[test]
fn test_newline_always_present() {
    // Deliberately no line break at the end of the input.
    let tokens =
        tokenize_source("add rg0, rg1").expect("Failed to tokenize without newline");

    let newline_position = tokens
        .iter()
        .position(|token| matches!(token.token_type, TokenType::Newline));
    assert!(
        newline_position.is_some(),
        "Expected newline to be added even when missing from input"
    );

    // The final token must be the EOF marker.
    let final_token = tokens.last().expect("Expected at least one token");
    assert!(matches!(final_token.token_type, TokenType::Eof));
}

/// A compare-and-branch snippet yields four instructions and two symbol tokens.
#[test]
fn test_complex_branching_code() {
    let source = r"
    cmp rg3, rg8
    jeq increment
    cmp rg3, rg9
    jeq decrement";

    let tokens = tokenize_source(source).expect("Failed to tokenize branching code");

    // Tally both kinds of interest in a single pass over the stream.
    let mut instruction_count = 0_usize;
    let mut symbol_count = 0_usize;
    for token in &tokens {
        match token.token_type {
            TokenType::Instruction(_) => instruction_count += 1,
            TokenType::Symbol(_) => symbol_count += 1,
            _ => {}
        }
    }

    assert_eq!(instruction_count, 4);
    // The branch targets `increment` and `decrement` are symbol references.
    assert_eq!(symbol_count, 2);
}

/// Each of the four stack mnemonics is recognised as an instruction.
#[test]
fn test_stack_operations() {
    let tokens = tokenize_source("push rg2\npop zero\npusha 2\npopa 2")
        .expect("Failed to tokenize stack operations");

    let instruction_count = tokens
        .iter()
        .map(|token| &token.token_type)
        .filter(|kind| matches!(kind, TokenType::Instruction(_)))
        .count();

    assert_eq!(instruction_count, 4);
}