//! This parser is ripped directly out of another language I was working on, called Shiv.
//! I'm considering using that name, "Shiv", for the eventual user-facing scripting language
//! we're building up to here.

use crate::tree::Tree;
use ::core::iter;
use ::core::ops::Range;
use ::id_arena::{Arena, Id};
use ::lasso::{Rodeo, Spur};
use ::logos::{Lexer, Logos};

fn string_literal(lex: &mut Lexer<Token>) -> bool {
    let remainder = lex.remainder().as_bytes();
    let mut cursor = remainder;
    while let [b, rest @ ..] = cursor {
        if *b == b'"' {
            lex.bump((rest.as_ptr() as usize) - (remainder.as_ptr() as usize));
            return true;
        }
        cursor = rest;
    }
    false
}
fn line_comment(lex: &mut Lexer<Token>) -> bool {
    let remainder = lex.remainder().as_bytes();
    let mut cursor = remainder;
    // 10 is a newline. b'\n'
    while let [0..=9 | 11..=255, rest @ ..] = cursor {
        cursor = rest;
    }
    lex.bump(cursor.as_ptr() as usize - remainder.as_ptr() as usize);
    true
}

// reminder that the `Variant` derive is something I've added in my local fork
#[derive(Logos, Debug, ::derive_more::IsVariant, Clone)]
pub enum Token {
    #[token("(")]
    OpenParenthesis,
    #[token(")")]
    CloseParenthesis,
    #[token("\n")]
    NewLine,
    #[token("::")]
    PathSeparator,
    #[token(";", line_comment)]
    LineComment,
    #[token("\"", string_literal)]
    StringLiteral,
    #[regex("[0-9]+")]
    Integer,
    #[token("&")]
    RefOp,
    #[regex(r##"[-_+=a-zA-Z*`.,>][-_+=a-zA-Z*`.,>0-9]*"##)]
    Ident,
    #[regex("[\t ]+")]
    Whitespace,
    #[error]
    Error,
}
pub fn lex_expression(input: &str) -> Result<Vec<(Token, Range<usize>)>, ()> {
    Ok(Token::lexer(input).spanned().collect())
}

#[derive(Debug, ::derive_more::Deref)]
pub struct AST {
    pub names: Rodeo,
    pub strings: Rodeo,
    #[deref]
    tree: Tree<Term>,
}
impl AST {
    fn fmt_rec(ast: &AST, id: Id<Term>, out: &mut String) {
        match &ast.arena[id] {
            Term::CreateRef(operand) => {
                out.push('&');
                Self::fmt_rec(ast, *operand, out);
            }
            Term::List(elements) => {
                out.push('(');
                match &**elements {
                    [first, rest @ ..] => {
                        Self::fmt_rec(ast, *first, out);
                        for elem in rest {
                            out.push(' ');
                            Self::fmt_rec(ast, *elem, out);
                        }
                    }
                    [] => (),
                }
                out.push(')');
            }
            Term::Variable(path) => path.fmt(ast, id, out),
            Term::IntegerLiteral(val) => {
                let mut itoa_buf = itoa::Buffer::new();
                out.push_str(itoa_buf.format(*val));
            }
            Term::StringLiteral(val) => {
                out.push('"');
                out.push_str(ast.strings.resolve(&val.0));
                out.push('"');
            }
            Term::Comment(_span) => todo!("formatting AST glued comments"),
        }
    }
    pub fn fmt_sexpr(&self) -> String {
        let mut output = String::new();
        Self::fmt_rec(self, self.top, &mut output);
        output
    }
    pub fn fmt_term(&self, id: Id<Term>) -> String {
        let mut output = String::new();
        Self::fmt_rec(self, id, &mut output);
        output
    }
}
#[derive(Debug, Clone)]
pub struct Span(Range<usize>);
#[derive(Debug, Hash, PartialEq, Eq, Clone)]
pub struct Name(Spur);
impl Name {
    fn new(interner: &mut Rodeo, name: &str) -> Self {
        Self(interner.get_or_intern(name))
    }
    pub fn lookup(interner: &Rodeo, name: &str) -> Option<Self> {
        interner.get(name).map(|x| Self(x))
    }
    pub fn key(&self) -> &Spur {
        &self.0
    }
}

#[derive(Debug, Clone)]
pub struct StringLiteral(Spur);
impl StringLiteral {
    fn new(interner: &mut Rodeo, literal: &str) -> Self {
        Self(interner.get_or_intern(literal))
    }
}

/// Since modules are allowed to take arguments on import, paths must accommodate
/// parameter passing as well.
#[derive(Debug, Clone, PartialEq)]
pub enum PathSegment {
    Ident(Name),
    Value(Id<Term>),
    /// Path segments of this type are only usable by interpreter internals.
    /// Currently used for `#magic#` sub paths, to avoid needing to intern most internal
    /// magic strings.
    Hidden(&'static str),
}
/// A variable path of the form `bootstrap::get-args` or `elf::<archs::x86_64>::make-elf`.
// Note that module constructors must be pure functions, to avoid surprising performance
// overhead in path expressions.
// Invariant: a path must always begin with an ident.
#[derive(Debug, Clone, PartialEq)]
pub struct Path {
    parts: Vec<PathSegment>,
}
impl Path {
    pub fn ident(name: Name) -> Self {
        Self {
            parts: vec![PathSegment::Ident(name)],
        }
    }
    fn new(parts: Vec<Name>) -> Self {
        Self {
            parts: parts
                .into_iter()
                .map(|name| PathSegment::Ident(name))
                .collect(),
        }
    }
    /// Produce a hidden magic path.
    pub fn hidden_magic(names: &mut Rodeo, parts: Vec<&'static str>) -> Self {
        Self::from_parts(
            iter::once(PathSegment::Ident(Name::new(names, "#magic#")))
                .chain(parts.into_iter().map(|name| PathSegment::Hidden(name)))
                .collect(),
        )
    }
    fn from_parts(parts: Vec<PathSegment>) -> Self {
        Self { parts }
    }
    fn fmt(&self, ast: &AST, id: Id<Term>, out: &mut String) {
        if self
            .parts
            .iter()
            .all(|a| matches!(a, PathSegment::Ident(_) | PathSegment::Hidden(_)))
        {
            match &*self.parts {
                [PathSegment::Ident(name), rest @ ..] => {
                    out.push_str(ast.names.resolve(&name.0));
                    for name in rest {
                        match name {
                            PathSegment::Ident(name) => {
                                out.push_str("::");
                                out.push_str(ast.names.resolve(&name.0));
                            }
                            PathSegment::Hidden(name) => {
                                out.push_str("::");
                                out.push_str(name);
                            }
                            _ => todo!("full path expression formatting"),
                        }
                    }
                }
                _ => todo!("full path expression formatting"),
            }
        } else {
            todo!("full path expression formatting")
        }
    }
    pub fn into_ident(self) -> Option<Name> {
        // Can't move out of a slice pattern, eh?
        let mut iter = self.parts.into_iter();
        let res = iter.next().and_then(|segment| match segment {
            PathSegment::Ident(ident) => Some(ident),
            _ => None,
        });
        for _ in iter {
            return None;
        }
        res
    }
}

#[derive(Debug, ::derive_more::Unwrap, Clone)]
pub enum Term {
    CreateRef(Id<Term>),
    List(Vec<Id<Term>>),
    Variable(Path),
    IntegerLiteral(u64),
    StringLiteral(StringLiteral),
    Comment(Span),
}
use crate::tree::TreeIndex;
impl crate::tree::IndexNode for Term {
    fn index(&self, index: TreeIndex) -> Option<Id<Self>> {
        match self {
            Term::CreateRef(operand) if index.val() == 0 => Some(*operand),
            Term::CreateRef(_) => None,
            Term::List(elements) => elements.get(index.val() as usize).copied(),
            Term::Variable(_name) => None,
            Term::IntegerLiteral(_val) => None,
            Term::StringLiteral(_val) => None,
            Term::Comment(_span) => None,
        }
    }
}
#[derive(Debug)]
pub enum ParseError {
    IntegerTooLarge,
    // TODO: include spans on parse errors
    ExpectedIdent,
    UnexpectedEof,
    Eof,
}
fn parse_list(
    names: &mut Rodeo,
    strings: &mut Rodeo,
    terms: &mut Arena<Term>,
    lexer: &mut Lexer<Token>,
) -> Result<Id<Term>, ParseError> {
    let mut expressions = Vec::new();
    loop {
        let mut peeker = lexer.clone();
        if peeker
            .next()
            .ok_or(ParseError::UnexpectedEof)?
            .is_close_parenthesis()
        {
            // Consume closing parenthesis.
            lexer.next();
            break;
        } else {
            expressions.push(parse_expression(names, strings, terms, lexer).map_err(
                |e| match e {
                    // Unclosed parenthesis.
                    // If we left off this conversion, you could leave off trailing parentheses
                    // at the end of a file. (Who knows, maybe that could be considered a feature?)
                    ParseError::Eof => ParseError::UnexpectedEof,
                    e => e,
                },
            )?);
        }
    }
    Ok(terms.alloc(Term::List(expressions)))
}
fn parse_path(
    names: &mut Rodeo,
    _strings: &mut Rodeo,
    terms: &mut Arena<Term>,
    lexer: &mut Lexer<Token>,
    front: &str,
) -> Result<Id<Term>, ParseError> {
    let mut parts = vec![PathSegment::Ident(Name::new(names, front))];
    #[derive(Copy, Clone, Debug)]
    enum State {
        ExpectingSeparator,
        ExpectingSegment,
    }
    let mut state = State::ExpectingSeparator;
    loop {
        let mut peeker = lexer.clone();
        match (state, peeker.next()) {
            (State::ExpectingSegment, Some(Token::Ident)) => {
                parts.push(PathSegment::Ident(Name::new(names, {
                    lexer.next();
                    lexer.slice()
                })));
                state = State::ExpectingSeparator;
            }
            (State::ExpectingSegment, None) => return Err(ParseError::UnexpectedEof),
            (State::ExpectingSeparator, Some(Token::PathSeparator)) => {
                lexer.next();
                state = State::ExpectingSegment;
            }
            (State::ExpectingSeparator, _) => break,
            x => todo!("unexpected tokens in path parser: {:?}", x),
        }
    }
    Ok(terms.alloc(Term::Variable(Path { parts })))
}
fn parse_expression(
    names: &mut Rodeo,
    strings: &mut Rodeo,
    terms: &mut Arena<Term>,
    lexer: &mut Lexer<Token>,
) -> Result<Id<Term>, ParseError> {
    // Note that we perform stack growth here because most (or all?) recursion during parsing
    // will pass through this function, as this is an expression based language.
    // We talk in terms of 2048 because it is the nearest power of 2 to 1888,
    // which I have approximated to be the current width (in bytes) of the stack frames
    // between here and the farthest recursion.
    ::stacker::maybe_grow(16 * 2048, 2048 * 1024, || {
        while let Some((token, span)) = lexer.next().map(|token| (token, lexer.span())) {
            match token {
                Token::OpenParenthesis => return parse_list(names, strings, terms, lexer),
                Token::CloseParenthesis => todo!("no matching open parenthesis"),
                // New lines are ignored in most cases.
                Token::NewLine => (),
                // I haven't decided how I want to use path separators for the moment.
                Token::PathSeparator => todo!("unexpected `::`?"),
                // Comments are always ignored, at least for now.
                Token::LineComment => (),
                Token::StringLiteral => {
                    // Strip off preceding and trailing quotes.
                    let lit = &lexer.slice()[1..];
                    let lit = &lit[..lit.len() - 1];
                    return Ok(terms.alloc(Term::StringLiteral(StringLiteral::new(strings, lit))));
                }
                Token::Integer => {
                    let int = lexer
                        .slice()
                        .as_bytes()
                        .iter()
                        .try_fold(0u64, |a, b| {
                            (a * 10).checked_add((b - b'0') as u64).ok_or(())
                        })
                        .map_err(|()| ParseError::IntegerTooLarge)?;
                    return Ok(terms.alloc(Term::IntegerLiteral(int)));
                }
                Token::RefOp => {
                    let operand = parse_expression(names, strings, terms, lexer)?;
                    return Ok(terms.alloc(Term::CreateRef(operand)));
                }
                // TODO: keep ident spans (really, we should just keep **all the spans**)
                // TODO: consider peeking for a path separator
                Token::Ident => return parse_path(names, strings, terms, lexer, lexer.slice()),
                // Whitespace is ignored in most cases.
                Token::Whitespace => (),
                Token::Error => todo!("hm... {:?}", lexer.slice()),
            }
        }

        Err(ParseError::Eof)
    })
}
pub fn parse_program(input: &str) -> Result<AST, ParseError> {
    let mut names = Rodeo::<Spur>::new();
    let mut strings = Rodeo::<Spur>::new();
    let mut terms = Arena::<Term>::new();
    let mut lexer = Token::lexer(input);
    // todo: parse multiple expressions until we hit EOF,
    // then slap them all inside an implicit progn.

    // Since '#' is not permitted in identifiers by the lexer,
    // we can use it to indicate *super special* identifiers used
    // by the runtime itself.
    // In this case, that's a `#magic#` module that's always in scope, without a preceding
    // `(import-runtime "interpreter" (imports #magic#))` or what have you
    // (which would be impossible to write, as `#magic#` is not a syntactically valid identifier).
    // Uses of the `#magic#` module should vanish upon lowering to any IR a Shiv program
    // is allowed to observe.
    // Since `import-runtime` is indeed a runtime construct,
    // we need a way to talk about runtime internals in
    // magically inserted code without relying on modules of that sort.
    // In this case, that is the implicit toplevel progn.
    // At least for now.
    // The fact that this is necessary at all is only due to our packing *slightly*
    // more information into this tree than the structure and given semantics can strictly accommodate.
    // We do not *really* need to store the top level progn inside the tree itself,
    // but this hack allows us to do so, and doing so should simplify things.
    let mut implicit_progn = vec![terms.alloc(Term::Variable(Path::from_parts(vec![
        PathSegment::Ident(Name::new(&mut names, "#magic#")),
        PathSegment::Hidden("progn"),
    ])))];
    loop {
        match parse_expression(&mut names, &mut strings, &mut terms, &mut lexer) {
            Ok(top) => implicit_progn.push(top),
            Err(ParseError::Eof) => break,
            Err(e) => return Err(e),
        }
    }
    let top = terms.alloc(Term::List(implicit_progn));
    Ok(AST {
        names,
        strings,
        tree: Tree { arena: terms, top },
    })
}
