#[cfg(test)]
mod tests;
#[cfg(not(doctest))]
pub mod tokenizers;
pub use tokenizers::Tokenizer;
use crate::tokenization::tokenizers::{pre_tokenizers::whitespace::Whitespace};

/// Splits every input string into single-character tokens.
///
/// Each `char` of a string becomes its own one-character `String`; the outer
/// vector has one entry per input, in the same order as `strings`.
/// `set_parallelism(true)` is called before any work is done.
pub fn tokenize_alphabet(strings: Vec<String>) -> Vec<Vec<String>> {
    tokenizers::utils::parallelism::set_parallelism(true);
    strings
        .iter()
        .map(|text| text.chars().map(|letter| letter.to_string()).collect())
        .collect()
}

/// Tokenizes strings by splitting on single space characters (`" "`).
///
/// Only the plain space is treated as a delimiter, so consecutive spaces
/// produce empty tokens and an empty input produces one empty token.
pub fn tokenize_spaces(strings: Vec<String>) -> Vec<Vec<String>> {
    strings
        .iter()
        .map(|text| text.split(" ").map(String::from).collect())
        .collect()
}

/// Tokenizes strings using BPE tokenization.
///
/// Every input is lowercased before it is encoded. When `tokenizer` is `None`
/// a tokenizer is built with [`load_bpe_tokenizer`] for this call only; pass
/// an existing tokenizer to avoid rebuilding it on every call.
///
/// Returns one token vector per input string, in input order.
///
/// # Panics
///
/// Panics if the tokenizer fails to encode the batch.
pub fn tokenize_bpe(strings: Vec<String>, tokenizer: Option<&Tokenizer>) -> Vec<Vec<String>> {
    tokenizers::utils::parallelism::set_parallelism(true);
    // Lowercase
    let strings: Vec<String> = strings.iter().map(|text| text.to_lowercase()).collect();
    // Build the default tokenizer only when the caller did not supply one.
    let default_tokenizer;
    let tokenizer = match tokenizer {
        Some(tokenizer) => tokenizer,
        None => {
            default_tokenizer = load_bpe_tokenizer();
            &default_tokenizer
        }
    };
    let encodings = tokenizer
        .encode_batch(strings, false)
        .expect("BPE tokenization failed!");
    // Convert back to strings
    encodings
        .iter()
        .map(|encoding| encoding.get_tokens().to_vec())
        .collect()
}

/// Builds the default BPE tokenizer from the bundled vocabulary and merges.
///
/// The vocabulary comes from `super::vocab::load_bpe_vocab()` and the merge
/// rules from `../resources/bpe_merges.txt`, which is embedded at compile
/// time. A merge line is kept only when both of its halves and their
/// concatenation are present in the vocabulary. `[UNK]` is configured as the
/// unknown token.
///
/// # Panics
///
/// Panics if the BPE model cannot be built.
pub fn load_bpe_tokenizer() -> Tokenizer {
    use crate::tokenization::tokenizers::models::bpe::BPE;

    let vocab = super::vocab::load_bpe_vocab();
    let mut merges: Vec<(String, String)> = Vec::new();
    for raw_line in include_str!("../resources/bpe_merges.txt").split("\n") {
        // Drop the "Ġ" and "##" markers before looking the pieces up.
        let cleaned = String::from(raw_line).replace("Ġ", "").replace("\n", "").replace("##", "");
        // Skip junk: a usable line has a space separator and no leftover '#'.
        if !cleaned.contains(" ") || cleaned.contains("#") {
            continue;
        }
        // Only the first two space-separated fields form the merge pair.
        let mut halves = cleaned.split(" ");
        if let (Some(left), Some(right)) = (halves.next(), halves.next()) {
            let (left, right) = (left.to_string(), right.to_string());
            let merged = format!("{}{}", left, right);
            // Make sure vocab contains both tokens and the combined token.
            let known = vocab.token2index.contains_key(&left)
                && vocab.token2index.contains_key(&right)
                && vocab.token2index.contains_key(&merged);
            if known {
                merges.push((left, right));
            }
        }
    }

    let bpe = BPE::builder()
        .vocab_and_merges(vocab.token2index, merges)
        .unk_token("[UNK]".into())
        .build()
        .expect("BPE Tokenizer failed to build!");

    Tokenizer::new(bpe)
}

/// Tokenizes strings using WordPiece tokenization.
///
/// Every input is lowercased before it is encoded. When `tokenizer` is `None`
/// a tokenizer is built with [`load_wordpiece_tokenizer`] for this call only.
///
/// Returns one token vector per input string, in input order.
///
/// # Panics
///
/// Panics if the tokenizer fails to encode the batch.
pub fn tokenize_wordpiece(strings: Vec<String>, tokenizer: Option<&Tokenizer>) -> Vec<Vec<String>> {
    tokenizers::utils::parallelism::set_parallelism(true);
    // Lowercase
    let lowered: Vec<String> = strings.iter().map(|text| text.to_lowercase()).collect();
    // Fall back to the bundled tokenizer when none was supplied.
    let fallback;
    let active = match tokenizer {
        Some(given) => given,
        None => {
            fallback = load_wordpiece_tokenizer();
            &fallback
        }
    };
    let encodings = active
        .encode_batch(lowered, false)
        .expect("WordPiece tokenization failed!");
    // Convert back to strings
    encodings
        .iter()
        .map(|encoding| encoding.get_tokens().to_vec())
        .collect()
}

/// Builds the default WordPiece tokenizer from the bundled vocabulary.
///
/// The vocabulary is read from `../resources/wordpiece_vocab.txt`, embedded
/// at compile time. The returned tokenizer uses the `Whitespace`
/// pre-tokenizer.
///
/// # Panics
///
/// Panics if the WordPiece model cannot be built.
pub fn load_wordpiece_tokenizer() -> Tokenizer {
    use tokenizers::models::wordpiece::WordPiece;
    use std::collections::HashMap;

    // A token's id is the zero-based index of its line in the vocabulary file.
    let vocab: HashMap<String, u32> = include_str!("../resources/wordpiece_vocab.txt")
        .split("\n")
        .enumerate()
        .map(|(index, token)| (token.to_string(), index as u32))
        .collect();

    let model = WordPiece::builder()
        .vocab(vocab)
        .build()
        .expect("WordPiece Tokenizer failed to build!");

    let mut tokenizer = Tokenizer::new(model);
    tokenizer.with_pre_tokenizer(Whitespace::default());
    tokenizer
}

// UNTOKENIZATION FUNCTIONS
/// Rebuilds strings from alphabet tokens by concatenating them in order.
///
/// Returns one string per inner token vector; an empty vector yields an
/// empty string.
pub fn untokenize_alphabet(tokens: Vec<Vec<String>>) -> Vec<String> {
    let mut strings = Vec::with_capacity(tokens.len());
    for letters in &tokens {
        strings.push(letters.concat());
    }
    strings
}

/// Rebuilds strings from BPE tokens.
///
/// BPE encodings already contain their spaces, so the tokens are simply
/// concatenated in order, exactly as for alphabet tokens.
pub fn untokenize_bpe(tokens: Vec<Vec<String>>) -> Vec<String> {
    tokens.iter().map(|pieces| pieces.concat()).collect()
}

/// Rebuilds strings from WordPiece tokens.
///
/// `[PAD]` and `[EOS]` tokens are dropped. Tokens containing `##` are glued
/// to the preceding text with the `##` marker removed, and the punctuation
/// tokens `.`, `?`, `!`, `,`, `'` and `"` are glued without a leading space.
/// Every other token is separated from the preceding text by a single space.
///
/// The first token that is actually emitted never receives a leading space,
/// even when skipped `[PAD]`/`[EOS]` tokens come before it.
///
/// Returns one string per inner token vector, in input order.
pub fn untokenize_wordpiece(tokens: Vec<Vec<String>>) -> Vec<String> {
    const PUNCTUATION: [&str; 6] = [".", "?", "!", ",", "'", "\""];

    tokens
        .iter()
        .map(|sentence| {
            let mut text = String::new();
            // Tracks whether a token has been written yet; the position in
            // `sentence` cannot be used because special tokens are skipped.
            let mut started = false;
            for token in sentence {
                let token = token.as_str();
                if token == "[PAD]" || token == "[EOS]" {
                    continue;
                }
                let glued = token.contains("##") || PUNCTUATION.contains(&token);
                if started && !glued {
                    text.push(' ');
                }
                // Append in place instead of rebuilding the string per token.
                text.push_str(&token.replace("##", ""));
                started = true;
            }
            text
        })
        .collect()
}

/// Rebuilds strings from space-separated tokens.
///
/// The tokens of each inner vector are joined with a single space between
/// neighbours; an empty vector yields an empty string.
pub fn untokenize_spaces(tokens: Vec<Vec<String>>) -> Vec<String> {
    let mut strings = Vec::with_capacity(tokens.len());
    for words in &tokens {
        strings.push(words.join(" "));
    }
    strings
}