use rayon::prelude::*;
use std::collections::{HashMap, HashSet};

//Imports serde:
//use serde::{Deserialize, Serialize};

/// One text segment tracked by a [`Summariser`], together with the cached
/// data the scoring methods work on.
#[derive(Debug, PartialEq, Clone)]
pub struct Sentence {
    /// Identifier assigned by whichever constructor built the sentence
    /// (`add_raw_text` sets it to the sentence's key in `Summariser::sentences`).
    pub index: usize,
    /// Byte length of `text` (`str::len`), not the number of characters.
    pub length: usize,
    /// Per the original author: key is a sentence index, value is the number of
    /// outgoing connections to it. The code visible in this file only ever
    /// initialises it to `Some` of an empty map and never reads it.
    pub outgoing_connections: Option<HashMap<usize, usize>>, //first represents sentence index, second represents the number of outgoing connections
    /// The sentence text as it was supplied.
    pub text: String,
    /// Distinct tokens of the sentence: whitespace-separated words, or character
    /// n-grams when the sentence was added with `ngrams = true`.
    pub words: HashSet<String>,
    /// Most recent score; starts at `0.0` and is overwritten by
    /// `score_sentences_by_word_frequency`.
    pub number_of_connections: f32,
    /// The `filename` value passed in when the sentence was added.
    pub originating_file: String,
}

/// Extractive summariser: stores sentences keyed by index and ranks them by
/// how many words they share with the other stored sentences.
#[derive(Debug, Clone)]
pub struct Summariser<'a> {
    /// All known sentences, keyed by index. Keys are not guaranteed to be
    /// contiguous: `add_raw_text` skips keys for segments it rejects and
    /// `clean_sentences` removes entries without re-keying.
    pub sentences: HashMap<usize, Sentence>,
    /// Words that boost a pair's connection score when they occur in the second
    /// sentence of the pair (see `number_of_word_connections`).
    bias_list: HashSet<String>,
    /// Exponent applied to the bias term; `None` until `top_sentences` sets it
    /// (it falls back to `2.0` there when the caller passes `None`).
    bias_strength: Option<f32>,
    /// Zero-sized marker that only ties the type to the lifetime `'a`; no
    /// borrowed data is stored.
    _marker: std::marker::PhantomData<&'a str>,
}

impl<'a> Summariser<'a> {
    pub fn new() -> Summariser<'a> {
        Summariser {
            sentences: HashMap::new(),
            bias_list: HashSet::new(),
            bias_strength: None,
            _marker: std::marker::PhantomData,
        }
    }
    /// Splits `raw_text` on `separator` and stores every segment whose byte
    /// length lies strictly between `min_length` and `max_length`.
    ///
    /// * `filename` - recorded as `originating_file` on every stored sentence.
    /// * `raw_text` - the text to split.
    /// * `separator` - delimiter passed to `str::split`.
    /// * `min_length` / `max_length` - exclusive bounds on the segment's byte length.
    /// * `ngrams` - when `false`, tokens are the whitespace-separated words
    ///   (case is preserved); when `true`, tokens are all character n-grams
    ///   of 7 to 14 characters.
    ///
    /// Indices follow the segment's position in `raw_text`: rejected segments
    /// still consume an index, so stored keys may have gaps. Numbering starts
    /// after the largest key already present, so repeated calls never overwrite
    /// sentences added earlier.
    ///
    /// Returns `self` so calls can be chained.
    pub fn add_raw_text(
        &mut self,
        filename: String,
        raw_text: String,
        separator: &str,
        min_length: usize,
        max_length: usize,
        ngrams: bool,
    ) -> &mut Summariser<'a> {
        // `len()` is not a safe starting point: because rejected segments leave
        // gaps, existing keys can be >= len() and would be silently replaced.
        let first_index = self
            .sentences
            .keys()
            .max()
            .map_or(0, |largest| largest + 1);

        for (offset, segment) in raw_text.split(separator).enumerate() {
            if segment.len() <= min_length || segment.len() >= max_length {
                continue;
            }
            let words: HashSet<String> = if ngrams {
                // Collect the characters once and slide every window size over them.
                let chars: Vec<char> = segment.chars().collect();
                (7..15)
                    .flat_map(|n| chars.windows(n))
                    .map(|window| window.iter().collect::<String>())
                    .collect()
            } else {
                segment.split_whitespace().map(String::from).collect()
            };
            let index = first_index + offset;
            self.sentences.insert(
                index,
                Sentence {
                    index,
                    length: segment.len(),
                    outgoing_connections: Some(HashMap::new()),
                    text: segment.to_string(),
                    words,
                    number_of_connections: 0.0,
                    originating_file: filename.clone(),
                },
            );
        }
        self
    }

    /// Scores every stored sentence from a word-frequency table and returns
    /// `(index, score)` pairs sorted from highest to lowest score.
    ///
    /// For each sentence the frequencies of its words are summed (a word that
    /// is missing from `word_frequencies` contributes `-1.0`), and the score is
    /// `sum.powf(density) / (length as f32).powf(length_penalty)`. The score is
    /// also written to the sentence's `number_of_connections`.
    ///
    /// A score can be NaN (for example a negative sum raised to a fractional
    /// `density`, or `0.0 / 0.0` for an empty sentence). NaN scores are ordered
    /// after every other score instead of aborting the sort.
    pub fn score_sentences_by_word_frequency(
        &mut self,
        word_frequencies: HashMap<String, f32>,
        density: f32,
        length_penalty: f32,
    ) -> Vec<(usize, f32)> {
        for sentence in self.sentences.values_mut() {
            let total = sentence.words.iter().fold(0.0_f32, |sum, word| {
                sum + word_frequencies.get(word).copied().unwrap_or(-1.0)
            });
            sentence.number_of_connections =
                total.powf(density) / (sentence.length as f32).powf(length_penalty);
        }

        let mut sorted_sentences: Vec<(usize, f32)> = self
            .sentences
            .iter()
            .map(|(index, sentence)| (*index, sentence.number_of_connections))
            .collect();
        // Descending by score. `partial_cmp` is `None` only when a NaN is
        // involved; in that case the NaN side is placed last.
        sorted_sentences.sort_by(|a, b| {
            b.1.partial_cmp(&a.1)
                .unwrap_or_else(|| a.1.is_nan().cmp(&b.1.is_nan()))
        });
        sorted_sentences
    }

    /// Cheap ranking based on word frequencies instead of pairwise comparison.
    ///
    /// Builds a table holding, for every word, the number of stored sentences
    /// containing it minus one, scores all sentences with
    /// `score_sentences_by_word_frequency`, and returns clones of the `k`
    /// best-scoring sentences (fewer if there are not that many).
    pub fn approximate_top_sentences(
        &mut self,
        k: usize,
        density: f32,
        length_penalty: f32,
    ) -> Vec<Sentence> {
        let mut frequency_by_word: HashMap<String, f32> = HashMap::new();
        for word in self.sentences.values().flat_map(|s| s.words.iter()) {
            // First sighting lands on 0.0, so a word unique to one sentence adds nothing.
            *frequency_by_word.entry(word.clone()).or_insert(-1.0) += 1.0;
        }

        let ranking =
            self.score_sentences_by_word_frequency(frequency_by_word, density, length_penalty);

        let mut best: Vec<Sentence> = Vec::new();
        for (position, _) in ranking.iter().take(k) {
            best.push(self.sentences.get(position).unwrap().clone());
        }
        best
    }

    /// Returns a clone of the sentence stored under `index`, or `None` when no
    /// sentence has that key.
    pub fn retrieve_sentence_by_index(&self, index: usize) -> Option<Sentence> {
        let found = self.sentences.get(&index)?;
        Some(found.clone())
    }

    /// Builds a summariser from sentences that are already split.
    ///
    /// Each sentence is keyed by its position in `sentences`, and its `index`
    /// field is set to that same key. Tokens are the lower-cased,
    /// whitespace-separated words of the sentence. `filename` is recorded as
    /// the `originating_file` of every sentence. No length filtering is applied.
    pub fn from_sentences(filename: String, sentences: Vec<String>) -> Summariser<'a> {
        let sentences_map: HashMap<usize, Sentence> = sentences
            .into_iter()
            .enumerate()
            .map(|(index, text)| {
                let words: HashSet<String> = text
                    .to_lowercase()
                    .split_whitespace()
                    .map(String::from)
                    .collect();
                let sentence = Sentence {
                    // Keep the field in sync with the map key so a returned
                    // sentence can be traced back to its entry.
                    index,
                    length: text.len(),
                    outgoing_connections: Some(HashMap::new()),
                    text,
                    words,
                    number_of_connections: 0.0,
                    originating_file: filename.clone(),
                };
                (index, sentence)
            })
            .collect();
        Summariser {
            sentences: sentences_map,
            bias_list: HashSet::new(),
            bias_strength: None,
            _marker: std::marker::PhantomData,
        }
    }

    /// Keeps a sentence (returns `true`) when spaces, tabs and newlines make up
    /// less than 15% of its recorded length.
    fn white_space_closure_filter(&self, sentence: Sentence) -> bool {
        let mut whitespace_total: usize = 0;
        for character in sentence.text.chars() {
            if matches!(character, ' ' | '\t' | '\n') {
                whitespace_total += 1;
            }
        }
        (whitespace_total as f32 / sentence.length as f32) < 0.15
    }

    /// Keeps a sentence (returns `true`) when ASCII punctuation, symbols and
    /// digits make up less than 12% of its recorded length.
    fn punctuation_closure_filter(&self, sentence: Sentence) -> bool {
        let is_flagged = |character: char| {
            matches!(
                character,
                '.' | ',' | '!' | '?' | ':' | ';' | '-' | '\'' | '"'
                    | '[' | ']' | '(' | ')' | '{' | '}' | '<' | '>'
                    | '=' | '+' | '*' | '&' | '^' | '%' | '$' | '#'
                    | '@' | '~' | '`' | '|' | '\\' | '/' | '_'
                    | '0'..='9'
            )
        };
        let flagged_total = sentence.text.chars().filter(|&c| is_flagged(c)).count();
        let flagged_share = flagged_total as f32 / sentence.length as f32;
        flagged_share < 0.12
    }

    /// Keeps a sentence (returns `true`) when uppercase characters make up less
    /// than 2% of its recorded length.
    fn caps_closure_filter(&self, sentence: Sentence) -> bool {
        let uppercase_total = sentence.text.chars().fold(0usize, |total, character| {
            if character.is_uppercase() {
                total + 1
            } else {
                total
            }
        });
        (uppercase_total as f32 / sentence.length as f32) < 0.02
    }

    //fn too_similar_to_existing_filter(&self, sentence: Sentence) -> bool {
    //let mut similar_to_existing = false;
    //for (k, existing_sentence) in self.sentences.iter() {
    //if k != &sentence.index {
    //let jaccard_similarity =
    //jaccard_similarity(&sentence.words, &existing_sentence.words);
    //if jaccard_similarity > 0.5 {
    //similar_to_existing = true;
    //break;
    //}
    //}
    //}
    //similar_to_existing
    //}

    /// Removes noisy sentences in place and returns `self` for chaining.
    ///
    /// Each flag enables one filter; a sentence is kept only if it passes every
    /// enabled filter:
    /// * `excessive_whitespace` - drop sentences where whitespace is 15% or more.
    /// * `excessive_punctuation_and_nums` - drop sentences where punctuation and
    ///   digits are 12% or more.
    /// * `excessive_caps` - drop sentences where uppercase characters are 2% or more.
    ///
    /// Keys of the surviving sentences are left unchanged, so the map may end
    /// up with gaps in its keys.
    pub fn clean_sentences(
        &mut self,
        excessive_whitespace: bool,
        excessive_punctuation_and_nums: bool,
        excessive_caps: bool,
    ) -> &mut Summariser<'a> {
        // Move the map out so it can be filtered in place while the filter
        // helpers borrow `self`; this avoids deep-cloning every sentence up front.
        let mut kept = std::mem::take(&mut self.sentences);
        kept.retain(|_, sentence| {
            (!excessive_whitespace || self.white_space_closure_filter(sentence.clone()))
                && (!excessive_punctuation_and_nums
                    || self.punctuation_closure_filter(sentence.clone()))
                && (!excessive_caps || self.caps_closure_filter(sentence.clone()))
        });
        self.sentences = kept;
        self
    }

    fn from_sentences_direct(sentences: HashMap<usize, Sentence>) -> Summariser<'a> {
        Summariser {
            sentences: sentences,
            bias_list: HashSet::new(),
            bias_strength: None,
            _marker: std::marker::PhantomData,
        }
    }

    pub fn top_sentences(
        &mut self,
        number_of_sentences_to_return: usize,
        return_summaries_for_each: bool,
        chunk_size: Option<usize>,
        force_sum_all: bool,
        length_penalty: f32,
        force_chunk: bool,
        density: f32,
        bias_list: Option<HashSet<String>>,
        bias_strength: Option<f32>,
        progress_bar: bool,
        filename_filter: String,
    ) -> Vec<Sentence> {
        //If longer than 10,000, then divide it into portions of 5000 each. Instantiate new Summarisers and call Summariser::from_sentences_direct on each one, passing in the portion of the original sentences (convert the HashMap to a vec). Then call Summariser::top_sentences on each one, passing in the number of sentences to return. Collect the sentences, and pass them to a new instance of Summariser::from_sentences_direct. Then call Summariser::top_sentences on that instance, passing in the number of sentences to return. Return the result.
        if bias_list.is_some() {
            self.bias_list = bias_list.clone().unwrap();
        }
        if bias_strength.is_some() {
            self.bias_strength = bias_strength.clone();
        } else {
            self.bias_strength = Some(2.0);
        }
        let length_of_sentences = self.sentences.len();
        if !force_sum_all && length_of_sentences > 2000
            || !force_sum_all && return_summaries_for_each
            || force_chunk
        {
            //if chunk_size is specified, then use that. otherwise use a default value of 2000
            let final_chunk_size = match chunk_size {
                Some(chunk_size) => chunk_size,
                None => 500,
            };
            let mut summarisers = self
                .sentences
                .clone()
                .into_iter()
                .collect::<Vec<(usize, Sentence)>>()
                .chunks(final_chunk_size.clone())
                .map(|chunk| {
                    let mut initial = 0;
                    let mut new_sentences = HashMap::new();
                    for (_, sentence) in chunk {
                        new_sentences.insert(initial.clone(), sentence.clone());
                        initial += 1;
                    }
                    Summariser::from_sentences_direct(new_sentences)
                })
                .collect::<Vec<Summariser<'a>>>();
            //println!("Number of summarisers: {}", summarisers.len());
            //let number_of_summarisers = summarisers.len();
            let collected_sentences = summarisers
                .iter_mut()
                .map(|summariser| {
                    let indiv_num_to_return = match return_summaries_for_each {
                        true => number_of_sentences_to_return,
                        false => 100, //number_of_sentences_to_return * number_of_summarisers.clone(),
                    };
                    summariser.top_sentences(
                        indiv_num_to_return,
                        false,
                        None,
                        true,
                        length_penalty,
                        false,
                        density,
                        bias_list.clone(),
                        bias_strength,
                        progress_bar,
                        filename_filter.clone()
                    )
                })
                .collect::<Vec<Vec<Sentence>>>();
            if return_summaries_for_each {
                let collected_sentences = collected_sentences
                    .into_iter()
                    .flatten()
                    .collect::<Vec<Sentence>>();
                return collected_sentences;
            } else {
                let collected_sentences = collected_sentences
                    .into_iter()
                    .flatten()
                    .collect::<Vec<Sentence>>();
                let mut summariser = Summariser::from_sentences_direct(
                    collected_sentences
                        .into_iter()
                        .enumerate()
                        .map(|(index, sentence)| (index, sentence))
                        .collect::<HashMap<_, _>>(),
                );
                let final_sentences = summariser.top_sentences(
                    number_of_sentences_to_return,
                    false,
                    None,
                    false,
                    length_penalty,
                    false,
                    density,
                    bias_list.clone(),
                    bias_strength,
                    progress_bar,
                    filename_filter.clone()
                );
                return final_sentences;
            }
        }
        let length_of_sentences_as_u32 = length_of_sentences as u32;
        //matrix.par_iter_mut().enumerate().for_each(|(i, row)| {
            //for j in i + 1..length_of_sentences {
                //if let Some(sentence) = self.sentences.get(&i.clone()) {
                    //row[j] = (self.number_of_word_connections(i.clone(), j.clone()) as f32)
                        //.powf(density)
                        // / (sentence.length as f32).powf(length_penalty); //1.1
                //}
            //}
            //if progress_bar {
                //bar.inc(1);
            //}
        //});
        //In our previous tutorial, we implemented the algorithm like this ^. But there's another way:
        let sentences = self.sentences.clone();
        let mut scores = sentences.par_iter()
                .map(|(i, sentence)| {
                    let mut scores = vec![0.0; length_of_sentences];
                    for j in (i+1)..length_of_sentences {
                        if let Some(_) = self.sentences.get(&j.clone()) {
                            scores[j] = (self.number_of_word_connections(i.clone(), j.clone()) as f32)
                                .powf(density)
                                / (sentence.length as f32).powf(length_penalty); //1.1
                        }
                    }
                    
                    let summed_scores = scores.iter().sum::<f32>();
                    return (i, summed_scores)
                }).collect::<Vec<(_, _)>>();
        scores.sort_unstable_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
        let top_sentences_indices = scores
            .iter()
            .map(|x| x.0.clone())
            .collect::<Vec<usize>>()
            .iter()
            .filter(|x| self.sentences.contains_key(x))
            .map(|x| self.sentences.get(x).unwrap().clone())
            .filter(|x| x.originating_file == filename_filter)
            .take(number_of_sentences_to_return)
            .collect::<Vec<Sentence>>();
        top_sentences_indices
    }
    //Could create a sparse matrix here
    /// Connection strength between two stored sentences.
    ///
    /// Returns `shared * (1.0 + (3.0 * biased).powf(strength))`, where `shared`
    /// is the number of words the two sentences have in common, `biased` is the
    /// number of bias-list words present in sentence B, and `strength` is the
    /// configured bias strength. When no bias strength has been configured yet,
    /// `2.0` is used, matching the default applied by `top_sentences`.
    ///
    /// Returns `0.0` when either index has no stored sentence.
    pub fn number_of_word_connections(
        &'a self,
        sentence_a_indx: usize,
        sentence_b_indx: usize,
    ) -> f32 {
        // Either key may be absent because the map's keys can have gaps.
        let (sentence_a, sentence_b) = match (
            self.sentences.get(&sentence_a_indx),
            self.sentences.get(&sentence_b_indx),
        ) {
            (Some(a), Some(b)) => (a, b),
            _ => return 0.0,
        };

        let shared_words = sentence_a.words.intersection(&sentence_b.words).count();
        // An empty bias list yields zero here, leaving the multiplier at 1.0
        // for any positive strength.
        let biased_words = self.bias_list.intersection(&sentence_b.words).count();
        let strength = self.bias_strength.unwrap_or(2.0);

        shared_words as f32 * (1.0 + (biased_words as f32 * 3.0).powf(strength))
    }
}

