use regex::Regex;
use select::document::Document;
use select::predicate::Name;
use std::collections::{HashMap, HashSet};
use std::iter::FromIterator;
use substring::Substring;
use url::Url;

/// An HTML page together with the URL it belongs to and its headers.
#[derive(Debug)]
pub struct HtmlDocument {
    /// URL of the page; anchors found in `body` that are not absolute URLs
    /// are resolved against it.
    pub origin: String,
    /// Header names mapped to their values.
    /// NOTE(review): presumably the HTTP response headers — confirm with the
    /// code that constructs this struct.
    pub headers: HashMap<String, Vec<String>>,
    /// HTML markup that the extraction methods parse.
    pub body: String,
}


/*

HTML DOCUMENT

*/


impl HtmlDocument {
    /// Collects the `href` of every `<a>` element in the body.
    ///
    /// Each `href` is passed through `check_link`, so values that do not parse
    /// as absolute URLs are resolved against `self.origin`. Duplicates collapse
    /// because the result is a set.
    ///
    /// This method never returns `None`.
    ///
    /// # Panics
    /// May panic while resolving a relative `href` if `self.origin` is not a
    /// usable base URL.
    pub fn anchors(&self) -> Option<HashSet<String>> {
        let links = Document::from(self.body.as_str())
            .find(Name("a"))
            .filter_map(|node| node.attr("href"))
            .map(|href| HtmlDocument::check_link(&self.origin, href))
            .collect::<HashSet<String>>();

        Some(links)
    }

    /// Collects the anchors that point back at the document's own site.
    ///
    /// An `href` is kept when, after being resolved by `check_link`, it
    /// * uses the `http` or `https` scheme,
    /// * is not flagged by `has_extension`, and
    /// * is considered related to `self.origin` by `is_domain_related`.
    ///
    /// This method never returns `None`.
    ///
    /// # Panics
    /// May panic if `self.origin` is not a valid URL.
    pub fn domain_anchors(&self) -> Option<HashSet<String>> {
        let links = Document::from(self.body.as_str())
            .find(Name("a"))
            .filter_map(|node| node.attr("href"))
            .map(|href| HtmlDocument::check_link(&self.origin, href))
            .filter(|link| {
                // `is_http` runs first so that host-less URLs such as
                // `mailto:` or `javascript:` never reach the host comparison.
                HtmlDocument::is_http(link)
                    && !HtmlDocument::has_extension(link)
                    && HtmlDocument::is_domain_related(&self.origin, link)
            })
            .collect::<HashSet<String>>();

        Some(links)
    }

    /// Collects the anchors that point away from the document's own site.
    ///
    /// An `href` is kept when, after being resolved by `check_link`, it
    /// * uses the `http` or `https` scheme,
    /// * is not flagged by `has_extension`, and
    /// * is *not* considered related to `self.origin` by `is_domain_related`.
    ///
    /// This method never returns `None`.
    ///
    /// # Panics
    /// May panic if `self.origin` is not a valid URL.
    pub fn non_domain_anchors(&self) -> Option<HashSet<String>> {
        let links = Document::from(self.body.as_str())
            .find(Name("a"))
            .filter_map(|node| node.attr("href"))
            .map(|href| HtmlDocument::check_link(&self.origin, href))
            .filter(|link| {
                // `is_http` runs first so that host-less URLs such as
                // `mailto:` or `javascript:` never reach the host comparison.
                HtmlDocument::is_http(link)
                    && !HtmlDocument::has_extension(link)
                    && !HtmlDocument::is_domain_related(&self.origin, link)
            })
            .collect::<HashSet<String>>();

        Some(links)
    }

    /// Collects the anchors whose resolved URL matches `regex`.
    ///
    /// Every `href` is resolved by `check_link`; it is kept when the pattern
    /// matches the resolved link and `has_extension` does not flag it.
    /// Used when trying to crawl threads.
    ///
    /// Returns `None` when `regex` is not a valid pattern, otherwise `Some`
    /// with the (possibly empty) set of matching links.
    ///
    /// # Panics
    /// May panic while resolving a relative `href` if `self.origin` is not a
    /// usable base URL.
    pub fn anchors_curate(&self, regex: &str) -> Option<HashSet<String>> {
        // The pattern is caller-supplied, so a bad one is reported as `None`
        // rather than aborting the crawl.
        let pattern = Regex::new(regex).ok()?;

        let links = Document::from(self.body.as_str())
            .find(Name("a"))
            .filter_map(|node| node.attr("href"))
            .map(|href| HtmlDocument::check_link(&self.origin, href))
            .filter(|link| pattern.is_match(link) && !HtmlDocument::has_extension(link))
            .collect::<HashSet<String>>();

        Some(links)
    }

    /// Returns the text of every element named `tag`.
    ///
    /// Tabs and newlines inside each element's text are replaced by single
    /// spaces and the result is trimmed. Elements whose text is empty after
    /// trimming are skipped.
    pub fn tag_text(&self, tag: &str) -> Vec<String> {
        Document::from(self.body.as_str())
            .find(Name(tag))
            .filter_map(|node| {
                // A char pattern does the same substitution as the regex
                // `\t|\n` without compiling a regex on every call.
                let flattened = node.text().replace(&['\t', '\n'][..], " ");
                let trimmed = flattened.trim();

                if trimmed.is_empty() {
                    None
                } else {
                    Some(trimmed.to_string())
                }
            })
            .collect()
    }

    /// Returns the markup of every element named `tag`.
    ///
    /// Each entry is the string produced by the matching node's `html()`
    /// method, in the order the nodes are found.
    pub fn tag_html(&self, tag: &str) -> Vec<String> {
        Document::from(self.body.as_str())
            .find(Name(tag))
            .map(|node| node.html())
            .collect()
    }

    /// Maps the `<meta>` tags of the body to their content.
    ///
    /// For each `<meta>` element the key is the value of the first attribute
    /// present among `name`, `property`, `http-equiv` and `itemprop`; the
    /// value is the `content` attribute split on `,`. Elements lacking either
    /// a key attribute or `content` (for example `<meta charset="utf-8">`) are
    /// skipped.
    ///
    /// Attributes are looked up by name, so the order in which they are
    /// written in the markup does not matter. When several elements share a
    /// key, the last one wins.
    pub fn meta_data(&self) -> HashMap<String, Vec<String>> {
        // Attributes that can name a `<meta>` entry, in lookup priority order.
        const KEY_ATTRIBUTES: [&str; 4] = ["name", "property", "http-equiv", "itemprop"];

        let mut meta_hashmap: HashMap<String, Vec<String>> = HashMap::new();
        let document = Document::from(self.body.as_str());

        for meta in document.find(Name("meta")) {
            let key = KEY_ATTRIBUTES.iter().find_map(|attr| meta.attr(attr));

            if let (Some(key), Some(content)) = (key, meta.attr("content")) {
                // Pieces are kept verbatim (no trimming), as before.
                let values = Vec::from_iter(content.split(',').map(String::from));
                meta_hashmap.insert(key.to_string(), values);
            }
        }

        meta_hashmap
    }
/*

PRIVATES

*/

/// private function
/// check_string -> string of domain that you want to check against
/// link -> string of link you want to check
/// returns a bool
/// use case: used as a conditional check to wether an HtmlDocument anchor href
/// is associated with the domain
/// needs refactoring
fn is_domain_related(check_string: &str, link: &str) -> bool {
    let origin_url = Url::parse(check_string).expect("self is not a url");
    let checker_url = Url::parse(link).expect("link is not a string");
    let origin_vec = Vec::from_iter(
        origin_url
            .host()
            .unwrap()
            .to_string()
            .split('.')
            .map(String::from),
    );
    let checker_vec = Vec::from_iter(
        checker_url
            .host()
            .unwrap()
            .to_string()
            .split('.')
            .map(String::from),
    );

    //google's amazon's
    let origin_second_level = &origin_vec[origin_vec.len() - 2];
    let checker_second_level = &checker_vec[checker_vec.len() - 2];

    //com's org's
    let origin_top_level = &origin_vec[origin_vec.len() - 1];
    let checker_top_level = &checker_vec[checker_vec.len() - 1];

    //      google         ==         google              com          ==         com
    if origin_second_level == checker_second_level && origin_top_level == checker_top_level {
        true
    } else {
        false
    }
}

/// Turns an anchor `href` into an absolute URL string.
///
/// * `origin` - URL of the document the anchor was found in.
/// * `in_link` - the raw `href` value.
///
/// If `in_link` already parses as a URL its serialized form is returned;
/// otherwise the work is delegated to `parse_link`, which resolves it
/// against `origin`.
fn check_link(origin: &str, in_link: &str) -> String {
    Url::parse(in_link)
        .map(|absolute| absolute.to_string())
        .unwrap_or_else(|_| HtmlDocument::parse_link(origin, in_link))
}

/// Resolves an `href` that is not an absolute URL against the document origin.
///
/// * `origin` - URL of the document the anchor was found in.
/// * `unparsed_link` - the raw `href`, e.g. `/about`, `page.html`, `#top`.
///
/// The link is resolved with `Url::join`, so the origin's port and path are
/// honoured and forms such as `../x`, `?q=1`, `#frag` and `//host/x` are
/// handled. If `join` rejects the link, it is appended to the origin's
/// scheme, host and port as a root-relative path.
///
/// # Panics
/// Panics if `origin` is not a valid absolute URL.
fn parse_link(origin: &str, unparsed_link: &str) -> String {
    let base = Url::parse(origin).expect("origin not a url");

    if let Ok(resolved) = base.join(unparsed_link) {
        return resolved.to_string();
    }

    // Fallback for links `join` cannot resolve: treat the raw text as a path
    // under the origin's root, keeping an explicit port if there is one.
    let host = base.host_str().unwrap_or("");
    let authority = match base.port() {
        Some(port) => format!("{}:{}", host, port),
        None => host.to_string(),
    };
    let separator = if unparsed_link.substring(0, 1) == "/" {
        ""
    } else {
        "/"
    };

    format!(
        "{}://{}{}{}",
        base.scheme(),
        authority,
        separator,
        unparsed_link
    )
}

/// Reports whether `link` is a URL with the `http` or `https` scheme.
///
/// Used as a filter condition when collecting anchors. A `link` that does not
/// parse as a URL is reported as `false` rather than causing a panic.
fn is_http(link: &str) -> bool {
    match Url::parse(link) {
        Ok(url) => url.scheme() == "http" || url.scheme() == "https",
        Err(_) => false,
    }
}

/// Reports whether `link` points at a file with one of the skipped extensions.
///
/// Only the last segment of the URL path is inspected: it must contain a `.`
/// and the text after its final `.` must equal one of `jpeg`, `jpg`, `css`,
/// `js`, `webm` or `webp`, compared ASCII case-insensitively.
///
/// A `link` that does not parse as a URL is reported as `false`.
fn has_extension(link: &str) -> bool {
    const SKIPPED_EXTENSIONS: [&str; 6] = ["jpeg", "jpg", "css", "js", "webm", "webp"];

    let url = match Url::parse(link) {
        Ok(url) => url,
        Err(_) => return false,
    };

    let path = url.path();
    let file_name = path.rsplit('/').next().unwrap_or(path);

    match file_name.rfind('.') {
        Some(dot) => {
            // `.` is one byte, so `dot + 1` is always a valid slice start.
            let extension = &file_name[dot + 1..];
            SKIPPED_EXTENSIONS
                .iter()
                .any(|known| known.eq_ignore_ascii_case(extension))
        }
        // No dot in the final segment means there is no extension at all.
        None => false,
    }
}

}