use regex::Regex;
use scraper::{Html, Selector};
use select::document::Document;
use select::predicate::Name;
use std::collections::{HashMap, HashSet};
use std::iter::FromIterator;
use substring::Substring;
use url::Url;

/// An HTML document paired with the URL it originated from.
///
/// The methods on this type parse `body` on demand and use `origin` as the
/// base when resolving or classifying the links found in it.
#[derive(Debug)]
pub struct HtmlRecord {
    /// URL of the document. It is fed to `Url::parse` by the link helpers, so
    /// it is expected to be an absolute URL.
    pub origin: String,
    /// Header values keyed by header name. Not read by any method in this
    /// file — presumably the HTTP response headers (TODO confirm with caller).
    pub headers: HashMap<String, Vec<String>>,
    /// Raw HTML markup that the extraction methods parse.
    pub body: String,
}

/*

HTML DOCUMENT

*/

impl HtmlRecord {
    /// public function
    /// input is a reference to self
    /// returns an option
    /// transforms body of HtmlDocument to collect all anchors
    pub fn anchors(&self) -> HashSet<String> {
        let mut ret_vec: Vec<String> = vec![];

        let doc = Html::parse_document(self.body.as_str());
        let selector = Selector::parse("a").unwrap();
        for element in doc.select(&selector) {
            match element.value().attr("href") {
                Some(link) => {
                    let link = HtmlRecord::check_link(&self.origin, link);
                    ret_vec.push(link)
                }
                None => continue,
            };
        }
        /*Document::from(body_string)
        .find(Name("a"))
        .filter_map(|n| n.attr("href"))
        .for_each(|x| {
            let link = HtmlDocument::check_link(&self.origin, &x.to_string());
            ret_vec.push(link)
        });*/

        let link_hashset: HashSet<String> = ret_vec.iter().cloned().collect();

        link_hashset
    }

    /// public function
    /// input is a reference to self
    /// returns an option
    /// transforms body of HtmlDocument to collect domain related anchors
    /// what is the best way to use self here?
    pub fn domain_anchors(&self) -> HashSet<String> {
        /*
        let mut ret_vec: Vec<String> = vec![];
        let body_string = self.body.as_str();

        Document::from(body_string)
           .find(Name("a"))
           .filter_map(|n| n.attr("href"))
           .for_each(|x| {
               let link = HtmlDocument::check_link(&self.origin, x);

               if HtmlDocument::is_domain_related(&self.origin, &link)
                   && HtmlDocument::is_http(&link)
                   && !HtmlDocument::has_extension(&link)
               {
                   ret_vec.push(link)
               }
           });
        */

        let mut ret_vec: Vec<String> = vec![];

        let doc = Html::parse_document(self.body.as_str());
        let selector = Selector::parse("a").unwrap();
        for element in doc.select(&selector) {
            match element.value().attr("href") {
                Some(link) => {
                    let link = HtmlRecord::check_link(&self.origin, link);
                    if HtmlRecord::is_domain_related(&self.origin, &link)
                        && HtmlRecord::is_http(&link)
                        && !HtmlRecord::has_extension(&link)
                    {
                        ret_vec.push(link)
                    }
                }
                None => continue,
            };
        }

        let link_hashset: HashSet<String> = ret_vec.iter().cloned().collect();

        link_hashset
    }

    /// public function
    /// input is a reference to self
    /// returns an option
    /// transforms body of HtmlDocument to collect non domain related anchors
    /// what is the best way to use self here?
    pub fn non_domain_anchors(&self) -> HashSet<String> {
        /*let mut ret_vec: Vec<String> = vec![];
        let body_string = self.body.as_str();

        Document::from(body_string)
            .find(Name("a"))
            .filter_map(|n| n.attr("href"))
            .for_each(|x| {
                let link = HtmlDocument::check_link(&self.origin, x);

                if !HtmlDocument::is_domain_related(&self.origin, &link)
                    && HtmlDocument::is_http(&link)
                    && !HtmlDocument::has_extension(&link)
                {
                    ret_vec.push(link)
                }
            });*/

        let mut ret_vec: Vec<String> = vec![];
        let doc = Html::parse_document(self.body.as_str());
        let selector = Selector::parse("a").unwrap();

        for element in doc.select(&selector) {
            match element.value().attr("href") {
                Some(link) => {
                    let link = HtmlRecord::check_link(&self.origin, link);
                    if !HtmlRecord::is_domain_related(&self.origin, &link)
                        && HtmlRecord::is_http(&link)
                        && !HtmlRecord::has_extension(&link)
                    {
                        ret_vec.push(link)
                    }
                }
                None => continue,
            };
        }

        let link_hashset: HashSet<String> = ret_vec.iter().cloned().collect();

        link_hashset
    }

    ///pub function
    /// inputs: &self and regex &str
    /// returns option hashset<string>
    /// used for when trying to crawl threads
    pub fn anchors_curate(&self, regex: &str) -> HashSet<String> {
        /*let mut ret_vec: Vec<String> = vec![];
        let body_string = self.body.as_str();
        let re = Regex::new(regex).unwrap();

        Document::from(body_string)
            .find(Name("a"))
            .filter_map(|n| n.attr("href"))
            .for_each(|x| {
                let link = HtmlDocument::check_link(&self.origin, x);

                if re.is_match(&link) && !HtmlDocument::has_extension(&link) {
                    ret_vec.push(link)
                }
            });
        */
        let mut ret_vec: Vec<String> = vec![];
        let re = Regex::new(regex).unwrap();
        let doc = Html::parse_document(self.body.as_str());
        let selector = Selector::parse("a").unwrap();

        for element in doc.select(&selector) {
            match element.value().attr("href") {
                Some(link) => {
                    let link = HtmlRecord::check_link(&self.origin, link);
                    if !HtmlRecord::is_domain_related(&self.origin, &link)
                        && re.is_match(&link)
                        && !HtmlRecord::has_extension(&link)
                    {
                        ret_vec.push(link)
                    }
                }
                None => continue,
            };
        }
        let link_hashset: HashSet<String> = ret_vec.iter().cloned().collect();

        link_hashset
    }

    /// Returns the text content of every element in `body` matched by `tag`.
    ///
    /// `tag` is parsed as a CSS selector. For each match, the text of the
    /// element and its descendants is concatenated, every newline and tab is
    /// removed (not replaced by a space, so words separated only by a line
    /// break end up joined), and surrounding whitespace is trimmed. Elements
    /// whose text is empty after that clean-up are left out, so the result
    /// never contains empty strings.
    ///
    /// # Panics
    ///
    /// Panics if `tag` is not a valid CSS selector.
    pub fn tag_text(&self, tag: &str) -> Vec<String> {
        let selector = Selector::parse(tag).unwrap();
        let doc = Html::parse_document(&self.body);

        // Bound to a local so the selection iterator is dropped before `doc`
        // and `selector` go out of scope.
        let texts: Vec<String> = doc
            .select(&selector)
            .filter_map(|element| {
                let raw: String = element.text().collect();
                let cleaned = raw.replace(|c: char| c == '\n' || c == '\t', "");
                // Trim BEFORE the emptiness check: whitespace-only elements
                // would otherwise be reported as "".
                let trimmed = cleaned.trim();
                if trimmed.is_empty() {
                    None
                } else {
                    Some(trimmed.to_string())
                }
            })
            .collect();

        texts
    }

    /// Returns the HTML of every element in `body` whose tag name is `tag`.
    ///
    /// Matching is by element name (the `select` crate's `Name` predicate),
    /// and each entry is whatever `Node::html` renders for the matched node,
    /// in document order. An unknown tag yields an empty vector.
    pub fn tag_html(&self, tag: &str) -> Vec<String> {
        let document = Document::from(self.body.as_str());

        let fragments: Vec<String> = document
            .find(Name(tag))
            .map(|node| node.html())
            .collect();

        fragments
    }

    /// Maps the `<meta>` elements of `body` to their `content` values.
    ///
    /// The key is the element's `name` attribute, or `"Content-Type"` when the
    /// element has no `name`. The value is the `content` attribute split on
    /// `','` (pieces are not trimmed), or `["none"]` when the element has no
    /// `content`.
    ///
    /// When several elements share a key — including every element that falls
    /// back to `"Content-Type"` — the one that appears last in the document
    /// wins.
    pub fn meta_data(&self) -> HashMap<String, Vec<String>> {
        let doc = Html::parse_document(&self.body);
        let selector = Selector::parse("meta").unwrap();

        let mut meta_hash: HashMap<String, Vec<String>> = HashMap::new();
        for element in doc.select(&selector) {
            let meta = element.value();
            let name = meta.attr("name").unwrap_or("Content-Type");
            let content: Vec<String> = match meta.attr("content") {
                Some(content) => content.split(',').map(String::from).collect(),
                None => vec!["none".to_string()],
            };

            // `insert` overwrites, so later elements replace earlier ones.
            meta_hash.insert(name.to_string(), content);
        }

        meta_hash
    }
    /*

    PRIVATES

    */

    /// Tells whether `link` belongs to the same domain as `check_string`.
    ///
    /// * `check_string` - URL of the domain to compare against (the origin).
    /// * `link` - URL to classify.
    ///
    /// Two domain-name hosts are related when their last two labels are equal
    /// (`www.google.com` and `mail.google.com`). A single-label host such as
    /// `localhost` is related only to the same single-label host. Hosts that
    /// are IP addresses are related only when they are identical.
    ///
    /// Returns `false` when `link` cannot be parsed as a URL or when either
    /// URL has no host (for example `mailto:` links).
    ///
    /// Limitation: multi-label public suffixes are not recognised, so
    /// `a.co.uk` and `b.co.uk` are reported as related.
    ///
    /// # Panics
    ///
    /// Panics if `check_string` is not a parseable URL; the origin of a record
    /// is treated as an invariant.
    fn is_domain_related(check_string: &str, link: &str) -> bool {
        let origin_url = Url::parse(check_string).expect("self is not a url");
        // Links come from scraped markup, so a bad one is "unrelated", not a
        // reason to abort.
        let link_url = match Url::parse(link) {
            Ok(url) => url,
            Err(_) => return false,
        };

        match (origin_url.domain(), link_url.domain()) {
            (Some(origin_domain), Some(link_domain)) => {
                // Walk the labels right-to-left: top level first, then second
                // level. `rsplit` always yields at least one item, and a
                // missing second label compares as `None`, so single-label
                // hosts never index out of bounds.
                let mut origin_labels = origin_domain.rsplit('.');
                let mut link_labels = link_domain.rsplit('.');

                origin_labels.next() == link_labels.next()
                    && origin_labels.next() == link_labels.next()
            }
            // At least one side is an IP literal or has no host at all.
            _ => match (origin_url.host_str(), link_url.host_str()) {
                (Some(origin_host), Some(link_host)) => origin_host == link_host,
                _ => false,
            },
        }
    }

    ///private method: check_link
    /// input "origin", type &str, stands for an HtmlDocument anchor
    /// input "in_link", type &str, stands for an HtmlDocument anchor that may need parsing
    /// this is a cursory check to see if a parse is needed
    /// returns string
    fn check_link(origin: &str, in_link: &str) -> String {
        match Url::parse(in_link) {
            Ok(link) => link.to_string(),
            Err(_) => HtmlRecord::parse_link(origin, in_link),
        }
    }

    /// cdprivate method
    /// input is the origin of the HtmlDocument, and the unparsed link
    /// the output is the parsed string
    /// use case
    /// get an anchor tag href where it is unparsed, as in "/"
    fn parse_link(origin: &str, unparsed_link: &str) -> String {
        let host = Url::parse(origin).expect("origin not a url");
        let host_string = host.host_str().unwrap_or_default().to_string();

        let parsed_link = if unparsed_link.substring(0, 1) == "/" {
            format!("{}://{}{}", host.scheme(), host_string, unparsed_link)
        } else {
            format!("{}://{}/{}", host.scheme(), host_string, unparsed_link)
        };

        parsed_link
    }

    /// Tells whether `link` is an http or https URL.
    ///
    /// Returns `false` for any other scheme and for strings that cannot be
    /// parsed as a URL.
    fn is_http(link: &str) -> bool {
        match Url::parse(link) {
            Ok(url) => {
                let scheme = url.scheme();
                scheme == "http" || scheme == "https"
            }
            // Not a URL at all, so certainly not an http(s) one.
            Err(_) => false,
        }
    }

    /// Tells whether the path of `link` ends in a skipped file extension.
    ///
    /// The extension is the text after the last `'.'` of the final path
    /// segment. It is compared case-insensitively against `jpeg`, `jpg`, `css`,
    /// `js`, `webm` and `webp`. Query string and fragment are ignored because
    /// only the URL path is inspected.
    ///
    /// Returns `false` when the final segment has no `'.'` and when `link`
    /// cannot be parsed as a URL.
    fn has_extension(link: &str) -> bool {
        const SKIPPED_EXTENSIONS: [&str; 6] = ["jpeg", "jpg", "css", "js", "webm", "webp"];

        let url = match Url::parse(link) {
            Ok(url) => url,
            Err(_) => return false,
        };

        // `rsplit` always yields at least one item, so this is the last
        // segment (possibly empty for a path ending in '/').
        let file_name = url.path().rsplit('/').next().unwrap_or("");

        match file_name.rfind('.') {
            Some(dot) => {
                // '.' is one byte long, so `dot + 1` is a valid char boundary.
                let extension = &file_name[dot + 1..];
                SKIPPED_EXTENSIONS
                    .iter()
                    .any(|skipped| extension.eq_ignore_ascii_case(skipped))
            }
            None => false,
        }
    }
}
