#![doc = include_str!("../README.md")]
use anyhow::Result;
use chrono::{DateTime, Local};
use clap::Parser;
use colored::Colorize;
use reqwest::StatusCode;
use scraper::{Html, Selector};
use std::fs::OpenOptions;
use std::io::Write;
use url::Url;

// Command-line arguments for the link checker, parsed in `main` via `Cli::parse()`.
//
// NOTE(review): clap's derive typically turns the `///` field comments below into `--help`
// text, so treat them as user-facing strings -- confirm before rewording them.
#[derive(Parser, Debug)]
#[clap(about, version, author)]
struct Cli {
    // Positional argument: the starting point of the crawl. `main` parses it as a `Url`,
    // prefixing "https://" first when the value does not start with "http".
    /// URL to check
    url: String,
    // Optional `-s/--skip` value. `main` splits it on ',' and drops any discovered link whose
    // string form equals one of the pieces.
    #[clap(short, long, value_name = "URL")]
    /// URL(s) to ignore if found (full url); separate multiple URLs with a comma
    skip: Option<String>,
}

/// A hyperlink together with the page it was discovered on.
///
/// Two `Link`s are equal only when both the referer and the target match, so the same target
/// found on two different pages yields two distinct values.
#[derive(Clone, PartialEq)]
struct Link {
    /// Page on which `link` was found.
    referer: Url,
    /// Target the hyperlink points to.
    link: Url,
}

impl Link {
    /// Pairs the target `link` with the `referer` page it appeared on.
    fn new(referer: Url, link: Url) -> Self {
        Self { referer, link }
    }
}

struct CheckError {
    link: Url,
    referer: Url,
    error: String,
}

impl CheckError {
    fn new(link: Url, referer: Url, error: String) -> CheckError {
        CheckError {
            link,
            referer,
            error,
        }
    }
}

/// get all links from one url
fn get_links(url: &Url) -> Result<Vec<Link>> {
    let res = reqwest::blocking::get(url.as_ref())?;

    // current url, even if redirected
    let full_url = &res.url().clone();

    // get domain using Url::parse
    let domain = format!("{}://{}", url.scheme(), url.host_str().unwrap());

    // get and parse the html
    let html = res.text()?;
    let document = Html::parse_document(&html);
    let selector = Selector::parse("a").unwrap();

    let mut links = vec![];

    // go through all a href elements and modify value as necessary, push to links
    for element in document.select(&selector) {
        if let Some(mut v) = element.value().attr("href") {
            v = v.trim();
            // don't add mailto links or anchor links
            if v.starts_with("mailto") || v.starts_with('#') {
                continue;
            }
            // if link is full url, add it as is
            if v.starts_with("http") {
                let link = Link::new(full_url.clone(), Url::parse(v)?);
                links.push(link);
            // if starts with /, just need the domain added
            } else if v.starts_with('/') {
                let link = Link::new(full_url.clone(), Url::parse(&format!("{}{}", domain, v))?);
                links.push(link);
            // if neither of those, this is a relative link starting from full url
            } else {
                let link = Link::new(full_url.clone(), Url::parse(&format!("{}{}", full_url, v))?);
                links.push(link);
            }
        }
    }

    Ok(links)
}

/// Request `url` and report whether it is reachable.
///
/// Returns `Ok(false)` only when the server answers `404 Not Found`; every other status
/// counts as valid. A failure to perform the request at all is returned as `Err`.
fn check_link(url: &Url) -> Result<bool> {
    let status = reqwest::blocking::get(url.as_str())?.status();
    let is_missing = status == StatusCode::NOT_FOUND;
    Ok(!is_missing)
}

fn main() {
    // parse arguments provided
    let args = Cli::parse();
    let mut url = args.url;

    // add https if user didn't, notify them (in case http is preferred)
    if !url.starts_with("http") {
        println!("Adding 'https://' to provided domain...");
        url = format!("https://{}", url);
    }

    // parse as Url, shadowing url
    let url = match Url::parse(&url) {
        Ok(url) => url,
        Err(e) => {
            eprintln!("Error: {}", e);
            return;
        }
    };

    // create filenames
    let domain = url.host_str().unwrap().trim_matches('"');
    let dt: DateTime<Local> = Local::now();
    let format = "%Y-%m-%d-%H-%M";
    let broken_links_csv = format!(
        "{}_broken_links_{}.csv",
        domain,
        dt.format(format).to_string()
    );
    let errors_csv = format!("{}_errors_{}.csv", domain, dt.format(format).to_string());

    println!("\nChecking links found in {:?}", url.to_string());

    // start the check
    let mut unchecked_links: Vec<Link> = vec![Link::new(url.clone(), url)];
    let mut valid_links: Vec<Link> = vec![];
    let mut invalid_links: Vec<Link> = vec![];
    let mut errors: Vec<CheckError> = vec![];

    let mut count = 0;

    while !unchecked_links.is_empty() {
        count += 1;
        let content = unchecked_links.pop().unwrap();

        // if we already know the link is valid, we've already fetched all links from it, so no
        // need to process it (but we can't skip known bad links, because we need their referer)
        //
        // this check is also done below to prevent known valid links from being added to
        // unchecked_links in the first place, but there may be instances of the same link being
        // added multiple times before the first instance of it can be checked, so we check again
        // here, where we are able to catch all subseqent occurrences after the first
        if valid_links.iter().any(|link| link.link == content.link) {
            continue;
        }

        // otherwise check it
        match check_link(&content.link) {
            // the link is ok
            Ok(true) => {
                println!("{} is {}", content.link, "OK".green());
                valid_links.push(content.clone());
                // get further links if link is within domain we're interested in & not a file
                // TODO: probably a better way to check if a file than this
                if content.referer.host() == content.link.host()
                    && !&content.link.as_ref().ends_with(".pdf")
                    && !&content.link.as_ref().ends_with(".xls")
                    && !&content.link.as_ref().ends_with(".xlsx")
                    && !&content.link.as_ref().ends_with(".docx")
                    && !&content.link.as_ref().ends_with(".doc")
                    && !&content.link.as_ref().ends_with(".pptx")
                    && !&content.link.as_ref().ends_with(".twbx")
                    && !&content.link.as_ref().ends_with(".png")
                    && !&content.link.as_ref().ends_with(".jpg")
                    && !&content.link.as_ref().ends_with(".jpeg")
                {
                    match get_links(&content.link) {
                        Ok(new_urls) => {
                            'outer: for new_url in new_urls {
                                // skip any urls specified by user
                                if let Some(ref skip_urls) = args.skip {
                                    for skip_url in skip_urls.split(',') {
                                        if new_url.link.as_ref() == skip_url {
                                            continue 'outer;
                                        }
                                    }
                                };
                                // limit, as much as possible, what gets added to unchecked_links
                                if !valid_links.contains(&new_url)
                                    && !invalid_links.contains(&new_url)
                                    && !unchecked_links.contains(&new_url)
                                {
                                    // if we already know the link is broken, add to invalid
                                    if invalid_links.iter().any(|link| link.link == new_url.link) {
                                        invalid_links.push(new_url.clone());
                                    // if we already know the link is good, add to valid
                                    } else if valid_links
                                        .iter()
                                        .any(|link| link.link == new_url.link)
                                    {
                                        valid_links.push(new_url.clone());
                                    } else {
                                        unchecked_links.push(new_url.clone());
                                    }
                                }
                            }
                        }
                        Err(e) => {
                            eprintln!(
                                "{} getting links from {} (from referer {}): {}",
                                "Error".red(),
                                content.link,
                                content.referer,
                                e
                            );
                            errors.push(CheckError::new(
                                content.link.clone(),
                                content.referer.clone(),
                                e.to_string(),
                            ));
                        }
                    }
                }
            }
            // the link is bad
            Ok(false) => {
                invalid_links.push(content.clone());
                println!("{} is {}", content.link, "Broken".red());
            }
            // other error (likely bad domain or timeout)
            Err(e) => {
                eprintln!(
                    "{} checking {} (from referer {}): {}",
                    "Error".red(),
                    content.link,
                    content.referer,
                    e
                );
                errors.push(CheckError::new(
                    content.link.clone(),
                    content.referer.clone(),
                    e.to_string(),
                ));
            }
        }
        // TODO: possibly set optional max links to be checked? (prevent against infinite loop)
        // if count == 200 {
        //     break;
        // }
    }

    println!("\nChecked {} links.", count);

    // process results
    if invalid_links.is_empty() {
        println!("No broken links found.");
    } else {
        println!("Broken links saved to {}", broken_links_csv);

        let mut file = OpenOptions::new()
            .read(true)
            .write(true)
            .append(true)
            .create(true)
            .open(broken_links_csv)
            .expect("cannot open file");

        // write the header
        let content = "Referer (link on this page),Link\n".to_string();
        file.write_all(content.as_bytes()).expect("write failed");

        for link in invalid_links {
            let content = format!("{:?},{:?}", link.referer.as_ref(), link.link.as_ref());
            file.write_all(content.as_bytes()).expect("write failed");
            file.write_all("\n".as_bytes()).expect("write failed");
        }
    }

    // create separate csv of any errors from checking links
    if errors.is_empty() {
        println!("No errors occurred.");
    } else {
        println!("Errors saved to {}", errors_csv);
        let mut file = OpenOptions::new()
            .read(true)
            .write(true)
            .append(true)
            .create(true)
            .open(errors_csv)
            .expect("cannot open file");

        // write the header
        let content = "Referer (link on this page),Link,Error\n".to_string();
        file.write_all(content.as_bytes()).expect("write failed");

        for error in errors {
            let content = format!(
                "{:?},{:?},{}",
                error.referer.as_ref(),
                error.link.as_ref(),
                error.error
            );
            file.write_all(content.as_bytes()).expect("write failed");
            file.write_all("\n".as_bytes()).expect("write failed");
        }
    }
}
