/*
 * Copyright (c) 2022  Peter Pentchev <roam@ringlet.net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
//! Detect a UTF-8-capable locale.

use std::collections::HashMap;
use std::env;
use std::error;
use std::process;

use encoding::Encoding;

quick_error! {
    /// An error that occurred while examining the environment or locales.
    ///
    /// The functions in this module return it boxed as
    /// `Box<dyn std::error::Error>`.
    #[derive(Debug)]
    pub enum UErr {
        /// Invalid value for an environment variable.
        ///
        /// The payload is the name of the offending variable.
        /// [`get_preferred_languages()`] returns this variant when
        /// [`std::env::var()`] fails for a variable that is present in
        /// the environment.
        InvalidEnvValue(name: String) {
            display("The {} environment variable's value is not a valid string", name)
        }
    }
}

/// The variables examined by the [`get_preferred_languages()`] function.
///
/// The variables are examined in the order they are listed here, so
/// a language named by an earlier variable ends up earlier in the list of
/// preferred languages.
pub const LOCALE_VARIABLES: [&str; 14] = [
    "LC_ALL",
    "LANG",
    "LC_MESSAGES",
    "LC_COLLATE",
    "LC_NAME",
    "LC_IDENTIFICATION",
    "LC_CTYPE",
    "LC_NUMERIC",
    "LC_TIME",
    "LC_MONETARY",
    "LC_PAPER",
    "LC_ADDRESS",
    "LC_TELEPHONE",
    "LC_MEASUREMENT",
];

/// The encodings recognized as UTF-8 for the various locale distributions.
///
/// The codeset part of a locale name (the text after the dot, e.g. `UTF-8`
/// in `C.UTF-8`) must be exactly equal to one of these strings for
/// the locale to be treated as UTF-8-capable; the comparison is
/// case-sensitive.
pub const UTF8_ENCODINGS: [&str; 2] = ["UTF-8", "utf8"];

/// The default list of preferred languages.
///
/// The languages are listed in order of preference, most preferred first.
/// This list is used by [`detect_utf8_locale()`], [`get_utf8_vars()`], and
/// [`get_utf8_env()`]; the `*_from_languages()` counterparts accept
/// a caller-supplied list instead.
pub const UTF8_LANGUAGES: [&str; 5] = ["C", "en", "de", "es", "it"];

/// Break a locale name down into components.
///
/// This is a regular expression pattern (written with the `x` flag, so
/// whitespace inside the pattern is not significant) that matches a whole
/// locale name of the form `lang[_territory][.codeset][@modifier]` and
/// exposes the parts as named capture groups:
///
/// - `lang`: always present when the pattern matches
/// - `territory`: optional, introduced by an underscore
/// - `codeset`: optional, introduced by a dot; may contain dashes
/// - `modifier`: optional, introduced by an at-sign
///
/// The optional groups do not participate in the match when the respective
/// part is missing, so they must be looked up in a way that tolerates
/// their absence.
pub const RE_LOCALE_NAME: &str = r"(?x) ^
    (?P<lang> [a-zA-Z0-9]+ )
    (?:
        _
        (?P<territory> [a-zA-Z0-9]+ )
    )?
    (?:
        \.
        (?P<codeset> [a-zA-Z0-9-]+ )
    )?
    (?:
        @
        (?P<modifier> [a-zA-Z0-9]+ )
    )?
    $ ";

/// Assign a weight to each language according to its position in the list.
///
/// The first distinct language gets weight 0, the next one gets 1, and so
/// on; a language that is listed more than once keeps the weight of its
/// first occurrence. The second element of the returned tuple is the number
/// of distinct languages, i.e. a weight greater than any that was assigned.
fn build_weights(langs: &[&str]) -> (HashMap<String, u32>, u32) {
    use std::collections::hash_map::Entry;

    let mut table: HashMap<String, u32> = HashMap::new();
    let mut next_weight: u32 = 0;
    for &lang in langs {
        // Only a language that has not been seen yet consumes a weight.
        if let Entry::Vacant(slot) = table.entry(lang.to_owned()) {
            slot.insert(next_weight);
            next_weight += 1;
        }
    }
    (table, next_weight)
}

/// Pick a locale name that is likely to produce UTF-8 output.
///
/// This is a convenience wrapper around
/// [`detect_utf8_locale_from_languages()`] that passes the library's
/// default language preferences, [`UTF8_LANGUAGES`]. The external `locale`
/// command is run to list the locales available on the system, and
/// the UTF-8-capable one whose language comes first in the preference list
/// is returned. The `C` language is at the head of the default list, so
/// `C.UTF-8` or `C.utf8` is chosen if the system has it; otherwise one of
/// the other default languages is used.
///
/// A program that has different preferences, e.g. one that only expects
/// to parse messages written in English, should call
/// [`detect_utf8_locale_from_languages()`] directly.
///
/// # Errors
///
/// Any error reported by [`detect_utf8_locale_from_languages()`] is passed
/// on unchanged.
pub fn detect_utf8_locale() -> Result<String, Box<dyn error::Error>> {
    let defaults: &[&str] = &UTF8_LANGUAGES;
    detect_utf8_locale_from_languages(defaults)
}

/// Pick a UTF-8-capable locale name, honoring language preferences.
///
/// The `languages` slice lists the desired languages, most preferred first.
/// The function runs `locale -a`, looks at every line of its output that
/// is a locale name with a codeset listed in [`UTF8_ENCODINGS`], and
/// returns the line whose language comes earliest in `languages`. If
/// several lines share that language, the one output first wins. If no
/// line qualifies, the plain `C` locale name is returned.
///
/// # Errors
///
/// Fails if the `locale` command cannot be run or if its output cannot be
/// decoded. The exit status of the command is not examined.
pub fn detect_utf8_locale_from_languages(
    languages: &[&str],
) -> Result<String, Box<dyn error::Error>> {
    let re_name = regex::Regex::new(RE_LOCALE_NAME).unwrap();
    let (weights, unweight) = build_weights(languages);

    // Let the command's diagnostics go straight to our standard error stream.
    let output = process::Command::new("locale")
        .arg("-a")
        .stderr(process::Stdio::inherit())
        .output()?;
    let listing =
        encoding::all::ISO_8859_1.decode(&output.stdout, encoding::DecoderTrap::Strict)?;

    // Start out with the fallback and a weight that any listed language beats.
    let mut best_name = "C".to_string();
    let mut best_weight = unweight;
    for line in listing.lines() {
        let caps = match re_name.captures(line) {
            Some(caps) => caps,
            None => continue,
        };
        let is_utf8 = caps
            .name("codeset")
            .map_or(false, |codeset| UTF8_ENCODINGS.contains(&codeset.as_str()));
        if !is_utf8 {
            continue;
        }
        if let Some(&weight) = weights.get(&caps["lang"]) {
            // A strict comparison keeps the earliest line among equals.
            if weight < best_weight {
                best_name = line.to_string();
                best_weight = weight;
            }
        }
    }
    Ok(best_name)
}

/// Build the set of environment variables that need to be changed.
///
/// This is a convenience wrapper around [`get_utf8_vars_from_languages()`]
/// that passes the library's default language preferences,
/// [`UTF8_LANGUAGES`]. The returned hashmap has `LC_ALL` set to the detected
/// locale name and `LANGUAGE` set to an empty string so that recent
/// versions of the gettext library do not choose a different language to
/// output messages in.
///
/// # Errors
///
/// Any error reported by [`get_utf8_vars_from_languages()`] is passed on
/// unchanged.
pub fn get_utf8_vars() -> Result<HashMap<String, String>, Box<dyn error::Error>> {
    let defaults: &[&str] = &UTF8_LANGUAGES;
    get_utf8_vars_from_languages(defaults)
}

/// Build the variables to change, honoring language preferences.
///
/// The `languages` slice lists the desired languages, most preferred first;
/// it is handed to [`detect_utf8_locale_from_languages()`]. The returned
/// hashmap has exactly two entries: `LC_ALL` holding the detected locale
/// name, and `LANGUAGE` holding an empty string.
///
/// # Errors
///
/// Any error reported by [`detect_utf8_locale_from_languages()`] is passed
/// on unchanged.
pub fn get_utf8_vars_from_languages(
    languages: &[&str],
) -> Result<HashMap<String, String>, Box<dyn error::Error>> {
    let locale_name = detect_utf8_locale_from_languages(languages)?;

    let mut vars = HashMap::with_capacity(2);
    vars.insert("LC_ALL".to_string(), locale_name);
    vars.insert("LANGUAGE".to_string(), String::new());
    Ok(vars)
}

/// Build a full environment to run subprocesses in.
///
/// This is a convenience wrapper around [`get_utf8_env_from_languages()`]
/// that passes the library's default language preferences,
/// [`UTF8_LANGUAGES`]. The returned hashmap is based on
/// [`std::env::vars()`], but with `LC_ALL` set to the detected locale name
/// and `LANGUAGE` set to an empty string so that recent versions of
/// the gettext library do not choose a different language to output
/// messages in.
///
/// # Errors
///
/// Any error reported by [`get_utf8_env_from_languages()`] is passed on
/// unchanged.
pub fn get_utf8_env() -> Result<HashMap<String, String>, Box<dyn error::Error>> {
    let defaults: &[&str] = &UTF8_LANGUAGES;
    get_utf8_env_from_languages(defaults)
}

/// Build a full environment, honoring language preferences.
///
/// The `languages` slice lists the desired languages, most preferred first.
/// The returned hashmap holds the variables of the current process
/// environment, with the entries produced by
/// [`get_utf8_vars_from_languages()`] added on top, replacing any
/// variables of the same name.
///
/// # Errors
///
/// Any error reported by [`get_utf8_vars_from_languages()`] is passed on
/// unchanged.
///
/// # Panics
///
/// The environment is read using [`std::env::vars()`], which panics if
/// the name or the value of any variable is not valid Unicode.
pub fn get_utf8_env_from_languages(
    languages: &[&str],
) -> Result<HashMap<String, String>, Box<dyn error::Error>> {
    let inherited = env::vars();
    let overrides = get_utf8_vars_from_languages(languages)?;

    let mut combined: HashMap<String, String> = inherited.collect();
    // Entries added later replace the inherited ones of the same name.
    combined.extend(overrides);
    Ok(combined)
}

/// Determine preferred languages as per the current locale settings.
///
/// The [`get_preferred_languages()`] function examines the current process
/// environment and returns a list of the languages specified in the locale
/// variables (`LC_ALL`, `LANG`, `LC_MESSAGES`, etc) in the order of
/// preference defined by the [`LOCALE_VARIABLES`] constant. Each language
/// is only listed once. It may be used by programs to add the user's
/// currently preferred locale to their own settings.
///
/// Only variables that name a locale with a codeset listed in
/// [`UTF8_ENCODINGS`] are taken into account. Variables that are not set,
/// values that do not look like a locale name, and values without
/// a codeset part or with a different one (e.g. `C`, `en_US`, or
/// `en_US.ISO-8859-1`) are skipped.
///
/// Note that "C" is always appended to the end of the list if it is not
/// already present.
///
/// # Errors
///
/// Returns [`UErr::InvalidEnvValue`] if one of the examined variables is
/// set to a value that is not valid Unicode.
pub fn get_preferred_languages() -> Result<Vec<String>, Box<dyn error::Error>> {
    let re_name = regex::Regex::new(RE_LOCALE_NAME)
        .expect("RE_LOCALE_NAME is a valid regular expression");

    let mut res: Vec<String> = Vec::new();
    for name in &LOCALE_VARIABLES {
        let value = match env::var(name) {
            Ok(value) => value,
            Err(env::VarError::NotPresent) => continue,
            Err(env::VarError::NotUnicode(_)) => {
                return Err(Box::new(UErr::InvalidEnvValue(name.to_string())))
            }
        };
        let caps = match re_name.captures(&value) {
            Some(caps) => caps,
            None => continue,
        };

        // The codeset group is optional; indexing the captures by its name
        // would panic for values such as "C" or "en_US", so look it up in
        // a way that tolerates its absence.
        let is_utf8 = caps
            .name("codeset")
            .map_or(false, |codeset| UTF8_ENCODINGS.contains(&codeset.as_str()));
        if !is_utf8 {
            continue;
        }

        // The lang group is mandatory, so it is present whenever the pattern
        // matched at all.
        let lang = &caps["lang"];
        if !res.iter().any(|seen| seen == lang) {
            res.push(lang.to_string());
        }
    }

    /* Make sure "C" is always in the list. */
    if !res.iter().any(|seen| seen == "C") {
        res.push("C".to_string());
    }
    Ok(res)
}
