//     bindet - Fast binary file type detection
//
//         The MIT License (MIT)
//
//      Copyright (c) Obliter Software (https://github.com/oblitersoftware/)
//      Copyright (c) contributors
//
//      Permission is hereby granted, free of charge, to any person obtaining a copy
//      of this software and associated documentation files (the "Software"), to deal
//      in the Software without restriction, including without limitation the rights
//      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
//      copies of the Software, and to permit persons to whom the Software is
//      furnished to do so, subject to the following conditions:
//
//      The above copyright notice and this permission notice shall be included in
//      all copies or substantial portions of the Software.
//
//      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
//      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
//      THE SOFTWARE.

//! Fast binary file type detection
//!
//! `bindet` provides a fast and safe binary file detection even for large files.
//!
//! The worst case for `bindet` is `O(n)`, but some tricks are applied to try to amortize
//! the time complexity to `O(1)`, so in most of the cases it does not take `O(n)` to execute.
//!
//! ## Supported file types
//!
//! - Zip
//! - Rar (4 and 5)
//! - Tar
//! - Png
//! - Jpg
//! - 7-zip
//! - Opus
//! - Vorbis
//! - Mp3
//! - Webp
//! - Flac
//! - Matroska (mkv, mka, mks, mk3d, webm)
//! - Wasm
//!
//! ## First Step
//!
//! File detection is a two-pass process: first it tries to find the magic number at the start
//! of the [Read]; if the magic number is found, a second pass may be done to ensure correctness of detection.
//! For example, [`FileType::Zip`](FileType::Zip) does have a **Local File Header** which starts
//! with a 4-byte descriptor and a **End of central directory record** that appears at the
//! end of non-empty zip files.
//!
//! Some files can be detected only by looking at the start of the file, using a fixed-size buffer,
//! which guarantees `O(1)` for simple detection and an amortized `O(1)` for correctness. Also, for some file
//! types, such as [RAR SFX](https://documentation.help/WinRAR/HELPArcSFX.htm), the specification states that the
//! magic number may be found anywhere from the start of the file up to the SFX module size (which is `1 MB`).
//! This means that in the worst case we need to slide a window over up to `1 MB` to find this value;
//! this type of check happens in the second step.
//!
//! ## Second Step
//!
//! In the first step, we use a small buffer to store initial bytes of the data and try to detect
//! the file type, in the second step we use a larger buffer size, up to the size of the largest
//! lookup range (which at the moment is of `1 MB`, which matches with RAR5 specification) and
//! use a sliding window to find a range that matches the magic number sequence.
//!
//! The same strategy is applied to the [`detect_at_end`](detect_at_end) logic: it looks into the
//! file backwards, using a sliding window, to find a matching sequence of bytes. This logic is
//! used to ensure correctness for file types that have a sequence of bytes that appears at the end.
//!
//! ### Worst-case scenario
//!
//! The [`detect`](detect) function first reads from the start and then does the backward sliding
//! at the end only for the types that matched at the start. This improves the accuracy of file detection,
//! at a cost: if a marker is found at the start, the specification states that there is also a
//! marker at the end, and the backward sliding window finds no such marker, we
//! will have traversed the entire data stream, with a time complexity of `O(n)`. So the worst case
//! of file detection is linear.
//!
//! However, even with a linear worst case, we assume that in most scenarios the marker at the
//! start will be enough to detect the file type. If this is not enough and we need to look at the end,
//! we assume that in most cases we will not need to slide the window all the way back to the start of
//! the stream, because the algorithm will find the marker closer to the end than to the start.
//!
//! Further benchmarks can be done to check if **bindet** amortized time complexity is really `O(1)`, given
//! a bunch of files to be detected.
//!
//! ### Examples
//!
//! ```
//! use std::fs::{OpenOptions};
//! use std::io::BufReader;
//! use std::io::ErrorKind;
//! use bindet;
//! use bindet::types::FileType;
//! use bindet::FileTypeMatch;
//! use bindet::FileTypeMatches;
//!
//! let file = OpenOptions::new().read(true).open("files/test.tar").unwrap();
//! let buf = BufReader::new(file);
//!
//! let detect = bindet::detect(buf).map_err(|e| e.kind());
//! let expected: Result<Option<FileTypeMatches>, ErrorKind> = Ok(Some(FileTypeMatches::new(
//!     vec![FileType::Tar],
//!     vec![FileTypeMatch::new(FileType::Tar, true)]
//! )));
//!
//! assert_eq!(detect, expected);
//! ```
#![feature(test)]
use crate::description::FileTypeDescription;
use crate::matcher::{FileTypeMatcher, RelativePosition, Step, TestResult};
use crate::types::FileType;
use std::io::{Read, Seek, SeekFrom};
use std::prelude::rust_2021::TryFrom;

pub mod description;
pub mod matcher;
pub mod types;

/// Outcome of testing a stream against one specific [FileType].
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FileTypeMatch {
    /// The [FileType] whose test reported a match.
    file_type: FileType,
    /// `true` when the magic number matched perfectly, `false` when the match is only probable.
    full_match: bool,
}

impl FileTypeMatch {
    /// Creates a match record for `file_type`.
    ///
    /// `full_match` states whether the match was perfect (`true`) or only probable (`false`).
    pub fn new(file_type: FileType, full_match: bool) -> Self {
        Self {
            file_type,
            full_match,
        }
    }
}

/// Aggregated result of a detection run: the most likely types plus every individual match.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FileTypeMatches {
    /// The [`FileTypes`][FileType] considered most likely. The detection functions in this file
    /// fill it with the perfect matches when there are any, and with every matched type otherwise.
    likely_to_be: Vec<FileType>,
    /// Every [FileType] that matched, perfectly or not.
    all_matches: Vec<FileTypeMatch>,
}

impl FileTypeMatches {
    /// Creates a result from the list of most likely types and the list of all recorded matches.
    pub fn new(likely_to_be: Vec<FileType>, all_matches: Vec<FileTypeMatch>) -> Self {
        Self {
            likely_to_be,
            all_matches,
        }
    }
}

/// Detect a file type by looking at the start and at the end of the file (at the end only for
/// the file types that already matched at the start).
///
/// Since different [FileType] detection implementations receive the same data slice, this may
/// produce more than one matching type.
///
/// # Returns
///
/// - `Ok(None)` when nothing matched at the start of the stream.
/// - `Ok(Some(..))` with the start-only result when the end check found nothing.
/// - `Ok(Some(..))` with the merged result otherwise. A type is reported as likely when the start
///   pass already considered it likely, or when it matched both at the start and at the end.
///   Every type appears at most once in each list, in first-seen order.
///
/// # Errors
///
/// Any I/O error raised while reading or seeking `read` is returned unchanged.
pub fn detect<R>(mut read: R) -> Result<Option<FileTypeMatches>, std::io::Error>
where
    R: Read,
    R: Seek,
{
    let start = match detect_at_start(&mut read)? {
        Some(start) => start,
        None => return Ok(None),
    };

    // Only the types seen at the start are worth confirming at the end.
    let start_types: Vec<FileType> = start.all_matches.iter().map(|m| m.file_type).collect();

    let at_end = match detect_at_end_with_(&mut read, start_types.as_slice())? {
        Some(at_end) => at_end,
        None => return Ok(Some(start)),
    };

    // Types that matched at the end as well as at the start.
    let confirmed_at_end = at_end
        .all_matches
        .iter()
        .map(|m| m.file_type)
        .filter(|file_type| start_types.contains(file_type));

    // `Vec::dedup_by_key` only drops *adjacent* duplicates, which is not enough after
    // concatenating two lists, so duplicates are filtered while building the list instead.
    let mut all_likely: Vec<FileType> = Vec::new();
    for file_type in start.likely_to_be.iter().copied().chain(confirmed_at_end) {
        if !all_likely.contains(&file_type) {
            all_likely.push(file_type);
        }
    }

    // Merge both passes, keeping one entry per type. A type is a full match exactly when it
    // ended up in the likely list.
    let mut all_matches: Vec<FileTypeMatch> = Vec::new();
    for found in start.all_matches.iter().chain(at_end.all_matches.iter()) {
        let already_listed = all_matches
            .iter()
            .any(|known| known.file_type == found.file_type);

        if !already_listed {
            all_matches.push(FileTypeMatch {
                file_type: found.file_type,
                full_match: all_likely.contains(&found.file_type),
            });
        }
    }

    Ok(Some(FileTypeMatches {
        likely_to_be: all_likely,
        all_matches,
    }))
}

/// Detect a file type by inspecting only the beginning of the stream, testing every known
/// [FileType] variant.
///
/// Types that would need a second check at the end of the file may be reported with
/// [`FileTypeMatch.full_match = false`](FileTypeMatch), signaling a probable match.
///
/// This is a less reliable alternative to [`detect`](detect): it never looks at the end of the
/// stream, and simply forwards to [`detect_at_start_with_`](detect_at_start_with_) with all
/// variants.
///
/// Since different [FileType] detection implementations receive the same data slice, this may
/// produce more than one matching type.
pub fn detect_at_start<R>(read: &mut R) -> Result<Option<FileTypeMatches>, std::io::Error>
where
    R: Read,
{
    let every_variant = FileType::variants();

    detect_at_start_with_(read, &every_variant)
}

/// Detect a file type by looking at the start of the file, testing only the given `variants`.
///
/// Types that need a second check at the end may be reported with
/// [`FileTypeMatch.full_match = false`](FileTypeMatch), signaling a probable match.
///
/// This is a less reliable alternative to [`detect`](detect): it never looks at the end of the
/// stream and reads at most the largest start-of-file block size reported by [FileType].
///
/// Detection runs in two steps:
///
/// 1. A small prefix (the ideal block size for `variants`) is read and tested with
///    [`Step::Small`]. If any type matches perfectly, the result is returned immediately.
/// 2. Otherwise the prefix is extended up to the maximum block size and the types that define a
///    large step are tested with [`Step::Large`]. Only types contained in `variants` are tested,
///    and the step is skipped when no new bytes could be read.
///
/// Since different [FileType] detection implementations receive the same data slice, this may
/// produce more than one matching type.
///
/// # Returns
///
/// `Ok(None)` when nothing matched. Otherwise the perfect matches are reported as likely, or,
/// when there is no perfect match, every matched type.
///
/// # Errors
///
/// Any I/O error raised by `read` is returned, except [`std::io::ErrorKind::Interrupted`], which
/// is retried.
pub fn detect_at_start_with_<R>(
    read: &mut R,
    variants: &[FileType],
) -> Result<Option<FileTypeMatches>, std::io::Error>
where
    R: Read,
{
    let start_position = RelativePosition::Start;

    let mut matches: Vec<FileTypeMatch> = vec![];
    // Every byte consumed from `read` so far; the large step tests the whole prefix again.
    let mut consumed: Vec<u8> = vec![];

    if let Some((size, types)) = FileType::ideal_block_size_of_variants(&start_position, variants)
    {
        consumed.resize(size, 0);
        let filled = fill_from_reader(read, &mut consumed)?;
        consumed.truncate(filled);

        push_matched_types_into(
            &mut matches,
            &consumed,
            &start_position,
            &Step::Small,
            &types,
        );
    }

    if matches.iter().any(|m| m.full_match) {
        let perfect: Vec<FileType> = matches
            .iter()
            .filter(|m| m.full_match)
            .map(|m| m.file_type)
            .collect();

        return Ok(Some(FileTypeMatches {
            likely_to_be: perfect,
            all_matches: matches,
        }));
    }

    if let Some((size, types)) = FileType::maximum_block_size(&start_position) {
        // `maximum_block_size` is not restricted to `variants`, so filter here; the caller asked
        // for these types only.
        let wanted: Vec<FileType> = types
            .iter()
            .copied()
            .filter(|candidate| variants.contains(candidate))
            .collect();

        if !wanted.is_empty() {
            let already = consumed.len();
            // `max` keeps the buffer from shrinking if the small block was the larger one.
            consumed.resize(size.max(already), 0);
            let filled = fill_from_reader(read, &mut consumed[already..])?;
            consumed.truncate(already + filled);

            // Without new bytes the large step would only repeat the small one.
            if filled != 0 {
                push_matched_types_into(
                    &mut matches,
                    &consumed,
                    &start_position,
                    &Step::Large,
                    &wanted,
                );
            }
        }
    }

    if matches.is_empty() {
        return Ok(None);
    }

    let types: Vec<FileType> = matches.iter().map(|m| m.file_type).collect();
    Ok(Some(FileTypeMatches {
        likely_to_be: types,
        all_matches: matches,
    }))
}

/// Reads from `read` until `buf` is full or the reader reports end of stream, returning the
/// number of bytes written to the front of `buf`.
///
/// A single `Read::read` call may return fewer bytes than requested even before the end of the
/// stream, so the call is repeated. Interrupted reads are retried; any other error is returned.
fn fill_from_reader<R>(read: &mut R, buf: &mut [u8]) -> Result<usize, std::io::Error>
where
    R: Read,
{
    let mut filled = 0;

    while filled < buf.len() {
        match read.read(&mut buf[filled..]) {
            Ok(0) => break,
            Ok(count) => filled += count,
            Err(error) if error.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(error) => return Err(error),
        }
    }

    Ok(filled)
}

/// Detect a file type by sliding a window backwards from the end of the stream, testing every
/// known [FileType] variant.
///
/// This approach has `O(n)` time complexity and is not meant to be used directly; it simply
/// forwards to [`detect_at_end_with_`](detect_at_end_with_) with all variants.
///
/// Currently this only works for [FileType::Zip].
///
/// Since different [FileType] detection implementations receive the same data slice, this may
/// produce more than one matching type.
pub fn detect_at_end<R>(read: &mut R) -> Result<Option<FileTypeMatches>, std::io::Error>
where
    R: Read,
    R: Seek,
{
    let every_variant = FileType::variants();

    detect_at_end_with_(read, &every_variant)
}

/// Detect a file type by sliding a window backwards from the end of the stream, testing only
/// the given `variants`.
///
/// The window has the ideal end-of-file block size for `variants` (capped at the stream length).
/// It starts flush with the end of the stream and moves one byte towards the start after every
/// unsuccessful test, until every candidate type has matched or the window has been tested at
/// offset `0`. This has `O(n)` time complexity and is not meant to be used directly.
///
/// Currently this only works for [FileType::Zip].
///
/// Since different [FileType] detection implementations receive the same data slice, this may
/// produce more than one matching type. Each type is recorded at most once.
///
/// # Returns
///
/// `Ok(None)` when nothing matched. Otherwise the perfect matches are reported as likely, or,
/// when there is no perfect match, every matched type.
///
/// # Errors
///
/// Any I/O error raised while seeking or reading `read` is returned unchanged.
pub fn detect_at_end_with_<R>(
    read: &mut R,
    variants: &[FileType],
) -> Result<Option<FileTypeMatches>, std::io::Error>
where
    R: Read,
    R: Seek,
{
    let end_position = RelativePosition::End;

    let mut matches: Vec<FileTypeMatch> = vec![];

    if let Some((size, types)) = FileType::ideal_block_size_of_variants(&end_position, variants) {
        let stream_len = read.seek(SeekFrom::End(0))?;

        // First window offset: `size` bytes before the end, or 0 for shorter streams.
        let size_u64 = u64::try_from(size).unwrap_or(u64::MAX);
        let mut offset = stream_len.saturating_sub(size_u64);
        // `stream_len - offset` is `min(stream_len, size)`, so it always fits in `usize`.
        let window_len = usize::try_from(stream_len - offset).unwrap_or(size);

        if window_len > 0 {
            let mut buff = vec![0u8; window_len];
            // Types still waiting for a match; matched types are not tested again so they
            // cannot be recorded twice.
            let mut pending: Vec<FileType> = types.iter().copied().collect();

            loop {
                read.seek(SeekFrom::Start(offset))?;
                let read_bytes = read.read(&mut buff)?;

                push_matched_types_into(
                    &mut matches,
                    &buff[..read_bytes],
                    &end_position,
                    &Step::Small,
                    &pending,
                );
                pending.retain(|candidate| !matches.iter().any(|m| m.file_type == *candidate));

                // Stop once everything matched, or after the window at offset 0 was tested
                // (that window must be tested too, otherwise short streams are never checked).
                if pending.is_empty() || offset == 0 {
                    break;
                }
                offset -= 1; // Move back
            }
        }
    }

    if matches.iter().any(|m| m.full_match) {
        let perfect: Vec<FileType> = matches
            .iter()
            .filter(|m| m.full_match)
            .map(|m| m.file_type)
            .collect();

        return Ok(Some(FileTypeMatches {
            likely_to_be: perfect,
            all_matches: matches,
        }));
    }

    // TODO: none of the FileTypes does have a maximum_block_size for windowing at the end, so
    // there is no large step here yet.

    if matches.is_empty() {
        return Ok(None);
    }

    let types: Vec<FileType> = matches.iter().map(|m| m.file_type).collect();
    Ok(Some(FileTypeMatches {
        likely_to_be: types,
        all_matches: matches,
    }))
}

/// Tests `bytes` against every type in `types` and appends one [FileTypeMatch] to `matches` for
/// each type whose test result is anything other than [`TestResult::NotMatched`].
///
/// `relative_position` and `step` are forwarded unchanged to the type's `test` method. The
/// recorded `full_match` flag is `true` only for [`TestResult::Matched`].
fn push_matched_types_into(
    matches: &mut Vec<FileTypeMatch>,
    bytes: &[u8],
    relative_position: &RelativePosition,
    step: &Step,
    types: &[FileType],
) {
    let found = types.iter().filter_map(|file_type| {
        let outcome = file_type.test(relative_position, step, bytes);

        if outcome == TestResult::NotMatched {
            None
        } else {
            Some(FileTypeMatch {
                file_type: *file_type,
                full_match: outcome == TestResult::Matched,
            })
        }
    });

    matches.extend(found);
}

#[cfg(test)]
mod tests {
    extern crate test;
    use crate::types::FileType;
    use crate::{detect, detect_at_end, FileTypeMatch, FileTypeMatches};
    use std::fs::{File, OpenOptions};
    use std::io::{BufReader, Error, ErrorKind};
    use std::path::Path;
    use test::Bencher;

    /// Detection result with the I/O error reduced to its [ErrorKind], so that whole results
    /// can be compared with `assert_eq!`.
    type Outcome = Result<Option<FileTypeMatches>, ErrorKind>;

    #[test]
    fn test_zip_detect() {
        test_detect_match("files/hello.zip", FileType::Zip);
    }

    #[test]
    fn test_rar_detect() {
        test_detect_match("files/hello.rar", FileType::Rar5);
    }

    #[test]
    fn test_rar_sfx_detect() {
        test_detect_match("files/hello-world.exe", FileType::Rar5);
    }

    #[test]
    fn test_png_detect() {
        test_detect_match("files/rust-logo.png", FileType::Png);
    }

    #[test]
    fn test_jpg_detect() {
        test_detect_match("files/rust-logo.jpg", FileType::Jpg);
    }

    #[test]
    fn test_7z_detect() {
        test_detect_match("files/rust-logo.7z", FileType::_7z);
    }

    #[test]
    fn test_opus_detect() {
        test_detect_match("files/test-opus.opus", FileType::Opus);
    }

    #[test]
    fn test_vorbis_detect() {
        test_detect_match("files/test-vorbis.ogg", FileType::Vorbis);
    }

    #[test]
    fn test_mp3_detect() {
        test_detect_match("files/test-mp3.mp3", FileType::Mp3);
    }

    #[test]
    fn test_webp_detect() {
        test_detect_match("files/rust-logo.webp", FileType::Webp);
    }

    #[test]
    fn test_flac_detect() {
        test_detect_match("files/test-flac.flac", FileType::Flac);
    }

    #[test]
    fn test_wasm_detect() {
        test_detect_match("files/test-wasm.wasm", FileType::Wasm);
    }

    // False positive - how to avoid?
    #[test]
    fn test_flac_txt_detect() {
        test_detect_match("files/test-flac.txt", FileType::Flac);
    }

    #[test]
    fn test_mka_detect() {
        test_detect_match("files/test-mka.mka", FileType::Matroska);
    }

    #[test]
    fn test_txt_no_match() {
        test_detect_no_match("files/text");
    }

    #[test]
    fn test_tar_detect() {
        for path in ["files/hello.tar", "files/test.tar", "files/test-0.tar"] {
            test_detect_match(path, FileType::Tar);
        }
    }

    /// Opens `path` read-only behind a [BufReader]; panics when the file cannot be opened.
    fn open_buffered<P>(path: P) -> BufReader<File>
    where
        P: AsRef<Path>,
    {
        BufReader::new(OpenOptions::new().read(true).open(path).unwrap())
    }

    /// The result expected for a file that is detected as exactly one, perfectly matched, type.
    fn single_perfect_match(file_type: FileType) -> Outcome {
        Ok(Some(FileTypeMatches::new(
            vec![file_type],
            vec![FileTypeMatch::new(file_type, true)],
        )))
    }

    /// Runs the full start + end detection on the file at `path`.
    fn test_detect<P>(path: P) -> Result<Option<FileTypeMatches>, Error>
    where
        P: AsRef<Path>,
    {
        detect(open_buffered(path))
    }

    /// Runs only the backward sliding-window detection on the file at `path`.
    fn test_detect_sliding<P>(path: P) -> Result<Option<FileTypeMatches>, Error>
    where
        P: AsRef<Path>,
    {
        let mut reader = open_buffered(path);

        detect_at_end(&mut reader)
    }

    /// Asserts that full detection reports `file_type` as the single perfect match.
    fn test_detect_match<P>(path: P, file_type: FileType)
    where
        P: AsRef<Path>,
    {
        let detected: Outcome = test_detect(path).map_err(|e| e.kind());

        assert_eq!(detected, single_perfect_match(file_type));
    }

    /// Asserts that full detection reports no match at all.
    fn test_detect_no_match<P>(path: P)
    where
        P: AsRef<Path>,
    {
        let detected: Outcome = test_detect(path).map_err(|e| e.kind());
        let expected: Outcome = Ok(None);

        assert_eq!(detected, expected);
    }

    /// Asserts that end-only detection reports `file_type` as the single perfect match.
    fn test_detect_match_sliding<P>(path: P, file_type: FileType)
    where
        P: AsRef<Path>,
    {
        let detected: Outcome = test_detect_sliding(path).map_err(|e| e.kind());

        assert_eq!(detected, single_perfect_match(file_type));
    }

    #[bench]
    fn bench_multi_detect(b: &mut Bencher) {
        b.iter(|| {
            // Only the timing matters here; the detection results are deliberately discarded.
            let _ = detect_rar();
            let _ = detect_zip();
            let _ = detect_jar();
            let _ = detect_png();
        })
    }

    #[bench]
    fn bench_little_tar_detect(b: &mut Bencher) {
        b.iter(|| test_detect("files/test.tar"))
    }

    #[bench]
    fn bench_1mb_tar_detect(b: &mut Bencher) {
        b.iter(|| test_detect("files/test-0.tar"))
    }

    #[bench]
    fn bench_rar_sfx_detect(b: &mut Bencher) {
        b.iter(|| test_detect("files/hello-world.exe"))
    }

    /// Detects the sample RAR archive, keeping only success or failure.
    fn detect_rar() -> Result<(), std::io::Error> {
        test_detect("files/hello.rar").map(|_| ())
    }

    /// Detects the sample PNG image, keeping only success or failure.
    fn detect_png() -> Result<(), std::io::Error> {
        test_detect("files/rust-logo.png").map(|_| ())
    }

    /// Detects the sample ZIP archive, keeping only success or failure.
    fn detect_zip() -> Result<(), std::io::Error> {
        test_detect("files/hello.zip").map(|_| ())
    }

    /// Detects the sample JAR archive, keeping only success or failure.
    fn detect_jar() -> Result<(), std::io::Error> {
        test_detect("files/hello.jar").map(|_| ())
    }
}
