#![allow(clippy::identity_op)]

/// A borrowed view of raw, tightly packed pixel data.
///
/// `bytes` holds the pixels row by row; how many bytes make up one pixel is
/// not stored here but supplied through the `IN_CHANNEL_COUNT_*` const
/// generics of the search functions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Image<'a> {
    /// Raw pixel bytes.
    pub bytes: &'a [u8],
    /// Width of the image in pixels.
    pub width: usize,
    /// Height of the image in pixels.
    pub height: usize,
}
/// Reusable state for sub-image searches.
///
/// Holds the configured backend, the pruning parameters and the scratch
/// buffers that the search functions refill on every call.
pub struct SubImageFinderState {
    // Results of the most recent search as (x, y, distance) tuples.
    positions_buffer: Vec<(usize, usize, f32)>,
    // f32 copies of the (optionally grayscaled) images, filled by the simdeez backend.
    f32buf_search_image: Vec<f32>,
    f32buf_subimage: Vec<f32>,
    // Grayscale u8 copies of the images, filled by the scalar and OpenCV backends.
    u8buf_search_image: Vec<u8>,
    u8buf_subimage: Vec<u8>,
    // Backend used by `find_subimage_positions`.
    backend: Backend,
    // Fractions of the subimage width/height used by `prune_nearby_results`
    // to decide whether two results are too close together.
    prune_width_scale: f32,
    prune_height_scale: f32,
}

/// The backend/algorithm to use.
///
/// There is an optional opencv backend, that uses the opencv-rust crate which
/// depends on the OpenCV C++ library. This requires enabling the opencv feature in
/// find-subimage.
///
/// There is another simdeez optional dependency, which uses the simdeez crate for
/// a rust SIMD implementation. This is enabled by default.
///
/// The only implementation which cannot be disabled at present is the scalar one.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Backend {
    /// OpenCV SQDIFF_NORMED MatchTemplate.
    ///
    /// Note that the threshold values for this backend use a different scale
    /// than the others.
    #[cfg(feature = "opencv")]
    OpenCV {
        /// Positions whose match value is below this are reported.
        threshold: f32,
    },
    /// This should detect CPU features at runtime and use the
    /// best possible rust SIMD implementation of SQDIFF_NORMED (square difference).
    ///
    /// `step_x` and `step_y` let you skip all but every Nth x or y coordinate
    /// in case you need less accurate results, potentially giving large speedups.
    #[cfg(feature = "simdeez")]
    RuntimeDetectedSimd {
        /// Positions whose distance is below this are reported.
        threshold: f32,
        /// Stride between tested x coordinates; 1 tests every column.
        /// A step of 0 is not meaningful.
        step_x: usize,
        /// Stride between tested y coordinates; 1 tests every row.
        /// A step of 0 is not meaningful.
        step_y: usize,
    },
    /// Scalar SQDIFF_NORMED (square difference) implementation.
    ///
    /// Slowest, should work anywhere and be reliable.
    /// Smallest in terms of generated code size.
    ///
    /// `step_x` and `step_y` let you skip all but every Nth x or y coordinate
    /// in case you need less accurate results, potentially giving large speedups.
    Scalar {
        /// Positions whose distance is below this are reported.
        threshold: f32,
        /// Stride between tested x coordinates; 1 tests every column.
        /// A step of 0 is not meaningful.
        step_x: usize,
        /// Stride between tested y coordinates; 1 tests every row.
        /// A step of 0 is not meaningful.
        step_y: usize,
    },
}
/// Threshold used by `SubImageFinderState::new_opencv` when none is given.
pub const OPENCV_DEFAULT_THRESHOLD: f32 = 0.05;
/// Threshold of the default (non-OpenCV) backend created by `SubImageFinderState::new`.
pub const NONOPENCV_DEFAULT_THRESHOLD: f32 = 0.35;

impl SubImageFinderState {
    /// Create a SubImageFinderState with empty buffers and default settings.
    ///
    /// The backend is `Scalar`, unless the "simdeez-default-new" feature is
    /// enabled (it currently is by default), in which case it is
    /// `RuntimeDetectedSimd`. Either way it uses `NONOPENCV_DEFAULT_THRESHOLD`
    /// and a step of 1 on both axes. Both pruning scales start at 0.5.
    ///
    /// See the `backend` and `with_backend` methods to change the backend.
    pub fn new() -> Self {
        // Both candidate defaults share their parameters; only the variant differs.
        let threshold = NONOPENCV_DEFAULT_THRESHOLD;
        let (step_x, step_y) = (1, 1);

        #[cfg(feature = "simdeez-default-new")]
        let backend = Backend::RuntimeDetectedSimd {
            threshold,
            step_x,
            step_y,
        };
        #[cfg(not(feature = "simdeez-default-new"))]
        let backend = Backend::Scalar {
            threshold,
            step_x,
            step_y,
        };

        Self {
            backend,
            prune_width_scale: 0.5,
            prune_height_scale: 0.5,
            positions_buffer: Vec::new(),
            f32buf_search_image: Vec::new(),
            f32buf_subimage: Vec::new(),
            u8buf_search_image: Vec::new(),
            u8buf_subimage: Vec::new(),
        }
    }

    /// Create a SubImageFinderState that uses the OpenCV backend.
    ///
    /// `threshold` falls back to `OPENCV_DEFAULT_THRESHOLD` when it is `None`.
    /// Everything else is set up as in `new`.
    #[cfg(feature = "opencv")]
    pub fn new_opencv(threshold: Option<f32>) -> Self {
        let threshold = threshold.unwrap_or(OPENCV_DEFAULT_THRESHOLD);
        Self {
            backend: Backend::OpenCV { threshold },
            ..Self::new()
        }
    }

    /// Set the currently configured backend.
    ///
    /// Despite the getter-like name this is a setter: it replaces the backend
    /// that `find_subimage_positions` will use from now on.
    pub fn backend(&mut self, new_backend: Backend) {
        self.backend = new_backend;
    }

    /// Set the prune width/height scaling.
    ///
    /// The scales are multiplied by the subimage width/height to obtain the
    /// distance below which two results count as "too close" when pruning.
    /// Both default to 0.5.
    pub fn pruning(&mut self, prune_width_scale: f32, prune_height_scale: f32) {
        self.prune_width_scale = prune_width_scale;
        self.prune_height_scale = prune_height_scale;
    }

    /// Return a new state with the given backend
    /// ```
    /// use find_subimage::{SubImageFinderState, Backend};
    /// let state = SubImageFinderState::new().with_backend(Backend::Scalar {threshold: 0.5, step_x:2, step_y:1});
    /// ```
    #[must_use]
    pub fn with_backend(mut self, new_backend: Backend) -> Self {
        self.backend(new_backend);
        self
    }

    /// Return a new state with the given pruning parameters
    #[must_use]
    pub fn with_pruning(mut self, prune_width_scale: f32, prune_height_scale: f32) -> Self {
        self.pruning(prune_width_scale, prune_height_scale);
        self
    }

    /// Finds positions where the subimage is found within the search image.
    /// These positions represent the top-right corner of the subimage.
    /// You can tweak the likelyhood of positions found with the backend's threshold.
    /// Note that the threshold is backend-dependant.
    ///
    /// The IN_CHANNEL_COUNT const generic parameter should be the number of channels
    /// in the input image (For example, 3 for an RGB image or 4 for RGBA).
    ///
    /// The input image can optionally be converted to grayscale before applying the
    /// algorithm using the TO_GRAYSCALE const generic parameter , which can improve performance.
    ///
    /// The third field of the tuples is the matching/distance value. Values closer to 1 mean
    /// a fuzzier match, and closer to 0 a more exact match.
    pub fn find_subimage_positions<
        const IN_CHANNEL_COUNT_SEARCH_IMAGE: usize,
        const SEARCH_IMAGE_TO_GRAYSCALE: bool,
        const IN_CHANNEL_COUNT_SUBIMAGE: usize,
        const SUBIMAGE_TO_GRAYSCALE: bool,
    >(
        &mut self,
        search_image: Image,
        subimage: Image,
    ) -> &[(usize, usize, f32)] {
        let backend = self.backend;
        Self::find_subimage_positions_with_backend::<
            IN_CHANNEL_COUNT_SEARCH_IMAGE,
            SEARCH_IMAGE_TO_GRAYSCALE,
            IN_CHANNEL_COUNT_SUBIMAGE,
            SUBIMAGE_TO_GRAYSCALE,
        >(self, search_image, subimage, &backend)
    }

    /// Like `find_subimage_positions` but lets you use a different backend
    /// than the currently configured one.
    ///
    /// The previous results are discarded, the search is run with `backend`,
    /// nearby results are pruned and the remaining `(x, y, distance)` tuples
    /// are returned (they also stay available via `most_recent_results`).
    ///
    /// An empty subimage, or one that is wider or taller than the search
    /// image, yields no results. A backend step of 0 is treated as 1.
    ///
    /// # Panics
    ///
    /// * With the `RuntimeDetectedSimd` backend, if the two images do not have
    ///   the same number of channels after the optional grayscale conversion.
    /// * If an image's `bytes` are shorter than its `width`, `height` and
    ///   channel count imply.
    /// * If a grayscale conversion is requested for an image whose channel
    ///   count is 0.
    pub fn find_subimage_positions_with_backend<
        const IN_CHANNEL_COUNT_SEARCH_IMAGE: usize,
        const SEARCH_IMAGE_TO_GRAYSCALE: bool,
        const IN_CHANNEL_COUNT_SUBIMAGE: usize,
        const SUBIMAGE_TO_GRAYSCALE: bool,
    >(
        &mut self,
        search_image: Image,
        subimage: Image,
        backend: &Backend,
    ) -> &[(usize, usize, f32)] {
        self.positions_buffer.clear();

        let Image {
            bytes: search_image,
            width: search_width,
            height: search_height,
        } = search_image;
        let Image {
            bytes: subimage,
            width: subimage_width,
            height: subimage_height,
        } = subimage;

        // An empty subimage has no meaningful distance (it would be 0 / 0).
        if subimage_width == 0 || subimage_height == 0 {
            return &self.positions_buffer;
        }
        // A subimage larger than the search image cannot match anywhere.
        // `checked_sub` avoids the usize underflow a plain `-` would cause.
        let (x_end, y_end) = match (
            search_width.checked_sub(subimage_width),
            search_height.checked_sub(subimage_height),
        ) {
            (Some(x_end), Some(y_end)) => (x_end, y_end),
            _ => return &self.positions_buffer,
        };

        // Channel counts of the buffers actually handed to the distance
        // functions: a grayscale conversion collapses every pixel to one value.
        let search_channels = if SEARCH_IMAGE_TO_GRAYSCALE {
            1
        } else {
            IN_CHANNEL_COUNT_SEARCH_IMAGE
        };
        let subimage_channels = if SUBIMAGE_TO_GRAYSCALE {
            1
        } else {
            IN_CHANNEL_COUNT_SUBIMAGE
        };

        // Average of all channels of one pixel. `chunks_exact` guarantees that
        // `pixel.len()` equals the image's channel count.
        let to_gray = |pixel: &[u8]| {
            let channels = pixel.len() as f32;
            pixel
                .iter()
                .map(|&value| f32::from(value) / channels)
                .sum::<f32>() as u8
        };

        match *backend {
            #[cfg(feature = "simdeez")]
            Backend::RuntimeDetectedSimd {
                threshold,
                step_x,
                step_y,
            } => {
                // The SIMD kernel compares flat rows of floats, which is only
                // meaningful (and in bounds) when both rows use the same layout.
                assert_eq!(
                    search_channels, subimage_channels,
                    "the SIMD backend needs both images to have the same number of channels \
                     after the optional grayscale conversion"
                );

                self.f32buf_subimage.clear();
                if SUBIMAGE_TO_GRAYSCALE {
                    self.f32buf_subimage.extend(
                        subimage
                            .chunks_exact(IN_CHANNEL_COUNT_SUBIMAGE)
                            .map(to_gray)
                            .map(f32::from),
                    );
                } else {
                    self.f32buf_subimage
                        .extend(subimage.iter().copied().map(f32::from));
                }

                self.f32buf_search_image.clear();
                if SEARCH_IMAGE_TO_GRAYSCALE {
                    self.f32buf_search_image.extend(
                        search_image
                            .chunks_exact(IN_CHANNEL_COUNT_SEARCH_IMAGE)
                            .map(to_gray)
                            .map(f32::from),
                    );
                } else {
                    self.f32buf_search_image
                        .extend(search_image.iter().copied().map(f32::from));
                }

                // Row lengths measured in floats rather than pixels.
                let search_row_len = search_width * search_channels;
                let subimage_row_len = subimage_width * subimage_channels;

                // The variant without remainder handling is only correct when a
                // row is a whole number of SIMD vectors.
                let simdeez_width = simdeez_width_runtime_select();
                let dist_function = if subimage_row_len % simdeez_width == 0 {
                    image_dist_simdeez_runtime_select
                } else {
                    image_dist_simdeez_with_remainder_runtime_select
                };

                // NOTE(review): the exclusive ranges skip the last valid row and
                // column (x == x_end, y == y_end). They are kept because the
                // distance functions slice `h` full search rows starting at the
                // offset, which does not fit for the bottom row.
                for y in (0..y_end).step_by(step_y.max(1)) {
                    for x in (0..x_end).step_by(step_x.max(1)) {
                        let dist = dist_function(
                            x * search_channels,
                            y,
                            &self.f32buf_search_image,
                            search_row_len,
                            &self.f32buf_subimage,
                            subimage_row_len,
                            subimage_height,
                        );
                        if dist < threshold {
                            self.positions_buffer.push((x, y, dist));
                        }
                    }
                }
            }
            Backend::Scalar {
                threshold,
                step_x,
                step_y,
            } => {
                let subimage_bytes: &[u8] = if SUBIMAGE_TO_GRAYSCALE {
                    self.u8buf_subimage.clear();
                    self.u8buf_subimage.extend(
                        subimage
                            .chunks_exact(IN_CHANNEL_COUNT_SUBIMAGE)
                            .map(to_gray),
                    );
                    &self.u8buf_subimage
                } else {
                    subimage
                };

                let search_bytes: &[u8] = if SEARCH_IMAGE_TO_GRAYSCALE {
                    self.u8buf_search_image.clear();
                    self.u8buf_search_image.extend(
                        search_image
                            .chunks_exact(IN_CHANNEL_COUNT_SEARCH_IMAGE)
                            .map(to_gray),
                    );
                    &self.u8buf_search_image
                } else {
                    search_image
                };

                // NOTE(review): exclusive ranges, see the SIMD branch.
                for y in (0..y_end).step_by(step_y.max(1)) {
                    for x in (0..x_end).step_by(step_x.max(1)) {
                        let dist = image_dist_naive(
                            (x, y),
                            (search_bytes, search_width),
                            (subimage_bytes, subimage_width, subimage_height),
                            // Describe the buffers as they are now, not as they
                            // were before a grayscale conversion.
                            (search_channels, subimage_channels),
                        );
                        if dist < threshold {
                            self.positions_buffer.push((x, y, dist));
                        }
                    }
                }
            }
            #[cfg(feature = "opencv")]
            Backend::OpenCV { threshold } => {
                //TODO: Handle non grayscale
                self.u8buf_subimage.clear();
                self.u8buf_subimage.extend(
                    subimage
                        .chunks_exact(IN_CHANNEL_COUNT_SUBIMAGE)
                        .map(to_gray),
                );

                self.u8buf_search_image.clear();
                self.u8buf_search_image.extend(
                    search_image
                        .chunks_exact(IN_CHANNEL_COUNT_SEARCH_IMAGE)
                        .map(to_gray),
                );

                // OpenCV reads rows * cols bytes through raw pointers, so make
                // sure the buffers really are that large.
                assert!(
                    self.u8buf_search_image.len() >= search_width * search_height,
                    "search image has fewer pixels than width * height"
                );
                assert!(
                    self.u8buf_subimage.len() >= subimage_width * subimage_height,
                    "subimage has fewer pixels than width * height"
                );

                // SAFETY: both Mats wrap buffers owned by `self` that hold at
                // least rows * cols single-byte elements (asserted above) and
                // are neither resized nor dropped while the Mats are alive.
                unsafe {
                    let mut out_mat = opencv::core::Mat::default();
                    opencv::imgproc::match_template(
                        &opencv::core::Mat::new_rows_cols_with_data(
                            search_height as i32,
                            search_width as i32,
                            opencv::core::CV_8UC1,
                            self.u8buf_search_image.as_mut_ptr() as *mut _,
                            0,
                        )
                        .unwrap(),
                        &opencv::core::Mat::new_rows_cols_with_data(
                            subimage_height as i32,
                            subimage_width as i32,
                            opencv::core::CV_8UC1,
                            self.u8buf_subimage.as_mut_ptr() as *mut _,
                            0,
                        )
                        .unwrap(),
                        &mut out_mat,
                        opencv::imgproc::TM_SQDIFF_NORMED,
                        &opencv::core::no_array(),
                    )
                    .unwrap();

                    for (opencv::core::Point_ { x, y }, val) in out_mat.iter().unwrap() {
                        let val: f32 = val; // To help inference

                        // Honour the threshold configured on the backend.
                        if val < threshold {
                            self.positions_buffer.push((x as usize, y as usize, val));
                        }
                    }
                }
            }
        }

        self.prune_nearby_results(subimage_width, subimage_height);

        &self.positions_buffer
    }

    /// Drop results that lie too close to a better one.
    ///
    /// Results are sorted by ascending distance. Then, going from the best
    /// result to the worst, every other result within
    /// `subimage_width * prune_width_scale` horizontally *and*
    /// `subimage_height * prune_height_scale` vertically of the current one is
    /// removed. Results at exactly the same position are kept.
    fn prune_nearby_results(&mut self, subimage_width: usize, subimage_height: usize) {
        let max_dx = (subimage_width as f32 * self.prune_width_scale) as isize;
        let max_dy = (subimage_height as f32 * self.prune_height_scale) as isize;

        let positions = &mut self.positions_buffer;
        positions.sort_unstable_by(|lhs, rhs| lhs.2.partial_cmp(&rhs.2).unwrap());

        // Everything before `anchor_idx` has already survived pruning against
        // all better results, so its index no longer changes.
        let mut anchor_idx = 0;
        while let Some(&(anchor_x, anchor_y, _)) = positions.get(anchor_idx) {
            positions.retain(|&(x, y, _)| {
                let dx = (x as isize - anchor_x as isize).abs();
                let dy = (y as isize - anchor_y as isize).abs();
                let same_spot = dx == 0 && dy == 0;
                let too_close = dx <= max_dx && dy <= max_dy;
                same_spot || !too_close
            });
            anchor_idx += 1;
        }
    }

    /// The `(x, y, distance)` results currently stored in the state, i.e.
    /// those left by the most recent search.
    pub fn most_recent_results(&self) -> &[(usize, usize, f32)] {
        self.positions_buffer.as_slice()
    }

    /// Mutable access to the stored `(x, y, distance)` results, e.g. to
    /// re-sort them in place. The next search overwrites them.
    pub fn most_recent_results_mut(&mut self) -> &mut [(usize, usize, f32)] {
        self.positions_buffer.as_mut_slice()
    }
}

// I looked into std portable-simd, but doing runtime detection with it seems far more complicated than the handy simdeez macro.
// I'm pretty sure simdeez has UB in it, though.
// I may add a StaticTargetCpuSimd backend (or similar) without runtime detection that expects users to compile with appropriate
// target-cpu flags and uses portable-simd.

#[cfg(feature = "simdeez")]
use simdeez::*;
#[cfg(feature = "simdeez")]
use simdeez::{avx2::*, scalar::*, sse2::*, sse41::*};
// Returns `S::VF32_WIDTH` for the SIMD implementation `S` picked by simdeez
// (presumably the number of f32 lanes in one vector; the distance kernel
// divides a row length by it to get the number of SIMD iterations).
// The search code calls this through the name `simdeez_width_runtime_select`.
#[cfg(feature = "simdeez")]
simd_runtime_generate!(
    fn simdeez_width() -> usize {
        S::VF32_WIDTH
    }
);

// Generates, through `simd_runtime_generate!`, a square-difference distance
// function named `$fn_name`.
//
// The generated function compares `subimage` (rows of `w` floats, `h` rows)
// with the window of `search_img` (rows of `search_w` floats) that starts
// `x_offset` floats into row `y_offset`. It returns
// `sqrt(sum_of_squared_differences / w / h) / 255`.
//
// `$with_remainder` selects whether the `w % S::VF32_WIDTH` trailing floats of
// each row are handled with scalar code. Passing `false` skips them, so that
// variant is only correct when `w` is a multiple of `S::VF32_WIDTH`.
macro_rules! make_simdeez_fn {
    ($with_remainder: expr, $fn_name: ident) => {
        #[cfg(feature = "simdeez")]
        simd_runtime_generate!(
            fn $fn_name(
                x_offset: usize,
                y_offset: usize,
                search_img: &[f32],
                search_w: usize,
                subimage: &[f32],
                w: usize,
                h: usize,
            ) -> f32 {
                // Slicing helpers: bounds-checked with the "checked-simdeez"
                // feature, unchecked otherwise.
                #[cfg(not(feature = "checked-simdeez"))]
                let slice: fn(&[f32], _) -> &[f32] = |x, range| x.get_unchecked(range);
                #[cfg(feature = "checked-simdeez")]
                let slice: fn(&[f32], _) -> &[f32] = |x, range| &x[range];
                #[cfg(not(feature = "checked-simdeez"))]
                let slice_elem: fn(&[f32], _) -> &f32 = |x, idx| x.get_unchecked(idx);
                #[cfg(feature = "checked-simdeez")]
                let slice_elem: fn(&[f32], _) -> &f32 = |x, idx| &x[idx];

                // These 3 lines should do all the bounds checking we need
                // We use get_unchecked below
                // NOTE(review): the unchecked accesses below stay inside these
                // slices only if `w <= search_w` - confirm callers guarantee it.
                let subimage = &subimage[..(w * h)];

                let search_img = &search_img[(x_offset + y_offset * search_w)..];
                let search_img = &search_img[..(h * search_w)];

                // [0.0; S::VF32_WIDTH] gave me a const generics error
                // In my case it's 8, 32 should be plenty conservative
                let zeroes = [0.0; 32];
                // Vector accumulator (starts at zero) plus a scalar one for the
                // per-row remainder.
                let mut res_simd = S::loadu_ps(&zeroes[0]);
                let mut res_scalar = 0.0f32;

                let simd_iters_per_row = w / S::VF32_WIDTH;
                let scalar_iters_per_row = w % S::VF32_WIDTH;

                for y in 0..h {
                    // Start of row `y` in each (already offset) buffer.
                    let row_sub = (y * w) as usize;
                    let row_search = (y * search_w) as usize;

                    let mut subimage = slice(subimage, row_sub..);
                    let mut search_img = slice(search_img, row_search..);

                    for _ in 0..simd_iters_per_row {
                        let search = S::loadu_ps(slice_elem(search_img, 0));
                        let sub = S::loadu_ps(slice_elem(subimage, 0));

                        let diff = S::sub_ps(sub, search);
                        let square = S::mul_ps(diff, diff);

                        res_simd = S::add_ps(res_simd, square);

                        // Advance both row cursors by one vector.
                        subimage = slice(subimage, S::VF32_WIDTH..);
                        search_img = slice(search_img, S::VF32_WIDTH..);
                    }

                    if $with_remainder {
                        // Floats of this row that did not fill a whole vector.
                        for i in 0..scalar_iters_per_row {
                            let search = slice_elem(search_img, i);
                            let sub = slice_elem(subimage, i);

                            let diff = sub - search;
                            let square = diff * diff;
                            res_scalar += square;
                        }
                    }
                }

                let res = S::horizontal_add_ps(res_simd) + res_scalar;

                //res.sqrt() / w as f32 / h as f32
                //res / (255.0 * 255.0) / w as f32 / h as f32
                (res / w as f32 / h as f32).sqrt() / 255.0
                //'res.sqrt() / ((w as f32 * h as f32).sqrt() * 255.0)
            }
        );
    };
}
// The search code refers to these as `image_dist_simdeez_with_remainder_runtime_select`
// and `image_dist_simdeez_runtime_select` (suffix presumably added by simdeez).
make_simdeez_fn!(true, image_dist_simdeez_with_remainder);
make_simdeez_fn!(false, image_dist_simdeez);

/// Scalar square-difference distance between `subimage` and the equally sized
/// window of `search_img` whose first pixel is at (`x_offset`, `y_offset`).
///
/// * `search_img` / `search_w`: pixel bytes and width (in pixels) of the search image.
/// * `subimage` / `w` / `h`: pixel bytes, width and height (in pixels) of the subimage.
/// * `channels_search` / `channels_sub`: bytes per pixel of the respective buffer.
///
/// Every channel that both images have is compared; if the channel counts
/// differ, the extra trailing channels of the wider pixel format are ignored.
///
/// Returns `sqrt(sum of squared differences) / w / h`, which is `NaN` when
/// `w` or `h` is 0.
///
/// # Panics
///
/// Panics if either buffer is too short for the given dimensions and offset.
fn image_dist_naive(
    (x_offset, y_offset): (usize, usize),
    (search_img, search_w): (&[u8], usize),
    (subimage, w, h): (&[u8], usize, usize),
    (channels_search, channels_sub): (usize, usize),
) -> f32 {
    // Only the channels present in both pixel formats can be compared.
    let channels = channels_search.min(channels_sub);

    // Slice both buffers down to exactly what the loops read. This does the
    // bounds checking up front and makes all indices below window-relative.
    let subimage = &subimage[..w * h * channels_sub];

    let search_row_len = search_w * channels_search;
    let window_start = x_offset * channels_search + y_offset * search_row_len;
    // The window ends with the last pixel of its last row, not with a full row.
    let window_len = if w == 0 || h == 0 {
        0
    } else {
        (h - 1) * search_row_len + w * channels_search
    };
    let search_img = &search_img[window_start..][..window_len];

    let calc_dist = |a: u8, b: u8| {
        let diff = f32::from(a) - f32::from(b);
        diff * diff
    };

    let mut dist = 0.0f32;
    for y in 0..h {
        for x in 0..w {
            let pos_sub = (y * w + x) * channels_sub;
            // `search_img` already starts at the window, so the offset must
            // not be added a second time here.
            let pos_search = y * search_row_len + x * channels_search;

            let sub_pixel = &subimage[pos_sub..pos_sub + channels];
            let search_pixel = &search_img[pos_search..pos_search + channels];
            for (&a, &b) in sub_pixel.iter().zip(search_pixel) {
                dist += calc_dist(a, b);
            }
        }
    }
    dist.sqrt() / w as f32 / h as f32
}

/// `default()` is the same as [`SubImageFinderState::new`].
impl Default for SubImageFinderState {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {

    // TODO: Add tests and benchmarks to this crate (so far they have lived in a separate binary crate that uses this one as a path dependency)
}
