use crate::{InstructionSet, SIMD128, SIMD256};

#[cfg(target_arch = "x86")]
use core::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// Zero-sized token type representing the AVX2 instruction set.
///
/// The single unit field is private, so code outside this module cannot build
/// a value directly; within this file values are produced by `detect` and
/// `new_unchecked`.
#[derive(Debug, Clone, Copy)]
pub struct AVX2(());

/// Availability detection and construction of the [`AVX2`] token.
unsafe impl InstructionSet for AVX2 {
    /// Returns `Some(token)` when AVX2 can be used, `None` otherwise.
    ///
    /// Exactly one of the three `cfg`-gated bindings below survives compilation:
    /// * built with `target_feature = "avx2"`: always `Some`;
    /// * otherwise, with the `std` feature: the result of runtime CPU detection;
    /// * otherwise: always `None`, since no detection mechanism is available.
    #[inline(always)]
    fn detect() -> Option<Self> {
        #[cfg(target_feature = "avx2")]
        let token: Option<Self> = Some(Self(()));

        #[cfg(all(not(target_feature = "avx2"), feature = "std"))]
        let token: Option<Self> = if std::is_x86_feature_detected!("avx2") {
            Some(Self(()))
        } else {
            None
        };

        #[cfg(all(not(target_feature = "avx2"), not(feature = "std")))]
        let token: Option<Self> = None;

        token
    }

    /// Builds the token without performing any availability check.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably the caller must guarantee that the running CPU
    /// supports AVX2 — confirm against the `InstructionSet` trait documentation.
    #[inline(always)]
    unsafe fn new_unchecked() -> Self {
        Self(())
    }
}

/// 128-bit vector operations for [`AVX2`], built on SSE-family intrinsics over `__m128i`.
///
/// NOTE(review): the safe methods wrap intrinsics in `unsafe` blocks. This
/// presumably relies on an `AVX2` value only existing on CPUs that support
/// AVX2 — confirm against the `InstructionSet` / `SIMD128` trait contracts.
unsafe impl SIMD128 for AVX2 {
    type V128 = __m128i;

    /// Loads 16 bytes from `addr` using the aligned-load intrinsic.
    ///
    /// # Safety
    ///
    /// `addr` must be readable for 16 bytes and 16-byte aligned, as
    /// `_mm_load_si128` requires.
    #[inline(always)]
    unsafe fn v128_load(self, addr: *const u8) -> Self::V128 {
        let src = addr.cast::<__m128i>();
        _mm_load_si128(src)
    }

    /// Loads 16 bytes from `addr` with no alignment requirement.
    ///
    /// # Safety
    ///
    /// `addr` must be readable for 16 bytes.
    #[inline(always)]
    unsafe fn v128_loadu(self, addr: *const u8) -> Self::V128 {
        let src = addr.cast::<__m128i>();
        _mm_loadu_si128(src)
    }

    /// Stores the 16 bytes of `a` to `addr` with no alignment requirement.
    ///
    /// # Safety
    ///
    /// `addr` must be writable for 16 bytes.
    #[inline(always)]
    unsafe fn v128_storeu(self, addr: *mut u8, a: Self::V128) {
        let dst = addr.cast::<__m128i>();
        _mm_storeu_si128(dst, a)
    }

    /// Bitwise OR of the two vectors.
    #[inline(always)]
    fn v128_or(self, a: Self::V128, b: Self::V128) -> Self::V128 {
        unsafe { _mm_or_si128(a, b) }
    }

    /// Bitwise AND of the two vectors.
    #[inline(always)]
    fn v128_and(self, a: Self::V128, b: Self::V128) -> Self::V128 {
        unsafe { _mm_and_si128(a, b) }
    }

    /// Reinterprets the vector as its 16 raw bytes (in memory order).
    #[inline(always)]
    fn v128_to_bytes(self, a: Self::V128) -> [u8; 16] {
        // Both types are exactly 16 bytes, so the transmute is a plain bit copy.
        unsafe { core::mem::transmute::<__m128i, [u8; 16]>(a) }
    }

    /// Broadcasts the byte `x` to all 16 lanes.
    #[inline(always)]
    fn u8x16_splat(self, x: u8) -> Self::V128 {
        // The intrinsic takes `i8`; the cast only reinterprets the bit pattern.
        let lane = x as i8;
        unsafe { _mm_set1_epi8(lane) }
    }

    /// Byte shuffle of `a` controlled by `b` via `_mm_shuffle_epi8` (`pshufb`).
    #[inline(always)]
    fn i8x16_shuffle(self, a: Self::V128, b: Self::V128) -> Self::V128 {
        unsafe { _mm_shuffle_epi8(a, b) }
    }

    /// Shifts every 16-bit lane left by `IMM8` bits, filling with zeros.
    #[inline(always)]
    fn i16x8_sll<const IMM8: i32>(self, a: Self::V128) -> Self::V128 {
        unsafe { _mm_slli_epi16::<IMM8>(a) }
    }

    /// Logically shifts every 16-bit lane right by `IMM8` bits, filling with zeros.
    #[inline(always)]
    fn i16x8_srl<const IMM8: i32>(self, a: Self::V128) -> Self::V128 {
        unsafe { _mm_srli_epi16::<IMM8>(a) }
    }

    /// Extracts the 16-bit lane selected by `IMM3`.
    #[inline(always)]
    fn i16x8_extract<const IMM3: i32>(self, a: Self::V128) -> i16 {
        // The intrinsic returns the lane widened to `i32`; narrow it back.
        let widened: i32 = unsafe { _mm_extract_epi16::<IMM3>(a) };
        widened as i16
    }

    /// Extracts the 32-bit lane selected by `IMM2`.
    #[inline(always)]
    fn i32x4_extract<const IMM2: i32>(self, a: Self::V128) -> i32 {
        unsafe { _mm_extract_epi32::<IMM2>(a) }
    }
}

/// 256-bit vector operations for [`AVX2`], built on AVX/AVX2 intrinsics over `__m256i`.
///
/// NOTE(review): the safe methods wrap intrinsics in `unsafe` blocks. This
/// presumably relies on an `AVX2` value only existing on CPUs that support
/// AVX2 — confirm against the `InstructionSet` / `SIMD256` trait contracts.
unsafe impl SIMD256 for AVX2 {
    type V256 = __m256i;

    /// Loads 32 bytes from `addr` using the aligned-load intrinsic.
    ///
    /// # Safety
    ///
    /// `addr` must be readable for 32 bytes and 32-byte aligned, as
    /// `_mm256_load_si256` requires.
    #[inline(always)]
    unsafe fn v256_load(self, addr: *const u8) -> Self::V256 {
        let src = addr.cast::<__m256i>();
        _mm256_load_si256(src)
    }

    /// Loads 32 bytes from `addr` with no alignment requirement.
    ///
    /// # Safety
    ///
    /// `addr` must be readable for 32 bytes.
    #[inline(always)]
    unsafe fn v256_loadu(self, addr: *const u8) -> Self::V256 {
        let src = addr.cast::<__m256i>();
        _mm256_loadu_si256(src)
    }

    /// Stores the 32 bytes of `a` to `addr` with no alignment requirement.
    ///
    /// # Safety
    ///
    /// `addr` must be writable for 32 bytes.
    #[inline(always)]
    unsafe fn v256_storeu(self, addr: *mut u8, a: Self::V256) {
        let dst = addr.cast::<__m256i>();
        _mm256_storeu_si256(dst, a)
    }

    /// Bitwise OR of the two vectors.
    #[inline(always)]
    fn v256_or(self, a: Self::V256, b: Self::V256) -> Self::V256 {
        unsafe { _mm256_or_si256(a, b) }
    }

    /// Bitwise AND of the two vectors.
    #[inline(always)]
    fn v256_and(self, a: Self::V256, b: Self::V256) -> Self::V256 {
        unsafe { _mm256_and_si256(a, b) }
    }

    /// Reinterprets the vector as its 32 raw bytes (in memory order).
    #[inline(always)]
    fn v256_to_bytes(self, a: Self::V256) -> [u8; 32] {
        // Both types are exactly 32 bytes, so the transmute is a plain bit copy.
        unsafe { core::mem::transmute::<__m256i, [u8; 32]>(a) }
    }

    /// Returns a vector with every bit cleared.
    #[inline(always)]
    fn v256_zero(self) -> Self::V256 {
        unsafe { _mm256_setzero_si256() }
    }

    /// Returns the low 128 bits of `a`.
    #[inline(always)]
    fn v128_from_low_v256(self, a: Self::V256) -> Self::V128 {
        unsafe { _mm256_castsi256_si128(a) }
    }

    /// Returns the high 128 bits of `a`.
    #[inline(always)]
    fn v128_from_high_v256(self, a: Self::V256) -> Self::V128 {
        // Index 1 selects the upper 128-bit half.
        unsafe { _mm256_extracti128_si256::<1>(a) }
    }

    /// Broadcasts the byte `x` to all 32 lanes.
    #[inline(always)]
    fn u8x32_splat(self, x: u8) -> Self::V256 {
        // The intrinsic takes `i8`; the cast only reinterprets the bit pattern.
        let lane = x as i8;
        unsafe { _mm256_set1_epi8(lane) }
    }

    /// Lane-wise wrapping addition of 8-bit lanes.
    #[inline(always)]
    fn u8x32_add(self, a: Self::V256, b: Self::V256) -> Self::V256 {
        unsafe { _mm256_add_epi8(a, b) }
    }

    /// Lane-wise wrapping subtraction (`a - b`) of 8-bit lanes.
    #[inline(always)]
    fn u8x32_sub(self, a: Self::V256, b: Self::V256) -> Self::V256 {
        unsafe { _mm256_sub_epi8(a, b) }
    }

    /// Byte shuffle of `a` controlled by `b` via `_mm256_shuffle_epi8`
    /// (`vpshufb`, which shuffles within each 128-bit half independently).
    #[inline(always)]
    fn u8x32_shuffle(self, a: Self::V256, b: Self::V256) -> Self::V256 {
        unsafe { _mm256_shuffle_epi8(a, b) }
    }

    /// Returns `true` if at least one of the 32 bytes of `a` equals zero.
    #[inline(always)]
    fn u8x32_any_zero(self, a: Self::V256) -> bool {
        unsafe {
            // Lanes equal to zero become 0xFF; their top bits end up in the mask.
            let zero_lanes = _mm256_cmpeq_epi8(a, _mm256_setzero_si256());
            let mask = _mm256_movemask_epi8(zero_lanes);
            mask != 0
        }
    }

    /// Broadcasts the signed byte `x` to all 32 lanes.
    #[inline(always)]
    fn i8x32_splat(self, x: i8) -> Self::V256 {
        unsafe { _mm256_set1_epi8(x) }
    }

    /// Signed lane-wise `a < b` on 8-bit lanes; true lanes are all ones, false lanes zero.
    #[inline(always)]
    fn i8x32_cmplt(self, a: Self::V256, b: Self::V256) -> Self::V256 {
        // AVX2 only provides "greater than", so `a < b` is computed as `b > a`.
        unsafe { _mm256_cmpgt_epi8(b, a) }
    }

    /// Shifts every 16-bit lane left by `IMM8` bits, filling with zeros.
    #[inline(always)]
    fn u16x16_sll<const IMM8: i32>(self, a: Self::V256) -> Self::V256 {
        unsafe { _mm256_slli_epi16::<IMM8>(a) }
    }

    /// Logically shifts every 16-bit lane right by `IMM8` bits, filling with zeros.
    #[inline(always)]
    fn u16x16_srl<const IMM8: i32>(self, a: Self::V256) -> Self::V256 {
        unsafe { _mm256_srli_epi16::<IMM8>(a) }
    }

    /// Packs the low 64 bits of each 128-bit half of `a` into one 128-bit vector.
    #[inline(always)]
    fn u64x2_from_low_u128x2(self, a: Self::V256) -> Self::V128 {
        // Selector bits [1:0] = 0 and [3:2] = 2: output qword 0 <- input qword 0,
        // output qword 1 <- input qword 2. The upper two output qwords are
        // discarded by the cast to 128 bits.
        const LOW_QWORD_OF_EACH_HALF: i32 = 0b_0000_1000;
        unsafe {
            let gathered = _mm256_permute4x64_epi64::<LOW_QWORD_OF_EACH_HALF>(a);
            _mm256_castsi256_si128(gathered)
        }
    }

    /// Zero-extends each of the 16 bytes of `a` into a 16-bit lane.
    #[inline(always)]
    fn i16x16_from_u8x16(self, a: Self::V128) -> Self::V256 {
        unsafe { _mm256_cvtepu8_epi16(a) }
    }

    /// Collects the most significant bit of each of the 32 bytes into a `u32`
    /// (bit `i` comes from byte `i`).
    #[inline(always)]
    fn u8x32_highest_bits(self, a: Self::V256) -> u32 {
        let mask: i32 = unsafe { _mm256_movemask_epi8(a) };
        mask as u32
    }
}
