use std::cmp::Ordering;

#[inline(always)]
pub fn _memcmp_impl(a: &[u8], b: &[u8]) -> Ordering {
    #[cfg(target_pointer_width = "128")]
    let r = _start_cmp_128(a, b);
    #[cfg(target_pointer_width = "64")]
    let r = _start_cmp_64(a, b);
    #[cfg(target_pointer_width = "32")]
    let r = _start_cmp_32(a, b);
    #[cfg(target_pointer_width = "16")]
    let r = _start_cmp_16(a, b);
    //
    r
}

macro_rules! _unroll_one_cmp_16 {
    ($a_ptr:expr, $b_ptr:expr, $loop_size:expr, $idx:expr) => {{
        let aa_ptr = unsafe { $a_ptr.add($loop_size * $idx) };
        let bb_ptr = unsafe { $b_ptr.add($loop_size * $idx) };
        //
        let aac = unsafe { *(aa_ptr as *const u128) };
        let bbc = unsafe { *(bb_ptr as *const u128) };
        if aac != bbc {
            return _cmp_bytes_16(aa_ptr, bb_ptr);
        }
    }};
}

macro_rules! _unroll_one_cmp_8 {
    ($a_ptr:expr, $b_ptr:expr, $loop_size:expr, $idx:expr) => {{
        let aa_ptr = unsafe { $a_ptr.add($loop_size * $idx) };
        let bb_ptr = unsafe { $b_ptr.add($loop_size * $idx) };
        //
        let aac = unsafe { *(aa_ptr as *const u64) };
        let bbc = unsafe { *(bb_ptr as *const u64) };
        if aac != bbc {
            return _cmp_bytes_8(aa_ptr, bb_ptr);
        }
    }};
}

macro_rules! _unroll_one_cmp_4 {
    ($a_ptr:expr, $b_ptr:expr, $loop_size:expr, $idx:expr) => {{
        let aa_ptr = unsafe { $a_ptr.add($loop_size * $idx) };
        let bb_ptr = unsafe { $b_ptr.add($loop_size * $idx) };
        //
        let aac = unsafe { *(aa_ptr as *const u32) };
        let bbc = unsafe { *(bb_ptr as *const u32) };
        if aac != bbc {
            return _cmp_bytes_4(aa_ptr, bb_ptr);
        }
    }};
}

macro_rules! _unroll_one_cmp_2 {
    ($a_ptr:expr, $b_ptr:expr, $loop_size:expr, $idx:expr) => {{
        let aa_ptr = unsafe { $a_ptr.add($loop_size * $idx) };
        let bb_ptr = unsafe { $b_ptr.add($loop_size * $idx) };
        //
        let aac = unsafe { *(aa_ptr as *const u16) };
        let bbc = unsafe { *(bb_ptr as *const u16) };
        if aac != bbc {
            return _cmp_bytes_2(aa_ptr, bb_ptr);
        }
    }};
}

macro_rules! _unroll_one_cmp_1 {
    ($a_ptr:expr, $b_ptr:expr, $loop_size:expr, $idx:expr) => {{
        let aa_ptr = unsafe { $a_ptr.add($loop_size * $idx) };
        let bb_ptr = unsafe { $b_ptr.add($loop_size * $idx) };
        //
        let aac = unsafe { *(aa_ptr as *const u8) };
        let bbc = unsafe { *(bb_ptr as *const u8) };
        if aac != bbc {
            return aac.cmp(&bbc);
        }
    }};
}

#[cfg(target_pointer_width = "128")]
#[inline(always)]
fn _start_cmp_128(a: &[u8], b: &[u8]) -> Ordering {
    //
    let a_len = a.len();
    let b_len = b.len();
    let min_len = a_len.min(b_len);
    let mut a_ptr = a.as_ptr();
    let mut b_ptr = b.as_ptr();
    let end_ptr = unsafe { a_ptr.add(min_len) };
    //
    {
        let unroll = 8;
        let loop_size = 16;
        let end_ptr_16_8 = unsafe { end_ptr.sub(loop_size * unroll) };
        while a_ptr <= end_ptr_16_8 {
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 0);
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 1);
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 2);
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 3);
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 4);
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 5);
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 6);
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 7);
            //
            a_ptr = unsafe { a_ptr.add(loop_size * unroll) };
            b_ptr = unsafe { b_ptr.add(loop_size * unroll) };
        }
    }
    {
        let loop_size = 16;
        let end_ptr_16 = unsafe { end_ptr.sub(loop_size) };
        while a_ptr <= end_ptr_16 {
            _unroll_one_cmp_16!(a_ptr, b_ptr, loop_size, 0);
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    // a rest data is a max: 15 bytes.
    {
        let loop_size = 8;
        let end_ptr_8 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_8 {
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    {
        let loop_size = 4;
        let end_ptr_4 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_4 {
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    {
        let loop_size = 2;
        let end_ptr_2 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_2 {
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    {
        let loop_size = 1;
        let end_ptr_1 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_1 {
            _unroll_one_cmp_1!(a_ptr, b_ptr, loop_size, 0);
        }
    }
    //
    a_len.cmp(&b_len)
}

#[cfg(target_pointer_width = "64")]
#[inline(always)]
fn _start_cmp_64(a: &[u8], b: &[u8]) -> Ordering {
    //
    let a_len = a.len();
    let b_len = b.len();
    let min_len = a_len.min(b_len);
    let mut a_ptr = a.as_ptr();
    let mut b_ptr = b.as_ptr();
    let end_ptr = unsafe { a_ptr.add(min_len) };
    //
    {
        let unroll = 8;
        let loop_size = 8;
        let end_ptr_8_8 = unsafe { end_ptr.sub(loop_size * unroll) };
        while a_ptr <= end_ptr_8_8 {
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 0);
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 1);
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 2);
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 3);
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 4);
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 5);
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 6);
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 7);
            //
            a_ptr = unsafe { a_ptr.add(loop_size * unroll) };
            b_ptr = unsafe { b_ptr.add(loop_size * unroll) };
        }
    }
    {
        let loop_size = 8;
        let end_ptr_8 = unsafe { end_ptr.sub(loop_size) };
        while a_ptr <= end_ptr_8 {
            _unroll_one_cmp_8!(a_ptr, b_ptr, loop_size, 0);
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    // a rest data is a max: 7 bytes.
    {
        let loop_size = 4;
        let end_ptr_4 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_4 {
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    {
        let loop_size = 2;
        let end_ptr_2 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_2 {
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    {
        let loop_size = 1;
        let end_ptr_1 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_1 {
            _unroll_one_cmp_1!(a_ptr, b_ptr, loop_size, 0);
        }
    }
    //
    a_len.cmp(&b_len)
}

#[cfg(target_pointer_width = "32")]
#[inline(always)]
fn _start_cmp_32(a: &[u8], b: &[u8]) -> Ordering {
    //
    let a_len = a.len();
    let b_len = b.len();
    let min_len = a_len.min(b_len);
    let mut a_ptr = a.as_ptr();
    let mut b_ptr = b.as_ptr();
    let end_ptr = unsafe { a_ptr.add(min_len) };
    //
    {
        let unroll = 8;
        let loop_size = 4;
        let end_ptr_4_8 = unsafe { end_ptr.sub(loop_size * unroll) };
        while a_ptr <= end_ptr_4_8 {
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 0);
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 1);
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 2);
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 3);
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 4);
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 5);
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 6);
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 7);
            //
            a_ptr = unsafe { a_ptr.add(loop_size * unroll) };
            b_ptr = unsafe { b_ptr.add(loop_size * unroll) };
        }
    }
    {
        let loop_size = 4;
        let end_ptr_4 = unsafe { end_ptr.sub(loop_size) };
        while a_ptr <= end_ptr_4 {
            _unroll_one_cmp_4!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    // a rest data is a max: 3 bytes.
    {
        let loop_size = 2;
        let end_ptr_2 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_2 {
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    {
        let loop_size = 1;
        let end_ptr_1 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_1 {
            _unroll_one_cmp_1!(a_ptr, b_ptr, loop_size, 0);
        }
    }
    //
    a_len.cmp(&b_len)
}

#[cfg(target_pointer_width = "16")]
#[inline(always)]
fn _start_cmp_16(a: &[u8], b: &[u8]) -> Ordering {
    //
    let a_len = a.len();
    let b_len = b.len();
    let min_len = a_len.min(b_len);
    let mut a_ptr = a.as_ptr();
    let mut b_ptr = b.as_ptr();
    let end_ptr = unsafe { a_ptr.add(min_len) };
    //
    {
        let unroll = 8;
        let loop_size = 2;
        let end_ptr_2_8 = unsafe { end_ptr.sub(loop_size * unroll) };
        while a_ptr <= end_ptr_2_8 {
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 0);
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 1);
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 2);
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 3);
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 4);
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 5);
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 6);
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 7);
            //
            a_ptr = unsafe { a_ptr.add(loop_size * unroll) };
            b_ptr = unsafe { b_ptr.add(loop_size * unroll) };
        }
    }
    {
        let loop_size = 2;
        let end_ptr_2 = unsafe { end_ptr.sub(loop_size) };
        while a_ptr <= end_ptr_2 {
            _unroll_one_cmp_2!(a_ptr, b_ptr, loop_size, 0);
            //
            a_ptr = unsafe { a_ptr.add(loop_size) };
            b_ptr = unsafe { b_ptr.add(loop_size) };
        }
    }
    // a rest data is a max: 1 bytes.
    {
        let loop_size = 1;
        let end_ptr_1 = unsafe { end_ptr.sub(loop_size) };
        if a_ptr <= end_ptr_1 {
            _unroll_one_cmp_1!(a_ptr, b_ptr, loop_size, 0);
        }
    }
    //
    a_len.cmp(&b_len)
}

#[inline(always)]
fn _cmp_bytes_16(a_ptr: *const u8, b_ptr: *const u8) -> Ordering {
    let aa_ptr = a_ptr as *const u128;
    let bb_ptr = b_ptr as *const u128;
    let aac = unsafe { *aa_ptr };
    let bbc = unsafe { *bb_ptr };
    let bits = aac ^ bbc;
    if bits != 0 {
        let pos = (bits.trailing_zeros() / 8) as usize;
        let aa_ptr = unsafe { a_ptr.add(pos) };
        let bb_ptr = unsafe { b_ptr.add(pos) };
        let aac = unsafe { *aa_ptr };
        let bbc = unsafe { *bb_ptr };
        return aac.cmp(&bbc);
    } else {
        Ordering::Equal
    }
}

#[inline(always)]
fn _cmp_bytes_8(a_ptr: *const u8, b_ptr: *const u8) -> Ordering {
    let aa_ptr = a_ptr as *const u64;
    let bb_ptr = b_ptr as *const u64;
    let aac = unsafe { *aa_ptr };
    let bbc = unsafe { *bb_ptr };
    let bits = aac ^ bbc;
    if bits != 0 {
        let pos = (bits.trailing_zeros() / 8) as usize;
        let aa_ptr = unsafe { a_ptr.add(pos) };
        let bb_ptr = unsafe { b_ptr.add(pos) };
        let aac = unsafe { *aa_ptr };
        let bbc = unsafe { *bb_ptr };
        return aac.cmp(&bbc);
    } else {
        Ordering::Equal
    }
}

#[inline(always)]
fn _cmp_bytes_4(a_ptr: *const u8, b_ptr: *const u8) -> Ordering {
    let aa_ptr = a_ptr as *const u32;
    let bb_ptr = b_ptr as *const u32;
    let aac = unsafe { *aa_ptr };
    let bbc = unsafe { *bb_ptr };
    let bits = aac ^ bbc;
    if bits != 0 {
        let pos = (bits.trailing_zeros() / 8) as usize;
        let aa_ptr = unsafe { a_ptr.add(pos) };
        let bb_ptr = unsafe { b_ptr.add(pos) };
        let aac = unsafe { *aa_ptr };
        let bbc = unsafe { *bb_ptr };
        return aac.cmp(&bbc);
    } else {
        Ordering::Equal
    }
}

#[inline(always)]
fn _cmp_bytes_2(a_ptr: *const u8, b_ptr: *const u8) -> Ordering {
    let aa_ptr = a_ptr as *const u16;
    let bb_ptr = b_ptr as *const u16;
    let aac = unsafe { *aa_ptr };
    let bbc = unsafe { *bb_ptr };
    let bits = aac ^ bbc;
    if bits != 0 {
        let pos = (bits.trailing_zeros() / 8) as usize;
        let aa_ptr = unsafe { a_ptr.add(pos) };
        let bb_ptr = unsafe { b_ptr.add(pos) };
        let aac = unsafe { *aa_ptr };
        let bbc = unsafe { *bb_ptr };
        return aac.cmp(&bbc);
    } else {
        Ordering::Equal
    }
}

/*
 * The simple implement:

#[inline(always)]
pub fn _memcmp_impl(a: &[u8], b: &[u8]) -> Ordering {
    let a_len = a.len();
    let b_len = b.len();
    let min_len = a_len.min(b_len);
    for i in 0..min_len {
        let cmp = a[i].cmp(&b[i]);
        if cmp != Ordering::Equal {
            return cmp;
        }
    }
    a_len.cmp(&b_len)
}

#[inline(always)]
pub fn _memcmp_impl(a: &[u8], b: &[u8]) -> Ordering {
    let a_len = a.len();
    let b_len = b.len();
    let min_len = a_len.min(b_len);
    let mut a_ptr = a.as_ptr();
    let mut b_ptr = b.as_ptr();
    let end_ptr = unsafe { a_ptr.add(min_len) };
    while a_ptr < end_ptr {
        let aa = unsafe { *a_ptr };
        let bb = unsafe { *b_ptr };
        if aa != bb {
            return aa.cmp(&bb);
        }
        a_ptr = unsafe { a_ptr.add(1) };
        b_ptr = unsafe { b_ptr.add(1) };
    }
    a_len.cmp(&b_len)
}
*/
