use std::ops::{Deref, DerefMut};
use std::ptr;

use byteorder::{ByteOrder, LittleEndian as LE};

use error::{Error, Result};
use varint::write_varu64;
use {MAX_INPUT_SIZE, MAX_BLOCK_SIZE};

/// The total number of slots we permit for our hash table of 4 byte repeat
/// sequences.
const MAX_TABLE_SIZE: usize = 1<<14;

/// The size of a small hash table. This is useful for reducing overhead when
/// compressing very small blocks of bytes.
const SMALL_TABLE_SIZE: usize = 1<<10;

/// The total number of bytes that we always leave uncompressed at the end
/// of the buffer. This in particular affords us some wiggle room during
/// compression such that faster copy operations can be used.
const INPUT_MARGIN: usize = 16 - 1;

/// The minimum block size that we're willing to consider for compression.
/// Anything smaller than this gets emitted as a literal.
const MIN_NON_LITERAL_BLOCK_SIZE: usize = 1 + 1 + INPUT_MARGIN;

/// Nice names for the various Snappy tags.
enum Tag {
    Literal = 0b00,
    Copy1 = 0b01,
    Copy2 = 0b10,
    // Compression never actually emits a Copy4 operation and decompression
    // uses tricks so that we never explicitly do case analysis on the copy
    // operation type. As a result, Copy4 is never used directly.
    #[allow(dead_code)]
    Copy4 = 0b11,
}

/// Returns the maximum compressed size given the uncompressed size.
///
/// If the uncompressed size exceeds the maximum allowable size then this
/// returns 0.
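///
/// # Example
///
/// A minimal sketch of the bound (the constants come directly from the
/// formula below):
///
/// ```ignore
/// assert_eq!(max_compress_len(0), 32);
/// assert_eq!(max_compress_len(6), 32 + 6 + 1);
/// ```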
pub fn max_compress_len(input_len: usize) -> usize {
    let input_len = input_len as u64;
    if input_len > MAX_INPUT_SIZE {
        return 0;
    }
    let max = 32 + input_len + (input_len / 6);
    if max > MAX_INPUT_SIZE {
        0
    } else {
        max as usize
    }
}

/// Encoder is a raw encoder for compressing bytes in the Snappy format.
///
/// This encoder does not use the Snappy frame format and simply compresses the
/// given bytes in one big Snappy block (that is, it has a single header).
///
/// Unless you explicitly need the low-level control, you should use `Writer`
/// instead, which compresses to the Snappy frame format.
///
/// It is beneficial to reuse an Encoder.
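///
/// # Example
///
/// A minimal sketch of reusing a single encoder across blocks (the calls
/// shown are this module's own API):
///
/// ```ignore
/// let mut enc = Encoder::new();
/// let block1 = enc.compress_vec(b"first block of bytes").unwrap();
/// let block2 = enc.compress_vec(b"second block of bytes").unwrap();
/// assert!(!block1.is_empty() && !block2.is_empty());
/// ```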
pub struct Encoder {
    small: [u16; SMALL_TABLE_SIZE],
    big: Vec<u16>,
}

impl Encoder {
    /// Return a new encoder that can be used for compressing bytes.
    pub fn new() -> Encoder {
        Encoder {
            small: [0; SMALL_TABLE_SIZE],
            big: vec![],
        }
    }

    /// Compresses all bytes in `input` into `output`.
    ///
    /// `input` can be any arbitrary sequence of bytes.
    ///
    /// `output` must be large enough to hold the maximum possible compressed
    /// size of `input`, which can be computed using `max_compress_len`.
    ///
    /// On success, this returns the number of bytes written to `output`.
    ///
    /// # Errors
    ///
    /// This method returns an error in the following circumstances:
    ///
    /// * The total number of bytes to compress exceeds `2^32 - 1`.
    /// * `output` has length less than `max_compress_len(input.len())`.
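    ///
    /// # Example
    ///
    /// A minimal sketch of sizing `output` with `max_compress_len`:
    ///
    /// ```ignore
    /// let input = b"some bytes to compress";
    /// let mut output = vec![0; max_compress_len(input.len())];
    /// let mut enc = Encoder::new();
    /// let n = enc.compress(input, &mut output).unwrap();
    /// let compressed = &output[..n];
    /// ```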
    pub fn compress(
        &mut self,
        mut input: &[u8],
        output: &mut [u8],
    ) -> Result<usize> {
        match max_compress_len(input.len()) {
            0 => {
                return Err(Error::TooBig {
                    given: input.len() as u64,
                    max: MAX_INPUT_SIZE,
                });
            }
            min if output.len() < min => {
                return Err(Error::BufferTooSmall {
                    given: output.len() as u64,
                    min: min as u64,
                });
            }
            _ => {}
        }
        // Handle an edge case specially.
        if input.is_empty() {
            // Encodes a varint of 0, denoting the total size of uncompressed
            // bytes.
            output[0] = 0;
            return Ok(1);
        }
        // Write the Snappy header, which is just the total number of
        // uncompressed bytes.
        let mut d = write_varu64(output, input.len() as u64);
        while !input.is_empty() {
            // Find the next block.
            let mut src = input;
            if src.len() > MAX_BLOCK_SIZE {
                src = &src[..MAX_BLOCK_SIZE as usize];
            }
            input = &input[src.len()..];

            // If the block is smallish, then don't waste time on it and just
            // emit a literal.
            let mut block = Block::new(src, output, d);
            if block.src.len() < MIN_NON_LITERAL_BLOCK_SIZE {
                let lit_end = block.src.len();
                unsafe {
                    // SAFETY: next_emit is zero (in bounds) and the end is
                    // the length of the block (in bounds).
                    block.emit_literal(lit_end);
                }
            } else {
                let table = self.block_table(block.src.len());
                block.compress(table);
            }
            d = block.d;
        }
        Ok(d)
    }

    /// Compresses all bytes in `input` into a freshly allocated `Vec`.
    ///
    /// This is just like the `compress` method, except it allocates a `Vec`
    /// with the right size for you. (This is intended to be a convenience
    /// method.)
    ///
    /// This method returns an error under the same circumstances that
    /// `compress` does.
    pub fn compress_vec(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        let mut buf = vec![0; max_compress_len(input.len())];
        let n = try!(self.compress(input, &mut buf));
        buf.truncate(n);
        Ok(buf)
    }
}

struct Block<'s, 'd> {
    src: &'s [u8],
    s: usize,
    s_limit: usize,
    dst: &'d mut [u8],
    d: usize,
    next_emit: usize,
}

impl<'s, 'd> Block<'s, 'd> {
    #[inline(always)]
    fn new(
        src: &'s [u8],
        dst: &'d mut [u8],
        d: usize,
    ) -> Block<'s, 'd> {
        Block {
            src: src,
            s: 0,
            s_limit: src.len(),
            dst: dst,
            d: d,
            next_emit: 0,
        }
    }

    #[inline(always)]
    fn compress(&mut self, mut table: BlockTable) {
        debug_assert!(!table.is_empty());
        debug_assert!(self.src.len() >= MIN_NON_LITERAL_BLOCK_SIZE);

        self.s += 1;
        self.s_limit -= INPUT_MARGIN;
        let mut next_hash = table.hash(LE::read_u32(&self.src[self.s..]));
        loop {
            let mut skip = 32;
            let mut candidate;
            let mut s_next = self.s;
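            // Heuristic match skipping: for the first 32 probes we advance
            // one byte at a time; after that, the stride between probes
            // keeps growing (every other byte, then every third, ...), so
            // that incompressible data is scanned quickly and ultimately
            // emitted as literals.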
            loop {
                self.s = s_next;
                let bytes_between_hash_lookups = skip >> 5;
                s_next = self.s + bytes_between_hash_lookups;
                skip += bytes_between_hash_lookups;
                if s_next > self.s_limit {
                    return self.done();
                }
                unsafe {
                    // SAFETY: next_hash is always computed by table.hash
                    // which is guaranteed to be in bounds.
                    candidate = *table.get_unchecked(next_hash) as usize;
                    *table.get_unchecked_mut(next_hash) = self.s as u16;

                    let srcp = self.src.as_ptr();
                    // SAFETY: s_next is guaranteed to be less than s_limit by
                    // the conditional above, which implies s_next is in
                    // bounds.
                    let x = loadu32_le(srcp.offset(s_next as isize));
                    next_hash = table.hash(x);
                    // SAFETY: self.s is always less than s_next, so it is also
                    // in bounds by the argument above.
                    //
                    // candidate is extracted from table, which is only ever
                    // set to valid positions in the block and is therefore
                    // also in bounds.
                    //
                    // We only need to compare cur/cand for equality, so we
                    // don't need to bother with endianness. cur corresponds
                    // to the bytes at the current position and cand
                    // corresponds to a potential match. If they're equal, we
                    // declare victory and move below to try and extend the
                    // match.
                    let cur = loadu32(srcp.offset(self.s as isize));
                    let cand = loadu32(srcp.offset(candidate as isize));
                    if cur == cand {
                        break;
                    }
                }
            }
            // While the above found a candidate for compression, before we
            // emit a copy operation for it, we need to make sure that we emit
            // any bytes between the last copy operation and this one as a
            // literal.
            let lit_end = self.s;
            unsafe {
                // SAFETY: next_emit is set to a previous value of self.s,
                // which is guaranteed to be less than s_limit (in bounds).
                // lit_end is set to the current value of self.s, also
                // guaranteed to be less than s_limit (in bounds).
                self.emit_literal(lit_end);
            }
            loop {
                // Look for more matching bytes starting at the position of
                // the candidate and the current src position. We increment
                // self.s and candidate by 4 since we already know the first 4
                // bytes match.
                let base = self.s;
                self.s += 4;
                unsafe {
                    // SAFETY: candidate is always set to a value from our
                    // hash table, which only contains positions in self.src
                    // that have been seen for this block that occurred before
                    // self.s.
                    self.extend_match(candidate + 4);
                }
                let (offset, len) = (base - candidate, self.s - base);
                self.emit_copy(offset, len);
                self.next_emit = self.s;
                if self.s >= self.s_limit {
                    return self.done();
                }
                // Update the hash table with the byte sequences
                // self.src[self.s - 1..self.s + 3] and
                // self.src[self.s..self.s + 4]. Instead of reading 4 bytes
                // twice, we read 8 bytes once.
                //
                // If we happen to get a hit on self.src[self.s..self.s + 4],
                // then continue this loop and extend the match.
                unsafe {
                    let srcp = self.src.as_ptr();
                    // SAFETY: The conditional above guarantees that self.s
                    // is less than s_limit, and self.s is always non-zero,
                    // so reading 8 bytes starting at self.s - 1 stays in
                    // bounds.
                    let x = loadu64_le(srcp.offset((self.s - 1) as isize));
                    // The lower 4 bytes of x correspond to
                    // self.src[self.s - 1..self.s + 3].
                    let prev_hash = table.hash(x as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    *table.get_unchecked_mut(prev_hash) = (self.s - 1) as u16;
                    // The lower 4 bytes of x>>8 correspond to
                    // self.src[self.s..self.s + 4].
                    let cur_hash = table.hash((x >> 8) as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    candidate = *table.get_unchecked(cur_hash) as usize;
                    *table.get_unchecked_mut(cur_hash) = self.s as u16;

                    // SAFETY: candidate is set from table, which always
                    // contains valid positions in the current block.
                    let y = loadu32_le(srcp.offset(candidate as isize));
                    if (x >> 8) as u32 != y {
                        // If we didn't get a hit, update the next hash
                        // and move on. Our initial 8 byte read continues to
                        // pay off.
                        next_hash = table.hash((x >> 16) as u32);
                        self.s += 1;
                        break;
                    }
                }
            }
        }
    }

    /// Emits one or more copy operations with the given offset and length.
    /// offset must be in the range [1, 65535] and len must be in the range
    /// [4, 65535].
    #[inline(always)]
    fn emit_copy(&mut self, offset: usize, mut len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        // Copy operations only allow lengths up to 64, but we'll allow bigger
        // lengths and emit as many operations as we need.
        //
        // N.B. Since our block size is 64KB, we never actually emit a copy 4
        // operation.
        debug_assert!(4 <= len && len <= 65535);

        // Emit copy 2 operations until we don't have to.
        // We check against 68 here and emit a copy shorter than 64 below
        // because it is cheaper to, e.g., encode a length 67 copy as a
        // length 60 copy 2 followed by a length 7 copy 1 than to encode it
        // as a length 64 copy 2 followed by a length 3 copy 2. The key here
        // is that a copy 1 operation requires a length of at least 4, which
        // forces a length 3 copy to use a copy 2 operation.
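        // Concretely, a length 67 copy costs 3 + 2 = 5 bytes as a length 60
        // copy 2 plus a length 7 copy 1, versus 3 + 3 = 6 bytes as a length
        // 64 copy 2 plus a length 3 copy 2.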
        while len >= 68 {
            self.emit_copy2(offset, 64);
            len -= 64;
        }
        if len > 64 {
            self.emit_copy2(offset, 60);
            len -= 60;
        }
        // If we can squeeze the last copy into a copy 1 operation, do it.
        if len <= 11 && offset <= 2047 {
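            // Copy 1 wire format: one tag byte holding the high 3 bits of
            // the offset (bits 5-7), len - 4 (bits 2-4) and the Copy1 tag
            // (bits 0-1), followed by the low 8 bits of the offset.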
            self.dst[self.d] =
                (((offset >> 8) as u8) << 5)
                | (((len - 4) as u8) << 2)
                | (Tag::Copy1 as u8);
            self.dst[self.d + 1] = offset as u8;
            self.d += 2;
        } else {
            self.emit_copy2(offset, len);
        }
    }

    /// Emits a "copy 2" operation with the given offset and length. The
    /// offset and length must be valid for a copy 2 operation. i.e., offset
    /// must be in the range [1, 65535] and len must be in the range [1, 64].
    #[inline(always)]
    fn emit_copy2(&mut self, offset: usize, len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        debug_assert!(1 <= len && len <= 64);
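        // Copy 2 wire format: one tag byte holding len - 1 (bits 2-7) and
        // the Copy2 tag (bits 0-1), followed by the offset as a little
        // endian u16.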
        self.dst[self.d] = (((len - 1) as u8) << 2) | (Tag::Copy2 as u8);
        LE::write_u16(&mut self.dst[self.d + 1..], offset as u16);
        self.d += 3;
    }

    /// Attempts to extend a match from the current position in self.src with
    /// the candidate position given.
    ///
    /// This method uses unaligned loads and elides bounds checks, so the
    /// caller must guarantee that cand points to a valid location in self.src
    /// and is less than the current position in src.
    #[inline(always)]
    unsafe fn extend_match(&mut self, mut cand: usize) {
        debug_assert!(cand < self.s);
        while self.s + 8 <= self.src.len() {
            let srcp = self.src.as_ptr();
            // SAFETY: The loop invariant guarantees that there is at least
            // 8 bytes to read at self.src + self.s. Since cand must be
            // guaranteed by the caller to be valid and less than self.s, it
            // also has enough room to read 8 bytes.
            //
            // TODO(ag): Despite my best efforts, I couldn't get this to
            // autovectorize with 128-bit loads. The logic after the loads
            // appears to be a little too clever...
            let x = loadu64(srcp.offset(self.s as isize));
            let y = loadu64(srcp.offset(cand as isize));
            if x == y {
                // If all 8 bytes are equal, move on...
                self.s += 8;
                cand += 8;
            } else {
                // Otherwise, find how many leading bytes were equal. We can
                // do this efficiently by interpreting x/y as little endian
                // numbers, which lets us use the number of trailing zeros in
                // their XOR to count the number of matching bytes.
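                // For example, if only the low 3 bytes of x and y agree,
                // then z has between 24 and 31 trailing zeros, so
                // z.trailing_zeros() / 8 == 3 and we advance self.s past
                // exactly the 3 matching bytes.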
                let z = x.to_le() ^ y.to_le();
                self.s += z.trailing_zeros() as usize / 8;
                return;
            }
        }
        // When we have fewer than 8 bytes left in the block, fall back to the
        // slow loop.
        while self.s < self.src.len() && self.src[self.s] == self.src[cand] {
            self.s += 1;
            cand += 1;
        }
    }

    /// Executes any cleanup when the current block has finished compressing.
    /// In particular, it emits any leftover bytes as a literal.
    #[inline(always)]
    fn done(&mut self) {
        if self.next_emit < self.src.len() {
            let lit_end = self.src.len();
            unsafe {
                // SAFETY: Both next_emit and lit_end are trivially in bounds
                // given the conditional and definition above.
                self.emit_literal(lit_end);
            }
        }
    }

    /// Emits a literal from self.src[self.next_emit..lit_end].
    ///
    /// This uses unaligned loads and elides bounds checks, so the caller must
    /// guarantee that self.src[self.next_emit..lit_end] is valid.
    #[inline(always)]
    unsafe fn emit_literal(&mut self, lit_end: usize) {
        let lit_start = self.next_emit;
        let len = lit_end - lit_start;
        let n = len.checked_sub(1).unwrap();
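        // Literal wire format: for lengths up to 60, len - 1 is stored
        // directly in the tag byte; longer literals store 60 or 61 in the
        // tag byte and put len - 1 in one or two following bytes.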
        if n <= 59 {
            self.dst[self.d] = ((n as u8) << 2) | (Tag::Literal as u8);
            self.d += 1;
            if len <= 16 && lit_start + 16 <= self.src.len() {
                // SAFETY: lit_start is equivalent to self.next_emit, which
                // is only set to self.s immediately after a copy is emitted.
                // The conditional above also ensures that there are at least
                // 16 bytes of room in both src and dst.
                //
                // dst is big enough because the buffer is guaranteed to be
                // big enough to hold the biggest possible compressed size
                // plus an extra 32 bytes, which exceeds the 16 byte copy
                // here.
                let srcp = self.src.as_ptr().offset(lit_start as isize);
                let dstp = self.dst.as_mut_ptr().offset(self.d as isize);
                ptr::copy_nonoverlapping(srcp, dstp, 16);
                self.d += len;
                return;
            }
        } else if n < 256 {
            self.dst[self.d] = (60 << 2) | (Tag::Literal as u8);
            self.dst[self.d + 1] = n as u8;
            self.d += 2;
        } else {
            self.dst[self.d] = (61 << 2) | (Tag::Literal as u8);
            LE::write_u16(&mut self.dst[self.d + 1..], n as u16);
            self.d += 3;
        }
        // SAFETY: lit_start is equivalent to self.next_emit, which is only
        // set to self.s immediately after a copy is emitted, which implies
        // that it always points to valid bytes in self.src.
        //
        // We can't guarantee that there are at least len bytes though,
        // which must be guaranteed by the caller and is why this method
        // is unsafe.
        let srcp = self.src.as_ptr().offset(lit_start as isize);
        let dstp = self.dst.as_mut_ptr().offset(self.d as isize);
        ptr::copy_nonoverlapping(srcp, dstp, len);
        self.d += len;
    }
}

/// `BlockTable` is a map from 4 byte sequences to positions of their most
/// recent occurrence in a block. In particular, this table lets us quickly
/// find candidates for compression.
///
/// We expose the `hash` method so that callers can be fastidious about the
/// number of times a hash is computed.
struct BlockTable<'a> {
    table: &'a mut [u16],
    /// The number of bits required to shift the hash such that the result
    /// is less than table.len().
    shift: u32,
}

impl Encoder {
    fn block_table(&mut self, block_size: usize) -> BlockTable {
        let mut shift: u32 = 32 - 8;
        let mut table_size = 256;
        while table_size < MAX_TABLE_SIZE && table_size < block_size {
            shift -= 1;
            table_size *= 2;
        }
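        // At this point, table_size == 1 << (32 - shift), so hash values
        // (a u32 shifted right by `shift` bits) are always less than
        // table_size and index within the table.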
        // If our block size is small, then use a small stack allocated table
        // instead of putting a bigger one on the heap. This particular
        // optimization is important if the caller is using Snappy to compress
        // many small blocks. (The memset savings alone is considerable.)
        let table: &mut [u16] =
            if table_size <= SMALL_TABLE_SIZE {
                &mut self.small[0..table_size]
            } else {
                if self.big.is_empty() {
                    // Interestingly, using `self.big.resize` here led to some
                    // very weird code getting generated that led to a large
                    // slow down. Forcing the issue with a new vec seems to
                    // fix it. ---AG
                    self.big = vec![0; MAX_TABLE_SIZE];
                }
                &mut self.big[0..table_size]
            };
        for x in &mut *table {
            *x = 0;
        }
        BlockTable {
            table: table,
            shift: shift,
        }
    }
}

impl<'a> BlockTable<'a> {
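    /// Hashes the given u32 (the 4 bytes at the current position) into an
    /// index that is guaranteed to be less than the table's length.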
    #[inline(always)]
    fn hash(&self, x: u32) -> usize {
        (x.wrapping_mul(0x1E35A7BD) >> self.shift) as usize
    }
}

impl<'a> Deref for BlockTable<'a> {
    type Target = [u16];
    fn deref(&self) -> &[u16] { self.table }
}

impl<'a> DerefMut for BlockTable<'a> {
    fn deref_mut(&mut self) -> &mut [u16] { self.table }
}

/// Reads 8 bytes from `data` as a native endian u64 via an unaligned,
/// unchecked load. The caller must guarantee that at least 8 bytes are
/// readable at `data`.
unsafe fn loadu64(data: *const u8) -> u64 {
    let mut n: u64 = 0;
    ptr::copy_nonoverlapping(
        data,
        &mut n as *mut u64 as *mut u8,
        8);
    n
}

unsafe fn loadu64_le(data: *const u8) -> u64 {
    loadu64(data).to_le()
}

/// Reads 4 bytes from `data` as a native endian u32 via an unaligned,
/// unchecked load. The caller must guarantee that at least 4 bytes are
/// readable at `data`.
unsafe fn loadu32(data: *const u8) -> u32 {
    let mut n: u32 = 0;
    ptr::copy_nonoverlapping(
        data,
        &mut n as *mut u32 as *mut u8,
        4);
    n
}

unsafe fn loadu32_le(data: *const u8) -> u32 {
    loadu32(data).to_le()
}