use std::ops::{Deref, DerefMut};
use std::ptr;

use byteorder::{ByteOrder, LittleEndian as LE};

use error::{Error, Result};
use varint::write_varu64;
use {MAX_INPUT_SIZE, MAX_BLOCK_SIZE};

/// The total number of slots we permit for our hash table of 4 byte repeat
/// sequences.
const MAX_TABLE_SIZE: usize = 1<<14;

/// The size of a small hash table. This is useful for reducing overhead when
/// compressing very small blocks of bytes.
const SMALL_TABLE_SIZE: usize = 1<<10;

/// The total number of bytes that we always leave uncompressed at the end
/// of the buffer. This in particular affords us some wiggle room during
/// compression such that faster copy operations can be used.
const INPUT_MARGIN: usize = 16 - 1;

/// The minimum block size that we're willing to consider for compression.
/// Anything smaller than this gets emitted as a literal.
const MIN_NON_LITERAL_BLOCK_SIZE: usize = 1 + 1 + INPUT_MARGIN;

/// Nice names for the various Snappy tags.
enum Tag {
    Literal = 0b00,
    Copy1 = 0b01,
    Copy2 = 0b10,
    // Compression never actually emits a Copy4 operation and decompression
    // uses tricks so that we never explicitly do case analysis on the copy
    // operation type. As a result, Copy4 is never used directly.
    #[allow(dead_code)]
    Copy4 = 0b11,
}

/// Returns the maximum compressed size given the uncompressed size.
///
/// If the uncompressed size exceeds the maximum allowable size then this
/// returns 0.
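///
/// # Example
///
/// A minimal sketch of the bound (the constants come directly from the
/// formula below):
///
/// ```ignore
/// assert_eq!(max_compress_len(0), 32);
/// assert_eq!(max_compress_len(6), 32 + 6 + 1);
/// ```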
pub fn max_compress_len(input_len: usize) -> usize {
    let input_len = input_len as u64;
    if input_len > MAX_INPUT_SIZE {
        return 0;
    }
    let max = 32 + input_len + (input_len / 6);
    if max > MAX_INPUT_SIZE {
        0
    } else {
        max as usize
    }
}

/// Encoder is a raw encoder for compressing bytes in the Snappy format.
///
/// This encoder does not use the Snappy frame format and simply compresses the
/// given bytes in one big Snappy block (that is, it has a single header).
///
/// Unless you explicitly need the low-level control, you should use `Writer`
/// instead, which compresses to the Snappy frame format.
///
/// It is beneficial to reuse an Encoder.
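///
/// # Example
///
/// A minimal sketch of reusing a single encoder across blocks (the calls
/// shown are this module's own API):
///
/// ```ignore
/// let mut enc = Encoder::new();
/// let block1 = enc.compress_vec(b"first block of bytes").unwrap();
/// let block2 = enc.compress_vec(b"second block of bytes").unwrap();
/// assert!(!block1.is_empty() && !block2.is_empty());
/// ```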
pub struct Encoder {
    small: [u16; SMALL_TABLE_SIZE],
    big: Vec<u16>,
}

impl Encoder {
    /// Return a new encoder that can be used for compressing bytes.
    pub fn new() -> Encoder {
        Encoder {
            small: [0; SMALL_TABLE_SIZE],
            big: vec![],
        }
    }

    /// Compresses all bytes in `input` into `output`.
    ///
    /// `input` can be any arbitrary sequence of bytes.
    ///
    /// `output` must be large enough to hold the maximum possible compressed
    /// size of `input`, which can be computed using `max_compress_len`.
    ///
    /// On success, this returns the number of bytes written to `output`.
    ///
    /// # Errors
    ///
    /// This method returns an error in the following circumstances:
    ///
    /// * The total number of bytes to compress exceeds `2^32 - 1`.
    /// * `output` has length less than `max_compress_len(input.len())`.
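    ///
    /// # Example
    ///
    /// A minimal sketch of sizing `output` with `max_compress_len`:
    ///
    /// ```ignore
    /// let input = b"some bytes to compress";
    /// let mut output = vec![0; max_compress_len(input.len())];
    /// let mut enc = Encoder::new();
    /// let n = enc.compress(input, &mut output).unwrap();
    /// let compressed = &output[..n];
    /// ```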
    pub fn compress(
        &mut self,
        mut input: &[u8],
        output: &mut [u8],
    ) -> Result<usize> {
        match max_compress_len(input.len()) {
            0 => {
                return Err(Error::TooBig {
                    given: input.len() as u64,
                    max: MAX_INPUT_SIZE,
                });
            }
            min if output.len() < min => {
                return Err(Error::BufferTooSmall {
                    given: output.len() as u64,
                    min: min as u64,
                });
            }
            _ => {}
        }
        // Handle an edge case specially.
        if input.is_empty() {
            // Encodes a varint of 0, denoting the total size of uncompressed
            // bytes.
            output[0] = 0;
            return Ok(1);
        }
        // Write the Snappy header, which is just the total number of
        // uncompressed bytes.
        let mut d = write_varu64(output, input.len() as u64);
        while !input.is_empty() {
            // Find the next block.
            let mut src = input;
            if src.len() > MAX_BLOCK_SIZE {
                src = &src[..MAX_BLOCK_SIZE as usize];
            }
            input = &input[src.len()..];

            // If the block is smallish, then don't waste time on it and just
            // emit a literal.
            let mut block = Block::new(src, output, d);
            if block.src.len() < MIN_NON_LITERAL_BLOCK_SIZE {
                let lit_end = block.src.len();
                unsafe {
                    // SAFETY: next_emit is zero (in bounds) and the end is
                    // the length of the block (in bounds).
                    block.emit_literal(lit_end);
                }
            } else {
                let table = self.block_table(block.src.len());
                block.compress(table);
            }
            d = block.d;
        }
        Ok(d)
    }

    /// Compresses all bytes in `input` into a freshly allocated `Vec`.
    ///
    /// This is just like the `compress` method, except it allocates a `Vec`
    /// with the right size for you. (This is intended to be a convenience
    /// method.)
    ///
    /// This method returns an error under the same circumstances that
    /// `compress` does.
    pub fn compress_vec(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        let mut buf = vec![0; max_compress_len(input.len())];
        let n = try!(self.compress(input, &mut buf));
        buf.truncate(n);
        Ok(buf)
    }
}

struct Block<'s, 'd> {
    src: &'s [u8],
    s: usize,
    s_limit: usize,
    dst: &'d mut [u8],
    d: usize,
    next_emit: usize,
}

impl<'s, 'd> Block<'s, 'd> {
    #[inline(always)]
    fn new(
        src: &'s [u8],
        dst: &'d mut [u8],
        d: usize,
    ) -> Block<'s, 'd> {
        Block {
            src: src,
            s: 0,
            s_limit: src.len(),
            dst: dst,
            d: d,
            next_emit: 0,
        }
    }

    #[inline(always)]
    fn compress(&mut self, mut table: BlockTable) {
        debug_assert!(!table.is_empty());
        debug_assert!(self.src.len() >= MIN_NON_LITERAL_BLOCK_SIZE);

        self.s += 1;
        self.s_limit -= INPUT_MARGIN;
        let mut next_hash = table.hash(LE::read_u32(&self.src[self.s..]));
        loop {
            let mut skip = 32;
            let mut candidate;
            let mut s_next = self.s;
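            // Heuristic match skipping: for the first 32 probes we advance
            // one byte at a time; after that, the stride between probes
            // keeps growing (every other byte, then every third, ...), so
            // that incompressible data is scanned quickly and ultimately
            // emitted as literals.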
            loop {
                self.s = s_next;
                let bytes_between_hash_lookups = skip >> 5;
                s_next = self.s + bytes_between_hash_lookups;
                skip += bytes_between_hash_lookups;
                if s_next > self.s_limit {
                    return self.done();
                }
                unsafe {
                    // SAFETY: next_hash is always computed by table.hash
                    // which is guaranteed to be in bounds.
                    candidate = *table.get_unchecked(next_hash) as usize;
                    *table.get_unchecked_mut(next_hash) = self.s as u16;

                    let srcp = self.src.as_ptr();
                    // SAFETY: s_next is guaranteed to be less than s_limit by
                    // the conditional above, which implies s_next is in
                    // bounds.
                    let x = loadu32_le(srcp.offset(s_next as isize));
                    next_hash = table.hash(x);
                    // SAFETY: self.s is always less than s_next, so it is also
                    // in bounds by the argument above.
                    //
                    // candidate is extracted from table, which is only ever
                    // set to valid positions in the block and is therefore
                    // also in bounds.
                    //
                    // We only need to compare cur/cand for equality, so we
                    // don't need to bother with endianness. cur corresponds
                    // to the bytes at the current position and cand
                    // corresponds to a potential match. If they're equal, we
                    // declare victory and move below to try and extend the
                    // match.
                    let cur = loadu32(srcp.offset(self.s as isize));
                    let cand = loadu32(srcp.offset(candidate as isize));
                    if cur == cand {
                        break;
                    }
                }
            }
            // While the above found a candidate for compression, before we
            // emit a copy operation for it, we need to make sure that we emit
            // any bytes between the last copy operation and this one as a
            // literal.
            let lit_end = self.s;
            unsafe {
                // SAFETY: next_emit is set to a previous value of self.s,
                // which is guaranteed to be less than s_limit (in bounds).
                // lit_end is set to the current value of self.s, also
                // guaranteed to be less than s_limit (in bounds).
                self.emit_literal(lit_end);
            }
            loop {
                // Look for more matching bytes starting at the position of
                // the candidate and the current src position. We increment
                // self.s and candidate by 4 since we already know the first 4
                // bytes match.
                let base = self.s;
                self.s += 4;
                unsafe {
                    // SAFETY: candidate is always set to a value from our
                    // hash table, which only contains positions in self.src
                    // that have been seen for this block that occurred before
                    // self.s.
                    self.extend_match(candidate + 4);
                }
                let (offset, len) = (base - candidate, self.s - base);
                self.emit_copy(offset, len);
                self.next_emit = self.s;
                if self.s >= self.s_limit {
                    return self.done();
                }
                // Update the hash table with the byte sequences
                // self.src[self.s - 1..self.s + 3] and
                // self.src[self.s..self.s + 4]. Instead of reading 4 bytes
                // twice, we read 8 bytes once.
                //
                // If we happen to get a hit on self.src[self.s..self.s + 4],
                // then continue this loop and extend the match.
                unsafe {
                    let srcp = self.src.as_ptr();
                    // SAFETY: The conditional above guarantees that self.s
                    // is less than s_limit, and self.s is always non-zero,
                    // so reading 8 bytes starting at self.s - 1 stays in
                    // bounds.
                    let x = loadu64_le(srcp.offset((self.s - 1) as isize));
                    // The lower 4 bytes of x correspond to
                    // self.src[self.s - 1..self.s + 3].
                    let prev_hash = table.hash(x as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    *table.get_unchecked_mut(prev_hash) = (self.s - 1) as u16;
                    // The lower 4 bytes of x>>8 correspond to
                    // self.src[self.s..self.s + 4].
                    let cur_hash = table.hash((x >> 8) as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    candidate = *table.get_unchecked(cur_hash) as usize;
                    *table.get_unchecked_mut(cur_hash) = self.s as u16;

                    // SAFETY: candidate is set from table, which always
                    // contains valid positions in the current block.
                    let y = loadu32_le(srcp.offset(candidate as isize));
                    if (x >> 8) as u32 != y {
                        // If we didn't get a hit, update the next hash
                        // and move on. Our initial 8 byte read continues to
                        // pay off.
                        next_hash = table.hash((x >> 16) as u32);
                        self.s += 1;
                        break;
                    }
                }
            }
        }
    }

    /// Emits one or more copy operations with the given offset and length.
    /// offset must be in the range [1, 65535] and len must be in the range
    /// [4, 65535].
    #[inline(always)]
    fn emit_copy(&mut self, offset: usize, mut len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        // Copy operations only allow lengths up to 64, but we'll allow bigger
        // lengths and emit as many operations as we need.
        //
        // N.B. Since our block size is 64KB, we never actually emit a copy 4
        // operation.
        debug_assert!(4 <= len && len <= 65535);

        // Emit copy 2 operations until we don't have to.
        // We check against 68 here and emit a copy shorter than 64 below
        // because it is cheaper to, e.g., encode a length 67 copy as a
        // length 60 copy 2 followed by a length 7 copy 1 than to encode it
        // as a length 64 copy 2 followed by a length 3 copy 2. The key here
        // is that a copy 1 operation requires a length of at least 4, which
        // forces a length 3 copy to use a copy 2 operation.
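        // Concretely, a length 67 copy costs 3 + 2 = 5 bytes as a length 60
        // copy 2 plus a length 7 copy 1, versus 3 + 3 = 6 bytes as a length
        // 64 copy 2 plus a length 3 copy 2.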
        while len >= 68 {
            self.emit_copy2(offset, 64);
            len -= 64;
        }
        if len > 64 {
            self.emit_copy2(offset, 60);
            len -= 60;
        }
        // If we can squeeze the last copy into a copy 1 operation, do it.
        if len <= 11 && offset <= 2047 {
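            // Copy 1 wire format: one tag byte holding the high 3 bits of
            // the offset (bits 5-7), len - 4 (bits 2-4) and the Copy1 tag
            // (bits 0-1), followed by the low 8 bits of the offset.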
            self.dst[self.d] =
                (((offset >> 8) as u8) << 5)
                | (((len - 4) as u8) << 2)
                | (Tag::Copy1 as u8);
            self.dst[self.d + 1] = offset as u8;
            self.d += 2;
        } else {
            self.emit_copy2(offset, len);
        }
    }

    /// Emits a "copy 2" operation with the given offset and length. The
    /// offset and length must be valid for a copy 2 operation. i.e., offset
    /// must be in the range [1, 65535] and len must be in the range [1, 64].
    #[inline(always)]
    fn emit_copy2(&mut self, offset: usize, len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        debug_assert!(1 <= len && len <= 64);
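        // Copy 2 wire format: one tag byte holding len - 1 (bits 2-7) and
        // the Copy2 tag (bits 0-1), followed by the offset as a little
        // endian u16.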
        self.dst[self.d] = (((len - 1) as u8) << 2) | (Tag::Copy2 as u8);
        LE::write_u16(&mut self.dst[self.d + 1..], offset as u16);
        self.d += 3;
    }

    /// Attempts to extend a match from the current position in self.src with
    /// the candidate position given.
    ///
    /// This method uses unaligned loads and elides bounds checks, so the
    /// caller must guarantee that cand points to a valid location in self.src
    /// and is less than the current position in src.
    #[inline(always)]
    unsafe fn extend_match(&mut self, mut cand: usize) {
        debug_assert!(cand < self.s);
        while self.s + 8 <= self.src.len() {
            let srcp = self.src.as_ptr();
            // SAFETY: The loop invariant guarantees that there is at least
            // 8 bytes to read at self.src + self.s. Since cand must be
            // guaranteed by the caller to be valid and less than self.s, it
            // also has enough room to read 8 bytes.
            //
            // TODO(ag): Despite my best efforts, I couldn't get this to
            // autovectorize with 128-bit loads. The logic after the loads
            // appears to be a little too clever...
            let x = loadu64(srcp.offset(self.s as isize));
            let y = loadu64(srcp.offset(cand as isize));
            if x == y {
                // If all 8 bytes are equal, move on...
                self.s += 8;
                cand += 8;
            } else {
                // Otherwise, find how many leading bytes were equal. We can
                // do this efficiently by interpreting x/y as little endian
                // numbers, which lets us use the number of trailing zeros in
                // their XOR to count the number of matching bytes.
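                // For example, if only the low 3 bytes of x and y agree,
                // then z has between 24 and 31 trailing zeros, so
                // z.trailing_zeros() / 8 == 3 and we advance self.s past
                // exactly the 3 matching bytes.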
                let z = x.to_le() ^ y.to_le();
                self.s += z.trailing_zeros() as usize / 8;
                return;
            }
        }
        // When we have fewer than 8 bytes left in the block, fall back to the
        // slow loop.
        while self.s < self.src.len() && self.src[self.s] == self.src[cand] {
            self.s += 1;
            cand += 1;
        }
    }

    /// Executes any cleanup when the current block has finished compressing.
    /// In particular, it emits any leftover bytes as a literal.
    #[inline(always)]
    fn done(&mut self) {
        if self.next_emit < self.src.len() {
            let lit_end = self.src.len();
            unsafe {
                // SAFETY: Both next_emit and lit_end are trivially in bounds
                // given the conditional and definition above.
                self.emit_literal(lit_end);
            }
        }
    }

    /// Emits a literal from self.src[self.next_emit..lit_end].
    ///
    /// This uses unaligned loads and elides bounds checks, so the caller must
    /// guarantee that self.src[self.next_emit..lit_end] is valid.
    #[inline(always)]
    unsafe fn emit_literal(&mut self, lit_end: usize) {
        let lit_start = self.next_emit;
        let len = lit_end - lit_start;
        let n = len.checked_sub(1).unwrap();
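        // Literal wire format: for lengths up to 60, len - 1 is stored
        // directly in the tag byte; longer literals store 60 or 61 in the
        // tag byte and put len - 1 in one or two following bytes.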
        if n <= 59 {
            self.dst[self.d] = ((n as u8) << 2) | (Tag::Literal as u8);
            self.d += 1;
            if len <= 16 && lit_start + 16 <= self.src.len() {
                // SAFETY: lit_start is equivalent to self.next_emit, which
                // is only set to self.s immediately after a copy is emitted.
                // The conditional above also ensures that there are at least
                // 16 bytes of room in both src and dst.
                //
                // dst is big enough because the buffer is guaranteed to be
                // big enough to hold the biggest possible compressed size
                // plus an extra 32 bytes, which exceeds the 16 byte copy
                // here.
                let srcp = self.src.as_ptr().offset(lit_start as isize);
                let dstp = self.dst.as_mut_ptr().offset(self.d as isize);
                ptr::copy_nonoverlapping(srcp, dstp, 16);
                self.d += len;
                return;
            }
        } else if n < 256 {
            self.dst[self.d] = (60 << 2) | (Tag::Literal as u8);
            self.dst[self.d + 1] = n as u8;
            self.d += 2;
        } else {
            self.dst[self.d] = (61 << 2) | (Tag::Literal as u8);
            LE::write_u16(&mut self.dst[self.d + 1..], n as u16);
            self.d += 3;
        }
        // SAFETY: lit_start is equivalent to self.next_emit, which is only
        // set to self.s immediately after a copy is emitted, which implies
        // that it always points to valid bytes in self.src.
        //
        // We can't guarantee that there are at least len bytes though,
        // which must be guaranteed by the caller and is why this method
        // is unsafe.
        let srcp = self.src.as_ptr().offset(lit_start as isize);
        let dstp = self.dst.as_mut_ptr().offset(self.d as isize);
        ptr::copy_nonoverlapping(srcp, dstp, len);
        self.d += len;
    }
}

/// `BlockTable` is a map from 4 byte sequences to positions of their most
/// recent occurrence in a block. In particular, this table lets us quickly
/// find candidates for compression.
///
/// We expose the `hash` method so that callers can be fastidious about the
/// number of times a hash is computed.
struct BlockTable<'a> {
    table: &'a mut [u16],
    /// The number of bits required to shift the hash such that the result
    /// is less than table.len().
    shift: u32,
}

impl Encoder {
    fn block_table(&mut self, block_size: usize) -> BlockTable {
        let mut shift: u32 = 32 - 8;
        let mut table_size = 256;
        while table_size < MAX_TABLE_SIZE && table_size < block_size {
            shift -= 1;
            table_size *= 2;
        }
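        // At this point, table_size == 1 << (32 - shift), so hash values
        // (a u32 shifted right by `shift` bits) are always less than
        // table_size and index within the table.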
        // If our block size is small, then use a small stack allocated table
        // instead of putting a bigger one on the heap. This particular
        // optimization is important if the caller is using Snappy to compress
        // many small blocks. (The memset savings alone is considerable.)
        let table: &mut [u16] =
            if table_size <= SMALL_TABLE_SIZE {
                &mut self.small[0..table_size]
            } else {
                if self.big.is_empty() {
                    // Interestingly, using `self.big.resize` here led to some
                    // very weird code getting generated that led to a large
                    // slow down. Forcing the issue with a new vec seems to
                    // fix it. ---AG
                    self.big = vec![0; MAX_TABLE_SIZE];
                }
                &mut self.big[0..table_size]
            };
        for x in &mut *table {
            *x = 0;
        }
        BlockTable {
            table: table,
            shift: shift,
        }
    }
}

impl<'a> BlockTable<'a> {
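    /// Hashes the given u32 (the 4 bytes at the current position) into an
    /// index that is guaranteed to be less than the table's length.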
    #[inline(always)]
    fn hash(&self, x: u32) -> usize {
        (x.wrapping_mul(0x1E35A7BD) >> self.shift) as usize
    }
}

impl<'a> Deref for BlockTable<'a> {
    type Target = [u16];
    fn deref(&self) -> &[u16] { self.table }
}

impl<'a> DerefMut for BlockTable<'a> {
    fn deref_mut(&mut self) -> &mut [u16] { self.table }
}

/// Reads 8 bytes from `data` as a native endian u64 via an unaligned,
/// unchecked load. The caller must guarantee that at least 8 bytes are
/// readable at `data`.
unsafe fn loadu64(data: *const u8) -> u64 {
    let mut n: u64 = 0;
    ptr::copy_nonoverlapping(
        data,
        &mut n as *mut u64 as *mut u8,
        8);
    n
}

unsafe fn loadu64_le(data: *const u8) -> u64 {
    loadu64(data).to_le()
}

/// Reads 4 bytes from `data` as a native endian u32 via an unaligned,
/// unchecked load. The caller must guarantee that at least 4 bytes are
/// readable at `data`.
unsafe fn loadu32(data: *const u8) -> u32 {
    let mut n: u32 = 0;
    ptr::copy_nonoverlapping(
        data,
        &mut n as *mut u32 as *mut u8,
        4);
    n
}

unsafe fn loadu32_le(data: *const u8) -> u32 {
    loadu32(data).to_le()
}