// Copyright 2005 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "snappy.h"
#include "snappy-internal.h"
#include "snappy-sinksource.h"

#include <stdio.h>

#include <algorithm>
#include <string>
#include <vector>


namespace snappy {
// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
// input. Of course, it doesn't hurt if the hash function is reasonably fast
// either, as it gets called a lot.
static inline uint32 HashBytes(uint32 bytes, int shift) {
  uint32 kMul = 0x1e35a7bd;
  return (bytes * kMul) >> shift;
}
static inline uint32 Hash(const char* p, int shift) {
  return HashBytes(UNALIGNED_LOAD32(p), shift);
}

size_t MaxCompressedLength(size_t source_len) {
  // Compressed data can be defined as:
  //    compressed := item* literal*
  //    item       := literal* copy
  //
  // The trailing literal sequence has a space blowup of at most 62/60
  // since a literal of length 60 needs one tag byte + one extra byte
  // for length information.
  //
  // Item blowup is trickier to measure.  Suppose the "copy" op copies
  // 4 bytes of data.  Because of a special check in the encoding code,
  // we produce a 4-byte copy only if the offset is < 65536.  Therefore
  // the copy op takes 3 bytes to encode, and this type of item leads
  // to at most the 62/60 blowup for representing literals.
  //
  // Suppose the "copy" op copies 5 bytes of data.  If the offset is big
  // enough, it will take 5 bytes to encode the copy op.  Therefore the
  // worst case here is a one-byte literal followed by a five-byte copy.
  // I.e., 6 bytes of input turn into 7 bytes of "compressed" data.
  //
  // This last factor dominates the blowup, so the final estimate is:
  return 32 + source_len + source_len/6;
}
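
// A minimal usage sketch (editor's addition, not in the original source):
// callers size the output buffer with MaxCompressedLength() before calling
// RawCompress(), which is defined later in this file:
//
//   char* output = new char[snappy::MaxCompressedLength(input_len)];
//   size_t output_len;
//   snappy::RawCompress(input, input_len, output, &output_len);
//   // output_len <= MaxCompressedLength(input_len) always holds.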

enum {
  LITERAL = 0,
  COPY_1_BYTE_OFFSET = 1,  // 3 bit length + 3 bits of offset in opcode
  COPY_2_BYTE_OFFSET = 2,
  COPY_4_BYTE_OFFSET = 3
};

// Copy "len" bytes from "src" to "op", one byte at a time.  Used for
// handling COPY operations where the input and output regions may
// overlap.  For example, suppose:
//    src    == "ab"
//    op     == src + 2
//    len    == 20
// After IncrementalCopy(src, op, len), the result will have
// eleven copies of "ab"
//    ababababababababababab
// Note that this does not match the semantics of either memcpy()
// or memmove().
static inline void IncrementalCopy(const char* src, char* op, int len) {
  DCHECK_GT(len, 0);
  do {
    *op++ = *src++;
  } while (--len > 0);
}

// Equivalent to IncrementalCopy except that it can write up to ten extra
// bytes after the end of the copy, and that it is faster.
//
// The main part of this loop is a simple copy of eight bytes at a time until
// we've copied (at least) the requested amount of bytes.  However, if op and
// src are less than eight bytes apart (indicating a repeating pattern of
// length < 8), we first need to expand the pattern in order to get the correct
// results.  For instance, if the buffer looks like this, with the eight-byte
// <src> and <op> patterns marked as intervals:
//
//    abxxxxxxxxxxxx
//    [------]           src
//      [------]         op
//
// a single eight-byte copy from <src> to <op> will repeat the pattern once,
// after which we can move <op> two bytes without moving <src>:
//
//    ababxxxxxxxxxx
//    [------]           src
//        [------]       op
//
// and repeat the exercise until the two no longer overlap.
//
// This allows us to do very well in the special case of one single byte
// repeated many times, without taking a big hit for more general cases.
//
// The worst case of extra writing past the end of the match occurs when
// op - src == 1 and len == 1; the last copy will read from byte positions
// [0..7] and write to [4..11], whereas it was only supposed to write to
// position 1.  Thus, ten excess bytes.

namespace {

const int kMaxIncrementCopyOverflow = 10;

}  // namespace

static inline void IncrementalCopyFastPath(const char* src, char* op, int len) {
  while (op - src < 8) {
    UnalignedCopy64(src, op);
    len -= op - src;
    op += op - src;
  }
  while (len > 0) {
    UnalignedCopy64(src, op);
    src += 8;
    op += 8;
    len -= 8;
  }
}
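
// Worked example (editor's illustration, not in the original source): for a
// two-byte pattern "ab" (op - src == 2), the first eight-byte copy makes the
// four bytes before op a valid "abab" pattern and advances op by two, so
// op - src becomes 4; the next copy doubles it to 8, and the simple
// eight-byte loop takes over.  The distance doubles each round, so the
// expansion loop runs at most three times for any distance in [1..7].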

static inline char* EmitLiteral(char* op,
                                const char* literal,
                                int len,
                                bool allow_fast_path) {
  int n = len - 1;      // Zero-length literals are disallowed
  if (n < 60) {
    // Fits in tag byte
    *op++ = LITERAL | (n << 2);

    // The vast majority of copies are below 16 bytes, for which a
    // call to memcpy is overkill. This fast path can sometimes
    // copy up to 15 bytes too much, but that is okay in the
    // main loop, since we have a bit to go on for both sides:
    //
    //   - The input will always have kInputMarginBytes = 15 extra
    //     available bytes, as long as we're in the main loop, and
    //     if not, allow_fast_path = false.
    //   - The output will always have 32 spare bytes (see
    //     MaxCompressedLength).
    if (allow_fast_path && len <= 16) {
      UnalignedCopy64(literal, op);
      UnalignedCopy64(literal + 8, op + 8);
      return op + len;
    }
  } else {
    // Encode in upcoming bytes
    char* base = op;
    int count = 0;
    op++;
    while (n > 0) {
      *op++ = n & 0xff;
      n >>= 8;
      count++;
    }
    assert(count >= 1);
    assert(count <= 4);
    *base = LITERAL | ((59+count) << 2);
  }
  memcpy(op, literal, len);
  return op + len;
}
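
// Encoding examples (editor's illustration, not in the original source):
//   len == 5:   n == 4 < 60, so the single tag byte is
//               LITERAL | (4 << 2) == 0x10, followed by the 5 literal bytes.
//   len == 100: n == 99 needs one extra byte (count == 1), so the tag is
//               LITERAL | (60 << 2) == 0xf0, followed by 0x63 (== 99) and
//               then the 100 literal bytes.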

static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
  DCHECK_LE(len, 64);
  DCHECK_GE(len, 4);
  DCHECK_LT(offset, 65536);

  if ((len < 12) && (offset < 2048)) {
    size_t len_minus_4 = len - 4;
    assert(len_minus_4 < 8);        // Must fit in 3 bits
    *op++ = COPY_1_BYTE_OFFSET | ((len_minus_4) << 2) | ((offset >> 8) << 5);
    *op++ = offset & 0xff;
  } else {
    *op++ = COPY_2_BYTE_OFFSET | ((len-1) << 2);
    LittleEndian::Store16(op, offset);
    op += 2;
  }
  return op;
}
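
// Encoding example (editor's illustration, not in the original source):
// len == 5, offset == 300 uses the two-byte form: the tag is
// COPY_1_BYTE_OFFSET | ((5-4) << 2) | ((300 >> 8) << 5) == 0x25, and the
// next byte is 300 & 0xff == 0x2c.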

static inline char* EmitCopy(char* op, size_t offset, int len) {
  // Emit 64-byte copies but make sure to keep at least four bytes reserved,
  // since EmitCopyLessThan64() requires len >= 4.
  while (len >= 68) {
    op = EmitCopyLessThan64(op, offset, 64);
    len -= 64;
  }

  // Emit an extra 60-byte copy if we have too much data to fit in one copy;
  // this leaves a remainder of 5..7 bytes, still at least four.
  if (len > 64) {
    op = EmitCopyLessThan64(op, offset, 60);
    len -= 60;
  }

  // Emit remainder
  op = EmitCopyLessThan64(op, offset, len);
  return op;
}
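
// Encoding example (editor's illustration, not in the original source):
// EmitCopy(op, 1000, 200) emits three 64-byte copies (len 200 -> 136 -> 72
// -> 8) and then one 8-byte copy; since 8 < 12 and 1000 < 2048, the final
// copy uses the two-byte COPY_1_BYTE_OFFSET form, for 3*3 + 2 = 11 bytes
// of output in total.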

bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
  uint32 v = 0;
  const char* limit = start + n;
  if (Varint::Parse32WithLimit(start, limit, &v) != NULL) {
    *result = v;
    return true;
  } else {
    return false;
  }
}

namespace internal {
uint16* WorkingMemory::GetHashTable(size_t input_size, int* table_size) {
  // Use smaller hash table when input.size() is smaller, since we
  // fill the table, incurring O(hash table size) overhead for
  // compression, and if the input is short, we won't need that
  // many hash table entries anyway.
  assert(kMaxHashTableSize >= 256);
  size_t htsize = 256;
  while (htsize < kMaxHashTableSize && htsize < input_size) {
    htsize <<= 1;
  }
  CHECK_EQ(0, htsize & (htsize - 1)) << ": must be power of two";
  CHECK_LE(htsize, kMaxHashTableSize) << ": hash table too large";

  uint16* table;
  if (htsize <= ARRAYSIZE(small_table_)) {
    table = small_table_;
  } else {
    if (large_table_ == NULL) {
      large_table_ = new uint16[kMaxHashTableSize];
    }
    table = large_table_;
  }

  *table_size = htsize;
  memset(table, 0, htsize * sizeof(*table));
  return table;
}
}  // end namespace internal

// For 0 <= offset <= 4, GetUint32AtOffset(GetEightBytesAt(p), offset) will
// equal UNALIGNED_LOAD32(p + offset).  Motivation: On x86-64 hardware we have
// empirically found that overlapping loads such as
//  UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
// are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
//
// We have different versions for 64- and 32-bit; ideally we would avoid the
// two functions and just inline the UNALIGNED_LOAD64 call into
// GetUint32AtOffset, but GCC (at least not as of 4.6) is seemingly not clever
// enough to avoid loading the value multiple times then.  For 64-bit, the load
// is done when GetEightBytesAt() is called, whereas for 32-bit, the load is
// done at GetUint32AtOffset() time.

#ifdef ARCH_K8

typedef uint64 EightBytesReference;

static inline EightBytesReference GetEightBytesAt(const char* ptr) {
  return UNALIGNED_LOAD64(ptr);
}

static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
  DCHECK_GE(offset, 0);
  DCHECK_LE(offset, 4);
  return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
}

#else

typedef const char* EightBytesReference;

static inline EightBytesReference GetEightBytesAt(const char* ptr) {
  return ptr;
}

static inline uint32 GetUint32AtOffset(const char* v, int offset) {
  DCHECK_GE(offset, 0);
  DCHECK_LE(offset, 4);
  return UNALIGNED_LOAD32(v + offset);
}

#endif

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input" is at most "kBlockSize" bytes long.
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input.size())" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
namespace internal {
char* CompressFragment(const char* input,
                       size_t input_size,
                       char* op,
                       uint16* table,
                       const int table_size) {
335 // "ip" is the input pointer, and "op" is the output pointer.
336 const char* ip = input;
337 CHECK_LE(input_size, kBlockSize);
338 CHECK_EQ(table_size & (table_size - 1), 0) << ": table must be power of two";
339 const int shift = 32 - Bits::Log2Floor(table_size);
340 DCHECK_EQ(static_cast<int>(kuint32max >> shift), table_size - 1);
341 const char* ip_end = input + input_size;
342 const char* base_ip = ip;
343 // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
344 // [next_emit, ip_end) after the main loop.
345 const char* next_emit = ip;
347 const size_t kInputMarginBytes = 15;
348 if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
349 const char* ip_limit = input + input_size - kInputMarginBytes;
351 for (uint32 next_hash = Hash(++ip, shift); ; ) {
      DCHECK_LT(next_emit, ip);
      // The body of this loop calls EmitLiteral once and then EmitCopy one or
      // more times.  (The exception is that when we're close to exhausting
      // the input we goto emit_remainder.)
      //
      // In the first iteration of this loop we're just starting, so
      // there's nothing to copy, so calling EmitLiteral once is
      // necessary.  And we only start a new iteration when the
      // current iteration has determined that a call to EmitLiteral will
      // precede the next call to EmitCopy (if any).
      //
      // Step 1: Scan forward in the input looking for a 4-byte-long match.
      // If we get close to exhausting the input then goto emit_remainder.
      //
      // Heuristic match skipping: If 32 bytes are scanned with no matches
      // found, start looking only at every other byte. If 32 more bytes are
      // scanned, look at every third byte, etc.. When a match is found,
      // immediately go back to looking at every byte. This is a small loss
      // (~5% performance, ~0.1% density) for compressible data due to more
      // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
      // win since the compressor quickly "realizes" the data is incompressible
      // and doesn't bother looking for matches everywhere.
      //
      // The "skip" variable keeps track of how many bytes there are since the
      // last match; dividing it by 32 (ie. right-shifting by five) gives the
      // number of bytes to move ahead for each iteration.
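      //
      // Editor's illustration (not in the original source): skip starts at
      // 32, so skip++ >> 5 yields 1 for each of the first 32 probes, then 2
      // for the next 32, then 3, and so on; after roughly a hundred
      // fruitless probes the scan is already advancing four bytes per step.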
      uint32 skip = 32;

      const char* next_ip = ip;
      const char* candidate;
      do {
        ip = next_ip;
        uint32 hash = next_hash;
        DCHECK_EQ(hash, Hash(ip, shift));
        uint32 bytes_between_hash_lookups = skip++ >> 5;
        next_ip = ip + bytes_between_hash_lookups;
        if (PREDICT_FALSE(next_ip > ip_limit)) {
          goto emit_remainder;
        }
        next_hash = Hash(next_ip, shift);
        candidate = base_ip + table[hash];
        DCHECK_GE(candidate, base_ip);
        DCHECK_LT(candidate, ip);

        table[hash] = ip - base_ip;
      } while (PREDICT_TRUE(UNALIGNED_LOAD32(ip) !=
                            UNALIGNED_LOAD32(candidate)));

      // Step 2: A 4-byte match has been found.  We'll later see if more
      // than 4 bytes match.  But, prior to the match, input
      // bytes [next_emit, ip) are unmatched.  Emit them as "literal bytes."
      DCHECK_LE(next_emit + 16, ip_end);
      op = EmitLiteral(op, next_emit, ip - next_emit, true);

      // Step 3: Call EmitCopy, and then see if another EmitCopy could
      // be our next move.  Repeat until we find no match for the
      // input immediately after what was consumed by the last EmitCopy call.
      //
      // If we exit this loop normally then we need to call EmitLiteral next,
      // though we don't yet know how big the literal will be.  We handle that
      // by proceeding to the next iteration of the main loop.  We also can
      // exit this loop via goto if we get close to exhausting the input.
      EightBytesReference input_bytes;
      uint32 candidate_bytes = 0;

      do {
        // We have a 4-byte match at ip, and no need to emit any
        // "literal bytes" prior to ip.
        const char* base = ip;
        int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
        ip += matched;
        size_t offset = base - candidate;
        DCHECK_EQ(0, memcmp(base, candidate, matched));
        op = EmitCopy(op, offset, matched);
        // We could immediately start working at ip now, but to improve
        // compression we first update table[Hash(ip - 1, ...)].
        const char* insert_tail = ip - 1;
        next_emit = ip;
        if (PREDICT_FALSE(ip >= ip_limit)) {
          goto emit_remainder;
        }
        input_bytes = GetEightBytesAt(insert_tail);
        uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
        table[prev_hash] = ip - base_ip - 1;
        uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
        candidate = base_ip + table[cur_hash];
        candidate_bytes = UNALIGNED_LOAD32(candidate);
        table[cur_hash] = ip - base_ip;
      } while (GetUint32AtOffset(input_bytes, 1) == candidate_bytes);

      next_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
      ++ip;
    }
  }

 emit_remainder:
  // Emit the remaining bytes as a literal
  if (next_emit < ip_end) {
    op = EmitLiteral(op, next_emit, ip_end - next_emit, false);
  }

  return op;
}
}  // end namespace internal

// Signature of output types needed by decompression code.
// The decompression code is templatized on a type that obeys this
// signature so that we do not pay virtual function call overhead in
// the middle of a tight decompression loop.
//
// class DecompressionWriter {
//  public:
//   // Called before decompression
//   void SetExpectedLength(size_t length);
//
//   // Called after decompression
//   bool CheckLength() const;
//
//   // Called repeatedly during decompression
//   bool Append(const char* ip, size_t length);
//   bool AppendFromSelf(uint32 offset, size_t length);
//
//   // The difference between TryFastAppend and Append is that TryFastAppend
//   // is allowed to read up to <available> bytes from the input buffer,
//   // whereas Append is allowed to read <length>.
//   //
//   // Also, TryFastAppend is allowed to return false, declining the append,
//   // without it being a fatal error -- just "return false" would be
//   // a perfectly legal implementation of TryFastAppend.  The intention
//   // is for TryFastAppend to allow a fast path in the common case of
//   // a small append.
//   //
//   // NOTE(user): TryFastAppend must always decline (return false)
//   // if <length> is 61 or more, as in this case the literal length is not
//   // decoded fully.  In practice, this should not be a big problem,
//   // as it is unlikely that one would implement a fast path accepting
//   // this much data.
//   bool TryFastAppend(const char* ip, size_t available, size_t length);
// };

// -----------------------------------------------------------------------
// Lookup table for decompression code.  Generated by ComputeTable() below.
// -----------------------------------------------------------------------

// Mapping from i in range [0,4] to a mask to extract the bottom 8*i bits
static const uint32 wordmask[] = {
  0u, 0xffu, 0xffffu, 0xffffffu, 0xffffffffu
};

// Data stored per entry in lookup table:
//      Range   Bits-used       Description
//      ------------------------------------
//      1..64   0..7            Literal/copy length encoded in opcode byte
//      0..7    8..10           Copy offset encoded in opcode byte / 256
//      0..4    11..13          Extra bytes after opcode
//
// We use eight bits for the length even though 7 would have sufficed,
// for efficiency reasons:
//      (1) Extracting a byte is faster than a bit-field
//      (2) It properly aligns copy offset so we do not need a <<8
static const uint16 char_table[256] = {
  0x0001, 0x0804, 0x1001, 0x2001, 0x0002, 0x0805, 0x1002, 0x2002,
  0x0003, 0x0806, 0x1003, 0x2003, 0x0004, 0x0807, 0x1004, 0x2004,
  0x0005, 0x0808, 0x1005, 0x2005, 0x0006, 0x0809, 0x1006, 0x2006,
  0x0007, 0x080a, 0x1007, 0x2007, 0x0008, 0x080b, 0x1008, 0x2008,
  0x0009, 0x0904, 0x1009, 0x2009, 0x000a, 0x0905, 0x100a, 0x200a,
  0x000b, 0x0906, 0x100b, 0x200b, 0x000c, 0x0907, 0x100c, 0x200c,
  0x000d, 0x0908, 0x100d, 0x200d, 0x000e, 0x0909, 0x100e, 0x200e,
  0x000f, 0x090a, 0x100f, 0x200f, 0x0010, 0x090b, 0x1010, 0x2010,
  0x0011, 0x0a04, 0x1011, 0x2011, 0x0012, 0x0a05, 0x1012, 0x2012,
  0x0013, 0x0a06, 0x1013, 0x2013, 0x0014, 0x0a07, 0x1014, 0x2014,
  0x0015, 0x0a08, 0x1015, 0x2015, 0x0016, 0x0a09, 0x1016, 0x2016,
  0x0017, 0x0a0a, 0x1017, 0x2017, 0x0018, 0x0a0b, 0x1018, 0x2018,
  0x0019, 0x0b04, 0x1019, 0x2019, 0x001a, 0x0b05, 0x101a, 0x201a,
  0x001b, 0x0b06, 0x101b, 0x201b, 0x001c, 0x0b07, 0x101c, 0x201c,
  0x001d, 0x0b08, 0x101d, 0x201d, 0x001e, 0x0b09, 0x101e, 0x201e,
  0x001f, 0x0b0a, 0x101f, 0x201f, 0x0020, 0x0b0b, 0x1020, 0x2020,
  0x0021, 0x0c04, 0x1021, 0x2021, 0x0022, 0x0c05, 0x1022, 0x2022,
  0x0023, 0x0c06, 0x1023, 0x2023, 0x0024, 0x0c07, 0x1024, 0x2024,
  0x0025, 0x0c08, 0x1025, 0x2025, 0x0026, 0x0c09, 0x1026, 0x2026,
  0x0027, 0x0c0a, 0x1027, 0x2027, 0x0028, 0x0c0b, 0x1028, 0x2028,
  0x0029, 0x0d04, 0x1029, 0x2029, 0x002a, 0x0d05, 0x102a, 0x202a,
  0x002b, 0x0d06, 0x102b, 0x202b, 0x002c, 0x0d07, 0x102c, 0x202c,
  0x002d, 0x0d08, 0x102d, 0x202d, 0x002e, 0x0d09, 0x102e, 0x202e,
  0x002f, 0x0d0a, 0x102f, 0x202f, 0x0030, 0x0d0b, 0x1030, 0x2030,
  0x0031, 0x0e04, 0x1031, 0x2031, 0x0032, 0x0e05, 0x1032, 0x2032,
  0x0033, 0x0e06, 0x1033, 0x2033, 0x0034, 0x0e07, 0x1034, 0x2034,
  0x0035, 0x0e08, 0x1035, 0x2035, 0x0036, 0x0e09, 0x1036, 0x2036,
  0x0037, 0x0e0a, 0x1037, 0x2037, 0x0038, 0x0e0b, 0x1038, 0x2038,
  0x0039, 0x0f04, 0x1039, 0x2039, 0x003a, 0x0f05, 0x103a, 0x203a,
  0x003b, 0x0f06, 0x103b, 0x203b, 0x003c, 0x0f07, 0x103c, 0x203c,
  0x0801, 0x0f08, 0x103d, 0x203d, 0x1001, 0x0f09, 0x103e, 0x203e,
  0x1801, 0x0f0a, 0x103f, 0x203f, 0x2001, 0x0f0b, 0x1040, 0x2040
};

// In debug mode, allow optional computation of the table at startup.
// Also, check that the decompression table is correct.
#ifndef NDEBUG
DEFINE_bool(snappy_dump_decompression_table, false,
            "If true, we print the decompression table at startup.");

static uint16 MakeEntry(unsigned int extra,
                        unsigned int len,
                        unsigned int copy_offset) {
  // Check that all of the fields fit within the allocated space
  DCHECK_EQ(extra, extra & 0x7);              // At most 3 bits
  DCHECK_EQ(copy_offset, copy_offset & 0x7);  // At most 3 bits
  DCHECK_EQ(len, len & 0x7f);                 // At most 7 bits
  return len | (copy_offset << 8) | (extra << 11);
}

static void ComputeTable() {
  uint16 dst[256];

  // Place invalid entries in all places to detect missing initialization
  int assigned = 0;
  for (int i = 0; i < 256; i++) {
    dst[i] = 0xffff;
  }

  // Small LITERAL entries.  We store (len-1) in the top 6 bits.
  for (unsigned int len = 1; len <= 60; len++) {
    dst[LITERAL | ((len-1) << 2)] = MakeEntry(0, len, 0);
    assigned++;
  }

  // Large LITERAL entries.  We use 60..63 in the high 6 bits to
  // encode the number of bytes of length info that follow the opcode.
  for (unsigned int extra_bytes = 1; extra_bytes <= 4; extra_bytes++) {
    // We set the length field in the lookup table to 1 because extra
    // bytes encode len-1.
    dst[LITERAL | ((extra_bytes+59) << 2)] = MakeEntry(extra_bytes, 1, 0);
    assigned++;
  }

  // COPY_1_BYTE_OFFSET.
  //
  // The tag byte in the compressed data stores len-4 in 3 bits, and
  // offset/256 in 5 bits.  offset%256 is stored in the next byte.
  //
  // This format is used for length in range [4..11] and offset in
  // range [0..2047]
  for (unsigned int len = 4; len < 12; len++) {
    for (unsigned int offset = 0; offset < 2048; offset += 256) {
      dst[COPY_1_BYTE_OFFSET | ((len-4)<<2) | ((offset>>8)<<5)] =
        MakeEntry(1, len, offset>>8);
      assigned++;
    }
  }

  // COPY_2_BYTE_OFFSET.
  // Tag contains len-1 in top 6 bits, and offset in next two bytes.
  for (unsigned int len = 1; len <= 64; len++) {
    dst[COPY_2_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(2, len, 0);
    assigned++;
  }

  // COPY_4_BYTE_OFFSET.
  // Tag contains len-1 in top 6 bits, and offset in next four bytes.
  for (unsigned int len = 1; len <= 64; len++) {
    dst[COPY_4_BYTE_OFFSET | ((len-1)<<2)] = MakeEntry(4, len, 0);
    assigned++;
  }

  // Check that each entry was initialized exactly once.
  CHECK_EQ(assigned, 256);
  for (int i = 0; i < 256; i++) {
    CHECK_NE(dst[i], 0xffff);
  }

  if (FLAGS_snappy_dump_decompression_table) {
    printf("static const uint16 char_table[256] = {\n  ");
    for (int i = 0; i < 256; i++) {
      printf("0x%04x%s",
             dst[i],
             ((i == 255) ? "\n" : (((i%8) == 7) ? ",\n  " : ", ")));
    }
    printf("};\n");
  }

  // Check that computed table matched recorded table
  for (int i = 0; i < 256; i++) {
    CHECK_EQ(dst[i], char_table[i]);
  }
}
#endif /* !NDEBUG */

// Helper class for decompression
class SnappyDecompressor {
 private:
  Source*       reader_;         // Underlying source of bytes to decompress
  const char*   ip_;             // Points to next buffered byte
  const char*   ip_limit_;       // Points just past buffered bytes
  uint32        peeked_;         // Bytes peeked from reader (need to skip)
  bool          eof_;            // Hit end of input without an error?
  char          scratch_[5];     // Temporary buffer for RefillTag() boundaries

  // Ensure that all of the tag metadata for the next tag is available
  // in [ip_..ip_limit_-1].  Also ensures that [ip,ip+4] is readable even
  // if (ip_limit_ - ip_ < 5).
  //
  // Returns true on success, false on error or end of input.
  bool RefillTag();

 public:
  explicit SnappyDecompressor(Source* reader)
      : reader_(reader),
        ip_(NULL),
        ip_limit_(NULL),
        peeked_(0),
        eof_(false) {
  }

  ~SnappyDecompressor() {
    // Advance past any bytes we peeked at from the reader
    reader_->Skip(peeked_);
  }

  // Returns true iff we have hit the end of the input without an error.
  bool eof() const {
    return eof_;
  }

  // Read the uncompressed length stored at the start of the compressed data.
  // On success, stores the length in *result and returns true.
  // On failure, returns false.
  bool ReadUncompressedLength(uint32* result) {
    DCHECK(ip_ == NULL);       // Must not have read anything yet
    // Length is encoded in 1..5 bytes
    *result = 0;
    uint32 shift = 0;
    while (true) {
      if (shift >= 32) return false;
      size_t n;
      const char* ip = reader_->Peek(&n);
      if (n == 0) return false;
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
      reader_->Skip(1);
      *result |= static_cast<uint32>(c & 0x7f) << shift;
      if (c < 128) {
        break;
      }
      shift += 7;
    }
    return true;
  }
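
  // Varint examples (editor's illustration, not in the original source):
  // a length of 64 is stored as the single byte 0x40, while 2097150
  // (0x1FFFFE) is stored as 0xFE 0xFF 0x7F -- each byte carries seven
  // payload bits, and the high bit marks continuation.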

  // Process the entire input, writing decompressed data to "writer".
  // Errors and end of input terminate the loop; callers distinguish the
  // two via eof() and the writer's CheckLength().
  template <class Writer>
  void DecompressAllTags(Writer* writer) {
    const char* ip = ip_;

    // We could have put this refill fragment only at the beginning of the loop.
    // However, duplicating it at the end of each branch gives the compiler more
    // scope to optimize the <ip_limit_ - ip> expression based on the local
    // context, which overall increases speed.
#define MAYBE_REFILL() \
    if (ip_limit_ - ip < 5) { \
      ip_ = ip; \
      if (!RefillTag()) return; \
      ip = ip_; \
    }

    MAYBE_REFILL();
    for ( ;; ) {
      const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));

      if ((c & 0x3) == LITERAL) {
        size_t literal_length = (c >> 2) + 1u;
        if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
          DCHECK_LT(literal_length, 61);
          ip += literal_length;
          MAYBE_REFILL();
          continue;
        }

        if (PREDICT_FALSE(literal_length >= 61)) {
          // Long literal.
          const size_t literal_length_length = literal_length - 60;
          literal_length =
              (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
          ip += literal_length_length;
        }

        size_t avail = ip_limit_ - ip;
        while (avail < literal_length) {
          if (!writer->Append(ip, avail)) return;
          literal_length -= avail;
          reader_->Skip(peeked_);
          size_t n;
          ip = reader_->Peek(&n);
          avail = n;
          peeked_ = avail;
          if (avail == 0) return;  // Premature end of input
          ip_limit_ = ip + avail;
        }
        if (!writer->Append(ip, literal_length)) {
          return;
        }
        ip += literal_length;
        MAYBE_REFILL();
      } else {
        const uint32 entry = char_table[c];
        const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
        const uint32 length = entry & 0xff;
        ip += entry >> 11;

        // copy_offset/256 is encoded in bits 8..10.  By just fetching
        // those bits, we get copy_offset (since the bit-field starts at
        // bit 8).
        const uint32 copy_offset = entry & 0x700;
        if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
          return;
        }
        MAYBE_REFILL();
      }
    }

#undef MAYBE_REFILL
  }
};
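
// Decoding example (editor's illustration, not in the original source):
// for tag byte c == 0x25 (the copy emitted in the EmitCopyLessThan64
// example above), char_table[0x25] == 0x0905: length == 5, one trailer
// byte follows, and copy_offset == 0x100.  With trailer byte 0x2c the
// final offset is 0x100 + 0x2c == 300, matching what was encoded.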

bool SnappyDecompressor::RefillTag() {
  const char* ip = ip_;
  if (ip == ip_limit_) {
    // Fetch a new fragment from the reader
    reader_->Skip(peeked_);   // All peeked bytes are used up
    size_t n;
    ip = reader_->Peek(&n);
    peeked_ = n;
    if (n == 0) {
      eof_ = true;
      return false;
    }
    ip_limit_ = ip + n;
  }

  // Read the tag character
  DCHECK_LT(ip, ip_limit_);
  const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip));
  const uint32 entry = char_table[c];
  const uint32 needed = (entry >> 11) + 1;  // +1 byte for 'c'
  DCHECK_LE(needed, sizeof(scratch_));

  // Read more bytes from reader if needed
  uint32 nbuf = ip_limit_ - ip;
  if (nbuf < needed) {
    // Stitch together bytes from ip and reader to form the word
    // contents.  We store the needed bytes in "scratch_".  They
    // will be consumed immediately by the caller since we do not
    // read more than we need.
    memmove(scratch_, ip, nbuf);
    reader_->Skip(peeked_);  // All peeked bytes are used up
    peeked_ = 0;
    while (nbuf < needed) {
      size_t length;
      const char* src = reader_->Peek(&length);
      if (length == 0) return false;
      uint32 to_add = min<uint32>(needed - nbuf, length);
      memcpy(scratch_ + nbuf, src, to_add);
      nbuf += to_add;
      reader_->Skip(to_add);
    }
    DCHECK_EQ(nbuf, needed);
    ip_ = scratch_;
    ip_limit_ = scratch_ + needed;
  } else if (nbuf < 5) {
    // Have enough bytes, but move into scratch_ so that we do not
    // read past end of input
    memmove(scratch_, ip, nbuf);
    reader_->Skip(peeked_);  // All peeked bytes are used up
    peeked_ = 0;
    ip_ = scratch_;
    ip_limit_ = scratch_ + nbuf;
  } else {
    // Pass pointer to buffer returned by reader_.
    ip_ = ip;
  }
  return true;
}

template <typename Writer>
static bool InternalUncompress(Source* r,
                               Writer* writer,
                               uint32 max_len) {
  // Read the uncompressed length from the front of the compressed input
  SnappyDecompressor decompressor(r);
  uint32 uncompressed_len = 0;
  if (!decompressor.ReadUncompressedLength(&uncompressed_len)) return false;
  return InternalUncompressAllTags(
      &decompressor, writer, uncompressed_len, max_len);
}

template <typename Writer>
static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
                                      Writer* writer,
                                      uint32 uncompressed_len,
                                      uint32 max_len) {
  // Protect against possible DoS attack
  if (static_cast<uint64>(uncompressed_len) > max_len) {
    return false;
  }

  writer->SetExpectedLength(uncompressed_len);

  // Process the entire input
  decompressor->DecompressAllTags(writer);
  return (decompressor->eof() && writer->CheckLength());
}

bool GetUncompressedLength(Source* source, uint32* result) {
  SnappyDecompressor decompressor(source);
  return decompressor.ReadUncompressedLength(result);
}

size_t Compress(Source* reader, Sink* writer) {
  size_t written = 0;
  size_t N = reader->Available();
  char ulength[Varint::kMax32];
  char* p = Varint::Encode32(ulength, N);
  writer->Append(ulength, p-ulength);
  written += (p - ulength);

  internal::WorkingMemory wmem;
  char* scratch = NULL;
  char* scratch_output = NULL;

  while (N > 0) {
    // Get next block to compress (without copying if possible)
    size_t fragment_size;
    const char* fragment = reader->Peek(&fragment_size);
    DCHECK_NE(fragment_size, 0) << ": premature end of input";
    const size_t num_to_read = min(N, kBlockSize);
    size_t bytes_read = fragment_size;

    size_t pending_advance = 0;
    if (bytes_read >= num_to_read) {
      // Buffer returned by reader is large enough
      pending_advance = num_to_read;
      fragment_size = num_to_read;
    } else {
      // Read into scratch buffer
      if (scratch == NULL) {
        // If this is the last iteration, we want to allocate N bytes
        // of space, otherwise the max possible kBlockSize space.
        // num_to_read contains exactly the correct value
        scratch = new char[num_to_read];
      }
      memcpy(scratch, fragment, bytes_read);
      reader->Skip(bytes_read);

      while (bytes_read < num_to_read) {
        fragment = reader->Peek(&fragment_size);
        size_t n = min<size_t>(fragment_size, num_to_read - bytes_read);
        memcpy(scratch + bytes_read, fragment, n);
        bytes_read += n;
        reader->Skip(n);
      }
      DCHECK_EQ(bytes_read, num_to_read);
      fragment = scratch;
      fragment_size = num_to_read;
    }
    DCHECK_EQ(fragment_size, num_to_read);

    // Get encoding table for compression
    int table_size;
    uint16* table = wmem.GetHashTable(num_to_read, &table_size);

    // Compress input_fragment and append to dest
    const int max_output = MaxCompressedLength(num_to_read);

    // Need a scratch buffer for the output, in case the byte sink doesn't
    // have room for us directly.
    if (scratch_output == NULL) {
      scratch_output = new char[max_output];
    } else {
      // Since we encode kBlockSize regions followed by a region
      // which is <= kBlockSize in length, a previously allocated
      // scratch_output[] region is big enough for this iteration.
    }
    char* dest = writer->GetAppendBuffer(max_output, scratch_output);
    char* end = internal::CompressFragment(fragment, fragment_size,
                                           dest, table, table_size);
    writer->Append(dest, end - dest);
    written += (end - dest);

    N -= num_to_read;
    reader->Skip(pending_advance);
  }

  delete[] scratch;
  delete[] scratch_output;

  return written;
}

// -----------------------------------------------------------------------
// Flat array interfaces
// -----------------------------------------------------------------------

// A type that writes to a flat array.
// Note that this is not a "ByteSink", but a type that matches the
// Writer template argument to SnappyDecompressor::DecompressAllTags().
class SnappyArrayWriter {
 private:
  char* base_;
  char* op_;
  char* op_limit_;

 public:
  inline explicit SnappyArrayWriter(char* dst)
      : base_(dst),
        op_(dst) {
  }

  inline void SetExpectedLength(size_t len) {
    op_limit_ = op_ + len;
  }

  inline bool CheckLength() const {
    return op_ == op_limit_;
  }

  inline bool Append(const char* ip, size_t len) {
    char* op = op_;
    const size_t space_left = op_limit_ - op;
    if (space_left < len) {
      return false;
    }
    memcpy(op, ip, len);
    op_ = op + len;
    return true;
  }

  inline bool TryFastAppend(const char* ip, size_t available, size_t len) {
    char* op = op_;
    const size_t space_left = op_limit_ - op;
    if (len <= 16 && available >= 16 && space_left >= 16) {
      // Fast path, used for the majority (about 95%) of invocations.
      UnalignedCopy64(ip, op);
      UnalignedCopy64(ip + 8, op + 8);
      op_ = op + len;
      return true;
    } else {
      return false;
    }
  }

  inline bool AppendFromSelf(size_t offset, size_t len) {
    char* op = op_;
    const size_t space_left = op_limit_ - op;

    if (op - base_ <= offset - 1u) {  // -1u catches offset==0
      return false;
    }
    if (len <= 16 && offset >= 8 && space_left >= 16) {
      // Fast path, used for the majority (70-80%) of dynamic invocations.
      UnalignedCopy64(op - offset, op);
      UnalignedCopy64(op - offset + 8, op + 8);
    } else {
      if (space_left >= len + kMaxIncrementCopyOverflow) {
        IncrementalCopyFastPath(op - offset, op, len);
      } else {
        if (space_left < len) {
          return false;
        }
        IncrementalCopy(op - offset, op, len);
      }
    }

    op_ = op + len;
    return true;
  }
};

bool RawUncompress(const char* compressed, size_t n, char* uncompressed) {
  ByteArraySource reader(compressed, n);
  return RawUncompress(&reader, uncompressed);
}

bool RawUncompress(Source* compressed, char* uncompressed) {
  SnappyArrayWriter output(uncompressed);
  return InternalUncompress(compressed, &output, kuint32max);
}

bool Uncompress(const char* compressed, size_t n, string* uncompressed) {
  size_t ulength;
  if (!GetUncompressedLength(compressed, n, &ulength)) {
    return false;
  }
  // Protect against possible DoS attack
  if ((static_cast<uint64>(ulength) + uncompressed->size()) >
      uncompressed->max_size()) {
    return false;
  }
  STLStringResizeUninitialized(uncompressed, ulength);
  return RawUncompress(compressed, n, string_as_array(uncompressed));
}

// A Writer that drops everything on the floor and just does validation
class SnappyDecompressionValidator {
 private:
  size_t expected_;
  size_t produced_;

 public:
  inline SnappyDecompressionValidator() : produced_(0) { }

  inline void SetExpectedLength(size_t len) {
    expected_ = len;
  }

  inline bool CheckLength() const {
    return expected_ == produced_;
  }

  inline bool Append(const char* ip, size_t len) {
    produced_ += len;
    return produced_ <= expected_;
  }

  inline bool TryFastAppend(const char* ip, size_t available, size_t length) {
    return false;
  }

  inline bool AppendFromSelf(size_t offset, size_t len) {
    if (produced_ <= offset - 1u) return false;  // -1u catches offset==0
    produced_ += len;
    return produced_ <= expected_;
  }
};

bool IsValidCompressedBuffer(const char* compressed, size_t n) {
  ByteArraySource reader(compressed, n);
  SnappyDecompressionValidator writer;
  return InternalUncompress(&reader, &writer, kuint32max);
}

void RawCompress(const char* input,
                 size_t input_length,
                 char* compressed,
                 size_t* compressed_length) {
  ByteArraySource reader(input, input_length);
  UncheckedByteArraySink writer(compressed);
  Compress(&reader, &writer);

  // Compute how many bytes were added
  *compressed_length = (writer.CurrentDestination() - compressed);
}

size_t Compress(const char* input, size_t input_length, string* compressed) {
  // Pre-grow the buffer to the max length of the compressed output
  compressed->resize(MaxCompressedLength(input_length));

  size_t compressed_length;
  RawCompress(input, input_length, string_as_array(compressed),
              &compressed_length);
  compressed->resize(compressed_length);
  return compressed_length;
}

}  // end namespace snappy
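
// A minimal round-trip sketch (editor's illustration, not part of the
// original file):
//
//   string compressed, restored;
//   snappy::Compress(data.data(), data.size(), &compressed);
//   CHECK(snappy::IsValidCompressedBuffer(compressed.data(),
//                                         compressed.size()));
//   CHECK(snappy::Uncompress(compressed.data(), compressed.size(),
//                            &restored));
//   CHECK_EQ(data, restored);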