覚え書き: 9月 2011

I implemented GZIP decompression in C# from scratch. With the implementation, gzipped data can be decompressed like the following.

byte[] output = GZIP.Decompress(input);

The GZIP specification (RFC 1952: http://tools.ietf.org/html/rfc1952) itself is simple, but the specification of the core part of GZIP, the DEFLATE format (RFC 1951: http://tools.ietf.org/html/rfc1951), is difficult to understand. I hope that my implementation (which is the simplest and most readable as far as I know) will help you understand the specification. Please let me know if you find any bug in my implementation. You can use my implementation freely even for commercial purpose as long as you respect my copyright!

// Huffman.cs

// Deflate.cs

// Written by Takahiko Kawasaki.
using System;
using System.Net;
using System.Windows;
using System.Windows.Controls;
using System.Windows.Documents;
using System.Windows.Ink;
using System.Windows.Input;
using System.Windows.Media;
using System.Windows.Media.Animation;
using System.Windows.Shapes;
using System.IO;

namespace Util
{
    public class Deflate
    {
        private static readonly Huffman fixedLiteralLengthHuffman;
        private static readonly Huffman fixedDistanceHuffman;

        static Deflate()
        {
            // Set up tables for fixed Huffmn codes (BTYPE=01).
            fixedLiteralLengthHuffman = BuildFixedLiteralLengthHuffman();
            fixedDistanceHuffman      = BuildFixedDistanceHuffman();
        }

        private static Huffman BuildFixedLiteralLengthHuffman()
        {
            // 3.2.6 Compression with fixed Huffman codes (BTYPE=01)
            //
            //   Lit Value   Bits   Codes
            //   ---------   ----   ---------------------------
            //     0 - 143    8      00110000 through 10111111
            //   144 - 255    9     110010000 through 111111111
            //   256 - 279    7       0000000 through   0010111
            //   280 - 287    8      11000000 through 11000111

            byte[] codeLengths = new byte[288];

            int symbol;

            for (symbol = 0; symbol < 144; ++symbol)
            {
                codeLengths[symbol] = 8;
            }

            for (; symbol < 256; ++symbol)
            {
                codeLengths[symbol] = 9;
            }

            for (; symbol < 280; ++symbol)
            {
                codeLengths[symbol] = 7;
            }

            for (; symbol < 288; ++symbol)
            {
                codeLengths[symbol] = 8;
            }

            // Huffman class generates code values from code lengths.
            // Note that "code lengths are sufficient to generate the
            // actual codes". See 3.2.2 of RFC 1951.
            return new Huffman(codeLengths);
        }

        private static Huffman BuildFixedDistanceHuffman()
        {
            // 3.2.6 Compression with fixed Huffman codes (BTYPE=01)
            //
            // "Distance codes 0-31 are represented by (fixed-length)
            // 5-bit codes", the specification says.

            byte[] codeLengths = new byte[32];

            for (int symbol = 0; symbol < 32; ++symbol)
            {
                codeLengths[symbol] = 5;
            }

            // Let Huffman class generate code values from code lengths.
            // Note that "code lengths are sufficient to generate the
            // actual codes". See 3.2.2 of RFC 1951.
            return new Huffman(codeLengths);
        }

        public static byte[] Decompress(byte[] input, int index)
        {
            // RFC 1951: DEFLATE Compressed Data Format Specification version 1.3
            //   http://tools.ietf.org/html/rfc1951

            // The data is compressed on a bit basis, so use a bit index.
            int bitIndex = index * 8;

            // The place to store output.
            MemoryStream output = new MemoryStream();

            // Process all blocks one by one until the end.
            // InflateBlock() returns false if no more block exists.
            while (InflateBlock(input, ref bitIndex, output)) { }

            // Converted to the final output format.
            return output.ToArray();
        }

        private static bool InflateBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // Each block has a block header which consists of 3 bits.
            // See 3.2.3 of RFC 1951.
            //
            // The first bit indicates whether the block is the last one or not.
            bool last = ReadBit(input, ref bitIndex);

            // The second and third bits indicate the compression type of the block.
            // Compression types are as follows:
            //
            //   00: No compression.
            //   01: Compressed with fixed Huffman codes
            //   10: Compressed with dynamic Huffman codes
            //   11: Reserved (error)
            int type = ReadBits(input, ref bitIndex, 2);

            switch (type)
            {
                // No compression
                case 0:
                    InflatePlainBlock(input, ref bitIndex, output);
                    break;

                // Compressed with fixed Huffman codes
                case 1:
                    InflateFixedBlock(input, ref bitIndex, output);
                    break;

                // Compressed with dynamic Huffman codes
                case 2:
                    InflateDynamicBlock(input, ref bitIndex, output);
                    break;

                // Bad format
                default:
                    string format = "Bad compression type at bit index {0}";
                    string message = string.Format(format, bitIndex);
                    throw new FormatException(message);
            }

            // Return true if this block is not the last one.
            return !last;
        }

        private static bool GetBit(byte[] data, int bitIndex)
        {
            int index = bitIndex / 8;
            int shift = bitIndex % 8;

            // Return true if the bit indicated by bitIndex is set.
            return (data[index] & (1 << shift)) != 0;
        }

        private static int GetBits(byte[] data, int bitIndex, int nBits)
        {
            int number = 0;
            int weight = 1;

            // Convert consecutive bits into a number.
            for (int i = 0; i < nBits; ++i, weight *= 2)
            {
                // GetBit() returns true if the bit is set.
                if (GetBit(data, bitIndex + i))
                {
                    number += weight;
                }
            }

            return number;
        }

        private static bool ReadBit(byte[] data, ref int bitIndex)
        {
            return GetBit(data, bitIndex++);
        }

        private static int ReadBits(byte[] data, ref int bitIndex, int nBits)
        {
            int number = GetBits(data, bitIndex, nBits);

            bitIndex += nBits;

            return number;
        }

        private static void InflatePlainBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // 3.2.4 Non-compressed blocks (BTYPE=00)

            // Skip any remaining bits in current partially processed byte.
            bitIndex = (bitIndex + 7) & ~7;

            // Data copy is performed on a byte basis, so convert the bit index
            // to a byte index.
            int index = bitIndex / 8;

            // LEN: 2 bytes. The data length.
            int len = input[index] + input[index + 1] * 256;

            // NLEN: 2 bytes. The one's complement of LEN.

            // Skip LEN and NLEN.
            index += 4;

            // Copy the data to the output.
            output.Write(input, index, len);

            // Make the bitIndex point to the bit next to
            // the end of the copied data.
            bitIndex = (index + len) * 8;
        }

        private static void InflateFixedBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // 3.2.6 Compression with fixed Huffman codes (BTYPE=01)

            // Inflate the compressed data using the pre-defined
            // conversion tables. The specification says in 3.2.2
            // as follows.
            //
            //   The only differences between the two compressed
            //   cases is how the Huffman codes for the literal/
            //   length and distance alphabets are defined.
            //
            // The "two compressed cases" in the above sentence are
            // "fixed Huffman codes" and "dynamic Huffman codes".
            InflateData(input, ref bitIndex, output,
                fixedLiteralLengthHuffman, fixedDistanceHuffman);
        }

        private static void InflateDynamicBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // 3.2.7 Compression with dynamic Huffman codes (BTYPE=10)

            // 5 Bits: HLIT, The number of Literal/Length codes - 257 (257 - 286)
            int hlit = ReadBits(input, ref bitIndex, 5) + 257;

            // 5 Bits: HDIST, The number of Distance codes - 1 (1 - 32)
            int hdist = ReadBits(input, ref bitIndex, 5) + 1;

            // 4 Bits: HCLEN, The number of Code Length codes - 4 (4 - 19)
            int hclen = ReadBits(input, ref bitIndex, 4) + 4;

            // (hclen * 3) bits: code lengths of "values of code length".
            //
            // Note that "values of code lengths" (which ranges from 0 to 18)
            // themselves are compressed using Huffman code. In addition,
            // the order here is strange.
            byte[] codeLengthsFromCodeLengthValue = new byte[19];
            for (int i = 0; i < hclen; ++i)
            {
                byte codeLengthOfCodeLengthValue = (byte)ReadBits(input, ref bitIndex, 3);

                // The strange order is converted into a normal index here.
                int index = CodeLengthOrderToIndex(i);

                codeLengthsFromCodeLengthValue[index] = codeLengthOfCodeLengthValue;
            }

            // Create a table to convert "code value of code length value" into
            // "code length value".
            Huffman codeLengthHuffman = new Huffman(codeLengthsFromCodeLengthValue);

            // hlit code lengths for literal/length alphabet. The code lengths are
            // encoded using the code length Huffman code that was parsed above.
            byte[] codeLengthsFromLiteralLengthCode = new byte[hlit];
            ReadCodeLengths(input, ref bitIndex, codeLengthsFromLiteralLengthCode, codeLengthHuffman);

            // Create a table to convert "code value of literal/length alphabet"
            // into "literal/length symbol".
            Huffman literalLengthHuffman = new Huffman(codeLengthsFromLiteralLengthCode);

            // hdist code lengths for the distance alphabet. The code lengths are
            // encoded using the code length Huffman code that was parsed above.
            byte[] codeLengthsFromDistanceCode = new byte[hdist];
            ReadCodeLengths(input, ref bitIndex, codeLengthsFromDistanceCode, codeLengthHuffman);

            // Create a table to convert "code value of distance alphabet" into
            // "distance symbol".
            Huffman distanceHuffman = new Huffman(codeLengthsFromDistanceCode);

            // The actual compressed data of this block. The data are encoded using
            // the literal/length and distance Huffman codes that were parsed above.
            InflateData(input, ref bitIndex, output, literalLengthHuffman, distanceHuffman);
        }

        private static void InflateData(byte[] input, ref int bitIndex, MemoryStream output,
            Huffman literalLengthHuffman, Huffman distanceHuffman)
        {
            // 3.2.5 Compressed blocks (length and distance codes)

            while (true)
            {
                // Read a literal/length symbol from the input.
                int literalLength = literalLengthHuffman.ReadSymbol(input, ref bitIndex);

                // Symbol value '256' indicates the end.
                if (literalLength == 256)
                {
                    // End of this data.
                    break;
                }

                // Symbol values from 0 to 255 represent literal values.
                if (0 <= literalLength && literalLength <= 255)
                {
                    // As is.
                    output.WriteByte((byte)literalLength);
                    continue;
                }

                // Symbol values from 257 to 285 represent <length,distance> pairs.
                // Depending on symbol values, some extra bits in the input may be
                // consumed to compute the length.
                int length = ReadLength(input, ref bitIndex, literalLength);

                // Read the distance from the input.
                int distance = ReadDistance(input, ref bitIndex, distanceHuffman);

                // Extract some data from the output buffer and copy them.
                Duplicate(length, distance, output);
            }
        }

        private static byte[] indicesFromCodeLengthOrder
            = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };

        private static int CodeLengthOrderToIndex(int order)
        {
            // 3.2.7 Compression with dynamic Huffman codes (BTYPE=10)
            //
            // See the description about "(HCLEN + 4) x 3 bits" in the
            // specification.
            return indicesFromCodeLengthOrder[order];
        }

        private static void ReadCodeLengths(byte[] input, ref int bitIndex,
            byte[] codeLengths, Huffman codeLengthHuffman)
        {
            // 3.2.7 Compression with dynamic Huffman codes (BTYPE=10)

            for (int i = 0; i < codeLengths.Length; ++i)
            {
                // Read a symbol value of code length.
                byte codeLength = (byte)codeLengthHuffman.ReadSymbol(input, ref bitIndex);

                // Code lengths from 0 to 15 represent 0 to 15, respectively,
                // meaning no more extra interpretation is needed.
                if (0 <= codeLength && codeLength <= 15)
                {
                    // As is.
                    codeLengths[i] = codeLength;
                    continue;
                }

                int repeatCount;

                switch (codeLength)
                {
                    case 16:
                        // Copy the previous code length for 3 - 6 times.
                        // The next 2 bits (+3) indicate repeat count.
                        codeLength = codeLengths[i - 1];
                        repeatCount = ReadBits(input, ref bitIndex, 2) + 3;
                        break;

                    case 17:
                        // Copy a code length of 0 for 3 - 10 times.
                        // The next 3 bits (+3) indicate repeat count.
                        codeLength = 0;
                        repeatCount = ReadBits(input, ref bitIndex, 3) + 3;
                        break;

                    case 18:
                        // Copy a code length of 0 for 11 - 138 times.
                        // The next 7 bits (+11) indicate repeat count.
                        codeLength = 0;
                        repeatCount = ReadBits(input, ref bitIndex, 7) + 11;
                        break;

                    default:
                        // Bad code length.
                        string format = "Bad code length {0} at bit index {1}";
                        string message = string.Format(format, codeLength, bitIndex);
                        throw new FormatException(message);
                }

                // Copy the code length as many times as specified.
                for (int j = 0; j < repeatCount; ++j)
                {
                    codeLengths[i + j] = codeLength;
                }

                // Skip the range filled by the above copy.
                i += repeatCount - 1;
            }
        }

        private static int ReadLength(byte[] input, ref int bitIndex, int literalLength)
        {
            // 3.2.5 Compressed blocks (length and distance code)

            int baseValue;
            int nBits;

            switch (literalLength)
            {
                case 257:
                case 258:
                case 259:
                case 260:
                case 261:
                case 262:
                case 263:
                case 264:
                    return (literalLength - 254);

                case 265: baseValue = 11; nBits = 1; break;
                case 266: baseValue = 13; nBits = 1; break;
                case 267: baseValue = 15; nBits = 1; break;
                case 268: baseValue = 17; nBits = 1; break;
                case 269: baseValue = 19; nBits = 2; break;
                case 270: baseValue = 23; nBits = 2; break;
                case 271: baseValue = 27; nBits = 2; break;
                case 272: baseValue = 31; nBits = 2; break;
                case 273: baseValue = 35; nBits = 3; break;
                case 274: baseValue = 43; nBits = 3; break;
                case 275: baseValue = 51; nBits = 3; break;
                case 276: baseValue = 59; nBits = 3; break;
                case 277: baseValue = 67; nBits = 4; break;
                case 278: baseValue = 83; nBits = 4; break;
                case 279: baseValue = 99; nBits = 4; break;
                case 280: baseValue = 115; nBits = 4; break;
                case 281: baseValue = 131; nBits = 5; break;
                case 282: baseValue = 163; nBits = 5; break;
                case 283: baseValue = 195; nBits = 5; break;
                case 284: baseValue = 227; nBits = 5; break;
                case 285: return 258;
                default:
                    string format = "Bad literal/length code {0} at bit index {1}";
                    string message = string.Format(format, literalLength, bitIndex);
                    throw new FormatException(message);
            }

            // Read a value to add to the base value.
            int n = ReadBits(input, ref bitIndex, nBits);

            return baseValue + n;
        }

        private static int ReadDistance(byte[] input, ref int bitIndex, Huffman distanceHuffman)
        {

// 3.2.5 Compressed blocks (length and distance code)

// Read a distance code from the input.

// It is expected to range from 0 to 29.

            int code = distanceHuffman.ReadSymbol(input, ref bitIndex);

            int baseValue;
            int nBits;

            switch (code)
            {
                case 0:
                case 1:
                case 2:
                case 3:
                    return code + 1;

                case 4: baseValue =     5; nBits = 1; break;
                case 5: baseValue =     7; nBits = 1; break;
                case 6: baseValue =     9; nBits = 2; break;
                case 7: baseValue =    13; nBits = 2; break;
                case 8: baseValue =    17; nBits = 3; break;
                case 9: baseValue =    25; nBits = 3; break;
                case 10: baseValue =    33; nBits = 4; break;
                case 11: baseValue =    49; nBits = 4; break;
                case 12: baseValue =    65; nBits = 5; break;
                case 13: baseValue =    97; nBits = 5; break;
                case 14: baseValue =   129; nBits = 6; break;
                case 15: baseValue =   193; nBits = 6; break;
                case 16: baseValue =   257; nBits = 7; break;
                case 17: baseValue =   385; nBits = 7; break;
                case 18: baseValue =   513; nBits = 8; break;
                case 19: baseValue =   769; nBits = 8; break;
                case 20: baseValue = 1025; nBits = 9; break;
                case 21: baseValue = 1537; nBits = 9; break;
                case 22: baseValue = 2049; nBits = 10; break;
                case 23: baseValue = 3073; nBits = 10; break;
                case 24: baseValue = 4097; nBits = 11; break;
                case 25: baseValue = 6145; nBits = 11; break;
                case 26: baseValue = 8193; nBits = 12; break;
                case 27: baseValue = 12289; nBits = 12; break;
                case 28: baseValue = 16385; nBits = 13; break;
                case 29: baseValue = 24577; nBits = 13; break;
                default:
                    // Distance codes 30-31 will never actually occur
                    // in the compressed data, the specification says.
                    string format = "Bad distance code {0} at bit index {1}";
                    string message = string.Format(format, code, bitIndex);
                    throw new FormatException(message);
            }

            // Read a value to add to the base value.
            int n = ReadBits(input, ref bitIndex, nBits);

            return baseValue + n;
        }

        private static void Duplicate(int length, int distance, MemoryStream output)
        {
            // Get the current output buffer.
            byte[] source = output.GetBuffer();

            // Get the number of bytes written so far. source.Length
            // should not be used because the size of the buffer may
            // be bigger than the number of bytes written so far.
            int sourceLength = (int)output.Length;

            // An array to finally append to the output.
            byte[] target = new byte[length];

            // The position from which to start copying data.
            int initialPosition = sourceLength - distance;
            int sourceIndex = initialPosition;

            for (int targetIndex = 0; targetIndex < length; ++targetIndex, ++sourceIndex)
            {
                if (sourceLength <= sourceIndex)
                {
                    // Reached the end of the current output buffer.
                    // The specification says as follows in 3.2.3.
                    //
                    //   Note also that the referenced string may
                    //   overlap the current position; for example,
                    //   if the last 2 bytes decoded have values X
                    //   and Y, a string reference with <length=5,
                    //   distance=2> adds X,Y,X,Y,X to the output
                    //   stream.

                    // repeat.
                    sourceIndex = initialPosition;
                }

                target[targetIndex] = source[sourceIndex];
            }

            // Append the duplicated bytes to the output.
            output.Write(target, 0, length);
        }
    }
}

// GZIP.cs

// Huffman.cs

// Deflate.cs

// ↑ Literal/Lengthのシンボル値、ビット数、コード値、の表

            byte[] codeLengths = new byte[288];

            int symbol;

            for (symbol = 0; symbol < 144; ++symbol)
            {
                codeLengths[symbol] = 8;
            }

            for (; symbol < 256; ++symbol)
            {
                codeLengths[symbol] = 9;
            }

            for (; symbol < 280; ++symbol)
            {
                codeLengths[symbol] = 7;
            }

            for (; symbol < 288; ++symbol)
            {
                codeLengths[symbol] = 8;
            }

            // Huffman クラスはコード長群からコード値群を生成する。
            // 「コード長だけで、実際のコードを生成するのに十分」
            // であることに注目。RFC 1951 3.2.2 を参照のこと。
            return new Huffman(codeLengths);
        }

        private static Huffman BuildFixedDistanceHuffman()
        {
            // 3.2.6 Compression with fixed Huffman codes (BTYPE=01)
            //
            // 「0 から 31 の距離コードは（固定長の） 5 ビットコードで
            // 表される」、と仕様に書かれている。

            byte[] codeLengths = new byte[32];

            for (int symbol = 0; symbol < 32; ++symbol)
            {
                codeLengths[symbol] = 5;
            }

            // Huffman クラスにコード長群からコード値群を生成させる。
            // 「コード長だけで、実際のコードを生成するのに十分」
            // であることに注目。RFC 1951 3.2.2 を参照のこと。
            return new Huffman(codeLengths);
        }

        public static byte[] Decompress(byte[] input, int index)
        {
            // RFC 1951: DEFLATE Compressed Data Format Specification version 1.3
            //   http://tools.ietf.org/html/rfc1951

            // データはビット単位で圧縮されているので、ビットインデックスを使用する。
            int bitIndex = index * 8;

            // 出力を蓄える場所
            MemoryStream output = new MemoryStream();

            // 全てのブロックを一つずつ処理する。InflateBlock() は、
            // もうブロックが存在しない場合 false を返す。
            while (InflateBlock(input, ref bitIndex, output)) { }

            // 最終の出力形式に変換する。
            return output.ToArray();
        }

        private static bool InflateBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // 各ブロックは、3 ビットで構成されるブロックヘッダーを持つ。
            // RFC 1951 3.2.3 を参照のこと。
            //
            // 一番目のビットは、当該ブロックが最後のブロックかどうかを示す。
            bool last = ReadBit(input, ref bitIndex);

            // 二番目と三番目のビットは、当該ブロックの圧縮形式を示す。
            // 圧縮形式は次の通り。
            //
            //   00: 圧縮されていない
            //   01: 固定ハフマンコードで圧縮されている
            //   10: 動的ハフマンコードで圧縮されている
            //   11: 予約済み（エラー）
            int type = ReadBits(input, ref bitIndex, 2);

            switch (type)
            {
                // 圧縮されていない
                case 0:
                    InflatePlainBlock(input, ref bitIndex, output);
                    break;

                // 固定ハフマンコードで圧縮されている
                case 1:
                    InflateFixedBlock(input, ref bitIndex, output);
                    break;

                // 動的ハフマンコードで圧縮されている
                case 2:
                    InflateDynamicBlock(input, ref bitIndex, output);
                    break;

                // 不正フォーマット
                default:
                    string format = "Bad compression type at bit index {0}";
                    string message = string.Format(format, bitIndex);
                    throw new FormatException(message);
            }

            // このブロックが最後のブロックでない場合 true を返す。
            return !last;
        }

        private static bool GetBit(byte[] data, int bitIndex)
        {
            int index = bitIndex / 8;
            int shift = bitIndex % 8;

            // bitIndex が指すビットがセットされていれば true を返す。
            return (data[index] & (1 << shift)) != 0;
        }

        private static int GetBits(byte[] data, int bitIndex, int nBits)
        {
            int number = 0;
            int weight = 1;

            // 連続するビット群を数値に変換する。
            for (int i = 0; i < nBits; ++i, weight *= 2)
            {
                // GetBit() はビットがセットされていれば true を返す。
                if (GetBit(data, bitIndex + i))
                {
                    number += weight;
                }
            }

            return number;
        }

        private static bool ReadBit(byte[] data, ref int bitIndex)
        {
            return GetBit(data, bitIndex++);
        }

        private static int ReadBits(byte[] data, ref int bitIndex, int nBits)
        {
            int number = GetBits(data, bitIndex, nBits);

            bitIndex += nBits;

            return number;
        }

        private static void InflatePlainBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // 3.2.4 Non-compressed blocks (BTYPE=00)

            // 現在部分的に処理中であるバイトの残りのビットを読み飛ばす。
            bitIndex = (bitIndex + 7) & ~7;

            // データコピーはバイト単位でおこなうので、ビットインデックスを
            // バイトインデックスに変換する。
            int index = bitIndex / 8;

            // LEN: 2 バイト。データの長さ。
            int len = input[index] + input[index + 1] * 256;

            // NLEN: 2 バイト。LEN の 1 の補数。

            // LEN と NLEN を読み飛ばす。
            index += 4;

            // データを出力にコピーする。
            output.Write(input, index, len);

            // bitIndex がコピーされたデータの末尾の次を指すようにする。

            bitIndex = (index + len) * 8;
        }

        private static void InflateFixedBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // 3.2.6 Compression with fixed Huffman codes (BTYPE=01)

            // 圧縮されたデータを、事前に定義された変換テーブルを
            // 用いて展開する。仕様の 3.2.2 には次のように記述されている。
            //
            //   二つの圧縮パターンの唯一の違いは、Literal/Length 用の
            //   ハフマンコードと距離用のハフマンコードがどのように定義
            //   されるか、である。
            //
            // 上記の文における「二つの圧縮パターン」とは、「固定
            // ハフマンコード」と「動的ハフマンコード」のことである。
            InflateData(input, ref bitIndex, output,
                fixedLiteralLengthHuffman, fixedDistanceHuffman);
        }

        private static void InflateDynamicBlock(byte[] input, ref int bitIndex, MemoryStream output)
        {
            // 3.2.7 Compression with dynamic Huffman codes (BTYPE=10)

            // 5 ビット: HLIT, Literal/Length コードの数マイナス 257 (257 - 286)
            int hlit = ReadBits(input, ref bitIndex, 5) + 257;

            // 5 ビット: HDIST, 距離コードの数マイナス 1 (1 - 32)
            int hdist = ReadBits(input, ref bitIndex, 5) + 1;

            // 4 ビット: HCLEN, コード長コードの数マイナス 4 (4 - 19)
            int hclen = ReadBits(input, ref bitIndex, 4) + 4;

            // (hclen * 3) ビット: 「コード長の値群」のコード長群
            //
            // 「コード長の値群」（0 から 18 の範囲）それ自身もハフマンコードを
            // 用いて圧縮されていることに注意。加えて、ここでは、その順番も
            // 奇妙である。
            byte[] codeLengthsFromCodeLengthValue = new byte[19];
            for (int i = 0; i < hclen; ++i)
            {
                byte codeLengthOfCodeLengthValue = (byte)ReadBits(input, ref bitIndex, 3);

                // 変な順番をここで通常のインデックスに変換する。
                int index = CodeLengthOrderToIndex(i);

                codeLengthsFromCodeLengthValue[index] = codeLengthOfCodeLengthValue;
            }

            // 「コード長値のコード値」を「コード長値」に変換するための
            // テーブルを作成する。
            Huffman codeLengthHuffman = new Huffman(codeLengthsFromCodeLengthValue);

            // Literal/Length シンボル用の hlit 個のコード長群。コード長群は、
            // 今パースしたばかりの「コード長ハフマンコード」を用いてエンコード
            // されている。
            byte[] codeLengthsFromLiteralLengthCode = new byte[hlit];
            ReadCodeLengths(input, ref bitIndex, codeLengthsFromLiteralLengthCode, codeLengthHuffman);

            // 「Literal/Length シンボルのコード値」を「Literal/Length シンボル」に
            // 変換するテーブルを作成する。
            Huffman literalLengthHuffman = new Huffman(codeLengthsFromLiteralLengthCode);

            // 距離シンボル用の hdist 個のコード長群。コード長群は、先ほど
            // パースした「コード長ハフマンコード」を用いてエンコードされている。
            byte[] codeLengthsFromDistanceCode = new byte[hdist];
            ReadCodeLengths(input, ref bitIndex, codeLengthsFromDistanceCode, codeLengthHuffman);

            // 「距離シンボルのコード値」を「距離シンボル」に変換するテーブルを
            // 作成する。
            Huffman distanceHuffman = new Huffman(codeLengthsFromDistanceCode);

            // このブロックの実際の圧縮データ。データは、これまでにパースした
            // Literal/Length ハフマンコードと距離ハフマンコードを用いてエンコード
            // されている。
            InflateData(input, ref bitIndex, output, literalLengthHuffman, distanceHuffman);
        }

        private static void InflateData(byte[] input, ref int bitIndex, MemoryStream output,
            Huffman literalLengthHuffman, Huffman distanceHuffman)
        {
            // 3.2.5 Compressed blocks (length and distance codes)

            while (true)
            {
                // 入力から Literal/Length シンボルを読み込む。
                int literalLength = literalLengthHuffman.ReadSymbol(input, ref bitIndex);

                // シンボル値 256 は終わりを示す。
                if (literalLength == 256)
                {
                    // このデータの終わり。
                    break;
                }

                // 0 から 255 までのシンボル値はリテラル値を表す。
                if (0 <= literalLength && literalLength <= 255)
                {
                    // そのまま
                    output.WriteByte((byte)literalLength);
                    continue;
                }

                // 257 から 285 までのシンボル値は、＜長さ，距離＞の組を表す。
                // シンボル値によっては、長さを計算するために、入力から追加で
                // 幾つかビットが読み込まれて消費される。
                int length = ReadLength(input, ref bitIndex, literalLength);

                // 入力から距離を読み込む。
                int distance = ReadDistance(input, ref bitIndex, distanceHuffman);

                // 出力バッファーから幾つかデータを取り出してコピーする。
                Duplicate(length, distance, output);
            }
        }

        private static byte[] indicesFromCodeLengthOrder
            = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };

        private static int CodeLengthOrderToIndex(int order)
        {
            // 3.2.7 Compression with dynamic Huffman codes (BTYPE=10)
            //
            // 仕様内の「(HCLEN + 4) x 3 bits」の説明を参照のこと。
            return indicesFromCodeLengthOrder[order];
        }

        private static void ReadCodeLengths(byte[] input, ref int bitIndex,
            byte[] codeLengths, Huffman codeLengthHuffman)
        {
            // 3.2.7 Compression with dynamic Huffman codes (BTYPE=10)

            for (int i = 0; i < codeLengths.Length; ++i)
            {
                // コード長のシンボル値を読み込む。
                byte codeLength = (byte)codeLengthHuffman.ReadSymbol(input, ref bitIndex);

                // 0 から 15 までのコード長は、それぞれ 0 から 15 を表し、
                // これ以上の解釈は必要ない。
                if (0 <= codeLength && codeLength <= 15)
                {
                    // そのまま
                    codeLengths[i] = codeLength;
                    continue;
                }

                int repeatCount;

                switch (codeLength)
                {
                    case 16:
                        // 直前のコード長を 3 ～ 6 回繰り返す。
                        // 後に続く 2 ビット（足す３）が繰り返し回数を示す。
                        codeLength = codeLengths[i - 1];
                        repeatCount = ReadBits(input, ref bitIndex, 2) + 3;
                        break;

                    case 17:
                        // コード長 0 を 3 ～ 10 回繰り返す。
                        // 後に続く 3 ビット（足す３）が繰り返し回数を示す。
                        codeLength = 0;
                        repeatCount = ReadBits(input, ref bitIndex, 3) + 3;
                        break;

                    case 18:
                        // コード長 0 を 11 ～ 138 回繰り返す。
                        // 後に続く 7 ビット（足す１１）が繰り返し回数を示す。
                        codeLength = 0;
                        repeatCount = ReadBits(input, ref bitIndex, 7) + 11;
                        break;

                    default:
                        // 不正コード長
                        string format = "Bad code length {0} at bit index {1}";
                        string message = string.Format(format, codeLength, bitIndex);
                        throw new FormatException(message);
                }

                // コード長を指定された回数だけコピーする。
                for (int j = 0; j < repeatCount; ++j)
                {
                    codeLengths[i + j] = codeLength;
                }

                // 上記コピーで設定された範囲をスキップする。
                i += repeatCount - 1;
            }
        }

        private static int ReadLength(byte[] input, ref int bitIndex, int literalLength)
        {
            // 3.2.5 Compressed blocks (length and distance code)

            int baseValue;
            int nBits;

            switch (literalLength)
            {
                case 257:
                case 258:
                case 259:
                case 260:
                case 261:
                case 262:
                case 263:
                case 264:
                    return (literalLength - 254);

                case 265: baseValue = 11; nBits = 1; break;
                case 266: baseValue = 13; nBits = 1; break;
                case 267: baseValue = 15; nBits = 1; break;
                case 268: baseValue = 17; nBits = 1; break;
                case 269: baseValue = 19; nBits = 2; break;
                case 270: baseValue = 23; nBits = 2; break;
                case 271: baseValue = 27; nBits = 2; break;
                case 272: baseValue = 31; nBits = 2; break;
                case 273: baseValue = 35; nBits = 3; break;
                case 274: baseValue = 43; nBits = 3; break;
                case 275: baseValue = 51; nBits = 3; break;
                case 276: baseValue = 59; nBits = 3; break;
                case 277: baseValue = 67; nBits = 4; break;
                case 278: baseValue = 83; nBits = 4; break;
                case 279: baseValue = 99; nBits = 4; break;
                case 280: baseValue = 115; nBits = 4; break;
                case 281: baseValue = 131; nBits = 5; break;
                case 282: baseValue = 163; nBits = 5; break;
                case 283: baseValue = 195; nBits = 5; break;
                case 284: baseValue = 227; nBits = 5; break;
                case 285: return 258;
                default:
                    string format = "Bad literal/length code {0} at bit index {1}";
                    string message = string.Format(format, literalLength, bitIndex);
                    throw new FormatException(message);
            }

            // ベース値に加える値を読み込む。
            int n = ReadBits(input, ref bitIndex, nBits);

            return baseValue + n;
        }

        private static int ReadDistance(byte[] input, ref int bitIndex, Huffman distanceHuffman)
        {

// 3.2.5 Compressed blocks (length and distance code)

// 入力から距離コードを読み込む。

// 0 ～ 29 の範囲であることを想定している。

            int code = distanceHuffman.ReadSymbol(input, ref bitIndex);

            int baseValue;
            int nBits;

            switch (code)
            {
                case 0:
                case 1:
                case 2:
                case 3:
                    return code + 1;

                case 4: baseValue =     5; nBits = 1; break;
                case 5: baseValue =     7; nBits = 1; break;
                case 6: baseValue =     9; nBits = 2; break;
                case 7: baseValue =    13; nBits = 2; break;
                case 8: baseValue =    17; nBits = 3; break;
                case 9: baseValue =    25; nBits = 3; break;
                case 10: baseValue =    33; nBits = 4; break;
                case 11: baseValue =    49; nBits = 4; break;
                case 12: baseValue =    65; nBits = 5; break;
                case 13: baseValue =    97; nBits = 5; break;
                case 14: baseValue =   129; nBits = 6; break;
                case 15: baseValue =   193; nBits = 6; break;
                case 16: baseValue =   257; nBits = 7; break;
                case 17: baseValue =   385; nBits = 7; break;
                case 18: baseValue =   513; nBits = 8; break;
                case 19: baseValue =   769; nBits = 8; break;
                case 20: baseValue = 1025; nBits = 9; break;
                case 21: baseValue = 1537; nBits = 9; break;
                case 22: baseValue = 2049; nBits = 10; break;
                case 23: baseValue = 3073; nBits = 10; break;
                case 24: baseValue = 4097; nBits = 11; break;
                case 25: baseValue = 6145; nBits = 11; break;
                case 26: baseValue = 8193; nBits = 12; break;
                case 27: baseValue = 12289; nBits = 12; break;
                case 28: baseValue = 16385; nBits = 13; break;
                case 29: baseValue = 24577; nBits = 13; break;
                default:
                    // 仕様によれば、距離コード 30 と 31 は、実際の
                    // 圧縮データの中に含まれることはない。
                    string format = "Bad distance code {0} at bit index {1}";
                    string message = string.Format(format, code, bitIndex);
                    throw new FormatException(message);
            }

            // ベース値に加える値を読み込む。
            int n = ReadBits(input, ref bitIndex, nBits);

            return baseValue + n;
        }

        private static void Duplicate(int length, int distance, MemoryStream output)
        {
            // 現在の出力バッファーを取得する。
            byte[] source = output.GetBuffer();

            // これまでに書き込んだバイトの数を取得する。バッファーの
            // サイズはこれまでに書き込んだバイト数よりも大きいことが
            // あるので、source.Length を使用すべきではない。
            int sourceLength = (int)output.Length;

            // 最終的に出力に追加することになる配列。
            byte[] target = new byte[length];

            // データコピーを開始する位置。
            int initialPosition = sourceLength - distance;
            int sourceIndex = initialPosition;

            for (int targetIndex = 0; targetIndex < length; ++targetIndex, ++sourceIndex)
            {
                if (sourceLength <= sourceIndex)
                {
                    // 現在の出力バッファーの終わりに到達した。
                    // 仕様の 3.2.3 に次のように記述されている。
                    //
                    //   参照される文字列は現在位置とオーバー
                    //   ラップすることがある。例えば、デコード
                    //   された最後の２文字が X, Y だったとすると、
                    //   <長さ=5, 距離=2> という文字列参照は、
                    //   X,Y,X,Y,X を出力に追加することになる。

                    // 繰り返す。
                    sourceIndex = initialPosition;
                }

                target[targetIndex] = source[sourceIndex];
            }

            // 複製したバイトを出力に追加する。
            output.Write(target, 0, length);
        }
    }
}

// GZIP.cs

2011年9月17日土曜日

GZIP decompression in C#