diff --git a/bitwriter.go b/bitwriter.go
new file mode 100644
index 0000000..dfc6036
--- /dev/null
+++ b/bitwriter.go
@@ -0,0 +1,56 @@
+package brotli
+
+/* Copyright 2010 Google Inc. All Rights Reserved.
+
+   Distributed under MIT license.
+   See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
+*/
+
+/* Write bits into a byte array. */
+
+// bitWriter collects bits in a 64-bit buffer and appends them to dst as
+// bytes, least-significant byte first.
+type bitWriter struct {
+	// dst receives the completed bytes.
+	dst []byte
+
+	// Data waiting to be written is the low nbits of bits.
+	bits  uint64
+	nbits uint
+}
+
+// writeBits queues the low nb bits of b behind the bits that are already
+// pending. As soon as 32 or more bits are pending, the oldest 32 are appended
+// to dst as four bytes, least-significant byte first.
+//
+// Callers are expected to pass nb <= 32 and a value b with no bits set above
+// bit nb: b is OR'ed in unmasked, and at most one 32-bit group is flushed per
+// call.
+func (w *bitWriter) writeBits(nb uint, b uint64) {
+	w.bits |= b << w.nbits
+	w.nbits += nb
+	if w.nbits < 32 {
+		return
+	}
+
+	// Flush the low four bytes and keep the remainder buffered.
+	low := w.bits
+	w.dst = append(w.dst, byte(low), byte(low>>8), byte(low>>16), byte(low>>24))
+	w.bits >>= 32
+	w.nbits -= 32
+}
+
+func (w *bitWriter) writeSingleBit(bit bool) {
+	if bit {
+		w.writeBits(1, 1)
+	} else {
+		w.writeBits(1, 0)
+	}
+}
+
+// jumpToByteBoundary appends all pending bits to dst, one byte at a time.
+// The final byte is padded with zero bits when the number of pending bits is
+// not a multiple of eight. Afterwards nothing is left buffered.
+func (w *bitWriter) jumpToByteBoundary() {
+	// Round the pending bit count up to whole bytes.
+	for remaining := (w.nbits + 7) / 8; remaining > 0; remaining-- {
+		w.dst = append(w.dst, byte(w.bits))
+		w.bits >>= 8
+	}
+	w.bits = 0
+	w.nbits = 0
+}
diff --git a/brotli_bit_stream.go b/brotli_bit_stream.go
index 7acfb18..ee65529 100644
--- a/brotli_bit_stream.go
+++ b/brotli_bit_stream.go
@@ -7,12 +7,18 @@ import (
 
 const maxHuffmanTreeSize = (2*numCommandSymbols + 1)
 
-/* The maximum size of Huffman dictionary for distances assuming that
-   NPOSTFIX = 0 and NDIRECT = 0. */
+/*
+The maximum size of Huffman dictionary for distances assuming that
+NPOSTFIX = 0 and NDIRECT = 0.
+*/
 const maxSimpleDistanceAlphabetSize = 140
 
-/* Represents the range of values belonging to a prefix code:
-   [offset, offset + 2^nbits) */
+/*
+Represents the range of values belonging to a prefix code:
+
+	[offset, offset + 2^nbits)
+*/
 type prefixCodeRange struct {
 	offset uint32
 	nbits  uint32
@@ -96,9 +102,12 @@ func nextBlockTypeCode(calculator *blockTypeCodeCalculator, type_ byte) uint {
 	return type_code
 }
 
-/* |nibblesbits| represents the 2 bits to encode MNIBBLES (0-3)
-   REQUIRES: length > 0
-   REQUIRES: length <= (1 << 24) */
+/*
+|nibblesbits| represents the 2 bits to encode MNIBBLES (0-3)
+
+	REQUIRES: length > 0
+	REQUIRES: length <= (1 << 24)
+*/
 func encodeMlen(length uint, bits *uint64, numbits *uint, nibblesbits *uint64) {
 	var lg uint
 	if length == 1 {
@@ -132,8 +141,11 @@ func storeCommandExtra(cmd *command, storage_ix *uint, storage []byte) {
 	writeBits(uint(insnumextra+getCopyExtra(copycode)), bits, storage_ix, storage)
 }
 
-/* Data structure that stores almost everything that is needed to encode each
-   block switch command. */
+/*
+Data structure that stores almost everything that is needed to encode each
+block switch command.
+*/
 type blockSplitCode struct {
 	type_code_calculator blockTypeCodeCalculator
 	type_depths          [maxBlockTypeSymbols]byte
@@ -154,9 +166,12 @@ func storeVarLenUint8(n uint, storage_ix *uint, storage []byte) {
 	}
 }
 
-/* Stores the compressed meta-block header.
-   REQUIRES: length > 0
-   REQUIRES: length <= (1 << 24) */
+/*
+Stores the compressed meta-block header.
+
+	REQUIRES: length > 0
+	REQUIRES: length <= (1 << 24)
+*/
 func storeCompressedMetaBlockHeader(is_final_block bool, length uint, storage_ix *uint, storage []byte) {
 	var lenbits uint64
 	var nlenbits uint
@@ -186,9 +201,12 @@ func storeCompressedMetaBlockHeader(is_final_block bool, length uint, storage_ix
 	}
 }
 
-/* Stores the uncompressed meta-block header.
-   REQUIRES: length > 0
-   REQUIRES: length <= (1 << 24) */
+/*
+Stores the uncompressed meta-block header.
+
+	REQUIRES: length > 0
+	REQUIRES: length <= (1 << 24)
+*/
 func storeUncompressedMetaBlockHeader(length uint, storage_ix *uint, storage []byte) {
 	var lenbits uint64
 	var nlenbits uint
@@ -312,8 +330,11 @@ func storeSimpleHuffmanTree(depths []byte, symbols []uint, num_symbols uint, max
 	}
 }
 
-/* num = alphabet size
-   depths = symbol depths */
+/*
+num = alphabet size
+
+	depths = symbol depths
+*/
 func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	var huffman_tree [numCommandSymbols]byte
 	var huffman_tree_extra_bits [numCommandSymbols]byte
@@ -367,8 +388,11 @@ func storeHuffmanTree(depths []byte, num uint, tree []huffmanTree, storage_ix *u
 	storeHuffmanTreeToBitMask(huffman_tree_size, huffman_tree[:], huffman_tree_extra_bits[:], code_length_bitdepth[:], code_length_bitdepth_symbols[:], storage_ix, storage)
 }
 
-/* Builds a Huffman tree from histogram[0:length] into depth[0:length] and
-   bits[0:length] and stores the encoded tree to the bit stream. */
+/*
+Builds a Huffman tree from histogram[0:length] into depth[0:length] and
+bits[0:length] and stores the encoded tree to the bit stream.
+*/
 func buildAndStoreHuffmanTree(histogram []uint32, histogram_length uint, alphabet_size uint, tree []huffmanTree, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var count uint = 0
 	var s4 = [4]uint{0}
@@ -623,6 +647,203 @@ func buildAndStoreHuffmanTreeFast(histogram []uint32, histogram_total uint, max_
 	}
 }
 
+/*
+buildAndStoreHuffmanTreeFastBW builds a prefix code for histogram, stores the
+code lengths in depth and the code words in bits, and writes a description of
+the code to bw.
+
+histogram_total must equal the sum of the histogram entries: the initial scan
+keeps walking the histogram until that many counts have been seen. max_bits
+is the number of bits used for each symbol value when the code is written in
+one of its short forms (at most four symbols in use).
+*/
+func buildAndStoreHuffmanTreeFastBW(histogram []uint32, histogram_total uint, max_bits uint, depth []byte, bits []uint16, bw *bitWriter) {
+	var count uint = 0
+	var symbols = [4]uint{0}
+	var length uint = 0
+	var total uint = histogram_total
+
+	/* Count the symbols that occur, remembering the first four of them.
+	   When the loop ends, length is one past the last symbol that occurs. */
+	for total != 0 {
+		if histogram[length] != 0 {
+			if count < 4 {
+				symbols[count] = length
+			}
+
+			count++
+			total -= uint(histogram[length])
+		}
+
+		length++
+	}
+
+	/* Zero or one symbol in use: write that symbol on its own and give it a
+	   zero-length code. */
+	if count <= 1 {
+		bw.writeBits(4, 1)
+		bw.writeBits(max_bits, uint64(symbols[0]))
+		depth[symbols[0]] = 0
+		bits[symbols[0]] = 0
+		return
+	}
+
+	for i := 0; i < int(length); i++ {
+		depth[i] = 0
+	}
+	{
+		var max_tree_size uint = 2*length + 1
+
+		/* Take a scratch tree from the pool, allocating a new one when the
+		   pool is empty or the pooled tree is too small. */
+		tree, _ := huffmanTreePool.Get().(*[]huffmanTree)
+		if tree == nil || cap(*tree) < int(max_tree_size) {
+			tmp := make([]huffmanTree, max_tree_size)
+			tree = &tmp
+		} else {
+			*tree = (*tree)[:max_tree_size]
+		}
+		var count_limit uint32
+
+		/* Each attempt treats every used symbol as occurring at least
+		   count_limit times; the limit doubles until setDepth succeeds. */
+		for count_limit = 1; ; count_limit *= 2 {
+			var node int = 0
+			var l uint
+
+			/* Create one leaf per used symbol, from the highest symbol down. */
+			for l = length; l != 0; {
+				l--
+				if histogram[l] != 0 {
+					if histogram[l] >= count_limit {
+						initHuffmanTree(&(*tree)[node:][0], histogram[l], -1, int16(l))
+					} else {
+						initHuffmanTree(&(*tree)[node:][0], count_limit, -1, int16(l))
+					}
+
+					node++
+				}
+			}
+			{
+				var n int = node
+				var sentinel huffmanTree
+				/* Points to the next leaf node. */
+				var i int = 0
+				/* Points to the next non-leaf node. */
+				var j int = n + 1
+				var k int
+
+				sortHuffmanTreeItems(*tree, uint(n), huffmanTreeComparator(sortHuffmanTree1))
+
+				/* The nodes are:
+				   [0, n): the sorted leaf nodes that we start with.
+				   [n]: we add a sentinel here.
+				   [n + 1, 2n): new parent nodes are added here, starting from
+				                (n+1). These are naturally in ascending order.
+				   [2n]: we add a sentinel at the end as well.
+				   There will be (2n+1) elements at the end. */
+				initHuffmanTree(&sentinel, math.MaxUint32, -1, -1)
+
+				(*tree)[node] = sentinel
+				node++
+				(*tree)[node] = sentinel
+				node++
+
+				/* Merge the two cheapest remaining nodes n-1 times. */
+				for k = n - 1; k > 0; k-- {
+					var left int
+					var right int
+					if (*tree)[i].total_count_ <= (*tree)[j].total_count_ {
+						left = i
+						i++
+					} else {
+						left = j
+						j++
+					}
+
+					if (*tree)[i].total_count_ <= (*tree)[j].total_count_ {
+						right = i
+						i++
+					} else {
+						right = j
+						j++
+					}
+
+					/* The sentinel node becomes the parent node. */
+					(*tree)[node-1].total_count_ = (*tree)[left].total_count_ + (*tree)[right].total_count_
+
+					(*tree)[node-1].index_left_ = int16(left)
+					(*tree)[node-1].index_right_or_value_ = int16(right)
+
+					/* Add back the last sentinel node. */
+					(*tree)[node] = sentinel
+					node++
+				}
+
+				if setDepth(2*n-1, *tree, depth, 14) {
+					/* The Huffman tree has to be packed in 14 bits. setDepth
+					   reported success, so stop here. Otherwise the loop
+					   retries with count_limit doubled, which raises the
+					   lowest counts. */
+					break
+				}
+			}
+		}
+
+		huffmanTreePool.Put(tree)
+	}
+
+	convertBitDepthsToSymbols(depth, length, bits)
+	if count <= 4 {
+		var i uint
+
+		/* value of 1 indicates a simple Huffman code */
+		bw.writeBits(2, 1)
+
+		bw.writeBits(2, uint64(count)-1) /* NSYM - 1 */
+
+		/* Sort */
+		/* Order the used symbols by increasing code length. */
+		for i = 0; i < count; i++ {
+			var j uint
+			for j = i + 1; j < count; j++ {
+				if depth[symbols[j]] < depth[symbols[i]] {
+					var tmp uint = symbols[j]
+					symbols[j] = symbols[i]
+					symbols[i] = tmp
+				}
+			}
+		}
+
+		if count == 2 {
+			bw.writeBits(max_bits, uint64(symbols[0]))
+			bw.writeBits(max_bits, uint64(symbols[1]))
+		} else if count == 3 {
+			bw.writeBits(max_bits, uint64(symbols[0]))
+			bw.writeBits(max_bits, uint64(symbols[1]))
+			bw.writeBits(max_bits, uint64(symbols[2]))
+		} else {
+			bw.writeBits(max_bits, uint64(symbols[0]))
+			bw.writeBits(max_bits, uint64(symbols[1]))
+			bw.writeBits(max_bits, uint64(symbols[2]))
+			bw.writeBits(max_bits, uint64(symbols[3]))
+
+			/* tree-select */
+			bw.writeSingleBit(depth[symbols[0]] == 1)
+		}
+	} else {
+		var previous_value byte = 8
+		var i uint
+
+		/* Complex Huffman Tree */
+		storeStaticCodeLengthCodeBW(bw)
+
+		/* Actual RLE coding. */
+		for i = 0; i < length; {
+			var value byte = depth[i]
+			var reps uint = 1
+			var k uint
+
+			/* Measure the run of equal depths that starts at i. */
+			for k = i + 1; k < length && depth[k] == value; k++ {
+				reps++
+			}
+
+			i += reps
+			if value == 0 {
+				bw.writeBits(uint(kZeroRepsDepth[reps]), kZeroRepsBits[reps])
+			} else {
+				/* A depth that differs from the previous non-zero one is
+				   written out once before the rest of the run is coded. */
+				if previous_value != value {
+					bw.writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]))
+					reps--
+				}
+
+				if reps < 3 {
+					for reps != 0 {
+						reps--
+						bw.writeBits(uint(kCodeLengthDepth[value]), uint64(kCodeLengthBits[value]))
+					}
+				} else {
+					reps -= 3
+					bw.writeBits(uint(kNonZeroRepsDepth[reps]), kNonZeroRepsBits[reps])
+				}
+
+				previous_value = value
+			}
+		}
+	}
+}
+
 func indexOf(v []byte, v_size uint, value byte) uint {
 	var i uint = 0
 	for ; i < v_size; i++ {
@@ -674,12 +895,15 @@ func moveToFrontTransform(v_in []uint32, v_size uint, v_out []uint32) {
 	}
 }
 
-/* Finds runs of zeros in v[0..in_size) and replaces them with a prefix code of
-   the run length plus extra bits (lower 9 bits is the prefix code and the rest
-   are the extra bits). Non-zero values in v[] are shifted by
-   *max_length_prefix. Will not create prefix codes bigger than the initial
-   value of *max_run_length_prefix. The prefix code of run length L is simply
-   Log2Floor(L) and the number of extra bits is the same as the prefix code. */
+/*
+Finds runs of zeros in v[0..in_size) and replaces them with a prefix code of
+the run length plus extra bits (lower 9 bits is the prefix code and the rest
+are the extra bits). Non-zero values in v[] are shifted by
+*max_run_length_prefix. Will not create prefix codes bigger than the initial
+value of *max_run_length_prefix. The prefix code of run length L is simply
+Log2Floor(L) and the number of extra bits is the same as the prefix code.
+*/
 func runLengthCodeZeros(in_size uint, v []uint32, out_size *uint, max_run_length_prefix *uint32) {
 	var max_reps uint32 = 0
 	var i uint
@@ -799,8 +1023,11 @@ func storeBlockSwitch(code *blockSplitCode, block_len uint32, block_type byte, i
 	writeBits(uint(len_nextra), uint64(len_extra), storage_ix, storage)
 }
 
-/* Builds a BlockSplitCode data structure from the block split given by the
-   vector of block types and block lengths and stores it to the bit stream. */
+/*
+Builds a BlockSplitCode data structure from the block split given by the
+vector of block types and block lengths and stores it to the bit stream.
+*/
 func buildAndStoreBlockSplitCode(types []byte, lengths []uint32, num_blocks uint, num_types uint, tree []huffmanTree, code *blockSplitCode, storage_ix *uint, storage []byte) {
 	var type_histo [maxBlockTypeSymbols]uint32
 	var length_histo [numBlockLenSymbols]uint32
@@ -919,14 +1146,20 @@ func cleanupBlockEncoder(self *blockEncoder) {
 	blockEncoderPool.Put(self)
 }
 
-/* Creates entropy codes of block lengths and block types and stores them
-   to the bit stream. */
+/*
+Creates entropy codes of block lengths and block types and stores them
+to the bit stream.
+*/
 func buildAndStoreBlockSwitchEntropyCodes(self *blockEncoder, tree []huffmanTree, storage_ix *uint, storage []byte) {
 	buildAndStoreBlockSplitCode(self.block_types_, self.block_lengths_, self.num_blocks_, self.num_block_types_, tree, &self.block_split_code_, storage_ix, storage)
 }
 
-/* Stores the next symbol with the entropy code of the current block type.
-   Updates the block type and block length at block boundaries. */
+/*
+Stores the next symbol with the entropy code of the current block type.
+Updates the block type and block length at block boundaries.
+*/
 func storeSymbol(self *blockEncoder, symbol uint, storage_ix *uint, storage []byte) {
 	if self.block_len_ == 0 {
 		self.block_ix_++
@@ -945,9 +1178,12 @@ func storeSymbol(self *blockEncoder, symbol uint, storage_ix *uint, storage []by
 	}
 }
 
-/* Stores the next symbol with the entropy code of the current block type and
-   context value.
-   Updates the block type and block length at block boundaries. */
+/*
+Stores the next symbol with the entropy code of the current block type and
+context value.
+Updates the block type and block length at block boundaries.
+*/
 func storeSymbolWithContext(self *blockEncoder, symbol uint, context uint, context_map []uint32, storage_ix *uint, storage []byte, context_bits uint) {
 	if self.block_len_ == 0 {
 		self.block_ix_++
@@ -1268,8 +1504,11 @@ func storeMetaBlockFast(input []byte, start_pos uint, length uint, mask uint, is
 	}
 }
 
-/* This is for storing uncompressed blocks (simple raw storage of
-   bytes-as-bytes). */
+/*
+This is for storing uncompressed blocks (simple raw storage of
+bytes-as-bytes).
+*/
 func storeUncompressedMetaBlock(is_final_block bool, input []byte, position uint, mask uint, len uint, storage_ix *uint, storage []byte) {
 	var masked_pos uint = position & mask
 	storeUncompressedMetaBlockHeader(uint(len), storage_ix, storage)
diff --git a/brotli_test.go b/brotli_test.go
index 45b989e..4dd8b54 100644
--- a/brotli_test.go
+++ b/brotli_test.go
@@ -16,6 +16,8 @@ import (
 	"os"
 	"testing"
 	"time"
+
+	"github.com/andybalholm/brotli/matchfinder"
 )
 
 func checkCompressedData(compressedData, wantOriginalData []byte) error {
@@ -595,3 +597,63 @@ func BenchmarkDecodeLevels(b *testing.B) {
 		})
 	}
 }
+
+// test compresses the contents of filename with a matchfinder.Writer that
+// uses m as its match finder, a fresh Encoder, and the given block size. It
+// then decompresses the result with NewReader and fails t unless the round
+// trip reproduces the input exactly.
+//
+// Errors from writing and closing the compressor are reported instead of
+// being dropped, so a failure in the encoder shows up at its source rather
+// than as a later mismatch.
+func test(t *testing.T, filename string, m matchfinder.MatchFinder, blockSize int) {
+	t.Helper()
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		t.Fatal(err)
+	}
+	b := new(bytes.Buffer)
+	w := &matchfinder.Writer{
+		Dest:        b,
+		MatchFinder: m,
+		Encoder:     &Encoder{},
+		BlockSize:   blockSize,
+	}
+	if _, err := w.Write(data); err != nil {
+		t.Fatal(err)
+	}
+	if err := w.Close(); err != nil {
+		t.Fatal(err)
+	}
+	compressed := b.Bytes()
+	sr := NewReader(bytes.NewReader(compressed))
+	decompressed, err := ioutil.ReadAll(sr)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !bytes.Equal(decompressed, data) {
+		t.Fatal("decompressed output doesn't match")
+	}
+}
+
+// benchmark measures how fast a matchfinder.Writer built from m, a fresh
+// Encoder, and blockSize compresses the contents of filename.
+//
+// One untimed pass compresses into a buffer so that the compression ratio
+// can be reported as the "ratio" metric. The timed loop then resets the
+// writer onto ioutil.Discard and compresses the data again on every
+// iteration. Write and Close errors abort the benchmark instead of being
+// ignored.
+func benchmark(b *testing.B, filename string, m matchfinder.MatchFinder, blockSize int) {
+	b.Helper()
+	b.StopTimer()
+	b.ReportAllocs()
+	data, err := ioutil.ReadFile(filename)
+	if err != nil {
+		b.Fatal(err)
+	}
+
+	b.SetBytes(int64(len(data)))
+	buf := new(bytes.Buffer)
+	w := &matchfinder.Writer{
+		Dest:        buf,
+		MatchFinder: m,
+		Encoder:     &Encoder{},
+		BlockSize:   blockSize,
+	}
+	if _, err := w.Write(data); err != nil {
+		b.Fatal(err)
+	}
+	if err := w.Close(); err != nil {
+		b.Fatal(err)
+	}
+	b.ReportMetric(float64(len(data))/float64(buf.Len()), "ratio")
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		w.Reset(ioutil.Discard)
+		if _, err := w.Write(data); err != nil {
+			b.Fatal(err)
+		}
+		if err := w.Close(); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+// TestEncodeM4 round-trips the Opticks sample text through the M4 match
+// finder with a 256 KiB window and 64 KiB blocks.
+func TestEncodeM4(t *testing.T) {
+	finder := &matchfinder.M4{MaxDistance: 1 << 18}
+	test(t, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
+
+// BenchmarkEncodeM4 times compression of the Opticks sample text with the M4
+// match finder, a 1 MiB window, and 64 KiB blocks.
+func BenchmarkEncodeM4(b *testing.B) {
+	finder := &matchfinder.M4{MaxDistance: 1 << 20}
+	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", finder, 1<<16)
+}
diff --git a/compress_fragment_two_pass.go b/compress_fragment_two_pass.go
index 172dc7f..79f9c7f 100644
--- a/compress_fragment_two_pass.go
+++ b/compress_fragment_two_pass.go
@@ -39,8 +39,11 @@ func isMatch1(p1 []byte, p2 []byte, length uint) bool {
 	return p1[4] == p2[4] && p1[5] == p2[5]
 }
 
-/* Builds a command and distance prefix code (each 64 symbols) into "depth" and
-   "bits" based on "histogram" and stores it into the bit stream. */
+/*
+Builds a command and distance prefix code (each 64 symbols) into "depth" and
+"bits" based on "histogram" and stores it into the bit stream.
+*/
 func buildAndStoreCommandPrefixCode(histogram []uint32, depth []byte, bits []uint16, storage_ix *uint, storage []byte) {
 	var tree [129]huffmanTree
 	var cmd_depth = [numCommandSymbols]byte{0}
@@ -216,6 +219,25 @@ func storeMetaBlockHeader(len uint, is_uncompressed bool, storage_ix *uint, stor
 	writeSingleBit(is_uncompressed, storage_ix, storage)
 }
 
+/*
+storeMetaBlockHeaderBW writes the start of a meta-block header to bw: a zero
+ISLAST bit, two bits giving the number of length nibbles minus four, len - 1
+in that many nibbles, and the ISUNCOMPRESSED bit.
+
+REQUIRES: len > 0 (len - 1 wraps around otherwise)
+REQUIRES: len <= (1 << 24) (at most six nibbles are written)
+*/
+func storeMetaBlockHeaderBW(len uint, is_uncompressed bool, bw *bitWriter) {
+	/* ISLAST */
+	bw.writeBits(1, 0)
+
+	/* Pick the smallest nibble count that can hold len - 1. */
+	var nibbles uint
+	switch {
+	case len <= 1<<16:
+		nibbles = 4
+	case len <= 1<<20:
+		nibbles = 5
+	default:
+		nibbles = 6
+	}
+
+	bw.writeBits(2, uint64(nibbles)-4)
+	bw.writeBits(nibbles*4, uint64(len)-1)
+
+	/* ISUNCOMPRESSED */
+	bw.writeSingleBit(is_uncompressed)
+}
+
 func createCommands(input []byte, block_size uint, input_size uint, base_ip_ptr []byte, table []int, table_bits uint, min_match uint, literals *[]byte, commands *[]uint32) {
 	var ip int = 0
 	var shift uint = 64 - table_bits
@@ -710,19 +732,22 @@ func compressFragmentTwoPassImpl(input []byte, input_size uint, is_last bool, co
 	}
 }
 
-/* Compresses "input" string to the "*storage" buffer as one or more complete
-   meta-blocks, and updates the "*storage_ix" bit position.
+/*
+Compresses "input" string to the "*storage" buffer as one or more complete
 
-   If "is_last" is 1, emits an additional empty last meta-block.
+	meta-blocks, and updates the "*storage_ix" bit position.
 
-   REQUIRES: "input_size" is greater than zero, or "is_last" is 1.
-   REQUIRES: "input_size" is less or equal to maximal metablock size (1 << 24).
-   REQUIRES: "command_buf" and "literal_buf" point to at least
-              kCompressFragmentTwoPassBlockSize long arrays.
-   REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
-   REQUIRES: "table_size" is a power of two
-   OUTPUT: maximal copy distance <= |input_size|
-   OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18) */
+	If "is_last" is 1, emits an additional empty last meta-block.
+
+	REQUIRES: "input_size" is greater than zero, or "is_last" is 1.
+	REQUIRES: "input_size" is less or equal to maximal metablock size (1 << 24).
+	REQUIRES: "command_buf" and "literal_buf" point to at least
+	           kCompressFragmentTwoPassBlockSize long arrays.
+	REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
+	REQUIRES: "table_size" is a power of two
+	OUTPUT: maximal copy distance <= |input_size|
+	OUTPUT: maximal copy distance <= BROTLI_MAX_BACKWARD_LIMIT(18)
+*/
 func compressFragmentTwoPass(input []byte, input_size uint, is_last bool, command_buf []uint32, literal_buf []byte, table []int, table_size uint, storage_ix *uint, storage []byte) {
 	var initial_storage_ix uint = *storage_ix
 	var table_bits uint = uint(log2FloorNonZero(table_size))
diff --git a/encoder.go b/encoder.go
new file mode 100644
index 0000000..650d1e4
--- /dev/null
+++ b/encoder.go
@@ -0,0 +1,168 @@
+package brotli
+
+import "github.com/andybalholm/brotli/matchfinder"
+
+// An Encoder implements the matchfinder.Encoder interface, writing in Brotli format.
+type Encoder struct {
+	// wroteHeader records whether the stream header has been written; Encode
+	// writes it on the first call and sets this flag.
+	wroteHeader bool
+	// bw holds the output slice and any bits that are still buffered between
+	// calls to Encode.
+	bw bitWriter
+	// distCache is scratch space with one entry per match: Encode stores each
+	// distance code there while building histograms and reads it back when
+	// writing the symbols.
+	distCache []distanceCode
+}
+
+// Reset returns e to its initial state, so that the next call to Encode
+// starts a new stream with a fresh header and an empty bit buffer. The
+// distance-code scratch slice is kept for reuse.
+func (e *Encoder) Reset() {
+	*e = Encoder{distCache: e.distCache}
+}
+
+// Encode appends src, compressed as one Brotli meta-block, to dst and returns
+// the extended slice. matches describes src as a sequence of literal runs
+// (Unmatched) each followed by a copy (Length, Distance).
+//
+// The stream header is written before the first block. When lastBlock is
+// true, an empty last meta-block is added and the output is padded to a byte
+// boundary. Otherwise, bits that do not yet fill a 32-bit group stay buffered
+// in the Encoder until a later call.
+//
+// An empty src produces no meta-block of its own: the meta-block header
+// stores the length minus one, which would wrap around for zero and corrupt
+// the bit buffer. Only the header (on the first call) and, if lastBlock is
+// set, the stream terminator are written in that case.
+func (e *Encoder) Encode(dst []byte, src []byte, matches []matchfinder.Match, lastBlock bool) []byte {
+	e.bw.dst = dst
+	if !e.wroteHeader {
+		e.bw.writeBits(4, 15)
+		e.wroteHeader = true
+	}
+
+	if len(src) == 0 {
+		if lastBlock {
+			e.bw.writeBits(2, 3) // islast + isempty
+			e.bw.jumpToByteBoundary()
+		}
+		return e.bw.dst
+	}
+
+	var literalHisto [256]uint32
+	var commandHisto [704]uint32
+	var distanceHisto [64]uint32
+	literalCount := 0
+	commandCount := 0
+	distanceCount := 0
+
+	if len(e.distCache) < len(matches) {
+		e.distCache = make([]distanceCode, len(matches))
+	}
+
+	// first pass: build the histograms
+	pos := 0
+
+	// d is the ring buffer of the last 4 distances.
+	// It starts out with values that no real distance can equal.
+	d := [4]int{-10, -10, -10, -10}
+	for i, m := range matches {
+		if m.Unmatched > 0 {
+			for _, c := range src[pos : pos+m.Unmatched] {
+				literalHisto[c]++
+			}
+			literalCount += m.Unmatched
+		}
+
+		insertCode := getInsertLengthCode(uint(m.Unmatched))
+		copyCode := getCopyLengthCode(uint(m.Length))
+		if m.Length == 0 {
+			// If the stream ends with unmatched bytes, we need a dummy copy length.
+			copyCode = 2
+		}
+		command := combineLengthCodes(insertCode, copyCode, false)
+		commandHisto[command]++
+		commandCount++
+
+		if command >= 128 && m.Length != 0 {
+			// Prefer a short code that refers to a recent distance, or to a
+			// small offset from the most recent one.
+			var distCode distanceCode
+			switch m.Distance {
+			case d[3]:
+				distCode.code = 0
+			case d[2]:
+				distCode.code = 1
+			case d[1]:
+				distCode.code = 2
+			case d[0]:
+				distCode.code = 3
+			case d[3] - 1:
+				distCode.code = 4
+			case d[3] + 1:
+				distCode.code = 5
+			case d[3] - 2:
+				distCode.code = 6
+			case d[3] + 2:
+				distCode.code = 7
+			case d[3] - 3:
+				distCode.code = 8
+			case d[3] + 3:
+				distCode.code = 9
+
+				// In my testing, codes 10–15 actually reduced the compression ratio.
+
+			default:
+				distCode = getDistanceCode(m.Distance)
+			}
+			e.distCache[i] = distCode
+			distanceHisto[distCode.code]++
+			distanceCount++
+			// Repeating the most recent distance (code 0) leaves the ring
+			// buffer unchanged; every other code pushes the new distance.
+			if distCode.code != 0 {
+				d[0], d[1], d[2], d[3] = d[1], d[2], d[3], m.Distance
+			}
+		}
+
+		pos += m.Unmatched + m.Length
+	}
+
+	storeMetaBlockHeaderBW(uint(len(src)), false, &e.bw)
+	// NOTE(review): these 13 zero bits presumably fill in the remaining
+	// meta-block header fields with their simplest settings; confirm the
+	// field layout against the Brotli format specification.
+	e.bw.writeBits(13, 0)
+
+	var literalDepths [256]byte
+	var literalBits [256]uint16
+	buildAndStoreHuffmanTreeFastBW(literalHisto[:], uint(literalCount), 8, literalDepths[:], literalBits[:], &e.bw)
+
+	var commandDepths [704]byte
+	var commandBits [704]uint16
+	buildAndStoreHuffmanTreeFastBW(commandHisto[:], uint(commandCount), 10, commandDepths[:], commandBits[:], &e.bw)
+
+	var distanceDepths [64]byte
+	var distanceBits [64]uint16
+	buildAndStoreHuffmanTreeFastBW(distanceHisto[:], uint(distanceCount), 6, distanceDepths[:], distanceBits[:], &e.bw)
+
+	// second pass: write the commands, literals, and distances
+	pos = 0
+	for i, m := range matches {
+		insertCode := getInsertLengthCode(uint(m.Unmatched))
+		copyCode := getCopyLengthCode(uint(m.Length))
+		if m.Length == 0 {
+			// If the stream ends with unmatched bytes, we need a dummy copy length.
+			copyCode = 2
+		}
+		command := combineLengthCodes(insertCode, copyCode, false)
+		e.bw.writeBits(uint(commandDepths[command]), uint64(commandBits[command]))
+		if kInsExtra[insertCode] > 0 {
+			e.bw.writeBits(uint(kInsExtra[insertCode]), uint64(m.Unmatched)-uint64(kInsBase[insertCode]))
+		}
+		if kCopyExtra[copyCode] > 0 {
+			e.bw.writeBits(uint(kCopyExtra[copyCode]), uint64(m.Length)-uint64(kCopyBase[copyCode]))
+		}
+
+		if m.Unmatched > 0 {
+			for _, c := range src[pos : pos+m.Unmatched] {
+				e.bw.writeBits(uint(literalDepths[c]), uint64(literalBits[c]))
+			}
+		}
+
+		if command >= 128 && m.Length != 0 {
+			// Reuse the distance code chosen during the first pass.
+			distCode := e.distCache[i]
+			e.bw.writeBits(uint(distanceDepths[distCode.code]), uint64(distanceBits[distCode.code]))
+			if distCode.nExtra > 0 {
+				e.bw.writeBits(distCode.nExtra, distCode.extraBits)
+			}
+		}
+
+		pos += m.Unmatched + m.Length
+	}
+
+	if lastBlock {
+		e.bw.writeBits(2, 3) // islast + isempty
+		e.bw.jumpToByteBoundary()
+	}
+	return e.bw.dst
+}
+
+// A distanceCode is a copy distance in the form in which Encode writes it:
+// a symbol of the distance alphabet plus optional extra bits.
+type distanceCode struct {
+	// code is the distance symbol; it indexes the distance histogram and
+	// prefix code in Encode.
+	code int
+	// nExtra is the number of extra bits written after the symbol
+	// (nothing is written when it is zero).
+	nExtra uint
+	// extraBits holds the value of those extra bits.
+	extraBits uint64
+}
+
+// getDistanceCode converts distance into a distance symbol (16 or above) and
+// the extra bits that go with it. It does not consider the codes that refer
+// to recently used distances; Encode checks for those itself.
+func getDistanceCode(distance int) distanceCode {
+	shifted := distance + 3
+	// The number of extra bits is one less than the position of the top bit.
+	extraCount := log2FloorNonZero(uint(shifted)) - 1
+	// The bit just below the top bit selects the upper or lower half of the range.
+	half := (shifted >> extraCount) & 1
+	base := (2 + half) << extraCount
+	return distanceCode{
+		code:      int(2*(extraCount-1)) + half + 16,
+		nExtra:    uint(extraCount),
+		extraBits: uint64(shifted - base),
+	}
+}
diff --git a/entropy_encode_static.go b/entropy_encode_static.go
index 5ddf3fc..294aff4 100644
--- a/entropy_encode_static.go
+++ b/entropy_encode_static.go
@@ -782,6 +782,11 @@ func storeStaticCodeLengthCode(storage_ix *uint, storage []byte) {
 	writeBits(40, 0x0000FF55555554, storage_ix, storage)
 }
 
+func storeStaticCodeLengthCodeBW(bw *bitWriter) {
+	bw.writeBits(32, 0x55555554)
+	bw.writeBits(8, 0xFF)
+}
+
 var kZeroRepsBits = [numCommandSymbols]uint64{
 	0x00000000,
 	0x00000000,
diff --git a/go.mod b/go.mod
index 1c94232..50324ea 100644
--- a/go.mod
+++ b/go.mod
@@ -1,5 +1,5 @@
 module github.com/andybalholm/brotli
 
-go 1.12
+go 1.13
 
 retract v1.0.1 // occasional panics and data corruption
diff --git a/matchfinder/m4.go b/matchfinder/m4.go
new file mode 100644
index 0000000..6bafe27
--- /dev/null
+++ b/matchfinder/m4.go
@@ -0,0 +1,270 @@
+package matchfinder
+
+import (
+	"encoding/binary"
+	"math/bits"
+	"runtime"
+)
+
+const (
+	// ssapBits is the base-2 logarithm of the number of slots in M4's hash table.
+	ssapBits = 17
+	// ssapMask reduces a hash value to a valid index into that table.
+	ssapMask = (1 << ssapBits) - 1
+)
+
+// M4 is an implementation of the MatchFinder
+// interface that uses a simple hash table to find matches,
+// but the advanced parsing technique from
+// https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html,
+// except that it looks for matches at every input position.
+//
+// Fields left at zero are set to their defaults by the first call to
+// FindMatches.
+type M4 struct {
+	// MaxDistance is the maximum distance (in bytes) to look back for
+	// a match. The default is 65535.
+	MaxDistance int
+
+	// MinLength is the length of the shortest match to return.
+	// The default is 4.
+	MinLength int
+
+	// HashLen is the number of bytes to use to calculate the hashes.
+	// The maximum is 8 and the default is 6.
+	HashLen int
+
+	// table maps a hash of the bytes at a position to the most recent
+	// position in history that had that hash. A value of 0 is treated as
+	// an empty slot.
+	table [1 << ssapBits]uint32
+
+	// history holds the input seen so far; it is trimmed to the last
+	// MaxDistance bytes once it grows past twice that size.
+	history []byte
+}
+
+// Reset empties the hash table and discards the buffered history, so that q
+// can be used for a new stream. The history buffer keeps its capacity, and
+// the configuration fields are left untouched.
+func (q *M4) Reset() {
+	for slot := range q.table {
+		q.table[slot] = 0
+	}
+	q.history = q.history[:0]
+}
+
+// FindMatches appends src to the history buffer, searches the new bytes for
+// repeats of earlier data, and appends the matches it selects to dst, which
+// it returns.
+//
+// MaxDistance, MinLength and HashLen are set to their defaults if they are
+// zero. Up to three candidate matches are kept pending at a time, so that a
+// longer match found slightly later can shorten or replace an earlier one.
+// Bytes left over after the last match are reported in a final Match that has
+// only Unmatched set.
+func (q *M4) FindMatches(dst []Match, src []byte) []Match {
+	if q.MaxDistance == 0 {
+		q.MaxDistance = 65535
+	}
+	if q.MinLength == 0 {
+		q.MinLength = 4
+	}
+	if q.HashLen == 0 {
+		q.HashLen = 6
+	}
+	var nextEmit int
+
+	if len(q.history) > q.MaxDistance*2 {
+		// Trim down the history buffer.
+		delta := len(q.history) - q.MaxDistance
+		copy(q.history, q.history[delta:])
+		q.history = q.history[:q.MaxDistance]
+
+		// Shift the stored positions by the same amount. Positions that
+		// pointed into the discarded part become 0, which the search below
+		// treats as an empty slot.
+		for i, v := range q.table {
+			newV := int(v) - delta
+			if newV < 0 {
+				newV = 0
+			}
+			q.table[i] = uint32(newV)
+		}
+	}
+
+	// Append src to the history buffer.
+	// From here on, src refers to the whole history and nextEmit is the
+	// position of the first byte that has not been covered by dst yet.
+	nextEmit = len(q.history)
+	q.history = append(q.history, src...)
+	src = q.history
+
+	// matches stores the matches that have been found but not emitted,
+	// in reverse order. (matches[0] is the most recent one.)
+	var matches [3]absoluteMatch
+	// The loop stops 7 bytes short of the end because hashing reads 8 bytes.
+	for i := nextEmit; i < len(src)-7; i++ {
+		if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
+			// We have found some matches, and we're far enough along that we probably
+			// won't find overlapping matches, so we might as well emit them.
+			if matches[1] != (absoluteMatch{}) {
+				if matches[1].End > matches[0].Start {
+					matches[1].End = matches[0].Start
+				}
+				if matches[1].End-matches[1].Start >= q.MinLength {
+					dst = append(dst, Match{
+						Unmatched: matches[1].Start - nextEmit,
+						Length:    matches[1].End - matches[1].Start,
+						Distance:  matches[1].Start - matches[1].Match,
+					})
+					nextEmit = matches[1].End
+				}
+			}
+			dst = append(dst, Match{
+				Unmatched: matches[0].Start - nextEmit,
+				Length:    matches[0].End - matches[0].Start,
+				Distance:  matches[0].Start - matches[0].Match,
+			})
+			nextEmit = matches[0].End
+			matches = [3]absoluteMatch{}
+		}
+
+		// Now look for a match.
+		// Hash the first HashLen bytes at i, look up the previous position
+		// with the same hash, and record i in its place.
+		h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - ssapBits)
+		candidate := int(q.table[h&ssapMask])
+		q.table[h&ssapMask] = uint32(i)
+
+		// Skip empty slots, candidates that are too far back, and candidates
+		// at the same distance as the most recent pending match.
+		if candidate == 0 || i-candidate > q.MaxDistance || i-candidate == matches[0].Start-matches[0].Match {
+			continue
+		}
+		if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) {
+			continue
+		}
+
+		// We have a 4-byte match now.
+
+		start := i
+		match := candidate
+		end := extendMatch(src, match+4, start+4)
+		// Extend the match backward as well, but not into bytes that have
+		// already been emitted.
+		for start > nextEmit && match > 0 && src[start-1] == src[match-1] {
+			start--
+			match--
+		}
+		// Ignore it unless it is longer than the most recent pending match.
+		if end-start <= matches[0].End-matches[0].Start {
+			continue
+		}
+
+		matches = [3]absoluteMatch{
+			absoluteMatch{
+				Start: start,
+				End:   end,
+				Match: match,
+			},
+			matches[0],
+			matches[1],
+		}
+
+		if matches[2] == (absoluteMatch{}) {
+			continue
+		}
+
+		// We have three matches, so it's time to emit one and/or eliminate one.
+		// (In the comments below, "first" is the oldest match, matches[2],
+		// and "third" is the newest, matches[0].)
+		switch {
+		case matches[0].Start < matches[2].End:
+			// The first and third matches overlap; discard the one in between.
+			matches = [3]absoluteMatch{
+				matches[0],
+				matches[2],
+				absoluteMatch{},
+			}
+
+		case matches[0].Start < matches[2].End+q.MinLength:
+			// The first and third matches don't overlap, but there's no room for
+			// another match between them. Emit the first match and discard the second.
+			dst = append(dst, Match{
+				Unmatched: matches[2].Start - nextEmit,
+				Length:    matches[2].End - matches[2].Start,
+				Distance:  matches[2].Start - matches[2].Match,
+			})
+			nextEmit = matches[2].End
+			matches = [3]absoluteMatch{
+				matches[0],
+				absoluteMatch{},
+				absoluteMatch{},
+			}
+
+		default:
+			// Emit the first match, shortening it if necessary to avoid overlap with the second.
+			// If shortening leaves it below MinLength, it is dropped instead.
+			if matches[2].End > matches[1].Start {
+				matches[2].End = matches[1].Start
+			}
+			if matches[2].End-matches[2].Start >= q.MinLength {
+				dst = append(dst, Match{
+					Unmatched: matches[2].Start - nextEmit,
+					Length:    matches[2].End - matches[2].Start,
+					Distance:  matches[2].Start - matches[2].Match,
+				})
+				nextEmit = matches[2].End
+			}
+			matches[2] = absoluteMatch{}
+		}
+	}
+
+	// We've found all the matches now; emit the remaining ones.
+	if matches[1] != (absoluteMatch{}) {
+		if matches[1].End > matches[0].Start {
+			matches[1].End = matches[0].Start
+		}
+		if matches[1].End-matches[1].Start >= q.MinLength {
+			dst = append(dst, Match{
+				Unmatched: matches[1].Start - nextEmit,
+				Length:    matches[1].End - matches[1].Start,
+				Distance:  matches[1].Start - matches[1].Match,
+			})
+			nextEmit = matches[1].End
+		}
+	}
+	if matches[0] != (absoluteMatch{}) {
+		dst = append(dst, Match{
+			Unmatched: matches[0].Start - nextEmit,
+			Length:    matches[0].End - matches[0].Start,
+			Distance:  matches[0].Start - matches[0].Match,
+		})
+		nextEmit = matches[0].End
+	}
+
+	// Report any trailing bytes as a literal-only Match.
+	if nextEmit < len(src) {
+		dst = append(dst, Match{
+			Unmatched: len(src) - nextEmit,
+		})
+	}
+
+	return dst
+}
+
+// hashMul64 is the multiplier FindMatches applies to the input bytes when
+// hashing them into a table index.
+const hashMul64 = 0x1E35A7BD1E35A7BD
+
+// An absoluteMatch is like a Match, but it stores indexes into the byte
+// stream instead of lengths.
+//
+// FindMatches uses the zero value to mean "no match".
+type absoluteMatch struct {
+	// Start is the index of the first byte.
+	Start int
+
+	// End is the index of the byte after the last byte
+	// (so that End - Start = Length).
+	End int
+
+	// Match is the index of the previous data that matches
+	// (Start - Match = Distance).
+	Match int
+}
+
+// extendMatch compares src starting at i with src starting at j and returns
+// the end of the matching run: the largest k <= len(src) for which
+// src[i:i+k-j] equals src[j:k].
+//
+// It assumes that:
+//
+//	0 <= i && i < j && j <= len(src)
+func extendMatch(src []byte, i, j int) int {
+	if runtime.GOARCH == "amd64" {
+		// Compare eight bytes per step while more than eight bytes remain
+		// after j.
+		for ; j+8 < len(src); i, j = i+8, j+8 {
+			diff := binary.LittleEndian.Uint64(src[i:]) ^ binary.LittleEndian.Uint64(src[j:])
+			if diff != 0 {
+				// Both words were decoded as little-endian, so the lowest
+				// set bit of the XOR lies in the first byte that differs;
+				// shifting by 3 turns the bit index into a byte index.
+				return j + bits.TrailingZeros64(diff)>>3
+			}
+		}
+	} else if runtime.GOARCH == "386" {
+		// On a 32-bit CPU, do the same four bytes at a time.
+		for ; j+4 < len(src); i, j = i+4, j+4 {
+			diff := binary.LittleEndian.Uint32(src[i:]) ^ binary.LittleEndian.Uint32(src[j:])
+			if diff != 0 {
+				return j + bits.TrailingZeros32(diff)>>3
+			}
+		}
+	}
+
+	// Finish (or, on other architectures, do everything) one byte at a time.
+	for j < len(src) && src[i] == src[j] {
+		i++
+		j++
+	}
+	return j
+}
diff --git a/matchfinder/matchfinder.go b/matchfinder/matchfinder.go
new file mode 100644
index 0000000..f6bcfdb
--- /dev/null
+++ b/matchfinder/matchfinder.go
@@ -0,0 +1,120 @@
+// Package matchfinder defines reusable components for data compression.
+//
+// Many compression libraries have two main parts:
+//   - Something that looks for repeated sequences of bytes
+//   - An encoder for the compressed data format (often an entropy coder)
+//
+// Although these are logically two separate steps, the implementations are
+// usually closely tied together. You can't use flate's matcher with snappy's
+// encoder, for example. This package defines interfaces and an intermediate
+// representation to allow mixing and matching compression components.
+package matchfinder
+
+import "io"
+
+// A Match is the basic unit of LZ77 compression.
+type Match struct {
+	Unmatched int // the number of unmatched bytes since the previous match
+	Length    int // the number of bytes in the matched string; it may be 0 at the end of the input
+	Distance  int // how far back in the stream to copy from
+}
+
+// A MatchFinder performs the LZ77 stage of compression, looking for matches.
+type MatchFinder interface {
+	// FindMatches looks for matches in src, appends them to dst, and returns dst.
+	FindMatches(dst []Match, src []byte) []Match
+
+	// Reset clears any internal state, preparing the MatchFinder to be used with
+	// a new stream.
+	Reset()
+}
+
+// An Encoder encodes the data in its final format.
+type Encoder interface {
+	// Encode appends the encoded format of src to dst, using the match
+	// information from matches.
+	Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte
+
+	// Reset clears any internal state, preparing the Encoder to be used with
+	// a new stream.
+	Reset()
+}
+
+// A Writer uses MatchFinder and Encoder to write compressed data to Dest.
+type Writer struct {
+	Dest        io.Writer
+	MatchFinder MatchFinder
+	Encoder     Encoder
+
+	// BlockSize is the number of bytes to compress at a time. If it is zero,
+	// each Write operation will be treated as one block.
+	BlockSize int
+
+	err     error   // error from Dest.Write; once set, nothing more is written until Reset
+	inBuf   []byte  // input waiting for a complete block (only used when BlockSize != 0)
+	outBuf  []byte  // scratch buffer for encoded output, reused across blocks
+	matches []Match // scratch slice of matches, reused across blocks
+}
+
+// Write compresses p and writes the result to Dest. If BlockSize is zero, p is
+// compressed as a single block. Otherwise p is appended to an internal buffer,
+// every complete BlockSize-byte block is compressed, and the remainder is kept
+// for the next Write or Close. After a write to Dest has failed, Write returns
+// that error without doing any further work.
+func (w *Writer) Write(p []byte) (n int, err error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+
+	if w.BlockSize == 0 {
+		return w.writeBlock(p, false)
+	}
+
+	w.inBuf = append(w.inBuf, p...)
+	var pos int
+	for pos = 0; pos+w.BlockSize <= len(w.inBuf) && w.err == nil; pos += w.BlockSize {
+		w.writeBlock(w.inBuf[pos:pos+w.BlockSize], false)
+	}
+	if pos > 0 {
+		// Slide the unconsumed tail to the front so the buffer can be reused.
+		rest := copy(w.inBuf, w.inBuf[pos:])
+		w.inBuf = w.inBuf[:rest]
+	}
+
+	return len(p), w.err
+}
+
+// writeBlock compresses p as a single block, writes the encoded bytes to Dest,
+// and records the result of that write in w.err. lastBlock is passed through
+// to the Encoder.
+func (w *Writer) writeBlock(p []byte, lastBlock bool) (n int, err error) {
+	w.outBuf = w.outBuf[:0]
+	w.matches = w.MatchFinder.FindMatches(w.matches[:0], p)
+	w.outBuf = w.Encoder.Encode(w.outBuf, p, w.matches, lastBlock)
+	_, w.err = w.Dest.Write(w.outBuf)
+	return len(p), w.err
+}
+
+// Close compresses any buffered input as the final block (lastBlock = true)
+// and writes it to Dest; it does not close Dest. If an earlier write to Dest
+// failed, Close returns that error without writing again, so that a later
+// successful write cannot mask it. Close is not idempotent: each call made
+// while no error is pending encodes another final block.
+func (w *Writer) Close() error {
+	if w.err == nil {
+		w.writeBlock(w.inBuf, true)
+	}
+	w.inBuf = w.inBuf[:0]
+	return w.err
+}
+
+// Reset resets the MatchFinder and Encoder, discards any buffered input and
+// pending error, and directs subsequent output to newDest.
+func (w *Writer) Reset(newDest io.Writer) {
+	w.MatchFinder.Reset()
+	w.Encoder.Reset()
+	w.err = nil
+	w.inBuf = w.inBuf[:0]
+	w.outBuf = w.outBuf[:0]
+	w.matches = w.matches[:0]
+	w.Dest = newDest
+}
diff --git a/matchfinder/textencoder.go b/matchfinder/textencoder.go
new file mode 100644
index 0000000..75ecc59
--- /dev/null
+++ b/matchfinder/textencoder.go
@@ -0,0 +1,65 @@
+package matchfinder
+
+import "fmt"
+
+// A TextEncoder is an Encoder that produces a human-readable representation of
+// the LZ77 compression. Matches are replaced with <Length,Distance> symbols.
+type TextEncoder struct{}
+
+// Reset does nothing; a TextEncoder has no state to clear.
+func (t TextEncoder) Reset() {}
+
+// Encode appends a textual rendering of src to dst and returns the extended
+// slice. For each match, its Unmatched literal bytes are copied verbatim and,
+// if Length is non-zero, the match is written as "<Length,Distance>" in
+// decimal. Any bytes of src not covered by matches are copied verbatim at the
+// end. lastBlock is ignored.
+func (t TextEncoder) Encode(dst []byte, src []byte, matches []Match, lastBlock bool) []byte {
+	pos := 0
+	for _, m := range matches {
+		if m.Unmatched > 0 {
+			dst = append(dst, src[pos:pos+m.Unmatched]...)
+			pos += m.Unmatched
+		}
+		if m.Length > 0 {
+			// Appending the string directly avoids an intermediate []byte copy.
+			dst = append(dst, fmt.Sprintf("<%d,%d>", m.Length, m.Distance)...)
+			pos += m.Length
+		}
+	}
+	if pos < len(src) {
+		dst = append(dst, src[pos:]...)
+	}
+	return dst
+}
+
+// A NoMatchFinder implements MatchFinder, but doesn't find any matches.
+// It can be used to implement the equivalent of the standard library flate package's
+// HuffmanOnly setting.
+type NoMatchFinder struct{}
+
+// Reset does nothing; a NoMatchFinder has no state to clear.
+func (n NoMatchFinder) Reset() {}
+
+// FindMatches appends a single Match to dst that marks all of src as
+// unmatched, and returns dst.
+func (n NoMatchFinder) FindMatches(dst []Match, src []byte) []Match {
+	return append(dst, Match{
+		Unmatched: len(src),
+	})
+}
+
+// AutoReset wraps a MatchFinder that can return references to data in previous
+// blocks, and calls Reset before each block. It is useful for (e.g.) using a
+// snappy Encoder with a MatchFinder designed for flate. (Snappy doesn't
+// support references between blocks.)
+type AutoReset struct {
+	MatchFinder
+}
+
+// FindMatches calls Reset on the wrapped MatchFinder and then delegates to its
+// FindMatches, so every block is searched starting from a clean state.
+func (a AutoReset) FindMatches(dst []Match, src []byte) []Match {
+	a.Reset()
+	return a.MatchFinder.FindMatches(dst, src)
+}