Merge branch 'master' of github.com:goccy/go-json into feature/fix-decoder

2020-12-23 01:37:54 +09:00 · 2020-12-23 01:37:54 +09:00 · 24aa07e47f
parent 1dee747400 376ae457f3
commit 24aa07e47f
15 changed files with 2922 additions and 2170 deletions
--- a/README.md
+++ b/README.md
@ -33,17 +33,8 @@ $ go test -bench .

 **Fastest**

-### SmallStruct
-
-<img src="https://user-images.githubusercontent.com/209884/89118973-5a8cd600-d4e5-11ea-8a07-775cf3e32a2f.png"></img>
-
-### MediumStruct
-
-<img src="https://user-images.githubusercontent.com/209884/89118974-5d87c680-d4e5-11ea-8f4e-dbb01c2dd861.png"></img>
-
-### LargeStruct
-
-<img src="https://user-images.githubusercontent.com/209884/89118977-5f518a00-d4e5-11ea-8bfe-1455fc71c963.png"></img>
+<img width="700" alt="" src="https://user-images.githubusercontent.com/209884/102718073-82ac9280-4329-11eb-94f2-c5377a2feeed.png">
+<img width="700" alt="" src="https://user-images.githubusercontent.com/209884/102718071-804a3880-4329-11eb-9e70-5de74e55a553.png">

 ## Decode

--- a/benchmarks/encode_test.go
+++ b/benchmarks/encode_test.go
@ -7,6 +7,7 @@ import (
 	gojay "github.com/francoispqt/gojay"
 	gojson "github.com/goccy/go-json"
 	jsoniter "github.com/json-iterator/go"
+	segmentiojson "github.com/segmentio/encoding/json"
 )

 func Benchmark_Encode_SmallStruct_EncodingJson(b *testing.B) {
@ -37,6 +38,15 @@ func Benchmark_Encode_SmallStruct_GoJay(b *testing.B) {
 	}
 }

+func Benchmark_Encode_SmallStruct_SegmentioJson(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := segmentiojson.Marshal(NewSmallPayload()); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
 func Benchmark_Encode_SmallStruct_GoJson(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
@ -46,6 +56,57 @@ func Benchmark_Encode_SmallStruct_GoJson(b *testing.B) {
 	}
 }

+func Benchmark_Encode_SmallStructCached_EncodingJson(b *testing.B) {
+	cached := NewSmallPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := json.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_SmallStructCached_JsonIter(b *testing.B) {
+	var json = jsoniter.ConfigCompatibleWithStandardLibrary
+	cached := NewSmallPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := json.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_SmallStructCached_GoJay(b *testing.B) {
+	cached := NewSmallPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := gojay.MarshalJSONObject(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_SmallStructCached_SegmentioJson(b *testing.B) {
+	cached := NewSmallPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := segmentiojson.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_SmallStructCached_GoJson(b *testing.B) {
+	cached := NewSmallPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := gojson.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
 func Benchmark_Encode_MediumStruct_EncodingJson(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
@ -74,6 +135,15 @@ func Benchmark_Encode_MediumStruct_GoJay(b *testing.B) {
 	}
 }

+func Benchmark_Encode_MediumStruct_SegmentioJson(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := segmentiojson.Marshal(NewMediumPayload()); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
 func Benchmark_Encode_MediumStruct_GoJson(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
@ -83,6 +153,57 @@ func Benchmark_Encode_MediumStruct_GoJson(b *testing.B) {
 	}
 }

+func Benchmark_Encode_MediumStructCached_EncodingJson(b *testing.B) {
+	cached := NewMediumPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := json.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_MediumStructCached_JsonIter(b *testing.B) {
+	var json = jsoniter.ConfigCompatibleWithStandardLibrary
+	cached := NewMediumPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := json.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_MediumStructCached_GoJay(b *testing.B) {
+	cached := NewMediumPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := gojay.MarshalJSONObject(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_MediumStructCached_SegmentioJson(b *testing.B) {
+	cached := NewMediumPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := segmentiojson.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_MediumStructCached_GoJson(b *testing.B) {
+	cached := NewMediumPayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := gojson.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
 func Benchmark_Encode_LargeStruct_EncodingJson(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
@ -111,6 +232,15 @@ func Benchmark_Encode_LargeStruct_GoJay(b *testing.B) {
 	}
 }

+func Benchmark_Encode_LargeStruct_SegmentioJson(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := segmentiojson.Marshal(NewLargePayload()); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
 func Benchmark_Encode_LargeStruct_GoJson(b *testing.B) {
 	b.ReportAllocs()
 	for i := 0; i < b.N; i++ {
@ -119,3 +249,54 @@ func Benchmark_Encode_LargeStruct_GoJson(b *testing.B) {
 		}
 	}
 }
+
+func Benchmark_Encode_LargeStructCached_EncodingJson(b *testing.B) {
+	cached := NewLargePayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := json.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_LargeStructCached_JsonIter(b *testing.B) {
+	var json = jsoniter.ConfigCompatibleWithStandardLibrary
+	cached := NewLargePayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := json.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_LargeStructCached_GoJay(b *testing.B) {
+	cached := NewLargePayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := gojay.MarshalJSONObject(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_LargeStructCached_SegmentioJson(b *testing.B) {
+	cached := NewLargePayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := segmentiojson.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
+
+func Benchmark_Encode_LargeStructCached_GoJson(b *testing.B) {
+	cached := NewLargePayload()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		if _, err := gojson.Marshal(cached); err != nil {
+			b.Fatal(err)
+		}
+	}
+}
--- a/benchmarks/go.mod
+++ b/benchmarks/go.mod
@ -6,6 +6,7 @@ require (
 	github.com/francoispqt/gojay v1.2.13
 	github.com/goccy/go-json v0.0.0-00010101000000-000000000000
 	github.com/json-iterator/go v1.1.9
+	github.com/segmentio/encoding v0.2.4
 )

 replace github.com/goccy/go-json => ../
--- a/benchmarks/go.sum
+++ b/benchmarks/go.sum
@ -75,6 +75,8 @@ github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:
 github.com/prometheus/common v0.0.0-20180801064454-c7de2306084e/go.mod h1:daVV7qP5qjZbuso7PdcryaAu0sAZbrN9i7WWcTMWvro=
 github.com/prometheus/procfs v0.0.0-20180725123919-05ee40e3a273/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
 github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
+github.com/segmentio/encoding v0.2.4 h1:TQRXhTlXj4urZe3Z5QVgxs9Ad1i7GYHg9peAtjOPe28=
+github.com/segmentio/encoding v0.2.4/go.mod h1:MJjRE6bMDocliO2FyFC2Dusp+uYdBfHWh5Bw7QyExto=
 github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
 github.com/shurcooL/component v0.0.0-20170202220835-f88ec8f54cc4/go.mod h1:XhFIlyj5a1fBNx5aJTbKoIq0mNaPvOagO+HjB3EtxrY=
 github.com/shurcooL/events v0.0.0-20181021180414-410e4ca65f48/go.mod h1:5u70Mqkb5O5cxEA8nxTsgrgLehJeAw6Oc4Ab1c/P1HM=
--- a/decode_array.go
+++ b/decode_array.go
@ -35,8 +35,7 @@ func (d *arrayDecoder) decodeStream(s *stream, p unsafe.Pointer) error {
 			idx := 0
 			for {
 				s.cursor++
-				addr := uintptr(p) + uintptr(idx)*d.size
-				if err := d.valueDecoder.decodeStream(s, unsafe.Pointer(addr)); err != nil {
+				if err := d.valueDecoder.decodeStream(s, unsafe.Pointer(uintptr(p)+uintptr(idx)*d.size)); err != nil {
 					return err
 				}
 				s.skipWhiteSpace()
@ -95,8 +94,7 @@ func (d *arrayDecoder) decode(buf []byte, cursor int64, p unsafe.Pointer) (int64
 			idx := 0
 			for {
 				cursor++
-				addr := uintptr(p) + uintptr(idx)*d.size
-				c, err := d.valueDecoder.decode(buf, cursor, unsafe.Pointer(addr))
+				c, err := d.valueDecoder.decode(buf, cursor, unsafe.Pointer(uintptr(p)+uintptr(idx)*d.size))
 				if err != nil {
 					return 0, err
 				}
--- a/decode_slice.go
+++ b/decode_slice.go
@ -94,8 +94,7 @@ func (d *sliceDecoder) decodeStream(s *stream, p unsafe.Pointer) error {
 					dst := sliceHeader{data: data, len: idx, cap: cap}
 					copySlice(d.elemType, dst, src)
 				}
-				addr := uintptr(data) + uintptr(idx)*d.size
-				if err := d.valueDecoder.decodeStream(s, unsafe.Pointer(addr)); err != nil {
+				if err := d.valueDecoder.decodeStream(s, unsafe.Pointer(uintptr(data)+uintptr(idx)*d.size)); err != nil {
 					return err
 				}
 				s.skipWhiteSpace()
@ -195,8 +194,7 @@ func (d *sliceDecoder) decode(buf []byte, cursor int64, p unsafe.Pointer) (int64
 					dst := sliceHeader{data: data, len: idx, cap: cap}
 					copySlice(d.elemType, dst, src)
 				}
-				addr := uintptr(data) + uintptr(idx)*d.size
-				c, err := d.valueDecoder.decode(buf, cursor, unsafe.Pointer(addr))
+				c, err := d.valueDecoder.decode(buf, cursor, unsafe.Pointer(uintptr(data)+uintptr(idx)*d.size))
 				if err != nil {
 					return 0, err
 				}
--- a/decode_struct.go
+++ b/decode_struct.go
@ -61,8 +61,7 @@ func (d *structDecoder) decodeStream(s *stream, p unsafe.Pointer) error {
 		k := *(*string)(unsafe.Pointer(&key))
 		field, exists := d.fieldMap[k]
 		if exists {
-			addr := uintptr(p) + field.offset
-			if err := field.dec.decodeStream(s, unsafe.Pointer(addr)); err != nil {
+			if err := field.dec.decodeStream(s, unsafe.Pointer(uintptr(p)+field.offset)); err != nil {
 				return err
 			}
 		} else if s.disallowUnknownFields {
@ -116,8 +115,7 @@ func (d *structDecoder) decode(buf []byte, cursor int64, p unsafe.Pointer) (int6
 		k := *(*string)(unsafe.Pointer(&key))
 		field, exists := d.fieldMap[k]
 		if exists {
-			addr := uintptr(p) + field.offset
-			c, err := field.dec.decode(buf, cursor, unsafe.Pointer(addr))
+			c, err := field.dec.decode(buf, cursor, unsafe.Pointer(uintptr(p)+field.offset))
 			if err != nil {
 				return 0, err
 			}
--- a/encode.go
+++ b/encode.go
@ -9,12 +9,14 @@ import (
 	"reflect"
 	"strconv"
 	"sync"
+	"sync/atomic"
 	"unsafe"
 )

 // An Encoder writes JSON values to an output stream.
 type Encoder struct {
 	w                              io.Writer
+	ctx                            *encodeRuntimeContext
 	buf                            []byte
 	enabledIndent                  bool
 	enabledHTMLEscape              bool
@ -33,31 +35,32 @@ const (
 	bufSize = 1024
 )

-type opcodeMap struct {
-	sync.Map
-}
-
 type opcodeSet struct {
 	codeIndent *opcode
 	code       *opcode
-	ctx        sync.Pool
+	codeLength int
 }

-func (m *opcodeMap) get(k uintptr) *opcodeSet {
-	if v, ok := m.Load(k); ok {
-		return v.(*opcodeSet)
+func loadOpcodeMap() map[uintptr]*opcodeSet {
+	p := atomic.LoadPointer(&cachedOpcode)
+	return *(*map[uintptr]*opcodeSet)(unsafe.Pointer(&p))
+}
+
+func storeOpcodeSet(typ uintptr, set *opcodeSet, m map[uintptr]*opcodeSet) {
+	newOpcodeMap := make(map[uintptr]*opcodeSet, len(m)+1)
+	newOpcodeMap[typ] = set
+
+	for k, v := range m {
+		newOpcodeMap[k] = v
 	}
-	return nil
-}

-func (m *opcodeMap) set(k uintptr, op *opcodeSet) {
-	m.Store(k, op)
+	atomic.StorePointer(&cachedOpcode, *(*unsafe.Pointer)(unsafe.Pointer(&newOpcodeMap)))
 }

 var (
 	encPool         sync.Pool
 	codePool        sync.Pool
-	cachedOpcode    opcodeMap
+	cachedOpcode    unsafe.Pointer // map[uintptr]*opcodeSet
 	marshalJSONType reflect.Type
 	marshalTextType reflect.Type
 )
@ -66,13 +69,16 @@ func init() {
 	encPool = sync.Pool{
 		New: func() interface{} {
 			return &Encoder{
+				ctx: &encodeRuntimeContext{
+					ptrs:     make([]uintptr, 128),
+					keepRefs: make([]unsafe.Pointer, 0, 8),
+				},
 				buf:                            make([]byte, 0, bufSize),
 				structTypeToCompiledCode:       map[uintptr]*compiledCode{},
 				structTypeToCompiledIndentCode: map[uintptr]*compiledCode{},
 			}
 		},
 	}
-	cachedOpcode = opcodeMap{}
 	marshalJSONType = reflect.TypeOf((*Marshaler)(nil)).Elem()
 	marshalTextType = reflect.TypeOf((*encoding.TextMarshaler)(nil)).Elem()
 }
@ -99,7 +105,8 @@ func (e *Encoder) EncodeWithOption(v interface{}, opts ...EncodeOption) error {
 			return err
 		}
 	}
-	if err := e.encode(v); err != nil {
+	var err error
+	if e.buf, err = e.encode(v); err != nil {
 		return err
 	}
 	if e.enabledIndent {
@ -148,7 +155,8 @@ func (e *Encoder) reset() {
 }

 func (e *Encoder) encodeForMarshal(v interface{}) ([]byte, error) {
-	if err := e.encode(v); err != nil {
+	var err error
+	if e.buf, err = e.encode(v); err != nil {
 		return nil, err
 	}
 	if e.enabledIndent {
@ -161,33 +169,33 @@ func (e *Encoder) encodeForMarshal(v interface{}) ([]byte, error) {
 	return copied, nil
 }

-func (e *Encoder) encode(v interface{}) error {
+func (e *Encoder) encode(v interface{}) ([]byte, error) {
+	b := e.buf
 	if v == nil {
-		e.encodeNull()
+		b = encodeNull(b)
 		if e.enabledIndent {
-			e.encodeBytes([]byte{',', '\n'})
+			b = encodeIndentComma(b)
 		} else {
-			e.encodeByte(',')
+			b = encodeComma(b)
 		}
-		return nil
+		return b, nil
 	}
 	header := (*interfaceHeader)(unsafe.Pointer(&v))
 	typ := header.typ

 	typeptr := uintptr(unsafe.Pointer(typ))
-	if codeSet := cachedOpcode.get(typeptr); codeSet != nil {
+	opcodeMap := loadOpcodeMap()
+	if codeSet, exists := opcodeMap[typeptr]; exists {
 		var code *opcode
 		if e.enabledIndent {
 			code = codeSet.codeIndent
 		} else {
 			code = codeSet.code
 		}
-		ctx := codeSet.ctx.Get().(*encodeRuntimeContext)
+		ctx := e.ctx
 		p := uintptr(header.ptr)
-		ctx.init(p)
-		err := e.run(ctx, code)
-		codeSet.ctx.Put(ctx)
-		return err
+		ctx.init(p, codeSet.codeLength)
+		return e.run(ctx, b, code)
 	}

 	// noescape trick for header.typ ( reflect.*rtype )
@ -199,7 +207,7 @@ func (e *Encoder) encode(v interface{}) error {
 		withIndent: true,
 	})
 	if err != nil {
-		return err
+		return nil, err
 	}
 	code, err := e.compileHead(&encodeCompileContext{
 		typ:        copiedType,
@ -207,7 +215,7 @@ func (e *Encoder) encode(v interface{}) error {
 		withIndent: false,
 	})
 	if err != nil {
-		return err
+		return nil, err
 	}
 	codeIndent = copyOpcode(codeIndent)
 	code = copyOpcode(code)
@ -215,19 +223,13 @@ func (e *Encoder) encode(v interface{}) error {
 	codeSet := &opcodeSet{
 		codeIndent: codeIndent,
 		code:       code,
-		ctx: sync.Pool{
-			New: func() interface{} {
-				return &encodeRuntimeContext{
-					ptrs:     make([]uintptr, codeLength),
-					keepRefs: make([]unsafe.Pointer, 8),
+		codeLength: codeLength,
 	}
-			},
-		},
-	}
-	cachedOpcode.set(typeptr, codeSet)
+
+	storeOpcodeSet(typeptr, codeSet, opcodeMap)
 	p := uintptr(header.ptr)
-	ctx := codeSet.ctx.Get().(*encodeRuntimeContext)
-	ctx.init(p)
+	ctx := e.ctx
+	ctx.init(p, codeLength)

 	var c *opcode
 	if e.enabledIndent {
@ -236,55 +238,14 @@ func (e *Encoder) encode(v interface{}) error {
 		c = code
 	}

-	if err := e.run(ctx, c); err != nil {
-		codeSet.ctx.Put(ctx)
-		return err
+	b, err = e.run(ctx, b, c)
+	if err != nil {
+		return nil, err
 	}
-	codeSet.ctx.Put(ctx)
-	return nil
+	return b, nil
 }

-func (e *Encoder) encodeInt(v int) {
-	e.encodeInt64(int64(v))
-}
-
-func (e *Encoder) encodeInt8(v int8) {
-	e.encodeInt64(int64(v))
-}
-
-func (e *Encoder) encodeInt16(v int16) {
-	e.encodeInt64(int64(v))
-}
-
-func (e *Encoder) encodeInt32(v int32) {
-	e.encodeInt64(int64(v))
-}
-
-func (e *Encoder) encodeInt64(v int64) {
-	e.buf = strconv.AppendInt(e.buf, v, 10)
-}
-
-func (e *Encoder) encodeUint(v uint) {
-	e.encodeUint64(uint64(v))
-}
-
-func (e *Encoder) encodeUint8(v uint8) {
-	e.encodeUint64(uint64(v))
-}
-
-func (e *Encoder) encodeUint16(v uint16) {
-	e.encodeUint64(uint64(v))
-}
-
-func (e *Encoder) encodeUint32(v uint32) {
-	e.encodeUint64(uint64(v))
-}
-
-func (e *Encoder) encodeUint64(v uint64) {
-	e.buf = strconv.AppendUint(e.buf, v, 10)
-}
-
-func (e *Encoder) encodeFloat32(v float32) {
+func encodeFloat32(b []byte, v float32) []byte {
 	f64 := float64(v)
 	abs := math.Abs(f64)
 	fmt := byte('f')
@ -295,10 +256,10 @@ func (e *Encoder) encodeFloat32(v float32) {
 			fmt = 'e'
 		}
 	}
-	e.buf = strconv.AppendFloat(e.buf, f64, fmt, -1, 32)
+	return strconv.AppendFloat(b, f64, fmt, -1, 32)
 }

-func (e *Encoder) encodeFloat64(v float64) {
+func encodeFloat64(b []byte, v float64) []byte {
 	abs := math.Abs(v)
 	fmt := byte('f')
 	// Note: Must use float32 comparisons for underlying float32 value to get precise cutoffs right.
@ -307,58 +268,62 @@ func (e *Encoder) encodeFloat64(v float64) {
 			fmt = 'e'
 		}
 	}
-	e.buf = strconv.AppendFloat(e.buf, v, fmt, -1, 64)
+	return strconv.AppendFloat(b, v, fmt, -1, 64)
 }

-func (e *Encoder) encodeBool(v bool) {
-	e.buf = strconv.AppendBool(e.buf, v)
-}
-
-func (e *Encoder) encodeBytes(b []byte) {
-	e.buf = append(e.buf, b...)
-}
-
-func (e *Encoder) encodeNull() {
-	e.buf = append(e.buf, 'n', 'u', 'l', 'l')
-}
-
-func (e *Encoder) encodeKey(code *opcode) {
-	if e.enabledHTMLEscape {
-		e.encodeBytes(code.escapedKey)
-	} else {
-		e.encodeBytes(code.key)
+func encodeBool(b []byte, v bool) []byte {
+	if v {
+		return append(b, "true"...)
 	}
+	return append(b, "false"...)
 }

-func (e *Encoder) encodeString(s string) {
+func encodeBytes(dst []byte, src []byte) []byte {
+	return append(dst, src...)
+}
+
+func encodeNull(b []byte) []byte {
+	return append(b, "null"...)
+}
+
+func encodeComma(b []byte) []byte {
+	return append(b, ',')
+}
+
+func encodeIndentComma(b []byte) []byte {
+	return append(b, ',', '\n')
+}
+
+func (e *Encoder) encodeKey(b []byte, code *opcode) []byte {
 	if e.enabledHTMLEscape {
-		e.encodeEscapedString(s)
-	} else {
-		e.encodeNoEscapedString(s)
+		return append(b, code.escapedKey...)
 	}
+	return append(b, code.key...)
 }

-func (e *Encoder) encodeByteSlice(b []byte) {
-	encodedLen := base64.StdEncoding.EncodedLen(len(b))
-	e.encodeByte('"')
-	pos := len(e.buf)
-	remainLen := cap(e.buf[pos:])
+func (e *Encoder) encodeString(b []byte, s string) []byte {
+	if e.enabledHTMLEscape {
+		return encodeEscapedString(b, s)
+	}
+	return encodeNoEscapedString(b, s)
+}
+
+func encodeByteSlice(b []byte, src []byte) []byte {
+	encodedLen := base64.StdEncoding.EncodedLen(len(src))
+	b = append(b, '"')
+	pos := len(b)
+	remainLen := cap(b[pos:])
 	var buf []byte
 	if remainLen > encodedLen {
-		buf = e.buf[pos : pos+encodedLen]
+		buf = b[pos : pos+encodedLen]
 	} else {
 		buf = make([]byte, encodedLen)
 	}
-	base64.StdEncoding.Encode(buf, b)
-	e.encodeBytes(buf)
-	e.encodeByte('"')
+	base64.StdEncoding.Encode(buf, src)
+	return append(append(b, buf...), '"')
 }

-func (e *Encoder) encodeByte(b byte) {
-	e.buf = append(e.buf, b)
-}
-
-func (e *Encoder) encodeIndent(indent int) {
-	e.buf = append(e.buf, e.prefix...)
-	e.buf = append(e.buf, bytes.Repeat(e.indentStr, indent)...)
+func (e *Encoder) encodeIndent(b []byte, indent int) []byte {
+	b = append(b, e.prefix...)
+	return append(b, bytes.Repeat(e.indentStr, indent)...)
 }
--- a/encode_compile.go
+++ b/encode_compile.go
@ -1092,7 +1092,7 @@ func (e *Encoder) compileStruct(ctx *encodeCompileContext, isPtr bool) (*opcode,

 		var buf bytes.Buffer
 		enc := NewEncoder(&buf)
-		enc.encodeEscapedString(tag.key)
+		enc.buf = encodeEscapedString(enc.buf, tag.key)
 		escapedKey := fmt.Sprintf(`%s:`, string(enc.buf))
 		enc.release()
 		fieldCode := &opcode{
--- a/encode_context.go
+++ b/encode_context.go
@ -88,7 +88,10 @@ type encodeRuntimeContext struct {
 	keepRefs []unsafe.Pointer
 }

-func (c *encodeRuntimeContext) init(p uintptr) {
+func (c *encodeRuntimeContext) init(p uintptr, codelen int) {
+	if len(c.ptrs) < codelen {
+		c.ptrs = make([]uintptr, codelen)
+	}
 	c.ptrs[0] = p
 	c.keepRefs = c.keepRefs[:0]
 }
--- a/encode_int.go
+++ b/encode_int.go
@ -0,0 +1,98 @@
+package json
+
+import (
+	"unsafe"
+)
+
+var endianness int
+
+func init() {
+	var b [2]byte
+	*(*uint16)(unsafe.Pointer(&b)) = uint16(0xABCD)
+
+	switch b[0] {
+	case 0xCD:
+		endianness = 0 // LE
+	case 0xAB:
+		endianness = 1 // BE
+	default:
+		panic("could not determine endianness")
+	}
+}
+
+// "00010203...96979899" cast to []uint16
+var intLELookup = [100]uint16{
+	0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
+	0x3031, 0x3131, 0x3231, 0x3331, 0x3431, 0x3531, 0x3631, 0x3731, 0x3831, 0x3931,
+	0x3032, 0x3132, 0x3232, 0x3332, 0x3432, 0x3532, 0x3632, 0x3732, 0x3832, 0x3932,
+	0x3033, 0x3133, 0x3233, 0x3333, 0x3433, 0x3533, 0x3633, 0x3733, 0x3833, 0x3933,
+	0x3034, 0x3134, 0x3234, 0x3334, 0x3434, 0x3534, 0x3634, 0x3734, 0x3834, 0x3934,
+	0x3035, 0x3135, 0x3235, 0x3335, 0x3435, 0x3535, 0x3635, 0x3735, 0x3835, 0x3935,
+	0x3036, 0x3136, 0x3236, 0x3336, 0x3436, 0x3536, 0x3636, 0x3736, 0x3836, 0x3936,
+	0x3037, 0x3137, 0x3237, 0x3337, 0x3437, 0x3537, 0x3637, 0x3737, 0x3837, 0x3937,
+	0x3038, 0x3138, 0x3238, 0x3338, 0x3438, 0x3538, 0x3638, 0x3738, 0x3838, 0x3938,
+	0x3039, 0x3139, 0x3239, 0x3339, 0x3439, 0x3539, 0x3639, 0x3739, 0x3839, 0x3939,
+}
+
+var intBELookup = [100]uint16{
+	0x3030, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x3036, 0x3037, 0x3038, 0x3039,
+	0x3130, 0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136, 0x3137, 0x3138, 0x3139,
+	0x3230, 0x3231, 0x3232, 0x3233, 0x3234, 0x3235, 0x3236, 0x3237, 0x3238, 0x3239,
+	0x3330, 0x3331, 0x3332, 0x3333, 0x3334, 0x3335, 0x3336, 0x3337, 0x3338, 0x3339,
+	0x3430, 0x3431, 0x3432, 0x3433, 0x3434, 0x3435, 0x3436, 0x3437, 0x3438, 0x3439,
+	0x3530, 0x3531, 0x3532, 0x3533, 0x3534, 0x3535, 0x3536, 0x3537, 0x3538, 0x3539,
+	0x3630, 0x3631, 0x3632, 0x3633, 0x3634, 0x3635, 0x3636, 0x3637, 0x3638, 0x3639,
+	0x3730, 0x3731, 0x3732, 0x3733, 0x3734, 0x3735, 0x3736, 0x3737, 0x3738, 0x3739,
+	0x3830, 0x3831, 0x3832, 0x3833, 0x3834, 0x3835, 0x3836, 0x3837, 0x3838, 0x3839,
+	0x3930, 0x3931, 0x3932, 0x3933, 0x3934, 0x3935, 0x3936, 0x3937, 0x3938, 0x3939,
+}
+
+var intLookup = [2]*[100]uint16{&intLELookup, &intBELookup}
+
+func appendInt(b []byte, n int64) []byte {
+	return formatInteger(b, uint64(n), n < 0)
+}
+
+func appendUint(b []byte, n uint64) []byte {
+	return formatInteger(b, n, false)
+}
+
+func formatInteger(out []byte, n uint64, negative bool) []byte {
+	if !negative {
+		if n < 10 {
+			return append(out, byte(n+'0'))
+		} else if n < 100 {
+			u := intLELookup[n]
+			return append(out, byte(u), byte(u>>8))
+		}
+	} else {
+		n = -n
+	}
+
+	lookup := intLookup[endianness]
+
+	var b [22]byte
+	u := (*[11]uint16)(unsafe.Pointer(&b))
+	i := 11
+
+	for n >= 100 {
+		j := n % 100
+		n /= 100
+		i--
+		u[i] = lookup[j]
+	}
+
+	i--
+	u[i] = lookup[n]
+
+	i *= 2 // convert to byte index
+	if n < 10 {
+		i++ // remove leading zero
+	}
+	if negative {
+		i--
+		b[i] = '-'
+	}
+
+	return append(out, b[i:]...)
+}
--- a/encode_opcode.go
+++ b/encode_opcode.go
@ -6,7 +6,7 @@ import (
 	"unsafe"
 )

-var uintptrSize = unsafe.Sizeof(uintptr(0))
+const uintptrSize = 4 << (^uintptr(0) >> 63) // same value as unsafe.Sizeof(uintptr(0)), but an untyped constant (4 on 32-bit, 8 on 64-bit)

 type opcode struct {
 	op           opType // operation type
--- a/encode_string.go
+++ b/encode_string.go
@ -1,9 +1,352 @@
 package json

 import (
+	"math/bits"
+	"reflect"
 	"unicode/utf8"
+	"unsafe"
 )

+const (
+	lsb = 0x0101010101010101
+	msb = 0x8080808080808080
+)
+
+var needEscapeWithHTML = [256]bool{
+	'"':  true,
+	'&':  true,
+	'<':  true,
+	'>':  true,
+	'\\': true,
+	0x00: true,
+	0x01: true,
+	0x02: true,
+	0x03: true,
+	0x04: true,
+	0x05: true,
+	0x06: true,
+	0x07: true,
+	0x08: true,
+	0x09: true,
+	0x0a: true,
+	0x0b: true,
+	0x0c: true,
+	0x0d: true,
+	0x0e: true,
+	0x0f: true,
+	0x10: true,
+	0x11: true,
+	0x12: true,
+	0x13: true,
+	0x14: true,
+	0x15: true,
+	0x16: true,
+	0x17: true,
+	0x18: true,
+	0x19: true,
+	0x1a: true,
+	0x1b: true,
+	0x1c: true,
+	0x1d: true,
+	0x1e: true,
+	0x1f: true,
+	/* 0x20 - 0x7f */
+	0x80: true,
+	0x81: true,
+	0x82: true,
+	0x83: true,
+	0x84: true,
+	0x85: true,
+	0x86: true,
+	0x87: true,
+	0x88: true,
+	0x89: true,
+	0x8a: true,
+	0x8b: true,
+	0x8c: true,
+	0x8d: true,
+	0x8e: true,
+	0x8f: true,
+	0x90: true,
+	0x91: true,
+	0x92: true,
+	0x93: true,
+	0x94: true,
+	0x95: true,
+	0x96: true,
+	0x97: true,
+	0x98: true,
+	0x99: true,
+	0x9a: true,
+	0x9b: true,
+	0x9c: true,
+	0x9d: true,
+	0x9e: true,
+	0x9f: true,
+	0xa0: true,
+	0xa1: true,
+	0xa2: true,
+	0xa3: true,
+	0xa4: true,
+	0xa5: true,
+	0xa6: true,
+	0xa7: true,
+	0xa8: true,
+	0xa9: true,
+	0xaa: true,
+	0xab: true,
+	0xac: true,
+	0xad: true,
+	0xae: true,
+	0xaf: true,
+	0xb0: true,
+	0xb1: true,
+	0xb2: true,
+	0xb3: true,
+	0xb4: true,
+	0xb5: true,
+	0xb6: true,
+	0xb7: true,
+	0xb8: true,
+	0xb9: true,
+	0xba: true,
+	0xbb: true,
+	0xbc: true,
+	0xbd: true,
+	0xbe: true,
+	0xbf: true,
+	0xc0: true,
+	0xc1: true,
+	0xc2: true,
+	0xc3: true,
+	0xc4: true,
+	0xc5: true,
+	0xc6: true,
+	0xc7: true,
+	0xc8: true,
+	0xc9: true,
+	0xca: true,
+	0xcb: true,
+	0xcc: true,
+	0xcd: true,
+	0xce: true,
+	0xcf: true,
+	0xd0: true,
+	0xd1: true,
+	0xd2: true,
+	0xd3: true,
+	0xd4: true,
+	0xd5: true,
+	0xd6: true,
+	0xd7: true,
+	0xd8: true,
+	0xd9: true,
+	0xda: true,
+	0xdb: true,
+	0xdc: true,
+	0xdd: true,
+	0xde: true,
+	0xdf: true,
+	0xe0: true,
+	0xe1: true,
+	0xe2: true,
+	0xe3: true,
+	0xe4: true,
+	0xe5: true,
+	0xe6: true,
+	0xe7: true,
+	0xe8: true,
+	0xe9: true,
+	0xea: true,
+	0xeb: true,
+	0xec: true,
+	0xed: true,
+	0xee: true,
+	0xef: true,
+	0xf0: true,
+	0xf1: true,
+	0xf2: true,
+	0xf3: true,
+	0xf4: true,
+	0xf5: true,
+	0xf6: true,
+	0xf7: true,
+	0xf8: true,
+	0xf9: true,
+	0xfa: true,
+	0xfb: true,
+	0xfc: true,
+	0xfd: true,
+	0xfe: true,
+	0xff: true,
+}
+
+var needEscape = [256]bool{
+	'"':  true,
+	'\\': true,
+	0x00: true,
+	0x01: true,
+	0x02: true,
+	0x03: true,
+	0x04: true,
+	0x05: true,
+	0x06: true,
+	0x07: true,
+	0x08: true,
+	0x09: true,
+	0x0a: true,
+	0x0b: true,
+	0x0c: true,
+	0x0d: true,
+	0x0e: true,
+	0x0f: true,
+	0x10: true,
+	0x11: true,
+	0x12: true,
+	0x13: true,
+	0x14: true,
+	0x15: true,
+	0x16: true,
+	0x17: true,
+	0x18: true,
+	0x19: true,
+	0x1a: true,
+	0x1b: true,
+	0x1c: true,
+	0x1d: true,
+	0x1e: true,
+	0x1f: true,
+	/* 0x20 - 0x7f */
+	0x80: true,
+	0x81: true,
+	0x82: true,
+	0x83: true,
+	0x84: true,
+	0x85: true,
+	0x86: true,
+	0x87: true,
+	0x88: true,
+	0x89: true,
+	0x8a: true,
+	0x8b: true,
+	0x8c: true,
+	0x8d: true,
+	0x8e: true,
+	0x8f: true,
+	0x90: true,
+	0x91: true,
+	0x92: true,
+	0x93: true,
+	0x94: true,
+	0x95: true,
+	0x96: true,
+	0x97: true,
+	0x98: true,
+	0x99: true,
+	0x9a: true,
+	0x9b: true,
+	0x9c: true,
+	0x9d: true,
+	0x9e: true,
+	0x9f: true,
+	0xa0: true,
+	0xa1: true,
+	0xa2: true,
+	0xa3: true,
+	0xa4: true,
+	0xa5: true,
+	0xa6: true,
+	0xa7: true,
+	0xa8: true,
+	0xa9: true,
+	0xaa: true,
+	0xab: true,
+	0xac: true,
+	0xad: true,
+	0xae: true,
+	0xaf: true,
+	0xb0: true,
+	0xb1: true,
+	0xb2: true,
+	0xb3: true,
+	0xb4: true,
+	0xb5: true,
+	0xb6: true,
+	0xb7: true,
+	0xb8: true,
+	0xb9: true,
+	0xba: true,
+	0xbb: true,
+	0xbc: true,
+	0xbd: true,
+	0xbe: true,
+	0xbf: true,
+	0xc0: true,
+	0xc1: true,
+	0xc2: true,
+	0xc3: true,
+	0xc4: true,
+	0xc5: true,
+	0xc6: true,
+	0xc7: true,
+	0xc8: true,
+	0xc9: true,
+	0xca: true,
+	0xcb: true,
+	0xcc: true,
+	0xcd: true,
+	0xce: true,
+	0xcf: true,
+	0xd0: true,
+	0xd1: true,
+	0xd2: true,
+	0xd3: true,
+	0xd4: true,
+	0xd5: true,
+	0xd6: true,
+	0xd7: true,
+	0xd8: true,
+	0xd9: true,
+	0xda: true,
+	0xdb: true,
+	0xdc: true,
+	0xdd: true,
+	0xde: true,
+	0xdf: true,
+	0xe0: true,
+	0xe1: true,
+	0xe2: true,
+	0xe3: true,
+	0xe4: true,
+	0xe5: true,
+	0xe6: true,
+	0xe7: true,
+	0xe8: true,
+	0xe9: true,
+	0xea: true,
+	0xeb: true,
+	0xec: true,
+	0xed: true,
+	0xee: true,
+	0xef: true,
+	0xf0: true,
+	0xf1: true,
+	0xf2: true,
+	0xf3: true,
+	0xf4: true,
+	0xf5: true,
+	0xf6: true,
+	0xf7: true,
+	0xf8: true,
+	0xf9: true,
+	0xfa: true,
+	0xfb: true,
+	0xfc: true,
+	0xfd: true,
+	0xfe: true,
+	0xff: true,
+}
+
 // htmlSafeSet holds the value true if the ASCII character with the given
 // array position can be safely represented inside a JSON string, embedded
 // inside of HTML <script> tags, without any additional escaping.
@ -345,69 +688,198 @@ var safeSet = [utf8.RuneSelf]bool{

 var hex = "0123456789abcdef"

-func (e *Encoder) encodeEscapedString(s string) {
+// escapeIndex finds the index of the first char in `s` that requires escaping.
+// A char requires escaping if it's outside of the range of [0x20, 0x7F] or if
+// it is a double quote or backslash.
+// If no chars in `s` require escaping, the return value is -1.
+//
+// The string is scanned 8 bytes at a time through stringToUint64Slice; the
+// leftover tail (fewer than 8 bytes) is checked byte by byte against the
+// needEscape table.
+//
+// NOTE(review): mapping the lowest flagged bit of a chunk to the earliest
+// byte of the string assumes a little-endian host - confirm before relying
+// on this for big-endian targets.
+func escapeIndex(s string) int {
+	chunks := stringToUint64Slice(s)
+	for idx, n := range chunks {
+		// combine masks before checking for the MSB of each byte. We include
+		// `n` in the mask to check whether any of the *input* byte MSBs were
+		// set (i.e. the byte was outside the ASCII range).
+		mask := n | below(n, 0x20) | contains(n, '"') | contains(n, '\\')
+		if (mask & msb) != 0 {
+			// TrailingZeros64 only gives the byte position inside this
+			// chunk (0-7); add the chunk's own offset so that the result
+			// is an index into `s`, as documented.
+			return idx*8 + bits.TrailingZeros64(mask&msb)/8
+		}
+	}
+
 	valLen := len(s)
-	// write string, the fast path, without utf8 and escape support
-	i := 0
-	for ; i < valLen; i++ {
-		if !htmlSafeSet[s[i]] {
-			break
+	// Bytes past the last full 8-byte chunk are checked one at a time.
+	for i := len(chunks) * 8; i < valLen; i++ {
+		if needEscape[s[i]] {
+			return i
 		}
 	}
-	e.buf = append(e.buf, '"')
-	if i == valLen {
-		e.buf = append(e.buf, s...)
-		e.buf = append(e.buf, '"')
-		return
-	}
-	e.buf = append(e.buf, s[:i]...)
-	e.writeStringSlowPathWithHTMLEscaped(i, s, valLen)
+
+	return -1
 }

-func (e *Encoder) writeStringSlowPathWithHTMLEscaped(i int, s string, valLen int) {
-	start := i
-	// for the remaining parts, we process them char by char
-	for i < valLen {
-		if b := s[i]; b < utf8.RuneSelf {
-			if htmlSafeSet[b] {
-				i++
+// escapeIndexWithHTMLEscape finds the index of the first char in `s` that
+// requires escaping when HTML-sensitive characters must be escaped as well.
+// A char requires escaping if it's outside of the range of [0x20, 0x7F] or if
+// it is a double quote or backslash.
+// Also, the chars <, > and & require escaping.
+// If no chars in `s` require escaping, the return value is -1.
+//
+// The string is scanned 8 bytes at a time through stringToUint64Slice; the
+// leftover tail (fewer than 8 bytes) is checked byte by byte against the
+// needEscapeWithHTML table.
+//
+// NOTE(review): mapping the lowest flagged bit of a chunk to the earliest
+// byte of the string assumes a little-endian host - confirm before relying
+// on this for big-endian targets.
+func escapeIndexWithHTMLEscape(s string) int {
+	chunks := stringToUint64Slice(s)
+	for idx, n := range chunks {
+		// combine masks before checking for the MSB of each byte. We include
+		// `n` in the mask to check whether any of the *input* byte MSBs were
+		// set (i.e. the byte was outside the ASCII range).
+		// The first term after `n` flags bytes below 0x20; each following
+		// term flags bytes equal to one specific character.
+		mask := n | (n - (lsb * 0x20)) |
+			((n ^ (lsb * '"')) - lsb) |
+			((n ^ (lsb * '\\')) - lsb) |
+			((n ^ (lsb * '<')) - lsb) |
+			((n ^ (lsb * '>')) - lsb) |
+			((n ^ (lsb * '&')) - lsb)
+		if (mask & msb) != 0 {
+			// TrailingZeros64 only gives the byte position inside this
+			// chunk (0-7); add the chunk's own offset so that the result
+			// is an index into `s`, as documented.
+			return idx*8 + bits.TrailingZeros64(mask&msb)/8
+		}
+	}
+
+	valLen := len(s)
+	// Bytes past the last full 8-byte chunk are checked one at a time.
+	for i := len(chunks) * 8; i < valLen; i++ {
+		if needEscapeWithHTML[s[i]] {
+			return i
+		}
+	}
+
+	return -1
+}
+
+// below returns a mask that can be used to determine if any of the bytes
+// in `n` are below `b`. If a byte's MSB is set in the mask then that byte was
+// below `b`. The result is only valid if `b`, and each byte in `n`, is below
+// 0x80.
+//
+// The subtraction is done on the whole 64-bit word, so a borrow out of a
+// byte that is below `b` spills into the next more-significant byte; only
+// the least-significant flagged byte is guaranteed to be a real match.
+func below(n uint64, b byte) uint64 {
+	threshold := expand(b)
+	return n - threshold
+}
+
+// contains returns a mask that can be used to determine if any of the
+// bytes in `n` are equal to `b`. If a byte's MSB is set in the mask then
+// that byte is equal to `b`. The result is only valid if `b`, and each
+// byte in `n`, is below 0x80.
+//
+// The subtraction is done on the whole 64-bit word, so a borrow out of a
+// matching byte spills into the next more-significant byte; only the
+// least-significant flagged byte is guaranteed to be a real match.
+func contains(n uint64, b byte) uint64 {
+	// XOR turns every byte equal to `b` into zero; subtracting one from a
+	// zero byte wraps around and sets its MSB.
+	diff := n ^ expand(b)
+	return diff - lsb
+}
+
+// expand puts the specified byte into each of the 8 bytes of a uint64.
+// It does so by multiplying the byte with lsb, which is declared elsewhere
+// in this file (presumably 0x0101010101010101 - verify at its declaration).
+func expand(b byte) uint64 {
+	wide := uint64(b)
+	return wide * lsb
+}
+
+// stringToUint64Slice reinterprets the bytes of `s` as a []uint64 without
+// copying them. The slice has len(s)/8 elements, so up to 7 trailing bytes
+// of `s` are not covered by it. Each element holds 8 consecutive bytes of
+// `s` in the host's native byte order. The result aliases the string's
+// memory and must be treated as read-only.
+func stringToUint64Slice(s string) []uint64 {
+	// Fill in the header of a real slice variable instead of converting a
+	// free-standing reflect.SliceHeader literal: the unsafe package
+	// documentation states that these headers must only be used as pointers
+	// to actual slices or strings, never as plain structs, because a plain
+	// header's uintptr Data field does not keep the backing memory alive.
+	var chunks []uint64
+	hdr := (*reflect.SliceHeader)(unsafe.Pointer(&chunks))
+	hdr.Data = (*reflect.StringHeader)(unsafe.Pointer(&s)).Data
+	hdr.Len = len(s) / 8
+	hdr.Cap = len(s) / 8
+	return chunks
+}
+
+func encodeEscapedString(buf []byte, s string) []byte {
+	valLen := len(s)
+	if valLen == 0 {
+		return append(buf, `""`...)
+	}
+	buf = append(buf, '"')
+	var (
+		i, j int
+	)
+	if valLen >= 8 {
+		chunks := stringToUint64Slice(s)
+		for _, n := range chunks {
+			// combine masks before checking for the MSB of each byte. We include
+			// `n` in the mask to check whether any of the *input* byte MSBs were
+			// set (i.e. the byte was outside the ASCII range).
+			mask := n | (n - (lsb * 0x20)) |
+				((n ^ (lsb * '"')) - lsb) |
+				((n ^ (lsb * '\\')) - lsb) |
+				((n ^ (lsb * '<')) - lsb) |
+				((n ^ (lsb * '>')) - lsb) |
+				((n ^ (lsb * '&')) - lsb)
+			if (mask & msb) != 0 {
+				j = bits.TrailingZeros64(mask&msb) / 8
+				goto ESCAPE_END
+			}
+		}
+		for i := len(chunks) * 8; i < valLen; i++ {
+			if needEscapeWithHTML[s[i]] {
+				j = i
+				goto ESCAPE_END
+			}
+		}
+		// no characters requiring escaping were found.
+		return append(append(buf, s...), '"')
+	}
+ESCAPE_END:
+	for j < valLen {
+		c := s[j]
+
+		if !needEscapeWithHTML[c] {
+			// fast path: most of the time, printable ascii characters are used
+			j++
 			continue
 		}
-			if start < i {
-				e.buf = append(e.buf, s[start:i]...)
-			}
-			switch b {
+
+		switch c {
 		case '\\', '"':
-				e.buf = append(e.buf, '\\', b)
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', c)
+			i = j + 1
+			j = j + 1
+			continue
+
 		case '\n':
-				e.buf = append(e.buf, '\\', 'n')
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', 'n')
+			i = j + 1
+			j = j + 1
+			continue
+
 		case '\r':
-				e.buf = append(e.buf, '\\', 'r')
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', 'r')
+			i = j + 1
+			j = j + 1
+			continue
+
 		case '\t':
-				e.buf = append(e.buf, '\\', 't')
-			default:
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', 't')
+			i = j + 1
+			j = j + 1
+			continue
+
+		case '<', '>', '&':
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, `\u00`...)
+			buf = append(buf, hex[c>>4], hex[c&0xF])
+			i = j + 1
+			j = j + 1
+			continue
+		}
+
 		// This encodes bytes < 0x20 except for \t, \n and \r.
-				// If escapeHTML is set, it also escapes <, >, and &
-				// because they can lead to security holes when
-				// user-controlled strings are rendered into JSON
-				// and served to some browsers.
-				e.buf = append(e.buf, `\u00`...)
-				e.buf = append(e.buf, hex[b>>4], hex[b&0xF])
-			}
-			i++
-			start = i
+		if c < 0x20 {
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, `\u00`...)
+			buf = append(buf, hex[c>>4], hex[c&0xF])
+			i = j + 1
+			j = j + 1
 			continue
 		}
-		c, size := utf8.DecodeRuneInString(s[i:])
-		if c == utf8.RuneError && size == 1 {
-			if start < i {
-				e.buf = append(e.buf, s[start:i]...)
-			}
-			e.buf = append(e.buf, `\ufffd`...)
-			i++
-			start = i
+
+		r, size := utf8.DecodeRuneInString(s[j:])
+
+		if r == utf8.RuneError && size == 1 {
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, `\ufffd`...)
+			i = j + size
+			j = j + size
 			continue
 		}
+
+		switch r {
+		case '\u2028', '\u2029':
 			// U+2028 is LINE SEPARATOR.
 			// U+2029 is PARAGRAPH SEPARATOR.
 			// They are both technically valid characters in JSON strings,
@ -415,84 +887,121 @@ func (e *Encoder) writeStringSlowPathWithHTMLEscaped(i int, s string, valLen int
 			// and can lead to security holes there. It is valid JSON to
 			// escape them, so we do so unconditionally.
 			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
-		if c == '\u2028' || c == '\u2029' {
-			if start < i {
-				e.buf = append(e.buf, s[start:i]...)
-			}
-			e.buf = append(e.buf, `\u202`...)
-			e.buf = append(e.buf, hex[c&0xF])
-			i += size
-			start = i
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, `\u202`...)
+			buf = append(buf, hex[r&0xF])
+			i = j + size
+			j = j + size
 			continue
 		}
-		i += size
+
+		j += size
 	}
-	if start < len(s) {
-		e.buf = append(e.buf, s[start:]...)
-	}
-	e.buf = append(e.buf, '"')
+
+	return append(append(buf, s[i:]...), '"')
 }

-func (e *Encoder) encodeNoEscapedString(s string) {
+// encodeNoEscapedString appends `s` to `buf` as a double-quoted JSON string
+// and returns the extended slice. HTML-sensitive characters (<, > and &) are
+// written as they are.
+//
+// The following bytes are rewritten:
+//   - `"` and `\` get a backslash prefix;
+//   - newline, carriage return and tab become \n, \r and \t;
+//   - any other byte below 0x20 becomes \u00XX;
+//   - a byte that does not start a valid UTF-8 sequence becomes \ufffd;
+//   - U+2028 and U+2029 become \u2028 and \u2029.
+//
+// `i` marks the start of the pending run of bytes that can be copied
+// verbatim, `j` is the scan position. For strings of 8 bytes or more the
+// scan starts at the position reported by escapeIndex; shorter strings are
+// scanned from the beginning.
+func encodeNoEscapedString(buf []byte, s string) []byte {
 	valLen := len(s)
+	if valLen == 0 {
+		return append(buf, `""`...)
+	}
+	buf = append(buf, '"')
+	var escapeIdx int
+	if valLen >= 8 {
+		if escapeIdx = escapeIndex(s); escapeIdx < 0 {
+			return append(append(buf, s...), '"')
+		}
+	}

-	// write string, the fast path, without utf8 and escape support
 	i := 0
-	for ; i < valLen; i++ {
-		c := s[i]
-		if c <= 31 || c == '"' || c == '\\' {
-			break
-		}
-	}
-	e.buf = append(e.buf, '"')
-	if i == valLen {
-		e.buf = append(e.buf, s...)
-		e.buf = append(e.buf, '"')
-		return
-	}
-	e.buf = append(e.buf, s[:i]...)
-	e.writeStringSlowPath(i, s, valLen)
-}
+	j := escapeIdx
+	for j < valLen {
+		c := s[j]

-func (e *Encoder) writeStringSlowPath(i int, s string, valLen int) {
-	start := i
-	// for the remaining parts, we process them char by char
-	for i < valLen {
-		if b := s[i]; b < utf8.RuneSelf {
-			if safeSet[b] {
-				i++
+		if c >= 0x20 && c <= 0x7f && c != '\\' && c != '"' {
+			// fast path: most of the time, printable ascii characters are used
+			// (this includes <, > and &, which this encoder does not escape)
+			j++
 			continue
 		}
-			if start < i {
-				e.buf = append(e.buf, s[start:i]...)
-			}
-			switch b {
+
+		switch c {
 		case '\\', '"':
-				e.buf = append(e.buf, '\\', b)
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', c)
+			i = j + 1
+			j = j + 1
+			continue
+
 		case '\n':
-				e.buf = append(e.buf, '\\', 'n')
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', 'n')
+			i = j + 1
+			j = j + 1
+			continue
+
 		case '\r':
-				e.buf = append(e.buf, '\\', 'r')
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', 'r')
+			i = j + 1
+			j = j + 1
+			continue
+
 		case '\t':
-				e.buf = append(e.buf, '\\', 't')
-			default:
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, '\\', 't')
+			i = j + 1
+			j = j + 1
+			continue
+		}
+
 		// This encodes bytes < 0x20 except for \t, \n and \r.
-				// If escapeHTML is set, it also escapes <, >, and &
-				// because they can lead to security holes when
-				// user-controlled strings are rendered into JSON
-				// and served to some browsers.
-				e.buf = append(e.buf, []byte(`\u00`)...)
-				e.buf = append(e.buf, hex[b>>4], hex[b&0xF])
-			}
-			i++
-			start = i
+		if c < 0x20 {
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, `\u00`...)
+			buf = append(buf, hex[c>>4], hex[c&0xF])
+			i = j + 1
+			j = j + 1
 			continue
 		}
-		i++
+
+		r, size := utf8.DecodeRuneInString(s[j:])
+
+		if r == utf8.RuneError && size == 1 {
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, `\ufffd`...)
+			i = j + size
+			j = j + size
 			continue
 		}
-	if start < len(s) {
-		e.buf = append(e.buf, s[start:]...)
+
+		switch r {
+		case '\u2028', '\u2029':
+			// U+2028 is LINE SEPARATOR.
+			// U+2029 is PARAGRAPH SEPARATOR.
+			// They are both technically valid characters in JSON strings,
+			// but don't work in JSONP, which has to be evaluated as JavaScript,
+			// and can lead to security holes there. It is valid JSON to
+			// escape them, so we do so unconditionally.
+			// See http://timelessrepo.com/json-isnt-a-javascript-subset for discussion.
+			buf = append(buf, s[i:j]...)
+			buf = append(buf, `\u202`...)
+			buf = append(buf, hex[r&0xF])
+			i = j + size
+			j = j + size
+			continue
 		}
-	e.buf = append(e.buf, '"')
+
+		j += size
+	}
+
+	return append(append(buf, s[i:]...), '"')
 }
--- a/encode_vm.go
+++ b/encode_vm.go
--- a/json.go
+++ b/json.go
@ -393,7 +393,7 @@ func HTMLEscape(dst *bytes.Buffer, src []byte) {
 	}
 	enc := NewEncoder(dst)
 	enc.SetEscapeHTML(true)
-	enc.encode(v)
+	enc.buf, _ = enc.encode(v)
 	dst.Write(enc.buf[:len(enc.buf)-1]) // remove last ',' character
 }