// Copyright 2015 The Go Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // +build ignore // This program generates the trie for width operations. The generated table // includes width category information as well as the normalization mappings. package main import ( "bytes" "fmt" "io" "log" "math" "unicode/utf8" "golang.org/x/text/internal/gen" "golang.org/x/text/internal/triegen" ) // See gen_common.go for flags. func main() { gen.Init() genTables() genTests() gen.Repackage("gen_trieval.go", "trieval.go", "width") gen.Repackage("gen_common.go", "common_test.go", "width") } func genTables() { t := triegen.NewTrie("width") // fold and inverse mappings. See mapComment for a description of the format // of each entry. Add dummy value to make an index of 0 mean no mapping. inverse := [][4]byte{{}} mapping := map[[4]byte]int{[4]byte{}: 0} getWidthData(func(r rune, tag elem, alt rune) { idx := 0 if alt != 0 { var buf [4]byte buf[0] = byte(utf8.EncodeRune(buf[1:], alt)) s := string(r) buf[buf[0]] ^= s[len(s)-1] var ok bool if idx, ok = mapping[buf]; !ok { idx = len(mapping) if idx > math.MaxUint8 { log.Fatalf("Index %d does not fit in a byte.", idx) } mapping[buf] = idx inverse = append(inverse, buf) } } t.Insert(r, uint64(tag|elem(idx))) }) w := &bytes.Buffer{} gen.WriteUnicodeVersion(w) sz, err := t.Gen(w) if err != nil { log.Fatal(err) } sz += writeMappings(w, inverse) fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) gen.WriteGoFile(*outputFile, "width", w.Bytes()) } const inverseDataComment = ` // inverseData contains 4-byte entries of the following format: // <length> <modified UTF-8-encoded rune> <0 padding> // The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the // UTF-8 encoding of the original rune. Mappings often have the following // pattern: // A -> A (U+FF21 -> U+0041) // B -> B (U+FF22 -> U+0042) // ... // By xor-ing the last byte the same entry can be shared by many mappings. This // reduces the total number of distinct entries by about two thirds. // The resulting entry for the aforementioned mappings is // { 0x01, 0xE0, 0x00, 0x00 } // Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get // E0 ^ A1 = 41. // Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get // E0 ^ A2 = 42. // Note that because of the xor-ing, the byte sequence stored in the entry is // not valid UTF-8.` func writeMappings(w io.Writer, data [][4]byte) int { fmt.Fprintln(w, inverseDataComment) fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data)) for _, x := range data { fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3]) } fmt.Fprintln(w, "}") return len(data) * 4 } func genTests() { w := &bytes.Buffer{} fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n") getWidthData(func(r rune, tag elem, alt rune) { if alt != 0 { fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag) } }) fmt.Fprintln(w, "}") gen.WriteGoFile("runes_test.go", "width", w.Bytes()) }