// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package language

import (
	"bytes"
	"errors"
	"fmt"
	"sort"
	"strconv"
	"strings"

	"golang.org/x/text/internal/tag"
)

// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit; under that precondition any byte
// greater than '9' is a letter, which is all this function checks.
func isAlpha(b byte) bool {
	return b > '9'
}

// isAlphaNum returns true if the string contains only ASCII letters or digits.
// An empty slice yields true.
func isAlphaNum(s []byte) bool {
	for _, c := range s {
		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
			return false
		}
	}
	return true
}

// errSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var errSyntax = errors.New("language: tag is not well-formed")

// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
	// v holds the offending subtag, padded with zero bytes up to 8 bytes.
	v [8]byte
}

// mkErrInvalid returns a ValueError for subtag s. At most the first 8 bytes
// of s are retained.
func mkErrInvalid(s []byte) error {
	var e ValueError
	copy(e.v[:], s)
	return e
}

// tag returns the stored subtag without its zero padding.
func (e ValueError) tag() []byte {
	n := bytes.IndexByte(e.v[:], 0)
	if n == -1 {
		// No padding byte: the subtag occupies the whole array.
		n = 8
	}
	return e.v[:n]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	return string(e.tag())
}

// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct { b []byte bytes [max99thPercentileSize]byte token []byte start int // start position of the current token end int // end position of the current token next int // next point for scan err error done bool } func makeScannerString(s string) scanner { scan := scanner{} if len(s) <= len(scan.bytes) { scan.b = scan.bytes[:copy(scan.bytes[:], s)] } else { scan.b = []byte(s) } scan.init() return scan } // makeScanner returns a scanner using b as the input buffer. // b is not copied and may be modified by the scanner routines. func makeScanner(b []byte) scanner { scan := scanner{b: b} scan.init() return scan } func (s *scanner) init() { for i, c := range s.b { if c == '_' { s.b[i] = '-' } } s.scan() } // restToLower converts the string between start and end to lower case. func (s *scanner) toLower(start, end int) { for i := start; i < end; i++ { c := s.b[i] if 'A' <= c && c <= 'Z' { s.b[i] += 'a' - 'A' } } } func (s *scanner) setError(e error) { if s.err == nil || (e == errSyntax && s.err != errSyntax) { s.err = e } } // resizeRange shrinks or grows the array at position oldStart such that // a new string of size newSize can fit between oldStart and oldEnd. // Sets the scan point to after the resized range. func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) { s.start = oldStart if end := oldStart + newSize; end != oldEnd { diff := end - oldEnd if end < cap(s.b) { b := make([]byte, len(s.b)+diff) copy(b, s.b[:oldStart]) copy(b[end:], s.b[oldEnd:]) s.b = b } else { s.b = append(s.b[end:], s.b[oldEnd:]...) } s.next = end + (s.next - s.end) s.end = end } } // replace replaces the current token with repl. func (s *scanner) replace(repl string) { s.resizeRange(s.start, s.end, len(repl)) copy(s.b[s.start:], repl) } // gobble removes the current token from the input. // Caller must call scan after calling gobble. 
func (s *scanner) gobble(e error) { s.setError(e) if s.start == 0 { s.b = s.b[:+copy(s.b, s.b[s.next:])] s.end = 0 } else { s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])] s.end = s.start - 1 } s.next = s.start } // deleteRange removes the given range from s.b before the current token. func (s *scanner) deleteRange(start, end int) { s.setError(errSyntax) s.b = s.b[:start+copy(s.b[start:], s.b[end:])] diff := end - start s.next -= diff s.start -= diff s.end -= diff } // scan parses the next token of a BCP 47 string. Tokens that are larger // than 8 characters or include non-alphanumeric characters result in an error // and are gobbled and removed from the output. // It returns the end position of the last token consumed. func (s *scanner) scan() (end int) { end = s.end s.token = nil for s.start = s.next; s.next < len(s.b); { i := bytes.IndexByte(s.b[s.next:], '-') if i == -1 { s.end = len(s.b) s.next = len(s.b) i = s.end - s.start } else { s.end = s.next + i s.next = s.end + 1 } token := s.b[s.start:s.end] if i < 1 || i > 8 || !isAlphaNum(token) { s.gobble(errSyntax) continue } s.token = token return end } if n := len(s.b); n > 0 && s.b[n-1] == '-' { s.setError(errSyntax) s.b = s.b[:len(s.b)-1] } s.done = true return end } // acceptMinSize parses multiple tokens of the given size or greater. // It returns the end position of the last token consumed. func (s *scanner) acceptMinSize(min int) (end int) { end = s.end s.scan() for ; len(s.token) >= min; s.scan() { end = s.end } return end } // Parse parses the given BCP 47 string and returns a valid Tag. If parsing // failed it returns an error and any part of the tag that could be parsed. // If parsing succeeded but an unknown value was found, it returns // ValueError. The Tag returned in this case is just stripped of the unknown // value. All other values are preserved. 
It accepts tags in the BCP 47 format // and extensions to this standard defined in // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. // The resulting tag is canonicalized using the default canonicalization type. func Parse(s string) (t Tag, err error) { return Default.Parse(s) } // Parse parses the given BCP 47 string and returns a valid Tag. If parsing // failed it returns an error and any part of the tag that could be parsed. // If parsing succeeded but an unknown value was found, it returns // ValueError. The Tag returned in this case is just stripped of the unknown // value. All other values are preserved. It accepts tags in the BCP 47 format // and extensions to this standard defined in // http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers. // The resulting tag is canonicalized using the the canonicalization type c. func (c CanonType) Parse(s string) (t Tag, err error) { // TODO: consider supporting old-style locale key-value pairs. if s == "" { return und, errSyntax } if len(s) <= maxAltTaglen { b := [maxAltTaglen]byte{} for i, c := range s { // Generating invalid UTF-8 is okay as it won't match. 
if 'A' <= c && c <= 'Z' { c += 'a' - 'A' } else if c == '_' { c = '-' } b[i] = byte(c) } if t, ok := grandfathered(b); ok { return t, nil } } scan := makeScannerString(s) t, err = parse(&scan, s) t, changed := t.canonicalize(c) if changed { t.remakeString() } return t, err } func parse(scan *scanner, s string) (t Tag, err error) { t = und var end int if n := len(scan.token); n <= 1 { scan.toLower(0, len(scan.b)) if n == 0 || scan.token[0] != 'x' { return t, errSyntax } end = parseExtensions(scan) } else if n >= 4 { return und, errSyntax } else { // the usual case t, end = parseTag(scan) if n := len(scan.token); n == 1 { t.pExt = uint16(end) end = parseExtensions(scan) } else if end < len(scan.b) { scan.setError(errSyntax) scan.b = scan.b[:end] } } if int(t.pVariant) < len(scan.b) { if end < len(s) { s = s[:end] } if len(s) > 0 && tag.Compare(s, scan.b) == 0 { t.str = s } else { t.str = string(scan.b) } } else { t.pVariant, t.pExt = 0, 0 } return t, scan.err } // parseTag parses language, script, region and variants. // It returns a Tag and the end position in the input that was parsed. func parseTag(scan *scanner) (t Tag, end int) { var e error // TODO: set an error if an unknown lang, script or region is encountered. t.lang, e = getLangID(scan.token) scan.setError(e) scan.replace(t.lang.String()) langStart := scan.start end = scan.scan() for len(scan.token) == 3 && isAlpha(scan.token[0]) { // From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent // to a tag of the form <extlang>. 
lang, e := getLangID(scan.token) if lang != 0 { t.lang = lang copy(scan.b[langStart:], lang.String()) scan.b[langStart+3] = '-' scan.start = langStart + 4 } scan.gobble(e) end = scan.scan() } if len(scan.token) == 4 && isAlpha(scan.token[0]) { t.script, e = getScriptID(script, scan.token) if t.script == 0 { scan.gobble(e) } end = scan.scan() } if n := len(scan.token); n >= 2 && n <= 3 { t.region, e = getRegionID(scan.token) if t.region == 0 { scan.gobble(e) } else { scan.replace(t.region.String()) } end = scan.scan() } scan.toLower(scan.start, len(scan.b)) t.pVariant = byte(end) end = parseVariants(scan, end, t) t.pExt = uint16(end) return t, end } var separator = []byte{'-'} // parseVariants scans tokens as long as each token is a valid variant string. // Duplicate variants are removed. func parseVariants(scan *scanner, end int, t Tag) int { start := scan.start varIDBuf := [4]uint8{} variantBuf := [4][]byte{} varID := varIDBuf[:0] variant := variantBuf[:0] last := -1 needSort := false for ; len(scan.token) >= 4; scan.scan() { // TODO: measure the impact of needing this conversion and redesign // the data structure if there is an issue. v, ok := variantIndex[string(scan.token)] if !ok { // unknown variant // TODO: allow user-defined variants? scan.gobble(mkErrInvalid(scan.token)) continue } varID = append(varID, v) variant = append(variant, scan.token) if !needSort { if last < int(v) { last = int(v) } else { needSort = true // There is no legal combinations of more than 7 variants // (and this is by no means a useful sequence). const maxVariants = 8 if len(varID) > maxVariants { break } } } end = scan.end } if needSort { sort.Sort(variantsSort{varID, variant}) k, l := 0, -1 for i, v := range varID { w := int(v) if l == w { // Remove duplicates. 
continue } varID[k] = varID[i] variant[k] = variant[i] k++ l = w } if str := bytes.Join(variant[:k], separator); len(str) == 0 { end = start - 1 } else { scan.resizeRange(start, end, len(str)) copy(scan.b[scan.start:], str) end = scan.end } } return end } type variantsSort struct { i []uint8 v [][]byte } func (s variantsSort) Len() int { return len(s.i) } func (s variantsSort) Swap(i, j int) { s.i[i], s.i[j] = s.i[j], s.i[i] s.v[i], s.v[j] = s.v[j], s.v[i] } func (s variantsSort) Less(i, j int) bool { return s.i[i] < s.i[j] } type bytesSort [][]byte func (b bytesSort) Len() int { return len(b) } func (b bytesSort) Swap(i, j int) { b[i], b[j] = b[j], b[i] } func (b bytesSort) Less(i, j int) bool { return bytes.Compare(b[i], b[j]) == -1 } // parseExtensions parses and normalizes the extensions in the buffer. // It returns the last position of scan.b that is part of any extension. // It also trims scan.b to remove excess parts accordingly. func parseExtensions(scan *scanner) int { start := scan.start exts := [][]byte{} private := []byte{} end := scan.end for len(scan.token) == 1 { extStart := scan.start ext := scan.token[0] end = parseExtension(scan) extension := scan.b[extStart:end] if len(extension) < 3 || (ext != 'x' && len(extension) < 4) { scan.setError(errSyntax) end = extStart continue } else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) { scan.b = scan.b[:end] return end } else if ext == 'x' { private = extension break } exts = append(exts, extension) } sort.Sort(bytesSort(exts)) if len(private) > 0 { exts = append(exts, private) } scan.b = scan.b[:start] if len(exts) > 0 { scan.b = append(scan.b, bytes.Join(exts, separator)...) } else if start > 0 { // Strip trailing '-'. scan.b = scan.b[:start-1] } return end } // parseExtension parses a single extension and returns the position of // the extension end. 
func parseExtension(scan *scanner) int { start, end := scan.start, scan.end switch scan.token[0] { case 'u': attrStart := end scan.scan() for last := []byte{}; len(scan.token) > 2; scan.scan() { if bytes.Compare(scan.token, last) != -1 { // Attributes are unsorted. Start over from scratch. p := attrStart + 1 scan.next = p attrs := [][]byte{} for scan.scan(); len(scan.token) > 2; scan.scan() { attrs = append(attrs, scan.token) end = scan.end } sort.Sort(bytesSort(attrs)) copy(scan.b[p:], bytes.Join(attrs, separator)) break } last = scan.token end = scan.end } var last, key []byte for attrEnd := end; len(scan.token) == 2; last = key { key = scan.token keyEnd := scan.end end = scan.acceptMinSize(3) // TODO: check key value validity if keyEnd == end || bytes.Compare(key, last) != 1 { // We have an invalid key or the keys are not sorted. // Start scanning keys from scratch and reorder. p := attrEnd + 1 scan.next = p keys := [][]byte{} for scan.scan(); len(scan.token) == 2; { keyStart, keyEnd := scan.start, scan.end end = scan.acceptMinSize(3) if keyEnd != end { keys = append(keys, scan.b[keyStart:end]) } else { scan.setError(errSyntax) end = keyStart } } sort.Sort(bytesSort(keys)) reordered := bytes.Join(keys, separator) if e := p + len(reordered); e < end { scan.deleteRange(e, end) end = e } copy(scan.b[p:], bytes.Join(keys, separator)) break } } case 't': scan.scan() if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) { _, end = parseTag(scan) scan.toLower(start, end) } for len(scan.token) == 2 && !isAlpha(scan.token[1]) { end = scan.acceptMinSize(3) } case 'x': end = scan.acceptMinSize(1) default: end = scan.acceptMinSize(2) } return end } // Compose creates a Tag from individual parts, which may be of type Tag, Base, // Script, Region, Variant, []Variant, Extension, []Extension or error. If a // Base, Script or Region or slice of type Variant or Extension is passed more // than once, the latter will overwrite the former. 
Variants and Extensions are // accumulated, but if two extensions of the same type are passed, the latter // will replace the former. A Tag overwrites all former values and typically // only makes sense as the first argument. The resulting tag is returned after // canonicalizing using the Default CanonType. If one or more errors are // encountered, one of the errors is returned. func Compose(part ...interface{}) (t Tag, err error) { return Default.Compose(part...) } // Compose creates a Tag from individual parts, which may be of type Tag, Base, // Script, Region, Variant, []Variant, Extension, []Extension or error. If a // Base, Script or Region or slice of type Variant or Extension is passed more // than once, the latter will overwrite the former. Variants and Extensions are // accumulated, but if two extensions of the same type are passed, the latter // will replace the former. A Tag overwrites all former values and typically // only makes sense as the first argument. The resulting tag is returned after // canonicalizing using CanonType c. If one or more errors are encountered, // one of the errors is returned. func (c CanonType) Compose(part ...interface{}) (t Tag, err error) { var b builder if err = b.update(part...); err != nil { return und, err } t, _ = b.tag.canonicalize(c) if len(b.ext) > 0 || len(b.variant) > 0 { sort.Sort(sortVariant(b.variant)) sort.Strings(b.ext) if b.private != "" { b.ext = append(b.ext, b.private) } n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...) buf := make([]byte, n) p := t.genCoreBytes(buf) t.pVariant = byte(p) p += appendTokens(buf[p:], b.variant...) t.pExt = uint16(p) p += appendTokens(buf[p:], b.ext...) 
t.str = string(buf[:p]) } else if b.private != "" { t.str = b.private t.remakeString() } return } type builder struct { tag Tag private string // the x extension ext []string variant []string err error } func (b *builder) addExt(e string) { if e == "" { } else if e[0] == 'x' { b.private = e } else { b.ext = append(b.ext, e) } } var errInvalidArgument = errors.New("invalid Extension or Variant") func (b *builder) update(part ...interface{}) (err error) { replace := func(l *[]string, s string, eq func(a, b string) bool) bool { if s == "" { b.err = errInvalidArgument return true } for i, v := range *l { if eq(v, s) { (*l)[i] = s return true } } return false } for _, x := range part { switch v := x.(type) { case Tag: b.tag.lang = v.lang b.tag.region = v.region b.tag.script = v.script if v.str != "" { b.variant = nil for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; { x, s = nextToken(s) b.variant = append(b.variant, x) } b.ext, b.private = nil, "" for i, e := int(v.pExt), ""; i < len(v.str); { i, e = getExtension(v.str, i) b.addExt(e) } } case Base: b.tag.lang = v.langID case Script: b.tag.script = v.scriptID case Region: b.tag.region = v.regionID case Variant: if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) { b.variant = append(b.variant, v.variant) } case Extension: if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) { b.addExt(v.s) } case []Variant: b.variant = nil for _, x := range v { b.update(x) } case []Extension: b.ext, b.private = nil, "" for _, e := range v { b.update(e) } // TODO: support parsing of raw strings based on morphology or just extensions? 
case error: err = v } } return } func tokenLen(token ...string) (n int) { for _, t := range token { n += len(t) + 1 } return } func appendTokens(b []byte, token ...string) int { p := 0 for _, t := range token { b[p] = '-' copy(b[p+1:], t) p += 1 + len(t) } return p } type sortVariant []string func (s sortVariant) Len() int { return len(s) } func (s sortVariant) Swap(i, j int) { s[j], s[i] = s[i], s[j] } func (s sortVariant) Less(i, j int) bool { return variantIndex[s[i]] < variantIndex[s[j]] } func findExt(list []string, x byte) int { for i, e := range list { if e[0] == x { return i } } return -1 } // getExtension returns the name, body and end position of the extension. func getExtension(s string, p int) (end int, ext string) { if s[p] == '-' { p++ } if s[p] == 'x' { return len(s), s[p:] } end = nextExtension(s, p) return end, s[p:end] } // nextExtension finds the next extension within the string, searching // for the -<char>- pattern from position p. // In the fast majority of cases, language tags will have at most // one extension and extensions tend to be small. func nextExtension(s string, p int) int { for n := len(s) - 3; p < n; { if s[p] == '-' { if s[p+2] == '-' { return p } p += 3 } else { p++ } } return len(s) } var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight") // ParseAcceptLanguage parses the contents of an Accept-Language header as // defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and // a list of corresponding quality weights. It is more permissive than RFC 2616 // and may return non-nil slices even if the input is not valid. // The Tags will be sorted by highest weight first and then by first occurrence. // Tags with a weight of zero will be dropped. An error will be returned if the // input could not be parsed. 
func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) { var entry string for s != "" { if entry, s = split(s, ','); entry == "" { continue } entry, weight := split(entry, ';') // Scan the language. t, err := Parse(entry) if err != nil { id, ok := acceptFallback[entry] if !ok { return nil, nil, err } t = Tag{lang: id} } // Scan the optional weight. w := 1.0 if weight != "" { weight = consume(weight, 'q') weight = consume(weight, '=') // consume returns the empty string when a token could not be // consumed, resulting in an error for ParseFloat. if w, err = strconv.ParseFloat(weight, 32); err != nil { return nil, nil, errInvalidWeight } // Drop tags with a quality weight of 0. if w <= 0 { continue } } tag = append(tag, t) q = append(q, float32(w)) } sortStable(&tagSort{tag, q}) return tag, q, nil } // consume removes a leading token c from s and returns the result or the empty // string if there is no such token. func consume(s string, c byte) string { if s == "" || s[0] != c { return "" } return strings.TrimSpace(s[1:]) } func split(s string, c byte) (head, tail string) { if i := strings.IndexByte(s, c); i >= 0 { return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:]) } return strings.TrimSpace(s), "" } // Add hack mapping to deal with a small number of cases that that occur // in Accept-Language (with reasonable frequency). var acceptFallback = map[string]langID{ "english": _en, "deutsch": _de, "italian": _it, "french": _fr, "*": _mul, // defined in the spec to match all languages. } type tagSort struct { tag []Tag q []float32 } func (s *tagSort) Len() int { return len(s.q) } func (s *tagSort) Less(i, j int) bool { return s.q[i] > s.q[j] } func (s *tagSort) Swap(i, j int) { s.tag[i], s.tag[j] = s.tag[j], s.tag[i] s.q[i], s.q[j] = s.q[j], s.q[i] }