Commit e7c7b54b authored by Felix Lange's avatar Felix Lange

Godeps: bump github.com/huin/goupnp to c57ae84

parent 663d4e0a
......@@ -27,7 +27,7 @@
},
{
"ImportPath": "github.com/huin/goupnp",
"Rev": "4191d8a85005844ea202fde52799681971b12dfe"
"Rev": "c57ae84388ab59076fd547f1abeab71c2edb0a21"
},
{
"ImportPath": "github.com/jackpal/go-nat-pmp",
......@@ -89,6 +89,18 @@
"ImportPath": "golang.org/x/crypto/scrypt",
"Rev": "4ed45ec682102c643324fae5dff8dab085b6c300"
},
{
"ImportPath": "golang.org/x/net/html",
"Rev": "e0403b4e005737430c05a57aac078479844f919c"
},
{
"ImportPath": "golang.org/x/text/encoding",
"Rev": "c93e7c9fff19fb9139b5ab04ce041833add0134e"
},
{
"ImportPath": "golang.org/x/text/transform",
"Rev": "c93e7c9fff19fb9139b5ab04ce041833add0134e"
},
{
"ImportPath": "gopkg.in/check.v1",
"Rev": "64131543e7896d5bcc6bd5a76287eb75ea96c673"
......
......@@ -20,6 +20,8 @@ import (
"net/http"
"net/url"
"golang.org/x/net/html/charset"
"github.com/huin/goupnp/httpu"
"github.com/huin/goupnp/ssdp"
)
......@@ -104,6 +106,7 @@ func requestXml(url string, defaultSpace string, doc interface{}) error {
decoder := xml.NewDecoder(resp.Body)
decoder.DefaultSpace = defaultSpace
decoder.CharsetReader = charset.NewReaderLabel
return decoder.Decode(doc)
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package atom provides integer codes (also known as atoms) for a fixed set of
// frequently occurring HTML strings: tag names and attribute keys such as "p"
// and "id".
//
// Sharing an atom's name between all elements with the same tag can result in
// fewer string allocations when tokenizing and parsing HTML. Integer
// comparisons are also generally faster than string comparisons.
//
// The value of an atom's particular code is not guaranteed to stay the same
// between versions of this package. Neither is any ordering guaranteed:
// whether atom.H1 < atom.H2 may also change. The codes are not guaranteed to
// be dense. The only guarantees are that e.g. looking up "div" will yield
// atom.Div, calling atom.Div.String will return "div", and atom.Div != 0.
package atom
// Atom is an integer code for a string. The zero value maps to "".
type Atom uint32
// String returns the atom's name.
func (a Atom) String() string {
start := uint32(a >> 8)
n := uint32(a & 0xff)
if start+n > uint32(len(atomText)) {
return ""
}
return atomText[start : start+n]
}
func (a Atom) string() string {
return atomText[a>>8 : a>>8+a&0xff]
}
// fnv computes the FNV hash with an arbitrary starting value h.
func fnv(h uint32, s []byte) uint32 {
for i := range s {
h ^= uint32(s[i])
h *= 16777619
}
return h
}
func match(s string, t []byte) bool {
for i, c := range t {
if s[i] != c {
return false
}
}
return true
}
// Lookup returns the atom whose name is s. It returns zero if there is no
// such atom. The lookup is case sensitive.
func Lookup(s []byte) Atom {
if len(s) == 0 || len(s) > maxAtomLen {
return 0
}
h := fnv(hash0, s)
if a := table[h&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) {
return a
}
if a := table[(h>>16)&uint32(len(table)-1)]; int(a&0xff) == len(s) && match(a.string(), s) {
return a
}
return 0
}
// String returns a string whose contents are equal to s. In that sense, it is
// equivalent to string(s) but may be more efficient.
func String(s []byte) string {
if a := Lookup(s); a != 0 {
return a.String()
}
return string(s)
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package atom
import (
"sort"
"testing"
)
func TestKnown(t *testing.T) {
for _, s := range testAtomList {
if atom := Lookup([]byte(s)); atom.String() != s {
t.Errorf("Lookup(%q) = %#x (%q)", s, uint32(atom), atom.String())
}
}
}
func TestHits(t *testing.T) {
for _, a := range table {
if a == 0 {
continue
}
got := Lookup([]byte(a.String()))
if got != a {
t.Errorf("Lookup(%q) = %#x, want %#x", a.String(), uint32(got), uint32(a))
}
}
}
func TestMisses(t *testing.T) {
testCases := []string{
"",
"\x00",
"\xff",
"A",
"DIV",
"Div",
"dIV",
"aa",
"a\x00",
"ab",
"abb",
"abbr0",
"abbr ",
" abbr",
" a",
"acceptcharset",
"acceptCharset",
"accept_charset",
"h0",
"h1h2",
"h7",
"onClick",
"λ",
// The following string has the same hash (0xa1d7fab7) as "onmouseover".
"\x00\x00\x00\x00\x00\x50\x18\xae\x38\xd0\xb7",
}
for _, tc := range testCases {
got := Lookup([]byte(tc))
if got != 0 {
t.Errorf("Lookup(%q): got %d, want 0", tc, got)
}
}
}
func TestForeignObject(t *testing.T) {
const (
afo = Foreignobject
afO = ForeignObject
sfo = "foreignobject"
sfO = "foreignObject"
)
if got := Lookup([]byte(sfo)); got != afo {
t.Errorf("Lookup(%q): got %#v, want %#v", sfo, got, afo)
}
if got := Lookup([]byte(sfO)); got != afO {
t.Errorf("Lookup(%q): got %#v, want %#v", sfO, got, afO)
}
if got := afo.String(); got != sfo {
t.Errorf("Atom(%#v).String(): got %q, want %q", afo, got, sfo)
}
if got := afO.String(); got != sfO {
t.Errorf("Atom(%#v).String(): got %q, want %q", afO, got, sfO)
}
}
func BenchmarkLookup(b *testing.B) {
sortedTable := make([]string, 0, len(table))
for _, a := range table {
if a != 0 {
sortedTable = append(sortedTable, a.String())
}
}
sort.Strings(sortedTable)
x := make([][]byte, 1000)
for i := range x {
x[i] = []byte(sortedTable[i%len(sortedTable)])
}
b.ResetTimer()
for i := 0; i < b.N; i++ {
for _, s := range x {
Lookup(s)
}
}
}
This diff is collapsed.
This diff is collapsed.
// generated by go run gen.go -test; DO NOT EDIT
package atom
var testAtomList = []string{
"a",
"abbr",
"abbr",
"accept",
"accept-charset",
"accesskey",
"action",
"address",
"align",
"alt",
"annotation",
"annotation-xml",
"applet",
"area",
"article",
"aside",
"async",
"audio",
"autocomplete",
"autofocus",
"autoplay",
"b",
"base",
"basefont",
"bdi",
"bdo",
"bgsound",
"big",
"blink",
"blockquote",
"body",
"br",
"button",
"canvas",
"caption",
"center",
"challenge",
"charset",
"checked",
"cite",
"cite",
"class",
"code",
"col",
"colgroup",
"color",
"cols",
"colspan",
"command",
"command",
"content",
"contenteditable",
"contextmenu",
"controls",
"coords",
"crossorigin",
"data",
"data",
"datalist",
"datetime",
"dd",
"default",
"defer",
"del",
"desc",
"details",
"dfn",
"dialog",
"dir",
"dirname",
"disabled",
"div",
"dl",
"download",
"draggable",
"dropzone",
"dt",
"em",
"embed",
"enctype",
"face",
"fieldset",
"figcaption",
"figure",
"font",
"footer",
"for",
"foreignObject",
"foreignobject",
"form",
"form",
"formaction",
"formenctype",
"formmethod",
"formnovalidate",
"formtarget",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"header",
"headers",
"height",
"hgroup",
"hidden",
"high",
"hr",
"href",
"hreflang",
"html",
"http-equiv",
"i",
"icon",
"id",
"iframe",
"image",
"img",
"input",
"inputmode",
"ins",
"isindex",
"ismap",
"itemid",
"itemprop",
"itemref",
"itemscope",
"itemtype",
"kbd",
"keygen",
"keytype",
"kind",
"label",
"label",
"lang",
"legend",
"li",
"link",
"list",
"listing",
"loop",
"low",
"malignmark",
"manifest",
"map",
"mark",
"marquee",
"math",
"max",
"maxlength",
"media",
"mediagroup",
"menu",
"menuitem",
"meta",
"meter",
"method",
"mglyph",
"mi",
"min",
"minlength",
"mn",
"mo",
"ms",
"mtext",
"multiple",
"muted",
"name",
"nav",
"nobr",
"noembed",
"noframes",
"noscript",
"novalidate",
"object",
"ol",
"onabort",
"onafterprint",
"onautocomplete",
"onautocompleteerror",
"onbeforeprint",
"onbeforeunload",
"onblur",
"oncancel",
"oncanplay",
"oncanplaythrough",
"onchange",
"onclick",
"onclose",
"oncontextmenu",
"oncuechange",
"ondblclick",
"ondrag",
"ondragend",
"ondragenter",
"ondragleave",
"ondragover",
"ondragstart",
"ondrop",
"ondurationchange",
"onemptied",
"onended",
"onerror",
"onfocus",
"onhashchange",
"oninput",
"oninvalid",
"onkeydown",
"onkeypress",
"onkeyup",
"onlanguagechange",
"onload",
"onloadeddata",
"onloadedmetadata",
"onloadstart",
"onmessage",
"onmousedown",
"onmousemove",
"onmouseout",
"onmouseover",
"onmouseup",
"onmousewheel",
"onoffline",
"ononline",
"onpagehide",
"onpageshow",
"onpause",
"onplay",
"onplaying",
"onpopstate",
"onprogress",
"onratechange",
"onreset",
"onresize",
"onscroll",
"onseeked",
"onseeking",
"onselect",
"onshow",
"onsort",
"onstalled",
"onstorage",
"onsubmit",
"onsuspend",
"ontimeupdate",
"ontoggle",
"onunload",
"onvolumechange",
"onwaiting",
"open",
"optgroup",
"optimum",
"option",
"output",
"p",
"param",
"pattern",
"ping",
"placeholder",
"plaintext",
"poster",
"pre",
"preload",
"progress",
"prompt",
"public",
"q",
"radiogroup",
"readonly",
"rel",
"required",
"reversed",
"rows",
"rowspan",
"rp",
"rt",
"ruby",
"s",
"samp",
"sandbox",
"scope",
"scoped",
"script",
"seamless",
"section",
"select",
"selected",
"shape",
"size",
"sizes",
"small",
"sortable",
"sorted",
"source",
"spacer",
"span",
"span",
"spellcheck",
"src",
"srcdoc",
"srclang",
"start",
"step",
"strike",
"strong",
"style",
"style",
"sub",
"summary",
"sup",
"svg",
"system",
"tabindex",
"table",
"target",
"tbody",
"td",
"template",
"textarea",
"tfoot",
"th",
"thead",
"time",
"title",
"title",
"tr",
"track",
"translate",
"tt",
"type",
"typemustmatch",
"u",
"ul",
"usemap",
"value",
"var",
"video",
"wbr",
"width",
"wrap",
"xmp",
}
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package charset provides common text encodings for HTML documents.
//
// The mapping from encoding labels to encodings is defined at
// https://encoding.spec.whatwg.org/.
package charset
import (
"bytes"
"fmt"
"io"
"mime"
"strings"
"unicode/utf8"
"golang.org/x/net/html"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/transform"
)
// Lookup returns the encoding with the specified label, and its canonical
// name. It returns nil and the empty string if label is not one of the
// standard encodings for HTML. Matching is case-insensitive and ignores
// leading and trailing whitespace.
func Lookup(label string) (e encoding.Encoding, name string) {
label = strings.ToLower(strings.Trim(label, "\t\n\r\f "))
enc := encodings[label]
return enc.e, enc.name
}
// DetermineEncoding determines the encoding of an HTML document by examining
// up to the first 1024 bytes of content and the declared Content-Type.
//
// See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
if len(content) > 1024 {
content = content[:1024]
}
for _, b := range boms {
if bytes.HasPrefix(content, b.bom) {
e, name = Lookup(b.enc)
return e, name, true
}
}
if _, params, err := mime.ParseMediaType(contentType); err == nil {
if cs, ok := params["charset"]; ok {
if e, name = Lookup(cs); e != nil {
return e, name, true
}
}
}
if len(content) > 0 {
e, name = prescan(content)
if e != nil {
return e, name, false
}
}
// Try to detect UTF-8.
// First eliminate any partial rune at the end.
for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
b := content[i]
if b < 0x80 {
break
}
if utf8.RuneStart(b) {
content = content[:i]
break
}
}
hasHighBit := false
for _, c := range content {
if c >= 0x80 {
hasHighBit = true
break
}
}
if hasHighBit && utf8.Valid(content) {
return encoding.Nop, "utf-8", false
}
// TODO: change default depending on user's locale?
return charmap.Windows1252, "windows-1252", false
}
// NewReader returns an io.Reader that converts the content of r to UTF-8.
// It calls DetermineEncoding to find out what r's encoding is.
func NewReader(r io.Reader, contentType string) (io.Reader, error) {
preview := make([]byte, 1024)
n, err := io.ReadFull(r, preview)
switch {
case err == io.ErrUnexpectedEOF:
preview = preview[:n]
r = bytes.NewReader(preview)
case err != nil:
return nil, err
default:
r = io.MultiReader(bytes.NewReader(preview), r)
}
if e, _, _ := DetermineEncoding(preview, contentType); e != encoding.Nop {
r = transform.NewReader(r, e.NewDecoder())
}
return r, nil
}
// NewReaderLabel returns a reader that converts from the specified charset to
// UTF-8. It uses Lookup to find the encoding that corresponds to label, and
// returns an error if Lookup returns nil. It is suitable for use as
// encoding/xml.Decoder's CharsetReader function.
func NewReaderLabel(label string, input io.Reader) (io.Reader, error) {
e, _ := Lookup(label)
if e == nil {
return nil, fmt.Errorf("unsupported charset: %q", label)
}
return transform.NewReader(input, e.NewDecoder()), nil
}
func prescan(content []byte) (e encoding.Encoding, name string) {
z := html.NewTokenizer(bytes.NewReader(content))
for {
switch z.Next() {
case html.ErrorToken:
return nil, ""
case html.StartTagToken, html.SelfClosingTagToken:
tagName, hasAttr := z.TagName()
if !bytes.Equal(tagName, []byte("meta")) {
continue
}
attrList := make(map[string]bool)
gotPragma := false
const (
dontKnow = iota
doNeedPragma
doNotNeedPragma
)
needPragma := dontKnow
name = ""
e = nil
for hasAttr {
var key, val []byte
key, val, hasAttr = z.TagAttr()
ks := string(key)
if attrList[ks] {
continue
}
attrList[ks] = true
for i, c := range val {
if 'A' <= c && c <= 'Z' {
val[i] = c + 0x20
}
}
switch ks {
case "http-equiv":
if bytes.Equal(val, []byte("content-type")) {
gotPragma = true
}
case "content":
if e == nil {
name = fromMetaElement(string(val))
if name != "" {
e, name = Lookup(name)
if e != nil {
needPragma = doNeedPragma
}
}
}
case "charset":
e, name = Lookup(string(val))
needPragma = doNotNeedPragma
}
}
if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
continue
}
if strings.HasPrefix(name, "utf-16") {
name = "utf-8"
e = encoding.Nop
}
if e != nil {
return e, name
}
}
}
}
func fromMetaElement(s string) string {
for s != "" {
csLoc := strings.Index(s, "charset")
if csLoc == -1 {
return ""
}
s = s[csLoc+len("charset"):]
s = strings.TrimLeft(s, " \t\n\f\r")
if !strings.HasPrefix(s, "=") {
continue
}
s = s[1:]
s = strings.TrimLeft(s, " \t\n\f\r")
if s == "" {
return ""
}
if q := s[0]; q == '"' || q == '\'' {
s = s[1:]
closeQuote := strings.IndexRune(s, rune(q))
if closeQuote == -1 {
return ""
}
return s[:closeQuote]
}
end := strings.IndexAny(s, "; \t\n\f\r")
if end == -1 {
end = len(s)
}
return s[:end]
}
return ""
}
var boms = []struct {
bom []byte
enc string
}{
{[]byte{0xfe, 0xff}, "utf-16be"},
{[]byte{0xff, 0xfe}, "utf-16le"},
{[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
}
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package charset
import (
"bytes"
"encoding/xml"
"io/ioutil"
"runtime"
"strings"
"testing"
"golang.org/x/text/transform"
)
func transformString(t transform.Transformer, s string) (string, error) {
r := transform.NewReader(strings.NewReader(s), t)
b, err := ioutil.ReadAll(r)
return string(b), err
}
var testCases = []struct {
utf8, other, otherEncoding string
}{
{"Résumé", "Résumé", "utf8"},
{"Résumé", "R\xe9sum\xe9", "latin1"},
{"これは漢字です。", "S0\x8c0o0\"oW[g0Y0\x020", "UTF-16LE"},
{"これは漢字です。", "0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16BE"},
{"Hello, world", "Hello, world", "ASCII"},
{"Gdańsk", "Gda\xf1sk", "ISO-8859-2"},
{"Ââ Čč Đđ Ŋŋ Õõ Šš Žž Åå Ää", "\xc2\xe2 \xc8\xe8 \xa9\xb9 \xaf\xbf \xd5\xf5 \xaa\xba \xac\xbc \xc5\xe5 \xc4\xe4", "ISO-8859-10"},
{"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "ISO-8859-11"},
{"latviešu", "latvie\xf0u", "ISO-8859-13"},
{"Seònaid", "Se\xf2naid", "ISO-8859-14"},
{"€1 is cheap", "\xa41 is cheap", "ISO-8859-15"},
{"românește", "rom\xe2ne\xbate", "ISO-8859-16"},
{"nutraĵo", "nutra\xbco", "ISO-8859-3"},
{"Kalâdlit", "Kal\xe2dlit", "ISO-8859-4"},
{"русский", "\xe0\xe3\xe1\xe1\xda\xd8\xd9", "ISO-8859-5"},
{"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "ISO-8859-7"},
{"Kağan", "Ka\xf0an", "ISO-8859-9"},
{"Résumé", "R\x8esum\x8e", "macintosh"},
{"Gdańsk", "Gda\xf1sk", "windows-1250"},
{"русский", "\xf0\xf3\xf1\xf1\xea\xe8\xe9", "windows-1251"},
{"Résumé", "R\xe9sum\xe9", "windows-1252"},
{"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "windows-1253"},
{"Kağan", "Ka\xf0an", "windows-1254"},
{"עִבְרִית", "\xf2\xc4\xe1\xc0\xf8\xc4\xe9\xfa", "windows-1255"},
{"العربية", "\xc7\xe1\xda\xd1\xc8\xed\xc9", "windows-1256"},
{"latviešu", "latvie\xf0u", "windows-1257"},
{"Việt", "Vi\xea\xf2t", "windows-1258"},
{"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "windows-874"},
{"русский", "\xd2\xd5\xd3\xd3\xcb\xc9\xca", "KOI8-R"},
{"українська", "\xd5\xcb\xd2\xc1\xa7\xce\xd3\xd8\xcb\xc1", "KOI8-U"},
{"Hello 常用國字標準字體表", "Hello \xb1`\xa5\u03b0\xea\xa6r\xbc\u0437\u01e6r\xc5\xe9\xaa\xed", "big5"},
{"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gbk"},
{"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gb18030"},
{"עִבְרִית", "\x81\x30\xfb\x30\x81\x30\xf6\x34\x81\x30\xf9\x33\x81\x30\xf6\x30\x81\x30\xfb\x36\x81\x30\xf6\x34\x81\x30\xfa\x31\x81\x30\xfb\x38", "gb18030"},
{"㧯", "\x82\x31\x89\x38", "gb18030"},
{"これは漢字です。", "\x82\xb1\x82\xea\x82\xcd\x8a\xbf\x8e\x9a\x82\xc5\x82\xb7\x81B", "SJIS"},
{"Hello, 世界!", "Hello, \x90\xa2\x8aE!", "SJIS"},
{"イウエオカ", "\xb2\xb3\xb4\xb5\xb6", "SJIS"},
{"これは漢字です。", "\xa4\xb3\xa4\xec\xa4\u03f4\xc1\xbb\xfa\xa4\u01e4\xb9\xa1\xa3", "EUC-JP"},
{"Hello, 世界!", "Hello, \x1b$B@$3&\x1b(B!", "ISO-2022-JP"},
{"네이트 | 즐거움의 시작, 슈파스(Spaβ) NATE", "\xb3\xd7\xc0\xcc\xc6\xae | \xc1\xf1\xb0\xc5\xbf\xf2\xc0\xc7 \xbd\xc3\xc0\xdb, \xbd\xb4\xc6\xc4\xbd\xba(Spa\xa5\xe2) NATE", "EUC-KR"},
}
func TestDecode(t *testing.T) {
for _, tc := range testCases {
e, _ := Lookup(tc.otherEncoding)
if e == nil {
t.Errorf("%s: not found", tc.otherEncoding)
continue
}
s, err := transformString(e.NewDecoder(), tc.other)
if err != nil {
t.Errorf("%s: decode %q: %v", tc.otherEncoding, tc.other, err)
continue
}
if s != tc.utf8 {
t.Errorf("%s: got %q, want %q", tc.otherEncoding, s, tc.utf8)
}
}
}
func TestEncode(t *testing.T) {
for _, tc := range testCases {
e, _ := Lookup(tc.otherEncoding)
if e == nil {
t.Errorf("%s: not found", tc.otherEncoding)
continue
}
s, err := transformString(e.NewEncoder(), tc.utf8)
if err != nil {
t.Errorf("%s: encode %q: %s", tc.otherEncoding, tc.utf8, err)
continue
}
if s != tc.other {
t.Errorf("%s: got %q, want %q", tc.otherEncoding, s, tc.other)
}
}
}
// TestNames verifies that you can pass an encoding's name to Lookup and get
// the same encoding back (except for "replacement").
func TestNames(t *testing.T) {
for _, e := range encodings {
if e.name == "replacement" {
continue
}
_, got := Lookup(e.name)
if got != e.name {
t.Errorf("got %q, want %q", got, e.name)
continue
}
}
}
var sniffTestCases = []struct {
filename, declared, want string
}{
{"HTTP-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
{"UTF-16LE-BOM.html", "", "utf-16le"},
{"UTF-16BE-BOM.html", "", "utf-16be"},
{"meta-content-attribute.html", "text/html", "iso-8859-15"},
{"meta-charset-attribute.html", "text/html", "iso-8859-15"},
{"No-encoding-declaration.html", "text/html", "utf-8"},
{"HTTP-vs-UTF-8-BOM.html", "text/html; charset=iso-8859-15", "utf-8"},
{"HTTP-vs-meta-content.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
{"HTTP-vs-meta-charset.html", "text/html; charset=iso-8859-15", "iso-8859-15"},
{"UTF-8-BOM-vs-meta-content.html", "text/html", "utf-8"},
{"UTF-8-BOM-vs-meta-charset.html", "text/html", "utf-8"},
}
func TestSniff(t *testing.T) {
switch runtime.GOOS {
case "nacl": // platforms that don't permit direct file system access
t.Skipf("not supported on %q", runtime.GOOS)
}
for _, tc := range sniffTestCases {
content, err := ioutil.ReadFile("testdata/" + tc.filename)
if err != nil {
t.Errorf("%s: error reading file: %v", tc.filename, err)
continue
}
_, name, _ := DetermineEncoding(content, tc.declared)
if name != tc.want {
t.Errorf("%s: got %q, want %q", tc.filename, name, tc.want)
continue
}
}
}
func TestReader(t *testing.T) {
switch runtime.GOOS {
case "nacl": // platforms that don't permit direct file system access
t.Skipf("not supported on %q", runtime.GOOS)
}
for _, tc := range sniffTestCases {
content, err := ioutil.ReadFile("testdata/" + tc.filename)
if err != nil {
t.Errorf("%s: error reading file: %v", tc.filename, err)
continue
}
r, err := NewReader(bytes.NewReader(content), tc.declared)
if err != nil {
t.Errorf("%s: error creating reader: %v", tc.filename, err)
continue
}
got, err := ioutil.ReadAll(r)
if err != nil {
t.Errorf("%s: error reading from charset.NewReader: %v", tc.filename, err)
continue
}
e, _ := Lookup(tc.want)
want, err := ioutil.ReadAll(transform.NewReader(bytes.NewReader(content), e.NewDecoder()))
if err != nil {
t.Errorf("%s: error decoding with hard-coded charset name: %v", tc.filename, err)
continue
}
if !bytes.Equal(got, want) {
t.Errorf("%s: got %q, want %q", tc.filename, got, want)
continue
}
}
}
var metaTestCases = []struct {
meta, want string
}{
{"", ""},
{"text/html", ""},
{"text/html; charset utf-8", ""},
{"text/html; charset=latin-2", "latin-2"},
{"text/html; charset; charset = utf-8", "utf-8"},
{`charset="big5"`, "big5"},
{"charset='shift_jis'", "shift_jis"},
}
func TestFromMeta(t *testing.T) {
for _, tc := range metaTestCases {
got := fromMetaElement(tc.meta)
if got != tc.want {
t.Errorf("%q: got %q, want %q", tc.meta, got, tc.want)
}
}
}
func TestXML(t *testing.T) {
const s = "<?xml version=\"1.0\" encoding=\"windows-1252\"?><a><Word>r\xe9sum\xe9</Word></a>"
d := xml.NewDecoder(strings.NewReader(s))
d.CharsetReader = NewReaderLabel
var a struct {
Word string
}
err := d.Decode(&a)
if err != nil {
t.Fatalf("Decode: %v", err)
}
want := "résumé"
if a.Word != want {
t.Errorf("got %q, want %q", a.Word, want)
}
}
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// Download https://encoding.spec.whatwg.org/encodings.json and use it to
// generate table.go.
import (
"encoding/json"
"fmt"
"log"
"net/http"
"strings"
)
type enc struct {
Name string
Labels []string
}
type group struct {
Encodings []enc
Heading string
}
const specURL = "https://encoding.spec.whatwg.org/encodings.json"
func main() {
resp, err := http.Get(specURL)
if err != nil {
log.Fatalf("error fetching %s: %s", specURL, err)
}
if resp.StatusCode != 200 {
log.Fatalf("error fetching %s: HTTP status %s", specURL, resp.Status)
}
defer resp.Body.Close()
var groups []group
d := json.NewDecoder(resp.Body)
err = d.Decode(&groups)
if err != nil {
log.Fatalf("error reading encodings.json: %s", err)
}
fmt.Println("// generated by go run gen.go; DO NOT EDIT")
fmt.Println()
fmt.Println("package charset")
fmt.Println()
fmt.Println("import (")
fmt.Println(`"golang.org/x/text/encoding"`)
for _, pkg := range []string{"charmap", "japanese", "korean", "simplifiedchinese", "traditionalchinese", "unicode"} {
fmt.Printf("\"golang.org/x/text/encoding/%s\"\n", pkg)
}
fmt.Println(")")
fmt.Println()
fmt.Println("var encodings = map[string]struct{e encoding.Encoding; name string} {")
for _, g := range groups {
for _, e := range g.Encodings {
goName, ok := miscNames[e.Name]
if !ok {
for k, v := range prefixes {
if strings.HasPrefix(e.Name, k) {
goName = v + e.Name[len(k):]
break
}
}
if goName == "" {
log.Fatalf("unrecognized encoding name: %s", e.Name)
}
}
for _, label := range e.Labels {
fmt.Printf("%q: {%s, %q},\n", label, goName, e.Name)
}
}
}
fmt.Println("}")
}
var prefixes = map[string]string{
"iso-8859-": "charmap.ISO8859_",
"windows-": "charmap.Windows",
}
var miscNames = map[string]string{
"utf-8": "encoding.Nop",
"ibm866": "charmap.CodePage866",
"iso-8859-8-i": "charmap.ISO8859_8",
"koi8-r": "charmap.KOI8R",
"koi8-u": "charmap.KOI8U",
"macintosh": "charmap.Macintosh",
"x-mac-cyrillic": "charmap.MacintoshCyrillic",
"gbk": "simplifiedchinese.GBK",
"gb18030": "simplifiedchinese.GB18030",
"hz-gb-2312": "simplifiedchinese.HZGB2312",
"big5": "traditionalchinese.Big5",
"euc-jp": "japanese.EUCJP",
"iso-2022-jp": "japanese.ISO2022JP",
"shift_jis": "japanese.ShiftJIS",
"euc-kr": "korean.EUCKR",
"replacement": "encoding.Replacement",
"utf-16be": "unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)",
"utf-16le": "unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)",
"x-user-defined": "charmap.XUserDefined",
}
This diff is collapsed.
<!DOCTYPE html>
<html lang="en" >
<head>
<title>HTTP charset</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="The character encoding of a page can be set using the HTTP header charset declaration.">
<style type='text/css'>
.test div { width: 50px; }</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
</head>
<body>
<p class='title'>HTTP charset</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">The character encoding of a page can be set using the HTTP header charset declaration.</p>
<div class="notes"><p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p><p>The only character encoding declaration for this HTML file is in the HTTP header, which sets the encoding to ISO 8859-15.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-003">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-001<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-001" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
<!DOCTYPE html>
<html lang="en" >
<head>
<title>HTTP vs UTF-8 BOM</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="A character encoding set in the HTTP header has lower precedence than the UTF-8 signature.">
<style type='text/css'>
.test div { width: 50px; }</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
</head>
<body>
<p class='title'>HTTP vs UTF-8 BOM</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">A character encoding set in the HTTP header has lower precedence than the UTF-8 signature.</p>
<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p><p>If the test is unsuccessful, the characters &#x00EF;&#x00BB;&#x00BF; should appear at the top of the page. These represent the bytes that make up the UTF-8 signature when encountered in the ISO 8859-15 encoding.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-022">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-034<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-034" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
<!DOCTYPE html>
<html lang="en" >
<head>
<meta charset="iso-8859-1" > <title>HTTP vs meta charset</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.">
<style type='text/css'>
.test div { width: 50px; }.test div { width: 90px; }
</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
</head>
<body>
<p class='title'>HTTP vs meta charset</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">The HTTP header has a higher precedence than an encoding declaration in a meta charset attribute.</p>
<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-1.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-037">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-018<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-018" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
<!DOCTYPE html>
<html lang="en" >
<head>
<meta http-equiv="content-type" content="text/html;charset=iso-8859-1" > <title>HTTP vs meta content</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.">
<style type='text/css'>
.test div { width: 50px; }.test div { width: 90px; }
</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
</head>
<body>
<p class='title'>HTTP vs meta content</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">The HTTP header has a higher precedence than an encoding declaration in a meta content attribute.</p>
<div class="notes"><p><p>The HTTP header attempts to set the character encoding to ISO 8859-15. The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-1.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-018">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-016<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-016" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
<!DOCTYPE html>
<html lang="en" >
<head>
<title>No encoding declaration</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8.">
<style type='text/css'>
.test div { width: 50px; }</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
</head>
<body>
<p class='title'>No encoding declaration</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">A page with no encoding information in HTTP, BOM, XML declaration or meta element will be treated as UTF-8.</p>
<div class="notes"><p><p>The test on this page contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-034">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-015<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-015" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
These test cases come from
http://www.w3.org/International/tests/repository/html5/the-input-byte-stream/results-basics
Distributed under both the W3C Test Suite License
(http://www.w3.org/Consortium/Legal/2008/04-testsuite-license)
and the W3C 3-clause BSD License
(http://www.w3.org/Consortium/Legal/2008/03-bsd-license).
To contribute to a W3C Test Suite, see the policies and contribution
forms (http://www.w3.org/2004/10/27-testcases).
<!DOCTYPE html>
<html lang="en" >
<head>
<meta charset="iso-8859-15"> <title>UTF-8 BOM vs meta charset</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding.">
<style type='text/css'>
.test div { width: 50px; }.test div { width: 90px; }
</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
</head>
<body>
<p class='title'>UTF-8 BOM vs meta charset</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta charset attribute declares a different encoding.</p>
<div class="notes"><p><p>The page contains an encoding declaration in a meta charset attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-024">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-038<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-038" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
<!DOCTYPE html>
<html lang="en" >
<head>
<meta http-equiv="content-type" content="text/html; charset=iso-8859-15"> <title>UTF-8 BOM vs meta content</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding.">
<style type='text/css'>
.test div { width: 50px; }</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-utf8.css">
</head>
<body>
<p class='title'>UTF-8 BOM vs meta content</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">A page with a UTF-8 BOM will be recognized as UTF-8 even if the meta content attribute declares a different encoding.</p>
<div class="notes"><p><p>The page contains an encoding declaration in a meta content attribute that attempts to set the character encoding to ISO 8859-15, but the file starts with a UTF-8 signature.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00FD;&#x00E4;&#x00E8;</code>. This matches the sequence of bytes above when they are interpreted as UTF-8. If the class name matches the selector then the test will pass.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-038">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-037<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#precedence" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-037" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
<!DOCTYPE html>
<html lang="en" >
<head>
<meta charset="iso-8859-15"> <title>meta charset attribute</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="The character encoding of the page can be set by a meta element with charset attribute.">
<style type='text/css'>
.test div { width: 50px; }</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
</head>
<body>
<p class='title'>meta charset attribute</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">The character encoding of the page can be set by a meta element with charset attribute.</p>
<div class="notes"><p><p>The only character encoding declaration for this HTML file is in the charset attribute of the meta element, which declares the encoding to be ISO 8859-15.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-015">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-009<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-009" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
<!DOCTYPE html>
<html lang="en" >
<head>
<meta http-equiv="content-type" content="text/html; charset=iso-8859-15"> <title>meta content attribute</title>
<link rel='author' title='Richard Ishida' href='mailto:ishida@w3.org'>
<link rel='help' href='http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream'>
<link rel="stylesheet" type="text/css" href="./generatedtests.css">
<script src="http://w3c-test.org/resources/testharness.js"></script>
<script src="http://w3c-test.org/resources/testharnessreport.js"></script>
<meta name='flags' content='http'>
<meta name="assert" content="The character encoding of the page can be set by a meta element with http-equiv and content attributes.">
<style type='text/css'>
.test div { width: 50px; }</style>
<link rel="stylesheet" type="text/css" href="the-input-byte-stream/support/encodingtests-15.css">
</head>
<body>
<p class='title'>meta content attribute</p>
<div id='log'></div>
<div class='test'><div id='box' class='ýäè'>&#xA0;</div></div>
<div class='description'>
<p class="assertion" title="Assertion">The character encoding of the page can be set by a meta element with http-equiv and content attributes.</p>
<div class="notes"><p><p>The only character encoding declaration for this HTML file is in the content attribute of the meta element, which declares the encoding to be ISO 8859-15.</p><p>The test contains a div with a class name that contains the following sequence of bytes: 0xC3 0xBD 0xC3 0xA4 0xC3 0xA8. These represent different sequences of characters in ISO 8859-15, ISO 8859-1 and UTF-8. The external, UTF-8-encoded stylesheet contains a selector <code>.test div.&#x00C3;&#x0153;&#x00C3;&#x20AC;&#x00C3;&#x0161;</code>. This matches the sequence of bytes above when they are interpreted as ISO 8859-15. If the class name matches the selector then the test will pass.</p></p>
</div>
</div>
<div class="nexttest"><div><a href="generate?test=the-input-byte-stream-009">Next test</a></div><div class="doctype">HTML5</div>
<p class="jump">the-input-byte-stream-007<br /><a href="/International/tests/html5/the-input-byte-stream/results-basics#basics" target="_blank">Result summary &amp; related tests</a><br /><a href="http://w3c-test.org/framework/details/i18n-html5/the-input-byte-stream-007" target="_blank">Detailed results for this test</a><br/> <a href="http://www.w3.org/TR/html5/syntax.html#the-input-byte-stream" target="_blank">Link to spec</a></p>
<div class='prereq'>Assumptions: <ul><li>The default encoding for the browser you are testing is not set to ISO 8859-15.</li>
<li>The test is read from a server that supports HTTP.</li></ul></div>
</div>
<script>
test(function() {
assert_equals(document.getElementById('box').offsetWidth, 100);
}, " ");
</script>
</body>
</html>
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
// Section 12.2.3.2 of the HTML5 specification says "The following elements
// have varying levels of special parsing rules".
// https://html.spec.whatwg.org/multipage/syntax.html#the-stack-of-open-elements
var isSpecialElementMap = map[string]bool{
"address": true,
"applet": true,
"area": true,
"article": true,
"aside": true,
"base": true,
"basefont": true,
"bgsound": true,
"blockquote": true,
"body": true,
"br": true,
"button": true,
"caption": true,
"center": true,
"col": true,
"colgroup": true,
"dd": true,
"details": true,
"dir": true,
"div": true,
"dl": true,
"dt": true,
"embed": true,
"fieldset": true,
"figcaption": true,
"figure": true,
"footer": true,
"form": true,
"frame": true,
"frameset": true,
"h1": true,
"h2": true,
"h3": true,
"h4": true,
"h5": true,
"h6": true,
"head": true,
"header": true,
"hgroup": true,
"hr": true,
"html": true,
"iframe": true,
"img": true,
"input": true,
"isindex": true,
"li": true,
"link": true,
"listing": true,
"marquee": true,
"menu": true,
"meta": true,
"nav": true,
"noembed": true,
"noframes": true,
"noscript": true,
"object": true,
"ol": true,
"p": true,
"param": true,
"plaintext": true,
"pre": true,
"script": true,
"section": true,
"select": true,
"source": true,
"style": true,
"summary": true,
"table": true,
"tbody": true,
"td": true,
"template": true,
"textarea": true,
"tfoot": true,
"th": true,
"thead": true,
"title": true,
"tr": true,
"track": true,
"ul": true,
"wbr": true,
"xmp": true,
}
func isSpecialElement(element *Node) bool {
switch element.Namespace {
case "", "html":
return isSpecialElementMap[element.Data]
case "svg":
return element.Data == "foreignObject"
}
return false
}
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package html implements an HTML5-compliant tokenizer and parser.
Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
caller's responsibility to ensure that r provides UTF-8 encoded HTML.
z := html.NewTokenizer(r)
Given a Tokenizer z, the HTML is tokenized by repeatedly calling z.Next(),
which parses the next token and returns its type, or an error:
for {
tt := z.Next()
if tt == html.ErrorToken {
// ...
return ...
}
// Process the current token.
}
There are two APIs for retrieving the current token. The high-level API is to
call Token; the low-level API is to call Text or TagName / TagAttr. Both APIs
allow optionally calling Raw after Next but before Token, Text, TagName, or
TagAttr. In EBNF notation, the valid call sequence per token is:
Next {Raw} [ Token | Text | TagName {TagAttr} ]
Token returns an independent data structure that completely describes a token.
Entities (such as "&lt;") are unescaped, tag names and attribute keys are
lower-cased, and attributes are collected into a []Attribute. For example:
for {
if z.Next() == html.ErrorToken {
// Returning io.EOF indicates success.
return z.Err()
}
emitToken(z.Token())
}
The low-level API performs fewer allocations and copies, but the contents of
the []byte values returned by Text, TagName and TagAttr may change on the next
call to Next. For example, to extract an HTML page's anchor text:
depth := 0
for {
tt := z.Next()
switch tt {
case ErrorToken:
return z.Err()
case TextToken:
if depth > 0 {
// emitBytes should copy the []byte it receives,
// if it doesn't process it immediately.
emitBytes(z.Text())
}
case StartTagToken, EndTagToken:
tn, _ := z.TagName()
if len(tn) == 1 && tn[0] == 'a' {
if tt == StartTagToken {
depth++
} else {
depth--
}
}
}
}
Parsing is done by calling Parse with an io.Reader, which returns the root of
the parse tree (the document element) as a *Node. It is the caller's
responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
example, to process each anchor node in depth-first order:
doc, err := html.Parse(r)
if err != nil {
// ...
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
// Do something with n...
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
The relevant specifications include:
https://html.spec.whatwg.org/multipage/syntax.html and
https://html.spec.whatwg.org/multipage/syntax.html#tokenization
*/
package html
// The tokenization algorithm implemented by this package is not a line-by-line
// transliteration of the relatively verbose state-machine in the WHATWG
// specification. A more direct approach is used instead, where the program
// counter implies the state, such as whether it is tokenizing a tag or a text
// node. Specification compliance is verified by checking expected and actual
// outputs over a test suite rather than aiming for algorithmic fidelity.
// TODO(nigeltao): Does a DOM API belong in this package or a separate one?
// TODO(nigeltao): How does parsing interact with a JavaScript engine?
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"strings"
)
// parseDoctype parses the data from a DoctypeToken into a name,
// public identifier, and system identifier. It returns a Node whose Type
// is DoctypeNode, whose Data is the name, and which has attributes
// named "system" and "public" for the two identifiers if they were present.
// quirks is whether the document should be parsed in "quirks mode".
func parseDoctype(s string) (n *Node, quirks bool) {
n = &Node{Type: DoctypeNode}
// Find the name.
space := strings.IndexAny(s, whitespace)
if space == -1 {
space = len(s)
}
n.Data = s[:space]
// The comparison to "html" is case-sensitive.
if n.Data != "html" {
quirks = true
}
n.Data = strings.ToLower(n.Data)
s = strings.TrimLeft(s[space:], whitespace)
if len(s) < 6 {
// It can't start with "PUBLIC" or "SYSTEM".
// Ignore the rest of the string.
return n, quirks || s != ""
}
key := strings.ToLower(s[:6])
s = s[6:]
for key == "public" || key == "system" {
s = strings.TrimLeft(s, whitespace)
if s == "" {
break
}
quote := s[0]
if quote != '"' && quote != '\'' {
break
}
s = s[1:]
q := strings.IndexRune(s, rune(quote))
var id string
if q == -1 {
id = s
s = ""
} else {
id = s[:q]
s = s[q+1:]
}
n.Attr = append(n.Attr, Attribute{Key: key, Val: id})
if key == "public" {
key = "system"
} else {
key = ""
}
}
if key != "" || s != "" {
quirks = true
} else if len(n.Attr) > 0 {
if n.Attr[0].Key == "public" {
public := strings.ToLower(n.Attr[0].Val)
switch public {
case "-//w3o//dtd w3 html strict 3.0//en//", "-/w3d/dtd html 4.0 transitional/en", "html":
quirks = true
default:
for _, q := range quirkyIDs {
if strings.HasPrefix(public, q) {
quirks = true
break
}
}
}
// The following two public IDs only cause quirks mode if there is no system ID.
if len(n.Attr) == 1 && (strings.HasPrefix(public, "-//w3c//dtd html 4.01 frameset//") ||
strings.HasPrefix(public, "-//w3c//dtd html 4.01 transitional//")) {
quirks = true
}
}
if lastAttr := n.Attr[len(n.Attr)-1]; lastAttr.Key == "system" &&
strings.ToLower(lastAttr.Val) == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd" {
quirks = true
}
}
return n, quirks
}
// quirkyIDs is a list of public doctype identifiers that cause a document
// to be interpreted in quirks mode. The identifiers should be in lower case.
var quirkyIDs = []string{
"+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//",
}
This diff is collapsed.
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"testing"
"unicode/utf8"
)
func TestEntityLength(t *testing.T) {
// We verify that the length of UTF-8 encoding of each value is <= 1 + len(key).
// The +1 comes from the leading "&". This property implies that the length of
// unescaped text is <= the length of escaped text.
for k, v := range entity {
if 1+len(k) < utf8.RuneLen(v) {
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v))
}
if len(k) > longestEntityWithoutSemicolon && k[len(k)-1] != ';' {
t.Errorf("entity name %s is %d characters, but longestEntityWithoutSemicolon=%d", k, len(k), longestEntityWithoutSemicolon)
}
}
for k, v := range entity2 {
if 1+len(k) < utf8.RuneLen(v[0])+utf8.RuneLen(v[1]) {
t.Error("escaped entity &" + k + " is shorter than its UTF-8 encoding " + string(v[0]) + string(v[1]))
}
}
}
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"bytes"
"strings"
"unicode/utf8"
)
// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
var replacementTable = [...]rune{
'\u20AC', // First entry is what 0x80 should be replaced with.
'\u0081',
'\u201A',
'\u0192',
'\u201E',
'\u2026',
'\u2020',
'\u2021',
'\u02C6',
'\u2030',
'\u0160',
'\u2039',
'\u0152',
'\u008D',
'\u017D',
'\u008F',
'\u0090',
'\u2018',
'\u2019',
'\u201C',
'\u201D',
'\u2022',
'\u2013',
'\u2014',
'\u02DC',
'\u2122',
'\u0161',
'\u203A',
'\u0153',
'\u009D',
'\u017E',
'\u0178', // Last entry is 0x9F.
// 0x00->'\uFFFD' is handled programmatically.
// 0x0D->'\u000D' is a no-op.
}
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
// attribute should be true if parsing an attribute value.
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
// i starts at 1 because we already know that s[0] == '&'.
i, s := 1, b[src:]
if len(s) <= 1 {
b[dst] = b[src]
return dst + 1, src + 1
}
if s[i] == '#' {
if len(s) <= 3 { // We need to have at least "&#.".
b[dst] = b[src]
return dst + 1, src + 1
}
i++
c := s[i]
hex := false
if c == 'x' || c == 'X' {
hex = true
i++
}
x := '\x00'
for i < len(s) {
c = s[i]
i++
if hex {
if '0' <= c && c <= '9' {
x = 16*x + rune(c) - '0'
continue
} else if 'a' <= c && c <= 'f' {
x = 16*x + rune(c) - 'a' + 10
continue
} else if 'A' <= c && c <= 'F' {
x = 16*x + rune(c) - 'A' + 10
continue
}
} else if '0' <= c && c <= '9' {
x = 10*x + rune(c) - '0'
continue
}
if c != ';' {
i--
}
break
}
if i <= 3 { // No characters matched.
b[dst] = b[src]
return dst + 1, src + 1
}
if 0x80 <= x && x <= 0x9F {
// Replace characters from Windows-1252 with UTF-8 equivalents.
x = replacementTable[x-0x80]
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
// Replace invalid characters with the replacement character.
x = '\uFFFD'
}
return dst + utf8.EncodeRune(b[dst:], x), src + i
}
// Consume the maximum number of characters possible, with the
// consumed characters matching one of the named references.
for i < len(s) {
c := s[i]
i++
// Lower-cased characters are more common in entities, so we check for them first.
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
continue
}
if c != ';' {
i--
}
break
}
entityName := string(s[1:i])
if entityName == "" {
// No-op.
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
// No-op.
} else if x := entity[entityName]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
} else if x := entity2[entityName]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
} else if !attribute {
maxLen := len(entityName) - 1
if maxLen > longestEntityWithoutSemicolon {
maxLen = longestEntityWithoutSemicolon
}
for j := maxLen; j > 1; j-- {
if x := entity[entityName[:j]]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
}
}
}
dst1, src1 = dst+i, src+i
copy(b[dst:dst1], b[src:src1])
return dst1, src1
}
// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
// attribute should be true if parsing an attribute value.
func unescape(b []byte, attribute bool) []byte {
for i, c := range b {
if c == '&' {
dst, src := unescapeEntity(b, i, i, attribute)
for src < len(b) {
c := b[src]
if c == '&' {
dst, src = unescapeEntity(b, dst, src, attribute)
} else {
b[dst] = c
dst, src = dst+1, src+1
}
}
return b[0:dst]
}
}
return b
}
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
func lower(b []byte) []byte {
for i, c := range b {
if 'A' <= c && c <= 'Z' {
b[i] = c + 'a' - 'A'
}
}
return b
}
const escapedChars = "&'<>\"\r"
func escape(w writer, s string) error {
i := strings.IndexAny(s, escapedChars)
for i != -1 {
if _, err := w.WriteString(s[:i]); err != nil {
return err
}
var esc string
switch s[i] {
case '&':
esc = "&amp;"
case '\'':
// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
esc = "&#39;"
case '<':
esc = "&lt;"
case '>':
esc = "&gt;"
case '"':
// "&#34;" is shorter than "&quot;".
esc = "&#34;"
case '\r':
esc = "&#13;"
default:
panic("unrecognized escape character")
}
s = s[i+1:]
if _, err := w.WriteString(esc); err != nil {
return err
}
i = strings.IndexAny(s, escapedChars)
}
_, err := w.WriteString(s)
return err
}
// EscapeString escapes special characters like "<" to become "&lt;". It
// escapes only five such characters: <, >, &, ' and ".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func EscapeString(s string) string {
if strings.IndexAny(s, escapedChars) == -1 {
return s
}
var buf bytes.Buffer
escape(&buf, s)
return buf.String()
}
// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
// larger range of entities than EscapeString escapes. For example, "&aacute;"
// unescapes to "á", as does "&#225;" and "&xE1;".
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
// always true.
func UnescapeString(s string) string {
for _, c := range s {
if c == '&' {
return string(unescape([]byte(s), false))
}
}
return s
}
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import "testing"
type unescapeTest struct {
// A short description of the test case.
desc string
// The HTML text.
html string
// The unescaped text.
unescaped string
}
var unescapeTests = []unescapeTest{
// Handle no entities.
{
"copy",
"A\ttext\nstring",
"A\ttext\nstring",
},
// Handle simple named entities.
{
"simple",
"&amp; &gt; &lt;",
"& > <",
},
// Handle hitting the end of the string.
{
"stringEnd",
"&amp &amp",
"& &",
},
// Handle entities with two codepoints.
{
"multiCodepoint",
"text &gesl; blah",
"text \u22db\ufe00 blah",
},
// Handle decimal numeric entities.
{
"decimalEntity",
"Delta = &#916; ",
"Delta = Δ ",
},
// Handle hexadecimal numeric entities.
{
"hexadecimalEntity",
"Lambda = &#x3bb; = &#X3Bb ",
"Lambda = λ = λ ",
},
// Handle numeric early termination.
{
"numericEnds",
"&# &#x &#128;43 &copy = &#169f = &#xa9",
"&# &#x €43 © = ©f = ©",
},
// Handle numeric ISO-8859-1 entity replacements.
{
"numericReplacements",
"Footnote&#x87;",
"Footnote‡",
},
}
func TestUnescape(t *testing.T) {
for _, tt := range unescapeTests {
unescaped := UnescapeString(tt.html)
if unescaped != tt.unescaped {
t.Errorf("TestUnescape %s: want %q, got %q", tt.desc, tt.unescaped, unescaped)
}
}
}
func TestUnescapeEscape(t *testing.T) {
ss := []string{
``,
`abc def`,
`a & b`,
`a&amp;b`,
`a &amp b`,
`&quot;`,
`"`,
`"<&>"`,
`&quot;&lt;&amp;&gt;&quot;`,
`3&5==1 && 0<1, "0&lt;1", a+acute=&aacute;`,
`The special characters are: <, >, &, ' and "`,
}
for _, s := range ss {
if got := UnescapeString(EscapeString(s)); got != s {
t.Errorf("got %q want %q", got, s)
}
}
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This example demonstrates parsing HTML data and walking the resulting tree.
package html_test
import (
"fmt"
"log"
"strings"
"golang.org/x/net/html"
)
func ExampleParse() {
s := `<p>Links:</p><ul><li><a href="foo">Foo</a><li><a href="/bar/baz">BarBaz</a></ul>`
doc, err := html.Parse(strings.NewReader(s))
if err != nil {
log.Fatal(err)
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
for _, a := range n.Attr {
if a.Key == "href" {
fmt.Println(a.Val)
break
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
// Output:
// foo
// /bar/baz
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"strings"
)
func adjustAttributeNames(aa []Attribute, nameMap map[string]string) {
for i := range aa {
if newName, ok := nameMap[aa[i].Key]; ok {
aa[i].Key = newName
}
}
}
func adjustForeignAttributes(aa []Attribute) {
for i, a := range aa {
if a.Key == "" || a.Key[0] != 'x' {
continue
}
switch a.Key {
case "xlink:actuate", "xlink:arcrole", "xlink:href", "xlink:role", "xlink:show",
"xlink:title", "xlink:type", "xml:base", "xml:lang", "xml:space", "xmlns:xlink":
j := strings.Index(a.Key, ":")
aa[i].Namespace = a.Key[:j]
aa[i].Key = a.Key[j+1:]
}
}
}
func htmlIntegrationPoint(n *Node) bool {
if n.Type != ElementNode {
return false
}
switch n.Namespace {
case "math":
if n.Data == "annotation-xml" {
for _, a := range n.Attr {
if a.Key == "encoding" {
val := strings.ToLower(a.Val)
if val == "text/html" || val == "application/xhtml+xml" {
return true
}
}
}
}
case "svg":
switch n.Data {
case "desc", "foreignObject", "title":
return true
}
}
return false
}
func mathMLTextIntegrationPoint(n *Node) bool {
if n.Namespace != "math" {
return false
}
switch n.Data {
case "mi", "mo", "mn", "ms", "mtext":
return true
}
return false
}
// Section 12.2.5.5.
var breakout = map[string]bool{
"b": true,
"big": true,
"blockquote": true,
"body": true,
"br": true,
"center": true,
"code": true,
"dd": true,
"div": true,
"dl": true,
"dt": true,
"em": true,
"embed": true,
"h1": true,
"h2": true,
"h3": true,
"h4": true,
"h5": true,
"h6": true,
"head": true,
"hr": true,
"i": true,
"img": true,
"li": true,
"listing": true,
"menu": true,
"meta": true,
"nobr": true,
"ol": true,
"p": true,
"pre": true,
"ruby": true,
"s": true,
"small": true,
"span": true,
"strong": true,
"strike": true,
"sub": true,
"sup": true,
"table": true,
"tt": true,
"u": true,
"ul": true,
"var": true,
}
// Section 12.2.5.5.
var svgTagNameAdjustments = map[string]string{
"altglyph": "altGlyph",
"altglyphdef": "altGlyphDef",
"altglyphitem": "altGlyphItem",
"animatecolor": "animateColor",
"animatemotion": "animateMotion",
"animatetransform": "animateTransform",
"clippath": "clipPath",
"feblend": "feBlend",
"fecolormatrix": "feColorMatrix",
"fecomponenttransfer": "feComponentTransfer",
"fecomposite": "feComposite",
"feconvolvematrix": "feConvolveMatrix",
"fediffuselighting": "feDiffuseLighting",
"fedisplacementmap": "feDisplacementMap",
"fedistantlight": "feDistantLight",
"feflood": "feFlood",
"fefunca": "feFuncA",
"fefuncb": "feFuncB",
"fefuncg": "feFuncG",
"fefuncr": "feFuncR",
"fegaussianblur": "feGaussianBlur",
"feimage": "feImage",
"femerge": "feMerge",
"femergenode": "feMergeNode",
"femorphology": "feMorphology",
"feoffset": "feOffset",
"fepointlight": "fePointLight",
"fespecularlighting": "feSpecularLighting",
"fespotlight": "feSpotLight",
"fetile": "feTile",
"feturbulence": "feTurbulence",
"foreignobject": "foreignObject",
"glyphref": "glyphRef",
"lineargradient": "linearGradient",
"radialgradient": "radialGradient",
"textpath": "textPath",
}
// Section 12.2.5.1
var mathMLAttributeAdjustments = map[string]string{
"definitionurl": "definitionURL",
}
var svgAttributeAdjustments = map[string]string{
"attributename": "attributeName",
"attributetype": "attributeType",
"basefrequency": "baseFrequency",
"baseprofile": "baseProfile",
"calcmode": "calcMode",
"clippathunits": "clipPathUnits",
"contentscripttype": "contentScriptType",
"contentstyletype": "contentStyleType",
"diffuseconstant": "diffuseConstant",
"edgemode": "edgeMode",
"externalresourcesrequired": "externalResourcesRequired",
"filterres": "filterRes",
"filterunits": "filterUnits",
"glyphref": "glyphRef",
"gradienttransform": "gradientTransform",
"gradientunits": "gradientUnits",
"kernelmatrix": "kernelMatrix",
"kernelunitlength": "kernelUnitLength",
"keypoints": "keyPoints",
"keysplines": "keySplines",
"keytimes": "keyTimes",
"lengthadjust": "lengthAdjust",
"limitingconeangle": "limitingConeAngle",
"markerheight": "markerHeight",
"markerunits": "markerUnits",
"markerwidth": "markerWidth",
"maskcontentunits": "maskContentUnits",
"maskunits": "maskUnits",
"numoctaves": "numOctaves",
"pathlength": "pathLength",
"patterncontentunits": "patternContentUnits",
"patterntransform": "patternTransform",
"patternunits": "patternUnits",
"pointsatx": "pointsAtX",
"pointsaty": "pointsAtY",
"pointsatz": "pointsAtZ",
"preservealpha": "preserveAlpha",
"preserveaspectratio": "preserveAspectRatio",
"primitiveunits": "primitiveUnits",
"refx": "refX",
"refy": "refY",
"repeatcount": "repeatCount",
"repeatdur": "repeatDur",
"requiredextensions": "requiredExtensions",
"requiredfeatures": "requiredFeatures",
"specularconstant": "specularConstant",
"specularexponent": "specularExponent",
"spreadmethod": "spreadMethod",
"startoffset": "startOffset",
"stddeviation": "stdDeviation",
"stitchtiles": "stitchTiles",
"surfacescale": "surfaceScale",
"systemlanguage": "systemLanguage",
"tablevalues": "tableValues",
"targetx": "targetX",
"targety": "targetY",
"textlength": "textLength",
"viewbox": "viewBox",
"viewtarget": "viewTarget",
"xchannelselector": "xChannelSelector",
"ychannelselector": "yChannelSelector",
"zoomandpan": "zoomAndPan",
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"golang.org/x/net/html/atom"
)
// A NodeType is the type of a Node.
type NodeType uint32
const (
ErrorNode NodeType = iota
TextNode
DocumentNode
ElementNode
CommentNode
DoctypeNode
scopeMarkerNode
)
// Section 12.2.3.3 says "scope markers are inserted when entering applet
// elements, buttons, object elements, marquees, table cells, and table
// captions, and are used to prevent formatting from 'leaking'".
var scopeMarker = Node{Type: scopeMarkerNode}
// A Node consists of a NodeType and some Data (tag name for element nodes,
// content for text) and are part of a tree of Nodes. Element nodes may also
// have a Namespace and contain a slice of Attributes. Data is unescaped, so
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
// is the atom for Data, or zero if Data is not a known tag name.
//
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
// "svg" is short for "http://www.w3.org/2000/svg".
type Node struct {
Parent, FirstChild, LastChild, PrevSibling, NextSibling *Node
Type NodeType
DataAtom atom.Atom
Data string
Namespace string
Attr []Attribute
}
// InsertBefore inserts newChild as a child of n, immediately before oldChild
// in the sequence of n's children. oldChild may be nil, in which case newChild
// is appended to the end of n's children.
//
// It will panic if newChild already has a parent or siblings.
func (n *Node) InsertBefore(newChild, oldChild *Node) {
if newChild.Parent != nil || newChild.PrevSibling != nil || newChild.NextSibling != nil {
panic("html: InsertBefore called for an attached child Node")
}
var prev, next *Node
if oldChild != nil {
prev, next = oldChild.PrevSibling, oldChild
} else {
prev = n.LastChild
}
if prev != nil {
prev.NextSibling = newChild
} else {
n.FirstChild = newChild
}
if next != nil {
next.PrevSibling = newChild
} else {
n.LastChild = newChild
}
newChild.Parent = n
newChild.PrevSibling = prev
newChild.NextSibling = next
}
// AppendChild adds a node c as a child of n.
//
// It will panic if c already has a parent or siblings.
func (n *Node) AppendChild(c *Node) {
if c.Parent != nil || c.PrevSibling != nil || c.NextSibling != nil {
panic("html: AppendChild called for an attached child Node")
}
last := n.LastChild
if last != nil {
last.NextSibling = c
} else {
n.FirstChild = c
}
n.LastChild = c
c.Parent = n
c.PrevSibling = last
}
// RemoveChild removes a node c that is a child of n. Afterwards, c will have
// no parent and no siblings.
//
// It will panic if c's parent is not n.
func (n *Node) RemoveChild(c *Node) {
if c.Parent != n {
panic("html: RemoveChild called for a non-child Node")
}
if n.FirstChild == c {
n.FirstChild = c.NextSibling
}
if c.NextSibling != nil {
c.NextSibling.PrevSibling = c.PrevSibling
}
if n.LastChild == c {
n.LastChild = c.PrevSibling
}
if c.PrevSibling != nil {
c.PrevSibling.NextSibling = c.NextSibling
}
c.Parent = nil
c.PrevSibling = nil
c.NextSibling = nil
}
// reparentChildren reparents all of src's child nodes to dst.
func reparentChildren(dst, src *Node) {
for {
child := src.FirstChild
if child == nil {
break
}
src.RemoveChild(child)
dst.AppendChild(child)
}
}
// clone returns a new node with the same type, data and attributes.
// The clone has no parent, no siblings and no children.
func (n *Node) clone() *Node {
m := &Node{
Type: n.Type,
DataAtom: n.DataAtom,
Data: n.Data,
Attr: make([]Attribute, len(n.Attr)),
}
copy(m.Attr, n.Attr)
return m
}
// nodeStack is a stack of nodes.
type nodeStack []*Node
// pop pops the stack. It will panic if s is empty.
func (s *nodeStack) pop() *Node {
i := len(*s)
n := (*s)[i-1]
*s = (*s)[:i-1]
return n
}
// top returns the most recently pushed node, or nil if s is empty.
func (s *nodeStack) top() *Node {
if i := len(*s); i > 0 {
return (*s)[i-1]
}
return nil
}
// index returns the index of the top-most occurrence of n in the stack, or -1
// if n is not present.
func (s *nodeStack) index(n *Node) int {
for i := len(*s) - 1; i >= 0; i-- {
if (*s)[i] == n {
return i
}
}
return -1
}
// insert inserts a node at the given index.
func (s *nodeStack) insert(i int, n *Node) {
(*s) = append(*s, nil)
copy((*s)[i+1:], (*s)[i:])
(*s)[i] = n
}
// remove removes a node from the stack. It is a no-op if n is not present.
func (s *nodeStack) remove(n *Node) {
i := s.index(n)
if i == -1 {
return
}
copy((*s)[i:], (*s)[i+1:])
j := len(*s) - 1
(*s)[j] = nil
*s = (*s)[:j]
}
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
"fmt"
)
// checkTreeConsistency checks that a node and its descendants are all
// consistent in their parent/child/sibling relationships.
func checkTreeConsistency(n *Node) error {
return checkTreeConsistency1(n, 0)
}
func checkTreeConsistency1(n *Node, depth int) error {
if depth == 1e4 {
return fmt.Errorf("html: tree looks like it contains a cycle")
}
if err := checkNodeConsistency(n); err != nil {
return err
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
if err := checkTreeConsistency1(c, depth+1); err != nil {
return err
}
}
return nil
}
// checkNodeConsistency checks that a node's parent/child/sibling relationships
// are consistent.
func checkNodeConsistency(n *Node) error {
if n == nil {
return nil
}
nParent := 0
for p := n.Parent; p != nil; p = p.Parent {
nParent++
if nParent == 1e4 {
return fmt.Errorf("html: parent list looks like an infinite loop")
}
}
nForward := 0
for c := n.FirstChild; c != nil; c = c.NextSibling {
nForward++
if nForward == 1e6 {
return fmt.Errorf("html: forward list of children looks like an infinite loop")
}
if c.Parent != n {
return fmt.Errorf("html: inconsistent child/parent relationship")
}
}
nBackward := 0
for c := n.LastChild; c != nil; c = c.PrevSibling {
nBackward++
if nBackward == 1e6 {
return fmt.Errorf("html: backward list of children looks like an infinite loop")
}
if c.Parent != n {
return fmt.Errorf("html: inconsistent child/parent relationship")
}
}
if n.Parent != nil {
if n.Parent == n {
return fmt.Errorf("html: inconsistent parent relationship")
}
if n.Parent == n.FirstChild {
return fmt.Errorf("html: inconsistent parent/first relationship")
}
if n.Parent == n.LastChild {
return fmt.Errorf("html: inconsistent parent/last relationship")
}
if n.Parent == n.PrevSibling {
return fmt.Errorf("html: inconsistent parent/prev relationship")
}
if n.Parent == n.NextSibling {
return fmt.Errorf("html: inconsistent parent/next relationship")
}
parentHasNAsAChild := false
for c := n.Parent.FirstChild; c != nil; c = c.NextSibling {
if c == n {
parentHasNAsAChild = true
break
}
}
if !parentHasNAsAChild {
return fmt.Errorf("html: inconsistent parent/child relationship")
}
}
if n.PrevSibling != nil && n.PrevSibling.NextSibling != n {
return fmt.Errorf("html: inconsistent prev/next relationship")
}
if n.NextSibling != nil && n.NextSibling.PrevSibling != n {
return fmt.Errorf("html: inconsistent next/prev relationship")
}
if (n.FirstChild == nil) != (n.LastChild == nil) {
return fmt.Errorf("html: inconsistent first/last relationship")
}
if n.FirstChild != nil && n.FirstChild == n.LastChild {
// We have a sole child.
if n.FirstChild.PrevSibling != nil || n.FirstChild.NextSibling != nil {
return fmt.Errorf("html: inconsistent sole child's sibling relationship")
}
}
seen := map[*Node]bool{}
var last *Node
for c := n.FirstChild; c != nil; c = c.NextSibling {
if seen[c] {
return fmt.Errorf("html: inconsistent repeated child")
}
seen[c] = true
last = c
}
if last != n.LastChild {
return fmt.Errorf("html: inconsistent last relationship")
}
var first *Node
for c := n.LastChild; c != nil; c = c.PrevSibling {
if !seen[c] {
return fmt.Errorf("html: inconsistent missing child")
}
delete(seen, c)
first = c
}
if first != n.FirstChild {
return fmt.Errorf("html: inconsistent first relationship")
}
if len(seen) != 0 {
return fmt.Errorf("html: inconsistent forwards/backwards child list")
}
return nil
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
The *.dat files in this directory are copied from The WebKit Open Source
Project, specifically $WEBKITROOT/LayoutTests/html5lib/resources.
WebKit is licensed under a BSD style license.
http://webkit.org/coding/bsd-license.html says:
Copyright (C) 2009 Apple Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS "AS IS" AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#data
<p><b id="A"><script>document.getElementById("A").id = "B"</script></p>TEXT</b>
#errors
#document
| <html>
| <head>
| <body>
| <p>
| <b>
| id="B"
| <script>
| "document.getElementById("A").id = "B""
| <b>
| id="A"
| "TEXT"
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment