Skip to content

Commit 2e67dd8

Browse files
pat42smith authored and rsc committed
encoding/xml: expand allowed entity names
Previously, multi-byte characters were not allowed in entity names. Certain single-byte characters, such as '-', were also disallowed.

Fixes #3813.

R=golang-dev, rsc
CC=golang-dev
https://golang.org/cl/6641052
1 parent 5d05c78 commit 2e67dd8

File tree

2 files changed

+154
-89
lines changed

2 files changed

+154
-89
lines changed

src/pkg/encoding/xml/xml.go

Lines changed: 119 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,6 @@ type Decoder struct {
181181
ns map[string]string
182182
err error
183183
line int
184-
tmp [32]byte
185184
}
186185

187186
// NewDecoder creates a new XML parser reading from r.
@@ -877,92 +876,92 @@ Input:
877876
// XML in all its glory allows a document to define and use
878877
// its own character names with <!ENTITY ...> directives.
879878
// Parsers are required to recognize lt, gt, amp, apos, and quot
880-
// even if they have not been declared. That's all we allow.
881-
var i int
882-
var semicolon bool
883-
var valid bool
884-
for i = 0; i < len(d.tmp); i++ {
885-
var ok bool
886-
d.tmp[i], ok = d.getc()
887-
if !ok {
888-
if d.err == io.EOF {
889-
d.err = d.syntaxError("unexpected EOF")
890-
}
879+
// even if they have not been declared.
880+
before := d.buf.Len()
881+
d.buf.WriteByte('&')
882+
var ok bool
883+
var text string
884+
var haveText bool
885+
if b, ok = d.mustgetc(); !ok {
886+
return nil
887+
}
888+
if b == '#' {
889+
d.buf.WriteByte(b)
890+
if b, ok = d.mustgetc(); !ok {
891891
return nil
892892
}
893-
c := d.tmp[i]
894-
if c == ';' {
895-
semicolon = true
896-
valid = i > 0
897-
break
898-
}
899-
if 'a' <= c && c <= 'z' ||
900-
'A' <= c && c <= 'Z' ||
901-
'0' <= c && c <= '9' ||
902-
c == '_' || c == '#' {
903-
continue
904-
}
905-
d.ungetc(c)
906-
break
907-
}
908-
s := string(d.tmp[0:i])
909-
if !valid {
910-
if !d.Strict {
911-
b0, b1 = 0, 0
912-
d.buf.WriteByte('&')
913-
d.buf.Write(d.tmp[0:i])
914-
if semicolon {
915-
d.buf.WriteByte(';')
893+
base := 10
894+
if b == 'x' {
895+
base = 16
896+
d.buf.WriteByte(b)
897+
if b, ok = d.mustgetc(); !ok {
898+
return nil
916899
}
917-
continue Input
918900
}
919-
semi := ";"
920-
if !semicolon {
921-
semi = " (no semicolon)"
901+
start := d.buf.Len()
902+
for '0' <= b && b <= '9' ||
903+
base == 16 && 'a' <= b && b <= 'f' ||
904+
base == 16 && 'A' <= b && b <= 'F' {
905+
d.buf.WriteByte(b)
906+
if b, ok = d.mustgetc(); !ok {
907+
return nil
908+
}
922909
}
923-
if i < len(d.tmp) {
924-
d.err = d.syntaxError("invalid character entity &" + s + semi)
910+
if b != ';' {
911+
d.ungetc(b)
925912
} else {
926-
d.err = d.syntaxError("invalid character entity &" + s + "... too long")
927-
}
928-
return nil
929-
}
930-
var haveText bool
931-
var text string
932-
if i >= 2 && s[0] == '#' {
933-
var n uint64
934-
var err error
935-
if i >= 3 && s[1] == 'x' {
936-
n, err = strconv.ParseUint(s[2:], 16, 64)
937-
} else {
938-
n, err = strconv.ParseUint(s[1:], 10, 64)
939-
}
940-
if err == nil && n <= unicode.MaxRune {
941-
text = string(n)
942-
haveText = true
913+
s := string(d.buf.Bytes()[start:])
914+
d.buf.WriteByte(';')
915+
n, err := strconv.ParseUint(s, base, 64)
916+
if err == nil && n <= unicode.MaxRune {
917+
text = string(n)
918+
haveText = true
919+
}
943920
}
944921
} else {
945-
if r, ok := entity[s]; ok {
946-
text = string(r)
947-
haveText = true
948-
} else if d.Entity != nil {
949-
text, haveText = d.Entity[s]
922+
d.ungetc(b)
923+
if !d.readName() {
924+
if d.err != nil {
925+
return nil
926+
}
927+
ok = false
950928
}
951-
}
952-
if !haveText {
953-
if !d.Strict {
954-
b0, b1 = 0, 0
955-
d.buf.WriteByte('&')
956-
d.buf.Write(d.tmp[0:i])
929+
if b, ok = d.mustgetc(); !ok {
930+
return nil
931+
}
932+
if b != ';' {
933+
d.ungetc(b)
934+
} else {
935+
name := d.buf.Bytes()[before+1:]
957936
d.buf.WriteByte(';')
958-
continue Input
937+
if isName(name) {
938+
s := string(name)
939+
if r, ok := entity[s]; ok {
940+
text = string(r)
941+
haveText = true
942+
} else if d.Entity != nil {
943+
text, haveText = d.Entity[s]
944+
}
945+
}
959946
}
960-
d.err = d.syntaxError("invalid character entity &" + s + ";")
961-
return nil
962947
}
963-
d.buf.Write([]byte(text))
964-
b0, b1 = 0, 0
965-
continue Input
948+
949+
if haveText {
950+
d.buf.Truncate(before)
951+
d.buf.Write([]byte(text))
952+
b0, b1 = 0, 0
953+
continue Input
954+
}
955+
if !d.Strict {
956+
b0, b1 = 0, 0
957+
continue Input
958+
}
959+
ent := string(d.buf.Bytes()[before])
960+
if ent[len(ent)-1] != ';' {
961+
ent += " (no semicolon)"
962+
}
963+
d.err = d.syntaxError("invalid character entity " + ent)
964+
return nil
966965
}
967966

968967
// We must rewrite unescaped \r and \r\n into \n.
@@ -1030,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) {
10301029
// Do not set d.err if the name is missing (unless unexpected EOF is received):
10311030
// let the caller provide better context.
10321031
func (d *Decoder) name() (s string, ok bool) {
1032+
// Start from an empty buffer: readName appends the raw name bytes to d.buf.
d.buf.Reset()
1033+
// readName returns false when no name could be read; pass that on
// without touching d.err here so the caller can decide what to report.
if !d.readName() {
1034+
return "", false
1035+
}
1036+
1037+
// readName accepts every multi-byte character unchecked, so the gathered
// bytes must still be validated as a well-formed XML name.
1038+
s = d.buf.String()
1039+
if !isName([]byte(s)) {
1040+
// Bytes were read but do not form a valid name: record a syntax error.
d.err = d.syntaxError("invalid XML name: " + s)
1041+
return "", false
1042+
}
1043+
return s, true
1044+
}
1045+
1046+
// Read a name and append its bytes to d.buf.
1047+
// The name is delimited by any single-byte character not valid in names.
1048+
// All multi-byte characters are accepted; the caller must check their validity.
1049+
func (d *Decoder) readName() (ok bool) {
10331050
var b byte
10341051
if b, ok = d.mustgetc(); !ok {
10351052
return
10361053
}
1037-
1038-
// As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*
10391054
if b < utf8.RuneSelf && !isNameByte(b) {
10401055
d.ungetc(b)
1041-
return "", false
1056+
return false
10421057
}
1043-
d.buf.Reset()
10441058
d.buf.WriteByte(b)
1059+
10451060
for {
10461061
if b, ok = d.mustgetc(); !ok {
10471062
return
@@ -1052,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) {
10521067
}
10531068
d.buf.WriteByte(b)
10541069
}
1055-
1056-
// Then we check the characters.
1057-
s = d.buf.String()
1058-
for i, c := range s {
1059-
if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) {
1060-
d.err = d.syntaxError("invalid XML name: " + s)
1061-
return "", false
1062-
}
1063-
}
1064-
return s, true
1070+
return true
10651071
}
10661072

10671073
func isNameByte(c byte) bool {
@@ -1071,6 +1077,30 @@ func isNameByte(c byte) bool {
10711077
c == '_' || c == ':' || c == '.' || c == '-'
10721078
}
10731079

1080+
// isName reports whether s is a non-empty, validly UTF-8 encoded name:
// its first rune must be in the first table, and every following rune
// must be in either the first or the second table.
func isName(s []byte) bool {
1081+
// An empty name is never valid.
if len(s) == 0 {
1082+
return false
1083+
}
1084+
c, n := utf8.DecodeRune(s)
1085+
// RuneError with width 1 signals an invalid encoding, as opposed to a
// correctly encoded U+FFFD, which is wider than one byte.
if c == utf8.RuneError && n == 1 {
1086+
return false
1087+
}
1088+
// The leading rune is held to the stricter rule: first table only.
if !unicode.Is(first, c) {
1089+
return false
1090+
}
1091+
// n is the width of the rune just examined; keep going while bytes
// remain after it.
for n < len(s) {
1092+
// Drop the rune examined on the previous step, then decode the next one.
s = s[n:]
1093+
c, n = utf8.DecodeRune(s)
1094+
if c == utf8.RuneError && n == 1 {
1095+
return false
1096+
}
1097+
// Non-leading runes may come from either table.
if !unicode.Is(first, c) && !unicode.Is(second, c) {
1098+
return false
1099+
}
1100+
}
1101+
return true
1102+
}
1103+
10741104
// These tables were generated by cut and paste from Appendix B of
10751105
// the XML spec at http://www.xml.com/axml/testaxml.htm
10761106
// and then reformatting. First corresponds to (Letter | '_' | ':')

src/pkg/encoding/xml/xml_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ const testInput = `
1919
<body xmlns:foo="ns1" xmlns="ns2" xmlns:tag="ns3" ` +
2020
"\r\n\t" + ` >
2121
<hello lang="en">World &lt;&gt;&apos;&quot; &#x767d;&#40300;翔</hello>
22+
<query>&何; &is-it;</query>
2223
<goodbye />
2324
<outer foo:attr="value" xmlns:tag="ns4">
2425
<inner/>
@@ -28,6 +29,8 @@ const testInput = `
2829
</tag:name>
2930
</body><!-- missing final newline -->`
3031

32+
var testEntity = map[string]string{"何": "What", "is-it": "is it?"}
33+
3134
var rawTokens = []Token{
3235
CharData("\n"),
3336
ProcInst{"xml", []byte(`version="1.0" encoding="UTF-8"`)},
@@ -41,6 +44,10 @@ var rawTokens = []Token{
4144
CharData("World <>'\" 白鵬翔"),
4245
EndElement{Name{"", "hello"}},
4346
CharData("\n "),
47+
StartElement{Name{"", "query"}, []Attr{}},
48+
CharData("What is it?"),
49+
EndElement{Name{"", "query"}},
50+
CharData("\n "),
4451
StartElement{Name{"", "goodbye"}, []Attr{}},
4552
EndElement{Name{"", "goodbye"}},
4653
CharData("\n "),
@@ -74,6 +81,10 @@ var cookedTokens = []Token{
7481
CharData("World <>'\" 白鵬翔"),
7582
EndElement{Name{"ns2", "hello"}},
7683
CharData("\n "),
84+
StartElement{Name{"ns2", "query"}, []Attr{}},
85+
CharData("What is it?"),
86+
EndElement{Name{"ns2", "query"}},
87+
CharData("\n "),
7788
StartElement{Name{"ns2", "goodbye"}, []Attr{}},
7889
EndElement{Name{"ns2", "goodbye"}},
7990
CharData("\n "),
@@ -156,6 +167,7 @@ var xmlInput = []string{
156167

157168
// TestRawToken runs testRawToken over testInput, checking against rawTokens.
func TestRawToken(t *testing.T) {
158169
d := NewDecoder(strings.NewReader(testInput))
170+
// Install the custom entity map so that the &何; and &is-it; references
// in testInput can be expanded to their replacement text.
d.Entity = testEntity
159171
testRawToken(t, d, rawTokens)
160172
}
161173

@@ -164,8 +176,14 @@ const nonStrictInput = `
164176
<tag>&unknown;entity</tag>
165177
<tag>&#123</tag>
166178
<tag>&#zzz;</tag>
179+
<tag>&なまえ3;</tag>
180+
<tag>&lt-gt;</tag>
181+
<tag>&;</tag>
182+
<tag>&0a;</tag>
167183
`
168184

185+
var nonStringEntity = map[string]string{"": "oops!", "0a": "oops!"}
186+
169187
var nonStrictTokens = []Token{
170188
CharData("\n"),
171189
StartElement{Name{"", "tag"}, []Attr{}},
@@ -184,6 +202,22 @@ var nonStrictTokens = []Token{
184202
CharData("&#zzz;"),
185203
EndElement{Name{"", "tag"}},
186204
CharData("\n"),
205+
StartElement{Name{"", "tag"}, []Attr{}},
206+
CharData("&なまえ3;"),
207+
EndElement{Name{"", "tag"}},
208+
CharData("\n"),
209+
StartElement{Name{"", "tag"}, []Attr{}},
210+
CharData("&lt-gt;"),
211+
EndElement{Name{"", "tag"}},
212+
CharData("\n"),
213+
StartElement{Name{"", "tag"}, []Attr{}},
214+
CharData("&;"),
215+
EndElement{Name{"", "tag"}},
216+
CharData("\n"),
217+
StartElement{Name{"", "tag"}, []Attr{}},
218+
CharData("&0a;"),
219+
EndElement{Name{"", "tag"}},
220+
CharData("\n"),
187221
}
188222

189223
func TestNonStrictRawToken(t *testing.T) {
@@ -317,6 +351,7 @@ func TestNestedDirectives(t *testing.T) {
317351

318352
func TestToken(t *testing.T) {
319353
d := NewDecoder(strings.NewReader(testInput))
354+
d.Entity = testEntity
320355

321356
for i, want := range cookedTokens {
322357
have, err := d.Token()

0 commit comments

Comments
 (0)