@@ -181,7 +181,6 @@ type Decoder struct {
181
181
ns map [string ]string
182
182
err error
183
183
line int
184
- tmp [32 ]byte
185
184
}
186
185
187
186
// NewDecoder creates a new XML parser reading from r.
@@ -877,92 +876,92 @@ Input:
877
876
// XML in all its glory allows a document to define and use
878
877
// its own character names with <!ENTITY ...> directives.
879
878
// Parsers are required to recognize lt, gt, amp, apos, and quot
880
- // even if they have not been declared. That's all we allow.
881
- var i int
882
- var semicolon bool
883
- var valid bool
884
- for i = 0 ; i < len (d .tmp ); i ++ {
885
- var ok bool
886
- d .tmp [i ], ok = d .getc ()
887
- if ! ok {
888
- if d .err == io .EOF {
889
- d .err = d .syntaxError ("unexpected EOF" )
890
- }
879
+ // even if they have not been declared.
880
+ before := d .buf .Len ()
881
+ d .buf .WriteByte ('&' )
882
+ var ok bool
883
+ var text string
884
+ var haveText bool
885
+ if b , ok = d .mustgetc (); ! ok {
886
+ return nil
887
+ }
888
+ if b == '#' {
889
+ d .buf .WriteByte (b )
890
+ if b , ok = d .mustgetc (); ! ok {
891
891
return nil
892
892
}
893
- c := d .tmp [i ]
894
- if c == ';' {
895
- semicolon = true
896
- valid = i > 0
897
- break
898
- }
899
- if 'a' <= c && c <= 'z' ||
900
- 'A' <= c && c <= 'Z' ||
901
- '0' <= c && c <= '9' ||
902
- c == '_' || c == '#' {
903
- continue
904
- }
905
- d .ungetc (c )
906
- break
907
- }
908
- s := string (d .tmp [0 :i ])
909
- if ! valid {
910
- if ! d .Strict {
911
- b0 , b1 = 0 , 0
912
- d .buf .WriteByte ('&' )
913
- d .buf .Write (d .tmp [0 :i ])
914
- if semicolon {
915
- d .buf .WriteByte (';' )
893
+ base := 10
894
+ if b == 'x' {
895
+ base = 16
896
+ d .buf .WriteByte (b )
897
+ if b , ok = d .mustgetc (); ! ok {
898
+ return nil
916
899
}
917
- continue Input
918
900
}
919
- semi := ";"
920
- if ! semicolon {
921
- semi = " (no semicolon)"
901
+ start := d .buf .Len ()
902
+ for '0' <= b && b <= '9' ||
903
+ base == 16 && 'a' <= b && b <= 'f' ||
904
+ base == 16 && 'A' <= b && b <= 'F' {
905
+ d .buf .WriteByte (b )
906
+ if b , ok = d .mustgetc (); ! ok {
907
+ return nil
908
+ }
922
909
}
923
- if i < len ( d . tmp ) {
924
- d .err = d . syntaxError ( "invalid character entity &" + s + semi )
910
+ if b != ';' {
911
+ d .ungetc ( b )
925
912
} else {
926
- d .err = d .syntaxError ("invalid character entity &" + s + "... too long" )
927
- }
928
- return nil
929
- }
930
- var haveText bool
931
- var text string
932
- if i >= 2 && s [0 ] == '#' {
933
- var n uint64
934
- var err error
935
- if i >= 3 && s [1 ] == 'x' {
936
- n , err = strconv .ParseUint (s [2 :], 16 , 64 )
937
- } else {
938
- n , err = strconv .ParseUint (s [1 :], 10 , 64 )
939
- }
940
- if err == nil && n <= unicode .MaxRune {
941
- text = string (n )
942
- haveText = true
913
+ s := string (d .buf .Bytes ()[start :])
914
+ d .buf .WriteByte (';' )
915
+ n , err := strconv .ParseUint (s , base , 64 )
916
+ if err == nil && n <= unicode .MaxRune {
917
+ text = string (n )
918
+ haveText = true
919
+ }
943
920
}
944
921
} else {
945
- if r , ok := entity [s ]; ok {
946
- text = string (r )
947
- haveText = true
948
- } else if d .Entity != nil {
949
- text , haveText = d .Entity [s ]
922
+ d .ungetc (b )
923
+ if ! d .readName () {
924
+ if d .err != nil {
925
+ return nil
926
+ }
927
+ ok = false
950
928
}
951
- }
952
- if ! haveText {
953
- if ! d .Strict {
954
- b0 , b1 = 0 , 0
955
- d .buf .WriteByte ('&' )
956
- d .buf .Write (d .tmp [0 :i ])
929
+ if b , ok = d .mustgetc (); ! ok {
930
+ return nil
931
+ }
932
+ if b != ';' {
933
+ d .ungetc (b )
934
+ } else {
935
+ name := d .buf .Bytes ()[before + 1 :]
957
936
d .buf .WriteByte (';' )
958
- continue Input
937
+ if isName (name ) {
938
+ s := string (name )
939
+ if r , ok := entity [s ]; ok {
940
+ text = string (r )
941
+ haveText = true
942
+ } else if d .Entity != nil {
943
+ text , haveText = d .Entity [s ]
944
+ }
945
+ }
959
946
}
960
- d .err = d .syntaxError ("invalid character entity &" + s + ";" )
961
- return nil
962
947
}
963
- d .buf .Write ([]byte (text ))
964
- b0 , b1 = 0 , 0
965
- continue Input
948
+
949
+ if haveText {
950
+ d .buf .Truncate (before )
951
+ d .buf .Write ([]byte (text ))
952
+ b0 , b1 = 0 , 0
953
+ continue Input
954
+ }
955
+ if ! d .Strict {
956
+ b0 , b1 = 0 , 0
957
+ continue Input
958
+ }
959
+ ent := string (d .buf .Bytes ()[before ])
960
+ if ent [len (ent )- 1 ] != ';' {
961
+ ent += " (no semicolon)"
962
+ }
963
+ d .err = d .syntaxError ("invalid character entity " + ent )
964
+ return nil
966
965
}
967
966
968
967
// We must rewrite unescaped \r and \r\n into \n.
@@ -1030,18 +1029,34 @@ func (d *Decoder) nsname() (name Name, ok bool) {
1030
1029
// Do not set d.err if the name is missing (unless unexpected EOF is received):
1031
1030
// let the caller provide better context.
1032
1031
func (d * Decoder ) name () (s string , ok bool ) {
1032
+ d .buf .Reset ()
1033
+ if ! d .readName () {
1034
+ return "" , false
1035
+ }
1036
+
1037
+ // Now we check the characters.
1038
+ s = d .buf .String ()
1039
+ if ! isName ([]byte (s )) {
1040
+ d .err = d .syntaxError ("invalid XML name: " + s )
1041
+ return "" , false
1042
+ }
1043
+ return s , true
1044
+ }
1045
+
1046
+ // Read a name and append its bytes to d.buf.
1047
+ // The name is delimited by any single-byte character not valid in names.
1048
+ // All multi-byte characters are accepted; the caller must check their validity.
1049
+ func (d * Decoder ) readName () (ok bool ) {
1033
1050
var b byte
1034
1051
if b , ok = d .mustgetc (); ! ok {
1035
1052
return
1036
1053
}
1037
-
1038
- // As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*
1039
1054
if b < utf8 .RuneSelf && ! isNameByte (b ) {
1040
1055
d .ungetc (b )
1041
- return "" , false
1056
+ return false
1042
1057
}
1043
- d .buf .Reset ()
1044
1058
d .buf .WriteByte (b )
1059
+
1045
1060
for {
1046
1061
if b , ok = d .mustgetc (); ! ok {
1047
1062
return
@@ -1052,16 +1067,7 @@ func (d *Decoder) name() (s string, ok bool) {
1052
1067
}
1053
1068
d .buf .WriteByte (b )
1054
1069
}
1055
-
1056
- // Then we check the characters.
1057
- s = d .buf .String ()
1058
- for i , c := range s {
1059
- if ! unicode .Is (first , c ) && (i == 0 || ! unicode .Is (second , c )) {
1060
- d .err = d .syntaxError ("invalid XML name: " + s )
1061
- return "" , false
1062
- }
1063
- }
1064
- return s , true
1070
+ return true
1065
1071
}
1066
1072
1067
1073
func isNameByte (c byte ) bool {
@@ -1071,6 +1077,30 @@ func isNameByte(c byte) bool {
1071
1077
c == '_' || c == ':' || c == '.' || c == '-'
1072
1078
}
1073
1079
1080
+ func isName (s []byte ) bool {
1081
+ if len (s ) == 0 {
1082
+ return false
1083
+ }
1084
+ c , n := utf8 .DecodeRune (s )
1085
+ if c == utf8 .RuneError && n == 1 {
1086
+ return false
1087
+ }
1088
+ if ! unicode .Is (first , c ) {
1089
+ return false
1090
+ }
1091
+ for n < len (s ) {
1092
+ s = s [n :]
1093
+ c , n = utf8 .DecodeRune (s )
1094
+ if c == utf8 .RuneError && n == 1 {
1095
+ return false
1096
+ }
1097
+ if ! unicode .Is (first , c ) && ! unicode .Is (second , c ) {
1098
+ return false
1099
+ }
1100
+ }
1101
+ return true
1102
+ }
1103
+
1074
1104
// These tables were generated by cut and paste from Appendix B of
1075
1105
// the XML spec at http://www.xml.com/axml/testaxml.htm
1076
1106
// and then reformatting. First corresponds to (Letter | '_' | ':')
0 commit comments