@@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
1827
1827
return node ;
1828
1828
}
1829
1829
1830
- /* Check meta charset
1831
- 1. if there is no meta charset, it adds one.
1832
- 2. if there is a meta charset, it moves it to the top if HEAD.
1833
- 3. if it doesn't match the output encoding, warn about that.
1834
- 4. if there are duplicates, discard them.
1835
- */
1830
+ /*\
1831
+ * Issue #456 - Check meta charset
1832
+ * 1. if there is no meta charset, it adds one, according to doctype, no warning.
1833
+ * 2. if there is a meta charset, it moves it to the top of HEAD. Not sure this is required?
1834
+ * 3. if it doesn't match the output encoding, fix it. Maybe no warning?
1835
+ * 4. if there are duplicates, discard them, with warning.
1836
+ \*/
1836
1837
Bool TY_ (TidyMetaCharset )(TidyDocImpl * doc )
1837
1838
{
1838
1839
AttVal * charsetAttr ;
1839
1840
AttVal * contentAttr ;
1840
1841
AttVal * httpEquivAttr ;
1841
1842
Bool charsetFound = no ;
1842
- ctmbstr enc = TY_ (GetEncodingNameFromTidyId )(cfg (doc , TidyOutCharEncoding ));
1843
+ uint outenc = cfg (doc , TidyOutCharEncoding );
1844
+ ctmbstr enc = TY_ (GetEncodingNameFromTidyId )(outenc );
1843
1845
Node * currentNode ;
1844
1846
Node * head = TY_ (FindHEAD )( doc );
1845
1847
Node * metaTag ;
@@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
1850
1852
tmbstr lcontent ;
1851
1853
tmbstr newValue ;
1852
1854
/* We can't do anything if we don't have a head or the encoding is NULL */
1853
- if ( !head || !enc )
1855
+ if ( !head || !enc || !TY_ (tmbstrlen )(enc ))
1856
+ return no ;
1857
+ if (outenc == RAW )
1854
1858
return no ;
1859
+ #ifndef NO_NATIVE_ISO2022_SUPPORT
1860
+ if (outenc == ISO2022 )
1861
+ return no ;
1862
+ #endif
1863
+
1855
1864
tidyBufInit (& charsetString );
1865
+ /* Set up the content test 'charset=value' */
1866
+ tidyBufClear (& charsetString );
1867
+ tidyBufAppend (& charsetString , "charset=" , 8 );
1868
+ tidyBufAppend (& charsetString , (char * )enc , TY_ (tmbstrlen )(enc ));
1869
+ tidyBufAppend (& charsetString , "\0" , 1 ); /* zero terminate the buffer */
1870
+ /* process the children of the head */
1856
1871
for (currentNode = head -> content ; currentNode ; currentNode = currentNode -> next )
1857
1872
{
1858
1873
if (!nodeIsMETA (currentNode ))
1859
- continue ;
1874
+ continue ; /* not a meta node */
1860
1875
charsetAttr = attrGetCHARSET (currentNode );
1861
1876
httpEquivAttr = attrGetHTTP_EQUIV (currentNode );
1862
1877
if (!charsetAttr && !httpEquivAttr )
1863
- continue ;
1878
+ continue ; /* has no charset attribute */
1864
1879
/*
1865
1880
Meta charset comes in quite a few flavors:
1866
- 1. <meta charset=value> - expected for (X)HTML5.
1881
+ 1. <meta charset=" value" > - expected for (X)HTML5.
1867
1882
*/
1868
1883
if (charsetAttr && !httpEquivAttr )
1869
1884
{
1870
- // we already found one, so remove the rest.
1871
- if (charsetFound )
1885
+ /* we already found one, so remove the rest. */
1886
+ if (charsetFound || ! charsetAttr -> value )
1872
1887
{
1873
1888
prevNode = currentNode -> prev ;
1874
1889
TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
@@ -1877,67 +1892,130 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
1877
1892
continue ;
1878
1893
}
1879
1894
charsetFound = yes ;
1880
- // Fix mismatched attribute value
1895
+ /* Fix mismatched attribute value */
1881
1896
if (TY_ (tmbstrcmp )(TY_ (tmbstrtolower )(charsetAttr -> value ), enc ) != 0 )
1882
1897
{
1883
- newValue = (tmbstr ) TidyDocAlloc ( doc , TY_ (tmbstrlen )(enc ) );
1898
+ newValue = (tmbstr ) TidyDocAlloc ( doc , TY_ (tmbstrlen )(enc ) + 1 ); /* allocate + 1 for 0 */
1884
1899
TY_ (tmbstrcpy )( newValue , enc );
1885
- TY_ (ReportAttrError )( doc , currentNode , charsetAttr , BAD_ATTRIBUTE_VALUE_REPLACED );
1900
+ /* Note: previously http-equiv had been modified, without warning
1901
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
1902
+ TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
1903
+ */
1904
+ TidyDocFree (doc , charsetAttr -> value ); /* free current value */
1886
1905
charsetAttr -> value = newValue ;
1887
1906
}
1888
- // Make sure it's the first element.
1907
+ /* Make sure it's the first element. */
1889
1908
if ( currentNode != head -> content -> next ){
1890
1909
TY_ (RemoveNode )( currentNode );
1891
1910
TY_ (InsertNodeAtStart )( head , currentNode );
1892
1911
}
1893
1912
continue ;
1894
1913
}
1895
1914
/*
1896
- 2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
1897
- expected for HTML4. This is normally ok - but can clash.
1915
+ 2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
1916
+ expected for HTML4. This is normally ok - but can clash.
1898
1917
*/
1899
1918
if (httpEquivAttr && !charsetAttr )
1900
1919
{
1901
- tidyBufClear (& charsetString );
1902
- tidyBufAppend (& charsetString , "charset=" , 8 );
1903
- tidyBufAppend (& charsetString , (char * )enc , TY_ (tmbstrlen )( enc ));
1904
1920
contentAttr = TY_ (AttrGetById )(currentNode , TidyAttr_CONTENT );
1921
+ if (!contentAttr )
1922
+ continue ; /* has no 'content' attribute */
1923
+ if (!httpEquivAttr -> value )
1924
+ {
1925
+ prevNode = currentNode -> prev ;
1926
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
1927
+ TY_ (DiscardElement )(doc , currentNode );
1928
+ currentNode = prevNode ;
1929
+ continue ;
1930
+ }
1905
1931
httpEquivAttrValue = TY_ (tmbstrtolower )(httpEquivAttr -> value );
1906
-
1907
- if (!contentAttr || TY_ (tmbstrcmp )(httpEquivAttr -> value , (tmbstr ) "content-type" ) != 0 )
1932
+ if (TY_ (tmbstrcmp )(httpEquivAttr -> value , (tmbstr ) "content-type" ) != 0 )
1933
+ continue ; /* is not 'content-type' */
1934
+ if (!contentAttr -> value )
1935
+ {
1936
+ prevNode = currentNode -> prev ;
1937
+ /* maybe need better message here */
1938
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
1939
+ TY_ (DiscardElement )(doc , currentNode );
1940
+ currentNode = prevNode ;
1908
1941
continue ;
1942
+ }
1943
+ /* check encoding matches
1944
+ If a mismatch is found here, fix it. Previously this was silently done
1945
+ in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
1946
+ */
1909
1947
lcontent = TY_ (tmbstrtolower )(contentAttr -> value );
1910
- if (TY_ (tmbsubstr )(lcontent , (ctmbstr ) & charsetString )){
1911
- printf ("WARN ABOUT CLASH: %s \n" , contentAttr -> value );
1948
+ if (TY_ (tmbsubstr )(lcontent , charsetString .bp ))
1949
+ {
1950
+ /* we already found one, so remove the rest. */
1951
+ if (charsetFound )
1952
+ {
1953
+ prevNode = currentNode -> prev ;
1954
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
1955
+ TY_ (DiscardElement )(doc , currentNode );
1956
+ currentNode = prevNode ;
1957
+ continue ;
1958
+ }
1959
+ charsetFound = yes ;
1960
+ }
1961
+ else
1962
+ {
1963
+ /* fix a mismatch */
1964
+ if (charsetFound )
1965
+ {
1966
+ prevNode = currentNode -> prev ;
1967
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
1968
+ TY_ (DiscardElement )(doc , currentNode );
1969
+ currentNode = prevNode ;
1970
+ }
1971
+ else
1972
+ {
1973
+ /* correct the content */
1974
+ newValue = (tmbstr )TidyDocAlloc (doc , 19 + TY_ (tmbstrlen )(enc ) + 1 );
1975
+ TidyDocFree (doc , contentAttr -> value );
1976
+ TY_ (tmbstrcpy )(newValue , "text/html; charset=" );
1977
+ TY_ (tmbstrcpy )(newValue + 19 , enc );
1978
+ contentAttr -> value = newValue ;
1979
+ charsetFound = yes ;
1980
+ }
1912
1981
}
1982
+ continue ;
1913
1983
}
1914
1984
/*
1915
- 3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
1916
- This is generally bad.
1985
+ 3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
1986
+ This is generally bad. Discard and warn .
1917
1987
*/
1918
1988
if (httpEquivAttr && charsetAttr )
1919
1989
{
1920
- printf ("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n" );
1990
+ /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
1991
+ prevNode = currentNode -> prev ;
1992
+ TY_ (ReportError )(doc , head , currentNode , DISCARDING_UNEXPECTED );
1993
+ TY_ (DiscardElement )(doc , currentNode );
1994
+ currentNode = prevNode ;
1921
1995
}
1922
1996
}
1923
- if (charsetFound ){
1924
- return yes ;
1925
- }
1926
- metaTag = TY_ (InferredTag )(doc , TidyTag_META );
1927
- switch (TY_ (HTMLVersion )(doc ))
1997
+
1998
+ /* completed head scan - add appropriate meta - if 'yes' and none exists */
1999
+ if (cfgBool (doc , TidyMetaCharset ) && !charsetFound )
1928
2000
{
2001
+ /* add appropriate meta charset tag - no warning */
2002
+ metaTag = TY_ (InferredTag )(doc , TidyTag_META );
2003
+ switch (TY_ (HTMLVersion )(doc ))
2004
+ {
1929
2005
case HT50 :
1930
2006
case XH50 :
1931
- TY_ (AddAttribute )( doc , metaTag , "charset" , enc );
2007
+ TY_ (AddAttribute )(doc , metaTag , "charset" , enc );
1932
2008
break ;
1933
2009
default :
1934
2010
tidyBufInit (& buf );
1935
- tidyBufAppend (& buf , "text/html; charset=" , 19 );
1936
- tidyBufAppend (& buf , (char * )enc , TY_ (tmbstrlen )(enc ));
1937
- TY_ (AddAttribute )( doc , metaTag , "content" , (char * )buf .bp );
2011
+ tidyBufAppend (& buf , "text/html; " , 11 );
2012
+ tidyBufAppend (& buf , charsetString .bp , TY_ (tmbstrlen )(charsetString .bp ));
2013
+ tidyBufAppend (& buf , "\0" , 1 ); /* zero terminate the buffer */
2014
+ TY_ (AddAttribute )(doc , metaTag , "content" , (char * )buf .bp );
1938
2015
tidyBufFree (& buf );
2016
+ }
2017
+ TY_ (InsertNodeAtStart )(head , metaTag );
1939
2018
}
1940
- TY_ (InsertNodeAtStart )( head , metaTag );
1941
2019
tidyBufFree (& charsetString );
1942
2020
return yes ;
1943
2021
}
0 commit comments