Skip to content

Commit 6ebd12b

Browse files
committed
Issue #456 - More work on this option
1 parent 8843199 commit 6ebd12b

File tree

4 files changed

+124
-49
lines changed

4 files changed

+124
-49
lines changed

src/clean.c

+3
Original file line numberDiff line numberDiff line change
@@ -2208,6 +2208,8 @@ void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
22082208
}
22092209
#endif
22102210

2211+
/* Issue #456 - This is discarded */
2212+
#if 0
22112213
void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
22122214
{
22132215
Node *pNode;
@@ -2283,6 +2285,7 @@ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
22832285
pLastProp = NULL;
22842286
}
22852287
}
2288+
#endif
22862289

22872290
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
22882291
{

src/clean.h

+3-1
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,10 @@ void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
6363
#if 0
6464
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
6565
#endif
66-
66+
/* Issue #456 - This is discarded */
67+
#if 0
6768
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
69+
#endif
6870

6971
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
7072
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);

src/lexer.c

+117-39
Original file line numberDiff line numberDiff line change
@@ -1827,19 +1827,21 @@ Node *TY_(FindBody)( TidyDocImpl* doc )
18271827
return node;
18281828
}
18291829

1830-
/* Check meta charset
1831-
1. if there is no meta charset, it adds one.
1832-
2. if there is a meta charset, it moves it to the top if HEAD.
1833-
3. if it doesn't match the output encoding, warn about that.
1834-
4. if there are duplicates, discard them.
1835-
*/
1830+
/*\
1831+
* Issue #456 - Check meta charset
1832+
* 1. if there is no meta charset, it adds one, according to doctype, no warning.
1833+
* 2. if there is a meta charset, it moves it to the top of HEAD. Not sure this is required?
1834+
* 3. if it doesn't match the output encoding, fix it. Maybe no warning?
1835+
* 4. if there are duplicates, discard them, with warning.
1836+
\*/
18361837
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
18371838
{
18381839
AttVal *charsetAttr;
18391840
AttVal *contentAttr;
18401841
AttVal *httpEquivAttr;
18411842
Bool charsetFound = no;
1842-
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
1843+
uint outenc = cfg(doc, TidyOutCharEncoding);
1844+
ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
18431845
Node *currentNode;
18441846
Node *head = TY_(FindHEAD)( doc );
18451847
Node *metaTag;
@@ -1850,25 +1852,38 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
18501852
tmbstr lcontent;
18511853
tmbstr newValue;
18521854
/* We can't do anything if we don't have a head or the encoding is NULL */
1853-
if( !head || !enc )
1855+
if( !head || !enc || !TY_(tmbstrlen)(enc))
1856+
return no;
1857+
if (outenc == RAW)
18541858
return no;
1859+
#ifndef NO_NATIVE_ISO2022_SUPPORT
1860+
if (outenc == ISO2022)
1861+
return no;
1862+
#endif
1863+
18551864
tidyBufInit(&charsetString);
1865+
/* Set up the content test 'charset=value' */
1866+
tidyBufClear(&charsetString);
1867+
tidyBufAppend(&charsetString, "charset=", 8);
1868+
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
1869+
tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
1870+
/* process the children of the head */
18561871
for (currentNode = head->content; currentNode; currentNode = currentNode->next)
18571872
{
18581873
if (!nodeIsMETA(currentNode))
1859-
continue;
1874+
continue; /* not a meta node */
18601875
charsetAttr = attrGetCHARSET(currentNode);
18611876
httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
18621877
if(!charsetAttr && !httpEquivAttr)
1863-
continue;
1878+
continue; /* has no charset attribute */
18641879
/*
18651880
Meta charset comes in quite a few flavors:
1866-
1. <meta charset=value> - expected for (X)HTML5.
1881+
1. <meta charset="value"> - expected for (X)HTML5.
18671882
*/
18681883
if (charsetAttr && !httpEquivAttr)
18691884
{
1870-
// we already found one, so remove the rest.
1871-
if(charsetFound)
1885+
/* we already found one, so remove the rest. */
1886+
if(charsetFound || !charsetAttr->value)
18721887
{
18731888
prevNode = currentNode->prev;
18741889
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
@@ -1877,67 +1892,130 @@ Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
18771892
continue;
18781893
}
18791894
charsetFound = yes;
1880-
// Fix mismatched attribute value
1895+
/* Fix mismatched attribute value */
18811896
if(TY_(tmbstrcmp)(TY_(tmbstrtolower)(charsetAttr->value), enc) != 0)
18821897
{
1883-
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) );
1898+
newValue = (tmbstr) TidyDocAlloc( doc, TY_(tmbstrlen)(enc) + 1 ); /* allocate + 1 for 0 */
18841899
TY_(tmbstrcpy)( newValue, enc );
1885-
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
1900+
/* Note: previously http-equiv had been modified, without warning
1901+
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
1902+
TY_(ReportAttrError)( doc, currentNode, charsetAttr, BAD_ATTRIBUTE_VALUE_REPLACED );
1903+
*/
1904+
TidyDocFree(doc, charsetAttr->value); /* free current value */
18861905
charsetAttr->value = newValue;
18871906
}
1888-
// Make sure it's the first element.
1907+
/* Make sure it's the first element. */
18891908
if ( currentNode != head->content->next ){
18901909
TY_(RemoveNode)( currentNode );
18911910
TY_(InsertNodeAtStart)( head, currentNode );
18921911
}
18931912
continue;
18941913
}
18951914
/*
1896-
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
1897-
expected for HTML4. This is normally ok - but can clash.
1915+
2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
1916+
expected for HTML4. This is normally ok - but can clash.
18981917
*/
18991918
if(httpEquivAttr && !charsetAttr)
19001919
{
1901-
tidyBufClear(&charsetString);
1902-
tidyBufAppend(&charsetString, "charset=", 8);
1903-
tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)( enc ));
19041920
contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
1921+
if (!contentAttr)
1922+
continue; /* has no 'content' attribute */
1923+
if (!httpEquivAttr->value)
1924+
{
1925+
prevNode = currentNode->prev;
1926+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
1927+
TY_(DiscardElement)(doc, currentNode);
1928+
currentNode = prevNode;
1929+
continue;
1930+
}
19051931
httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value);
1906-
1907-
if(!contentAttr || TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
1932+
if(TY_(tmbstrcmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
1933+
continue; /* is not 'content-type' */
1934+
if (!contentAttr->value)
1935+
{
1936+
prevNode = currentNode->prev;
1937+
/* maybe need better message here */
1938+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
1939+
TY_(DiscardElement)(doc, currentNode);
1940+
currentNode = prevNode;
19081941
continue;
1942+
}
1943+
/* check encoding matches
1944+
If a mismatch is found here, fix it. This was previously done silently
1945+
in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
1946+
*/
19091947
lcontent = TY_(tmbstrtolower)(contentAttr->value);
1910-
if(TY_(tmbsubstr)(lcontent, (ctmbstr) &charsetString)){
1911-
printf("WARN ABOUT CLASH: %s \n", contentAttr->value);
1948+
if (TY_(tmbsubstr)(lcontent, charsetString.bp))
1949+
{
1950+
/* we already found one, so remove the rest. */
1951+
if (charsetFound)
1952+
{
1953+
prevNode = currentNode->prev;
1954+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
1955+
TY_(DiscardElement)(doc, currentNode);
1956+
currentNode = prevNode;
1957+
continue;
1958+
}
1959+
charsetFound = yes;
1960+
}
1961+
else
1962+
{
1963+
/* fix a mis-match */
1964+
if (charsetFound)
1965+
{
1966+
prevNode = currentNode->prev;
1967+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
1968+
TY_(DiscardElement)(doc, currentNode);
1969+
currentNode = prevNode;
1970+
}
1971+
else
1972+
{
1973+
/* correct the content */
1974+
newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
1975+
TidyDocFree(doc, contentAttr->value);
1976+
TY_(tmbstrcpy)(newValue, "text/html; charset=");
1977+
TY_(tmbstrcpy)(newValue + 19, enc);
1978+
contentAttr->value = newValue;
1979+
charsetFound = yes;
1980+
}
19121981
}
1982+
continue;
19131983
}
19141984
/*
1915-
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
1916-
This is generally bad.
1985+
3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
1986+
This is generally bad. Discard and warn.
19171987
*/
19181988
if(httpEquivAttr && charsetAttr)
19191989
{
1920-
printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n");
1990+
/* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
1991+
prevNode = currentNode->prev;
1992+
TY_(ReportError)(doc, head, currentNode, DISCARDING_UNEXPECTED);
1993+
TY_(DiscardElement)(doc, currentNode);
1994+
currentNode = prevNode;
19211995
}
19221996
}
1923-
if(charsetFound){
1924-
return yes;
1925-
}
1926-
metaTag = TY_(InferredTag)(doc, TidyTag_META);
1927-
switch(TY_(HTMLVersion)(doc))
1997+
1998+
/* completed head scan - add appropriate meta - if 'yes' and none exists */
1999+
if (cfgBool(doc, TidyMetaCharset) && !charsetFound)
19282000
{
2001+
/* add appropriate meta charset tag - no warning */
2002+
metaTag = TY_(InferredTag)(doc, TidyTag_META);
2003+
switch (TY_(HTMLVersion)(doc))
2004+
{
19292005
case HT50:
19302006
case XH50:
1931-
TY_(AddAttribute)( doc, metaTag, "charset", enc);
2007+
TY_(AddAttribute)(doc, metaTag, "charset", enc);
19322008
break;
19332009
default:
19342010
tidyBufInit(&buf);
1935-
tidyBufAppend(&buf, "text/html; charset=", 19);
1936-
tidyBufAppend(&buf, (char*)enc, TY_(tmbstrlen)(enc));
1937-
TY_(AddAttribute)( doc, metaTag, "content", (char*)buf.bp);
2011+
tidyBufAppend(&buf, "text/html; ", 11);
2012+
tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)(charsetString.bp));
2013+
tidyBufAppend(&buf, "\0", 1); /* zero terminate the buffer */
2014+
TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);
19382015
tidyBufFree(&buf);
2016+
}
2017+
TY_(InsertNodeAtStart)(head, metaTag);
19392018
}
1940-
TY_(InsertNodeAtStart)( head, metaTag );
19412019
tidyBufFree(&charsetString);
19422020
return yes;
19432021
}

src/tidylib.c

+1-9
Original file line numberDiff line numberDiff line change
@@ -1992,7 +1992,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
19921992
Bool xhtmlOut = cfgBool( doc, TidyXhtmlOut );
19931993
Bool xmlDecl = cfgBool( doc, TidyXmlDecl );
19941994
Bool tidyMark = cfgBool( doc, TidyMark );
1995-
Bool tidyMetaCharset = cfgBool( doc, TidyMetaCharset);
19961995
Bool tidyXmlTags = cfgBool( doc, TidyXmlTags );
19971996
Bool wantNameAttr = cfgBool( doc, TidyAnchorAsName );
19981997
Bool mergeEmphasis = cfgBool( doc, TidyMergeEmphasis );
@@ -2044,12 +2043,7 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
20442043
#endif
20452044

20462045
/* Reconcile http-equiv meta element with output encoding */
2047-
if (cfg( doc, TidyOutCharEncoding) != RAW
2048-
#ifndef NO_NATIVE_ISO2022_SUPPORT
2049-
&& cfg( doc, TidyOutCharEncoding) != ISO2022
2050-
#endif
2051-
)
2052-
TY_(VerifyHTTPEquiv)( doc, TY_(FindHEAD)( doc ));
2046+
TY_(TidyMetaCharset)(doc);
20532047

20542048
if ( !TY_(CheckNodeIntegrity)( &doc->root ) )
20552049
TidyPanic( doc->allocator, integrity );
@@ -2097,8 +2091,6 @@ int tidyDocCleanAndRepair( TidyDocImpl* doc )
20972091
if (tidyMark )
20982092
TY_(AddGenerator)(doc);
20992093

2100-
if (tidyMetaCharset)
2101-
TY_(TidyMetaCharset)(doc);
21022094
}
21032095

21042096
/* ensure presence of initial <?xml version="1.0"?> */

0 commit comments

Comments
 (0)