You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
5057 lines
150 KiB
5057 lines
150 KiB
/* parser.c -- HTML Parser
|
|
|
|
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
|
|
See tidy.h for the copyright notice.
|
|
|
|
*/
|
|
|
|
#include "tidy-int.h"
|
|
#include "lexer.h"
|
|
#include "parser.h"
|
|
#include "message.h"
|
|
#include "clean.h"
|
|
#include "tags.h"
|
|
#include "tmbstr.h"
|
|
#ifdef _MSC_VER
|
|
#include "sprtf.h"
|
|
#endif
|
|
|
|
#ifndef SPRTF
|
|
#define SPRTF printf
|
|
#endif
|
|
|
|
#ifdef AUTO_INPUT_ENCODING
|
|
#include "charsets.h"
|
|
#endif
|
|
|
|
/*
  Issue #72  - Need to know to avoid error-reporting - no warning only if --show-body-only yes
  Issue #132 - likewise avoid warning if showing body only

  Evaluates to yes when the TidyBodyOnly option of doc is TidyYesState,
  otherwise to no. The whole expansion is parenthesized so that operators
  surrounding a use of the macro cannot bind into the conditional expression.
*/
#define showingBodyOnly(doc) ((cfgAutoBool(doc,TidyBodyOnly) == TidyYesState) ? yes : no)
|
|
|
|
|
|
/*
  Recursively verify the sibling, parent and child links of the subtree
  rooted at node. Returns no at the first inconsistent link found and
  yes otherwise. When NO_NODE_INTEGRITY_CHECK is defined no checks are
  compiled in and the function always returns yes.
*/
Bool TY_(CheckNodeIntegrity)(Node *node)
{
#ifndef NO_NODE_INTEGRITY_CHECK
    Node *kid;

    /* the previous sibling must link forward to this node */
    if (node->prev && node->prev->next != node)
        return no;

    /* the next sibling must not be this node and must link back to it */
    if (node->next && (node->next == node || node->next->prev != node))
        return no;

    if (node->parent)
    {
        /* a node without a previous sibling must be the parent's first child */
        if (!node->prev && node->parent->content != node)
            return no;

        /* a node without a next sibling must be the parent's last child */
        if (!node->next && node->parent->last != node)
            return no;
    }

    /* every child must point back to this node and be consistent itself */
    kid = node->content;
    while (kid)
    {
        if (kid->parent != node)
            return no;
        if (!TY_(CheckNodeIntegrity)(kid))
            return no;
        kid = kid->next;
    }
#endif
    return yes;
}
|
|
|
|
/*
|
|
used to determine how attributes
|
|
without values should be printed
|
|
this was introduced to deal with
|
|
user defined tags e.g. Cold Fusion
|
|
*/
|
|
Bool TY_(IsNewNode)(Node *node)
|
|
{
|
|
if (node && node->tag)
|
|
{
|
|
return (node->tag->model & CM_NEW);
|
|
}
|
|
return yes;
|
|
}
|
|
|
|
/*
  Turn node into an implicit start tag of the element identified by tid.

  A temporary inferred node of the target type is created only so the
  report call can name the replacement; it is freed again right after.
  obsolete selects the OBSOLETE_ELEMENT warning, otherwise unexpected
  selects the REPLACING_UNEX_ELEMENT error, otherwise a REPLACING_ELEMENT
  notice is issued. The node's previous tag is remembered in node->was.
*/
void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
{
    const Dict* newTag = TY_(LookupTagDef)(tid);
    Node* placeholder = TY_(InferredTag)(doc, newTag->id);

    if (!obsolete && !unexpected)
        TY_(ReportNotice)(doc, node, placeholder, REPLACING_ELEMENT);
    else if (obsolete)
        TY_(ReportWarning)(doc, node, placeholder, OBSOLETE_ELEMENT);
    else
        TY_(ReportError)(doc, node, placeholder, REPLACING_UNEX_ELEMENT);

    /* the placeholder was needed for the message only */
    TidyDocFree(doc, placeholder->element);
    TidyDocFree(doc, placeholder);

    /* retag the node, keeping a record of what it used to be */
    node->was = node->tag;
    node->tag = newTag;
    node->type = StartTag;
    node->implicit = yes;

    /* replace the element name string with the new tag's name */
    TidyDocFree(doc, node->element);
    node->element = TY_(tmbstrdup)(doc->allocator, newTag->name);
}
|
|
|
|
/* extract a node and its children from a markup tree */
|
|
Node *TY_(RemoveNode)(Node *node)
|
|
{
|
|
if (node->prev)
|
|
node->prev->next = node->next;
|
|
|
|
if (node->next)
|
|
node->next->prev = node->prev;
|
|
|
|
if (node->parent)
|
|
{
|
|
if (node->parent->content == node)
|
|
node->parent->content = node->next;
|
|
|
|
if (node->parent->last == node)
|
|
node->parent->last = node->prev;
|
|
}
|
|
|
|
node->parent = node->prev = node->next = NULL;
|
|
return node;
|
|
}
|
|
|
|
/* remove node from markup tree and discard it */
|
|
Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
|
|
{
|
|
Node *next = NULL;
|
|
|
|
if (element)
|
|
{
|
|
next = element->next;
|
|
TY_(RemoveNode)(element);
|
|
TY_(FreeNode)( doc, element);
|
|
}
|
|
|
|
return next;
|
|
}
|
|
|
|
/*
|
|
insert "node" into markup tree as the firt element
|
|
of content of "element"
|
|
*/
|
|
void TY_(InsertNodeAtStart)(Node *element, Node *node)
|
|
{
|
|
node->parent = element;
|
|
|
|
if (element->content == NULL)
|
|
element->last = node;
|
|
else
|
|
element->content->prev = node;
|
|
|
|
node->next = element->content;
|
|
node->prev = NULL;
|
|
element->content = node;
|
|
}
|
|
|
|
/*
|
|
insert "node" into markup tree as the last element
|
|
of content of "element"
|
|
*/
|
|
void TY_(InsertNodeAtEnd)(Node *element, Node *node)
|
|
{
|
|
node->parent = element;
|
|
node->prev = element->last;
|
|
|
|
if (element->last != NULL)
|
|
element->last->next = node;
|
|
else
|
|
element->content = node;
|
|
|
|
element->last = node;
|
|
}
|
|
|
|
/*
|
|
insert "node" into markup tree in place of "element"
|
|
which is moved to become the child of the node
|
|
*/
|
|
static void InsertNodeAsParent(Node *element, Node *node)
|
|
{
|
|
node->content = element;
|
|
node->last = element;
|
|
node->parent = element->parent;
|
|
element->parent = node;
|
|
|
|
if (node->parent->content == element)
|
|
node->parent->content = node;
|
|
|
|
if (node->parent->last == element)
|
|
node->parent->last = node;
|
|
|
|
node->prev = element->prev;
|
|
element->prev = NULL;
|
|
|
|
if (node->prev)
|
|
node->prev->next = node;
|
|
|
|
node->next = element->next;
|
|
element->next = NULL;
|
|
|
|
if (node->next)
|
|
node->next->prev = node;
|
|
}
|
|
|
|
/* insert "node" into markup tree before "element" */
|
|
void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
|
|
{
|
|
Node *parent;
|
|
|
|
parent = element->parent;
|
|
node->parent = parent;
|
|
node->next = element;
|
|
node->prev = element->prev;
|
|
element->prev = node;
|
|
|
|
if (node->prev)
|
|
node->prev->next = node;
|
|
|
|
if (parent->content == element)
|
|
parent->content = node;
|
|
}
|
|
|
|
/* insert "node" into markup tree after "element" */
|
|
void TY_(InsertNodeAfterElement)(Node *element, Node *node)
|
|
{
|
|
Node *parent;
|
|
|
|
parent = element->parent;
|
|
node->parent = parent;
|
|
|
|
/* AQ - 13 Jan 2000 fix for parent == NULL */
|
|
if (parent != NULL && parent->last == element)
|
|
parent->last = node;
|
|
else
|
|
{
|
|
node->next = element->next;
|
|
/* AQ - 13 Jan 2000 fix for node->next == NULL */
|
|
if (node->next != NULL)
|
|
node->next->prev = node;
|
|
}
|
|
|
|
element->next = node;
|
|
node->prev = element;
|
|
}
|
|
|
|
/*
  Decide whether element may be discarded as empty.

  Returns no when the TidyDropEmptyElems option is off. Text nodes are
  always prunable. Any other node is prunable only if it has no content,
  has a tag definition, and matches none of the exemptions tested below.
*/
static Bool CanPrune( TidyDocImpl* doc, Node *element )
{
    Bool hasAttrs;

    if ( !cfgBool(doc, TidyDropEmptyElems) )
        return no;

    if ( TY_(nodeIsText)(element) )
        return yes;

    if ( element->content || element->tag == NULL )
        return no;

    hasAttrs = (element->attributes != NULL) ? yes : no;

    /* block level elements and anchors are kept when they carry attributes */
    if ( hasAttrs && ((element->tag->model & CM_BLOCK) || nodeIsA(element)) )
        return no;

    if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
        return no;

    /* table rows and elements declared empty are never pruned */
    if ( element->tag->model & (CM_ROW | CM_EMPTY) )
        return no;

    if ( nodeIsAPPLET(element) || nodeIsOBJECT(element) )
        return no;

    if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
        return no;

    if ( nodeIsTITLE(element) )
        return no;

    /* #433359 - fix by Randy Waki 12 Mar 01 */
    if ( nodeIsIFRAME(element) )
        return no;

    /* fix for bug 770297 */
    if ( nodeIsTEXTAREA(element) )
        return no;

    /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */
    if ( nodeIsCANVAS(element) || nodeIsPROGRESS(element) )
        return no;

    if ( attrGetID(element) || attrGetNAME(element) )
        return no;

    /* fix for bug 695408; a better fix would look for unknown and */
    /* known proprietary attributes that make the element significant */
    if ( attrGetDATAFLD(element) )
        return no;

    /* fix for bug 723772, don't trim new-...-tags */
    if ( element->tag->id == TidyTag_UNKNOWN )
        return no;

    if ( nodeIsBODY(element) || nodeIsCOLGROUP(element) )
        return no;

    /* HTML5 - do NOT drop empty option if it has attributes */
    if ( hasAttrs && nodeIsOPTION(element) )
        return no;

    /* fix for #103 - don't drop empty dd tags lest document not validate */
    if ( nodeIsDD(element) )
        return no;

    return yes;
}
|
|
|
|
/* return next element */
|
|
Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
|
|
{
|
|
if ( CanPrune(doc, element) )
|
|
{
|
|
if (element->type != TextNode)
|
|
TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
|
|
|
|
return TY_(DiscardElement)(doc, element);
|
|
}
|
|
return element->next;
|
|
}
|
|
|
|
/*
  Walk node and its following siblings, depth first, handing every
  element and every zero-length text node to TrimEmptyElement().
  Children are processed before their parent, so a parent emptied by
  the recursion is itself considered for trimming.
  The loop only ends once node is NULL, so the returned value is NULL.
*/
Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
{
    while (node)
    {
        Bool candidate;

        if (node->content)
            TY_(DropEmptyElements)(doc, node->content);

        /* only elements and empty text nodes are trimming candidates */
        candidate = ( TY_(nodeIsElement)(node) ||
                      (TY_(nodeIsText)(node) && !(node->start < node->end)) ) ? yes : no;

        if (candidate)
            node = TY_(TrimEmptyElement)(doc, node);
        else
            node = node->next;
    }

    return node;
}
|
|
|
|
/*
|
|
errors in positioning of form start or end tags
|
|
generally require human intervention to fix
|
|
Issue #166 - repeated <main> element also uses this flag
|
|
to indicate duplicates, discarded
|
|
*/
|
|
static void BadForm( TidyDocImpl* doc )
|
|
{
|
|
doc->badForm |= flg_BadForm;
|
|
/* doc->errors++; */
|
|
}
|
|
|
|
/*
|
|
This maps
|
|
<em>hello </em><strong>world</strong>
|
|
to
|
|
<em>hello</em> <strong>world</strong>
|
|
|
|
If last child of element is a text node
|
|
then trim trailing white space character
|
|
moving it to after element's end tag.
|
|
*/
|
|
static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
|
|
{
|
|
Lexer* lexer = doc->lexer;
|
|
byte c;
|
|
|
|
if (TY_(nodeIsText)(last))
|
|
{
|
|
if (last->end > last->start)
|
|
{
|
|
c = (byte) lexer->lexbuf[ last->end - 1 ];
|
|
|
|
if ( c == ' '
|
|
#ifdef COMMENT_NBSP_FIX
|
|
|| c == 160
|
|
#endif
|
|
)
|
|
{
|
|
#ifdef COMMENT_NBSP_FIX
|
|
/* take care with <td> </td> */
|
|
if ( c == 160 &&
|
|
( element->tag == doc->tags.tag_td ||
|
|
element->tag == doc->tags.tag_th )
|
|
)
|
|
{
|
|
if (last->end > last->start + 1)
|
|
last->end -= 1;
|
|
}
|
|
else
|
|
#endif
|
|
{
|
|
last->end -= 1;
|
|
if ( (element->tag->model & CM_INLINE) &&
|
|
!(element->tag->model & CM_FIELD) )
|
|
lexer->insertspace = yes;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#if 0
/*
  Disabled code: builds a new node whose lexer-buffer range holds the
  textual form of element's tag ("<name>", "</name>", "<name/>" or a
  "<!DOCTYPE ...>" copied from the lexer buffer).
*/
static Node *EscapeTag(Lexer *lexer, Node *element)
{
    Node *node = NewNode(lexer->allocator, lexer);

    node->start = lexer->lexsize;
    AddByte(lexer, '<');

    if (element->type == EndTag)
        AddByte(lexer, '/');

    if (element->element)
    {
        char *ch = element->element;

        while (*ch != '\0')
        {
            AddByte(lexer, *ch);
            ++ch;
        }
    }
    else if (element->type == DocTypeTag)
    {
        uint pos = element->start;

        AddStringLiteral( lexer, "!DOCTYPE " );
        while (pos < element->end)
        {
            AddByte(lexer, lexer->lexbuf[pos]);
            ++pos;
        }
    }

    if (element->type == StartEndTag)
        AddByte(lexer, '/');

    AddByte(lexer, '>');
    node->end = lexer->lexsize;

    return node;
}
#endif /* 0 */
|
|
|
|
/* Only true for text nodes. */
|
|
Bool TY_(IsBlank)(Lexer *lexer, Node *node)
|
|
{
|
|
Bool isBlank = TY_(nodeIsText)(node);
|
|
if ( isBlank )
|
|
isBlank = ( node->end == node->start || /* Zero length */
|
|
( node->end == node->start+1 /* or one blank. */
|
|
&& lexer->lexbuf[node->start] == ' ' ) );
|
|
return isBlank;
|
|
}
|
|
|
|
/*
|
|
This maps
|
|
<p>hello<em> world</em>
|
|
to
|
|
<p>hello <em>world</em>
|
|
|
|
Trims initial space, by moving it before the
|
|
start tag, or if this element is the first in
|
|
parent's content, then by discarding the space
|
|
*/
|
|
static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
|
|
{
|
|
Lexer* lexer = doc->lexer;
|
|
Node *prev, *node;
|
|
|
|
if ( TY_(nodeIsText)(text) &&
|
|
lexer->lexbuf[text->start] == ' ' &&
|
|
text->start < text->end )
|
|
{
|
|
if ( (element->tag->model & CM_INLINE) &&
|
|
!(element->tag->model & CM_FIELD) )
|
|
{
|
|
prev = element->prev;
|
|
|
|
if (TY_(nodeIsText)(prev))
|
|
{
|
|
if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
|
|
lexer->lexbuf[(prev->end)++] = ' ';
|
|
|
|
++(element->start);
|
|
}
|
|
else /* create new node */
|
|
{
|
|
node = TY_(NewNode)(lexer->allocator, lexer);
|
|
node->start = (element->start)++;
|
|
node->end = element->start;
|
|
lexer->lexbuf[node->start] = ' ';
|
|
TY_(InsertNodeBeforeElement)(element ,node);
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n",
|
|
(element->element ? element->element : "unknown"));
|
|
#endif
|
|
}
|
|
}
|
|
|
|
/* discard the space in current node */
|
|
++(text->start);
|
|
}
|
|
}
|
|
|
|
/*
  Returns yes when any ancestor of node has a tag whose parser is
  ParsePre, no otherwise. The node itself is not examined.
*/
static Bool IsPreDescendant(Node* node)
{
    Node *ancestor;

    for (ancestor = node->parent; ancestor != NULL; ancestor = ancestor->parent)
    {
        if (ancestor->tag && ancestor->tag->parser == TY_(ParsePre))
            return yes;
    }

    return no;
}
|
|
|
|
/*
  Decide whether trailing white space of the text node "node" should be
  removed. Never for non-text nodes, children of a doctype node,
  descendants of an element parsed by ParsePre, or children of an
  element parsed by ParseScript. Otherwise the answer depends on what
  follows the text, as illustrated by the examples below.
*/
static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
{
    Node *owner;
    Node *after;

    if (!TY_(nodeIsText)(node))
        return no;

    owner = node->parent;

    if (owner->type == DocTypeTag || IsPreDescendant(node))
        return no;

    if (owner->tag && owner->tag->parser == TY_(ParseScript))
        return no;

    after = node->next;

    if (after == NULL)
    {
        /* <p>... </p> */
        if (!TY_(nodeHasCM)(owner, CM_INLINE))
            return yes;

        /* <div><small>... </small><h3>...</h3></div> */
        if (owner->next && !TY_(nodeHasCM)(owner->next, CM_INLINE))
            return yes;

        return no;
    }

    if (nodeIsBR(after))
        return yes;

    if (TY_(nodeHasCM)(after, CM_INLINE))
        return no;

    /* <a href='/'>...</a> <p>...</p>  and  <strong>...</strong> <hr /> */
    if (after->type == StartTag || after->type == StartEndTag)
        return yes;

    /* evil adjacent text nodes, Tidy should not generate these :-( */
    if (TY_(nodeIsText)(after) && after->start < after->end
        && TY_(IsWhite)(doc->lexer->lexbuf[after->start]))
        return yes;

    return no;
}
|
|
|
|
/*
  Decide whether leading white space of the text node "node" should be
  removed. Never for non-text nodes, children of a doctype node,
  descendants of an element parsed by ParsePre, or children of an
  element parsed by ParseScript. Otherwise the answer depends on what
  precedes the text, as illustrated by the examples below.
*/
static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
{
    Node *owner;
    Node *before;

    if (!TY_(nodeIsText)(node))
        return no;

    owner = node->parent;

    if (owner->type == DocTypeTag || IsPreDescendant(node))
        return no;

    if (owner->tag && owner->tag->parser == TY_(ParseScript))
        return no;

    before = node->prev;

    /* <p>...<br> <em>...</em>...</p> */
    if (nodeIsBR(before))
        return yes;

    if (before == NULL)
    {
        /* <p> ...</p> */
        if (!TY_(nodeHasCM)(owner, CM_INLINE))
            return yes;

        /* <p><span> ...</span></p> */
        if (!owner->prev && !TY_(nodeHasCM)(owner->parent, CM_INLINE))
            return yes;

        return no;
    }

    /* <h4>...</h4> <em>...</em> */
    if (!TY_(nodeHasCM)(before, CM_INLINE) && TY_(nodeIsElement)(before))
        return yes;

    return no;
}
|
|
|
|
/*
  Walk node and its following siblings recursively. For text nodes,
  strip leading and/or trailing white space where the two Clean*
  predicates allow it, and remove and free any text node left empty.
  All nodes that remain have their content processed recursively.
*/
static void CleanSpaces(TidyDocImpl* doc, Node* node)
{
    Node* following;

    for ( ; node != NULL; node = following)
    {
        /* fetched up front because node may be freed below */
        following = node->next;

        if (TY_(nodeIsText)(node))
        {
            if (CleanLeadingWhitespace(doc, node))
            {
                while (node->start < node->end &&
                       TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
                    ++(node->start);
            }

            if (CleanTrailingWhitespace(doc, node))
            {
                while (node->end > node->start &&
                       TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
                    --(node->end);
            }

            /* nothing left: drop the text node altogether */
            if (!(node->start < node->end))
            {
                TY_(RemoveNode)(node);
                TY_(FreeNode)(doc, node);
                continue;
            }
        }

        if (node->content)
            CleanSpaces(doc, node->content);
    }
}
|
|
|
|
/*
|
|
Move initial and trailing space out.
|
|
This routine maps:
|
|
|
|
hello<em> world</em>
|
|
to
|
|
hello <em>world</em>
|
|
and
|
|
<em>hello </em><strong>world</strong>
|
|
to
|
|
<em>hello</em> <strong>world</strong>
|
|
*/
|
|
static void TrimSpaces( TidyDocImpl* doc, Node *element)
|
|
{
|
|
Node* text = element->content;
|
|
|
|
if (nodeIsPRE(element) || IsPreDescendant(element))
|
|
return;
|
|
|
|
if (TY_(nodeIsText)(text))
|
|
TrimInitialSpace(doc, element, text);
|
|
|
|
text = element->last;
|
|
|
|
if (TY_(nodeIsText)(text))
|
|
TrimTrailingSpace(doc, element, text);
|
|
}
|
|
|
|
/*
  Returns yes when some ancestor of element has the tag id tid,
  no otherwise. The element itself is not tested.
*/
static Bool DescendantOf( Node *element, TidyTagId tid )
{
    Node *ancestor = element->parent;

    while ( ancestor != NULL )
    {
        if ( TagIsId(ancestor, tid) )
            return yes;
        ancestor = ancestor->parent;
    }
    return no;
}
|
|
|
|
/*
  Try to place a "miscellaneous" node without further parsing.

  Comments, processing instructions, CDATA, sections, ASP, JSTE and PHP
  nodes are appended to element. An XML declaration is inserted at the
  very start of the tree root unless the root already begins with one.
  Elements with a tag id of TidyTag_UNKNOWN that are declared empty and
  carry the VERS_PROPRIETARY version bit are appended to element too.
  Returns yes when node was inserted, no when the caller must deal
  with it.
*/
static Bool InsertMisc(Node *element, Node *node)
{
    switch (node->type)
    {
    case CommentTag:
    case ProcInsTag:
    case CDATATag:
    case SectionTag:
    case AspTag:
    case JsteTag:
    case PhpTag:
        TY_(InsertNodeAtEnd)(element, node);
        return yes;

    case XmlDecl:
        {
            /* climb to the top of the tree */
            Node* root = element;
            while ( root && root->parent )
                root = root->parent;

            if ( root &&
                 (root->content == NULL || root->content->type != XmlDecl) )
            {
                TY_(InsertNodeAtStart)( root, node );
                return yes;
            }
        }
        break;

    default:
        break;
    }

    /* Declared empty tags seem to be slipping through
    ** the cracks. This is an experiment to figure out
    ** a decent place to pick them up.
    */
    if ( node->tag &&
         TY_(nodeIsElement)(node) &&
         TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
         (node->tag->versions & VERS_PROPRIETARY) != 0 )
    {
        TY_(InsertNodeAtEnd)(element, node);
        return yes;
    }

    return no;
}
|
|
|
|
|
|
/*
  Dispatch node to the parser routine registered for its tag.

  Nodes without a tag definition are ignored ([i_a]2 prevent crash for
  active content (php, asp) docs). Before dispatching, lexer white
  space state is adjusted (fix by GLP 2000-12-21): waswhite is cleared
  for CM_EMPTY tags, insertspace is cleared for other non-inline tags.
  Nothing is dispatched when the tag has no parser or the node is a
  StartEndTag.
*/
static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    const Dict* def = node->tag;

    if (def == NULL)
        return;

    if (def->model & CM_EMPTY)
        lexer->waswhite = no;
    else if (!(def->model & CM_INLINE))
        lexer->insertspace = no;

    if (def->parser == NULL || node->type == StartEndTag)
        return;

    lexer->parent = node; /* [i_a]2 added this - not sure why - CHECKME: */

    (*def->parser)( doc, node, mode );
}
|
|
|
|
/*
|
|
the doctype has been found after other tags,
|
|
and needs moving to before the html element
|
|
*/
|
|
static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
|
|
{
|
|
Node* existing = TY_(FindDocType)( doc );
|
|
if ( existing )
|
|
{
|
|
TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, doctype );
|
|
}
|
|
else
|
|
{
|
|
TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
|
|
while ( !nodeIsHTML(element) )
|
|
element = element->parent;
|
|
TY_(InsertNodeBeforeElement)( element, doctype );
|
|
}
|
|
}
|
|
|
|
/*
|
|
move node to the head, where element is used as starting
|
|
point in hunt for head. normally called during parsing
|
|
*/
|
|
static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
|
|
{
|
|
Node *head;
|
|
|
|
TY_(RemoveNode)( node ); /* make sure that node is isolated */
|
|
|
|
if ( TY_(nodeIsElement)(node) )
|
|
{
|
|
TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN );
|
|
|
|
head = TY_(FindHEAD)(doc);
|
|
assert(head != NULL);
|
|
|
|
TY_(InsertNodeAtEnd)(head, node);
|
|
|
|
if ( node->tag->parser )
|
|
ParseTag( doc, node, IgnoreWhitespace );
|
|
}
|
|
else
|
|
{
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
|
|
TY_(FreeNode)( doc, node );
|
|
}
|
|
}
|
|
|
|
/* moves given node to end of body element */
|
|
static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
|
|
{
|
|
Node* body = TY_(FindBody)( doc );
|
|
if ( body )
|
|
{
|
|
TY_(RemoveNode)( node );
|
|
TY_(InsertNodeAtEnd)( body, node );
|
|
}
|
|
}
|
|
|
|
/*
  Decorate node with a fixed set of padding/margin style properties.
  Only active when the TidyDecorateInferredUL option is set. With
  TidyMakeClean the properties are added as a class, otherwise they
  are added as a style property on the node.
*/
static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
{
    ctmbstr sprop =
        "padding-left: 2ex; margin-left: 0ex"
        "; margin-top: 0ex; margin-bottom: 0ex";

    if ( cfgBool(doc, TidyDecorateInferredUL) )
    {
        if ( !cfgBool(doc, TidyMakeClean) )
            TY_(AddStyleProperty)( doc, node, sprop );
        else
            TY_(AddStyleAsClass)( doc, node, sprop );
    }
}
|
|
|
|
/*
|
|
element is node created by the lexer
|
|
upon seeing the start tag, or by the
|
|
parser when the start tag is inferred
|
|
*/
|
|
void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
|
|
{
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
static int in_parse_block = 0;
|
|
static int parse_block_cnt = 0;
|
|
#endif
|
|
Lexer* lexer = doc->lexer;
|
|
Node *node;
|
|
Bool checkstack = yes;
|
|
uint istackbase = 0;
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block++;
|
|
parse_block_cnt++;
|
|
SPRTF("Entering ParseBlock %d... %d %s\n",in_parse_block,parse_block_cnt,
|
|
((element && element->element) ? element->element : ""));
|
|
#endif
|
|
|
|
if ( element->tag->model & CM_EMPTY ) {
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlockL 1 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
if ( nodeIsFORM(element) &&
|
|
DescendantOf(element, TidyTag_FORM) )
|
|
TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING );
|
|
|
|
/*
|
|
InlineDup() asks the lexer to insert inline emphasis tags
|
|
currently pushed on the istack, but take care to avoid
|
|
propagating inline emphasis inside OBJECT or APPLET.
|
|
For these elements a fresh inline stack context is created
|
|
and disposed of upon reaching the end of the element.
|
|
They thus behave like table cells in this respect.
|
|
*/
|
|
if (element->tag->model & CM_OBJECT)
|
|
{
|
|
istackbase = lexer->istackbase;
|
|
lexer->istackbase = lexer->istacksize;
|
|
}
|
|
|
|
if (!(element->tag->model & CM_MIXED))
|
|
TY_(InlineDup)( doc, NULL );
|
|
|
|
/*\
|
|
* Issue #212 - If it is likely that it may be necessary
|
|
* to move a leading space into a text node before this
|
|
* element, then keep the mode MixedContent to keep any
|
|
* leading space
|
|
\*/
|
|
if ( !(element->tag->model & CM_INLINE) ||
|
|
(element->tag->model & CM_FIELD ) )
|
|
{
|
|
mode = IgnoreWhitespace;
|
|
}
|
|
else if (mode == IgnoreWhitespace)
|
|
{
|
|
/* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace'
|
|
when such a leading space may need to be inserted before this element to
|
|
preserve the browser view */
|
|
mode = MixedContent;
|
|
}
|
|
|
|
while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
|
|
{
|
|
/* end tag for this element */
|
|
if (node->type == EndTag && node->tag &&
|
|
(node->tag == element->tag || element->was == node->tag))
|
|
{
|
|
TY_(FreeNode)( doc, node );
|
|
|
|
if (element->tag->model & CM_OBJECT)
|
|
{
|
|
/* pop inline stack */
|
|
while (lexer->istacksize > lexer->istackbase)
|
|
TY_(PopInline)( doc, NULL );
|
|
lexer->istackbase = istackbase;
|
|
}
|
|
|
|
element->closed = yes;
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
#if OBSOLETE /* Issue #380 Kill this code! But leave in src, just in case! */
|
|
if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
|
|
{
|
|
/* If we're in the HEAD, close it before proceeding.
|
|
This is an extremely rare occurrence, but has been observed.
|
|
****************************************************************
|
|
Issue #380 - This can cause an INFINITE loop!
|
|
This code was added to SF CVS Tidy
|
|
revision 1.121 by lpassey, Wed Jul 28 18:08:06 2004 UTC
|
|
****************************************************************
|
|
*/
|
|
TY_(UngetToken)( doc );
|
|
break;
|
|
}
|
|
#endif /* #if OBSOLETE */
|
|
|
|
if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
|
|
{
|
|
if ( TY_(nodeIsElement)(node) )
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
|
|
|
|
if (node->type == EndTag)
|
|
{
|
|
if (node->tag == NULL)
|
|
{
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
else if ( nodeIsBR(node) )
|
|
node->type = StartTag;
|
|
else if ( nodeIsP(node) )
|
|
{
|
|
/* Cannot have a block inside a paragraph, so no checking
|
|
for an ancestor is necessary -- but we _can_ have
|
|
paragraphs inside a block, so change it to an implicit
|
|
empty paragraph, to be dealt with according to the user's
|
|
options
|
|
*/
|
|
node->type = StartEndTag;
|
|
node->implicit = yes;
|
|
#if OBSOLETE
|
|
TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
|
|
TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
|
|
TY_(InsertNodeAtEnd)( element, node );
|
|
node = InferredTag(doc, TidyTag_BR);
|
|
#endif
|
|
}
|
|
else if (DescendantOf( element, node->tag->id ))
|
|
{
|
|
/*
|
|
if this is the end tag for an ancestor element
|
|
then infer end tag for this element
|
|
*/
|
|
TY_(UngetToken)( doc );
|
|
break;
|
|
#if OBSOLETE
|
|
Node *parent;
|
|
for ( parent = element->parent;
|
|
parent != NULL;
|
|
parent = parent->parent )
|
|
{
|
|
if (node->tag == parent->tag)
|
|
{
|
|
if (!(element->tag->model & CM_OPT))
|
|
TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
|
|
|
|
TY_(UngetToken)( doc );
|
|
|
|
if (element->tag->model & CM_OBJECT)
|
|
{
|
|
/* pop inline stack */
|
|
while (lexer->istacksize > lexer->istackbase)
|
|
TY_(PopInline)( doc, NULL );
|
|
lexer->istackbase = istackbase;
|
|
}
|
|
|
|
TrimSpaces( doc, element );
|
|
return;
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
/* special case </tr> etc. for stuff moved in front of table */
|
|
if ( lexer->exiled
|
|
&& (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
|
|
{
|
|
TY_(UngetToken)( doc );
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 2 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* mixed content model permits text */
|
|
if (TY_(nodeIsText)(node))
|
|
{
|
|
if ( checkstack )
|
|
{
|
|
checkstack = no;
|
|
if (!(element->tag->model & CM_MIXED))
|
|
{
|
|
if ( TY_(InlineDup)(doc, node) > 0 )
|
|
continue;
|
|
}
|
|
}
|
|
|
|
TY_(InsertNodeAtEnd)(element, node);
|
|
mode = MixedContent;
|
|
|
|
/*
|
|
HTML4 strict doesn't allow mixed content for
|
|
elements with %block; as their content model
|
|
*/
|
|
/*
|
|
But only body, map, blockquote, form and
|
|
noscript have content model %block;
|
|
*/
|
|
if ( nodeIsBODY(element) ||
|
|
nodeIsMAP(element) ||
|
|
nodeIsBLOCKQUOTE(element) ||
|
|
nodeIsFORM(element) ||
|
|
nodeIsNOSCRIPT(element) )
|
|
TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
|
|
continue;
|
|
}
|
|
|
|
if ( InsertMisc(element, node) )
|
|
continue;
|
|
|
|
/* allow PARAM elements? */
|
|
if ( nodeIsPARAM(node) )
|
|
{
|
|
if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
|
|
{
|
|
TY_(InsertNodeAtEnd)(element, node);
|
|
continue;
|
|
}
|
|
|
|
/* otherwise discard it */
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
|
|
/* allow AREA elements? */
|
|
if ( nodeIsAREA(node) )
|
|
{
|
|
if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
|
|
{
|
|
TY_(InsertNodeAtEnd)(element, node);
|
|
continue;
|
|
}
|
|
|
|
/* otherwise discard it */
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
|
|
/* ignore unknown start/end tags */
|
|
if ( node->tag == NULL )
|
|
{
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
Allow CM_INLINE elements here.
|
|
|
|
Allow CM_BLOCK elements here unless
|
|
lexer->excludeBlocks is yes.
|
|
|
|
LI and DD are special cased.
|
|
|
|
Otherwise infer end tag for this element.
|
|
*/
|
|
|
|
if ( !TY_(nodeHasCM)(node, CM_INLINE) )
|
|
{
|
|
if ( !TY_(nodeIsElement)(node) )
|
|
{
|
|
if ( nodeIsFORM(node) )
|
|
BadForm( doc );
|
|
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
|
|
/* #427671 - Fix by Randy Waki - 10 Aug 00 */
|
|
/*
|
|
If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
|
|
start tag, discard the start tag and let the subsequent content get
|
|
parsed as content of the enclosing LI. This seems to mimic IE and
|
|
Netscape, and avoids an infinite loop: without this check,
|
|
ParseBlock (which is parsing the LI's content) and ParseList (which
|
|
is parsing the LI's parent's content) repeatedly defer to each
|
|
other to parse the illegal start tag, each time inferring a missing
|
|
</li> or <li> respectively.
|
|
|
|
NOTE: This check is a bit fragile. It specifically checks for the
|
|
four tags that happen to weave their way through the current series
|
|
of tests performed by ParseBlock and ParseList to trigger the
|
|
infinite loop.
|
|
*/
|
|
if ( nodeIsLI(element) )
|
|
{
|
|
if ( nodeIsFRAME(node) ||
|
|
nodeIsFRAMESET(node) ||
|
|
nodeIsOPTGROUP(node) ||
|
|
nodeIsOPTION(node) )
|
|
{
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if ( nodeIsTD(element) || nodeIsTH(element) )
|
|
{
|
|
/* if parent is a table cell, avoid inferring the end of the cell */
|
|
|
|
if ( TY_(nodeHasCM)(node, CM_HEAD) )
|
|
{
|
|
MoveToHead( doc, element, node );
|
|
continue;
|
|
}
|
|
|
|
if ( TY_(nodeHasCM)(node, CM_LIST) )
|
|
{
|
|
TY_(UngetToken)( doc );
|
|
node = TY_(InferredTag)(doc, TidyTag_UL);
|
|
AddClassNoIndent(doc, node);
|
|
lexer->excludeBlocks = yes;
|
|
}
|
|
else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
|
|
{
|
|
TY_(UngetToken)( doc );
|
|
node = TY_(InferredTag)(doc, TidyTag_DL);
|
|
lexer->excludeBlocks = yes;
|
|
}
|
|
|
|
/* infer end of current table cell */
|
|
if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
|
|
{
|
|
TY_(UngetToken)( doc );
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 3 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
}
|
|
else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
|
|
{
|
|
if ( lexer->excludeBlocks )
|
|
{
|
|
if ( !TY_(nodeHasCM)(element, CM_OPT) )
|
|
TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
|
|
|
|
TY_(UngetToken)( doc );
|
|
|
|
if ( TY_(nodeHasCM)(element, CM_OBJECT) )
|
|
lexer->istackbase = istackbase;
|
|
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 4 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
}
|
|
else /* things like list items */
|
|
{
|
|
if (node->tag->model & CM_HEAD)
|
|
{
|
|
MoveToHead( doc, element, node );
|
|
continue;
|
|
}
|
|
|
|
/*
|
|
special case where a form start tag
|
|
occurs in a tr and is followed by td or th
|
|
*/
|
|
|
|
if ( nodeIsFORM(element) &&
|
|
nodeIsTD(element->parent) &&
|
|
element->parent->implicit )
|
|
{
|
|
if ( nodeIsTD(node) )
|
|
{
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
|
|
if ( nodeIsTH(node) )
|
|
{
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
node = element->parent;
|
|
TidyDocFree(doc, node->element);
|
|
node->element = TY_(tmbstrdup)(doc->allocator, "th");
|
|
node->tag = TY_(LookupTagDef)( TidyTag_TH );
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
|
|
TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
|
|
|
|
TY_(UngetToken)( doc );
|
|
|
|
if ( TY_(nodeHasCM)(node, CM_LIST) )
|
|
{
|
|
if ( element->parent && element->parent->tag &&
|
|
element->parent->tag->parser == TY_(ParseList) )
|
|
{
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 5 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
node = TY_(InferredTag)(doc, TidyTag_UL);
|
|
AddClassNoIndent(doc, node);
|
|
}
|
|
else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
|
|
{
|
|
if ( nodeIsDL(element->parent) )
|
|
{
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 6 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
node = TY_(InferredTag)(doc, TidyTag_DL);
|
|
}
|
|
else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
|
|
{
|
|
/* http://tidy.sf.net/issue/1316307 */
|
|
/* In exiled mode, return so table processing can
|
|
continue. */
|
|
if (lexer->exiled) {
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 7 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
node = TY_(InferredTag)(doc, TidyTag_TABLE);
|
|
}
|
|
else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
|
|
{
|
|
/* pop inline stack */
|
|
while ( lexer->istacksize > lexer->istackbase )
|
|
TY_(PopInline)( doc, NULL );
|
|
lexer->istackbase = istackbase;
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 8 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
|
|
}
|
|
else
|
|
{
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 9 %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*\
|
|
* Issue #307 - an <A> tag to ends any open <A> element
|
|
* Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00
|
|
* in ParseInline(), fix copied HERE to ParseBlock()
|
|
* href: http://www.w3.org/TR/html-markup/a.html
|
|
* The interactive element a must not appear as a descendant of the a element.
|
|
\*/
|
|
if ( nodeIsA(node) && !node->implicit &&
|
|
(nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
|
|
{
|
|
if (node->type != EndTag && node->attributes == NULL
|
|
&& cfgBool(doc, TidyCoerceEndTags) )
|
|
{
|
|
node->type = EndTag;
|
|
TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
|
|
TY_(UngetToken)( doc );
|
|
continue;
|
|
}
|
|
|
|
if (nodeIsA(element))
|
|
{
|
|
TY_(UngetToken)( doc );
|
|
}
|
|
TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
|
|
|
|
if (!(mode & Preformatted))
|
|
TrimSpaces(doc, element);
|
|
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 9b %d...\n",in_parse_block);
|
|
#endif
|
|
return;
|
|
}
|
|
|
|
/* parse known element */
|
|
if (TY_(nodeIsElement)(node))
|
|
{
|
|
if (node->tag->model & CM_INLINE)
|
|
{
|
|
if (checkstack && !node->implicit)
|
|
{
|
|
checkstack = no;
|
|
|
|
if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
|
|
{
|
|
if ( TY_(InlineDup)(doc, node) > 0 )
|
|
continue;
|
|
}
|
|
}
|
|
|
|
mode = MixedContent;
|
|
}
|
|
else
|
|
{
|
|
checkstack = yes;
|
|
mode = IgnoreWhitespace;
|
|
}
|
|
|
|
/* trim white space before <br> */
|
|
if ( nodeIsBR(node) )
|
|
TrimSpaces( doc, element );
|
|
|
|
TY_(InsertNodeAtEnd)(element, node);
|
|
|
|
if (node->implicit)
|
|
TY_(ReportError)(doc, element, node, INSERTING_TAG );
|
|
|
|
/* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an
|
|
effort has been made above to set a 'MixedContent' mode in some cases?
|
|
WHY IS THE 'mode' VARIABLE NOT USED HERE???? */
|
|
ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
|
|
continue;
|
|
}
|
|
|
|
/* discard unexpected tags */
|
|
if (node->type == EndTag)
|
|
TY_(PopInline)( doc, node ); /* if inline end tag */
|
|
|
|
TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
|
|
TY_(FreeNode)( doc, node );
|
|
continue;
|
|
}
|
|
|
|
if (!(element->tag->model & CM_OPT))
|
|
TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
|
|
|
|
if (element->tag->model & CM_OBJECT)
|
|
{
|
|
/* pop inline stack */
|
|
while ( lexer->istacksize > lexer->istackbase )
|
|
TY_(PopInline)( doc, NULL );
|
|
lexer->istackbase = istackbase;
|
|
}
|
|
|
|
TrimSpaces( doc, element );
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
in_parse_block--;
|
|
SPRTF("Exit ParseBlock 10 %d...\n",in_parse_block);
|
|
#endif
|
|
}
|
|
|
|
/* [i_a] svg / math */
|
|
|
|
struct MatchingDescendantData
|
|
{
|
|
Node *found_node;
|
|
Bool *passed_marker_node;
|
|
|
|
/* input: */
|
|
TidyTagId matching_tagId;
|
|
Node *node_to_find;
|
|
Node *marker_node;
|
|
};
|
|
|
|
static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate)
|
|
{
|
|
struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate;
|
|
|
|
if (TagId(node) == cb_data->matching_tagId)
|
|
{
|
|
/* make sure we match up 'unknown' tags exactly! */
|
|
if (cb_data->matching_tagId != TidyTag_UNKNOWN ||
|
|
(node->element != NULL &&
|
|
cb_data->node_to_find != NULL &&
|
|
cb_data->node_to_find->element != NULL &&
|
|
0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element)))
|
|
{
|
|
cb_data->found_node = node;
|
|
return ExitTraversal;
|
|
}
|
|
}
|
|
|
|
if (cb_data->passed_marker_node && node == cb_data->marker_node)
|
|
*cb_data->passed_marker_node = yes;
|
|
|
|
return VisitParent;
|
|
}
|
|
|
|
/*
|
|
Search the parent chain (from 'parent' upwards up to the root) for a node matching the
|
|
given 'node'.
|
|
|
|
When the search passes beyond the 'marker_node' (which is assumed to sit in the
|
|
parent chain), this will be flagged by setting the boolean referenced by
|
|
'is_parent_of_marker' to yes.
|
|
|
|
'is_parent_of_marker' and 'marker_node' are optional parameters and may be NULL.
|
|
*/
|
|
static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
|
|
{
|
|
struct MatchingDescendantData cb_data = { 0 };
|
|
cb_data.matching_tagId = TagId(node);
|
|
cb_data.node_to_find = node;
|
|
cb_data.marker_node = marker_node;
|
|
|
|
assert(node);
|
|
|
|
if (is_parent_of_marker)
|
|
*is_parent_of_marker = no;
|
|
|
|
TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data);
|
|
return cb_data.found_node;
|
|
}
|
|
|
|
/*
|
|
Act as a generic XML (sub)tree parser: collect each node and add it to the DOM, without any further validation.
|
|
TODO : add schema- or other-hierarchy-definition-based validation of the subtree here...
|
|
*/
|
|
void TY_(ParseNamespace)(TidyDocImpl* doc, Node *basenode, GetTokenMode mode)
|
|
{
|
|
Lexer* lexer = doc->lexer;
|
|
Node *node;
|
|
Node *parent = basenode;
|
|
uint istackbase;
|
|
AttVal* av; /* #130 MathML attr and entity fix! */
|
|
|
|
/* a la <table>: defer popping elements off the inline stack */
|
|
TY_(DeferDup)( doc );
|
|
istackbase = lexer->istackbase;
|
|
lexer->istackbase = lexer->istacksize;
|
|
|
|
mode = OtherNamespace; /* Preformatted; IgnoreWhitespace; */
|
|
|
|
while ((node = TY_(GetToken)(doc, mode)) != NULL)
|
|
{
|
|
/*
|
|
fix check to skip action in InsertMisc for regular/empty
|
|
nodes, which we don't want here...
|
|
|
|
The way we do it here is by checking and processing everything
|
|
and only what remains goes into InsertMisc()
|
|
*/
|
|
|
|
/* is this a close tag? And does it match the current parent node? */
|
|
if (node->type == EndTag)
|
|
{
|
|
/*
|
|
to prevent end tags flowing from one 'alternate namespace' we
|
|
check this in two phases: first we check if the tag is a
|
|
descendant of the current node, and when it is, we check whether
|
|
it is the end tag for a node /within/ or /outside/ the basenode.
|
|
*/
|
|
Bool outside;
|
|
Node *mp = FindMatchingDescendant(parent, node, basenode, &outside);
|
|
|
|
if (mp != NULL)
|
|
{
|
|
/*
|
|
when mp != parent as we might expect,
|
|
infer end tags until we 'hit' the matched
|
|
parent or the basenode
|
|
*/
|
|
Node *n;
|
|
|
|
for (n = parent;
|
|
n != NULL && n != basenode->parent && n != mp;
|
|
n = n->parent)
|
|
{
|
|
/* n->implicit = yes; */
|
|
n->closed = yes;
|
|
TY_(ReportError)(doc, n->parent, n, MISSING_ENDTAG_BEFORE);
|
|
}
|
|
|
|
/* Issue #369 - Since 'assert' is DEBUG only, and there are
|
|
simple cases where these can be fired, removing them
|
|
pending feedback from the original author!
|
|
assert(outside == no ? n == mp : 1);
|
|
assert(outside == yes ? n == basenode->parent : 1);
|
|
=================================================== */
|
|
|
|
if (outside == no)
|
|
{
|
|
/* EndTag for a node within the basenode subtree. Roll on... */
|
|
n->closed = yes;
|
|
TY_(FreeNode)(doc, node);
|
|
|
|
node = n;
|
|
parent = node->parent;
|
|
}
|
|
else
|
|
{
|
|
/* EndTag for a node outside the basenode subtree: let the caller handle that. */
|
|
TY_(UngetToken)( doc );
|
|
node = basenode;
|
|
parent = node->parent;
|
|
}
|
|
|
|
/* when we've arrived at the end-node for the base node, it's quitting time */
|
|
if (node == basenode)
|
|
{
|
|
lexer->istackbase = istackbase;
|
|
assert(basenode->closed == yes);
|
|
return;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* unmatched close tag: report an error and discard */
|
|
/* TY_(ReportError)(doc, parent, node, NON_MATCHING_ENDTAG); Issue #308 - Seems wrong warning! */
|
|
TY_(ReportError)(doc, parent, node, DISCARDING_UNEXPECTED);
|
|
assert(parent);
|
|
/* assert(parent->tag != node->tag); Issue #308 - Seems would always be true! */
|
|
TY_(FreeNode)( doc, node); /* Issue #308 - Discard unexpected end tag memory */
|
|
}
|
|
}
|
|
else if (node->type == StartTag)
|
|
{
|
|
/* #130 MathML attr and entity fix!
|
|
care if it has attributes, and 'accidently' any of those attributes match known */
|
|
for ( av = node->attributes; av; av = av->next )
|
|
{
|
|
av->dict = 0; /* does something need to be freed? */
|
|
}
|
|
/* add another child to the current parent */
|
|
TY_(InsertNodeAtEnd)(parent, node);
|
|
parent = node;
|
|
}
|
|
else
|
|
{
|
|
/* #130 MathML attr and entity fix!
|
|
care if it has attributes, and 'accidently' any of those attributes match known */
|
|
for ( av = node->attributes; av; av = av->next )
|
|
{
|
|
av->dict = 0; /* does something need to be freed? */
|
|
}
|
|
TY_(InsertNodeAtEnd)(parent, node);
|
|
}
|
|
}
|
|
|
|
TY_(ReportError)(doc, basenode->parent, basenode, MISSING_ENDTAG_FOR);
|
|
}
|
|
|
|
|
|
/*
  Parse the content of 'element' as inline content.

  doc     - document being parsed
  element - element whose children are collected
  mode    - token mode; anything other than Preformatted is treated as
            MixedContent

  Tokens are read until the element's end tag is found or until a token
  is met that implies the element has ended; in the latter case the token
  is normally pushed back for the caller. Trailing white space of the
  element is trimmed on most exits unless the mode is Preformatted.
*/
void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
{
#if !defined(NDEBUG) && defined(_MSC_VER)
    static int in_parse_inline = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node, *ancestor;
    Bool preformatted;
#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_inline++;
    SPRTF("Entering ParseInline %d...\n",in_parse_inline);
#endif

    /* empty elements have no content to parse */
    if (element->tag->model & CM_EMPTY)
    {
#if !defined(NDEBUG) && defined(_MSC_VER)
        in_parse_inline--;
        SPRTF("Exit ParseInline 1 %d...\n",in_parse_inline);
#endif
        return;
    }

    /*
      ParseInline also serves some block level elements such as H1 to H6.
      For those the inline emphasis tags currently on the inline stack
      have to be inserted. Inline elements are normally pushed onto the
      inline stack, provided they aren't implicit or OBJECT/APPLET; that
      test lives in PushInline and PopInline, see istack.c

      InlineDup(...) is skipped for elements with a CM_MIXED (inline and
      block) content model, e.g. <del> or <ins>, because otherwise

        <p>111<a name='foo'>222<del>333</del>444</a>555</p>
        <p>111<span>222<del>333</del>444</span>555</p>
        <p>111<em>222<del>333</del>444</em>555</p>

      would get corrupted.
    */
    if ( ( TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element) ) &&
         !TY_(nodeHasCM)(element, CM_MIXED) )
    {
        TY_(InlineDup)(doc, NULL);
    }
    else if ( TY_(nodeHasCM)(element, CM_INLINE) )
    {
        TY_(PushInline)(doc, element);
    }

    if ( nodeIsNOBR(element) )
    {
        doc->badLayout |= USING_NOBR;
    }
    else if ( nodeIsFONT(element) )
    {
        doc->badLayout |= USING_FONT;
    }

    /* Inline elements may or may not be within a preformatted element */
    if (mode != Preformatted)
    {
        mode = MixedContent;
    }

    /* 'mode' is not changed below, so the test can be made once */
    preformatted = (mode & Preformatted) ? yes : no;

    while ( (node = TY_(GetToken)(doc, mode)) != NULL )
    {
        /* end tag for current element */
        if (node->tag == element->tag && node->type == EndTag)
        {
            Node *child;

            if (element->tag->model & CM_INLINE)
            {
                TY_(PopInline)( doc, node );
            }

            TY_(FreeNode)( doc, node );

            if (!preformatted)
            {
                TrimSpaces(doc, element);
            }

            /*
              a font element wrapping an anchor and nothing else is moved
              inside the anchor, since otherwise it won't alter the anchor
              text color
            */
            child = element->content;
            if ( nodeIsFONT(element) &&
                 child && child == element->last &&
                 nodeIsA(child) )
            {
                /* the anchor takes over the font's place in the tree */
                child->parent = element->parent;
                child->next = element->next;
                child->prev = element->prev;

                /* the font becomes the anchor's only child ... */
                element->next = NULL;
                element->prev = NULL;
                element->parent = child;

                /* ... and adopts what used to be the anchor's content */
                element->content = child->content;
                element->last = child->last;
                child->content = element;

                TY_(FixNodeLinks)(child);
                TY_(FixNodeLinks)(element);
            }

            element->closed = yes;
            TrimSpaces( doc, element );
#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_inline--;
            SPRTF("Exit ParseInline 2 %d...\n",in_parse_inline);
#endif
            return;
        }

        /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */
        /* (see additional conditions below) */
        /* otherwise emphasis nesting is probably unintentional */
        /* big, small, sub, sup have cumulative effect to leave them alone */
        if ( node->type == StartTag
             && node->tag == element->tag
             && TY_(IsPushed)( doc, node )
             && !node->implicit
             && !element->implicit
             && node->tag && (node->tag->model & CM_INLINE)
             && !nodeIsA(node)
             && !nodeIsFONT(node)
             && !nodeIsBIG(node)
             && !nodeIsSMALL(node)
             && !nodeIsSUB(node)
             && !nodeIsSUP(node)
             && !nodeIsQ(node)
             && !nodeIsSPAN(node)
             && cfgBool(doc, TidyCoerceEndTags) )
        {
            /* coerce only when "node" has no attributes and follows a
               text node that does not finish with a space */
            if ( element->content != NULL && node->attributes == NULL
                 && TY_(nodeIsText)(element->last)
                 && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
            {
                TY_(ReportWarning)(doc, element, node, COERCE_TO_ENDTAG_WARN);
                node->type = EndTag;
                TY_(UngetToken)(doc);
                continue;
            }

            if (node->attributes == NULL || element->attributes == NULL)
            {
                TY_(ReportWarning)(doc, element, node, NESTED_EMPHASIS);
            }
        }
        else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
                  nodeIsQ(node) )
        {
            /*\
             * Issue #215 - nested quotes are NOT a problem in HTML5, so
             * the warning is only issued when NOT in HTML5 mode.
            \*/
            if (TY_(HTMLVersion)(doc) != HT50)
            {
                TY_(ReportWarning)(doc, element, node, NESTED_QUOTATION);
            }
        }

        if ( TY_(nodeIsText)(node) )
        {
            /* only called for 1st child */
            if ( element->content == NULL && !preformatted )
            {
                TrimSpaces( doc, element );
            }

            /* drop empty text nodes */
            if ( node->start >= node->end )
            {
                TY_(FreeNode)( doc, node );
                continue;
            }

            TY_(InsertNodeAtEnd)(element, node);
            continue;
        }

        /* mixed content model so allow text */
        if ( InsertMisc(element, node) )
        {
            continue;
        }

        /* deal with HTML tags */
        if ( nodeIsHTML(node) )
        {
            if ( TY_(nodeIsElement)(node) )
            {
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* otherwise infer end of inline element */
            TY_(UngetToken)( doc );

            if (!preformatted)
            {
                TrimSpaces(doc, element);
            }
#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_inline--;
            SPRTF("Exit ParseInline 3 %d...\n",in_parse_inline);
#endif
            return;
        }

        /* within <dt> or <pre> map <p> to <br> */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             ( preformatted ||
               nodeIsDT(element) ||
               DescendantOf(element, TidyTag_DT) ) )
        {
            node->tag = TY_(LookupTagDef)( TidyTag_BR );
            TidyDocFree(doc, node->element);
            node->element = TY_(tmbstrdup)(doc->allocator, "br");
            TrimSpaces(doc, element);
            TY_(InsertNodeAtEnd)(element, node);
            continue;
        }

        /* <p> allowed within <address> in HTML 4.01 Transitional */
        if ( nodeIsP(node) &&
             node->type == StartTag &&
             nodeIsADDRESS(element) )
        {
            TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
            TY_(InsertNodeAtEnd)(element, node);
            (*node->tag->parser)( doc, node, mode );
            continue;
        }

        /* ignore unknown and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node );
            continue;
        }

        /* </br> is treated as <br> */
        if ( nodeIsBR(node) && node->type == EndTag )
        {
            node->type = StartTag;
        }

        if ( node->type == EndTag )
        {
            /* coerce </br> to <br> */
            if ( nodeIsBR(node) )
            {
                node->type = StartTag;
            }
            else if ( nodeIsP(node) )
            {
                /* coerce unmatched </p> to <br><br> */
                if ( !DescendantOf(element, TidyTag_P) )
                {
                    TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
                    TrimSpaces( doc, element );
                    TY_(InsertNodeAtEnd)( element, node );
                    node = TY_(InferredTag)(doc, TidyTag_BR);
                    TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
                    continue;
                }
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE)
                      && !nodeIsA(node)
                      && !TY_(nodeHasCM)(node, CM_OBJECT)
                      && TY_(nodeHasCM)(element, CM_INLINE) )
            {
                /* allow any inline end tag to end current element */

                /* http://tidy.sf.net/issue/1426419 */
                /* but, like the browser, retain an earlier inline element.
                   This is implemented by setting the lexer into a mode
                   where it gets tokens from the inline stack rather than
                   from the input stream. Check if the scenario fits:
                   we have something like
                   <b>bold <i>bold and italic</b> italics</i> */
                if ( !nodeIsA(element)
                     && (node->tag != element->tag)
                     && TY_(IsPushed)( doc, node )
                     && TY_(IsPushed)( doc, element )
                     && TY_(SwitchInline)( doc, element, node ) )
                {
                    TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
                    TY_(UngetToken)( doc ); /* put this back */
                    TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
                    if (!preformatted)
                    {
                        TrimSpaces( doc, element );
                    }
#if !defined(NDEBUG) && defined(_MSC_VER)
                    in_parse_inline--;
                    SPRTF("Exit ParseInline 4 %d...\n",in_parse_inline);
#endif
                    return; /* close <i>, but will re-open it, after </b> */
                }

                TY_(PopInline)( doc, element );

                if ( !nodeIsA(element) )
                {
                    if ( nodeIsA(node) && node->tag != element->tag )
                    {
                        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
                        TY_(UngetToken)( doc );
                    }
                    else
                    {
                        TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
                        TY_(FreeNode)( doc, node);
                    }

                    if (!preformatted)
                    {
                        TrimSpaces(doc, element);
                    }
#if !defined(NDEBUG) && defined(_MSC_VER)
                    in_parse_inline--;
                    SPRTF("Exit ParseInline 5 %d...\n",in_parse_inline);
#endif
                    return;
                }

                /* if parent is <a> then discard unexpected inline end tag */
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }
            /* special case </tr> etc. for stuff moved in front of table */
            else if ( lexer->exiled
                      && ( TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node) ) )
            {
                TY_(UngetToken)( doc );
                TrimSpaces(doc, element);
#if !defined(NDEBUG) && defined(_MSC_VER)
                in_parse_inline--;
                SPRTF("Exit ParseInline 6 %d...\n",in_parse_inline);
#endif
                return;
            }
        }

        /* allow any header tag to end current header */
        if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
        {
            if ( node->tag == element->tag )
            {
                TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG );
                TY_(FreeNode)( doc, node);
            }
            else
            {
                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
                TY_(UngetToken)( doc );
            }

            if (!preformatted)
            {
                TrimSpaces(doc, element);
            }

#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_inline--;
            SPRTF("Exit ParseInline 7 %d...\n",in_parse_inline);
#endif
            return;
        }

        /*
          an <A> tag ends any open <A> element
          but <A href=...> is mapped to </A><A href=...>
        */
        /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
        if ( nodeIsA(node) && !node->implicit &&
             ( nodeIsA(element) || DescendantOf(element, TidyTag_A) ) )
        {
            /* coerce <a> to </a> unless it has some attributes */
            /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
            /* other fixes by Dave Raggett */
            if ( node->type != EndTag && node->attributes == NULL
                 && cfgBool(doc, TidyCoerceEndTags) )
            {
                node->type = EndTag;
                TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
                /* TY_(PopInline)( doc, node ); */
                TY_(UngetToken)( doc );
                continue;
            }

            TY_(UngetToken)( doc );
            TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
            /* TY_(PopInline)( doc, element ); */

            if (!preformatted)
            {
                TrimSpaces(doc, element);
            }

#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_inline--;
            SPRTF("Exit ParseInline 8 %d...\n",in_parse_inline);
#endif
            return;
        }

        if (element->tag->model & CM_HEADING)
        {
            if ( nodeIsCENTER(node) || nodeIsDIV(node) )
            {
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);

                /* insert center as parent if heading is empty */
                if (element->content == NULL)
                {
                    InsertNodeAsParent(element, node);
                    continue;
                }

                /* split heading and make center parent of 2nd part */
                TY_(InsertNodeAfterElement)(element, node);

                if (!preformatted)
                {
                    TrimSpaces(doc, element);
                }

                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAtEnd)(node, element);
                continue;
            }

            if ( nodeIsHR(node) )
            {
                if ( !TY_(nodeIsElement)(node) )
                {
                    TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }

                TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);

                /* insert hr before heading if heading is empty */
                if (element->content == NULL)
                {
                    TY_(InsertNodeBeforeElement)(element, node);
                    continue;
                }

                /* split heading and insert hr before 2nd part */
                TY_(InsertNodeAfterElement)(element, node);

                if (!preformatted)
                {
                    TrimSpaces(doc, element);
                }

                element = TY_(CloneNode)( doc, element );
                TY_(InsertNodeAfterElement)(node, element);
                continue;
            }
        }

        if ( nodeIsDT(element) && nodeIsHR(node) )
        {
            Node *dd;

            if ( !TY_(nodeIsElement)(node) )
            {
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
            dd = TY_(InferredTag)(doc, TidyTag_DD);

            /* insert hr within dd before dt if dt is empty */
            if (element->content == NULL)
            {
                TY_(InsertNodeBeforeElement)(element, dd);
                TY_(InsertNodeAtEnd)(dd, node);
                continue;
            }

            /* split dt and insert hr within dd before 2nd part */
            TY_(InsertNodeAfterElement)(element, dd);
            TY_(InsertNodeAtEnd)(dd, node);

            if (!preformatted)
            {
                TrimSpaces(doc, element);
            }

            element = TY_(CloneNode)( doc, element );
            TY_(InsertNodeAfterElement)(dd, element);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            for ( ancestor = element->parent;
                  ancestor != NULL;
                  ancestor = ancestor->parent )
            {
                if (node->tag != ancestor->tag)
                {
                    continue;
                }

                if ( !(element->tag->model & CM_OPT) && !element->implicit )
                {
                    TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
                }

                if ( TY_(IsPushedLast)( doc, element, node ) )
                {
                    TY_(PopInline)( doc, element );
                }
                TY_(UngetToken)( doc );

                if (!preformatted)
                {
                    TrimSpaces(doc, element);
                }

#if !defined(NDEBUG) && defined(_MSC_VER)
                in_parse_inline--;
                SPRTF("Exit ParseInline 9 %d...\n",in_parse_inline);
#endif
                return;
            }
        }

        /* block level tags end this element */
        if ( !(node->tag->model & CM_INLINE) &&
             !(element->tag->model & CM_MIXED) )
        {
            if ( !TY_(nodeIsElement)(node) )
            {
                TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* HTML5 */
            if ( nodeIsDATALIST(element) )
            {
                TY_(ConstrainVersion)( doc, ~VERS_HTML5 );
            }
            else if ( !(element->tag->model & CM_OPT) )
            {
                TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
            }

            if ( (node->tag->model & CM_HEAD) && !(node->tag->model & CM_BLOCK) )
            {
                MoveToHead(doc, element, node);
                continue;
            }

            /*
              prevent anchors from propagating into block tags
              except for headings h1 to h6
            */
            if ( nodeIsA(element) )
            {
                if ( node->tag && !(node->tag->model & CM_HEADING) )
                {
                    TY_(PopInline)( doc, element );
                }
                else if ( !(element->content) )
                {
                    TY_(DiscardElement)( doc, element );
                    TY_(UngetToken)( doc );
#if !defined(NDEBUG) && defined(_MSC_VER)
                    in_parse_inline--;
                    SPRTF("Exit ParseInline 10 %d...\n",in_parse_inline);
#endif
                    return;
                }
            }

            TY_(UngetToken)( doc );

            if (!preformatted)
            {
                TrimSpaces(doc, element);
            }

#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_inline--;
            SPRTF("Exit ParseInline 11 %d...\n",in_parse_inline);
#endif
            return;
        }

        /* parse inline element */
        if ( TY_(nodeIsElement)(node) )
        {
            if (node->implicit)
            {
                TY_(ReportError)(doc, element, node, INSERTING_TAG);
            }

            /* trim white space before <br> */
            if ( nodeIsBR(node) )
            {
                TrimSpaces(doc, element);
            }

            TY_(InsertNodeAtEnd)(element, node);
            ParseTag(doc, node, mode);
            continue;
        }

        /* discard unexpected tags */
        TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node );
        continue;
    }

    /* ran out of tokens before the element was closed */
    if ( !(element->tag->model & CM_OPT) )
    {
        TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
    }

#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_inline--;
    SPRTF("Exit ParseInline 12 %d...\n",in_parse_inline);
#endif
}
|
|
|
|
/*
  Parser for elements without content.

  Nothing is done unless lexer->isvoyager is set. In that case one token
  is fetched: an end tag matching 'element' is consumed and freed, any
  other token is pushed back for the caller.
*/
void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
{
    Node *next;

    if ( !doc->lexer->isvoyager )
        return;

    next = TY_(GetToken)( doc, mode );
    if ( next == NULL )
        return;

    if ( next->type == EndTag && next->tag == element->tag )
    {
        /* explicit end tag of the empty element: swallow it */
        TY_(FreeNode)( doc, next );
    }
    else
    {
        /* not ours; ELEMENT_NOT_EMPTY is deliberately not reported */
        TY_(UngetToken)( doc );
    }
}
|
|
|
|
/*
  Parse the content of a definition list.

  doc  - document being parsed
  list - the list element being filled
  mode - only used when parsing the content of a <center> that splits
         the list; list items themselves are parsed with IgnoreWhitespace

  Children other than <dt>/<dd> are pushed back and wrapped in an inferred
  <dd> (text is wrapped in an inferred <dt>), or end the list when they
  are not allowed there.
*/
void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
{
    Lexer* lexer = doc->lexer;
    Node *node;
    Node *ancestor;   /* cursor for the walk up from the list */
    Node *holder;     /* parent of a <center> before it is parsed */

    if (list->tag->model & CM_EMPTY)
    {
        return;
    }

    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ( (node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL )
    {
        /* end tag of the list itself */
        if (node->tag == list->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            list->closed = yes;
            return;
        }

        /* deal with comments etc. */
        if ( InsertMisc(list, node) )
        {
            continue;
        }

        /* loose text: push it back and start an inferred <dt> for it */
        if ( TY_(nodeIsText)(node) )
        {
            TY_(UngetToken)( doc );
            node = TY_(InferredTag)(doc, TidyTag_DT);
            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
        }

        if (node->tag == NULL)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node );
                continue;
            }

            /* Do not match across BODY to avoid infinite loop
               between ParseBody and this parser,
               See http://tidy.sf.net/bug/1098012. */
            for ( ancestor = list->parent;
                  ancestor != NULL && !nodeIsBODY(ancestor);
                  ancestor = ancestor->parent )
            {
                if (node->tag == ancestor->tag)
                {
                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);

                    TY_(UngetToken)( doc );
                    return;
                }
            }

            /* a non-NULL cursor means the walk was stopped by BODY */
            if (ancestor != NULL)
            {
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node);
                continue;
            }
        }

        /* center in a dt or a dl breaks the dl list in two */
        if ( nodeIsCENTER(node) )
        {
            if (list->content)
            {
                TY_(InsertNodeAfterElement)(list, node);
            }
            else
            {
                /* empty dl list: the center goes in front of it; the empty
                   list itself is kept
                   (#540296 tidy dumps with empty definition list) */
                TY_(InsertNodeBeforeElement)(list, node);
            }

            /* #426885 - fix by Glenn Carroll 19 Apr 00, and
                         Gary Dechaines 11 Aug 00 */
            /* ParseTag can destroy node, if it finds that
             * this <center> is followed immediately by </center>.
             * It's awkward but necessary to determine if this
             * has happened.
             */
            holder = node->parent;

            /* and parse contents of center */
            lexer->excludeBlocks = no;
            ParseTag( doc, node, mode);
            lexer->excludeBlocks = yes;

            /* now create a new dl element,
             * unless node has been blown away because the
             * center was empty, as above.
             */
            if (holder->last == node)
            {
                list = TY_(InferredTag)(doc, TidyTag_DL);
                TY_(InsertNodeAfterElement)(node, list);
            }
            continue;
        }

        if ( !nodeIsDT(node) && !nodeIsDD(node) )
        {
            TY_(UngetToken)( doc );

            /* neither block nor inline: the list ends here */
            if ( !(node->tag->model & (CM_BLOCK | CM_INLINE)) )
            {
                TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN);
                return;
            }

            /* if DD appeared directly in BODY then exclude blocks */
            if ( !(node->tag->model & CM_INLINE) && lexer->excludeBlocks )
            {
                return;
            }

            /* wrap the pushed-back token in an inferred <dd> */
            node = TY_(InferredTag)(doc, TidyTag_DD);
            TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
        }

        if (node->type == EndTag)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /* node should be <DT> or <DD>*/
        TY_(InsertNodeAtEnd)(list, node);
        ParseTag( doc, node, IgnoreWhitespace);
    }

    /* ran out of tokens before the list was closed */
    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
}
|
|
|
|
/*
  Find the last direct child of 'list' that is an <li> start tag.

  list   - element whose children are scanned
  lastli - receives the child found, or NULL when there is none

  Returns yes when such a child exists, no otherwise.
*/
static Bool FindLastLI( Node *list, Node **lastli )
{
    Node *found = NULL;
    Node *child = list->content;

    while ( child != NULL )
    {
        /* later matches overwrite earlier ones, so the last one wins */
        if ( nodeIsLI(child) && child->type == StartTag )
            found = child;

        child = child->next;
    }

    *lastli = found;
    return found != NULL ? yes : no;
}
|
|
|
|
/*
  Parse the content of a list element.

  doc  - document being parsed; its lexer supplies the tokens
  list - the list element whose children are being collected
  mode - unused

  Tokens are read with IgnoreWhitespace until one of these happens:
    - the end tag matching list is seen (list->closed is set),
    - an end tag matching an ancestor below BODY is seen (the token is
      pushed back and the list is left unclosed),
    - a block-level start tag arrives while lexer->excludeBlocks is set,
    - a table-related tag arrives while lexer->exiled is set,
    - the token stream is exhausted (MISSING_ENDTAG_FOR is reported).

  Content that may not sit directly in the list is wrapped in an
  inferred <li>, or for an ordered list handed to its last <li>.

  The in_parse_list counter and the SPRTF calls are debug tracing that is
  only compiled for MSVC builds without NDEBUG.
*/
void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
{
#if !defined(NDEBUG) && defined(_MSC_VER)
    static int in_parse_list = 0;
#endif
    Lexer* lexer = doc->lexer;
    Node *node, *parent, *lastli;
    Bool wasblock;

#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_list++;
    SPRTF("Entering ParseList %d...\n",in_parse_list);
#endif
    if (list->tag->model & CM_EMPTY)
    {
#if !defined(NDEBUG) && defined(_MSC_VER)
        in_parse_list--;
        SPRTF("Exit ParseList 1 %d... CM_EMPTY\n",in_parse_list);
#endif
        return;
    }
    lexer->insert = NULL;  /* defer implicit inline start tags */

    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
    {
        /* the list's own end tag closes it */
        if (node->tag == list->tag && node->type == EndTag)
        {
            TY_(FreeNode)( doc, node);
            list->closed = yes;
#if !defined(NDEBUG) && defined(_MSC_VER)
            in_parse_list--;
            SPRTF("Exit ParseList 2 %d... Endtag\n",in_parse_list);
#endif
            return;
        }

        /* deal with comments etc. */
        if (InsertMisc(list, node))
            continue;

        /* untagged tokens other than text cannot be placed anywhere */
        if (node->type != TextNode && node->tag == NULL)
        {
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          if this is the end tag for an ancestor element
          then infer end tag for this element
        */
        if (node->type == EndTag)
        {
            if ( nodeIsFORM(node) )
            {
                BadForm( doc );
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(FreeNode)( doc, node );
                continue;
            }

            if (TY_(nodeHasCM)(node,CM_INLINE))
            {
                TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
                TY_(PopInline)( doc, node );
                TY_(FreeNode)( doc, node);
                continue;
            }

            for ( parent = list->parent;
                  parent != NULL; parent = parent->parent )
            {
                /* Do not match across BODY to avoid infinite loop
                   between ParseBody and this parser,
                   See http://tidy.sf.net/bug/1053626. */
                if (nodeIsBODY(parent))
                    break;
                if (node->tag == parent->tag)
                {
                    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
                    TY_(UngetToken)( doc );
#if !defined(NDEBUG) && defined(_MSC_VER)
                    in_parse_list--;
                    SPRTF("Exit ParseList 3 %d... No End Tag\n",in_parse_list);
#endif
                    return;
                }
            }

            /* end tag matches nothing that is open: drop it */
            TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          Text nodes are excluded from the HTML5 shortcut: the untagged-token
          check above lets text through without a tag, and the node accepted
          here is handed to ParseTag at the bottom of the loop.
          NOTE(review): ParseTag presumably dereferences node->tag - confirm.
          Bare text therefore takes the inferred <li> path below, exactly as
          it does outside HTML5 mode.
        */
        if ( nodeIsLI(node) ||
             (TY_(IsHTML5Mode)(doc) && node->type != TextNode) )
        {
            /* node is <LI> OR
               Issue #396 - A <ul> can have Zero or more li elements
             */
            TY_(InsertNodeAtEnd)(list,node);
        }
        else
        {
            /* the token is re-read by whichever parser handles it next */
            TY_(UngetToken)( doc );

            if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks)
            {
                TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
#if !defined(NDEBUG) && defined(_MSC_VER)
                in_parse_list--;
                SPRTF("Exit ParseList 4 %d... No End Tag\n",in_parse_list);
#endif
                return;
            }
            /* http://tidy.sf.net/issue/1316307 */
            /* In exiled mode, return so table processing can continue. */
            else if ( lexer->exiled
                      && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
                          || nodeIsTABLE(node)) )
            {
#if !defined(NDEBUG) && defined(_MSC_VER)
                in_parse_list--;
                SPRTF("Exit ParseList 5 %d... exiled\n",in_parse_list);
#endif
                return;
            }
            /* http://tidy.sf.net/issue/836462
               If "list" is an ordered list, insert the next tag within
               the last <li> to preserve the numbering to match the visual
               rendering of most browsers. */
            if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
            {
                /* Create a node for error reporting */
                node = TY_(InferredTag)(doc, TidyTag_LI);
                TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
                TY_(FreeNode)( doc, node);
                node = lastli;
            }
            else
            {
                /* Add an inferred <li> */
                wasblock = TY_(nodeHasCM)(node,CM_BLOCK);
                node = TY_(InferredTag)(doc, TidyTag_LI);
                /* Add "display: inline" to avoid a blank line after <li> with
                   Internet Explorer. See http://tidy.sf.net/issue/836462 */
                TY_(AddStyleProperty)( doc, node,
                                       wasblock
                                       ? "list-style: none; display: inline"
                                       : "list-style: none"
                                       );
                TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
                TY_(InsertNodeAtEnd)(list,node);
            }
        }

        ParseTag( doc, node, IgnoreWhitespace);
    }

    /* ran out of tokens before the list was closed; node is NULL here */
    TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
#if !defined(NDEBUG) && defined(_MSC_VER)
    in_parse_list--;
    SPRTF("Exit ParseList 6 %d... missing end tag\n",in_parse_list);
#endif
}
|
|
|
|
/*
|
|
unexpected content in table row is moved to just before
|
|
the table in accordance with Netscape and IE. This code
|
|
assumes that node hasn't been inserted into the row.
|
|
*/
|
|
static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
|
|
Node *node )
|
|
{
|
|
Node *table;
|
|
|
|
/* first find the table element */
|
|
for (table = row->parent; table; table = table->parent)
|
|
{
|
|
if ( nodeIsTABLE(table) )
|
|
{
|
|
TY_(InsertNodeBeforeElement)( table, node );
|
|
return;
|
|
}
|
|
}
|
|
/* No table element */
|
|
TY_(InsertNodeBeforeElement)( row->parent, node );
|
|
}
|
|
|
|
/*
|
|
if a table row is empty then insert an empty cell
|
|
this practice is consistent with browser behavior
|
|
and avoids potential problems with row spanning cells
|
|
*/
|
|
static void FixEmptyRow(TidyDocImpl* doc, Node *row)
|
|
{
|
|
Node *cell;
|
|
|
|
if (row->content == NULL)
|
|
{
|
|
cell = TY_(InferredTag)(doc, TidyTag_TD);
|
|
TY_(InsertNodeAtEnd)(row, cell);
|
|
TY_(ReportError)(doc, row, cell, MISSING_STARTTAG);
|
|
}
|
|
}
|
|
|
|
void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
|
|
{
|
|
Lexer* lexer = doc->lexer;
|
|
Node *node;
|
|
Bool exclude_state;
|
|
|
|
if (row->tag->model & CM_EMPTY)
|
|
return;
|
|
|
|
while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
|
|
{
|
|
if (node->tag == row->tag)
|
|
{
|
|
if (node->type == EndTag)
|
|
{
|
|
TY_(FreeNode)( doc, node);
|
|
row->closed = yes;
|
|