You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

4287 lines
124 KiB

/* lexer.c -- Lexer for html parser
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/*
Given a file stream fp it returns a sequence of tokens.
GetToken(fp) gets the next token
UngetToken(fp) provides one level undo
The tags include an attribute list:
- linked list of attribute/value nodes
- each node has 2 NUL-terminated strings.
- entities are replaced in attribute values
white space is compacted if not in preformatted mode
If not in preformatted mode then leading white space
is discarded and subsequent white space sequences
compacted to single space characters.
If XmlTags is no then Tag names are folded to upper
case and attribute names to lower case.
Not yet done:
- Doctype subset and marked sections
*/
#include "tidy-int.h"
#include "lexer.h"
#include "parser.h"
#include "entities.h"
#include "streamio.h"
#include "message.h"
#include "tmbstr.h"
#include "clean.h"
#include "utf8.h"
#include "streamio.h"
#ifdef _MSC_VER
#include "sprtf.h"
#endif
#ifndef SPRTF
#define SPRTF printf
#endif
#if !defined(NDEBUG) && defined(_MSC_VER)
/* #define DEBUG_ALLOCATION special EXTRA allocation debug information - VERY NOISY */
static void check_me(char *name);
5 years ago
static Bool show_attrs = aye;
#define MX_TXT 8
static char buffer[(MX_TXT*4)+8]; /* NOTE extra for '...'\0 tail */
static tmbstr get_text_string(Lexer* lexer, Node *node)
{
uint len = node->end - node->start;
tmbstr cp = lexer->lexbuf + node->start;
tmbstr end = lexer->lexbuf + node->end;
unsigned char c;
uint i = 0;
Bool insp = no;
if (len <= ((MX_TXT * 2) + 3)) {
buffer[0] = 0;
while (cp < end) {
c = *cp;
cp++;
if (c == '\n') {
buffer[i++] = '\\';
buffer[i++] = 'n';
} else if ( c == ' ' ) {
if (!insp)
buffer[i++] = c;
5 years ago
insp = aye;
} else {
buffer[i++] = c;
insp = no;
}
}
} else {
char *end1 = cp + MX_TXT;
char *bgn = cp + (len - MX_TXT);
buffer[0] = 0;
if (bgn < end1)
bgn = end1;
while (cp < end1) {
c = *cp;
cp++;
if (c == '\n') {
buffer[i++] = '\\';
buffer[i++] = 'n';
} else if ( c == ' ' ) {
if (!insp)
buffer[i++] = c;
5 years ago
insp = aye;
} else {
buffer[i++] = c;
insp = no;
}
if (i >= MX_TXT)
break;
}
c = '.';
if ((i < len)&&(cp < bgn)) {
buffer[i++] = c;
cp++;
if ((i < len)&&(cp < bgn)) {
buffer[i++] = c;
cp++;
if ((i < len)&&(cp < bgn)) {
buffer[i++] = c;
cp++;
}
}
}
cp = bgn;
insp = no;
while (cp < end) {
c = *cp;
cp++;
if (c == '\n') {
buffer[i++] = '\\';
buffer[i++] = 'n';
} else if ( c == ' ' ) {
if (!insp)
buffer[i++] = c;
5 years ago
insp = aye;
} else {
buffer[i++] = c;
insp = no;
}
}
}
buffer[i] = 0;
return buffer;
}
/* Debug helper: print a one-line description of a token being returned.
 *
 * doc  - document whose lexer supplies the current line/column
 * msg  - caller tag; a tag starting with "le" is reported as coming from
 *        the "lexer", anything else as coming from the "stream"
 * node - the node being returned, may be NULL
 *
 * Text nodes are shown via get_text_string (or only as a pointer when
 * show_attrs is off); other nodes are shown as <name attr="value" ...>.
 */
static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node )
{
    Lexer* lexer = doc->lexer;
    Bool lex = ((msg[0] == 'l')&&(msg[1] == 'e')) ? aye : no;
    int line = ( doc->lexer ? doc->lexer->lines : 0 );
    int col = ( doc->lexer ? doc->lexer->columns : 0 );
    ctmbstr src = lex ? "lexer" : "stream";

    SPRTF("R=%d C=%d: ", line, col );
    /* DEBUG: Be able to set a TRAP on a SPECIFIC row,col */
    if ((line == 67) && (col == 95)) {
        check_me("Show_Node"); /* just a debug trap */
    }
    if (lexer && lexer->token &&
        ((lexer->token->type == TextNode)||(node && (node->type == TextNode)))) {
        if (show_attrs) {
            uint len = node ? node->end - node->start : 0;
            ctmbstr cp = node ? get_text_string( lexer, node ) : "NULL";
            SPRTF("Returning %s TextNode [%s]%u %s\n", msg, cp, len, src );
        } else {
            /* %p requires a void pointer argument */
            SPRTF("Returning %s TextNode %p... %s\n", msg, (void *)node, src );
        }
    } else {
        ctmbstr name = node ? node->element ? node->element : "blank" : "NULL";
        if (show_attrs) {
            AttVal* av;
            SPRTF("Returning %s node <%s", msg, name);
            if (node) {
                for (av = node->attributes; av; av = av->next) {
                    name = av->attribute;
                    if (name) {
                        SPRTF(" %s",name);
                        if (av->value) {
                            SPRTF("=\"%s\"", av->value);
                        }
                    }
                }
            }
            SPRTF("> %s\n", src);
        } else {
            SPRTF("Returning %s node %p <%s>... %s\n", msg, (void *)node,
                  name, src );
        }
    }
}
#define GTDBG(a,b,c) Show_Node(a,b,c)
#else
#define GTDBG(a,b,c)
#endif
/* Forward references
   Prototypes for file-static helpers that are used before their
   definitions (the definitions are presumably further down this file).
*/
/* swallows closing '>' */
static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
/* NOTE(review): asp/php look like out-parameters for ASP/PHP nodes met
   while reading an attribute - confirm against the definition. */
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,
Node **asp, Node **php );
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
Bool *isempty, int *pdelim );
static Node *ParseDocTypeDecl(TidyDocImpl* doc);
static void AddAttrToList( AttVal** list, AttVal* av );
/* used to classify characters for lexical purposes */
/* MAP(c) yields the lexmap flags for c, or 0 for any value outside the
   128-entry table. The argument is parenthesized so that a compound
   expression is cast as a whole; it is still evaluated twice, so do not
   pass an expression with side effects. */
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
static uint lexmap[128];
/* XML attribute and element names share the XML ID validity check. */
#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
/* Table of the doctypes known to the lexer. The entry whose name is NULL
   terminates the table; every lookup loop below stops there. */
static struct _doctypes
{
uint score; /* preference rank: TY_(HTMLVersion) keeps the matching entry with the lowest non-zero score */
uint vers; /* version bit for this doctype (HT20, H41S, ...) */
ctmbstr name; /* display name; NULL marks the end of the table */
ctmbstr fpi; /* public identifier string, NULL when there is none */
ctmbstr si; /* system identifier (DTD URL), NULL when there is none */
} const W3C_Doctypes[] =
{
{ 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, },
{ 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, },
{ 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, },
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, },
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, },
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, },
{ 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
{ 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
{ 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },
{ 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
{ 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
{ 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },
{ 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
{ 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
{ 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
{ 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
{ 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },
/* the HTML5 entries carry neither a public nor a system identifier */
{ 20, HT50, "HTML5", NULL, NULL },
{ 21, XH50, "XHTML5", NULL, NULL },
/* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
#if 0
{ 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
{ 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
#endif
/* final entry */
{ 0, 0, NULL, NULL, NULL }
};
/* Pick the version bit that best describes the document.
 *
 * Reads the lexer's remaining candidate versions and detected doctype,
 * plus the doctype-mode, xml-out and html-out options.
 *
 * Returns XH50 or HT50 for the unknown/HTML5 doctype cases handled up
 * front; otherwise the vers of the W3C_Doctypes entry that is still in
 * the candidate set and has the lowest score, or VERS_UNKNOWN when no
 * entry qualifies.
 */
int TY_(HTMLVersion)(TidyDocImpl* doc)
{
    uint i;
    uint j = 0;
    uint score = 0;
    uint vers = doc->lexer->versions;
    uint dtver = doc->lexer->doctype;
    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
    Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
                 !cfgBool(doc, TidyHtmlOut);
    /* strict/loose mode, or a doctype from HTML 4.0 onwards */
    Bool html4 = ((dtmode == TidyDoctypeStrict) ||
                  (dtmode == TidyDoctypeLoose) ||
                  (VERS_FROM40 & dtver)) ? aye : no;
    /* auto/html5 mode, unless html4 already applies */
    Bool html5 = (!html4 && ((dtmode == TidyDoctypeAuto) ||
                             (dtmode == TidyDoctypeHtml5))) ? aye : no;

    if (xhtml && dtver == VERS_UNKNOWN) return XH50;
    if (dtver == VERS_UNKNOWN) return HT50;
    /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */
    if (!xhtml && (dtver == VERS_HTML5)) return HT50;
    /* Issue #377 - If xhtml and (doctype == html5) and constrained vers contains XH50 return that,
       and really if tidy defaults to 'html5', then maybe 'auto' should also apply! */
    if (xhtml && html5 && ((vers & VERS_HTML5) == XH50)) return XH50;

    for (i = 0; W3C_Doctypes[i].name; ++i)
    {
        /* skip entries excluded by the xhtml / html4 restrictions */
        if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
            (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
            continue;

        /* among the remaining candidates keep the lowest score */
        if ((vers & W3C_Doctypes[i].vers) &&
            (W3C_Doctypes[i].score < score || !score))
        {
            score = W3C_Doctypes[i].score;
            j = i;
        }
    }

    if (score)
        return W3C_Doctypes[j].vers;
    return VERS_UNKNOWN;
}
/* Return the public identifier of the first W3C_Doctypes entry whose
   version equals vers, or NULL when no entry matches. */
static ctmbstr GetFPIFromVers(uint vers)
{
    const struct _doctypes *entry;

    for (entry = W3C_Doctypes; entry->name; ++entry)
    {
        if (entry->vers == vers)
            return entry->fpi;
    }
    return NULL;
}
/* Return the system identifier of the first W3C_Doctypes entry whose
   version equals vers, or NULL when no entry matches. */
static ctmbstr GetSIFromVers(uint vers)
{
    const struct _doctypes *entry;

    for (entry = W3C_Doctypes; entry->name; ++entry)
    {
        if (entry->vers == vers)
            return entry->si;
    }
    return NULL;
}
/* Return the display name of the first W3C_Doctypes entry whose version
   equals vers, or NULL when no entry matches. */
static ctmbstr GetNameFromVers(uint vers)
{
    const struct _doctypes *entry;

    for (entry = W3C_Doctypes; entry->name; ++entry)
    {
        if (entry->vers == vers)
            return entry->name;
    }
    return NULL;
}
/* Map a public identifier to its version bit. The comparison ignores
   case, and entries without a public identifier are skipped. Returns 0
   when no entry matches. */
static uint GetVersFromFPI(ctmbstr fpi)
{
    const struct _doctypes *entry;

    for (entry = W3C_Doctypes; entry->name; ++entry)
    {
        if (entry->fpi == NULL)
            continue;
        if (TY_(tmbstrcasecmp)(entry->fpi, fpi) == 0)
            return entry->vers;
    }
    return 0;
}
#if (defined(_MSC_VER) && !defined(NDEBUG))
/* Issue #377 - Output diminishing version bits */
/* One version bit paired with the short label printed for it. */
typedef struct tagV2S {
uint bit; /* version bit tested against the version mask */
ctmbstr val; /* label printed when the bit is set; 0 ends the table */
}V2S, *PV2S;
/* Table walked in order by add_vers_string; the { 0, 0 } entry ends it. */
static V2S v2s[] = {
{ HT20, "HT20" },
{ HT32, "HT32" },
{ H40S, "H40S" },
{ H40T, "H40T" },
{ H40F, "H40F" },
{ H41S, "H41S" },
{ H41T, "H41T" },
{ H41F, "H41F" },
{ X10S, "X10S" },
{ X10T, "X10T" },
{ X10F, "X10F" },
{ XH11, "XH11" },
{ XB10, "XB10" }, /* 4096u */
/* { VERS_SUN, "VSUN" }, */
/* { VERS_NETSCAPE, "VNET" }, */
/* { VERS_MICROSOFT, "VMIC" }, 32768u */
{ VERS_XML, "VXML" }, /* 65536u */
/* HTML5 */
{ HT50, "HT50" }, /* 131072u */
{ XH50, "XH50" }, /* 262144u */
{ 0, 0 }
};
/* Append a '|' separated rendering of the bits in vers to buf.
   The v2s table is walked in order: a set bit contributes its label, a
   clear bit contributes "----". The walk stops as soon as every set bit
   has been printed. Bits not covered by the table are appended as one
   decimal number. buf must already hold a terminated string and have
   room for the result; buf is returned. */
static char *add_vers_string( tmbstr buf, uint vers )
{
    PV2S entry;
    int used = (int)strlen(buf);

    for (entry = v2s; entry->val; entry++) {
        int present = (vers & entry->bit) != 0;
        ctmbstr label = present ? entry->val : "----";

        if (used) {
            strcat(buf,"|");
            used++;
        }
        strcat(buf,label);
        used += (int)strlen(label);

        if (present) {
            vers &= ~(entry->bit);
            if (!vers)
                break;
        }
    }

    if (vers) { /* Should not have any here! */
        if (used)
            strcat(buf,"|");
        sprintf(EndBuf(buf),"%u",vers);
    }
    return buf;
}
/* Issue #377 - Show first Before: list, and then on any change
   Note the VERS_PROPRIETARY are exclude since they always remain */
/* Debug variant: restrict the lexer's candidate versions to those in vers
   (the proprietary bits always stay), and trace the change.
   The "Before:" line is printed only for the first change seen by the
   process; an "After :" line is printed for every change. Both use
   function-static state, so the trace is shared by all documents. */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    static char vcur[256];
    static Bool dnfirst = no;
    uint curr = doc->lexer->versions; /* get current */

    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
    if (curr != doc->lexer->versions) { /* only if different */
        if (!dnfirst) {
            dnfirst = aye;
            vcur[0] = 0;
            curr &= ~(VERS_PROPRIETARY);
            add_vers_string( vcur, curr );
            SPRTF("Before: %s\n", vcur);
        }
        vcur[0] = 0;
        curr = doc->lexer->versions;
        curr &= ~(VERS_PROPRIETARY);
        add_vers_string( vcur, curr );
        SPRTF("After : %s\n", vcur);
    }
}
#else /* !#if (defined(_MSC_VER) && !defined(NDEBUG)) */
/* everything is allowed in proprietary version of HTML */
/* this is handled here rather than in the tag/attr dicts */
/* Restrict the lexer's candidate versions to those in vers; the
   proprietary bits are always kept. */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    uint allowed = vers | VERS_PROPRIETARY;

    doc->lexer->versions = doc->lexer->versions & allowed;
}
#endif /* #if (defined(_MSC_VER) && !defined(NDEBUG)) y/n */
/* True when lexmap flags c as white space; always false for c >= 128. */
Bool TY_(IsWhite)(uint c)
{
    return (MAP(c) & white) != 0;
}
/* True when lexmap flags c as a newline; always false for c >= 128. */
Bool TY_(IsNewline)(uint c)
{
    return (MAP(c) & newline) != 0;
}
/* True when lexmap flags c as a digit; always false for c >= 128. */
Bool TY_(IsDigit)(uint c)
{
    return (MAP(c) & digit) != 0;
}
/* True when lexmap flags c as a hex digit; always false for c >= 128. */
static Bool IsDigitHex(uint c)
{
    return (MAP(c) & digithex) != 0;
}
/* True when lexmap flags c as a letter; always false for c >= 128. */
Bool TY_(IsLetter)(uint c)
{
    return (MAP(c) & letter) != 0;
}
/* True for the five characters tested here: tab (0x09), line feed (0x0a),
   form feed (0x0c), carriage return (0x0d) and space (0x20). */
Bool TY_(IsHTMLSpace)(uint c)
{
    return c == 0x009 ||
           c == 0x00a ||
           c == 0x00c ||
           c == 0x00d ||
           c == 0x020;
}
/* True when lexmap flags c as a name character; always false for c >= 128. */
Bool TY_(IsNamechar)(uint c)
{
    return (MAP(c) & namechar) != 0;
}
/* True when code point c is an XML letter.
   The ranges mirror the Letter production of XML 1.0 (BaseChar followed
   by Ideographic). Each range is tested exactly once. */
Bool TY_(IsXMLLetter)(uint c)
{
    return (
        /* BaseChar */
        (c >= 0x41 && c <= 0x5a) ||
        (c >= 0x61 && c <= 0x7a) ||
        (c >= 0xc0 && c <= 0xd6) ||
        (c >= 0xd8 && c <= 0xf6) ||
        (c >= 0xf8 && c <= 0xff) ||
        (c >= 0x100 && c <= 0x131) ||
        (c >= 0x134 && c <= 0x13e) ||
        (c >= 0x141 && c <= 0x148) ||
        (c >= 0x14a && c <= 0x17e) ||
        (c >= 0x180 && c <= 0x1c3) ||
        (c >= 0x1cd && c <= 0x1f0) ||
        (c >= 0x1f4 && c <= 0x1f5) ||
        (c >= 0x1fa && c <= 0x217) ||
        (c >= 0x250 && c <= 0x2a8) ||
        (c >= 0x2bb && c <= 0x2c1) ||
        c == 0x386 ||
        (c >= 0x388 && c <= 0x38a) ||
        c == 0x38c ||
        (c >= 0x38e && c <= 0x3a1) ||
        (c >= 0x3a3 && c <= 0x3ce) ||
        (c >= 0x3d0 && c <= 0x3d6) ||
        c == 0x3da ||
        c == 0x3dc ||
        c == 0x3de ||
        c == 0x3e0 ||
        (c >= 0x3e2 && c <= 0x3f3) ||
        (c >= 0x401 && c <= 0x40c) ||
        (c >= 0x40e && c <= 0x44f) ||
        (c >= 0x451 && c <= 0x45c) ||
        (c >= 0x45e && c <= 0x481) ||
        (c >= 0x490 && c <= 0x4c4) ||
        (c >= 0x4c7 && c <= 0x4c8) ||
        (c >= 0x4cb && c <= 0x4cc) ||
        (c >= 0x4d0 && c <= 0x4eb) ||
        (c >= 0x4ee && c <= 0x4f5) ||
        (c >= 0x4f8 && c <= 0x4f9) ||
        (c >= 0x531 && c <= 0x556) ||
        c == 0x559 ||
        (c >= 0x561 && c <= 0x586) ||
        (c >= 0x5d0 && c <= 0x5ea) ||
        (c >= 0x5f0 && c <= 0x5f2) ||
        (c >= 0x621 && c <= 0x63a) ||
        (c >= 0x641 && c <= 0x64a) ||
        (c >= 0x671 && c <= 0x6b7) ||
        (c >= 0x6ba && c <= 0x6be) ||
        (c >= 0x6c0 && c <= 0x6ce) ||
        (c >= 0x6d0 && c <= 0x6d3) ||
        c == 0x6d5 ||
        (c >= 0x6e5 && c <= 0x6e6) ||
        (c >= 0x905 && c <= 0x939) ||
        c == 0x93d ||
        (c >= 0x958 && c <= 0x961) ||
        (c >= 0x985 && c <= 0x98c) ||
        (c >= 0x98f && c <= 0x990) ||
        (c >= 0x993 && c <= 0x9a8) ||
        (c >= 0x9aa && c <= 0x9b0) ||
        c == 0x9b2 ||
        (c >= 0x9b6 && c <= 0x9b9) ||
        (c >= 0x9dc && c <= 0x9dd) ||
        (c >= 0x9df && c <= 0x9e1) ||
        (c >= 0x9f0 && c <= 0x9f1) ||
        (c >= 0xa05 && c <= 0xa0a) ||
        (c >= 0xa0f && c <= 0xa10) ||
        (c >= 0xa13 && c <= 0xa28) ||
        (c >= 0xa2a && c <= 0xa30) ||
        (c >= 0xa32 && c <= 0xa33) ||
        (c >= 0xa35 && c <= 0xa36) ||
        (c >= 0xa38 && c <= 0xa39) ||
        (c >= 0xa59 && c <= 0xa5c) ||
        c == 0xa5e ||
        (c >= 0xa72 && c <= 0xa74) ||
        (c >= 0xa85 && c <= 0xa8b) ||
        c == 0xa8d ||
        (c >= 0xa8f && c <= 0xa91) ||
        (c >= 0xa93 && c <= 0xaa8) ||
        (c >= 0xaaa && c <= 0xab0) ||
        (c >= 0xab2 && c <= 0xab3) ||
        (c >= 0xab5 && c <= 0xab9) ||
        c == 0xabd ||
        c == 0xae0 ||
        (c >= 0xb05 && c <= 0xb0c) ||
        (c >= 0xb0f && c <= 0xb10) ||
        (c >= 0xb13 && c <= 0xb28) ||
        (c >= 0xb2a && c <= 0xb30) ||
        (c >= 0xb32 && c <= 0xb33) ||
        (c >= 0xb36 && c <= 0xb39) ||
        c == 0xb3d ||
        (c >= 0xb5c && c <= 0xb5d) ||
        (c >= 0xb5f && c <= 0xb61) ||
        (c >= 0xb85 && c <= 0xb8a) ||
        (c >= 0xb8e && c <= 0xb90) ||
        (c >= 0xb92 && c <= 0xb95) ||
        (c >= 0xb99 && c <= 0xb9a) ||
        c == 0xb9c ||
        (c >= 0xb9e && c <= 0xb9f) ||
        (c >= 0xba3 && c <= 0xba4) ||
        (c >= 0xba8 && c <= 0xbaa) ||
        (c >= 0xbae && c <= 0xbb5) ||
        (c >= 0xbb7 && c <= 0xbb9) ||
        (c >= 0xc05 && c <= 0xc0c) ||
        (c >= 0xc0e && c <= 0xc10) ||
        (c >= 0xc12 && c <= 0xc28) ||
        (c >= 0xc2a && c <= 0xc33) ||
        (c >= 0xc35 && c <= 0xc39) ||
        (c >= 0xc60 && c <= 0xc61) ||
        (c >= 0xc85 && c <= 0xc8c) ||
        (c >= 0xc8e && c <= 0xc90) ||
        (c >= 0xc92 && c <= 0xca8) ||
        (c >= 0xcaa && c <= 0xcb3) ||
        (c >= 0xcb5 && c <= 0xcb9) ||
        c == 0xcde ||
        (c >= 0xce0 && c <= 0xce1) ||
        (c >= 0xd05 && c <= 0xd0c) ||
        (c >= 0xd0e && c <= 0xd10) ||
        (c >= 0xd12 && c <= 0xd28) ||
        (c >= 0xd2a && c <= 0xd39) ||
        (c >= 0xd60 && c <= 0xd61) ||
        (c >= 0xe01 && c <= 0xe2e) ||
        c == 0xe30 ||
        (c >= 0xe32 && c <= 0xe33) ||
        (c >= 0xe40 && c <= 0xe45) ||
        (c >= 0xe81 && c <= 0xe82) ||
        c == 0xe84 ||
        (c >= 0xe87 && c <= 0xe88) ||
        c == 0xe8a ||
        c == 0xe8d ||
        (c >= 0xe94 && c <= 0xe97) ||
        (c >= 0xe99 && c <= 0xe9f) ||
        (c >= 0xea1 && c <= 0xea3) ||
        c == 0xea5 ||
        c == 0xea7 ||
        (c >= 0xeaa && c <= 0xeab) ||
        (c >= 0xead && c <= 0xeae) ||
        c == 0xeb0 ||
        (c >= 0xeb2 && c <= 0xeb3) ||
        c == 0xebd ||
        (c >= 0xec0 && c <= 0xec4) ||
        (c >= 0xf40 && c <= 0xf47) ||
        (c >= 0xf49 && c <= 0xf69) ||
        (c >= 0x10a0 && c <= 0x10c5) ||
        (c >= 0x10d0 && c <= 0x10f6) ||
        c == 0x1100 ||
        (c >= 0x1102 && c <= 0x1103) ||
        (c >= 0x1105 && c <= 0x1107) ||
        c == 0x1109 ||
        (c >= 0x110b && c <= 0x110c) ||
        (c >= 0x110e && c <= 0x1112) ||
        c == 0x113c ||
        c == 0x113e ||
        c == 0x1140 ||
        c == 0x114c ||
        c == 0x114e ||
        c == 0x1150 ||
        (c >= 0x1154 && c <= 0x1155) ||
        c == 0x1159 ||
        (c >= 0x115f && c <= 0x1161) ||
        c == 0x1163 ||
        c == 0x1165 ||
        c == 0x1167 ||
        c == 0x1169 ||
        (c >= 0x116d && c <= 0x116e) ||
        (c >= 0x1172 && c <= 0x1173) ||
        c == 0x1175 ||
        c == 0x119e ||
        c == 0x11a8 ||
        c == 0x11ab ||
        (c >= 0x11ae && c <= 0x11af) ||
        (c >= 0x11b7 && c <= 0x11b8) ||
        c == 0x11ba ||
        (c >= 0x11bc && c <= 0x11c2) ||
        c == 0x11eb ||
        c == 0x11f0 ||
        c == 0x11f9 ||
        (c >= 0x1e00 && c <= 0x1e9b) ||
        (c >= 0x1ea0 && c <= 0x1ef9) ||
        (c >= 0x1f00 && c <= 0x1f15) ||
        (c >= 0x1f18 && c <= 0x1f1d) ||
        (c >= 0x1f20 && c <= 0x1f45) ||
        (c >= 0x1f48 && c <= 0x1f4d) ||
        (c >= 0x1f50 && c <= 0x1f57) ||
        c == 0x1f59 ||
        c == 0x1f5b ||
        c == 0x1f5d ||
        (c >= 0x1f5f && c <= 0x1f7d) ||
        (c >= 0x1f80 && c <= 0x1fb4) ||
        (c >= 0x1fb6 && c <= 0x1fbc) ||
        c == 0x1fbe ||
        (c >= 0x1fc2 && c <= 0x1fc4) ||
        (c >= 0x1fc6 && c <= 0x1fcc) ||
        (c >= 0x1fd0 && c <= 0x1fd3) ||
        (c >= 0x1fd6 && c <= 0x1fdb) ||
        (c >= 0x1fe0 && c <= 0x1fec) ||
        (c >= 0x1ff2 && c <= 0x1ff4) ||
        (c >= 0x1ff6 && c <= 0x1ffc) ||
        c == 0x2126 ||
        (c >= 0x212a && c <= 0x212b) ||
        c == 0x212e ||
        (c >= 0x2180 && c <= 0x2182) ||
        (c >= 0x3041 && c <= 0x3094) ||
        (c >= 0x30a1 && c <= 0x30fa) ||
        (c >= 0x3105 && c <= 0x312c) ||
        (c >= 0xac00 && c <= 0xd7a3) ||
        /* Ideographic */
        (c >= 0x4e00 && c <= 0x9fa5) ||
        c == 0x3007 ||
        (c >= 0x3021 && c <= 0x3029));
}
Bool TY_(IsXMLNamechar)(uint c)
{
return (TY_(IsXMLLetter)(c) ||
c == '.' || c == '_' ||
c == ':' || c == '-' ||
(c >= 0x300 && c <= 0x345) ||
(c >= 0x360 && c <= 0x361) ||
(c >= 0x483 && c <= 0x486) ||
(c >= 0x591 && c <= 0x5a1) ||
(c >= 0x5a3 && c <= 0x5b9) ||
(c >= 0x5bb && c <= 0x5bd) ||
c == 0x5bf ||
(c >= 0x5c1 && c <= 0x5c2) ||
c == 0x5c4 ||
(c >= 0x64b && c <= 0x652) ||
c == 0x670 ||
(c >= 0x6d6 && c <= 0x6dc) ||
(c >= 0x6dd && c <= 0x6df) ||
(c >= 0x6e0 && c <= 0x6e4) ||
(c >= 0x6e7 && c <= 0x6e8) ||
(c >= 0x6ea && c <= 0x6ed) ||
(c >= 0x901 && c <= 0x903) ||
c == 0x93c ||
(c >= 0x93e && c <= 0x94c) ||
c == 0x94d ||
(c >= 0x951 && c <= 0x954) ||
(c >= 0x962 && c <= 0x963) ||
(c >= 0x981 && c <= 0x983) ||
c == 0x9bc ||
c == 0x9be ||
c == 0x9bf ||
(c >= 0x9c0 && c <= 0x9c4) ||
(c >= 0x9c7 && c <= 0x9c8) ||
(c >= 0x9cb && c <= 0x9cd) ||
c == 0x9d7 ||
(c >= 0x9e2 && c <= 0x9e3) ||
c == 0xa02 ||
c == 0xa3c ||
c == 0xa3e ||
c == 0xa3f ||
(c >= 0xa40 && c <= 0xa42) ||
(c >= 0xa47 && c <= 0xa48) ||
(c >= 0xa4b && c <= 0xa4d) ||
(c >= 0xa70 && c <= 0xa71) ||
(c >= 0xa81 && c <= 0xa83) ||
c == 0xabc ||
(c >= 0xabe && c <= 0xac5) ||
(c >= 0xac7 && c <= 0xac9) ||
(c >= 0xacb && c <= 0xacd) ||
(c >= 0xb01 && c <= 0xb03) ||
c == 0xb3c ||
(c >= 0xb3e && c <= 0xb43) ||
(c >= 0xb47 && c <= 0xb48) ||
(c >= 0xb4b && c <= 0xb4d) ||
(c >= 0xb56 && c <= 0xb57) ||
(c >= 0xb82 && c <= 0xb83) ||
(c >= 0xbbe && c <= 0xbc2) ||
(c >= 0xbc6 && c <= 0xbc8) ||
(c >= 0xbca && c <= 0xbcd) ||
c == 0xbd7 ||
(c >= 0xc01 && c <= 0xc03) ||
(c >= 0xc3e && c <= 0xc44) ||
(c >= 0xc46 && c <= 0xc48) ||
(c >= 0xc4a && c <= 0xc4d) ||
(c >= 0xc55 && c <= 0xc56) ||
(c >= 0xc82 && c <= 0xc83) ||
(c >= 0xcbe && c <= 0xcc4) ||
(c >= 0xcc6 && c <= 0xcc8) ||
(c >= 0xcca && c <= 0xccd) ||
(c >= 0xcd5 && c <= 0xcd6) ||
(c >= 0xd02 && c <= 0xd03) ||
(c >= 0xd3e && c <= 0xd43) ||
(c >= 0xd46 && c <= 0xd48) ||
(c >= 0xd4a && c <= 0xd4d) ||
c == 0xd57 ||
c == 0xe31 ||
(c >= 0xe34 && c <= 0xe3a) ||
(c >= 0xe47 && c <= 0xe4e) ||
c == 0xeb1 ||
(c >= 0xeb4 && c <= 0xeb9) ||
(c >= 0xebb && c <= 0xebc) ||
(c >= 0xec8 && c <= 0xecd) ||
(c >= 0xf18 && c <= 0xf19) ||
c == 0xf35 ||
c == 0xf37 ||
c == 0xf39 ||
c == 0xf3e ||
c == 0xf3f ||
(c >= 0xf71 && c <= 0xf84) ||
(c >= 0xf86 && c <= 0xf8b) ||
(c >= 0xf90 && c <= 0xf95) ||
c == 0xf97 ||
(c >= 0xf99 && c <= 0xfad) ||
(c >= 0xfb1 && c <= 0xfb7) ||
c == 0xfb9 ||
(c >= 0x20d0 && c <= 0x20dc) ||
c == 0x20e1 ||
(c >= 0x302a && c <= 0x302f) ||
c == 0x3099 ||
c == 0x309a ||
(c >= 0x30 && c <= 0x39) ||
(c >= 0x660 && c <= 0x669) ||
(c >= 0x6f0 && c <= 0x6f9) ||
(c >= 0x966 && c <= 0x96f) ||
(c >= 0x9e6 && c <= 0x9ef) ||
(c >= 0xa66 && c <= 0xa6f) ||
(c >= 0xae6 && c <= 0xaef) ||
(c >= 0xb66 && c <= 0xb6f) ||
(c >= 0xbe7 && c <= 0xbef) ||
(c >= 0xc66 && c <= 0xc6f) ||
(c >= 0xce6 && c <= 0xcef) ||
(c >= 0xd66 && c <= 0xd6f) ||
(c >= 0xe50 && c <= 0xe59) ||
(c >= 0xed0 && c <= 0xed9) ||
(c >= 0xf20 && c <= 0xf29) ||
c == 0xb7 ||
c == 0x2d0 ||
c == 0x2d1 ||
c == 0x387 ||
c == 0x640 ||
c == 0xe46 ||
c == 0xec6 ||
c == 0x3005 ||
(c >= 0x3031 && c <= 0x3035) ||
(c >= 0x309d && c <= 0x309e) ||
(c >= 0x30fc && c <= 0x30fe));
}
#if 0
/* Compiled out. Would report whether lexmap flags c as lower case.
   NOTE(review): unlike the live classifiers in this file, the name is not
   wrapped in TY_(...) - confirm before re-enabling. */
Bool IsLower(uint c)
{
uint map = MAP(c);
return (map & lowercase)!=0;
}
#endif
/* True when lexmap flags c as upper case; always false for c >= 128. */
Bool TY_(IsUpper)(uint c)
{
    return (MAP(c) & uppercase) != 0;
}
/* Return the lower-case form of c when lexmap flags it as upper case;
   any other value is returned unchanged. */
uint TY_(ToLower)(uint c)
{
    if ( (MAP(c) & uppercase) == 0 )
        return c;
    return c + ('a' - 'A');
}
/* Return the upper-case form of c when lexmap flags it as lower case;
   any other value is returned unchanged. */
uint TY_(ToUpper)(uint c)
{
    if ( (MAP(c) & lowercase) == 0 )
        return c;
    return c - (uint) ('a' - 'A');
}
#if 0
/* Compiled out. When the TidyXmlTags option is off, folds c to upper case
   (tocaps set) or to lower case (tocaps clear); otherwise returns c
   unchanged.
   NOTE(review): calls ToUpper/ToLower without the TY_() wrapper used by
   the live definitions in this file, so it would need updating before
   being re-enabled. */
char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
{
if ( !cfgBool(doc, TidyXmlTags) )
{
if ( tocaps )
{
c = (tmbchar) ToUpper(c);
}
else /* force to lower case */
{
c = (tmbchar) ToLower(c);
}
}
return c;
}
#endif
/*
  Return the final character of str, or 0 when str is NULL or empty.
  This is useful when the trailing quote mark is missing on an attribute.
*/
static tmbchar LastChar( tmbstr str )
{
    int n;

    if ( str == NULL || *str == '\0' )
        return 0;

    n = TY_(tmbstrlen)(str);
    return str[n-1];
}
/*
node->type is one of these:
#define TextNode 1
#define StartTag 2
#define EndTag 3
#define StartEndTag 4
*/
/* Allocate and initialise a lexer for doc.
   The structure is zeroed, then positioned at line 1, column 1 in the
   content state, with every version still possible and the doctype
   unknown. Returns NULL when the allocation fails. */
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
{
    Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );

    if ( lexer == NULL )
        return NULL;

    TidyClearMemory( lexer, sizeof(Lexer) );

    lexer->allocator = doc->allocator;
    lexer->root      = &doc->root;

    lexer->lines   = 1;
    lexer->columns = 1;
    lexer->state   = LEX_CONTENT;

    lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
    lexer->doctype  = VERS_UNKNOWN;

    return lexer;
}
/* True when the input stream has nothing pushed back and reports end of
   file. The EOF query is made only when nothing is pushed back. */
static Bool EndOfInput( TidyDocImpl* doc )
{
    assert( doc->docIn != NULL );

    if ( doc->docIn->pushed )
        return 0;

    return ( TY_(IsEOF)(doc->docIn) != 0 );
}
/* Release the document's lexer and everything it owns: styles, any
   pending tokens, the inline stack and the lexer buffer. doc->lexer is
   set to NULL afterwards. Does nothing when the document has no lexer. */
void TY_(FreeLexer)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;

    if ( lexer == NULL )
        return;

    TY_(FreeStyles)( doc );

    /* See GetToken() */
    if ( lexer->pushed )
    {
        /* a pushed-back token: release both itoken and token */
        TY_(FreeNode)( doc, lexer->itoken );
        TY_(FreeNode)( doc, lexer->token );
    }
    else if ( lexer->itoken )
    {
        /* only an itoken is pending: token alone is released */
        TY_(FreeNode)( doc, lexer->token );
    }

    while ( lexer->istacksize > 0 )
        TY_(PopInline)( doc, NULL );

    TidyDocFree( doc, lexer->istack );
    TidyDocFree( doc, lexer->lexbuf );
    TidyDocFree( doc, lexer );
    doc->lexer = NULL;
}
/* Lexer uses bigger memory chunks than pprint as
** it must hold the entire input document. not just
** the last line or three.
**
** Append one byte to the lexer buffer and keep the buffer terminated.
** The buffer starts at 8192 bytes and doubles until the byte and its
** terminator fit; newly added space is zeroed.
**
** If the buffer cannot grow (the size would overflow, or the allocator
** returns NULL) the byte is dropped and the existing buffer is left
** untouched, rather than writing past its end.
*/
static void AddByte( Lexer *lexer, tmbchar ch )
{
    if ( lexer->lexsize + 2 >= lexer->lexlength )
    {
        tmbstr buf = NULL;
        uint allocAmt = lexer->lexlength;

        while ( lexer->lexsize + 2 >= allocAmt )
        {
            if ( allocAmt == 0 )
                allocAmt = 8192;
            else if ( allocAmt > ((uint)-1) / 2 )
                return;   /* doubling would wrap around to 0 */
            else
                allocAmt *= 2;
        }

        buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt );
        if ( buf == NULL )
            return;       /* old buffer is still valid but has no room */

        TidyClearMemory( buf + lexer->lexlength,
                         allocAmt - lexer->lexlength );
        lexer->lexbuf = buf;
        lexer->lexlength = allocAmt;
    }

    lexer->lexbuf[ lexer->lexsize++ ] = ch;
    lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */
}
/* Overwrite the most recently stored byte of the lexer buffer with c.
   Does nothing while the buffer is empty. */
static void ChangeChar( Lexer *lexer, tmbchar c )
{
    if ( lexer->lexsize < 1 )
        return;

    lexer->lexbuf[ lexer->lexsize - 1 ] = c;
}
/* store character c as UTF-8 encoded byte stream */
/* When the encoder reports an error, the three bytes of U+FFFD (the
   replacement character) are stored instead. */
void TY_(AddCharToLexer)( Lexer *lexer, uint c )
{
    tmbchar utf8[10] = {0};
    int nbytes = 0;
    int k = 0;

    if ( TY_(EncodeCharToUTF8Bytes)( c, utf8, NULL, &nbytes ) )
    {
        /* replacement character 0xFFFD encoded as UTF-8 */
        utf8[0] = (byte) 0xEF;
        utf8[1] = (byte) 0xBF;
        utf8[2] = (byte) 0xBD;
        nbytes = 3;
    }

    while ( k < nbytes )
    {
        AddByte( lexer, utf8[k] );
        ++k;
    }
}
/* Add every byte of the terminated string str to the lexer, one
   character at a time. */
static void AddStringToLexer( Lexer *lexer, ctmbstr str )
{
    /* Walk the string as unsigned bytes: a plain char with the high bit
    ** set could otherwise be sign-extended when widened to uint.
    */
    const unsigned char *p = (const unsigned char *) str;

    for ( ; *p != 0; ++p )
        TY_(AddCharToLexer)( lexer, (uint) *p );
}
/* Copy the input stream's current line and column into the lexer. */
static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
{
    lexer->columns = doc->docIn->curcol;
    lexer->lines   = doc->docIn->curline;
}
/*
No longer attempts to insert missing ';' for unknown
entities unless one was present already, since this
gives unexpected results.
For example: <a href="something.htm?foo&bar&fred">
was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
rather than: <a href="something.htm?foo&amp;bar&amp;fred">
My thanks to Maurice Buxton for spotting this.
Also Randy Waki pointed out the following case for the
04 Aug 00 version (bug #433012):
For example: <a href="something.htm?id=1&lang=en">
was tidied to: <a href="something.htm?id=1&lang;=en">
rather than: <a href="something.htm?id=1&amp;lang=en">
where "lang" is a known entity (#9001), but browsers would
misinterpret "&lang;" because it had a value > 256.
So the case of an apparently known entity with a value > 256 and
missing a semicolon is handled specially.
"ParseEntity" is also a bit of a misnomer - it handles entities and
numeric character references. Invalid NCR's are now reported.
*/
static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
{
typedef enum
{
ENT_default,
ENT_numdec,
ENT_numhex
} ENTState;
typedef Bool (*ENTfn)(uint);
const ENTfn entFn[] = {
TY_(IsNamechar),
TY_(IsDigit),
IsDigitHex
};
uint start;
ENTState entState = ENT_default;
uint charRead = 0;
Bool semicolon = no, found = no;
Bool isXml = cfgBool( doc, TidyXmlTags );
Bool preserveEntities = cfgBool( doc, TidyPreserveEntities );
uint c, ch, startcol, entver = 0;
Lexer* lexer = doc->lexer;
start = lexer->lexsize - 1; /* to start at "&" */
startcol = doc->docIn->curcol - 1;
while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )