|
|
|
/* lexer.c -- Lexer for html parser
|
|
|
|
|
|
|
|
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
|
|
|
See tidy.h for the copyright notice.
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
Given a file stream fp it returns a sequence of tokens.
|
|
|
|
|
|
|
|
GetToken(fp) gets the next token
|
|
|
|
UngetToken(fp) provides one level undo
|
|
|
|
|
|
|
|
The tags include an attribute list:
|
|
|
|
|
|
|
|
- linked list of attribute/value nodes
|
|
|
|
- each node has 2 NULL-terminated strings.
|
|
|
|
- entities are replaced in attribute values
|
|
|
|
|
|
|
|
white space is compacted if not in preformatted mode
|
|
|
|
If not in preformatted mode then leading white space
|
|
|
|
is discarded and subsequent white space sequences
|
|
|
|
compacted to single space characters.
|
|
|
|
|
|
|
|
If XmlTags is no then Tag names are folded to upper
|
|
|
|
case and attribute names to lower case.
|
|
|
|
|
|
|
|
Not yet done:
|
|
|
|
- Doctype subset and marked sections
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "tidy-int.h"
|
|
|
|
#include "lexer.h"
|
|
|
|
#include "parser.h"
|
|
|
|
#include "entities.h"
|
|
|
|
#include "streamio.h"
|
|
|
|
#include "message.h"
|
|
|
|
#include "tmbstr.h"
|
|
|
|
#include "clean.h"
|
|
|
|
#include "utf8.h"
|
|
|
|
#include "streamio.h"
|
|
|
|
#ifdef _MSC_VER
|
|
|
|
#include "sprtf.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifndef SPRTF
|
|
|
|
#define SPRTF printf
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if !defined(NDEBUG) && defined(_MSC_VER)
|
|
|
|
/* #define DEBUG_ALLOCATION special EXTRA allocation debug information - VERY NOISY */
|
|
|
|
static void check_me(char *name);
|
|
|
|
static Bool show_attrs = aye;
|
|
|
|
#define MX_TXT 8
|
|
|
|
static char buffer[(MX_TXT*4)+8]; /* NOTE extra for '...'\0 tail */
|
|
|
|
static tmbstr get_text_string(Lexer* lexer, Node *node)
|
|
|
|
{
|
|
|
|
uint len = node->end - node->start;
|
|
|
|
tmbstr cp = lexer->lexbuf + node->start;
|
|
|
|
tmbstr end = lexer->lexbuf + node->end;
|
|
|
|
unsigned char c;
|
|
|
|
uint i = 0;
|
|
|
|
Bool insp = no;
|
|
|
|
if (len <= ((MX_TXT * 2) + 3)) {
|
|
|
|
buffer[0] = 0;
|
|
|
|
while (cp < end) {
|
|
|
|
c = *cp;
|
|
|
|
cp++;
|
|
|
|
if (c == '\n') {
|
|
|
|
buffer[i++] = '\\';
|
|
|
|
buffer[i++] = 'n';
|
|
|
|
} else if ( c == ' ' ) {
|
|
|
|
if (!insp)
|
|
|
|
buffer[i++] = c;
|
|
|
|
insp = aye;
|
|
|
|
} else {
|
|
|
|
buffer[i++] = c;
|
|
|
|
insp = no;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
char *end1 = cp + MX_TXT;
|
|
|
|
char *bgn = cp + (len - MX_TXT);
|
|
|
|
buffer[0] = 0;
|
|
|
|
if (bgn < end1)
|
|
|
|
bgn = end1;
|
|
|
|
while (cp < end1) {
|
|
|
|
c = *cp;
|
|
|
|
cp++;
|
|
|
|
if (c == '\n') {
|
|
|
|
buffer[i++] = '\\';
|
|
|
|
buffer[i++] = 'n';
|
|
|
|
} else if ( c == ' ' ) {
|
|
|
|
if (!insp)
|
|
|
|
buffer[i++] = c;
|
|
|
|
insp = aye;
|
|
|
|
} else {
|
|
|
|
buffer[i++] = c;
|
|
|
|
insp = no;
|
|
|
|
}
|
|
|
|
if (i >= MX_TXT)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
c = '.';
|
|
|
|
if ((i < len)&&(cp < bgn)) {
|
|
|
|
buffer[i++] = c;
|
|
|
|
cp++;
|
|
|
|
if ((i < len)&&(cp < bgn)) {
|
|
|
|
buffer[i++] = c;
|
|
|
|
cp++;
|
|
|
|
if ((i < len)&&(cp < bgn)) {
|
|
|
|
buffer[i++] = c;
|
|
|
|
cp++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cp = bgn;
|
|
|
|
insp = no;
|
|
|
|
while (cp < end) {
|
|
|
|
c = *cp;
|
|
|
|
cp++;
|
|
|
|
if (c == '\n') {
|
|
|
|
buffer[i++] = '\\';
|
|
|
|
buffer[i++] = 'n';
|
|
|
|
} else if ( c == ' ' ) {
|
|
|
|
if (!insp)
|
|
|
|
buffer[i++] = c;
|
|
|
|
insp = aye;
|
|
|
|
} else {
|
|
|
|
buffer[i++] = c;
|
|
|
|
insp = no;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
buffer[i] = 0;
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
/* Debug helper: print a one-line description of the token/node that is
   about to be returned, prefixed with the lexer's current row and column.
   'msg' is a caller-supplied tag; when it begins with "le" the source is
   reported as "lexer", otherwise as "stream". 'node' may be NULL.
   When show_attrs is set, text nodes are shown with a preview of their
   text and element nodes with their attribute list; otherwise only the
   node address is shown. */
static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node )
{
    Lexer* lexer = doc->lexer;
    Bool fromLexer = ((msg[0] == 'l')&&(msg[1] == 'e')) ? aye : no;
    int row = ( doc->lexer ? doc->lexer->lines : 0 );
    int column = ( doc->lexer ? doc->lexer->columns : 0 );
    tmbstr origin = fromLexer ? "lexer" : "stream";
    Bool isText;

    SPRTF("R=%d C=%d: ", row, column );

    /* DEBUG: Be able to set a TRAP on a SPECIFIC row,col */
    if ((row == 67) && (column == 95)) {
        check_me("Show_Node"); /* just a debug trap */
    }

    isText = lexer && lexer->token &&
        ((lexer->token->type == TextNode)||(node && (node->type == TextNode)));

    if (isText) {
        if (show_attrs) {
            uint len = node ? node->end - node->start : 0;
            tmbstr text = node ? get_text_string( lexer, node ) : "NULL";
            SPRTF("Returning %s TextNode [%s]%u %s\n", msg, text, len, origin );
        } else {
            /* %p requires a void pointer argument */
            SPRTF("Returning %s TextNode %p... %s\n", msg, (void *)node, origin );
        }
        return;
    }

    {
        tmbstr name = node ? node->element ? node->element : "blank" : "NULL";

        if (!show_attrs) {
            /* %p requires a void pointer argument */
            SPRTF("Returning %s node %p <%s>... %s\n", msg, (void *)node,
                name, origin );
            return;
        }

        SPRTF("Returning %s node <%s", msg, name);
        if (node) {
            AttVal* av;
            for (av = node->attributes; av; av = av->next) {
                if (!av->attribute)
                    continue;           /* unnamed attribute: nothing to print */
                SPRTF(" %s", av->attribute);
                if (av->value) {
                    SPRTF("=\"%s\"", av->value);
                }
            }
        }
        SPRTF("> %s\n", origin);
    }
}
|
|
|
|
#define GTDBG(a,b,c) Show_Node(a,b,c)
|
|
|
|
#else
|
|
|
|
#define GTDBG(a,b,c)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Forward references
|
|
|
|
*/
|
|
|
|
/* swallows closing '>' */
|
|
|
|
static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
|
|
|
|
|
|
|
|
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,
|
|
|
|
Node **asp, Node **php );
|
|
|
|
|
|
|
|
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
|
|
|
|
Bool *isempty, int *pdelim );
|
|
|
|
|
|
|
|
static Node *ParseDocTypeDecl(TidyDocImpl* doc);
|
|
|
|
|
|
|
|
static void AddAttrToList( AttVal** list, AttVal* av );
|
|
|
|
|
|
|
|
/* used to classify characters for lexical purposes */
|
|
|
|
/* Look up the classification bits for character c; anything outside the
   128-entry table maps to 0. The argument is parenthesised so that the
   cast applies to the whole expression passed in, not just its first
   operand. Note: c is evaluated twice, so it must have no side effects. */
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
|
|
|
|
static uint lexmap[128];
|
|
|
|
|
|
|
|
#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
|
|
|
|
#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
|
|
|
|
|
|
|
|
static struct _doctypes
|
|
|
|
{
|
|
|
|
uint score;
|
|
|
|
uint vers;
|
|
|
|
ctmbstr name;
|
|
|
|
ctmbstr fpi;
|
|
|
|
ctmbstr si;
|
|
|
|
} const W3C_Doctypes[] =
|
|
|
|
{
|
|
|
|
{ 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, },
|
|
|
|
{ 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, },
|
|
|
|
{ 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, },
|
|
|
|
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, },
|
|
|
|
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, },
|
|
|
|
{ 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, },
|
|
|
|
{ 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
|
|
|
|
{ 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
|
|
|
|
{ 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },
|
|
|
|
{ 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
|
|
|
|
{ 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
|
|
|
|
{ 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },
|
|
|
|
{ 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
|
|
|
|
{ 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
|
|
|
|
{ 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
|
|
|
|
{ 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
|
|
|
|
{ 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },
|
|
|
|
|
|
|
|
{ 20, HT50, "HTML5", NULL, NULL },
|
|
|
|
{ 21, XH50, "XHTML5", NULL, NULL },
|
|
|
|
|
|
|
|
/* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
|
|
|
|
#if 0
|
|
|
|
{ 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
|
|
|
|
{ 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
|
|
|
|
#endif
|
|
|
|
/* final entry */
|
|
|
|
{ 0, 0, NULL, NULL, NULL }
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Choose the document version to report for 'doc'.
   Inputs are the version bits still allowed by the content seen so far
   (lexer->versions), the version of the doctype found in the input
   (lexer->doctype), the configured doctype mode and the XML/HTML output
   options. Returns one version constant from W3C_Doctypes, or
   VERS_UNKNOWN when no table entry is compatible. */
int TY_(HTMLVersion)(TidyDocImpl* doc)
{
    const struct _doctypes *entry;
    const struct _doctypes *best = W3C_Doctypes;
    uint bestScore = 0;
    Lexer* lexer = doc->lexer;
    uint allowed = lexer->versions;
    uint declared = lexer->doctype;
    TidyDoctypeModes mode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
    Bool xhtml;
    Bool html4 = no;
    Bool html5 = no;

    xhtml = (cfgBool(doc, TidyXmlOut) || lexer->isvoyager) &&
            !cfgBool(doc, TidyHtmlOut);

    if ( mode == TidyDoctypeStrict || mode == TidyDoctypeLoose ||
         (VERS_FROM40 & declared) )
        html4 = aye;

    if ( !html4 && (mode == TidyDoctypeAuto || mode == TidyDoctypeHtml5) )
        html5 = aye;

    /* no doctype seen: default to the (X)HTML5 flavour */
    if ( declared == VERS_UNKNOWN )
    {
        if ( xhtml )
            return XH50;
        return HT50;
    }

    /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */
    if ( !xhtml && declared == VERS_HTML5 )
        return HT50;

    /* Issue #377 - If xhtml and (doctype == html5) and constrained vers contains XH50 return that,
       and really if tidy defaults to 'html5', then maybe 'auto' should also apply! */
    if ( xhtml && html5 && ((allowed & VERS_HTML5) == XH50) )
        return XH50;

    /* otherwise take the compatible table entry with the lowest score */
    for ( entry = W3C_Doctypes; entry->name; ++entry )
    {
        if ( xhtml && !(VERS_XHTML & entry->vers) )
            continue;
        if ( html4 && !(VERS_FROM40 & entry->vers) )
            continue;

        if ( (allowed & entry->vers) &&
             (bestScore == 0 || entry->score < bestScore) )
        {
            bestScore = entry->score;
            best = entry;
        }
    }

    if ( bestScore )
        return best->vers;

    return VERS_UNKNOWN;
}
|
|
|
|
|
|
|
|
/* Return the 'fpi' string of the first W3C_Doctypes entry whose version
   equals 'vers' exactly, or NULL when there is no such entry (the entry
   itself may also hold a NULL fpi). */
static ctmbstr GetFPIFromVers(uint vers)
{
    const struct _doctypes *dt = W3C_Doctypes;

    while ( dt->name != NULL )
    {
        if ( dt->vers == vers )
            return dt->fpi;
        ++dt;
    }
    return NULL;
}
|
|
|
|
|
|
|
|
/* Return the 'si' string of the first W3C_Doctypes entry whose version
   equals 'vers' exactly, or NULL when there is no such entry (the entry
   itself may also hold a NULL si). */
static ctmbstr GetSIFromVers(uint vers)
{
    const struct _doctypes *dt = W3C_Doctypes;

    while ( dt->name != NULL )
    {
        if ( dt->vers == vers )
            return dt->si;
        ++dt;
    }
    return NULL;
}
|
|
|
|
|
|
|
|
/* Return the display name of the first W3C_Doctypes entry whose version
   equals 'vers' exactly, or NULL when there is no such entry. */
static ctmbstr GetNameFromVers(uint vers)
{
    const struct _doctypes *dt = W3C_Doctypes;

    while ( dt->name != NULL )
    {
        if ( dt->vers == vers )
            return dt->name;
        ++dt;
    }
    return NULL;
}
|
|
|
|
|
|
|
|
/* Map a public identifier string to its version constant. The comparison
   is done with tmbstrcasecmp; entries without an fpi are skipped.
   Returns 0 when no entry matches. */
static uint GetVersFromFPI(ctmbstr fpi)
{
    const struct _doctypes *dt;

    for ( dt = W3C_Doctypes; dt->name; ++dt )
    {
        if ( dt->fpi == NULL )
            continue;
        if ( TY_(tmbstrcasecmp)(dt->fpi, fpi) == 0 )
            return dt->vers;
    }
    return 0;
}
|
|
|
|
|
|
|
|
#if (defined(_MSC_VER) && !defined(NDEBUG))
|
|
|
|
/* Issue #377 - Output diminishing version bits */
|
|
|
|
typedef struct tagV2S {
|
|
|
|
uint bit;
|
|
|
|
ctmbstr val;
|
|
|
|
}V2S, *PV2S;
|
|
|
|
|
|
|
|
static V2S v2s[] = {
|
|
|
|
{ HT20, "HT20" },
|
|
|
|
{ HT32, "HT32" },
|
|
|
|
{ H40S, "H40S" },
|
|
|
|
{ H40T, "H40T" },
|
|
|
|
{ H40F, "H40F" },
|
|
|
|
{ H41S, "H41S" },
|
|
|
|
{ H41T, "H41T" },
|
|
|
|
{ H41F, "H41F" },
|
|
|
|
{ X10S, "X10S" },
|
|
|
|
{ X10T, "X10T" },
|
|
|
|
{ X10F, "X10F" },
|
|
|
|
{ XH11, "XH11" },
|
|
|
|
{ XB10, "XB10" }, /* 4096u */
|
|
|
|
/* { VERS_SUN, "VSUN" }, */
|
|
|
|
/* { VERS_NETSCAPE, "VNET" }, */
|
|
|
|
/* { VERS_MICROSOFT, "VMIC" }, 32768u */
|
|
|
|
{ VERS_XML, "VXML" }, /* 65536u */
|
|
|
|
/* HTML5 */
|
|
|
|
{ HT50, "HT50" }, /* 131072u */
|
|
|
|
{ XH50, "XH50" }, /* 262144u */
|
|
|
|
{ 0, 0 }
|
|
|
|
};
|
|
|
|
|
|
|
|
/* Process the above table, adding a bit name,
|
|
|
|
or '----' when not present */
|
|
|
|
/* Append a '|'-separated rendering of the bits in 'vers' to the
   NUL-terminated string already in 'buf' and return 'buf'.
   The v2s table is walked in order: a set bit contributes its label, a
   clear bit contributes "----". The walk stops as soon as every set bit
   has been printed. Bits that are not in the table are appended as one
   decimal number. The caller must supply a buffer that is large enough. */
static char *add_vers_string( tmbstr buf, uint vers )
{
    PV2S entry;
    int used = (int)strlen(buf);

    for ( entry = v2s; entry->val; ++entry )
    {
        const Bool present = (vers & entry->bit) ? aye : no;
        ctmbstr piece = present ? entry->val : "----";

        /* separator goes before every piece except the very first */
        if ( used )
        {
            strcat(buf,"|");
            used++;
        }
        strcat(buf, piece);
        used += (int)strlen(piece);

        if ( present )
        {
            vers &= ~(entry->bit);
            if ( !vers )
                break;          /* nothing left to report */
        }
    }

    if ( vers ) /* Should not have any here! */
    {
        if ( used )
            strcat(buf,"|");
        sprintf(EndBuf(buf),"%u",vers);
    }
    return buf;
}
|
|
|
|
|
|
|
|
/* Issue #377 - Show first Before: list, and then on any change
|
|
|
|
Note the VERS_PROPRIETARY are exclude since they always remain */
|
|
|
|
/* Debug build variant: restrict the lexer's allowed versions to 'vers'
   (the VERS_PROPRIETARY bits are always kept) and, whenever that actually
   changes the set, print the remaining versions. The very first change
   also prints the set as it was before. The proprietary bits are left out
   of the printed lists. Uses function-static state for the text buffer
   and the "already printed Before" flag. */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    static char text[256];
    static Bool shownBefore = no;
    Lexer* lexer = doc->lexer;
    const uint previous = lexer->versions; /* get current */

    lexer->versions &= (vers | VERS_PROPRIETARY);

    if ( previous == lexer->versions )
        return;                 /* only report if different */

    if ( !shownBefore )
    {
        shownBefore = aye;
        text[0] = 0;
        add_vers_string( text, previous & ~(VERS_PROPRIETARY) );
        SPRTF("Before: %s\n", text);
    }

    text[0] = 0;
    add_vers_string( text, lexer->versions & ~(VERS_PROPRIETARY) );
    SPRTF("After : %s\n", text);
}
|
|
|
|
#else /* !#if (defined(_MSC_VER) && !defined(NDEBUG)) */
|
|
|
|
/* everything is allowed in proprietary version of HTML */
|
|
|
|
/* this is handled here rather than in the tag/attr dicts */
|
|
|
|
/* Restrict the lexer's allowed versions to those in 'vers'; the
   VERS_PROPRIETARY bits are never cleared. */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    const uint keep = vers | VERS_PROPRIETARY;

    doc->lexer->versions = doc->lexer->versions & keep;
}
|
|
|
|
#endif /* #if (defined(_MSC_VER) && !defined(NDEBUG)) y/n */
|
|
|
|
|
|
|
|
/* True when the lexmap entry for c has the 'white' bit set
   (always false for c >= 128). */
Bool TY_(IsWhite)(uint c)
{
    return ( MAP(c) & white ) != 0;
}
|
|
|
|
|
|
|
|
/* True when the lexmap entry for c has the 'newline' bit set
   (always false for c >= 128). */
Bool TY_(IsNewline)(uint c)
{
    return ( MAP(c) & newline ) != 0;
}
|
|
|
|
|
|
|
|
/* True when the lexmap entry for c has the 'digit' bit set
   (always false for c >= 128). */
Bool TY_(IsDigit)(uint c)
{
    return ( MAP(c) & digit ) != 0;
}
|
|
|
|
|
|
|
|
/* True when the lexmap entry for c has the 'digithex' bit set
   (always false for c >= 128). */
static Bool IsDigitHex(uint c)
{
    return ( MAP(c) & digithex ) != 0;
}
|
|
|
|
|
|
|
|
/* True when the lexmap entry for c has the 'letter' bit set
   (always false for c >= 128). */
Bool TY_(IsLetter)(uint c)
{
    return ( MAP(c) & letter ) != 0;
}
|
|
|
|
|
|
|
|
/* True for the five code points U+0020 (space), U+0009 (tab),
   U+000A (line feed), U+000C (form feed) and U+000D (carriage return). */
Bool TY_(IsHTMLSpace)(uint c)
{
    if ( c == 0x020 )
        return 1;
    return ( c == 0x009 || c == 0x00a || c == 0x00c || c == 0x00d );
}
|
|
|
|
|
|
|
|
/* True when the lexmap entry for c has the 'namechar' bit set
   (always false for c >= 128). */
Bool TY_(IsNamechar)(uint c)
{
    const uint bits = MAP(c);

    return ( bits & namechar ) != 0;
}
|
|
|
|
|
|
|
|
/* True when code point c falls in one of the letter ranges listed below.
   NOTE(review): the ranges appear to follow the BaseChar and Ideographic
   productions of XML 1.0 Appendix B - confirm against the specification
   before editing. The final three terms used to be listed twice; the
   redundant copies have been dropped, which does not change the result. */
Bool TY_(IsXMLLetter)(uint c)
{
    return ((c >= 0x41 && c <= 0x5a) ||
        (c >= 0x61 && c <= 0x7a) ||
        (c >= 0xc0 && c <= 0xd6) ||
        (c >= 0xd8 && c <= 0xf6) ||
        (c >= 0xf8 && c <= 0xff) ||
        (c >= 0x100 && c <= 0x131) ||
        (c >= 0x134 && c <= 0x13e) ||
        (c >= 0x141 && c <= 0x148) ||
        (c >= 0x14a && c <= 0x17e) ||
        (c >= 0x180 && c <= 0x1c3) ||
        (c >= 0x1cd && c <= 0x1f0) ||
        (c >= 0x1f4 && c <= 0x1f5) ||
        (c >= 0x1fa && c <= 0x217) ||
        (c >= 0x250 && c <= 0x2a8) ||
        (c >= 0x2bb && c <= 0x2c1) ||
        c == 0x386 ||
        (c >= 0x388 && c <= 0x38a) ||
        c == 0x38c ||
        (c >= 0x38e && c <= 0x3a1) ||
        (c >= 0x3a3 && c <= 0x3ce) ||
        (c >= 0x3d0 && c <= 0x3d6) ||
        c == 0x3da ||
        c == 0x3dc ||
        c == 0x3de ||
        c == 0x3e0 ||
        (c >= 0x3e2 && c <= 0x3f3) ||
        (c >= 0x401 && c <= 0x40c) ||
        (c >= 0x40e && c <= 0x44f) ||
        (c >= 0x451 && c <= 0x45c) ||
        (c >= 0x45e && c <= 0x481) ||
        (c >= 0x490 && c <= 0x4c4) ||
        (c >= 0x4c7 && c <= 0x4c8) ||
        (c >= 0x4cb && c <= 0x4cc) ||
        (c >= 0x4d0 && c <= 0x4eb) ||
        (c >= 0x4ee && c <= 0x4f5) ||
        (c >= 0x4f8 && c <= 0x4f9) ||
        (c >= 0x531 && c <= 0x556) ||
        c == 0x559 ||
        (c >= 0x561 && c <= 0x586) ||
        (c >= 0x5d0 && c <= 0x5ea) ||
        (c >= 0x5f0 && c <= 0x5f2) ||
        (c >= 0x621 && c <= 0x63a) ||
        (c >= 0x641 && c <= 0x64a) ||
        (c >= 0x671 && c <= 0x6b7) ||
        (c >= 0x6ba && c <= 0x6be) ||
        (c >= 0x6c0 && c <= 0x6ce) ||
        (c >= 0x6d0 && c <= 0x6d3) ||
        c == 0x6d5 ||
        (c >= 0x6e5 && c <= 0x6e6) ||
        (c >= 0x905 && c <= 0x939) ||
        c == 0x93d ||
        (c >= 0x958 && c <= 0x961) ||
        (c >= 0x985 && c <= 0x98c) ||
        (c >= 0x98f && c <= 0x990) ||
        (c >= 0x993 && c <= 0x9a8) ||
        (c >= 0x9aa && c <= 0x9b0) ||
        c == 0x9b2 ||
        (c >= 0x9b6 && c <= 0x9b9) ||
        (c >= 0x9dc && c <= 0x9dd) ||
        (c >= 0x9df && c <= 0x9e1) ||
        (c >= 0x9f0 && c <= 0x9f1) ||
        (c >= 0xa05 && c <= 0xa0a) ||
        (c >= 0xa0f && c <= 0xa10) ||
        (c >= 0xa13 && c <= 0xa28) ||
        (c >= 0xa2a && c <= 0xa30) ||
        (c >= 0xa32 && c <= 0xa33) ||
        (c >= 0xa35 && c <= 0xa36) ||
        (c >= 0xa38 && c <= 0xa39) ||
        (c >= 0xa59 && c <= 0xa5c) ||
        c == 0xa5e ||
        (c >= 0xa72 && c <= 0xa74) ||
        (c >= 0xa85 && c <= 0xa8b) ||
        c == 0xa8d ||
        (c >= 0xa8f && c <= 0xa91) ||
        (c >= 0xa93 && c <= 0xaa8) ||
        (c >= 0xaaa && c <= 0xab0) ||
        (c >= 0xab2 && c <= 0xab3) ||
        (c >= 0xab5 && c <= 0xab9) ||
        c == 0xabd ||
        c == 0xae0 ||
        (c >= 0xb05 && c <= 0xb0c) ||
        (c >= 0xb0f && c <= 0xb10) ||
        (c >= 0xb13 && c <= 0xb28) ||
        (c >= 0xb2a && c <= 0xb30) ||
        (c >= 0xb32 && c <= 0xb33) ||
        (c >= 0xb36 && c <= 0xb39) ||
        c == 0xb3d ||
        (c >= 0xb5c && c <= 0xb5d) ||
        (c >= 0xb5f && c <= 0xb61) ||
        (c >= 0xb85 && c <= 0xb8a) ||
        (c >= 0xb8e && c <= 0xb90) ||
        (c >= 0xb92 && c <= 0xb95) ||
        (c >= 0xb99 && c <= 0xb9a) ||
        c == 0xb9c ||
        (c >= 0xb9e && c <= 0xb9f) ||
        (c >= 0xba3 && c <= 0xba4) ||
        (c >= 0xba8 && c <= 0xbaa) ||
        (c >= 0xbae && c <= 0xbb5) ||
        (c >= 0xbb7 && c <= 0xbb9) ||
        (c >= 0xc05 && c <= 0xc0c) ||
        (c >= 0xc0e && c <= 0xc10) ||
        (c >= 0xc12 && c <= 0xc28) ||
        (c >= 0xc2a && c <= 0xc33) ||
        (c >= 0xc35 && c <= 0xc39) ||
        (c >= 0xc60 && c <= 0xc61) ||
        (c >= 0xc85 && c <= 0xc8c) ||
        (c >= 0xc8e && c <= 0xc90) ||
        (c >= 0xc92 && c <= 0xca8) ||
        (c >= 0xcaa && c <= 0xcb3) ||
        (c >= 0xcb5 && c <= 0xcb9) ||
        c == 0xcde ||
        (c >= 0xce0 && c <= 0xce1) ||
        (c >= 0xd05 && c <= 0xd0c) ||
        (c >= 0xd0e && c <= 0xd10) ||
        (c >= 0xd12 && c <= 0xd28) ||
        (c >= 0xd2a && c <= 0xd39) ||
        (c >= 0xd60 && c <= 0xd61) ||
        (c >= 0xe01 && c <= 0xe2e) ||
        c == 0xe30 ||
        (c >= 0xe32 && c <= 0xe33) ||
        (c >= 0xe40 && c <= 0xe45) ||
        (c >= 0xe81 && c <= 0xe82) ||
        c == 0xe84 ||
        (c >= 0xe87 && c <= 0xe88) ||
        c == 0xe8a ||
        c == 0xe8d ||
        (c >= 0xe94 && c <= 0xe97) ||
        (c >= 0xe99 && c <= 0xe9f) ||
        (c >= 0xea1 && c <= 0xea3) ||
        c == 0xea5 ||
        c == 0xea7 ||
        (c >= 0xeaa && c <= 0xeab) ||
        (c >= 0xead && c <= 0xeae) ||
        c == 0xeb0 ||
        (c >= 0xeb2 && c <= 0xeb3) ||
        c == 0xebd ||
        (c >= 0xec0 && c <= 0xec4) ||
        (c >= 0xf40 && c <= 0xf47) ||
        (c >= 0xf49 && c <= 0xf69) ||
        (c >= 0x10a0 && c <= 0x10c5) ||
        (c >= 0x10d0 && c <= 0x10f6) ||
        c == 0x1100 ||
        (c >= 0x1102 && c <= 0x1103) ||
        (c >= 0x1105 && c <= 0x1107) ||
        c == 0x1109 ||
        (c >= 0x110b && c <= 0x110c) ||
        (c >= 0x110e && c <= 0x1112) ||
        c == 0x113c ||
        c == 0x113e ||
        c == 0x1140 ||
        c == 0x114c ||
        c == 0x114e ||
        c == 0x1150 ||
        (c >= 0x1154 && c <= 0x1155) ||
        c == 0x1159 ||
        (c >= 0x115f && c <= 0x1161) ||
        c == 0x1163 ||
        c == 0x1165 ||
        c == 0x1167 ||
        c == 0x1169 ||
        (c >= 0x116d && c <= 0x116e) ||
        (c >= 0x1172 && c <= 0x1173) ||
        c == 0x1175 ||
        c == 0x119e ||
        c == 0x11a8 ||
        c == 0x11ab ||
        (c >= 0x11ae && c <= 0x11af) ||
        (c >= 0x11b7 && c <= 0x11b8) ||
        c == 0x11ba ||
        (c >= 0x11bc && c <= 0x11c2) ||
        c == 0x11eb ||
        c == 0x11f0 ||
        c == 0x11f9 ||
        (c >= 0x1e00 && c <= 0x1e9b) ||
        (c >= 0x1ea0 && c <= 0x1ef9) ||
        (c >= 0x1f00 && c <= 0x1f15) ||
        (c >= 0x1f18 && c <= 0x1f1d) ||
        (c >= 0x1f20 && c <= 0x1f45) ||
        (c >= 0x1f48 && c <= 0x1f4d) ||
        (c >= 0x1f50 && c <= 0x1f57) ||
        c == 0x1f59 ||
        c == 0x1f5b ||
        c == 0x1f5d ||
        (c >= 0x1f5f && c <= 0x1f7d) ||
        (c >= 0x1f80 && c <= 0x1fb4) ||
        (c >= 0x1fb6 && c <= 0x1fbc) ||
        c == 0x1fbe ||
        (c >= 0x1fc2 && c <= 0x1fc4) ||
        (c >= 0x1fc6 && c <= 0x1fcc) ||
        (c >= 0x1fd0 && c <= 0x1fd3) ||
        (c >= 0x1fd6 && c <= 0x1fdb) ||
        (c >= 0x1fe0 && c <= 0x1fec) ||
        (c >= 0x1ff2 && c <= 0x1ff4) ||
        (c >= 0x1ff6 && c <= 0x1ffc) ||
        c == 0x2126 ||
        (c >= 0x212a && c <= 0x212b) ||
        c == 0x212e ||
        (c >= 0x2180 && c <= 0x2182) ||
        (c >= 0x3041 && c <= 0x3094) ||
        (c >= 0x30a1 && c <= 0x30fa) ||
        (c >= 0x3105 && c <= 0x312c) ||
        (c >= 0xac00 && c <= 0xd7a3) ||
        (c >= 0x4e00 && c <= 0x9fa5) ||
        c == 0x3007 ||
        (c >= 0x3021 && c <= 0x3029));
}
|
|
|
|
|
|
|
|
/* True when c may appear inside an XML name: either IsXMLLetter(c) holds,
   or c is one of '.', '_', ':', '-', or c falls in one of the ranges
   below. The ranges are tested in the same order as before, in three
   groups, and testing stops at the first group that matches.
   NOTE(review): the groups look like the CombiningChar, Digit and
   Extender productions of XML 1.0 Appendix B - confirm before editing. */
Bool TY_(IsXMLNamechar)(uint c)
{
    int ok;

    /* letters and the four name punctuation characters */
    ok = TY_(IsXMLLetter)(c) ||
         c == '.' || c == '_' ||
         c == ':' || c == '-';

    /* first group of ranges */
    if ( !ok )
        ok = ((c >= 0x300 && c <= 0x345) || (c >= 0x360 && c <= 0x361) ||
              (c >= 0x483 && c <= 0x486) || (c >= 0x591 && c <= 0x5a1) ||
              (c >= 0x5a3 && c <= 0x5b9) || (c >= 0x5bb && c <= 0x5bd) ||
              c == 0x5bf ||
              (c >= 0x5c1 && c <= 0x5c2) ||
              c == 0x5c4 ||
              (c >= 0x64b && c <= 0x652) ||
              c == 0x670 ||
              (c >= 0x6d6 && c <= 0x6dc) || (c >= 0x6dd && c <= 0x6df) ||
              (c >= 0x6e0 && c <= 0x6e4) || (c >= 0x6e7 && c <= 0x6e8) ||
              (c >= 0x6ea && c <= 0x6ed) || (c >= 0x901 && c <= 0x903) ||
              c == 0x93c ||
              (c >= 0x93e && c <= 0x94c) ||
              c == 0x94d ||
              (c >= 0x951 && c <= 0x954) || (c >= 0x962 && c <= 0x963) ||
              (c >= 0x981 && c <= 0x983) ||
              c == 0x9bc || c == 0x9be || c == 0x9bf ||
              (c >= 0x9c0 && c <= 0x9c4) || (c >= 0x9c7 && c <= 0x9c8) ||
              (c >= 0x9cb && c <= 0x9cd) ||
              c == 0x9d7 ||
              (c >= 0x9e2 && c <= 0x9e3) ||
              c == 0xa02 || c == 0xa3c || c == 0xa3e || c == 0xa3f ||
              (c >= 0xa40 && c <= 0xa42) || (c >= 0xa47 && c <= 0xa48) ||
              (c >= 0xa4b && c <= 0xa4d) || (c >= 0xa70 && c <= 0xa71) ||
              (c >= 0xa81 && c <= 0xa83) ||
              c == 0xabc ||
              (c >= 0xabe && c <= 0xac5) || (c >= 0xac7 && c <= 0xac9) ||
              (c >= 0xacb && c <= 0xacd) || (c >= 0xb01 && c <= 0xb03) ||
              c == 0xb3c ||
              (c >= 0xb3e && c <= 0xb43) || (c >= 0xb47 && c <= 0xb48) ||
              (c >= 0xb4b && c <= 0xb4d) || (c >= 0xb56 && c <= 0xb57) ||
              (c >= 0xb82 && c <= 0xb83) || (c >= 0xbbe && c <= 0xbc2) ||
              (c >= 0xbc6 && c <= 0xbc8) || (c >= 0xbca && c <= 0xbcd) ||
              c == 0xbd7 ||
              (c >= 0xc01 && c <= 0xc03) || (c >= 0xc3e && c <= 0xc44) ||
              (c >= 0xc46 && c <= 0xc48) || (c >= 0xc4a && c <= 0xc4d) ||
              (c >= 0xc55 && c <= 0xc56) || (c >= 0xc82 && c <= 0xc83) ||
              (c >= 0xcbe && c <= 0xcc4) || (c >= 0xcc6 && c <= 0xcc8) ||
              (c >= 0xcca && c <= 0xccd) || (c >= 0xcd5 && c <= 0xcd6) ||
              (c >= 0xd02 && c <= 0xd03) || (c >= 0xd3e && c <= 0xd43) ||
              (c >= 0xd46 && c <= 0xd48) || (c >= 0xd4a && c <= 0xd4d) ||
              c == 0xd57 ||
              c == 0xe31 ||
              (c >= 0xe34 && c <= 0xe3a) || (c >= 0xe47 && c <= 0xe4e) ||
              c == 0xeb1 ||
              (c >= 0xeb4 && c <= 0xeb9) || (c >= 0xebb && c <= 0xebc) ||
              (c >= 0xec8 && c <= 0xecd) || (c >= 0xf18 && c <= 0xf19) ||
              c == 0xf35 || c == 0xf37 || c == 0xf39 ||
              c == 0xf3e || c == 0xf3f ||
              (c >= 0xf71 && c <= 0xf84) || (c >= 0xf86 && c <= 0xf8b) ||
              (c >= 0xf90 && c <= 0xf95) ||
              c == 0xf97 ||
              (c >= 0xf99 && c <= 0xfad) || (c >= 0xfb1 && c <= 0xfb7) ||
              c == 0xfb9 ||
              (c >= 0x20d0 && c <= 0x20dc) ||
              c == 0x20e1 ||
              (c >= 0x302a && c <= 0x302f) ||
              c == 0x3099 || c == 0x309a);

    /* second group: ten-wide runs starting with 0x30..0x39 */
    if ( !ok )
        ok = ((c >= 0x30 && c <= 0x39) ||
              (c >= 0x660 && c <= 0x669) || (c >= 0x6f0 && c <= 0x6f9) ||
              (c >= 0x966 && c <= 0x96f) || (c >= 0x9e6 && c <= 0x9ef) ||
              (c >= 0xa66 && c <= 0xa6f) || (c >= 0xae6 && c <= 0xaef) ||
              (c >= 0xb66 && c <= 0xb6f) || (c >= 0xbe7 && c <= 0xbef) ||
              (c >= 0xc66 && c <= 0xc6f) || (c >= 0xce6 && c <= 0xcef) ||
              (c >= 0xd66 && c <= 0xd6f) || (c >= 0xe50 && c <= 0xe59) ||
              (c >= 0xed0 && c <= 0xed9) || (c >= 0xf20 && c <= 0xf29));

    /* third group */
    if ( !ok )
        ok = (c == 0xb7 ||
              c == 0x2d0 || c == 0x2d1 ||
              c == 0x387 ||
              c == 0x640 ||
              c == 0xe46 ||
              c == 0xec6 ||
              c == 0x3005 ||
              (c >= 0x3031 && c <= 0x3035) ||
              (c >= 0x309d && c <= 0x309e) ||
              (c >= 0x30fc && c <= 0x30fe));

    return ok;
}
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
/* (Disabled by the surrounding #if 0.) True when the lexmap entry for c
   has the 'lowercase' bit set. */
Bool IsLower(uint c)
{
    return ( MAP(c) & lowercase ) != 0;
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* True when the lexmap entry for c has the 'uppercase' bit set
   (always false for c >= 128). */
Bool TY_(IsUpper)(uint c)
{
    return ( MAP(c) & uppercase ) != 0;
}
|
|
|
|
|
|
|
|
/* Return c shifted by 'a' - 'A' when lexmap marks it as uppercase;
   every other value is returned unchanged. */
uint TY_(ToLower)(uint c)
{
    if ( (MAP(c) & uppercase) == 0 )
        return c;

    return c + ('a' - 'A');
}
|
|
|
|
|
|
|
|
/* Return c shifted by 'A' - 'a' when lexmap marks it as lowercase;
   every other value is returned unchanged. */
uint TY_(ToUpper)(uint c)
{
    if ( (MAP(c) & lowercase) == 0 )
        return c;

    return c - (uint) ('a' - 'A');
}
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
/* (Disabled by the surrounding #if 0.) Unless the TidyXmlTags option is
   set, fold c to upper case when 'tocaps' is true and to lower case
   otherwise; with TidyXmlTags set, c is returned unchanged. */
char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
{
    if ( cfgBool(doc, TidyXmlTags) )
        return c;

    if ( tocaps )
        return (tmbchar) ToUpper(c);

    /* force to lower case */
    return (tmbchar) ToLower(c);
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
return last character in string
|
|
|
|
this is useful when trailing quotemark
|
|
|
|
is missing on an attribute
|
|
|
|
*/
|
|
|
|
/* Return the final character of 'str', or 0 when 'str' is NULL or empty. */
static tmbchar LastChar( tmbstr str )
{
    int length;

    if ( str == NULL || *str == '\0' )
        return 0;

    length = TY_(tmbstrlen)(str);
    return str[length - 1];
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
node->type is one of these:
|
|
|
|
|
|
|
|
#define TextNode 1
|
|
|
|
#define StartTag 2
|
|
|
|
#define EndTag 3
|
|
|
|
#define StartEndTag 4
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Allocate a zero-filled Lexer for 'doc' and set its starting state:
   position 1:1, content state, all versions (including proprietary)
   allowed, unknown doctype, and the document's allocator and root node.
   Returns NULL when the allocation returns NULL. */
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
{
    Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );

    if ( lexer == NULL )
        return lexer;

    TidyClearMemory( lexer, sizeof(Lexer) );

    lexer->allocator = doc->allocator;
    lexer->root      = &doc->root;

    lexer->state   = LEX_CONTENT;
    lexer->lines   = 1;
    lexer->columns = 1;

    lexer->doctype  = VERS_UNKNOWN;
    lexer->versions = (VERS_ALL|VERS_PROPRIETARY);

    return lexer;
}
|
|
|
|
|
|
|
|
/* True when the input stream has no pushed-back data and IsEOF reports
   end of file. The document must have an input stream attached. */
static Bool EndOfInput( TidyDocImpl* doc )
{
    assert( doc->docIn != NULL );

    if ( doc->docIn->pushed )
        return 0;

    return TY_(IsEOF)(doc->docIn) ? 1 : 0;
}
|
|
|
|
|
|
|
|
/* Release the document's lexer and everything hanging off it: styles,
   any pending token nodes, the inline stack and the lex buffer.
   doc->lexer is set to NULL afterwards. Does nothing when the document
   has no lexer. */
void TY_(FreeLexer)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;

    if ( lexer == NULL )
        return;

    TY_(FreeStyles)( doc );

    /* See GetToken() */
    if ( lexer->pushed )
    {
        TY_(FreeNode)( doc, lexer->itoken );
        TY_(FreeNode)( doc, lexer->token );
    }
    else if ( lexer->itoken )
    {
        TY_(FreeNode)( doc, lexer->token );
    }

    /* drain the inline stack before freeing its storage */
    while ( lexer->istacksize > 0 )
        TY_(PopInline)( doc, NULL );

    TidyDocFree( doc, lexer->istack );
    TidyDocFree( doc, lexer->lexbuf );
    TidyDocFree( doc, lexer );
    doc->lexer = NULL;
}
|
|
|
|
|
|
|
|
/* Lexer uses bigger memory chunks than pprint as
|
|
|
|
** it must hold the entire input document. not just
|
|
|
|
** the last line or three.
|
|
|
|
*/
|
|
|
|
/* Append one byte to the lexer buffer, keeping the buffer NUL-terminated.
   The buffer is grown when fewer than two spare bytes remain: it starts
   at 8192 bytes and doubles until it is large enough, and the newly added
   part is zero-filled.
   If the allocator fails to grow the buffer the byte is dropped; the
   previous code went on to write through the old buffer, which is too
   small (or NULL on the very first call). */
static void AddByte( Lexer *lexer, tmbchar ch )
{
    if ( lexer->lexsize + 2 >= lexer->lexlength )
    {
        tmbstr buf = NULL;
        uint allocAmt = lexer->lexlength;

        while ( lexer->lexsize + 2 >= allocAmt )
        {
            if ( allocAmt == 0 )
                allocAmt = 8192;
            else
                allocAmt *= 2;
        }

        buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt );
        if ( buf == NULL )
        {
            /* cannot store the byte without writing out of bounds;
               lexbuf/lexlength are left untouched and remain valid */
            return;
        }

        TidyClearMemory( buf + lexer->lexlength,
                         allocAmt - lexer->lexlength );
        lexer->lexbuf = buf;
        lexer->lexlength = allocAmt;
    }

    lexer->lexbuf[ lexer->lexsize++ ] = ch;
    lexer->lexbuf[ lexer->lexsize ] = '\0';  /* debug */
}
|
|
|
|
|
|
|
|
/* Overwrite the most recently stored byte of the lexer buffer with c.
   Does nothing while the buffer is empty. */
static void ChangeChar( Lexer *lexer, tmbchar c )
{
    if ( lexer->lexsize == 0 )
        return;

    lexer->lexbuf[ lexer->lexsize - 1 ] = c;
}
|
|
|
|
|
|
|
|
/* store character c as UTF-8 encoded byte stream */
|
|
|
|
/* Store character c in the lexer buffer as UTF-8 bytes. When the encoder
   reports an error, the three bytes EF BF BD (U+FFFD, the replacement
   character) are stored instead. */
void TY_(AddCharToLexer)( Lexer *lexer, uint c )
{
    tmbchar utf8[10] = {0};
    int nbytes = 0;
    int k;

    if ( TY_(EncodeCharToUTF8Bytes)( c, utf8, NULL, &nbytes ) )
    {
        /* replacement character 0xFFFD encoded as UTF-8 */
        utf8[0] = (byte) 0xEF;
        utf8[1] = (byte) 0xBF;
        utf8[2] = (byte) 0xBD;
        nbytes = 3;
    }

    k = 0;
    while ( k < nbytes )
    {
        AddByte( lexer, utf8[k] );
        ++k;
    }
}
|
|
|
|
|
|
|
|
/* Feed every byte of the NUL-terminated string 'str' to AddCharToLexer,
   one byte at a time. */
static void AddStringToLexer( Lexer *lexer, ctmbstr str )
{
    /* Each byte goes through unsigned char first: a plain (signed) char
    ** would be sign-extended when widened to uint, turning bytes >= 0x80
    ** into huge values.
    */
    for ( ; *str != '\0'; ++str )
    {
        const uint c = (unsigned char) *str;
        TY_(AddCharToLexer)( lexer, c );
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Copy the input stream's current line and column into the lexer. */
static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
{
    lexer->columns = doc->docIn->curcol;
    lexer->lines   = doc->docIn->curline;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
No longer attempts to insert missing ';' for unknown
|
|
|
|
entities unless one was present already, since this
|
|
|
|
gives unexpected results.
|
|
|
|
|
|
|
|
For example: <a href="something.htm?foo&bar&fred">
|
|
|
|
was tidied to: <a href="something.htm?foo&bar;&fred;">
|
|
|
|
rather than: <a href="something.htm?foo&amp;bar&amp;fred">
|
|
|
|
|
|
|
|
My thanks to Maurice Buxton for spotting this.
|
|
|
|
|
|
|
|
Also Randy Waki pointed out the following case for the
|
|
|
|
04 Aug 00 version (bug #433012):
|
|
|
|
|
|
|
|
For example: <a href="something.htm?id=1&lang=en">
|
|
|
|
was tidied to: <a href="something.htm?id=1&lang;=en">
|
|
|
|
rather than: <a href="something.htm?id=1&amp;lang=en">
|
|
|
|
|
|
|
|
where "lang" is a known entity (#9001), but browsers would
|
|
|
|
misinterpret "&lang;" because it had a value > 256.
|
|
|
|
|
|
|
|
So the case of an apparently known entity with a value > 256 and
|
|
|
|
missing a semicolon is handled specially.
|
|
|
|
|
|
|
|
"ParseEntity" is also a bit of a misnomer - it handles entities and
|
|
|
|
numeric character references. Invalid NCR's are now reported.
|
|
|
|
*/
|
|
|
|
static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
|
|
|
|
{
|
|
|
|
typedef enum
|
|
|
|
{
|
|
|
|
ENT_default,
|
|
|
|
ENT_numdec,
|
|
|
|
ENT_numhex
|
|
|
|
} ENTState;
|
|
|
|
|
|
|
|
typedef Bool (*ENTfn)(uint);
|
|
|
|
const ENTfn entFn[] = {
|
|
|
|
TY_(IsNamechar),
|
|
|
|
TY_(IsDigit),
|
|
|
|
IsDigitHex
|
|
|
|
};
|
|
|
|
uint start;
|
|
|
|
ENTState entState = ENT_default;
|
|
|
|
uint charRead = 0;
|
|
|
|
Bool semicolon = no, found = no;
|
|
|
|
Bool isXml = cfgBool( doc, TidyXmlTags );
|
|
|
|
Bool preserveEntities = cfgBool( doc, TidyPreserveEntities );
|
|
|
|
uint c, ch, startcol, entver = 0;
|
|
|
|
Lexer* lexer = doc->lexer;
|
|
|
|
|
|
|
|
start = lexer->lexsize - 1; /* to start at "&" */
|
|
|
|
startcol = doc->docIn->curcol - 1;
|
|
|
|
|
|
|
|
while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
|
|