/* entities.c -- recognize HTML ISO entities (c) 1998-2008 (W3C) MIT, ERCIM, Keio University See tidy.h for the copyright notice. Entity handling can be static because there are no config or document-specific values. Lookup table is 100% defined at compile time. */ #include #include "entities.h" #include "tidy-int.h" #include "tmbstr.h" struct _entity; typedef struct _entity entity; struct _entity { ctmbstr name; uint versions; uint code; }; static const entity entities[] = { /* ** Markup pre-defined character entities */ { "quot", VERS_ALL|VERS_XML, 34 }, { "amp", VERS_ALL|VERS_XML, 38 }, { "apos", VERS_FROM40|VERS_XML, 39 }, { "lt", VERS_ALL|VERS_XML, 60 }, { "gt", VERS_ALL|VERS_XML, 62 }, /* ** Latin-1 character entities */ { "nbsp", VERS_ALL, 160 }, { "iexcl", VERS_ALL, 161 }, { "cent", VERS_ALL, 162 }, { "pound", VERS_ALL, 163 }, { "curren", VERS_ALL, 164 }, { "yen", VERS_ALL, 165 }, { "brvbar", VERS_ALL, 166 }, { "sect", VERS_ALL, 167 }, { "uml", VERS_ALL, 168 }, { "copy", VERS_ALL, 169 }, { "ordf", VERS_ALL, 170 }, { "laquo", VERS_ALL, 171 }, { "not", VERS_ALL, 172 }, { "shy", VERS_ALL, 173 }, { "reg", VERS_ALL, 174 }, { "macr", VERS_ALL, 175 }, { "deg", VERS_ALL, 176 }, { "plusmn", VERS_ALL, 177 }, { "sup2", VERS_ALL, 178 }, { "sup3", VERS_ALL, 179 }, { "acute", VERS_ALL, 180 }, { "micro", VERS_ALL, 181 }, { "para", VERS_ALL, 182 }, { "middot", VERS_ALL, 183 }, { "cedil", VERS_ALL, 184 }, { "sup1", VERS_ALL, 185 }, { "ordm", VERS_ALL, 186 }, { "raquo", VERS_ALL, 187 }, { "frac14", VERS_ALL, 188 }, { "frac12", VERS_ALL, 189 }, { "frac34", VERS_ALL, 190 }, { "iquest", VERS_ALL, 191 }, { "Agrave", VERS_ALL, 192 }, { "Aacute", VERS_ALL, 193 }, { "Acirc", VERS_ALL, 194 }, { "Atilde", VERS_ALL, 195 }, { "Auml", VERS_ALL, 196 }, { "Aring", VERS_ALL, 197 }, { "AElig", VERS_ALL, 198 }, { "Ccedil", VERS_ALL, 199 }, { "Egrave", VERS_ALL, 200 }, { "Eacute", VERS_ALL, 201 }, { "Ecirc", VERS_ALL, 202 }, { "Euml", VERS_ALL, 203 }, { "Igrave", VERS_ALL, 204 }, { "Iacute", VERS_ALL, 205 }, { "Icirc", VERS_ALL, 206 }, { "Iuml", VERS_ALL, 207 }, { "ETH", VERS_ALL, 208 }, { "Ntilde", VERS_ALL, 209 }, { "Ograve", VERS_ALL, 210 }, { "Oacute", VERS_ALL, 211 }, { "Ocirc", VERS_ALL, 212 }, { "Otilde", VERS_ALL, 213 }, { "Ouml", VERS_ALL, 214 }, { "times", VERS_ALL, 215 }, { "Oslash", VERS_ALL, 216 }, { "Ugrave", VERS_ALL, 217 }, { "Uacute", VERS_ALL, 218 }, { "Ucirc", VERS_ALL, 219 }, { "Uuml", VERS_ALL, 220 }, { "Yacute", VERS_ALL, 221 }, { "THORN", VERS_ALL, 222 }, { "szlig", VERS_ALL, 223 }, { "agrave", VERS_ALL, 224 }, { "aacute", VERS_ALL, 225 }, { "acirc", VERS_ALL, 226 }, { "atilde", VERS_ALL, 227 }, { "auml", VERS_ALL, 228 }, { "aring", VERS_ALL, 229 }, { "aelig", VERS_ALL, 230 }, { "ccedil", VERS_ALL, 231 }, { "egrave", VERS_ALL, 232 }, { "eacute", VERS_ALL, 233 }, { "ecirc", VERS_ALL, 234 }, { "euml", VERS_ALL, 235 }, { "igrave", VERS_ALL, 236 }, { "iacute", VERS_ALL, 237 }, { "icirc", VERS_ALL, 238 }, { "iuml", VERS_ALL, 239 }, { "eth", VERS_ALL, 240 }, { "ntilde", VERS_ALL, 241 }, { "ograve", VERS_ALL, 242 }, { "oacute", VERS_ALL, 243 }, { "ocirc", VERS_ALL, 244 }, { "otilde", VERS_ALL, 245 }, { "ouml", VERS_ALL, 246 }, { "divide", VERS_ALL, 247 }, { "oslash", VERS_ALL, 248 }, { "ugrave", VERS_ALL, 249 }, { "uacute", VERS_ALL, 250 }, { "ucirc", VERS_ALL, 251 }, { "uuml", VERS_ALL, 252 }, { "yacute", VERS_ALL, 253 }, { "thorn", VERS_ALL, 254 }, { "yuml", VERS_ALL, 255 }, /* ** Extended Entities defined in HTML 4: Symbols */ { "fnof", VERS_FROM40, 402 }, { "Alpha", VERS_FROM40, 913 }, { "Beta", VERS_FROM40, 914 }, { "Gamma", VERS_FROM40, 915 }, { "Delta", VERS_FROM40, 916 }, { "Epsilon", VERS_FROM40, 917 }, { "Zeta", VERS_FROM40, 918 }, { "Eta", VERS_FROM40, 919 }, { "Theta", VERS_FROM40, 920 }, { "Iota", VERS_FROM40, 921 }, { "Kappa", VERS_FROM40, 922 }, { "Lambda", VERS_FROM40, 923 }, { "Mu", VERS_FROM40, 924 }, { "Nu", VERS_FROM40, 925 }, { "Xi", VERS_FROM40, 926 }, { "Omicron", VERS_FROM40, 927 }, { "Pi", VERS_FROM40, 928 }, { "Rho", VERS_FROM40, 929 }, { "Sigma", VERS_FROM40, 931 }, { "Tau", VERS_FROM40, 932 }, { "Upsilon", VERS_FROM40, 933 }, { "Phi", VERS_FROM40, 934 }, { "Chi", VERS_FROM40, 935 }, { "Psi", VERS_FROM40, 936 }, { "Omega", VERS_FROM40, 937 }, { "alpha", VERS_FROM40, 945 }, { "beta", VERS_FROM40, 946 }, { "gamma", VERS_FROM40, 947 }, { "delta", VERS_FROM40, 948 }, { "epsilon", VERS_FROM40, 949 }, { "zeta", VERS_FROM40, 950 }, { "eta", VERS_FROM40, 951 }, { "theta", VERS_FROM40, 952 }, { "iota", VERS_FROM40, 953 }, { "kappa", VERS_FROM40, 954 }, { "lambda", VERS_FROM40, 955 }, { "mu", VERS_FROM40, 956 }, { "nu", VERS_FROM40, 957 }, { "xi", VERS_FROM40, 958 }, { "omicron", VERS_FROM40, 959 }, { "pi", VERS_FROM40, 960 }, { "rho", VERS_FROM40, 961 }, { "sigmaf", VERS_FROM40, 962 }, { "sigma", VERS_FROM40, 963 }, { "tau", VERS_FROM40, 964 }, { "upsilon", VERS_FROM40, 965 }, { "phi", VERS_FROM40, 966 }, { "chi", VERS_FROM40, 967 }, { "psi", VERS_FROM40, 968 }, { "omega", VERS_FROM40, 969 }, { "thetasym", VERS_FROM40, 977 }, { "upsih", VERS_FROM40, 978 }, { "piv", VERS_FROM40, 982 }, { "bull", VERS_FROM40, 8226 }, { "hellip", VERS_FROM40, 8230 }, { "prime", VERS_FROM40, 8242 }, { "Prime", VERS_FROM40, 8243 }, { "oline", VERS_FROM40, 8254 }, { "frasl", VERS_FROM40, 8260 }, { "weierp", VERS_FROM40, 8472 }, { "image", VERS_FROM40, 8465 }, { "real", VERS_FROM40, 8476 }, { "trade", VERS_FROM40, 8482 }, { "alefsym", VERS_FROM40, 8501 }, { "larr", VERS_FROM40, 8592 }, { "uarr", VERS_FROM40, 8593 }, { "rarr", VERS_FROM40, 8594 }, { "darr", VERS_FROM40, 8595 }, { "harr", VERS_FROM40, 8596 }, { "crarr", VERS_FROM40, 8629 }, { "lArr", VERS_FROM40, 8656 }, { "uArr", VERS_FROM40, 8657 }, { "rArr", VERS_FROM40, 8658 }, { "dArr", VERS_FROM40, 8659 }, { "hArr", VERS_FROM40, 8660 }, { "forall", VERS_FROM40, 8704 }, { "part", VERS_FROM40, 8706 }, { "exist", VERS_FROM40, 8707 }, { "empty", VERS_FROM40, 8709 }, { "nabla", VERS_FROM40, 8711 }, { "isin", VERS_FROM40, 8712 }, { "notin", VERS_FROM40, 8713 }, { "ni", VERS_FROM40, 8715 }, { "prod", VERS_FROM40, 8719 }, { "sum", VERS_FROM40, 8721 }, { "minus", VERS_FROM40, 8722 }, { "lowast", VERS_FROM40, 8727 }, { "radic", VERS_FROM40, 8730 }, { "prop", VERS_FROM40, 8733 }, { "infin", VERS_FROM40, 8734 }, { "ang", VERS_FROM40, 8736 }, { "and", VERS_FROM40, 8743 }, { "or", VERS_FROM40, 8744 }, { "cap", VERS_FROM40, 8745 }, { "cup", VERS_FROM40, 8746 }, { "int", VERS_FROM40, 8747 }, { "there4", VERS_FROM40, 8756 }, { "sim", VERS_FROM40, 8764 }, { "cong", VERS_FROM40, 8773 }, { "asymp", VERS_FROM40, 8776 }, { "ne", VERS_FROM40, 8800 }, { "equiv", VERS_FROM40, 8801 }, { "le", VERS_FROM40, 8804 }, { "ge", VERS_FROM40, 8805 }, { "sub", VERS_FROM40, 8834 }, { "sup", VERS_FROM40, 8835 }, { "nsub", VERS_FROM40, 8836 }, { "sube", VERS_FROM40, 8838 }, { "supe", VERS_FROM40, 8839 }, { "oplus", VERS_FROM40, 8853 }, { "otimes", VERS_FROM40, 8855 }, { "perp", VERS_FROM40, 8869 }, { "sdot", VERS_FROM40, 8901 }, { "lceil", VERS_FROM40, 8968 }, { "rceil", VERS_FROM40, 8969 }, { "lfloor", VERS_FROM40, 8970 }, { "rfloor", VERS_FROM40, 8971 }, { "lang", VERS_FROM40, 10216 }, { "rang", VERS_FROM40, 10217 }, { "loz", VERS_FROM40, 9674 }, { "spades", VERS_FROM40, 9824 }, { "clubs", VERS_FROM40, 9827 }, { "hearts", VERS_FROM40, 9829 }, { "diams", VERS_FROM40, 9830 }, /* ** Extended Entities defined in HTML 4: Special (less Markup at top) */ { "OElig", VERS_FROM40, 338 }, { "oelig", VERS_FROM40, 339 }, { "Scaron", VERS_FROM40, 352 }, { "scaron", VERS_FROM40, 353 }, { "Yuml", VERS_FROM40, 376 }, { "circ", VERS_FROM40, 710 }, { "tilde", VERS_FROM40, 732 }, { "ensp", VERS_FROM40, 8194 }, { "emsp", VERS_FROM40, 8195 }, { "thinsp", VERS_FROM40, 8201 }, { "zwnj", VERS_FROM40, 8204 }, { "zwj", VERS_FROM40, 8205 }, { "lrm", VERS_FROM40, 8206 }, { "rlm", VERS_FROM40, 8207 }, { "ndash", VERS_FROM40, 8211 }, { "mdash", VERS_FROM40, 8212 }, { "lsquo", VERS_FROM40, 8216 }, { "rsquo", VERS_FROM40, 8217 }, { "sbquo", VERS_FROM40, 8218 }, { "ldquo", VERS_FROM40, 8220 }, { "rdquo", VERS_FROM40, 8221 }, { "bdquo", VERS_FROM40, 8222 }, { "dagger", VERS_FROM40, 8224 }, { "Dagger", VERS_FROM40, 8225 }, { "permil", VERS_FROM40, 8240 }, { "lsaquo", VERS_FROM40, 8249 }, { "rsaquo", VERS_FROM40, 8250 }, { "euro", VERS_FROM40, 8364 }, { NULL, VERS_UNKNOWN, 0 } }; /* Pure static implementation. Trades off lookup speed ** for faster setup time (well, none actually). ** Optimization of comparing 1st character buys enough ** speed that hash doesn't improve things without > 500 ** items in list. */ static const entity* entitiesLookup( ctmbstr s ) { tmbchar ch = (tmbchar)( s ? *s : 0 ); const entity *np; for ( np = entities; ch && np && np->name; ++np ) if ( ch == *np->name && TY_(tmbstrcmp)(s, np->name) == 0 ) return np; return NULL; } #if 0 /* entity starting with "&" returns zero on error */ uint EntityCode( ctmbstr name, uint versions ) { const entity* np; assert( name && name[0] == '&' ); /* numeric entitity: name = "&#" followed by number */ if ( name[1] == '#' ) { uint c = 0; /* zero on missing/bad number */ Bool isXml = ( (versions & VERS_XML) == VERS_XML ); /* 'x' prefix denotes hexadecimal number format */ if ( name[2] == 'x' || (!isXml && name[2] == 'X') ) sscanf( name+3, "%x", &c ); else sscanf( name+2, "%u", &c ); return (uint) c; } /* Named entity: name ="&" followed by a name */ if ( NULL != (np = entitiesLookup(name+1)) ) { /* Only recognize entity name if version supports it. */ if ( np->versions & versions ) return np->code; } return 0; /* zero signifies unknown entity name */ } #endif Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions ) { const entity* np; int res; assert( name && name[0] == '&' ); assert( code != NULL ); assert( versions != NULL ); /* numeric entitity: name = "&#" followed by number */ if ( name[1] == '#' ) { uint c = 0; /* zero on missing/bad number */ /* 'x' prefix denotes hexadecimal number format */ if ( name[2] == 'x' || (!isXml && name[2] == 'X') ) res = sscanf( name+3, "%x", &c ); else res = sscanf( name+2, "%u", &c ); /* Issue #373 - Null Char in XML result doc - sf905 2009 */ if ( res == 1 ) { *code = c; *versions = VERS_ALL; return yes; } else { *code = 0; *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY ); return no; } } /* Named entity: name ="&" followed by a name */ if ( NULL != (np = entitiesLookup(name+1)) ) { *code = np->code; *versions = np->versions; return yes; } *code = 0; *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY ); return no; } ctmbstr TY_(EntityName)( uint ch, uint versions ) { ctmbstr entnam = NULL; const entity *ep; for ( ep = entities; ep->name != NULL; ++ep ) { if ( ep->code == ch ) { if ( ep->versions & versions ) entnam = ep->name; break; /* Found code. Stop search. */ } } return entnam; } /* * local variables: * mode: c * indent-tabs-mode: nil * c-basic-offset: 4 * eval: (c-set-offset 'substatement-open 0) * end: */