Browse Source

embedded HTML Tidy library

master
hrbrmstr 8 years ago
parent
commit
0a142afbd4
No known key found for this signature in database GPG Key ID: 1D7529BE14E2BBA9
  1. BIN
      .DS_Store
  2. 16
      DESCRIPTION
  3. 2
      NAMESPACE
  4. 6
      NEWS.md
  5. 6
      R/RcppExports.R
  6. 12
      README.Rmd
  7. 16
      README.md
  8. 6
      man/tidy_html.Rd
  9. BIN
      src/.DS_Store
  10. 3
      src/Makevars
  11. 16
      src/RcppExports.cpp
  12. 3305
      src/access.c
  13. 281
      src/access.h
  14. 118
      src/alloc.c
  15. 204
      src/attrask.c
  16. 3619
      src/attrdict.c
  17. 156
      src/attrdict.h
  18. 208
      src/attrget.c
  19. 2315
      src/attrs.c
  20. 458
      src/attrs.h
  21. 226
      src/buffio.c
  22. 6
      src/buffio.h
  23. 1031
      src/charsets.c
  24. 13
      src/charsets.h
  25. 2692
      src/clean.c
  26. 82
      src/clean.h
  27. 1788
      src/config.c
  28. 146
      src/config.h
  29. 424
      src/entities.c
  30. 18
      src/entities.h
  31. 116
      src/fileio.c
  32. 42
      src/fileio.h
  33. 63
      src/forward.h
  34. 174
      src/gdoc.c
  35. 19
      src/gdoc.h
  36. 20
      src/htmltidy.cpp
  37. 104
      src/iconvtc.c
  38. 14
      src/iconvtc.h
  39. 380
      src/istack.c
  40. 959
      src/language.c
  41. 332
      src/language.h
  42. 2353
      src/language_en.h
  43. 132
      src/language_en_gb.h
  44. 138
      src/language_es.h
  45. 82
      src/language_es_mx.h
  46. 573
      src/language_fr.h
  47. 81
      src/language_zh_cn.h
  48. 4289
      src/lexer.c
  49. 620
      src/lexer.h
  50. 343
      src/mappedio.c
  51. 15
      src/mappedio.h
  52. 1102
      src/message.c
  53. 282
      src/message.h
  54. 5057
      src/parser.c
  55. 70
      src/parser.h
  56. 6
      src/platform.h
  57. 2564
      src/pprint.c
  58. 94
      src/pprint.h
  59. 446
      src/sprtf.c
  60. 77
      src/sprtf.h
  61. 1392
      src/streamio.c
  62. 210
      src/streamio.h
  63. 285
      src/tagask.c
  64. 1123
      src/tags.c
  65. 247
      src/tags.h
  66. 159
      src/tidy-int.h
  67. 1153
      src/tidy.h
  68. 112
      src/tidybuffio.h
  69. 858
      src/tidyenum.h
  70. 2356
      src/tidylib.c
  71. 635
      src/tidyplatform.h
  72. 295
      src/tmbstr.c
  73. 86
      src/tmbstr.h
  74. 533
      src/utf8.c
  75. 46
      src/utf8.h
  76. 23
      src/version.h
  77. 794
      src/win32tc.c
  78. 18
      src/win32tc.h
  79. 6
      tests/testthat/test-htmltidy.R

BIN
.DS_Store

Binary file not shown.

16
DESCRIPTION

@ -1,12 +1,20 @@
Package: htmltidy
Title: Clean up gnarly HTML/XML
Version: 0.1.0.9000
Authors@R: c(person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre")))
Description: Clean up gnarly HTML/XML
Title: Clean Up Gnarly HTML/XML
Version: 0.2.0.9000
Authors@R: c(
person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre")),
person("Dave", "Raggett", email = "dsr@w3.org", role = c("ctb", "aut"),
comment="HTML Tidy library")
)
Maintainer: Bob Rudis <bob@rud.is>
Description: HTML and XML documents can be beautiful and pristine. They can also be
wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before
processing it with your favorite angle-bracket parsing tools.
Depends:
R (>= 3.3.0)
License: AGPL + file LICENSE
LazyData: true
NeedsCompilation: yes
Suggests:
testthat,
xml2

2
NAMESPACE

@ -1,5 +1,5 @@
# Generated by roxygen2: do not edit by hand
export(tidy)
export(tidy_html)
importFrom(Rcpp,sourceCpp)
useDynLib(htmltidy)

6
NEWS.md

@ -1,3 +1,9 @@
# htmltidy 0.2.0.9000
* Bundled tidy-html5 library with the package
* Modified tests
# htmltidy 0.1.0.9000
* Added a `NEWS.md` file to track changes to the package.

6
R/RcppExports.R

@ -1,11 +1,11 @@
# This file was generated by Rcpp::compileAttributes
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#' Tidy HTML/XML
#'
#' @param source length 1 character vector containing the HTML/XML source to process
#' @export
tidy <- function(source) {
.Call('htmltidy_tidy', PACKAGE = 'htmltidy', source)
tidy_html <- function(source) {
.Call('htmltidy_tidy_html', PACKAGE = 'htmltidy', source)
}

12
README.Rmd

@ -21,20 +21,16 @@ knitr::opts_chunk$set(
Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data.
NOTE: Requires [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML)
You'll need to first do a `brew install tidy-html5` on MacOS or `apt-get install libtidy-dev` on Ubuntu/Debian to get this to work. NOTE that the linux libraries may be older and return slightly different (but no less tidy) HTML.
**SEEKING COLLABORATORS**
It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML)
This works enough for me to use in a pinch. It should be straightforward (but tedious) to:
- enable passing options in a `list`
- bundle `libtidy` _with the package_ and get it to work on Windows, linux & MacOS as the library compiles on all three with the necessary tools.
- Getting it to work on Windows.
The following functions are implemented:
- `tidy` : Clean up gnarly HTML/XML
- `tidy_html` : Clean up gnarly HTML/XML
### Installation
@ -54,7 +50,7 @@ library(htmltidy)
# current version
packageVersion("htmltidy")
cat(tidy("<b><p><a href='http://google.com'>google &gt</a></p></b>"))
cat(tidy_html("<b><p><a href='http://google.com'>google &gt</a></p></b>"))
```
### Code of Conduct

16
README.md

@ -6,20 +6,16 @@
Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data.
NOTE: Requires [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML)
You'll need to first do a `brew install tidy-html5` on MacOS or `apt-get install libtidy-dev` on Ubuntu/Debian to get this to work. NOTE that the linux libraries may be older and return slightly different (but no less tidy) HTML.
**SEEKING COLLABORATORS**
It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML)
This works enough for me to use in a pinch. It should be straightforward (but tedious) to:
- enable passing options in a `list`
- bundle `libtidy` *with the package* and get it to work on Windows, linux & MacOS as the library compiles on all three with the necessary tools.
- Getting it to work on Windows.
The following functions are implemented:
- `tidy` : Clean up gnarly HTML/XML
- `tidy_html` : Clean up gnarly HTML/XML
### Installation
@ -34,14 +30,14 @@ library(htmltidy)
# current version
packageVersion("htmltidy")
#> [1] '0.1.0.9000'
#> [1] '0.2.0.9000'
cat(tidy("<b><p><a href='http://google.com'>google &gt</a></p></b>"))
cat(tidy_html("<b><p><a href='http://google.com'>google &gt</a></p></b>"))
#> <!DOCTYPE html>
#> <html xmlns="http://www.w3.org/1999/xhtml">
#> <head>
#> <meta name="generator" content=
#> "HTML Tidy for HTML5 for Mac OS X version 5.2.0" />
#> "HTML Tidy for HTML5 for R version 5.0.0" />
#> <title></title>
#> </head>
#> <body>

6
man/tidy.Rd → man/tidy_html.Rd

@ -1,10 +1,10 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/RcppExports.R
\name{tidy}
\alias{tidy}
\name{tidy_html}
\alias{tidy_html}
\title{Tidy HTML/XML}
\usage{
tidy(source)
tidy_html(source)
}
\arguments{
\item{source}{length 1 character vector containing the HTML/XML source to process}

BIN
src/.DS_Store

Binary file not shown.

3
src/Makevars

@ -1 +1,2 @@
PKG_LIBS=-ltidy
PKG_CPPFLAGS = -I.
PKG_CXXFLAGS = -I.

16
src/RcppExports.cpp

@ -1,18 +1,18 @@
// This file was generated by Rcpp::compileAttributes
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include <Rcpp.h>
using namespace Rcpp;
// tidy
std::string tidy(std::string source);
RcppExport SEXP htmltidy_tidy(SEXP sourceSEXP) {
// tidy_html
std::string tidy_html(std::string source);
RcppExport SEXP htmltidy_tidy_html(SEXP sourceSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< std::string >::type source(sourceSEXP);
__result = Rcpp::wrap(tidy(source));
return __result;
rcpp_result_gen = Rcpp::wrap(tidy_html(source));
return rcpp_result_gen;
END_RCPP
}

3305
src/access.c

File diff suppressed because it is too large

281
src/access.h

@ -0,0 +1,281 @@
#ifndef __ACCESS_H__
#define __ACCESS_H__
/* access.h -- carry out accessibility checks
Copyright University of Toronto
Portions (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/*********************************************************************
* AccessibilityChecks
*
* Carries out processes for all accessibility checks. Traverses
* through all the content within the tree and evaluates the tags for
* accessibility.
*
* To perform the following checks, 'AccessibilityChecks' must be
* called AFTER the tree structure has been formed.
*
* If, in the command prompt, there is no specification of which
* accessibility priorities to check, no accessibility checks will be
* performed. (i.e. '1' for priority 1, '2' for priorities 1 and 2,
* and '3' for priorities 1, 2 and 3.)
*
* Copyright University of Toronto
* Programmed by: Mike Lam and Chris Ridpath
* Modifications by : Terry Teague (TRT)
*
*********************************************************************/
#include "forward.h"
#include "message.h"
#if SUPPORT_ACCESSIBILITY_CHECKS
/* The accessibility checks to perform depending on user's desire.
1. priority 1
2. priority 1 & 2
3. priority 1, 2, & 3
*/
/* Determines if the client-side text link is found within the document
typedef struct AreaLinks
{
struct AreaLinks* next;
char* link;
Bool HasBeenFound;
} AreaLinks;
*/
/* Element count of the fixed-size tmbchar buffers (textNode, text)
   declared in struct _TidyAccessImpl. */
enum {
TEXTBUF_SIZE=128u
};
/* State used by the accessibility checks: the priority level to check,
   fixed-size text buffers, and the counters/flags described field by
   field below. Only declared when SUPPORT_ACCESSIBILITY_CHECKS is set. */
struct _TidyAccessImpl;
typedef struct _TidyAccessImpl TidyAccessImpl;
struct _TidyAccessImpl
{
/* gets set from Tidy variable AccessibilityCheckLevel */
int PRIORITYCHK;
/* Number of characters that are found within the concatenated text */
int counter;
/* list of characters in the text nodes found within a container element */
tmbchar textNode[ TEXTBUF_SIZE ];
/* The list of characters found within one text node */
tmbchar text[ TEXTBUF_SIZE ];
/* Number of frame elements found within a frameset */
int numFrames;
/* Number of 'longdesc' attributes found within a frameset */
int HasCheckedLongDesc;
/* NOTE(review): the next three counters are undocumented in this header;
   presumably they track table-header and list-element checks - confirm
   against access.c */
int CheckedHeaders;
int ListElements;
int OtherListElements;
/* For 'USEMAP' identifier */
Bool HasUseMap;
Bool HasName;
Bool HasMap;
/* For tracking nodes that are deleted from the original parse tree - TRT */
/* Node *access_tree; */
/* NOTE(review): the flags below are undocumented here; by name they look
   like per-table / per-label validation results - confirm against access.c */
Bool HasTH;
Bool HasValidFor;
Bool HasValidId;
Bool HasValidRowHeaders;
Bool HasValidColumnHeaders;
Bool HasInvalidRowHeader;
Bool HasInvalidColumnHeader;
int ForID;
/* List containing map-links
AreaLinks* links;
AreaLinks* start;
AreaLinks* current;
*/
};
/*
Determines which error/warning message should be displayed,
depending on the error code that was called.
Offset accessibility error codes by FIRST_ACCESS_ERR to avoid conflict with
other error codes defined in message.h and used in localize.c.
These accessErrorCodes are used throughout libtidy, and also
have associated localized strings to describe them.
IMPORTANT: to maintain compatibility with TidyMessageFilter3, if you add
or remove keys from this enum, ALSO add/remove the corresponding key
in language.c:tidyErrorFilterKeysStruct[]!
*/
/* Accessibility error codes. Values are assigned sequentially starting at
   CODES_TIDY_ERROR_LAST + 1 (FIRST_ACCESS_ERR) and end with the
   LAST_ACCESS_ERR sentinel, so entries must stay between those two.
   NOTE(review): the bracketed numbers in the leading comments presumably
   identify the accessibility checkpoint each code reports - confirm. */
typedef enum
{
FIRST_ACCESS_ERR = CODES_TIDY_ERROR_LAST + 1, /* must be first */
/* [1.1.1.1] */ IMG_MISSING_ALT,
/* [1.1.1.2] */ IMG_ALT_SUSPICIOUS_FILENAME,
/* [1.1.1.3] */ IMG_ALT_SUSPICIOUS_FILE_SIZE,
/* [1.1.1.4] */ IMG_ALT_SUSPICIOUS_PLACEHOLDER,
/* [1.1.1.10] */ IMG_ALT_SUSPICIOUS_TOO_LONG,
/* [1.1.1.11] */ IMG_MISSING_ALT_BULLET,
/* [1.1.1.12] */ IMG_MISSING_ALT_H_RULE,
/* [1.1.2.1] */ IMG_MISSING_LONGDESC_DLINK,
/* [1.1.2.2] */ IMG_MISSING_DLINK,
/* [1.1.2.3] */ IMG_MISSING_LONGDESC,
/* [1.1.2.5] */ LONGDESC_NOT_REQUIRED,
/* [1.1.3.1] */ IMG_BUTTON_MISSING_ALT,
/* [1.1.4.1] */ APPLET_MISSING_ALT,
/* [1.1.5.1] */ OBJECT_MISSING_ALT,
/* [1.1.6.1] */ AUDIO_MISSING_TEXT_WAV,
/* [1.1.6.2] */ AUDIO_MISSING_TEXT_AU,
/* [1.1.6.3] */ AUDIO_MISSING_TEXT_AIFF,
/* [1.1.6.4] */ AUDIO_MISSING_TEXT_SND,
/* [1.1.6.5] */ AUDIO_MISSING_TEXT_RA,
/* [1.1.6.6] */ AUDIO_MISSING_TEXT_RM,
/* [1.1.8.1] */ FRAME_MISSING_LONGDESC,
/* [1.1.9.1] */ AREA_MISSING_ALT,
/* [1.1.10.1] */ SCRIPT_MISSING_NOSCRIPT,
/* [1.1.12.1] */ ASCII_REQUIRES_DESCRIPTION,
/* [1.2.1.1] */ IMG_MAP_SERVER_REQUIRES_TEXT_LINKS,
/* [1.4.1.1] */ MULTIMEDIA_REQUIRES_TEXT,
/* [1.5.1.1] */ IMG_MAP_CLIENT_MISSING_TEXT_LINKS,
/* [2.1.1.1] */ INFORMATION_NOT_CONVEYED_IMAGE,
/* [2.1.1.2] */ INFORMATION_NOT_CONVEYED_APPLET,
/* [2.1.1.3] */ INFORMATION_NOT_CONVEYED_OBJECT,
/* [2.1.1.4] */ INFORMATION_NOT_CONVEYED_SCRIPT,
/* [2.1.1.5] */ INFORMATION_NOT_CONVEYED_INPUT,
/* [2.2.1.1] */ COLOR_CONTRAST_TEXT,
/* [2.2.1.2] */ COLOR_CONTRAST_LINK,
/* [2.2.1.3] */ COLOR_CONTRAST_ACTIVE_LINK,
/* [2.2.1.4] */ COLOR_CONTRAST_VISITED_LINK,
/* [3.2.1.1] */ DOCTYPE_MISSING,
/* [3.3.1.1] */ STYLE_SHEET_CONTROL_PRESENTATION,
/* [3.5.1.1] */ HEADERS_IMPROPERLY_NESTED,
/* [3.5.2.1] */ POTENTIAL_HEADER_BOLD,
/* [3.5.2.2] */ POTENTIAL_HEADER_ITALICS,
/* [3.5.2.3] */ POTENTIAL_HEADER_UNDERLINE,
/* [3.5.3.1] */ HEADER_USED_FORMAT_TEXT,
/* [3.6.1.1] */ LIST_USAGE_INVALID_UL,
/* [3.6.1.2] */ LIST_USAGE_INVALID_OL,
/* [3.6.1.4] */ LIST_USAGE_INVALID_LI,
/* [4.1.1.1] */ INDICATE_CHANGES_IN_LANGUAGE,
/* [4.3.1.1] */ LANGUAGE_NOT_IDENTIFIED,
/* NOTE(review): the label below repeats [4.3.1.1] from the previous entry -
   confirm whether a distinct number was intended */
/* [4.3.1.1] */ LANGUAGE_INVALID,
/* [5.1.2.1] */ DATA_TABLE_MISSING_HEADERS,
/* [5.1.2.2] */ DATA_TABLE_MISSING_HEADERS_COLUMN,
/* [5.1.2.3] */ DATA_TABLE_MISSING_HEADERS_ROW,
/* [5.2.1.1] */ DATA_TABLE_REQUIRE_MARKUP_COLUMN_HEADERS,
/* [5.2.1.2] */ DATA_TABLE_REQUIRE_MARKUP_ROW_HEADERS,
/* [5.3.1.1] */ LAYOUT_TABLES_LINEARIZE_PROPERLY,
/* [5.4.1.1] */ LAYOUT_TABLE_INVALID_MARKUP,
/* [5.5.1.1] */ TABLE_MISSING_SUMMARY,
/* [5.5.1.2] */ TABLE_SUMMARY_INVALID_NULL,
/* [5.5.1.3] */ TABLE_SUMMARY_INVALID_SPACES,
/* [5.5.1.6] */ TABLE_SUMMARY_INVALID_PLACEHOLDER,
/* [5.5.2.1] */ TABLE_MISSING_CAPTION,
/* [5.6.1.1] */ TABLE_MAY_REQUIRE_HEADER_ABBR,
/* [5.6.1.2] */ TABLE_MAY_REQUIRE_HEADER_ABBR_NULL,
/* [5.6.1.3] */ TABLE_MAY_REQUIRE_HEADER_ABBR_SPACES,
/* [6.1.1.1] */ STYLESHEETS_REQUIRE_TESTING_LINK,
/* [6.1.1.2] */ STYLESHEETS_REQUIRE_TESTING_STYLE_ELEMENT,
/* [6.1.1.3] */ STYLESHEETS_REQUIRE_TESTING_STYLE_ATTR,
/* [6.2.1.1] */ FRAME_SRC_INVALID,
/* [6.2.2.1] */ TEXT_EQUIVALENTS_REQUIRE_UPDATING_APPLET,
/* [6.2.2.2] */ TEXT_EQUIVALENTS_REQUIRE_UPDATING_SCRIPT,
/* [6.2.2.3] */ TEXT_EQUIVALENTS_REQUIRE_UPDATING_OBJECT,
/* [6.3.1.1] */ PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_SCRIPT,
/* [6.3.1.2] */ PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_OBJECT,
/* [6.3.1.3] */ PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_EMBED,
/* [6.3.1.4] */ PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_APPLET,
/* [6.5.1.1] */ FRAME_MISSING_NOFRAMES,
/* [6.5.1.2] */ NOFRAMES_INVALID_NO_VALUE,
/* [6.5.1.3] */ NOFRAMES_INVALID_CONTENT,
/* [6.5.1.4] */ NOFRAMES_INVALID_LINK,
/* [7.1.1.1] */ REMOVE_FLICKER_SCRIPT,
/* [7.1.1.2] */ REMOVE_FLICKER_OBJECT,
/* [7.1.1.3] */ REMOVE_FLICKER_EMBED,
/* [7.1.1.4] */ REMOVE_FLICKER_APPLET,
/* [7.1.1.5] */ REMOVE_FLICKER_ANIMATED_GIF,
/* [7.2.1.1] */ REMOVE_BLINK_MARQUEE,
/* [7.4.1.1] */ REMOVE_AUTO_REFRESH,
/* [7.5.1.1] */ REMOVE_AUTO_REDIRECT,
/* [8.1.1.1] */ ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_SCRIPT,
/* [8.1.1.2] */ ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_OBJECT,
/* [8.1.1.3] */ ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_APPLET,
/* [8.1.1.4] */ ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_EMBED,
/* [9.1.1.1] */ IMAGE_MAP_SERVER_SIDE_REQUIRES_CONVERSION,
/* [9.3.1.1] */ SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_DOWN,
/* [9.3.1.2] */ SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_UP,
/* [9.3.1.3] */ SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_CLICK,
/* [9.3.1.4] */ SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OVER,
/* [9.3.1.5] */ SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OUT,
/* [9.3.1.6] */ SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_MOVE,
/* [10.1.1.1] */ NEW_WINDOWS_REQUIRE_WARNING_NEW,
/* [10.1.1.2] */ NEW_WINDOWS_REQUIRE_WARNING_BLANK,
/* [10.2.1.1] */ LABEL_NEEDS_REPOSITIONING_BEFORE_INPUT,
/* [10.2.1.2] */ LABEL_NEEDS_REPOSITIONING_AFTER_INPUT,
/* [10.4.1.1] */ FORM_CONTROL_REQUIRES_DEFAULT_TEXT,
/* [10.4.1.2] */ FORM_CONTROL_DEFAULT_TEXT_INVALID_NULL,
/* [10.4.1.3] */ FORM_CONTROL_DEFAULT_TEXT_INVALID_SPACES,
/* [11.2.1.1] */ REPLACE_DEPRECATED_HTML_APPLET,
/* [11.2.1.2] */ REPLACE_DEPRECATED_HTML_BASEFONT,
/* [11.2.1.3] */ REPLACE_DEPRECATED_HTML_CENTER,
/* [11.2.1.4] */ REPLACE_DEPRECATED_HTML_DIR,
/* [11.2.1.5] */ REPLACE_DEPRECATED_HTML_FONT,
/* [11.2.1.6] */ REPLACE_DEPRECATED_HTML_ISINDEX,
/* [11.2.1.7] */ REPLACE_DEPRECATED_HTML_MENU,
/* [11.2.1.8] */ REPLACE_DEPRECATED_HTML_S,
/* [11.2.1.9] */ REPLACE_DEPRECATED_HTML_STRIKE,
/* [11.2.1.10] */ REPLACE_DEPRECATED_HTML_U,
/* [12.1.1.1] */ FRAME_MISSING_TITLE,
/* [12.1.1.2] */ FRAME_TITLE_INVALID_NULL,
/* [12.1.1.3] */ FRAME_TITLE_INVALID_SPACES,
/* [12.4.1.1] */ ASSOCIATE_LABELS_EXPLICITLY,
/* [12.4.1.2] */ ASSOCIATE_LABELS_EXPLICITLY_FOR,
/* [12.4.1.3] */ ASSOCIATE_LABELS_EXPLICITLY_ID,
/* [13.1.1.1] */ LINK_TEXT_NOT_MEANINGFUL,
/* [13.1.1.2] */ LINK_TEXT_MISSING,
/* [13.1.1.3] */ LINK_TEXT_TOO_LONG,
/* [13.1.1.4] */ LINK_TEXT_NOT_MEANINGFUL_CLICK_HERE,
/* [13.1.1.5] */ LINK_TEXT_NOT_MEANINGFUL_MORE,
/* [13.1.1.6] */ LINK_TEXT_NOT_MEANINGFUL_FOLLOW_THIS,
/* [13.2.1.1] */ METADATA_MISSING,
/* [13.2.1.2] */ METADATA_MISSING_LINK,
/* [13.2.1.3] */ METADATA_MISSING_REDIRECT_AUTOREFRESH,
/* [13.10.1.1] */ SKIPOVER_ASCII_ART,
LAST_ACCESS_ERR /* must be last */
} accessErrorCodes;
void TY_(AccessibilityHelloMessage)( TidyDocImpl* doc );
void TY_(DisplayHTMLTableAlgorithm)( TidyDocImpl* doc );
/************************************************************
* AccessibilityChecks
*
* Traverses through the individual nodes of the tree
* and checks attributes and elements for accessibility
* after the tree structure has been formed.
************************************************************/
void TY_(AccessibilityChecks)( TidyDocImpl* doc );
#endif /* SUPPORT_ACCESSIBILITY_CHECKS */
#endif /* __ACCESS_H__ */

118
src/alloc.c

@ -0,0 +1,118 @@
/* alloc.c -- Default memory allocation routines.
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/* #define DEBUG_MEMORY very NOISY extra DEBUG of memory allocation, reallocation and free */
#include "tidy.h"
#include "forward.h"
#ifdef DEBUG_MEMORY
#include "sprtf.h"
#endif
/* Optional user-supplied hooks, installed through the tidySet*Call()
   functions in this file. While a hook is NULL the default allocator
   routines fall back to malloc/realloc/free, and defaultPanic prints to
   stderr and exits. */
static TidyMalloc g_malloc = NULL;
static TidyRealloc g_realloc = NULL;
static TidyFree g_free = NULL;
static TidyPanic g_panic = NULL;
/* Installs fmalloc as the allocation hook used by defaultAlloc in place of
 * malloc(). Passing NULL clears the hook, so malloc() is used again.
 * Always returns yes. */
Bool TIDY_CALL tidySetMallocCall( TidyMalloc fmalloc )
{
    g_malloc = fmalloc;

    return yes;
}
/* Installs frealloc as the reallocation hook used by defaultRealloc in
 * place of realloc(). Passing NULL clears the hook, so realloc() is used
 * again. Always returns yes. */
Bool TIDY_CALL tidySetReallocCall( TidyRealloc frealloc )
{
    g_realloc = frealloc;

    return yes;
}
/* Installs ffree as the release hook used by defaultFree in place of
 * free(). Passing NULL clears the hook, so free() is used again.
 * Always returns yes. */
Bool TIDY_CALL tidySetFreeCall( TidyFree ffree )
{
    g_free = ffree;

    return yes;
}
/* Installs fpanic as the handler defaultPanic delegates to. Passing NULL
 * clears the hook, so defaultPanic prints the message to stderr and exits.
 * Always returns yes. */
Bool TIDY_CALL tidySetPanicCall( TidyPanic fpanic )
{
    g_panic = fpanic;

    return yes;
}
/* Fatal-error handler of the default allocator.
 *
 * If a panic hook has been installed, msg is handed to it and control
 * returns to the caller when the hook returns. Otherwise the message is
 * written to stderr and the process exits with status 2 (after an
 * assert(0) in _DEBUG builds). The allocator argument is unused. */
static void TIDY_CALL defaultPanic( TidyAllocator* ARG_UNUSED(allocator), ctmbstr msg )
{
    if ( g_panic != NULL )
    {
        g_panic( msg );
        return;
    }

    fprintf( stderr, "Fatal error: %s\n", msg );
#ifdef _DEBUG
    assert(0);
#endif
    /* 2 signifies a serious error */
    exit(2);
}
/* Allocates size bytes through the installed malloc hook, or malloc() when
 * no hook is set.
 *
 * A NULL result triggers defaultPanic with "Out of memory!". If the panic
 * handler returns, that NULL is passed back to the caller. The SPRTF
 * tracing is compiled only for non-NDEBUG MSVC builds with DEBUG_MEMORY. */
static void* TIDY_CALL defaultAlloc( TidyAllocator* allocator, size_t size )
{
    void *block;

    if ( g_malloc )
        block = g_malloc(size);
    else
        block = malloc(size);

    if ( block == NULL )
        defaultPanic( allocator,"Out of memory!");

#if !defined(NDEBUG) && defined(_MSC_VER) && defined(DEBUG_MEMORY)
    SPRTF("alloc MEM %p, size %d\n", block, (int)size );
    if (size == 0) {
        SPRTF("NOTE: An allocation of ZERO bytes!!!!!!\n");
    }
#endif
    return block;
}
/* Resizes mem to newsize bytes through the installed realloc hook, or
 * realloc() when no hook is set.
 *
 * A NULL mem is forwarded to defaultAlloc. A NULL result triggers
 * defaultPanic with "Out of memory!"; if the panic handler returns, that
 * NULL is passed back to the caller. */
static void* TIDY_CALL defaultRealloc( TidyAllocator* allocator, void* mem, size_t newsize )
{
    void *resized;

    if ( !mem )
        return defaultAlloc( allocator, newsize );

    if ( g_realloc )
        resized = g_realloc(mem, newsize);
    else
        resized = realloc(mem, newsize);

    if ( resized == NULL )
        defaultPanic( allocator, "Out of memory!");

#if !defined(NDEBUG) && defined(_MSC_VER) && defined(DEBUG_MEMORY)
    SPRTF("realloc MEM %p, size %d\n", resized, (int)newsize );
#endif
    return resized;
}
/* Releases mem through the installed free hook, or free() when no hook is
 * set. A NULL mem is ignored (neither the hook nor free() is called).
 * The allocator argument is unused. */
static void TIDY_CALL defaultFree( TidyAllocator* ARG_UNUSED(allocator), void* mem )
{
    if ( mem == NULL )
        return;

#if !defined(NDEBUG) && defined(_MSC_VER) && defined(DEBUG_MEMORY)
    SPRTF("free MEM %p\n", mem );
#endif

    if ( g_free )
    {
        g_free( mem );
    }
    else
    {
        free( mem );
    }
}
/* Function table of the default allocator, listing in order the alloc,
   realloc, free and panic routines defined above. */
static const TidyAllocatorVtbl defaultVtbl = {
defaultAlloc,
defaultRealloc,
defaultFree,
defaultPanic
};
/* The default allocator instance; it points at defaultVtbl, so its
   operations honour the hooks set via the tidySet*Call() functions. */
TidyAllocator TY_(g_default_allocator) = {
&defaultVtbl
};
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

204
src/attrask.c

@ -0,0 +1,204 @@
/* attrask.c -- Interrogate attribute type
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy-int.h"
#include "tidy.h"
#include "attrs.h"
/* Wrapper: returns attrIsHREF() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsHREF( TidyAttr tattr )
{
return attrIsHREF( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsSRC() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsSRC( TidyAttr tattr )
{
return attrIsSRC( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsID() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsID( TidyAttr tattr )
{
return attrIsID( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsNAME() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsNAME( TidyAttr tattr )
{
return attrIsNAME( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsSUMMARY() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsSUMMARY( TidyAttr tattr )
{
return attrIsSUMMARY( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsALT() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsALT( TidyAttr tattr )
{
return attrIsALT( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsLONGDESC() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsLONGDESC( TidyAttr tattr )
{
return attrIsLONGDESC( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsUSEMAP() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsUSEMAP( TidyAttr tattr )
{
return attrIsUSEMAP( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsISMAP() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsISMAP( TidyAttr tattr )
{
return attrIsISMAP( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsLANGUAGE() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsLANGUAGE( TidyAttr tattr )
{
return attrIsLANGUAGE( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsTYPE() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsTYPE( TidyAttr tattr )
{
return attrIsTYPE( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsVALUE() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsVALUE( TidyAttr tattr )
{
return attrIsVALUE( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsCONTENT() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsCONTENT( TidyAttr tattr )
{
return attrIsCONTENT( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsTITLE() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsTITLE( TidyAttr tattr )
{
return attrIsTITLE( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsXMLNS() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsXMLNS( TidyAttr tattr )
{
return attrIsXMLNS( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsDATAFLD() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsDATAFLD( TidyAttr tattr )
{
return attrIsDATAFLD( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsWIDTH() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsWIDTH( TidyAttr tattr )
{
return attrIsWIDTH( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsHEIGHT() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsHEIGHT( TidyAttr tattr )
{
return attrIsHEIGHT( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsFOR() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsFOR( TidyAttr tattr )
{
return attrIsFOR( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsSELECTED() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsSELECTED( TidyAttr tattr )
{
return attrIsSELECTED( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsCHECKED() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsCHECKED( TidyAttr tattr )
{
return attrIsCHECKED( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsLANG() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsLANG( TidyAttr tattr )
{
return attrIsLANG( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsTARGET() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsTARGET( TidyAttr tattr )
{
return attrIsTARGET( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsHTTP_EQUIV() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsHTTP_EQUIV( TidyAttr tattr )
{
return attrIsHTTP_EQUIV( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsREL() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsREL( TidyAttr tattr )
{
return attrIsREL( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns TY_(attrIsEvent)() evaluated on tidyAttrToImpl(tattr).
   Unlike its siblings in this file it calls a TY_()-named function. */
Bool TIDY_CALL tidyAttrIsEvent( TidyAttr tattr )
{
return TY_(attrIsEvent)( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnMOUSEMOVE() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnMOUSEMOVE( TidyAttr tattr )
{
return attrIsOnMOUSEMOVE( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnMOUSEDOWN() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnMOUSEDOWN( TidyAttr tattr )
{
return attrIsOnMOUSEDOWN( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnMOUSEUP() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnMOUSEUP( TidyAttr tattr )
{
return attrIsOnMOUSEUP( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnCLICK() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnCLICK( TidyAttr tattr )
{
return attrIsOnCLICK( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnMOUSEOVER() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnMOUSEOVER( TidyAttr tattr )
{
return attrIsOnMOUSEOVER( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnMOUSEOUT() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnMOUSEOUT( TidyAttr tattr )
{
return attrIsOnMOUSEOUT( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnKEYDOWN() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnKEYDOWN( TidyAttr tattr )
{
return attrIsOnKEYDOWN( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnKEYUP() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnKEYUP( TidyAttr tattr )
{
return attrIsOnKEYUP( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnKEYPRESS() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnKEYPRESS( TidyAttr tattr )
{
return attrIsOnKEYPRESS( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnFOCUS() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnFOCUS( TidyAttr tattr )
{
return attrIsOnFOCUS( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsOnBLUR() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsOnBLUR( TidyAttr tattr )
{
return attrIsOnBLUR( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsBGCOLOR() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsBGCOLOR( TidyAttr tattr )
{
return attrIsBGCOLOR( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsLINK() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsLINK( TidyAttr tattr )
{
return attrIsLINK( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsALINK() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsALINK( TidyAttr tattr )
{
return attrIsALINK( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsVLINK() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsVLINK( TidyAttr tattr )
{
return attrIsVLINK( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsTEXT() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsTEXT( TidyAttr tattr )
{
return attrIsTEXT( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsSTYLE() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsSTYLE( TidyAttr tattr )
{
return attrIsSTYLE( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsABBR() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsABBR( TidyAttr tattr )
{
return attrIsABBR( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsCOLSPAN() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsCOLSPAN( TidyAttr tattr )
{
return attrIsCOLSPAN( tidyAttrToImpl(tattr) );
}
/* Wrapper: returns attrIsROWSPAN() evaluated on tidyAttrToImpl(tattr). */
Bool TIDY_CALL tidyAttrIsROWSPAN( TidyAttr tattr )
{
return attrIsROWSPAN( tidyAttrToImpl(tattr) );
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

3619
src/attrdict.c

File diff suppressed because it is too large

156
src/attrdict.h

@ -0,0 +1,156 @@
#ifndef __ATTRDICT_H__
#define __ATTRDICT_H__
/* attrdict.h -- extended attribute information
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy.h"
/* One entry of the per-element W3CAttrsFor_* tables declared below: an
   attribute id paired with a `versions` value.
   NOTE(review): `versions` is presumably a bitmask of the HTML versions in
   which the attribute is valid for the element - confirm in attrdict.c. */
typedef struct _AttrVersion
{
TidyAttrId attribute;
uint versions;
} AttrVersion;
extern const AttrVersion TY_(W3CAttrsFor_A)[];
extern const AttrVersion TY_(W3CAttrsFor_ABBR)[];
extern const AttrVersion TY_(W3CAttrsFor_ACRONYM)[];
extern const AttrVersion TY_(W3CAttrsFor_ADDRESS)[];
extern const AttrVersion TY_(W3CAttrsFor_APPLET)[];
extern const AttrVersion TY_(W3CAttrsFor_AREA)[];
extern const AttrVersion TY_(W3CAttrsFor_B)[];
extern const AttrVersion TY_(W3CAttrsFor_BASE)[];
extern const AttrVersion TY_(W3CAttrsFor_BASEFONT)[];
extern const AttrVersion TY_(W3CAttrsFor_BDO)[];
extern const AttrVersion TY_(W3CAttrsFor_BIG)[];
extern const AttrVersion TY_(W3CAttrsFor_BLOCKQUOTE)[];
extern const AttrVersion TY_(W3CAttrsFor_BODY)[];
extern const AttrVersion TY_(W3CAttrsFor_BR)[];
extern const AttrVersion TY_(W3CAttrsFor_BUTTON)[];
extern const AttrVersion TY_(W3CAttrsFor_CAPTION)[];
extern const AttrVersion TY_(W3CAttrsFor_CENTER)[];
extern const AttrVersion TY_(W3CAttrsFor_CITE)[];
extern const AttrVersion TY_(W3CAttrsFor_CODE)[];
extern const AttrVersion TY_(W3CAttrsFor_COL)[];
extern const AttrVersion TY_(W3CAttrsFor_COLGROUP)[];
extern const AttrVersion TY_(W3CAttrsFor_DD)[];
extern const AttrVersion TY_(W3CAttrsFor_DEL)[];
extern const AttrVersion TY_(W3CAttrsFor_DFN)[];
extern const AttrVersion TY_(W3CAttrsFor_DIR)[];
extern const AttrVersion TY_(W3CAttrsFor_DIV)[];
extern const AttrVersion TY_(W3CAttrsFor_DL)[];
extern const AttrVersion TY_(W3CAttrsFor_DT)[];
extern const AttrVersion TY_(W3CAttrsFor_EM)[];
extern const AttrVersion TY_(W3CAttrsFor_FIELDSET)[];
extern const AttrVersion TY_(W3CAttrsFor_FONT)[];
extern const AttrVersion TY_(W3CAttrsFor_FORM)[];
extern const AttrVersion TY_(W3CAttrsFor_FRAME)[];
extern const AttrVersion TY_(W3CAttrsFor_FRAMESET)[];
extern const AttrVersion TY_(W3CAttrsFor_H1)[];
extern const AttrVersion TY_(W3CAttrsFor_H2)[];
extern const AttrVersion TY_(W3CAttrsFor_H3)[];
extern const AttrVersion TY_(W3CAttrsFor_H4)[];
extern const AttrVersion TY_(W3CAttrsFor_H5)[];
extern const AttrVersion TY_(W3CAttrsFor_H6)[];
extern const AttrVersion TY_(W3CAttrsFor_HEAD)[];
extern const AttrVersion TY_(W3CAttrsFor_HR)[];
extern const AttrVersion TY_(W3CAttrsFor_HTML)[];
extern const AttrVersion TY_(W3CAttrsFor_I)[];
extern const AttrVersion TY_(W3CAttrsFor_IFRAME)[];
extern const AttrVersion TY_(W3CAttrsFor_IMG)[];
extern const AttrVersion TY_(W3CAttrsFor_INPUT)[];
extern const AttrVersion TY_(W3CAttrsFor_INS)[];
extern const AttrVersion TY_(W3CAttrsFor_ISINDEX)[];
extern const AttrVersion TY_(W3CAttrsFor_KBD)[];
extern const AttrVersion TY_(W3CAttrsFor_LABEL)[];
extern const AttrVersion TY_(W3CAttrsFor_LEGEND)[];
extern const AttrVersion TY_(W3CAttrsFor_LI)[];
extern const AttrVersion TY_(W3CAttrsFor_LINK)[];
extern const AttrVersion TY_(W3CAttrsFor_LISTING)[];
extern const AttrVersion TY_(W3CAttrsFor_MAP)[];
extern const AttrVersion TY_(W3CAttrsFor_MATHML)[]; /* [i_a]2 */
extern const AttrVersion TY_(W3CAttrsFor_MENU)[];
extern const AttrVersion TY_(W3CAttrsFor_META)[];
extern const AttrVersion TY_(W3CAttrsFor_NEXTID)[];
extern const AttrVersion TY_(W3CAttrsFor_NOFRAMES)[];
extern const AttrVersion TY_(W3CAttrsFor_NOSCRIPT)[];
extern const AttrVersion TY_(W3CAttrsFor_OBJECT)[];
extern const AttrVersion TY_(W3CAttrsFor_OL)[];
extern const AttrVersion TY_(W3CAttrsFor_OPTGROUP)[];
extern const AttrVersion TY_(W3CAttrsFor_OPTION)[];
extern const AttrVersion TY_(W3CAttrsFor_P)[];
extern const AttrVersion TY_(W3CAttrsFor_PARAM)[];
extern const AttrVersion TY_(W3CAttrsFor_PICTURE)[]; /* Issue #151 - html5 */
extern const AttrVersion TY_(W3CAttrsFor_PLAINTEXT)[];
extern const AttrVersion TY_(W3CAttrsFor_PRE)[];
extern const AttrVersion TY_(W3CAttrsFor_Q)[];
extern const AttrVersion TY_(W3CAttrsFor_RB)[];
extern const AttrVersion TY_(W3CAttrsFor_RBC)[];
extern const AttrVersion TY_(W3CAttrsFor_RP)[];
extern const AttrVersion TY_(W3CAttrsFor_RT)[];
extern const AttrVersion TY_(W3CAttrsFor_RTC)[];
extern const AttrVersion TY_(W3CAttrsFor_RUBY)[];
extern const AttrVersion TY_(W3CAttrsFor_S)[];
extern const AttrVersion TY_(W3CAttrsFor_SAMP)[];
extern const AttrVersion TY_(W3CAttrsFor_SCRIPT)[];
extern const AttrVersion TY_(W3CAttrsFor_SELECT)[];
extern const AttrVersion TY_(W3CAttrsFor_SMALL)[];
extern const AttrVersion TY_(W3CAttrsFor_SPAN)[];
extern const AttrVersion TY_(W3CAttrsFor_STRIKE)[];
extern const AttrVersion TY_(W3CAttrsFor_STRONG)[];
extern const AttrVersion TY_(W3CAttrsFor_STYLE)[];
extern const AttrVersion TY_(W3CAttrsFor_SUB)[];
extern const AttrVersion TY_(W3CAttrsFor_SUP)[];
extern const AttrVersion TY_(W3CAttrsFor_SVG)[];
extern const AttrVersion TY_(W3CAttrsFor_TABLE)[];
extern const AttrVersion TY_(W3CAttrsFor_TBODY)[];
extern const AttrVersion TY_(W3CAttrsFor_TD)[];
extern const AttrVersion TY_(W3CAttrsFor_TEXTAREA)[];
extern const AttrVersion TY_(W3CAttrsFor_TFOOT)[];
extern const AttrVersion TY_(W3CAttrsFor_TH)[];
extern const AttrVersion TY_(W3CAttrsFor_THEAD)[];
extern const AttrVersion TY_(W3CAttrsFor_TITLE)[];
extern const AttrVersion TY_(W3CAttrsFor_TR)[];
extern const AttrVersion TY_(W3CAttrsFor_TT)[];
extern const AttrVersion TY_(W3CAttrsFor_U)[];
extern const AttrVersion TY_(W3CAttrsFor_UL)[];
extern const AttrVersion TY_(W3CAttrsFor_VAR)[];
extern const AttrVersion TY_(W3CAttrsFor_XMP)[];
extern const AttrVersion TY_(W3CAttrsFor_TRACK)[];
extern const AttrVersion TY_(W3CAttrsFor_SUMMARY)[];
extern const AttrVersion TY_(W3CAttrsFor_FIGCAPTION)[];
extern const AttrVersion TY_(W3CAttrsFor_HGROUP)[];
extern const AttrVersion TY_(W3CAttrsFor_FIGURE)[];
extern const AttrVersion TY_(W3CAttrsFor_ARTICLE)[];
extern const AttrVersion TY_(W3CAttrsFor_ASIDE)[];
extern const AttrVersion TY_(W3CAttrsFor_BDI)[];
extern const AttrVersion TY_(W3CAttrsFor_NAV)[];
extern const AttrVersion TY_(W3CAttrsFor_SECTION)[];
extern const AttrVersion TY_(W3CAttrsFor_FOOTER)[];
extern const AttrVersion TY_(W3CAttrsFor_HEADER)[];
extern const AttrVersion TY_(W3CAttrsFor_DETAILS)[];
extern const AttrVersion TY_(W3CAttrsFor_DIALOG)[];
extern const AttrVersion TY_(W3CAttrsFor_COMMAND)[];
extern const AttrVersion TY_(W3CAttrsFor_MAIN)[];
extern const AttrVersion TY_(W3CAttrsFor_MARK)[];
extern const AttrVersion TY_(W3CAttrsFor_OUTPUT)[];
extern const AttrVersion TY_(W3CAttrsFor_MENUITEM)[];
extern const AttrVersion TY_(W3CAttrsFor_METER)[];
extern const AttrVersion TY_(W3CAttrsFor_PROGRESS)[];
extern const AttrVersion TY_(W3CAttrsFor_TEMPLATE)[];
extern const AttrVersion TY_(W3CAttrsFor_TIME)[];
extern const AttrVersion TY_(W3CAttrsFor_DATALIST)[];
extern const AttrVersion TY_(W3CAttrsFor_AUDIO)[];
extern const AttrVersion TY_(W3CAttrsFor_VIDEO)[];
extern const AttrVersion TY_(W3CAttrsFor_CANVAS)[];
extern const AttrVersion TY_(W3CAttrsFor_SOURCE)[];
extern const AttrVersion TY_(W3CAttrsFor_EMBED)[];
extern const AttrVersion TY_(W3CAttrsFor_KEYGEN)[];
extern const AttrVersion TY_(W3CAttrsFor_WBR)[];
#endif /* __ATTRDICT_H__ */

208
src/attrget.c

@ -0,0 +1,208 @@
/* attrget.c -- Locate attribute value by type
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy-int.h"
#include "tags.h"
#include "attrs.h"
#include "tidy.h"
/* Public accessor: look up the attribute with id attId on tnod by
** delegating to TY_(AttrGetById), and return the result as a TidyAttr.
*/
TidyAttr TIDY_CALL tidyAttrGetById( TidyNode tnod, TidyAttrId attId )
{
    TidyAttr result = tidyImplToAttr( TY_(AttrGetById)( tidyNodeToImpl(tnod), attId ) );
    return result;
}
/* Public accessor: the node's HREF attribute (via attrGetHREF) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetHREF( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetHREF( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's SRC attribute (via attrGetSRC) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetSRC( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetSRC( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's ID attribute (via attrGetID) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetID( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetID( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's NAME attribute (via attrGetNAME) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetNAME( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetNAME( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's SUMMARY attribute (via attrGetSUMMARY) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetSUMMARY( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetSUMMARY( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's ALT attribute (via attrGetALT) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetALT( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetALT( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's LONGDESC attribute (via attrGetLONGDESC) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetLONGDESC( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetLONGDESC( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's USEMAP attribute (via attrGetUSEMAP) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetUSEMAP( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetUSEMAP( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's ISMAP attribute (via attrGetISMAP) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetISMAP( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetISMAP( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's LANGUAGE attribute (via attrGetLANGUAGE) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetLANGUAGE( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetLANGUAGE( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's TYPE attribute (via attrGetTYPE) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetTYPE( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetTYPE( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's VALUE attribute (via attrGetVALUE) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetVALUE( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetVALUE( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's CONTENT attribute (via attrGetCONTENT) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetCONTENT( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetCONTENT( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's TITLE attribute (via attrGetTITLE) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetTITLE( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetTITLE( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's XMLNS attribute (via attrGetXMLNS) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetXMLNS( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetXMLNS( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's DATAFLD attribute (via attrGetDATAFLD) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetDATAFLD( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetDATAFLD( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's WIDTH attribute (via attrGetWIDTH) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetWIDTH( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetWIDTH( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's HEIGHT attribute (via attrGetHEIGHT) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetHEIGHT( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetHEIGHT( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's FOR attribute (via attrGetFOR) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetFOR( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetFOR( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's SELECTED attribute (via attrGetSELECTED) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetSELECTED( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetSELECTED( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's CHECKED attribute (via attrGetCHECKED) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetCHECKED( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetCHECKED( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's LANG attribute (via attrGetLANG) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetLANG( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetLANG( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's TARGET attribute (via attrGetTARGET) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetTARGET( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetTARGET( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's HTTP-EQUIV attribute (via attrGetHTTP_EQUIV) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetHTTP_EQUIV( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetHTTP_EQUIV( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's REL attribute (via attrGetREL) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetREL( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetREL( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onmousemove attribute (via attrGetOnMOUSEMOVE) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnMOUSEMOVE( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnMOUSEMOVE( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onmousedown attribute (via attrGetOnMOUSEDOWN) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnMOUSEDOWN( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnMOUSEDOWN( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onmouseup attribute (via attrGetOnMOUSEUP) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnMOUSEUP( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnMOUSEUP( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onclick attribute (via attrGetOnCLICK) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnCLICK( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnCLICK( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onmouseover attribute (via attrGetOnMOUSEOVER) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnMOUSEOVER( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnMOUSEOVER( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onmouseout attribute (via attrGetOnMOUSEOUT) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnMOUSEOUT( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnMOUSEOUT( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onkeydown attribute (via attrGetOnKEYDOWN) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnKEYDOWN( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnKEYDOWN( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onkeyup attribute (via attrGetOnKEYUP) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnKEYUP( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnKEYUP( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onkeypress attribute (via attrGetOnKEYPRESS) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnKEYPRESS( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnKEYPRESS( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onfocus attribute (via attrGetOnFOCUS) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnFOCUS( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnFOCUS( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's onblur attribute (via attrGetOnBLUR) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetOnBLUR( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetOnBLUR( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's BGCOLOR attribute (via attrGetBGCOLOR) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetBGCOLOR( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetBGCOLOR( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's LINK attribute (via attrGetLINK) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetLINK( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetLINK( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's ALINK attribute (via attrGetALINK) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetALINK( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetALINK( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's VLINK attribute (via attrGetVLINK) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetVLINK( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetVLINK( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's TEXT attribute (via attrGetTEXT) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetTEXT( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetTEXT( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's STYLE attribute (via attrGetSTYLE) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetSTYLE( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetSTYLE( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's ABBR attribute (via attrGetABBR) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetABBR( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetABBR( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's COLSPAN attribute (via attrGetCOLSPAN) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetCOLSPAN( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetCOLSPAN( tidyNodeToImpl(tnod) ) );
    return result;
}
/* Public accessor: the node's ROWSPAN attribute (via attrGetROWSPAN) as a TidyAttr. */
TidyAttr TIDY_CALL tidyAttrGetROWSPAN( TidyNode tnod )
{
    TidyAttr result = tidyImplToAttr( attrGetROWSPAN( tidyNodeToImpl(tnod) ) );
    return result;
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

2315
src/attrs.c

File diff suppressed because it is too large

458
src/attrs.h

@ -0,0 +1,458 @@
#ifndef __ATTRS_H__
#define __ATTRS_H__
/* attrs.h -- recognize HTML attributes
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
/* Signature shared by every attribute-value checker: it receives the
** document, the node carrying the attribute, and the attribute value
** under inspection, and returns nothing.
*/
typedef void (AttrCheck)(TidyDocImpl* doc, Node *node, AttVal *attval);
/* One attribute definition. Entries can be chained through `next`. */
struct _Attribute
{
/* identifier of this attribute (a TidyAttrId value) */
TidyAttrId id;
/* attribute name string */
tmbstr name;
/* checker to run on values of this attribute */
AttrCheck* attrchk;
/* next definition in the chain */
struct _Attribute* next;
};
/*
Anchor/Node linked list.
Each entry pairs an anchor name with a node and links to the next entry.
*/
struct _Anchor
{
/* next entry in the list */
struct _Anchor *next;
/* node associated with this anchor */
Node *node;
/* anchor name */
char *name;
};
typedef struct _Anchor Anchor;
/* ATTRIBUTE_HASH_LOOKUP defaults to 1 unless the build has already defined
** it; when it evaluates to 0 the hash types below are compiled out.
*/
#if !defined(ATTRIBUTE_HASH_LOOKUP)
#define ATTRIBUTE_HASH_LOOKUP 1
#endif
#if ATTRIBUTE_HASH_LOOKUP
/* Number of slots in the TidyAttribImpl.hashtab array. */
enum
{
ATTRIBUTE_HASH_SIZE=178u
};
/* One entry of a hash chain: a read-only attribute definition plus a link
** to the next entry.
*/
struct _AttrHash
{
Attribute const* attr;
struct _AttrHash* next;
};
typedef struct _AttrHash AttrHash;
#endif
/* Number of slots in the TidyAttribImpl.anchor_hash array. */
enum
{
ANCHOR_HASH_SIZE=1021u
};
/* Attribute-related state: the anchor table, the list of declared
** attributes and, when ATTRIBUTE_HASH_LOOKUP is enabled, the attribute
** hash table.
*/
struct _TidyAttribImpl
{
/* anchor/node lookup */
Anchor* anchor_hash[ANCHOR_HASH_SIZE];
/* Declared literal attributes */
Attribute* declared_attr_list;
#if ATTRIBUTE_HASH_LOOKUP
/* attribute hash chains, ATTRIBUTE_HASH_SIZE slots */
AttrHash* hashtab[ATTRIBUTE_HASH_SIZE];
#endif
};
typedef struct _TidyAttribImpl TidyAttribImpl;
#define XHTML_NAMESPACE "http://www.w3.org/1999/xhtml"
AttrCheck TY_(CheckUrl);
/* public method for finding attribute definition by name */
const Attribute* TY_(CheckAttribute)( TidyDocImpl* doc, Node *node, AttVal *attval );
const Attribute* TY_(FindAttribute)( TidyDocImpl* doc, AttVal *attval );
AttVal* TY_(GetAttrByName)( Node *node, ctmbstr name );
void TY_(DropAttrByName)( TidyDocImpl* doc, Node *node, ctmbstr name );
AttVal* TY_(AddAttribute)( TidyDocImpl* doc,
Node *node, ctmbstr name, ctmbstr value );
AttVal* TY_(RepairAttrValue)(TidyDocImpl* doc, Node* node, ctmbstr name, ctmbstr value);
Bool TY_(IsUrl)( TidyDocImpl* doc, ctmbstr attrname );
/* Bool IsBool( TidyDocImpl* doc, ctmbstr attrname ); */
Bool TY_(IsScript)( TidyDocImpl* doc, ctmbstr attrname );
/* may id or name serve as anchor? */
Bool TY_(IsAnchorElement)( TidyDocImpl* doc, Node* node );
/*
In CSS1, selectors can contain only the characters A-Z, 0-9, and
Unicode characters 161-255, plus dash (-); they cannot start with
a dash or a digit; they can also contain escaped characters and any
Unicode character as a numeric code (see next item).
The backslash followed by at most four hexadecimal digits (0..9A..F)
stands for the Unicode character with that number.
Any character except a hexadecimal digit can be escaped to remove its
special meaning, by putting a backslash in front.
#508936 - CSS class naming for -clean option
*/
Bool TY_(IsCSS1Selector)( ctmbstr buf );
Bool TY_(IsValidHTMLID)(ctmbstr id);
Bool TY_(IsValidXMLID)(ctmbstr id);
/* removes anchor for specific node */
void TY_(RemoveAnchorByNode)( TidyDocImpl* doc, ctmbstr name, Node *node );
/* free all anchors */
void TY_(FreeAnchors)( TidyDocImpl* doc );
/* public methods for initializing/freeing attribute dictionary */
void TY_(InitAttrs)( TidyDocImpl* doc );
void TY_(FreeAttrTable)( TidyDocImpl* doc );
void TY_(AppendToClassAttr)( TidyDocImpl* doc, AttVal *classattr, ctmbstr classname );
/*
the same attribute name can't be used
more than once in each element
*/
void TY_(RepairDuplicateAttributes)( TidyDocImpl* doc, Node* node, Bool isXml );
void TY_(SortAttributes)(Node* node, TidyAttrSortStrategy strat);
Bool TY_(IsBoolAttribute)( AttVal* attval );
Bool TY_(attrIsEvent)( AttVal* attval );
AttVal* TY_(AttrGetById)( Node* node, TidyAttrId id );
uint TY_(NodeAttributeVersions)( Node* node, TidyAttrId id );
Bool TY_(AttributeIsProprietary)(Node* node, AttVal* attval);
Bool TY_(AttributeIsMismatched)(Node* node, AttVal* attval, TidyDocImpl* doc);
/* Helper macros over an attribute value (AttVal*).
** Every macro argument is parenthesized in the expansion, and every
** expansion is itself parenthesized, so expression arguments such as
** `cond ? a : b` or `x + 1` expand correctly.
** NOTE: arguments are evaluated more than once; do not pass expressions
** with side effects.
*/
/* 0 == TidyAttr_UNKNOWN */
#define AttrId(av) ((av) && (av)->dict ? (av)->dict->id : TidyAttr_UNKNOWN)
#define AttrIsId(av, atid) ((av) && (av)->dict && ((av)->dict->id == (atid)))
#define AttrHasValue(attr) ((attr) && (attr)->value)
#define AttrValueIs(attr, val) (AttrHasValue(attr) && \
                                TY_(tmbstrcasecmp)((attr)->value, (val)) == 0)
#define AttrContains(attr, val) (AttrHasValue(attr) && \
                                 TY_(tmbsubstr)((attr)->value, (val)) != NULL)
#define AttrVersions(attr) ((attr) && (attr)->dict ? (attr)->dict->versions : VERS_PROPRIETARY)
/* True only when both values have a dictionary entry with the same
** non-zero id.
*/
#define AttrsHaveSameId(a, b) ((a) && (b) && (a)->dict && (b)->dict && (a)->dict->id && \
                               (b)->dict->id && (a)->dict->id == (b)->dict->id)
/* attrIsXXX(av) predicates (this macro and the ones that follow):
** each expands to AttrIsId( av, TidyAttr_XXX ), i.e. it tests whether the
** attribute value av has a dictionary entry whose id is TidyAttr_XXX.
** AttrIsId checks av and av->dict before dereferencing them.
*/
#define attrIsABBR(av) AttrIsId( av, TidyAttr_ABBR )
#define attrIsACCEPT(av) AttrIsId( av, TidyAttr_ACCEPT )
#define attrIsACCEPT_CHARSET(av) AttrIsId( av, TidyAttr_ACCEPT_CHARSET )
#define attrIsACCESSKEY(av) AttrIsId( av, TidyAttr_ACCESSKEY )
#define attrIsACTION(av) AttrIsId( av, TidyAttr_ACTION )
#define attrIsADD_DATE(av) AttrIsId( av, TidyAttr_ADD_DATE )
#define attrIsALIGN(av) AttrIsId( av, TidyAttr_ALIGN )
#define attrIsALINK(av) AttrIsId( av, TidyAttr_ALINK )
#define attrIsALT(av) AttrIsId( av, TidyAttr_ALT )
#define attrIsARCHIVE(av) AttrIsId( av, TidyAttr_ARCHIVE )
#define attrIsAXIS(av) AttrIsId( av, TidyAttr_AXIS )
#define attrIsBACKGROUND(av) AttrIsId( av, TidyAttr_BACKGROUND )
#define attrIsBGCOLOR(av) AttrIsId( av, TidyAttr_BGCOLOR )
#define attrIsBGPROPERTIES(av) AttrIsId( av, TidyAttr_BGPROPERTIES )
#define attrIsBORDER(av) AttrIsId( av, TidyAttr_BORDER )
#define attrIsBORDERCOLOR(av) AttrIsId( av, TidyAttr_BORDERCOLOR )
#define attrIsBOTTOMMARGIN(av) AttrIsId( av, TidyAttr_BOTTOMMARGIN )
#define attrIsCELLPADDING(av) AttrIsId( av, TidyAttr_CELLPADDING )
#define attrIsCELLSPACING(av) AttrIsId( av, TidyAttr_CELLSPACING )
#define attrIsCHAR(av) AttrIsId( av, TidyAttr_CHAR )
#define attrIsCHAROFF(av) AttrIsId( av, TidyAttr_CHAROFF )
#define attrIsCHARSET(av) AttrIsId( av, TidyAttr_CHARSET )
#define attrIsCHECKED(av) AttrIsId( av, TidyAttr_CHECKED )
#define attrIsCITE(av) AttrIsId( av, TidyAttr_CITE )
#define attrIsCLASS(av) AttrIsId( av, TidyAttr_CLASS )
#define attrIsCLASSID(av) AttrIsId( av, TidyAttr_CLASSID )
#define attrIsCLEAR(av) AttrIsId( av, TidyAttr_CLEAR )
#define attrIsCODE(av) AttrIsId( av, TidyAttr_CODE )
#define attrIsCODEBASE(av) AttrIsId( av, TidyAttr_CODEBASE )
#define attrIsCODETYPE(av) AttrIsId( av, TidyAttr_CODETYPE )
#define attrIsCOLOR(av) AttrIsId( av, TidyAttr_COLOR )
#define attrIsCOLS(av) AttrIsId( av, TidyAttr_COLS )
#define attrIsCOLSPAN(av) AttrIsId( av, TidyAttr_COLSPAN )
#define attrIsCOMPACT(av) AttrIsId( av, TidyAttr_COMPACT )
#define attrIsCONTENT(av) AttrIsId( av, TidyAttr_CONTENT )
#define attrIsCOORDS(av) AttrIsId( av, TidyAttr_COORDS )
#define attrIsDATA(av) AttrIsId( av, TidyAttr_DATA )
#define attrIsDATAFLD(av) AttrIsId( av, TidyAttr_DATAFLD )
#define attrIsDATAFORMATAS(av) AttrIsId( av, TidyAttr_DATAFORMATAS )
#define attrIsDATAPAGESIZE(av) AttrIsId( av, TidyAttr_DATAPAGESIZE )
#define attrIsDATASRC(av) AttrIsId( av, TidyAttr_DATASRC )
#define attrIsDATETIME(av) AttrIsId( av, TidyAttr_DATETIME )
#define attrIsDECLARE(av) AttrIsId( av, TidyAttr_DECLARE )
#define attrIsDEFER(av) AttrIsId( av, TidyAttr_DEFER )
#define attrIsDIR(av) AttrIsId( av, TidyAttr_DIR )
#define attrIsDISABLED(av) AttrIsId( av, TidyAttr_DISABLED )
#define attrIsENCODING(av) AttrIsId( av, TidyAttr_ENCODING )
#define attrIsENCTYPE(av) AttrIsId( av, TidyAttr_ENCTYPE )
#define attrIsFACE(av) AttrIsId( av, TidyAttr_FACE )
#define attrIsFOR(av) AttrIsId( av, TidyAttr_FOR )
#define attrIsFRAME(av) AttrIsId( av, TidyAttr_FRAME )
#define attrIsFRAMEBORDER(av) AttrIsId( av, TidyAttr_FRAMEBORDER )
#define attrIsFRAMESPACING(av) AttrIsId( av, TidyAttr_FRAMESPACING )
#define attrIsGRIDX(av) AttrIsId( av, TidyAttr_GRIDX )
#define attrIsGRIDY(av) AttrIsId( av, TidyAttr_GRIDY )
#define attrIsHEADERS(av) AttrIsId( av, TidyAttr_HEADERS )
#define attrIsHEIGHT(av) AttrIsId( av, TidyAttr_HEIGHT )
#define attrIsHREF(av) AttrIsId( av, TidyAttr_HREF )
#define attrIsHREFLANG(av) AttrIsId( av, TidyAttr_HREFLANG )
#define attrIsHSPACE(av) AttrIsId( av, TidyAttr_HSPACE )
#define attrIsHTTP_EQUIV(av) AttrIsId( av, TidyAttr_HTTP_EQUIV )
#define attrIsID(av) AttrIsId( av, TidyAttr_ID )
#define attrIsISMAP(av) AttrIsId( av, TidyAttr_ISMAP )
#define attrIsITEMID(av) AttrIsId( av, TidyAttr_ITEMID )
#define attrIsITEMPROP(av) AttrIsId( av, TidyAttr_ITEMPROP )
#define attrIsITEMREF(av) AttrIsId( av, TidyAttr_ITEMREF )
#define attrIsITEMSCOPE(av) AttrIsId( av, TidyAttr_ITEMSCOPE )
#define attrIsITEMTYPE(av) AttrIsId( av, TidyAttr_ITEMTYPE )
#define attrIsLABEL(av) AttrIsId( av, TidyAttr_LABEL )
#define attrIsLANG(av) AttrIsId( av, TidyAttr_LANG )
#define attrIsLANGUAGE(av) AttrIsId( av, TidyAttr_LANGUAGE )
#define attrIsLAST_MODIFIED(av) AttrIsId( av, TidyAttr_LAST_MODIFIED )
#define attrIsLAST_VISIT(av) AttrIsId( av, TidyAttr_LAST_VISIT )
#define attrIsLEFTMARGIN(av) AttrIsId( av, TidyAttr_LEFTMARGIN )
#define attrIsLINK(av) AttrIsId( av, TidyAttr_LINK )
#define attrIsLONGDESC(av) AttrIsId( av, TidyAttr_LONGDESC )
#define attrIsLOWSRC(av) AttrIsId( av, TidyAttr_LOWSRC )
#define attrIsMARGINHEIGHT(av) AttrIsId( av, TidyAttr_MARGINHEIGHT )
#define attrIsMARGINWIDTH(av) AttrIsId( av, TidyAttr_MARGINWIDTH )
#define attrIsMAXLENGTH(av) AttrIsId( av, TidyAttr_MAXLENGTH )
#define attrIsMEDIA(av) AttrIsId( av, TidyAttr_MEDIA )
#define attrIsMETHOD(av) AttrIsId( av, TidyAttr_METHOD )
#define attrIsMULTIPLE(av) AttrIsId( av, TidyAttr_MULTIPLE )
#define attrIsNAME(av) AttrIsId( av, TidyAttr_NAME )
#define attrIsNOHREF(av) AttrIsId( av, TidyAttr_NOHREF )
#define attrIsNORESIZE(av) AttrIsId( av, TidyAttr_NORESIZE )
#define attrIsNOSHADE(av) AttrIsId( av, TidyAttr_NOSHADE )
#define attrIsNOWRAP(av) AttrIsId( av, TidyAttr_NOWRAP )
#define attrIsOBJECT(av) AttrIsId( av, TidyAttr_OBJECT )
#define attrIsOnAFTERUPDATE(av) AttrIsId( av, TidyAttr_OnAFTERUPDATE )
#define attrIsOnBEFOREUNLOAD(av) AttrIsId( av, TidyAttr_OnBEFOREUNLOAD )
#define attrIsOnBEFOREUPDATE(av) AttrIsId( av, TidyAttr_OnBEFOREUPDATE )
#define attrIsOnBLUR(av) AttrIsId( av, TidyAttr_OnBLUR )
#define attrIsOnCHANGE(av) AttrIsId( av, TidyAttr_OnCHANGE )
#define attrIsOnCLICK(av) AttrIsId( av, TidyAttr_OnCLICK )
#define attrIsOnDATAAVAILABLE(av) AttrIsId( av, TidyAttr_OnDATAAVAILABLE )
#define attrIsOnDATASETCHANGED(av) AttrIsId( av, TidyAttr_OnDATASETCHANGED )
#define attrIsOnDATASETCOMPLETE(av) AttrIsId( av, TidyAttr_OnDATASETCOMPLETE )
#define attrIsOnDBLCLICK(av) AttrIsId( av, TidyAttr_OnDBLCLICK )
#define attrIsOnERRORUPDATE(av) AttrIsId( av, TidyAttr_OnERRORUPDATE )
#define attrIsOnFOCUS(av) AttrIsId( av, TidyAttr_OnFOCUS )
#define attrIsOnKEYDOWN(av) AttrIsId( av, TidyAttr_OnKEYDOWN )
#define attrIsOnKEYPRESS(av) AttrIsId( av, TidyAttr_OnKEYPRESS )
#define attrIsOnKEYUP(av) AttrIsId( av, TidyAttr_OnKEYUP )
#define attrIsOnLOAD(av) AttrIsId( av, TidyAttr_OnLOAD )
#define attrIsOnMOUSEDOWN(av) AttrIsId( av, TidyAttr_OnMOUSEDOWN )
#define attrIsOnMOUSEMOVE(av) AttrIsId( av, TidyAttr_OnMOUSEMOVE )
#define attrIsOnMOUSEOUT(av) AttrIsId( av, TidyAttr_OnMOUSEOUT )
#define attrIsOnMOUSEOVER(av) AttrIsId( av, TidyAttr_OnMOUSEOVER )
#define attrIsOnMOUSEUP(av) AttrIsId( av, TidyAttr_OnMOUSEUP )
#define attrIsOnRESET(av) AttrIsId( av, TidyAttr_OnRESET )
#define attrIsOnROWENTER(av) AttrIsId( av, TidyAttr_OnROWENTER )
#define attrIsOnROWEXIT(av) AttrIsId( av, TidyAttr_OnROWEXIT )
#define attrIsOnSELECT(av) AttrIsId( av, TidyAttr_OnSELECT )
#define attrIsOnSUBMIT(av) AttrIsId( av, TidyAttr_OnSUBMIT )
#define attrIsOnUNLOAD(av) AttrIsId( av, TidyAttr_OnUNLOAD )
#define attrIsPROFILE(av) AttrIsId( av, TidyAttr_PROFILE )
#define attrIsPROMPT(av) AttrIsId( av, TidyAttr_PROMPT )
#define attrIsRBSPAN(av) AttrIsId( av, TidyAttr_RBSPAN )
#define attrIsREADONLY(av) AttrIsId( av, TidyAttr_READONLY )
#define attrIsREL(av) AttrIsId( av, TidyAttr_REL )
#define attrIsREV(av) AttrIsId( av, TidyAttr_REV )
#define attrIsRIGHTMARGIN(av) AttrIsId( av, TidyAttr_RIGHTMARGIN )
#define attrIsROLE(av) AttrIsId( av, TidyAttr_ROLE )
#define attrIsROWS(av) AttrIsId( av, TidyAttr_ROWS )
#define attrIsROWSPAN(av) AttrIsId( av, TidyAttr_ROWSPAN )
#define attrIsRULES(av) AttrIsId( av, TidyAttr_RULES )
#define attrIsSCHEME(av) AttrIsId( av, TidyAttr_SCHEME )
#define attrIsSCOPE(av) AttrIsId( av, TidyAttr_SCOPE )
#define attrIsSCROLLING(av) AttrIsId( av, TidyAttr_SCROLLING )
#define attrIsSELECTED(av) AttrIsId( av, TidyAttr_SELECTED )
#define attrIsSHAPE(av) AttrIsId( av, TidyAttr_SHAPE )
#define attrIsSHOWGRID(av) AttrIsId( av, TidyAttr_SHOWGRID )
#define attrIsSHOWGRIDX(av) AttrIsId( av, TidyAttr_SHOWGRIDX )
#define attrIsSHOWGRIDY(av) AttrIsId( av, TidyAttr_SHOWGRIDY )
#define attrIsSIZE(av) AttrIsId( av, TidyAttr_SIZE )
#define attrIsSPAN(av) AttrIsId( av, TidyAttr_SPAN )
#define attrIsSRC(av) AttrIsId( av, TidyAttr_SRC )
#define attrIsSTANDBY(av) AttrIsId( av, TidyAttr_STANDBY )
#define attrIsSTART(av) AttrIsId( av, TidyAttr_START )
#define attrIsSTYLE(av) AttrIsId( av, TidyAttr_STYLE )
#define attrIsSUMMARY(av) AttrIsId( av, TidyAttr_SUMMARY )
#define attrIsTABINDEX(av) AttrIsId( av, TidyAttr_TABINDEX )
#define attrIsTARGET(av) AttrIsId( av, TidyAttr_TARGET )
#define attrIsTEXT(av) AttrIsId( av, TidyAttr_TEXT )
#define attrIsTITLE(av) AttrIsId( av, TidyAttr_TITLE )
#define attrIsTOPMARGIN(av) AttrIsId( av, TidyAttr_TOPMARGIN )
#define attrIsTYPE(av) AttrIsId( av, TidyAttr_TYPE )
#define attrIsUSEMAP(av) AttrIsId( av, TidyAttr_USEMAP )
#define attrIsVALIGN(av) AttrIsId( av, TidyAttr_VALIGN )
#define attrIsVALUE(av) AttrIsId( av, TidyAttr_VALUE )
#define attrIsVALUETYPE(av) AttrIsId( av, TidyAttr_VALUETYPE )
#define attrIsVERSION(av) AttrIsId( av, TidyAttr_VERSION )
#define attrIsVLINK(av) AttrIsId( av, TidyAttr_VLINK )
#define attrIsVSPACE(av) AttrIsId( av, TidyAttr_VSPACE )
#define attrIsWIDTH(av) AttrIsId( av, TidyAttr_WIDTH )
#define attrIsWRAP(av) AttrIsId( av, TidyAttr_WRAP )
#define attrIsXMLNS(av) AttrIsId( av, TidyAttr_XMLNS )
#define attrIsXML_LANG(av) AttrIsId( av, TidyAttr_XML_LANG )
#define attrIsXML_SPACE(av) AttrIsId( av, TidyAttr_XML_SPACE )
#define attrIsARIA_ACTIVEDESCENDANT(av) AttrIsId( av, TidyAttr_ARIA_ACTIVEDESCENDANT )
#define attrIsARIA_ATOMIC(av) AttrIsId( av, TidyAttr_ARIA_ATOMIC )
#define attrIsARIA_AUTOCOMPLETE(av) AttrIsId( av, TidyAttr_ARIA_AUTOCOMPLETE )
#define attrIsARIA_BUSY(av) AttrIsId( av, TidyAttr_ARIA_BUSY )
#define attrIsARIA_CHECKED(av) AttrIsId( av, TidyAttr_ARIA_CHECKED )
#define attrIsARIA_CONTROLS(av) AttrIsId( av, TidyAttr_ARIA_CONTROLS )
#define attrIsARIA_DESCRIBEDBY(av) AttrIsId( av, TidyAttr_ARIA_DESCRIBEDBY )
#define attrIsARIA_DISABLED(av) AttrIsId( av, TidyAttr_ARIA_DISABLED )
#define attrIsARIA_DROPEFFECT(av) AttrIsId( av, TidyAttr_ARIA_DROPEFFECT )
#define attrIsARIA_EXPANDED(av) AttrIsId( av, TidyAttr_ARIA_EXPANDED )
#define attrIsARIA_FLOWTO(av) AttrIsId( av, TidyAttr_ARIA_FLOWTO )
#define attrIsARIA_GRABBED(av) AttrIsId( av, TidyAttr_ARIA_GRABBED )
#define attrIsARIA_HASPOPUP(av) AttrIsId( av, TidyAttr_ARIA_HASPOPUP )
#define attrIsARIA_HIDDEN(av) AttrIsId( av, TidyAttr_ARIA_HIDDEN )
#define attrIsARIA_INVALID(av) AttrIsId( av, TidyAttr_ARIA_INVALID )
#define attrIsARIA_LABEL(av) AttrIsId( av, TidyAttr_ARIA_LABEL )
#define attrIsARIA_LABELLEDBY(av) AttrIsId( av, TidyAttr_ARIA_LABELLEDBY )
#define attrIsARIA_LEVEL(av) AttrIsId( av, TidyAttr_ARIA_LEVEL )
#define attrIsARIA_LIVE(av) AttrIsId( av, TidyAttr_ARIA_LIVE )
#define attrIsARIA_MULTILINE(av) AttrIsId( av, TidyAttr_ARIA_MULTILINE )
#define attrIsARIA_MULTISELECTABLE(av) AttrIsId( av, TidyAttr_ARIA_MULTISELECTABLE )
#define attrIsARIA_ORIENTATION(av) AttrIsId( av, TidyAttr_ARIA_ORIENTATION )
#define attrIsARIA_OWNS(av) AttrIsId( av, TidyAttr_ARIA_OWNS )
#define attrIsARIA_POSINSET(av) AttrIsId( av, TidyAttr_ARIA_POSINSET )
#define attrIsARIA_PRESSED(av) AttrIsId( av, TidyAttr_ARIA_PRESSED )
#define attrIsARIA_READONLY(av) AttrIsId( av, TidyAttr_ARIA_READONLY )
#define attrIsARIA_RELEVANT(av) AttrIsId( av, TidyAttr_ARIA_RELEVANT )
#define attrIsARIA_REQUIRED(av) AttrIsId( av, TidyAttr_ARIA_REQUIRED )
#define attrIsARIA_SELECTED(av) AttrIsId( av, TidyAttr_ARIA_SELECTED )
#define attrIsARIA_SETSIZE(av) AttrIsId( av, TidyAttr_ARIA_SETSIZE )
#define attrIsARIA_SORT(av) AttrIsId( av, TidyAttr_ARIA_SORT )
#define attrIsARIA_VALUEMAX(av) AttrIsId( av, TidyAttr_ARIA_VALUEMAX )
#define attrIsARIA_VALUEMIN(av) AttrIsId( av, TidyAttr_ARIA_VALUEMIN )
#define attrIsARIA_VALUENOW(av) AttrIsId( av, TidyAttr_ARIA_VALUENOW )
#define attrIsARIA_VALUETEXT(av) AttrIsId( av, TidyAttr_ARIA_VALUETEXT )
/* Attribute Retrieval macros
** attrGetXXX( nod ) expands to TY_(AttrGetById)( nod, TidyAttr_XXX ):
** it looks up the attribute with id TidyAttr_XXX on node nod and yields
** whatever TY_(AttrGetById) returns (an AttVal*).
*/
#define attrGetHREF( nod ) TY_(AttrGetById)( nod, TidyAttr_HREF )
#define attrGetSRC( nod ) TY_(AttrGetById)( nod, TidyAttr_SRC )
#define attrGetID( nod ) TY_(AttrGetById)( nod, TidyAttr_ID )
#define attrGetNAME( nod ) TY_(AttrGetById)( nod, TidyAttr_NAME )
#define attrGetSUMMARY( nod ) TY_(AttrGetById)( nod, TidyAttr_SUMMARY )
#define attrGetALT( nod ) TY_(AttrGetById)( nod, TidyAttr_ALT )
#define attrGetLONGDESC( nod ) TY_(AttrGetById)( nod, TidyAttr_LONGDESC )
#define attrGetUSEMAP( nod ) TY_(AttrGetById)( nod, TidyAttr_USEMAP )
#define attrGetISMAP( nod ) TY_(AttrGetById)( nod, TidyAttr_ISMAP )
#define attrGetLANGUAGE( nod ) TY_(AttrGetById)( nod, TidyAttr_LANGUAGE )
#define attrGetTYPE( nod ) TY_(AttrGetById)( nod, TidyAttr_TYPE )
#define attrGetVALUE( nod ) TY_(AttrGetById)( nod, TidyAttr_VALUE )
#define attrGetCONTENT( nod ) TY_(AttrGetById)( nod, TidyAttr_CONTENT )
#define attrGetTITLE( nod ) TY_(AttrGetById)( nod, TidyAttr_TITLE )
#define attrGetXMLNS( nod ) TY_(AttrGetById)( nod, TidyAttr_XMLNS )
#define attrGetDATAFLD( nod ) TY_(AttrGetById)( nod, TidyAttr_DATAFLD )
#define attrGetWIDTH( nod ) TY_(AttrGetById)( nod, TidyAttr_WIDTH )
#define attrGetHEIGHT( nod ) TY_(AttrGetById)( nod, TidyAttr_HEIGHT )
#define attrGetFOR( nod ) TY_(AttrGetById)( nod, TidyAttr_FOR )
#define attrGetSELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_SELECTED )
#define attrGetCHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_CHECKED )
#define attrGetLANG( nod ) TY_(AttrGetById)( nod, TidyAttr_LANG )
#define attrGetTARGET( nod ) TY_(AttrGetById)( nod, TidyAttr_TARGET )
#define attrGetHTTP_EQUIV( nod ) TY_(AttrGetById)( nod, TidyAttr_HTTP_EQUIV )
#define attrGetREL( nod ) TY_(AttrGetById)( nod, TidyAttr_REL )
#define attrGetOnMOUSEMOVE( nod ) TY_(AttrGetById)( nod, TidyAttr_OnMOUSEMOVE )
#define attrGetOnMOUSEDOWN( nod ) TY_(AttrGetById)( nod, TidyAttr_OnMOUSEDOWN )
#define attrGetOnMOUSEUP( nod ) TY_(AttrGetById)( nod, TidyAttr_OnMOUSEUP )
#define attrGetOnCLICK( nod ) TY_(AttrGetById)( nod, TidyAttr_OnCLICK )
#define attrGetOnMOUSEOVER( nod ) TY_(AttrGetById)( nod, TidyAttr_OnMOUSEOVER )
#define attrGetOnMOUSEOUT( nod ) TY_(AttrGetById)( nod, TidyAttr_OnMOUSEOUT )
#define attrGetOnKEYDOWN( nod ) TY_(AttrGetById)( nod, TidyAttr_OnKEYDOWN )
#define attrGetOnKEYUP( nod ) TY_(AttrGetById)( nod, TidyAttr_OnKEYUP )
#define attrGetOnKEYPRESS( nod ) TY_(AttrGetById)( nod, TidyAttr_OnKEYPRESS )
#define attrGetOnFOCUS( nod ) TY_(AttrGetById)( nod, TidyAttr_OnFOCUS )
#define attrGetOnBLUR( nod ) TY_(AttrGetById)( nod, TidyAttr_OnBLUR )
#define attrGetBGCOLOR( nod ) TY_(AttrGetById)( nod, TidyAttr_BGCOLOR )
#define attrGetLINK( nod ) TY_(AttrGetById)( nod, TidyAttr_LINK )
#define attrGetALINK( nod ) TY_(AttrGetById)( nod, TidyAttr_ALINK )
#define attrGetVLINK( nod ) TY_(AttrGetById)( nod, TidyAttr_VLINK )
#define attrGetTEXT( nod ) TY_(AttrGetById)( nod, TidyAttr_TEXT )
#define attrGetSTYLE( nod ) TY_(AttrGetById)( nod, TidyAttr_STYLE )
#define attrGetABBR( nod ) TY_(AttrGetById)( nod, TidyAttr_ABBR )
#define attrGetCOLSPAN( nod ) TY_(AttrGetById)( nod, TidyAttr_COLSPAN )
#define attrGetFONT( nod ) TY_(AttrGetById)( nod, TidyAttr_FONT )
#define attrGetBASEFONT( nod ) TY_(AttrGetById)( nod, TidyAttr_BASEFONT )
#define attrGetROWSPAN( nod ) TY_(AttrGetById)( nod, TidyAttr_ROWSPAN )
#define attrGetROLE( nod ) TY_(AttrGetById)( nod, TidyAttr_ROLE )
#define attrGetARIA_ACTIVEDESCENDANT( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_ACTIVEDESCENDANT )
#define attrGetARIA_ATOMIC( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_ATOMIC )
#define attrGetARIA_AUTOCOMPLETE( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_AUTOCOMPLETE )
#define attrGetARIA_BUSY( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_BUSY )
#define attrGetARIA_CHECKED( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_CHECKED )
#define attrGetARIA_CONTROLS( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_CONTROLS )
#define attrGetARIA_DESCRIBEDBY( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_DESCRIBEDBY )
#define attrGetARIA_DISABLED( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_DISABLED )
#define attrGetARIA_DROPEFFECT( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_DROPEFFECT )
#define attrGetARIA_EXPANDED( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_EXPANDED )
#define attrGetARIA_FLOWTO( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_FLOWTO )
#define attrGetARIA_GRABBED( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_GRABBED )
#define attrGetARIA_HASPOPUP( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_HASPOPUP )
#define attrGetARIA_HIDDEN( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_HIDDEN )
#define attrGetARIA_INVALID( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_INVALID )
#define attrGetARIA_LABEL( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_LABEL )
#define attrGetARIA_LABELLEDBY( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_LABELLEDBY )
#define attrGetARIA_LEVEL( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_LEVEL )
#define attrGetARIA_LIVE( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_LIVE )
#define attrGetARIA_MULTILINE( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_MULTILINE )
#define attrGetARIA_MULTISELECTABLE( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_MULTISELECTABLE )
#define attrGetARIA_ORIENTATION( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_ORIENTATION )
#define attrGetARIA_OWNS( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_OWNS )
#define attrGetARIA_POSINSET( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_POSINSET )
#define attrGetARIA_PRESSED( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_PRESSED )
#define attrGetARIA_READONLY( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_READONLY )
#define attrGetARIA_RELEVANT( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_RELEVANT )
#define attrGetARIA_REQUIRED( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_REQUIRED )
#define attrGetARIA_SELECTED( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_SELECTED )
#define attrGetARIA_SETSIZE( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_SETSIZE )
#define attrGetARIA_SORT( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_SORT )
#define attrGetARIA_VALUEMAX( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_VALUEMAX )
#define attrGetARIA_VALUEMIN( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_VALUEMIN )
#define attrGetARIA_VALUENOW( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_VALUENOW )
#define attrGetARIA_VALUETEXT( nod ) TY_(AttrGetById)( nod, TidyAttr_ARIA_VALUETEXT )
#endif /* __ATTRS_H__ */

226
src/buffio.c

@ -0,0 +1,226 @@
/* buffio.c -- Treat buffer as an I/O stream.
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Requires buffer to automatically grow as bytes are added.
Must keep track of current read and write points.
*/
#include "tidy.h"
#include "tidybuffio.h"
#include "forward.h"
/**************
TIDY
**************/
/* Input-source getByte callback: treats appData as a TidyBuffer and
** forwards to tidyBufGetByte().
*/
static int TIDY_CALL insrc_getByte( void* appData )
{
    return tidyBufGetByte( (TidyBuffer*) appData );
}
/* Input-source eof callback: treats appData as a TidyBuffer and
** forwards to tidyBufEndOfInput().
*/
static Bool TIDY_CALL insrc_eof( void* appData )
{
    return tidyBufEndOfInput( (TidyBuffer*) appData );
}
/* Input-source ungetByte callback: treats appData as a TidyBuffer and
** forwards the byte to tidyBufUngetByte().
*/
static void TIDY_CALL insrc_ungetByte( void* appData, byte bv )
{
    tidyBufUngetByte( (TidyBuffer*) appData, bv );
}
/* Wire an input source to a buffer: buf becomes the source data and the
** three insrc_* callbacks become the byte-level accessors.
*/
void TIDY_CALL tidyInitInputBuffer( TidyInputSource* inp, TidyBuffer* buf )
{
    inp->sourceData = buf;
    inp->getByte    = insrc_getByte;
    inp->ungetByte  = insrc_ungetByte;
    inp->eof        = insrc_eof;
}
/* Output-sink putByte callback: treats appData as a TidyBuffer and
** appends the byte with tidyBufPutByte().
*/
static void TIDY_CALL outsink_putByte( void* appData, byte bv )
{
    tidyBufPutByte( (TidyBuffer*) appData, bv );
}
/* Wire an output sink to a buffer: buf becomes the sink data and
** outsink_putByte the byte writer.
*/
void TIDY_CALL tidyInitOutputBuffer( TidyOutputSink* outp, TidyBuffer* buf )
{
    outp->sinkData = buf;
    outp->putByte  = outsink_putByte;
}
/* Initialize buf without naming an allocator; passing NULL makes
** tidyBufInitWithAllocator() install the default one.
*/
void TIDY_CALL tidyBufInit( TidyBuffer* buf )
{
    TidyAllocator* noAllocator = NULL;

    assert( buf != NULL );
    tidyBufInitWithAllocator( buf, noAllocator );
}
/* Initialize buf and reserve room for allocSize bytes, without naming an
** allocator (NULL is forwarded to tidyBufAllocWithAllocator()).
*/
void TIDY_CALL tidyBufAlloc( TidyBuffer* buf, uint allocSize )
{
    TidyAllocator* noAllocator = NULL;

    tidyBufAllocWithAllocator( buf, noAllocator, allocSize );
}
/* Zero every field of buf, then record the allocator to use; a NULL
** allocator selects TY_(g_default_allocator).
*/
void TIDY_CALL tidyBufInitWithAllocator( TidyBuffer* buf,
                                         TidyAllocator *allocator )
{
    assert( buf != NULL );
    TidyClearMemory( buf, sizeof(TidyBuffer) );
    if ( allocator )
        buf->allocator = allocator;
    else
        buf->allocator = &TY_(g_default_allocator);
}
/* Initialize buf with the given allocator, then ask tidyBufCheckAlloc()
** (default chunk size) for room for allocSize bytes. The read position
** is left at 0.
*/
void TIDY_CALL tidyBufAllocWithAllocator( TidyBuffer* buf,
                                          TidyAllocator *allocator,
                                          uint allocSize )
{
    const uint defaultChunk = 0;

    tidyBufInitWithAllocator( buf, allocator );
    tidyBufCheckAlloc( buf, allocSize, defaultChunk );
    buf->next = 0;
}
/* Release the buffer's storage and reset buf to the freshly-initialized
** state, keeping its allocator.
**
** A buffer that was only zero-initialized (never passed through
** tidyBufInit/tidyBufAlloc and never written to) still has a NULL
** allocator. The default allocator is installed first in that case so
** that TidyFree() is never handed a NULL allocator.
*/
void TIDY_CALL tidyBufFree( TidyBuffer* buf )
{
    assert( buf != NULL );
    if ( !buf->allocator )
        buf->allocator = &TY_(g_default_allocator);
    TidyFree( buf->allocator, buf->bp );
    /* buf->allocator is read before the callee clears the struct */
    tidyBufInitWithAllocator( buf, buf->allocator );
}
/* Rewind the read position. If storage exists, also zero all allocated
** bytes and reset the content size to 0; the storage itself is kept.
*/
void TIDY_CALL tidyBufClear( TidyBuffer* buf )
{
    assert( buf != NULL );
    buf->next = 0;
    if ( buf->bp == NULL )
        return;
    TidyClearMemory( buf->bp, buf->allocated );
    buf->size = 0;
}
/* Callers frequently skip tidyBufInit()/tidyBufAlloc() and their
** allocator-taking variants, so the buffer routines use this helper to
** point a buffer at the default allocator.
*/
static void setDefaultAllocator( TidyBuffer* buf )
{
    TidyAllocator* fallback = &TY_(g_default_allocator);

    buf->allocator = fallback;
}
/* Ensure buf can hold allocSize bytes plus a trailing null byte.
**
** Avoids thrashing memory by doubling the buffer size until it is larger
** than the requested size. buf->allocated ends up bigger than allocSize+1
** so that a trailing null byte is always available.
**
** buf        buffer to grow; gets the default allocator if it has none
** allocSize  number of content bytes that must fit
** chunkSize  initial size for a buffer with no storage; 0 means 256
**
** The doubling is overflow-safe: once another doubling would wrap the
** uint range, the size is clamped to the largest uint value instead of
** wrapping to 0 (which would make the loop spin forever).
** If allocSize is already the largest uint value, allocSize+1 cannot be
** represented and nothing is allocated.
** On reallocation failure buf is left unchanged; callers that need the
** space must check buf->allocated afterwards.
*/
void TIDY_CALL tidyBufCheckAlloc( TidyBuffer* buf, uint allocSize, uint chunkSize )
{
    const uint maxAlloc = (uint)-1;
    uint needed;

    assert( buf != NULL );

    if ( !buf->allocator )
        setDefaultAllocator( buf );

    if ( 0 == chunkSize )
        chunkSize = 256;

    /* allocSize+1 would wrap to 0: request cannot be satisfied */
    if ( allocSize == maxAlloc )
        return;
    needed = allocSize + 1;

    if ( needed > buf->allocated )
    {
        byte* bp;
        uint allocAmt = chunkSize;
        if ( buf->allocated > 0 )
            allocAmt = buf->allocated;
        while ( allocAmt < needed )
        {
            if ( allocAmt > maxAlloc / 2 )
            {
                /* doubling would overflow; maxAlloc >= needed holds here */
                allocAmt = maxAlloc;
                break;
            }
            allocAmt *= 2;
        }

        bp = (byte*)TidyRealloc( buf->allocator, buf->bp, allocAmt );
        if ( bp != NULL )
        {
            /* zero only the newly added tail */
            TidyClearMemory( bp + buf->allocated, allocAmt - buf->allocated );
            buf->bp = bp;
            buf->allocated = allocAmt;
        }
    }
}
/* Point buf at caller-supplied memory without allocating: the whole
** region counts as both content and capacity, and reading restarts at
** offset 0. A buffer with no allocator gets the default one.
*/
void TIDY_CALL tidyBufAttach( TidyBuffer* buf, byte* bp, uint size )
{
    assert( buf != NULL );
    if ( !buf->allocator )
        setDefaultAllocator( buf );
    buf->bp = bp;
    buf->allocated = size;
    buf->size = size;
    buf->next = 0;
}
/* Forget the memory buf points at WITHOUT freeing it: every field is reset
** and only the allocator association is carried over.
*/
void TIDY_CALL tidyBufDetach( TidyBuffer* buf )
{
    TidyAllocator* keep = buf->allocator;

    tidyBufInitWithAllocator( buf, keep );
}
/**************
OUTPUT
**************/
/* Append size bytes starting at vp to the end of the buffer's content,
** growing the buffer first if necessary. A NULL source or a zero size is
** a no-op.
*/
void TIDY_CALL tidyBufAppend( TidyBuffer* buf, void* vp, uint size )
{
    assert( buf != NULL );

    if ( vp == NULL || size == 0 )
        return;

    tidyBufCheckAlloc( buf, buf->size + size, 0 );
    memcpy( buf->bp + buf->size, vp, size );
    buf->size += size;
}
/* Append a single byte to the end of the buffer's content, growing the
** buffer first if necessary.
*/
void TIDY_CALL tidyBufPutByte( TidyBuffer* buf, byte bv )
{
    assert( buf != NULL );

    tidyBufCheckAlloc( buf, buf->size + 1, 0 );

    buf->bp[ buf->size ] = bv;
    ++buf->size;
}
/* Remove and return the LAST byte of the buffer's content.
** Returns EOF when the buffer is empty.
*/
int TIDY_CALL tidyBufPopByte( TidyBuffer* buf )
{
    assert( buf != NULL );

    if ( buf->size > 0 )
    {
        buf->size -= 1;
        return buf->bp[ buf->size ];
    }
    return EOF;
}
/**************
INPUT
**************/
/* Return the byte at the current read position and advance past it.
** Returns EOF once all content has been consumed.
*/
int TIDY_CALL tidyBufGetByte( TidyBuffer* buf )
{
    if ( tidyBufEndOfInput(buf) )
        return EOF;

    return buf->bp[ buf->next++ ];
}
/* True when the read position has reached (or passed) the end of content. */
Bool TIDY_CALL tidyBufEndOfInput( TidyBuffer* buf )
{
    return !( buf->next < buf->size );
}
/* Step the read position back by one byte, if it is not already at the
** start. The buffer content itself is not modified; in debug builds bv is
** checked against the byte that is being "returned".
*/
void TIDY_CALL tidyBufUngetByte( TidyBuffer* buf, byte bv )
{
    if ( buf->next > 0 )
    {
        buf->next -= 1;
        assert( bv == buf->bp[ buf->next ] );
    }
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

6
src/buffio.h

@ -0,0 +1,6 @@
#ifdef __GNUC__
#warning "FIXME: Using compatibility tidy header (buffio.h) that will go away!"
#endif
#include "tidybuffio.h"

1031
src/charsets.c

File diff suppressed because it is too large

13
src/charsets.h

@ -0,0 +1,13 @@
/* charsets.h -- character set information and mappings
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
uint TY_(GetEncodingIdFromName)(ctmbstr name);
uint TY_(GetEncodingIdFromCodePage)(uint cp);
uint TY_(GetEncodingCodePageFromName)(ctmbstr name);
uint TY_(GetEncodingCodePageFromId)(uint id);
ctmbstr TY_(GetEncodingNameFromId)(uint id);
ctmbstr TY_(GetEncodingNameFromCodePage)(uint cp);

2692
src/clean.c

File diff suppressed because it is too large

82
src/clean.h

@ -0,0 +1,82 @@
#ifndef __CLEAN_H__
#define __CLEAN_H__
/* clean.h -- clean up misuse of presentation markup
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
void TY_(FixNodeLinks)(Node *node);
void TY_(FreeStyles)( TidyDocImpl* doc );
/* Add class="foo" to node
*/
void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue );
void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property );
void TY_(CleanDocument)( TidyDocImpl* doc );
/* simplifies <b><b> ... </b> ...</b> etc. */
void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node );
/* replace i by em and b by strong */
void TY_(EmFromI)( TidyDocImpl* doc, Node* node );
/*
Some people use dir or ul without an li
to indent the content. The pattern to
look for is a list with a single implicit
li. This is recursively replaced by an
implicit blockquote.
*/
void TY_(List2BQ)( TidyDocImpl* doc, Node* node );
/*
Replace implicit blockquote by div with an indent
taking care to reduce nested blockquotes to a single
div with the indent set to match the nesting depth
*/
void TY_(BQ2Div)( TidyDocImpl* doc, Node* node );
void TY_(DropSections)( TidyDocImpl* doc, Node* node );
/*
This is a major clean up to strip out all the extra stuff you get
when you save as web page from Word 2000. It doesn't yet know what
to do with VML tags, but these will appear as errors unless you
declare them as new tags, such as o:p which needs to be declared
as inline.
*/
void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node);
Bool TY_(IsWord2000)( TidyDocImpl* doc );
/* where appropriate move object elements from head to body */
void TY_(BumpObject)( TidyDocImpl* doc, Node *html );
/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
void TY_(FixBrakes)( TidyDocImpl* pDoc, Node *pParent );
#endif
void TY_(VerifyHTTPEquiv)( TidyDocImpl* pDoc, Node *pParent );
void TY_(DropComments)(TidyDocImpl* doc, Node* node);
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **pnode);
void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node);
void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node);
void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node);
void TY_(NormalizeSpaces)(Lexer *lexer, Node *node);
void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node);
void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId);
void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns);
void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang);
#endif /* __CLEAN_H__ */

1788
src/config.c

File diff suppressed because it is too large

146
src/config.h

@ -0,0 +1,146 @@
#ifndef __CONFIG_H__
#define __CONFIG_H__
/* config.h -- read config file and manage config properties
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
config files associate a property name with a value.
// comments can start at the beginning of a line
# comments can start at the beginning of a line
name: short values fit onto one line
name: a really long value that
continues on the next line
property names are case insensitive and should be less than
60 characters in length and must start at the beginning of
the line, as whitespace at the start of a line signifies a
line continuation.
*/
#include "forward.h"
#include "tidy.h"
#include "streamio.h"
/* Descriptor for one configuration option; see the per-field notes below. */
struct _tidy_option;
typedef struct _tidy_option TidyOptionImpl;
/* Signature of an option value parser: it is handed the document and the
** descriptor of the option being parsed.
** NOTE(review): presumably returns yes when the value was accepted -- confirm in config.c.
*/
typedef Bool (ParseProperty)( TidyDocImpl* doc, const TidyOptionImpl* opt );
struct _tidy_option
{
TidyOptionId id;
TidyConfigCategory category; /* put 'em in groups */
ctmbstr name; /* property name */
TidyOptionType type; /* string, int or bool */
ulong dflt; /* default for TidyInteger and TidyBoolean */
ParseProperty* parser; /* parsing method, read-only if NULL */
const ctmbstr* pickList; /* pick list */
ctmbstr pdflt; /* default for TidyString */
};
/* Storage for one option value. Which member is meaningful depends on the
** option's type: v for integer/boolean options, p for string options.
*/
typedef union
{
ulong v; /* Value for TidyInteger and TidyBoolean */
char *p; /* Value for TidyString */
} TidyOptionValue;
/* Configuration state: the current option values, a snapshot of them that
** can be restored later, and the state used while reading a configuration
** stream. The value arrays are indexed by option id (see the cfg() macros).
*/
typedef struct _tidy_config
{
TidyOptionValue value[ N_TIDY_OPTIONS + 1 ]; /* current config values */
TidyOptionValue snapshot[ N_TIDY_OPTIONS + 1 ]; /* Snapshot of values to be restored later */
/* track what tags user has defined to eliminate unnecessary searches */
uint defined_tags;
uint c; /* current char in input stream */
StreamIn* cfgIn; /* current input source */
} TidyConfigImpl;
/* Used to build a table of documentation cross-references: one entry ties
** an option to the list of other options its documentation refers to.
*/
typedef struct {
TidyOptionId opt; /**< Identifier. */
TidyOptionId const *links; /**< Cross references. Last element must be 'TidyUnknownOption'. */
} TidyOptionDoc;
const TidyOptionImpl* TY_(lookupOption)( ctmbstr optnam );
const TidyOptionImpl* TY_(getOption)( TidyOptionId optId );
TidyIterator TY_(getOptionList)( TidyDocImpl* doc );
const TidyOptionImpl* TY_(getNextOption)( TidyDocImpl* doc, TidyIterator* iter );
TidyIterator TY_(getOptionPickList)( const TidyOptionImpl* option );
ctmbstr TY_(getNextOptionPick)( const TidyOptionImpl* option, TidyIterator* iter );
const TidyOptionDoc* TY_(OptGetDocDesc)( TidyOptionId optId );
void TY_(InitConfig)( TidyDocImpl* doc );
void TY_(FreeConfig)( TidyDocImpl* doc );
/* Bool SetOptionValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr val ); */
Bool TY_(SetOptionInt)( TidyDocImpl* doc, TidyOptionId optId, ulong val );
Bool TY_(SetOptionBool)( TidyDocImpl* doc, TidyOptionId optId, Bool val );
Bool TY_(ResetOptionToDefault)( TidyDocImpl* doc, TidyOptionId optId );
void TY_(ResetConfigToDefault)( TidyDocImpl* doc );
void TY_(TakeConfigSnapshot)( TidyDocImpl* doc );
void TY_(ResetConfigToSnapshot)( TidyDocImpl* doc );
void TY_(CopyConfig)( TidyDocImpl* docTo, TidyDocImpl* docFrom );
int TY_(ParseConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
int TY_(ParseConfigFileEnc)( TidyDocImpl* doc,
ctmbstr cfgfil, ctmbstr charenc );
int TY_(SaveConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
int TY_(SaveConfigSink)( TidyDocImpl* doc, TidyOutputSink* sink );
/* returns false if unknown option, missing parameter, or
option doesn't use parameter
*/
Bool TY_(ParseConfigOption)( TidyDocImpl* doc, ctmbstr optnam, ctmbstr optVal );
Bool TY_(ParseConfigValue)( TidyDocImpl* doc, TidyOptionId optId, ctmbstr optVal );
/* ensure that char encodings are self consistent */
Bool TY_(AdjustCharEncoding)( TidyDocImpl* doc, int encoding );
Bool TY_(ConfigDiffThanDefault)( TidyDocImpl* doc );
Bool TY_(ConfigDiffThanSnapshot)( TidyDocImpl* doc );
int TY_(CharEncodingId)( TidyDocImpl* doc, ctmbstr charenc );
ctmbstr TY_(CharEncodingName)( int encoding );
ctmbstr TY_(CharEncodingOptName)( int encoding );
/* void SetEmacsFilename( TidyDocImpl* doc, ctmbstr filename ); */
#ifdef _DEBUG
/* Debug lookup functions will be type-safe and assert option type match */
ulong TY_(_cfgGet)( TidyDocImpl* doc, TidyOptionId optId );
Bool TY_(_cfgGetBool)( TidyDocImpl* doc, TidyOptionId optId );
TidyTriState TY_(_cfgGetAutoBool)( TidyDocImpl* doc, TidyOptionId optId );
ctmbstr TY_(_cfgGetString)( TidyDocImpl* doc, TidyOptionId optId );
#define cfg(doc, id) TY_(_cfgGet)( (doc), (id) )
#define cfgBool(doc, id) TY_(_cfgGetBool)( (doc), (id) )
#define cfgAutoBool(doc, id) TY_(_cfgGetAutoBool)( (doc), (id) )
#define cfgStr(doc, id) TY_(_cfgGetString)( (doc), (id) )
#else
/* Release build macros for speed */
#define cfg(doc, id) ((doc)->config.value[ (id) ].v)
#define cfgBool(doc, id) ((Bool) cfg(doc, id))
#define cfgAutoBool(doc, id) ((TidyTriState) cfg(doc, id))
#define cfgStr(doc, id) ((ctmbstr) (doc)->config.value[ (id) ].p)
#endif /* _DEBUG */
#endif /* __CONFIG_H__ */

424
src/entities.c

@ -0,0 +1,424 @@
/* entities.c -- recognize HTML ISO entities
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Entity handling can be static because there are no config or
document-specific values. Lookup table is 100% defined at
compile time.
*/
#include <stdio.h>
#include "entities.h"
#include "tidy-int.h"
#include "tmbstr.h"
/* One row of the named-entity table below. */
struct _entity;
typedef struct _entity entity;
struct _entity
{
ctmbstr name; /* entity name as it follows the '&' (lookups are passed name+1) */
uint versions; /* bitmask of VERS_* flags for which the name is defined */
uint code; /* numeric character code the entity stands for */
};
/* Static table of named character entities.
** Each row is { name, bitmask of versions defining the name, character code }.
** The list is terminated by a row whose name is NULL; entitiesLookup() and
** EntityName() both scan it linearly and stop at that sentinel.
** Rows are grouped by origin (see the section comments) rather than being
** strictly sorted by code.
*/
static const entity entities[] =
{
/*
** Markup pre-defined character entities
*/
{ "quot", VERS_ALL|VERS_XML, 34 },
{ "amp", VERS_ALL|VERS_XML, 38 },
{ "apos", VERS_FROM40|VERS_XML, 39 },
{ "lt", VERS_ALL|VERS_XML, 60 },
{ "gt", VERS_ALL|VERS_XML, 62 },
/*
** Latin-1 character entities
*/
{ "nbsp", VERS_ALL, 160 },
{ "iexcl", VERS_ALL, 161 },
{ "cent", VERS_ALL, 162 },
{ "pound", VERS_ALL, 163 },
{ "curren", VERS_ALL, 164 },
{ "yen", VERS_ALL, 165 },
{ "brvbar", VERS_ALL, 166 },
{ "sect", VERS_ALL, 167 },
{ "uml", VERS_ALL, 168 },
{ "copy", VERS_ALL, 169 },
{ "ordf", VERS_ALL, 170 },
{ "laquo", VERS_ALL, 171 },
{ "not", VERS_ALL, 172 },
{ "shy", VERS_ALL, 173 },
{ "reg", VERS_ALL, 174 },
{ "macr", VERS_ALL, 175 },
{ "deg", VERS_ALL, 176 },
{ "plusmn", VERS_ALL, 177 },
{ "sup2", VERS_ALL, 178 },
{ "sup3", VERS_ALL, 179 },
{ "acute", VERS_ALL, 180 },
{ "micro", VERS_ALL, 181 },
{ "para", VERS_ALL, 182 },
{ "middot", VERS_ALL, 183 },
{ "cedil", VERS_ALL, 184 },
{ "sup1", VERS_ALL, 185 },
{ "ordm", VERS_ALL, 186 },
{ "raquo", VERS_ALL, 187 },
{ "frac14", VERS_ALL, 188 },
{ "frac12", VERS_ALL, 189 },
{ "frac34", VERS_ALL, 190 },
{ "iquest", VERS_ALL, 191 },
{ "Agrave", VERS_ALL, 192 },
{ "Aacute", VERS_ALL, 193 },
{ "Acirc", VERS_ALL, 194 },
{ "Atilde", VERS_ALL, 195 },
{ "Auml", VERS_ALL, 196 },
{ "Aring", VERS_ALL, 197 },
{ "AElig", VERS_ALL, 198 },
{ "Ccedil", VERS_ALL, 199 },
{ "Egrave", VERS_ALL, 200 },
{ "Eacute", VERS_ALL, 201 },
{ "Ecirc", VERS_ALL, 202 },
{ "Euml", VERS_ALL, 203 },
{ "Igrave", VERS_ALL, 204 },
{ "Iacute", VERS_ALL, 205 },
{ "Icirc", VERS_ALL, 206 },
{ "Iuml", VERS_ALL, 207 },
{ "ETH", VERS_ALL, 208 },
{ "Ntilde", VERS_ALL, 209 },
{ "Ograve", VERS_ALL, 210 },
{ "Oacute", VERS_ALL, 211 },
{ "Ocirc", VERS_ALL, 212 },
{ "Otilde", VERS_ALL, 213 },
{ "Ouml", VERS_ALL, 214 },
{ "times", VERS_ALL, 215 },
{ "Oslash", VERS_ALL, 216 },
{ "Ugrave", VERS_ALL, 217 },
{ "Uacute", VERS_ALL, 218 },
{ "Ucirc", VERS_ALL, 219 },
{ "Uuml", VERS_ALL, 220 },
{ "Yacute", VERS_ALL, 221 },
{ "THORN", VERS_ALL, 222 },
{ "szlig", VERS_ALL, 223 },
{ "agrave", VERS_ALL, 224 },
{ "aacute", VERS_ALL, 225 },
{ "acirc", VERS_ALL, 226 },
{ "atilde", VERS_ALL, 227 },
{ "auml", VERS_ALL, 228 },
{ "aring", VERS_ALL, 229 },
{ "aelig", VERS_ALL, 230 },
{ "ccedil", VERS_ALL, 231 },
{ "egrave", VERS_ALL, 232 },
{ "eacute", VERS_ALL, 233 },
{ "ecirc", VERS_ALL, 234 },
{ "euml", VERS_ALL, 235 },
{ "igrave", VERS_ALL, 236 },
{ "iacute", VERS_ALL, 237 },
{ "icirc", VERS_ALL, 238 },
{ "iuml", VERS_ALL, 239 },
{ "eth", VERS_ALL, 240 },
{ "ntilde", VERS_ALL, 241 },
{ "ograve", VERS_ALL, 242 },
{ "oacute", VERS_ALL, 243 },
{ "ocirc", VERS_ALL, 244 },
{ "otilde", VERS_ALL, 245 },
{ "ouml", VERS_ALL, 246 },
{ "divide", VERS_ALL, 247 },
{ "oslash", VERS_ALL, 248 },
{ "ugrave", VERS_ALL, 249 },
{ "uacute", VERS_ALL, 250 },
{ "ucirc", VERS_ALL, 251 },
{ "uuml", VERS_ALL, 252 },
{ "yacute", VERS_ALL, 253 },
{ "thorn", VERS_ALL, 254 },
{ "yuml", VERS_ALL, 255 },
/*
** Extended Entities defined in HTML 4: Symbols
*/
{ "fnof", VERS_FROM40, 402 },
{ "Alpha", VERS_FROM40, 913 },
{ "Beta", VERS_FROM40, 914 },
{ "Gamma", VERS_FROM40, 915 },
{ "Delta", VERS_FROM40, 916 },
{ "Epsilon", VERS_FROM40, 917 },
{ "Zeta", VERS_FROM40, 918 },
{ "Eta", VERS_FROM40, 919 },
{ "Theta", VERS_FROM40, 920 },
{ "Iota", VERS_FROM40, 921 },
{ "Kappa", VERS_FROM40, 922 },
{ "Lambda", VERS_FROM40, 923 },
{ "Mu", VERS_FROM40, 924 },
{ "Nu", VERS_FROM40, 925 },
{ "Xi", VERS_FROM40, 926 },
{ "Omicron", VERS_FROM40, 927 },
{ "Pi", VERS_FROM40, 928 },
{ "Rho", VERS_FROM40, 929 },
{ "Sigma", VERS_FROM40, 931 },
{ "Tau", VERS_FROM40, 932 },
{ "Upsilon", VERS_FROM40, 933 },
{ "Phi", VERS_FROM40, 934 },
{ "Chi", VERS_FROM40, 935 },
{ "Psi", VERS_FROM40, 936 },
{ "Omega", VERS_FROM40, 937 },
{ "alpha", VERS_FROM40, 945 },
{ "beta", VERS_FROM40, 946 },
{ "gamma", VERS_FROM40, 947 },
{ "delta", VERS_FROM40, 948 },
{ "epsilon", VERS_FROM40, 949 },
{ "zeta", VERS_FROM40, 950 },
{ "eta", VERS_FROM40, 951 },
{ "theta", VERS_FROM40, 952 },
{ "iota", VERS_FROM40, 953 },
{ "kappa", VERS_FROM40, 954 },
{ "lambda", VERS_FROM40, 955 },
{ "mu", VERS_FROM40, 956 },
{ "nu", VERS_FROM40, 957 },
{ "xi", VERS_FROM40, 958 },
{ "omicron", VERS_FROM40, 959 },
{ "pi", VERS_FROM40, 960 },
{ "rho", VERS_FROM40, 961 },
{ "sigmaf", VERS_FROM40, 962 },
{ "sigma", VERS_FROM40, 963 },
{ "tau", VERS_FROM40, 964 },
{ "upsilon", VERS_FROM40, 965 },
{ "phi", VERS_FROM40, 966 },
{ "chi", VERS_FROM40, 967 },
{ "psi", VERS_FROM40, 968 },
{ "omega", VERS_FROM40, 969 },
{ "thetasym", VERS_FROM40, 977 },
{ "upsih", VERS_FROM40, 978 },
{ "piv", VERS_FROM40, 982 },
{ "bull", VERS_FROM40, 8226 },
{ "hellip", VERS_FROM40, 8230 },
{ "prime", VERS_FROM40, 8242 },
{ "Prime", VERS_FROM40, 8243 },
{ "oline", VERS_FROM40, 8254 },
{ "frasl", VERS_FROM40, 8260 },
{ "weierp", VERS_FROM40, 8472 },
{ "image", VERS_FROM40, 8465 },
{ "real", VERS_FROM40, 8476 },
{ "trade", VERS_FROM40, 8482 },
{ "alefsym", VERS_FROM40, 8501 },
{ "larr", VERS_FROM40, 8592 },
{ "uarr", VERS_FROM40, 8593 },
{ "rarr", VERS_FROM40, 8594 },
{ "darr", VERS_FROM40, 8595 },
{ "harr", VERS_FROM40, 8596 },
{ "crarr", VERS_FROM40, 8629 },
{ "lArr", VERS_FROM40, 8656 },
{ "uArr", VERS_FROM40, 8657 },
{ "rArr", VERS_FROM40, 8658 },
{ "dArr", VERS_FROM40, 8659 },
{ "hArr", VERS_FROM40, 8660 },
{ "forall", VERS_FROM40, 8704 },
{ "part", VERS_FROM40, 8706 },
{ "exist", VERS_FROM40, 8707 },
{ "empty", VERS_FROM40, 8709 },
{ "nabla", VERS_FROM40, 8711 },
{ "isin", VERS_FROM40, 8712 },
{ "notin", VERS_FROM40, 8713 },
{ "ni", VERS_FROM40, 8715 },
{ "prod", VERS_FROM40, 8719 },
{ "sum", VERS_FROM40, 8721 },
{ "minus", VERS_FROM40, 8722 },
{ "lowast", VERS_FROM40, 8727 },
{ "radic", VERS_FROM40, 8730 },
{ "prop", VERS_FROM40, 8733 },
{ "infin", VERS_FROM40, 8734 },
{ "ang", VERS_FROM40, 8736 },
{ "and", VERS_FROM40, 8743 },
{ "or", VERS_FROM40, 8744 },
{ "cap", VERS_FROM40, 8745 },
{ "cup", VERS_FROM40, 8746 },
{ "int", VERS_FROM40, 8747 },
{ "there4", VERS_FROM40, 8756 },
{ "sim", VERS_FROM40, 8764 },
{ "cong", VERS_FROM40, 8773 },
{ "asymp", VERS_FROM40, 8776 },
{ "ne", VERS_FROM40, 8800 },
{ "equiv", VERS_FROM40, 8801 },
{ "le", VERS_FROM40, 8804 },
{ "ge", VERS_FROM40, 8805 },
{ "sub", VERS_FROM40, 8834 },
{ "sup", VERS_FROM40, 8835 },
{ "nsub", VERS_FROM40, 8836 },
{ "sube", VERS_FROM40, 8838 },
{ "supe", VERS_FROM40, 8839 },
{ "oplus", VERS_FROM40, 8853 },
{ "otimes", VERS_FROM40, 8855 },
{ "perp", VERS_FROM40, 8869 },
{ "sdot", VERS_FROM40, 8901 },
{ "lceil", VERS_FROM40, 8968 },
{ "rceil", VERS_FROM40, 8969 },
{ "lfloor", VERS_FROM40, 8970 },
{ "rfloor", VERS_FROM40, 8971 },
{ "lang", VERS_FROM40, 10216 },
{ "rang", VERS_FROM40, 10217 },
{ "loz", VERS_FROM40, 9674 },
{ "spades", VERS_FROM40, 9824 },
{ "clubs", VERS_FROM40, 9827 },
{ "hearts", VERS_FROM40, 9829 },
{ "diams", VERS_FROM40, 9830 },
/*
** Extended Entities defined in HTML 4: Special (less Markup at top)
*/
{ "OElig", VERS_FROM40, 338 },
{ "oelig", VERS_FROM40, 339 },
{ "Scaron", VERS_FROM40, 352 },
{ "scaron", VERS_FROM40, 353 },
{ "Yuml", VERS_FROM40, 376 },
{ "circ", VERS_FROM40, 710 },
{ "tilde", VERS_FROM40, 732 },
{ "ensp", VERS_FROM40, 8194 },
{ "emsp", VERS_FROM40, 8195 },
{ "thinsp", VERS_FROM40, 8201 },
{ "zwnj", VERS_FROM40, 8204 },
{ "zwj", VERS_FROM40, 8205 },
{ "lrm", VERS_FROM40, 8206 },
{ "rlm", VERS_FROM40, 8207 },
{ "ndash", VERS_FROM40, 8211 },
{ "mdash", VERS_FROM40, 8212 },
{ "lsquo", VERS_FROM40, 8216 },
{ "rsquo", VERS_FROM40, 8217 },
{ "sbquo", VERS_FROM40, 8218 },
{ "ldquo", VERS_FROM40, 8220 },
{ "rdquo", VERS_FROM40, 8221 },
{ "bdquo", VERS_FROM40, 8222 },
{ "dagger", VERS_FROM40, 8224 },
{ "Dagger", VERS_FROM40, 8225 },
{ "permil", VERS_FROM40, 8240 },
{ "lsaquo", VERS_FROM40, 8249 },
{ "rsaquo", VERS_FROM40, 8250 },
{ "euro", VERS_FROM40, 8364 },
{ NULL, VERS_UNKNOWN, 0 }
};
/* Find the table row whose name equals s (s excludes the leading '&').
** Returns NULL for a NULL or empty name, or when no row matches.
**
** The search is a plain linear scan over the static table: no setup cost,
** and comparing the first character before doing a full string compare is
** cheap enough that, per the original authors, a hash would only pay off
** beyond roughly 500 entries.
*/
static const entity* entitiesLookup( ctmbstr s )
{
    const entity *cur;
    tmbchar first = (tmbchar)( s ? *s : 0 );

    if ( !first )
        return NULL;

    for ( cur = entities; cur->name != NULL; ++cur )
    {
        if ( first != *cur->name )
            continue;
        if ( TY_(tmbstrcmp)(s, cur->name) == 0 )
            return cur;
    }
    return NULL;
}
/* Disabled legacy lookup, compiled out by the surrounding #if 0 and kept
** for reference only. TY_(EntityInfo) handles the same numeric and named
** cases and additionally reports the version mask.
*/
#if 0
/* entity starting with "&" returns zero on error */
uint EntityCode( ctmbstr name, uint versions )
{
const entity* np;
assert( name && name[0] == '&' );
/* numeric entity: name = "&#" followed by number */
if ( name[1] == '#' )
{
uint c = 0; /* zero on missing/bad number */
Bool isXml = ( (versions & VERS_XML) == VERS_XML );
/* 'x' prefix denotes hexadecimal number format */
if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
sscanf( name+3, "%x", &c );
else
sscanf( name+2, "%u", &c );
return (uint) c;
}
/* Named entity: name ="&" followed by a name */
if ( NULL != (np = entitiesLookup(name+1)) )
{
/* Only recognize entity name if version supports it. */
if ( np->versions & versions )
return np->code;
}
return 0; /* zero signifies unknown entity name */
}
#endif
/* Resolve an entity reference to its character code and version mask.
**
** name     - entity text beginning with '&' (e.g. "&amp", "&#160", "&#xA0").
** isXml    - when true only a lower-case 'x' introduces a hexadecimal
**            number; otherwise 'X' is accepted as well.
** code     - receives the character code, or 0 on failure.
** versions - receives VERS_ALL for a parsed number, the table's mask for a
**            known name, and VERS_XML / VERS_PROPRIETARY (depending on
**            isXml) on failure.
**
** Returns yes when the reference was resolved, no otherwise.
*/
Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions )
{
    assert( name && name[0] == '&' );
    assert( code != NULL );
    assert( versions != NULL );

    if ( name[1] == '#' )
    {
        /* Numeric reference: "&#" followed by a decimal or hex number. */
        uint value = 0;   /* stays zero on a missing/bad number */
        int nparsed;
        int isHex = ( name[2] == 'x' || (!isXml && name[2] == 'X') );

        if ( isHex )
            nparsed = sscanf( name+3, "%x", &value );
        else
            nparsed = sscanf( name+2, "%u", &value );

        /* Issue #373 - Null Char in XML result doc - sf905 2009:
        ** only report success when a number was actually parsed.
        */
        if ( nparsed == 1 )
        {
            *code = value;
            *versions = VERS_ALL;
            return yes;
        }
    }
    else
    {
        /* Named reference: '&' followed by a name from the table. */
        const entity* found = entitiesLookup( name+1 );

        if ( found != NULL )
        {
            *code = found->code;
            *versions = found->versions;
            return yes;
        }
    }

    /* Shared failure result for bad numbers and unknown names. */
    *code = 0;
    *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
    return no;
}
/* Map a character code back to an entity name.
**
** Only the FIRST table row carrying the code is considered: its name is
** returned if that row is defined for any of the requested versions,
** otherwise NULL. NULL is also returned when no row has the code.
*/
ctmbstr TY_(EntityName)( uint ch, uint versions )
{
    const entity *cur;

    for ( cur = entities; cur->name != NULL; ++cur )
    {
        if ( cur->code != ch )
            continue;

        /* Found the code: the search stops here whatever the outcome. */
        return ( cur->versions & versions ) ? cur->name : NULL;
    }
    return NULL;
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

18
src/entities.h

@ -0,0 +1,18 @@
#ifndef __ENTITIES_H__
#define __ENTITIES_H__
/* entities.h -- recognize character entities
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
/* entity starting with "&" returns zero on error */
/* uint EntityCode( ctmbstr name, uint versions ); */
ctmbstr TY_(EntityName)( uint charCode, uint versions );
Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions );
#endif /* __ENTITIES_H__ */

116
src/fileio.c

@ -0,0 +1,116 @@
/* fileio.c -- does standard I/O
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Default implementations of Tidy input sources
and output sinks based on standard C FILE*.
*/
#include <stdio.h>
#include "forward.h"
#include "fileio.h"
#include "tidy.h"
#if !defined(NDEBUG) && defined(_MSC_VER)
#include "sprtf.h"
#endif
/* State for a FILE*-backed TidyInputSource: the stream being read plus a
** buffer holding bytes pushed back by filesrc_ungetByte(). Pushed-back
** bytes are handed out again, most recent first, before fp is read.
*/
typedef struct _fp_input_source
{
FILE* fp;
TidyBuffer unget;
} FileSource;
/* TidyInputSource getByte callback: hand back a previously pushed-back
** byte if there is one, otherwise read the next byte from the file.
*/
static int TIDY_CALL filesrc_getByte( void* sourceData )
{
    FileSource* src = (FileSource*) sourceData;

    if ( src->unget.size > 0 )
        return tidyBufPopByte( &src->unget );

    return fgetc( src->fp );
}
/* TidyInputSource eof callback: the source is exhausted only when no
** pushed-back bytes remain AND the underlying stream reports end-of-file.
*/
static Bool TIDY_CALL filesrc_eof( void* sourceData )
{
    FileSource* src = (FileSource*) sourceData;

    /* feof() is consulted only once the unget buffer is empty. */
    return (Bool)( src->unget.size == 0 && feof( src->fp ) != 0 );
}
/* TidyInputSource ungetByte callback: remember bv so the next getByte
** call returns it before reading further from the file.
*/
static void TIDY_CALL filesrc_ungetByte( void* sourceData, byte bv )
{
    FileSource* src = (FileSource*) sourceData;
    TidyBuffer* pushedBack = &src->unget;

    tidyBufPutByte( pushedBack, bv );
}
#if SUPPORT_POSIX_MAPPED_FILES
#define initFileSource initStdIOFileSource
#define freeFileSource freeStdIOFileSource
#endif
/* Allocate a FileSource for fp and wire it into inp.
**
** allocator - used for the FileSource itself and for its unget buffer.
** inp       - input source whose callbacks and sourceData are filled in.
** fp        - stream to read from; it is not opened or validated here.
**
** Returns 0 on success, -1 if the allocation failed (inp is then untouched).
*/
int TY_(initFileSource)( TidyAllocator *allocator, TidyInputSource* inp, FILE* fp )
{
    FileSource* src = (FileSource*) TidyAlloc( allocator, sizeof(FileSource) );

    if ( src == NULL )
        return -1;

    TidyClearMemory( src, sizeof(*src) );
    src->fp = fp;
    src->unget.allocator = allocator;

    inp->sourceData = src;
    inp->getByte    = filesrc_getByte;
    inp->ungetByte  = filesrc_ungetByte;
    inp->eof        = filesrc_eof;

    return 0;
}
/* Release the FileSource attached to inp by initFileSource().
**
** inp     - input source whose sourceData is to be released.
** closeIt - when true, the underlying FILE* is closed as well.
**
** A source that has no FileSource attached is ignored. Previously the NULL
** check covered only the fclose() call and the pointer was dereferenced
** regardless.
*/
void TY_(freeFileSource)( TidyInputSource* inp, Bool closeIt )
{
    FileSource* fin = (FileSource*) inp->sourceData;
    TidyAllocator* allocator;

    if ( fin == NULL )
        return;

    if ( closeIt && fin->fp )
        fclose( fin->fp );

    /* Remember the allocator before the buffer is reset: the FileSource
    ** itself was allocated with the same one.
    */
    allocator = fin->unget.allocator;
    tidyBufFree( &fin->unget );
    TidyFree( allocator, fin );
}
/* TidyOutputSink putByte callback: write one byte to the FILE* passed as
** sinkData. In MSVC debug builds the byte is additionally echoed through
** SPRTF unless the target is file descriptor 2.
*/
void TIDY_CALL TY_(filesink_putByte)( void* sinkData, byte bv )
{
    FILE* out = (FILE*) sinkData;

    fputc( bv, out );
#if !defined(NDEBUG) && defined(_MSC_VER)
    /*\
     * avoid duplicate newline - SPRTF will translate an 0x0d to CRLF,
     * and do the same with the following 0x0a
    \*/
    if ( _fileno(out) != 2 && bv != 0x0d )
    {
        SPRTF("%c",bv);
    }
#endif
}
/* Configure outp so that every byte written to it goes to fp. */
void TY_(initFileSink)( TidyOutputSink* outp, FILE* fp )
{
    outp->sinkData = fp;
    outp->putByte  = TY_(filesink_putByte);
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

42
src/fileio.h

@ -0,0 +1,42 @@
#ifndef __FILEIO_H__
#define __FILEIO_H__
/** @file fileio.h - does standard C I/O
Implementation of a FILE* based TidyInputSource and
TidyOutputSink.
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidybuffio.h"
#ifdef __cplusplus
extern "C" {
#endif
/** Allocate and initialize file input source */
int TY_(initFileSource)( TidyAllocator *allocator, TidyInputSource* source, FILE* fp );
/** Free file input source */
void TY_(freeFileSource)( TidyInputSource* source, Bool closeIt );
#if SUPPORT_POSIX_MAPPED_FILES
/** Allocate and initialize file input source using Standard C I/O */
int TY_(initStdIOFileSource)( TidyAllocator *allocator, TidyInputSource* source, FILE* fp );
/** Free file input source using Standard C I/O */
void TY_(freeStdIOFileSource)( TidyInputSource* source, Bool closeIt );
#endif
/** Initialize file output sink */
void TY_(initFileSink)( TidyOutputSink* sink, FILE* fp );
/* Needed for internal declarations */
void TIDY_CALL TY_(filesink_putByte)( void* sinkData, byte bv );
#ifdef __cplusplus
}
#endif
#endif /* __FILEIO_H__ */

63
src/forward.h

@ -0,0 +1,63 @@
#ifndef __FORWARD_H__
#define __FORWARD_H__
/* forward.h -- Forward declarations for major Tidy structures
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Avoids many include file circular dependencies.
Try to keep this file down to the minimum to avoid
cross-talk between modules.
Header files include this file. C files include tidy-int.h.
*/
#include "tidyplatform.h"
#include "tidy.h"
/* Internal symbols are prefixed to avoid clashes with other libraries */
#define TYDYAPPEND(str1,str2) str1##str2
#define TY_(str) TYDYAPPEND(prvTidy,str)
struct _StreamIn;
typedef struct _StreamIn StreamIn;
struct _StreamOut;
typedef struct _StreamOut StreamOut;
struct _TidyDocImpl;
typedef struct _TidyDocImpl TidyDocImpl;
struct _Dict;
typedef struct _Dict Dict;
struct _Attribute;
typedef struct _Attribute Attribute;
struct _AttVal;
typedef struct _AttVal AttVal;
struct _Node;
typedef struct _Node Node;
struct _IStack;
typedef struct _IStack IStack;
struct _Lexer;
typedef struct _Lexer Lexer;
extern TidyAllocator TY_(g_default_allocator);
/** Wrappers for easy memory allocation using an allocator */
#define TidyAlloc(allocator, size) ((allocator)->vtbl->alloc((allocator), (size)))
#define TidyRealloc(allocator, block, size) ((allocator)->vtbl->realloc((allocator), (block), (size)))
#define TidyFree(allocator, block) ((allocator)->vtbl->free((allocator), (block)))
#define TidyPanic(allocator, msg) ((allocator)->vtbl->panic((allocator), (msg)))
#define TidyClearMemory(block, size) memset((block), 0, (size))
#endif /* __FORWARD_H__ */

174
src/gdoc.c

@ -0,0 +1,174 @@
/*
gdoc.c -- clean up html exported by Google Docs
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Filters from other formats such as Microsoft Word
often make excessive use of presentation markup such
as font tags, B, I, and the align attribute. By applying
a set of production rules, it is straight forward to
transform this to use CSS.
Some rules replace some of the children of an element by
style properties on the element, e.g.
<p><b>...</b></p> -> <p style="font-weight: bold">...</p>
Such rules are applied to the element's content and then
to the element itself until none of the rules more apply.
Having applied all the rules to an element, it will have
a style attribute with one or more properties.
Other rules strip the element they apply to, replacing
it by style properties on the contents, e.g.
<dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
These rules are applied to an element before processing
its content and replace the current element by the first
element in the exposed content.
After applying both sets of rules, you can replace the
style attribute by a class value and style rule in the
document head. To support this, an association of styles
and class names is built.
A naive approach is to rely on string matching to test
when two property lists are the same. A better approach
would be to first sort the properties before matching.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "tidy-int.h"
#include "gdoc.h"
#include "lexer.h"
#include "parser.h"
#include "tags.h"
#include "attrs.h"
#include "message.h"
#include "tmbstr.h"
#include "utf8.h"
/*
  Remove "element" from the tree but keep its children: they are spliced
  into the parent's child list at the position the element occupied.
  On return *pnode is the first of those children, or, for an element
  with no children, whatever DiscardElement reports as the next node.
*/
static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
{
    Node *parent, *first, *last, *walk;

    /* Nothing to hoist: plain removal. */
    if (!element->content)
    {
        *pnode = TY_(DiscardElement)(doc, element);
        return;
    }

    parent = element->parent;
    first  = element->content;
    last   = element->last;

    /* Link the last child to whatever followed the element. */
    last->next = element->next;
    if (element->next)
        element->next->prev = last;
    else
        parent->last = last;

    /* Link the first child to whatever preceded the element. */
    if (element->prev)
    {
        first->prev = element->prev;
        element->prev->next = first;
    }
    else
        parent->content = first;

    /* Re-parent from the first hoisted child to the end of the list. */
    for (walk = first; walk; walk = walk->next)
        walk->parent = parent;

    *pnode = first;

    /* Detach the now-empty element so freeing it cannot touch the tree. */
    element->content = NULL;
    element->next = NULL;
    TY_(FreeNode)(doc, element);
}
/* Recursively clean the children of node:
**  - <style> elements are discarded;
**  - empty <p> elements are discarded;
**  - <span> elements are replaced by their content;
**  - empty <a> elements are discarded, after copying a "name" attribute
**    (if present) to an "id" on the parent;
**  - every other element loses its "class" attribute and is then cleaned
**    in turn.
** Non-element children are left alone.
*/
static void CleanNode( TidyDocImpl* doc, Node *node )
{
    Node *child, *next;

    for (child = node->content; child != NULL; child = next)
    {
        /* Capture the successor first: child may be freed below. */
        next = child->next;

        if (!TY_(nodeIsElement)(child))
            continue;

        if (nodeIsSTYLE(child))
        {
            /* This must be the first arm of ONE if/else chain. The tests
            ** below used to run after the discard as well, inspecting and
            ** recursing into the node that had just been freed.
            */
            TY_(DiscardElement)(doc, child);
        }
        else if (nodeIsP(child) && !child->content)
        {
            TY_(DiscardElement)(doc, child);
        }
        else if (nodeIsSPAN(child))
        {
            /* Continue with the span's hoisted content (or its successor). */
            DiscardContainer( doc, child, &next);
        }
        else if (nodeIsA(child) && !child->content)
        {
            AttVal *id = TY_(GetAttrByName)( child, "name" );
            if (id)
                TY_(RepairAttrValue)( doc, child->parent, "id", id->value );
            TY_(DiscardElement)(doc, child);
        }
        else
        {
            if (child->attributes)
                TY_(DropAttrByName)( doc, child, "class" );
            CleanNode(doc, child);
        }
    }
}
/* Insert <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
** as the first child of <head> so browsers treat the document as UTF-8.
** Does nothing when the document has no head element.
*/
static void SetUTF8( TidyDocImpl* doc )
{
    Node *meta;
    Node *head = TY_(FindHEAD)( doc );

    if (!head)
        return;

    meta = TY_(InferredTag)(doc, TidyTag_META);
    TY_(AddAttribute)( doc, meta, "http-equiv", "Content-Type" );
    TY_(AddAttribute)( doc, meta, "content", "text/html; charset=UTF-8" );
    TY_(InsertNodeAtStart)( head, meta );
}
/* Clean html exported by Google Docs:
   - strip style elements, as the style sheet is a mess
   - strip class attributes
   - strip span elements, leaving their content in place
   - replace <a name=...></a> by id on parent element
   - strip empty <p> elements
   and finally add a meta element declaring the document as UTF-8.
*/
void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
{
    /* The root node only serves as the starting point: CleanNode()
    ** processes a node's children, never the node it is handed.
    */
    Node *root = &doc->root;

    CleanNode( doc, root );
    SetUTF8( doc );
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

19
src/gdoc.h

@ -0,0 +1,19 @@
#ifndef __GDOC_H__
#define __GDOC_H__
/* gdoc.h -- clean up html exported by Google Docs
(c) 2012 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
- strip the style element, as the style sheet is a mess
- strip class attributes
- strip span elements, leaving their content in place
- replace <a name=...></a> by id on parent element
- strip empty <p> elements
*/
void TY_(CleanGoogleDocument)( TidyDocImpl* doc );
#endif /* __GDOC_H__ */

20
src/htmltidy.cpp

@ -1,14 +1,16 @@
#include <Rcpp.h>
#ifdef __linux__
#include <tidy/tidy.h>
#include <tidy/buffio.h>
#endif
#ifdef __APPLE__
#include <tidy.h>
#include <tidybuffio.h>
#endif
// #ifdef __linux__
// #include <tidy/tidy.h>
// #include <tidy/buffio.h>
// #endif
//
// #ifdef __APPLE__
// #include <tidy.h>
// #include <tidybuffio.h>
// #endif
// libtidy docs:
// http://api.html-tidy.org/tidy/tidylib_api_5.2.0/tidyenum_8h.html#a3a1401652599150188a168dade7dc150
@ -21,7 +23,7 @@
//' @param source length 1 character vector containing the HTML/XML source to process
//' @export
//[[Rcpp::export]]
std::string tidy(std::string source) {
std::string tidy_html(std::string source) {
TidyBuffer output = {0};
TidyBuffer errbuf = {0};

104
src/iconvtc.c

@ -0,0 +1,104 @@
/* iconvtc.c -- Interface to iconv transcoding routines
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy.h"
#include "forward.h"
#include "streamio.h"
#ifdef TIDY_ICONV_SUPPORT
#include <iconv.h>
/* maximum number of bytes for a single character */
#define TC_INBUFSIZE 16
/* maximum number of characters per byte sequence */
#define TC_OUTBUFSIZE 16
/* Stub initialiser for the iconv input transcoder.
** Performs no work and always returns `no`.
*/
Bool IconvInitInputTranscoder(void)
{
return no;
}
/* Stub counterpart of IconvInitInputTranscoder(): there is nothing to
** release, so the function body is intentionally empty.
*/
void IconvUninitInputTranscoder(void)
{
    /* nothing to do */
}
/* Read one character from an input stream through an iconv descriptor.
**
** firstByte: the first byte of the character, already read by the caller.
** in:        input stream; in->iconvptr must hold an open iconv descriptor
**            and in->source supplies any further bytes that are needed.
** bytesRead: receives the number of input bytes consumed (including
**            firstByte). It is set on every return path.
**
** Returns the decoded code point, or EndOfStream when the source runs dry
** or no complete character is found within TC_INBUFSIZE bytes.
**
** NOTE(review): the code point is assembled from four little-endian octets,
** so the descriptor is assumed to convert to UTF-32LE -- confirm where
** in->iconvptr is opened.
*/
int IconvGetChar(byte firstByte, StreamIn * in, uint * bytesRead)
{
    iconv_t cd;
    TidyInputSource * source;
    char inbuf[TC_INBUFSIZE] = { 0 };
    char outbuf[TC_OUTBUFSIZE] = { 0 };
    size_t inbufsize = 0;

    assert( in != NULL );
    assert( bytesRead != NULL );
    assert( in->iconvptr != 0 );

    cd = (iconv_t)in->iconvptr;
    source = &in->source;
    inbuf[inbufsize++] = (char)firstByte;

    while (inbufsize < TC_INBUFSIZE)
    {
        /* iconv() advances the pointers it is given, so restart from the
           beginning of both buffers on every attempt. */
        char * outbufptr = (char*)outbuf;
        char * inbufptr = (char*)inbuf;
        size_t readNow = inbufsize;
        size_t writeNow = TC_OUTBUFSIZE;
        size_t result = 0;
        int iconv_errno = 0;
        int nextByte = EndOfStream;

        result = iconv(cd, (const char**)&inbufptr, &readNow, (char**)&outbufptr, &writeNow);
        iconv_errno = errno;

        if (result != (size_t)(-1))
        {
            /* create codepoint from UTF-32LE octets; the arithmetic is done
               unsigned so the top octet (shifted by 24, not 32) cannot
               overflow a signed int or shift past the type width */
            unsigned int c;
            c  = (unsigned int)(unsigned char)outbuf[0];
            c |= (unsigned int)(unsigned char)outbuf[1] << 8;
            c |= (unsigned int)(unsigned char)outbuf[2] << 16;
            c |= (unsigned int)(unsigned char)outbuf[3] << 24;

            /* set number of read bytes */
            *bytesRead = (uint)inbufsize;
            return (int)c;
        }

        assert( iconv_errno != EILSEQ ); /* broken multibyte sequence */
        assert( iconv_errno != E2BIG );  /* not enough memory */
        assert( iconv_errno == EINVAL ); /* incomplete sequence */

        /* we need more bytes */
        nextByte = source->getByte(source->sourceData);
        if (nextByte == EndOfStream)
        {
            /* todo: error message for broken stream? */
            *bytesRead = (uint)inbufsize;
            return EndOfStream;
        }
        inbuf[inbufsize++] = (char)nextByte;
    }

    /* No full character found after reading TC_INBUFSIZE bytes, */
    /* give up to read this stream, it's obviously unreadable. */
    /* todo: error message for broken stream? */
    *bytesRead = (uint)inbufsize; /* report what was consumed here as well */
    return EndOfStream;
}
#endif /* TIDY_ICONV_SUPPORT */

14
src/iconvtc.h

@ -0,0 +1,14 @@
#ifndef __ICONVTC_H__
#define __ICONVTC_H__
#ifdef TIDY_ICONV_SUPPORT
/* iconvtc.h -- Interface to iconv transcoding routines
(c) 1998-2003 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#endif /* TIDY_ICONV_SUPPORT */
#endif /* __ICONVTC_H__ */

380
src/istack.c

@ -0,0 +1,380 @@
/* istack.c -- inline stack for compatibility with Mosaic
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy-int.h"
#include "lexer.h"
#include "attrs.h"
#include "streamio.h"
#include "tmbstr.h"
#if !defined(NDEBUG) && defined(_MSC_VER)
#include "sprtf.h"
#endif
/* Deep-copy an attribute list.
**
** Each AttVal in `attrs` is copied into a freshly allocated AttVal; the
** name and value strings are duplicated, the dictionary entry is looked
** up again for the copy, and any attached asp/php nodes are cloned.
** The rest of the list is copied recursively.
**
** Returns the head of the new list, or NULL when `attrs` is NULL.
*/
AttVal *TY_(DupAttrs)( TidyDocImpl* doc, AttVal *attrs)
{
    AttVal *copy = NULL;

    if (attrs != NULL)
    {
        copy = TY_(NewAttribute)(doc);

        /* start from a shallow copy, then replace every owned pointer */
        *copy = *attrs;
        copy->next = TY_(DupAttrs)( doc, attrs->next );
        copy->attribute = TY_(tmbstrdup)(doc->allocator, attrs->attribute);
        copy->value = TY_(tmbstrdup)(doc->allocator, attrs->value);
        copy->dict = TY_(FindAttribute)(doc, copy);

        if (attrs->asp)
            copy->asp = TY_(CloneNode)(doc, attrs->asp);
        else
            copy->asp = NULL;

        if (attrs->php)
            copy->php = TY_(CloneNode)(doc, attrs->php);
        else
            copy->php = NULL;
    }

    return copy;
}
/* Decide whether `node` may be placed on the inline stack.
**
** A node qualifies only if it has a tag, the tag's content model is
** CM_INLINE but not CM_OBJECT, and it is neither INS nor DEL.
** (Issue #92: ins and del are marked as both inline and block, so they
** should never be 'inserted'.)
*/
static Bool IsNodePushable( Node *node )
{
    Bool pushable = no;

    if ( node->tag != NULL
         && (node->tag->model & CM_INLINE)
         && !(node->tag->model & CM_OBJECT)
         && !nodeIsINS(node)
         && !nodeIsDEL(node) )
    {
        pushable = yes;
    }

    return pushable;
}
/*
  Push a copy of an inline node onto the inline stack.

  Nothing is pushed when the node is implicit (implicit tags are the ones
  generated from the istack itself) or when IsNodePushable() rejects it
  (e.g. OBJECT-model elements).

  A tag that is already on the stack is not pushed a second time, except
  for FONT. Otherwise input such as

    <p><em>text
    <p><em>more text

  would be mapped to

    <p><em>text</em></p>
    <p><em><em>more text</em></em>

  The stored entry owns its own copy of the element name and attributes.
*/
void TY_(PushInline)( TidyDocImpl* doc, Node *node )
{
    Lexer* lexer = doc->lexer;
    IStack *slot;

    if ( node->implicit || !IsNodePushable(node) )
        return;

    if ( !nodeIsFONT(node) && TY_(IsPushed)(doc, node) )
        return;

    /* grow the stack when it is full: an empty stack starts from 6
       (perhaps excessive) and the length is doubled on every growth */
    if (lexer->istacksize + 1 > lexer->istacklength)
    {
        if (lexer->istacklength == 0)
            lexer->istacklength = 6;
        lexer->istacklength *= 2;

        lexer->istack = (IStack *)TidyDocRealloc(doc, lexer->istack,
                            sizeof(IStack)*(lexer->istacklength));
    }

    slot = lexer->istack + lexer->istacksize;
    slot->tag = node->tag;
    slot->element = TY_(tmbstrdup)(doc->allocator, node->element);
    slot->attributes = TY_(DupAttrs)( doc, node->attributes );

    lexer->istacksize += 1;
}
/* Remove the top entry of the inline stack and release what it owns:
** every attribute in its list and its element-name string.
** The caller must make sure the stack is not empty.
*/
static void PopIStack( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    IStack *top;
    AttVal *attr;
    AttVal *following;

    lexer->istacksize -= 1;
    top = lexer->istack + lexer->istacksize;

    for (attr = top->attributes; attr != NULL; attr = following)
    {
        /* unlink before freeing so the entry never points at freed memory */
        following = attr->next;
        top->attributes = following;
        TY_(FreeAttribute)( doc, attr );
    }

    TidyDocFree(doc, top->element);
    top->element = NULL; /* do not keep a pointer to the freed name */
}
/* Pop entries off the inline stack until an entry whose tag id equals
** `tid` has been popped, or the stack is empty.
*/
static void PopIStackUntil( TidyDocImpl* doc, TidyTagId tid )
{
    Lexer* lexer = doc->lexer;

    while (lexer->istacksize > 0)
    {
        PopIStack( doc );

        /* istacksize now indexes the entry that was just popped; its
           tag pointer is still readable after the pop */
        if ( lexer->istack[lexer->istacksize].tag->id == tid )
            return;
    }
}
/* Pop the inline stack in response to `node` (which may be NULL).
**
** - A non-NULL node that is not pushable leaves the stack untouched.
** - For </a> the stack is popped until an <a> entry has been removed.
** - Otherwise a single entry is popped, if there is one, and a pending
**   insert position that now lies at or beyond the top of the stack is
**   cleared (#427822 - fix by Randy Waki 7 Aug 00).
*/
void TY_(PopInline)( TidyDocImpl* doc, Node *node )
{
    Lexer* lexer = doc->lexer;

    if (node != NULL)
    {
        if ( !IsNodePushable(node) )
            return;

        /* if node is </a> then pop until we find an <a> */
        if ( nodeIsA(node) )
        {
            PopIStackUntil( doc, TidyTag_A );
            return;
        }
    }

    if ( !(lexer->istacksize > 0) )
        return;

    PopIStack( doc );

    if (lexer->insert >= lexer->istack + lexer->istacksize)
        lexer->insert = NULL;
}
/* Report whether any entry on the inline stack carries the same tag as
** `node`. The stack is scanned from the top entry down to index 0.
*/
Bool TY_(IsPushed)( TidyDocImpl* doc, Node *node )
{
    Lexer* lexer = doc->lexer;
    int idx = lexer->istacksize;

    while (--idx >= 0)
    {
        if (lexer->istack[idx].tag == node->tag)
            return yes;
    }

    return no;
}
/*
  Test whether the entry on top of the inline stack has the same tag as
  "node". If "element" is given and is not pushable the answer is always
  no; an empty stack also yields no.
*/
Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node )
{
    Lexer* lexer = doc->lexer;

    if ( element && !IsNodePushable(element) )
        return no;

    if ( lexer->istacksize > 0
         && lexer->istack[lexer->istacksize - 1].tag == node->tag )
    {
        return yes;
    }

    return no;
}
/*
  Arrange for "missing" inline elements to be inserted around the
  contents of block-level elements such as P, TD, TH, DIV, PRE etc.

  It is called at the start of ParseBlock when the inline stack is not
  empty, as will be the case in:

    <i><h1>italic heading</h1></i>

  which is then treated as equivalent to

    <h1><i>italic heading</i></h1>

  The lexer is put into a mode where it takes tokens from the inline
  stack (starting at istackbase) rather than from the input stream;
  "node" is remembered as the node to hand back afterwards.

  Returns the number of stack entries above istackbase; the lexer state
  is only changed when that number is positive.
*/
int TY_(InlineDup)( TidyDocImpl* doc, Node* node )
{
    Lexer* lexer = doc->lexer;
    int pending = lexer->istacksize - lexer->istackbase;

    if (pending > 0)
    {
        lexer->insert = &(lexer->istack[lexer->istackbase]);
        lexer->inode = node;
    }

    return pending;
}
/*
  Cancel any pending inline duplication, e.g. when entering a table or
  another element where the inlines shouldn't be duplicated: both the
  insert position and the remembered node are cleared.
*/
void TY_(DeferDup)( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;

    lexer->insert = NULL;
    lexer->inode = NULL;
}
/* Produce the next token while the lexer is replaying the inline stack.
**
** With no insert position set, the remembered node (lexer->inode) is
** handed back and forgotten. Otherwise a new implicit start tag is built
** as a copy of the stack entry at the insert position, and the insert
** position advances to the next entry, or to NULL after the last one.
*/
Node *TY_(InsertedToken)( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    Node *token;
    IStack *entry;
    uint next;

    /* insert is expected to be NULL here only while inode is set */
    if (lexer->insert == NULL)
    {
        token = lexer->inode;
        lexer->inode = NULL;
        return token;
    }

    /*
      If this is the "latest" node then update
      the position, otherwise use current values
    */
    if (lexer->inode == NULL)
    {
        lexer->lines = doc->docIn->curline;
        lexer->columns = doc->docIn->curcol;
    }

    token = TY_(NewNode)(doc->allocator, lexer);
    token->type = StartTag;
    token->implicit = yes;
    token->start = lexer->txtstart;
    /* #431734 [JTidy bug #226261 (was 126261)] - fix by Gary Peskin 20 Dec 00 */
    token->end = lexer->txtend; /* was : lexer->txtstart; */

    entry = lexer->insert;

/* #if 0 && defined(_DEBUG) */
#if !defined(NDEBUG) && defined(_MSC_VER)
    if ( lexer->istacksize == 0 )
    {
        SPRTF( "WARNING: ZERO sized istack!\n" );
    }
#endif

    token->element = TY_(tmbstrdup)(doc->allocator, entry->element);
    token->tag = entry->tag;
    token->attributes = TY_(DupAttrs)( doc, entry->attributes );

    /* index of the entry that follows the one just copied */
    next = (uint)(lexer->insert - &(lexer->istack[0])) + 1;

    /* keep replaying while entries remain, otherwise leave insert mode */
    if (next < lexer->istacksize)
        lexer->insert = &(lexer->istack[next]);
    else
        lexer->insert = NULL;

    return token;
}
/*
We have two CM_INLINE elements pushed ... the first is closing,
but, like the browser, the second should be retained ...
Like <b>bold <i>bold and italics</b> italics only</i>
This function switches the tag positions on the stack,
returning 'yes' if both were found in the expected order.

element: node whose tag is searched for first, scanning downwards.
node:    node whose tag must then be found at a lower stack index.
Returns 'no' without touching the stack when either argument (or its
tag) is missing, when either tag is not on the stack, when fewer than
two entries lie above istackbase, or when the tags are not found in
that order.
*/
Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node )
{
Lexer* lexer = doc->lexer;
if ( lexer
&& element && element->tag
&& node && node->tag
&& TY_(IsPushed)( doc, element )
&& TY_(IsPushed)( doc, node )
&& ((lexer->istacksize - lexer->istackbase) >= 2) )
{
/* we have a chance of succeeding ... */
int i;
/* NOTE(review): the scan starts at (istacksize - istackbase - 1) but
   indexes istack[] from 0, whereas IsPushed() scans from istacksize - 1.
   When istackbase > 0 the topmost entries are not visited -- confirm
   whether that is intended. */
for (i = (lexer->istacksize - lexer->istackbase - 1); i >= 0; --i)
{
if (lexer->istack[i].tag == element->tag) {
/* found the element tag - phew */
IStack *istack1 = &lexer->istack[i];
IStack *istack2 = NULL;
--i; /* back one more, and continue */
/* the inner scan shares `i`, so if it fails the outer loop ends too */
for ( ; i >= 0; --i)
{
if (lexer->istack[i].tag == node->tag)
{
/* found the node tag as well */
istack2 = &lexer->istack[i];
break;
}
}
if ( istack2 )
{
/* perform the swap (whole entries are exchanged by value) */
IStack tmp_istack = *istack2;
*istack2 = *istack1;
*istack1 = tmp_istack;
return yes;
}
}
}
}
return no;
}
/*
  Schedule one specific element of the inline stack for insertion.
  Unlike InlineDup(), the wanted element need not be the one at the
  bottom of the active stack: the entries are searched downwards for
  the first one whose tag equals element->tag, and the lexer's insert
  position is set to it ("node" is remembered as the node to hand back
  afterwards).

  Returns yes if the element was found and scheduled, no otherwise.
*/
Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element )
{
    Lexer* lexer = doc->lexer;
    int n, i;

    if ( !element || element->tag == NULL )
        return no;

    n = lexer->istacksize - lexer->istackbase;
    if ( !(n > 0) )
        return no;

    i = n;
    while ( --i >= 0 )
    {
        if (lexer->istack[i].tag != element->tag)
            continue;

        /* found our element tag - insert it */
        lexer->insert = &(lexer->istack[i]);
        lexer->inode = node;
        return yes;
    }

    return no;
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

959
src/language.c

@ -0,0 +1,959 @@
/*
* language.c
* Localization support for HTML Tidy.
*
* (c) 2015 HTACG
* See tidy.h and access.h for the copyright notice.
*
* Created by Jim Derry on 11/28/15.
*/
#include "language.h"
#include "language_en.h"
#if SUPPORT_LOCALIZATIONS
#include "language_en_gb.h"
#include "language_es.h"
#include "language_es_mx.h"
#include "language_zh_cn.h"
#include "language_fr.h"
#endif
#include "tmbstr.h"
#include "locale.h"
/**
* This structure type provides universal access to all of Tidy's strings.
*
* - currentLanguage:  the language consulted first for string lookups.
* - fallbackLanguage: consulted when the current language lacks a string.
* - languages:        NULL-terminated list of every installed language.
*
* NOTE(review): `languages` is a flexible array member that is given a
* static initializer below; standard C does not permit that, it relies
* on a compiler extension -- confirm all target compilers accept it.
*/
typedef struct {
languageDefinition *currentLanguage;
languageDefinition *fallbackLanguage;
languageDefinition *languages[];
} tidyLanguagesType;
/**
* This single structure contains all localizations. Note that we preset
* both `.currentLanguage` and `.fallbackLanguage` to language_en, which
* is Tidy's default language. The additional localizations are only
* compiled in when SUPPORT_LOCALIZATIONS is enabled.
*/
static tidyLanguagesType tidyLanguages = {
&language_en, /* current language */
&language_en, /* first fallback language */
{
/* Required localization! */
&language_en,
#if SUPPORT_LOCALIZATIONS
/* These additional languages are installed. */
&language_en_gb,
&language_es,
&language_es_mx,
&language_zh_cn,
&language_fr,
#endif
NULL /* This array MUST be null terminated; lookups stop at NULL. */
}
};
/**
 * This structure maps old-fashioned Windows strings
 * to proper POSIX names (modern Windows already uses
 * POSIX names).
 *
 * Keys are matched exactly against the lower-cased input, and the first
 * matching entry wins. Values may use either `-` or `_` as separator;
 * tidyNormalizedLocaleName() rewrites the separator to `_`.
 *
 * The table MUST end with the { NULL, NULL } sentinel.
 */
static const tidyLocaleMapItem localeMappings[] = {
    { "america", "en_us" },
    { "american english", "en_us" },
    { "american-english", "en_us" },
    { "american", "en_us" },
    { "aus", "en_au" },
    { "australia", "en_au" },
    { "australian", "en_au" },
    { "austria", "de_at" },
    { "aut", "de_at" },
    { "bel", "nl_be" },
    { "belgian", "nl_be" },
    { "belgium", "nl_be" },
    { "bra", "pt-br" },
    { "brazil", "pt-br" },
    { "britain", "en_gb" },
    { "can", "en_ca" },
    { "canada", "en_ca" },
    { "canadian", "en_ca" },
    { "che", "de_ch" },
    { "china", "zh_cn" },
    { "chinese-simplified", "zh" },
    { "chinese-traditional", "zh_tw" },
    { "chinese", "zh" },
    { "chn", "zh_cn" },
    { "chs", "zh" },
    { "cht", "zh_tw" },
    { "csy", "cs" },
    { "cze", "cs_cz" },
    { "czech", "cs_cz" },
    { "dan", "da" },
    { "danish", "da" },
    { "dea", "de_at" },
    { "denmark", "da_dk" },
    { "des", "de_ch" },
    { "deu", "de" },
    { "dnk", "da_dk" },
    { "dutch-belgian", "nl_be" },
    { "dutch", "nl" },
    { "ell", "el" },
    { "ena", "en_au" },
    { "enc", "en_ca" },
    { "eng", "en_gb" }, /* was the typo "eb_gb" */
    { "england", "en_gb" },
    { "english-american", "en_us" },
    { "english-aus", "en_au" },
    { "english-can", "en_ca" },
    { "english-nz", "en_nz" },
    { "english-uk", "en_gb" }, /* was the typo "eb_gb" */
    { "english-us", "en_us" },
    { "english-usa", "en_us" },
    { "english", "en" },
    { "enu", "en_us" },
    { "enz", "en_nz" },
    { "esm", "es-mx" },
    { "esn", "es" },
    { "esp", "es" },
    { "fin", "fi" },
    { "finland", "fi_fi" },
    { "finnish", "fi" },
    { "fra", "fr" },
    { "france", "fr_fr" },
    { "frb", "fr_be" },
    { "frc", "fr_ca" },
    { "french-belgian", "fr_be" },
    { "french-canadian", "fr_ca" },
    { "french-swiss", "fr_ch" },
    { "french", "fr" },
    { "frs", "fr_ch" },
    { "gbr", "en_gb" },
    { "german-austrian", "de_at" },
    { "german-swiss", "de_ch" },
    { "german", "de" },
    { "germany", "de_de" },
    { "grc", "el_gr" },
    { "great britain", "en_gb" },
    { "greece", "el_gr" },
    { "greek", "el" },
    { "hkg", "zh_hk" },
    { "holland", "nl_nl" },
    { "hong kong", "zh_hk" },
    { "hong-kong", "zh_hk" },
    { "hun", "hu" },
    { "hungarian", "hu" },
    { "hungary", "hu_hu" },
    { "iceland", "is_is" },
    { "icelandic", "is" },
    { "ireland", "en_ie" },
    { "irl", "en_ie" },
    { "isl", "is" },
    { "ita", "it" },
    /* NOTE(review): duplicate key -- the entry above always matches
       first, so this one is never selected. */
    { "ita", "it_it" },
    { "italian-swiss", "it_ch" },
    { "italian", "it" },
    { "italy", "it_it" },
    { "its", "it_ch" },
    { "japan", "ja_jp" },
    { "japanese", "ja" },
    { "jpn", "ja" },
    { "kor", "ko" },
    { "korea", "ko_kr" },
    { "korean", "ko" },
    { "mex", "es-mx" },
    { "mexico", "es-mx" },
    { "netherlands", "nl_nl" },
    { "new zealand", "en_nz" },
    { "new-zealand", "en_nz" },
    { "nlb", "nl_be" },
    { "nld", "nl" },
    { "non", "nn" },
    { "nor", "nb" },
    { "norway", "no" },
    { "norwegian-bokmal", "nb" },
    { "norwegian-nynorsk", "nn" },
    { "norwegian", "no" },
    { "nz", "en_nz" },
    { "nzl", "en_nz" },
    { "plk", "pl" },
    { "pol", "pl-pl" },
    { "poland", "pl-pl" },
    { "polish", "pl" },
    { "portugal", "pt-pt" },
    { "portuguese-brazil", "pt-br" },
    { "portuguese", "pt" },
    { "pr china", "zh_cn" },
    { "pr-china", "zh_cn" },
    { "prt", "pt-pt" },
    { "ptb", "pt-br" },
    { "ptg", "pt" },
    { "rus", "ru" },
    { "russia", "ru-ru" },
    { "russian", "ru" },
    { "sgp", "zh_sg" },
    { "singapore", "zh_sg" },
    { "sky", "sk" },
    { "slovak", "sk" },
    { "spain", "es-es" },
    { "spanish-mexican", "es-mx" },
    { "spanish-modern", "es" },
    { "spanish", "es" },
    { "sve", "sv" },
    { "svk", "sk-sk" },
    { "swe", "sv-se" },
    { "sweden", "sv-se" },
    { "swedish", "sv" },
    { "swiss", "de_ch" },
    { "switzerland", "de_ch" },
    { "taiwan", "zh_tw" },
    { "trk", "tr" },
    { "tur", "tr-tr" },
    { "turkey", "tr-tr" },
    { "turkish", "tr" },
    { "twn", "zh_tw" },
    { "uk", "en_gb" },
    { "united kingdom", "en_gb" },
    { "united states", "en_us" },
    { "united-kingdom", "en_gb" },
    { "united-states", "en_us" },
    { "us", "en_us" },
    { "usa", "en_us" },
    /* MUST be last. */
    { NULL, NULL }
};
/**
* LibTidy users may want to use `TidyReportFilter3` to enable their own
* localization lookup features. Because Tidy's error codes are enums, the
* specific values can change over time. This table will ensure that LibTidy
* users always have a static value available for use.
*/
static const tidyErrorFilterKeyItem tidyErrorFilterKeysStruct[] = {
/* This block of codes comes from the `tidyErrorCodes` enum. */
{ "CODES_TIDY_ERROR_FIRST", CODES_TIDY_ERROR_FIRST },
{ "MISSING_SEMICOLON", MISSING_SEMICOLON },
{ "MISSING_SEMICOLON_NCR", MISSING_SEMICOLON_NCR },
{ "UNKNOWN_ENTITY", UNKNOWN_ENTITY },
{ "UNESCAPED_AMPERSAND", UNESCAPED_AMPERSAND },
{ "APOS_UNDEFINED", APOS_UNDEFINED },
{ "MISSING_ENDTAG_FOR", MISSING_ENDTAG_FOR },
{ "MISSING_ENDTAG_BEFORE", MISSING_ENDTAG_BEFORE },
{ "DISCARDING_UNEXPECTED", DISCARDING_UNEXPECTED },
{ "NESTED_EMPHASIS", NESTED_EMPHASIS },
{ "NON_MATCHING_ENDTAG", NON_MATCHING_ENDTAG },
{ "TAG_NOT_ALLOWED_IN", TAG_NOT_ALLOWED_IN },
{ "MISSING_STARTTAG", MISSING_STARTTAG },
{ "UNEXPECTED_ENDTAG", UNEXPECTED_ENDTAG },
{ "USING_BR_INPLACE_OF", USING_BR_INPLACE_OF },
{ "INSERTING_TAG", INSERTING_TAG },
{ "SUSPECTED_MISSING_QUOTE", SUSPECTED_MISSING_QUOTE },
{ "MISSING_TITLE_ELEMENT", MISSING_TITLE_ELEMENT },
{ "DUPLICATE_FRAMESET", DUPLICATE_FRAMESET },
{ "CANT_BE_NESTED", CANT_BE_NESTED },
{ "OBSOLETE_ELEMENT", OBSOLETE_ELEMENT },
{ "PROPRIETARY_ELEMENT", PROPRIETARY_ELEMENT },
{ "ELEMENT_VERS_MISMATCH_ERROR", ELEMENT_VERS_MISMATCH_ERROR },
{ "ELEMENT_VERS_MISMATCH_WARN", ELEMENT_VERS_MISMATCH_WARN },
{ "UNKNOWN_ELEMENT", UNKNOWN_ELEMENT },
{ "TRIM_EMPTY_ELEMENT", TRIM_EMPTY_ELEMENT },
{ "COERCE_TO_ENDTAG", COERCE_TO_ENDTAG },
{ "ILLEGAL_NESTING", ILLEGAL_NESTING },
{ "NOFRAMES_CONTENT", NOFRAMES_CONTENT },
{ "CONTENT_AFTER_BODY", CONTENT_AFTER_BODY },
{ "INCONSISTENT_VERSION", INCONSISTENT_VERSION },
{ "MALFORMED_COMMENT", MALFORMED_COMMENT },
{ "BAD_COMMENT_CHARS", BAD_COMMENT_CHARS },
{ "BAD_XML_COMMENT", BAD_XML_COMMENT },
{ "BAD_CDATA_CONTENT", BAD_CDATA_CONTENT },
{ "INCONSISTENT_NAMESPACE", INCONSISTENT_NAMESPACE },
{ "DOCTYPE_AFTER_TAGS", DOCTYPE_AFTER_TAGS },
{ "MALFORMED_DOCTYPE", MALFORMED_DOCTYPE },
{ "UNEXPECTED_END_OF_FILE", UNEXPECTED_END_OF_FILE },
{ "DTYPE_NOT_UPPER_CASE", DTYPE_NOT_UPPER_CASE },
{ "TOO_MANY_ELEMENTS", TOO_MANY_ELEMENTS },
{ "UNESCAPED_ELEMENT", UNESCAPED_ELEMENT },
{ "NESTED_QUOTATION", NESTED_QUOTATION },
{ "ELEMENT_NOT_EMPTY", ELEMENT_NOT_EMPTY },
{ "ENCODING_IO_CONFLICT", ENCODING_IO_CONFLICT },
{ "MIXED_CONTENT_IN_BLOCK", MIXED_CONTENT_IN_BLOCK },
{ "MISSING_DOCTYPE", MISSING_DOCTYPE },
{ "SPACE_PRECEDING_XMLDECL", SPACE_PRECEDING_XMLDECL },
{ "TOO_MANY_ELEMENTS_IN", TOO_MANY_ELEMENTS_IN },
{ "UNEXPECTED_ENDTAG_IN", UNEXPECTED_ENDTAG_IN },
{ "REPLACING_ELEMENT", REPLACING_ELEMENT },
{ "REPLACING_UNEX_ELEMENT", REPLACING_UNEX_ELEMENT },
{ "COERCE_TO_ENDTAG_WARN", COERCE_TO_ENDTAG_WARN },
{ "UNKNOWN_ATTRIBUTE", UNKNOWN_ATTRIBUTE },
{ "INSERTING_ATTRIBUTE", INSERTING_ATTRIBUTE },
{ "INSERTING_AUTO_ATTRIBUTE", INSERTING_AUTO_ATTRIBUTE },
{ "MISSING_ATTR_VALUE", MISSING_ATTR_VALUE },
{ "BAD_ATTRIBUTE_VALUE", BAD_ATTRIBUTE_VALUE },
{ "UNEXPECTED_GT", UNEXPECTED_GT },
{ "PROPRIETARY_ATTRIBUTE", PROPRIETARY_ATTRIBUTE },
{ "MISMATCHED_ATTRIBUTE_ERROR", MISMATCHED_ATTRIBUTE_ERROR },
{ "MISMATCHED_ATTRIBUTE_WARN", MISMATCHED_ATTRIBUTE_WARN },
{ "PROPRIETARY_ATTR_VALUE", PROPRIETARY_ATTR_VALUE },
{ "REPEATED_ATTRIBUTE", REPEATED_ATTRIBUTE },
{ "MISSING_IMAGEMAP", MISSING_IMAGEMAP },
{ "XML_ATTRIBUTE_VALUE", XML_ATTRIBUTE_VALUE },
{ "UNEXPECTED_QUOTEMARK", UNEXPECTED_QUOTEMARK },
{ "MISSING_QUOTEMARK", MISSING_QUOTEMARK },
{ "ID_NAME_MISMATCH", ID_NAME_MISMATCH },
{ "BACKSLASH_IN_URI", BACKSLASH_IN_URI },
{ "FIXED_BACKSLASH", FIXED_BACKSLASH },
{ "ILLEGAL_URI_REFERENCE", ILLEGAL_URI_REFERENCE },
{ "ESCAPED_ILLEGAL_URI", ESCAPED_ILLEGAL_URI },
{ "NEWLINE_IN_URI", NEWLINE_IN_URI },
{ "ANCHOR_NOT_UNIQUE", ANCHOR_NOT_UNIQUE },
{ "JOINING_ATTRIBUTE", JOINING_ATTRIBUTE },
{ "UNEXPECTED_EQUALSIGN", UNEXPECTED_EQUALSIGN },
{ "ATTR_VALUE_NOT_LCASE", ATTR_VALUE_NOT_LCASE },
{ "XML_ID_SYNTAX", XML_ID_SYNTAX },
{ "INVALID_ATTRIBUTE", INVALID_ATTRIBUTE },
{ "BAD_ATTRIBUTE_VALUE_REPLACED", BAD_ATTRIBUTE_VALUE_REPLACED },
{ "INVALID_XML_ID", INVALID_XML_ID },
{ "UNEXPECTED_END_OF_FILE_ATTR", UNEXPECTED_END_OF_FILE_ATTR },
{ "MISSING_ATTRIBUTE", MISSING_ATTRIBUTE },
{ "WHITE_IN_URI", WHITE_IN_URI },
{ "REMOVED_HTML5", REMOVED_HTML5 },
{ "BAD_SUMMARY_HTML5", BAD_SUMMARY_HTML5 },
{ "PREVIOUS_LOCATION", PREVIOUS_LOCATION },
{ "VENDOR_SPECIFIC_CHARS", VENDOR_SPECIFIC_CHARS },
{ "INVALID_SGML_CHARS", INVALID_SGML_CHARS },
{ "INVALID_UTF8", INVALID_UTF8 },
{ "INVALID_UTF16", INVALID_UTF16 },
{ "ENCODING_MISMATCH", ENCODING_MISMATCH },
{ "INVALID_URI", INVALID_URI },
{ "INVALID_NCR", INVALID_NCR },
{ "CODES_TIDY_ERROR_LAST", CODES_TIDY_ERROR_LAST },
#if SUPPORT_ACCESSIBILITY_CHECKS
/* This block of codes comes from the `accessErrorCodes` enum. */
{ "FIRST_ACCESS_ERR", FIRST_ACCESS_ERR },
{ "IMG_MISSING_ALT", IMG_MISSING_ALT },
{ "IMG_ALT_SUSPICIOUS_FILENAME", IMG_ALT_SUSPICIOUS_FILENAME },
{ "IMG_ALT_SUSPICIOUS_FILE_SIZE", IMG_ALT_SUSPICIOUS_FILE_SIZE },
{ "IMG_ALT_SUSPICIOUS_PLACEHOLDER", IMG_ALT_SUSPICIOUS_PLACEHOLDER },
{ "IMG_ALT_SUSPICIOUS_TOO_LONG", IMG_ALT_SUSPICIOUS_TOO_LONG },
{ "IMG_MISSING_ALT_BULLET", IMG_MISSING_ALT_BULLET },
{ "IMG_MISSING_ALT_H_RULE", IMG_MISSING_ALT_H_RULE },
{ "IMG_MISSING_LONGDESC_DLINK", IMG_MISSING_LONGDESC_DLINK },
{ "IMG_MISSING_DLINK", IMG_MISSING_DLINK },
{ "IMG_MISSING_LONGDESC", IMG_MISSING_LONGDESC },
{ "LONGDESC_NOT_REQUIRED", LONGDESC_NOT_REQUIRED },
{ "IMG_BUTTON_MISSING_ALT", IMG_BUTTON_MISSING_ALT },
{ "APPLET_MISSING_ALT", APPLET_MISSING_ALT },
{ "OBJECT_MISSING_ALT", OBJECT_MISSING_ALT },
{ "AUDIO_MISSING_TEXT_WAV", AUDIO_MISSING_TEXT_WAV },
{ "AUDIO_MISSING_TEXT_AU", AUDIO_MISSING_TEXT_AU },
{ "AUDIO_MISSING_TEXT_AIFF", AUDIO_MISSING_TEXT_AIFF },
{ "AUDIO_MISSING_TEXT_SND", AUDIO_MISSING_TEXT_SND },
{ "AUDIO_MISSING_TEXT_RA", AUDIO_MISSING_TEXT_RA },
{ "AUDIO_MISSING_TEXT_RM", AUDIO_MISSING_TEXT_RM },
{ "FRAME_MISSING_LONGDESC", FRAME_MISSING_LONGDESC },
{ "AREA_MISSING_ALT", AREA_MISSING_ALT },
{ "SCRIPT_MISSING_NOSCRIPT", SCRIPT_MISSING_NOSCRIPT },
{ "ASCII_REQUIRES_DESCRIPTION", ASCII_REQUIRES_DESCRIPTION },
{ "IMG_MAP_SERVER_REQUIRES_TEXT_LINKS", IMG_MAP_SERVER_REQUIRES_TEXT_LINKS },
{ "MULTIMEDIA_REQUIRES_TEXT", MULTIMEDIA_REQUIRES_TEXT },
{ "IMG_MAP_CLIENT_MISSING_TEXT_LINKS", IMG_MAP_CLIENT_MISSING_TEXT_LINKS },
{ "INFORMATION_NOT_CONVEYED_IMAGE", INFORMATION_NOT_CONVEYED_IMAGE },
{ "INFORMATION_NOT_CONVEYED_APPLET", INFORMATION_NOT_CONVEYED_APPLET },
{ "INFORMATION_NOT_CONVEYED_OBJECT", INFORMATION_NOT_CONVEYED_OBJECT },
{ "INFORMATION_NOT_CONVEYED_SCRIPT", INFORMATION_NOT_CONVEYED_SCRIPT },
{ "INFORMATION_NOT_CONVEYED_INPUT", INFORMATION_NOT_CONVEYED_INPUT },
{ "COLOR_CONTRAST_TEXT", COLOR_CONTRAST_TEXT },
{ "COLOR_CONTRAST_LINK", COLOR_CONTRAST_LINK },
{ "COLOR_CONTRAST_ACTIVE_LINK", COLOR_CONTRAST_ACTIVE_LINK },
{ "COLOR_CONTRAST_VISITED_LINK", COLOR_CONTRAST_VISITED_LINK },
{ "DOCTYPE_MISSING", DOCTYPE_MISSING },
{ "STYLE_SHEET_CONTROL_PRESENTATION", STYLE_SHEET_CONTROL_PRESENTATION },
{ "HEADERS_IMPROPERLY_NESTED", HEADERS_IMPROPERLY_NESTED },
{ "POTENTIAL_HEADER_BOLD", POTENTIAL_HEADER_BOLD },
{ "POTENTIAL_HEADER_ITALICS", POTENTIAL_HEADER_ITALICS },
{ "POTENTIAL_HEADER_UNDERLINE", POTENTIAL_HEADER_UNDERLINE },
{ "HEADER_USED_FORMAT_TEXT", HEADER_USED_FORMAT_TEXT },
{ "LIST_USAGE_INVALID_UL", LIST_USAGE_INVALID_UL },
{ "LIST_USAGE_INVALID_OL", LIST_USAGE_INVALID_OL },
{ "LIST_USAGE_INVALID_LI", LIST_USAGE_INVALID_LI },
{ "INDICATE_CHANGES_IN_LANGUAGE", INDICATE_CHANGES_IN_LANGUAGE },
{ "LANGUAGE_NOT_IDENTIFIED", LANGUAGE_NOT_IDENTIFIED },
{ "LANGUAGE_INVALID", LANGUAGE_INVALID },
{ "DATA_TABLE_MISSING_HEADERS", DATA_TABLE_MISSING_HEADERS },
{ "DATA_TABLE_MISSING_HEADERS_COLUMN", DATA_TABLE_MISSING_HEADERS_COLUMN },
{ "DATA_TABLE_MISSING_HEADERS_ROW", DATA_TABLE_MISSING_HEADERS_ROW },
{ "DATA_TABLE_REQUIRE_MARKUP_COLUMN_HEADERS", DATA_TABLE_REQUIRE_MARKUP_COLUMN_HEADERS },
{ "DATA_TABLE_REQUIRE_MARKUP_ROW_HEADERS", DATA_TABLE_REQUIRE_MARKUP_ROW_HEADERS },
{ "LAYOUT_TABLES_LINEARIZE_PROPERLY", LAYOUT_TABLES_LINEARIZE_PROPERLY },
{ "LAYOUT_TABLE_INVALID_MARKUP", LAYOUT_TABLE_INVALID_MARKUP },
{ "TABLE_MISSING_SUMMARY", TABLE_MISSING_SUMMARY },
{ "TABLE_SUMMARY_INVALID_NULL", TABLE_SUMMARY_INVALID_NULL },
{ "TABLE_SUMMARY_INVALID_SPACES", TABLE_SUMMARY_INVALID_SPACES },
{ "TABLE_SUMMARY_INVALID_PLACEHOLDER", TABLE_SUMMARY_INVALID_PLACEHOLDER },
{ "TABLE_MISSING_CAPTION", TABLE_MISSING_CAPTION },
{ "TABLE_MAY_REQUIRE_HEADER_ABBR", TABLE_MAY_REQUIRE_HEADER_ABBR },
{ "TABLE_MAY_REQUIRE_HEADER_ABBR_NULL", TABLE_MAY_REQUIRE_HEADER_ABBR_NULL },
{ "TABLE_MAY_REQUIRE_HEADER_ABBR_SPACES", TABLE_MAY_REQUIRE_HEADER_ABBR_SPACES },
{ "STYLESHEETS_REQUIRE_TESTING_LINK", STYLESHEETS_REQUIRE_TESTING_LINK },
{ "STYLESHEETS_REQUIRE_TESTING_STYLE_ELEMENT", STYLESHEETS_REQUIRE_TESTING_STYLE_ELEMENT },
{ "STYLESHEETS_REQUIRE_TESTING_STYLE_ATTR", STYLESHEETS_REQUIRE_TESTING_STYLE_ATTR },
{ "FRAME_SRC_INVALID", FRAME_SRC_INVALID },
{ "TEXT_EQUIVALENTS_REQUIRE_UPDATING_APPLET", TEXT_EQUIVALENTS_REQUIRE_UPDATING_APPLET },
{ "TEXT_EQUIVALENTS_REQUIRE_UPDATING_SCRIPT", TEXT_EQUIVALENTS_REQUIRE_UPDATING_SCRIPT },
{ "TEXT_EQUIVALENTS_REQUIRE_UPDATING_OBJECT", TEXT_EQUIVALENTS_REQUIRE_UPDATING_OBJECT },
{ "PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_SCRIPT", PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_SCRIPT },
{ "PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_OBJECT", PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_OBJECT },
{ "PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_EMBED", PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_EMBED },
{ "PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_APPLET", PROGRAMMATIC_OBJECTS_REQUIRE_TESTING_APPLET },
{ "FRAME_MISSING_NOFRAMES", FRAME_MISSING_NOFRAMES },
{ "NOFRAMES_INVALID_NO_VALUE", NOFRAMES_INVALID_NO_VALUE },
{ "NOFRAMES_INVALID_CONTENT", NOFRAMES_INVALID_CONTENT },
{ "NOFRAMES_INVALID_LINK", NOFRAMES_INVALID_LINK },
{ "REMOVE_FLICKER_SCRIPT", REMOVE_FLICKER_SCRIPT },
{ "REMOVE_FLICKER_OBJECT", REMOVE_FLICKER_OBJECT },
{ "REMOVE_FLICKER_EMBED", REMOVE_FLICKER_EMBED },
{ "REMOVE_FLICKER_APPLET", REMOVE_FLICKER_APPLET },
{ "REMOVE_FLICKER_ANIMATED_GIF", REMOVE_FLICKER_ANIMATED_GIF },
{ "REMOVE_BLINK_MARQUEE", REMOVE_BLINK_MARQUEE },
{ "REMOVE_AUTO_REFRESH", REMOVE_AUTO_REFRESH },
{ "REMOVE_AUTO_REDIRECT", REMOVE_AUTO_REDIRECT },
{ "ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_SCRIPT", ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_SCRIPT },
{ "ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_OBJECT", ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_OBJECT },
{ "ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_APPLET", ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_APPLET },
{ "ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_EMBED", ENSURE_PROGRAMMATIC_OBJECTS_ACCESSIBLE_EMBED },
{ "IMAGE_MAP_SERVER_SIDE_REQUIRES_CONVERSION", IMAGE_MAP_SERVER_SIDE_REQUIRES_CONVERSION },
{ "SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_DOWN", SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_DOWN },
{ "SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_UP", SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_UP },
{ "SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_CLICK", SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_CLICK },
{ "SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OVER", SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OVER },
{ "SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OUT", SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_OUT },
{ "SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_MOVE", SCRIPT_NOT_KEYBOARD_ACCESSIBLE_ON_MOUSE_MOVE },
{ "NEW_WINDOWS_REQUIRE_WARNING_NEW", NEW_WINDOWS_REQUIRE_WARNING_NEW },
{ "NEW_WINDOWS_REQUIRE_WARNING_BLANK", NEW_WINDOWS_REQUIRE_WARNING_BLANK },
{ "LABEL_NEEDS_REPOSITIONING_BEFORE_INPUT", LABEL_NEEDS_REPOSITIONING_BEFORE_INPUT },
{ "LABEL_NEEDS_REPOSITIONING_AFTER_INPUT", LABEL_NEEDS_REPOSITIONING_AFTER_INPUT },
{ "FORM_CONTROL_REQUIRES_DEFAULT_TEXT", FORM_CONTROL_REQUIRES_DEFAULT_TEXT },
{ "FORM_CONTROL_DEFAULT_TEXT_INVALID_NULL", FORM_CONTROL_DEFAULT_TEXT_INVALID_NULL },
{ "FORM_CONTROL_DEFAULT_TEXT_INVALID_SPACES", FORM_CONTROL_DEFAULT_TEXT_INVALID_SPACES },
{ "REPLACE_DEPRECATED_HTML_APPLET", REPLACE_DEPRECATED_HTML_APPLET },
{ "REPLACE_DEPRECATED_HTML_BASEFONT", REPLACE_DEPRECATED_HTML_BASEFONT },
{ "REPLACE_DEPRECATED_HTML_CENTER", REPLACE_DEPRECATED_HTML_CENTER },
{ "REPLACE_DEPRECATED_HTML_DIR", REPLACE_DEPRECATED_HTML_DIR },
{ "REPLACE_DEPRECATED_HTML_FONT", REPLACE_DEPRECATED_HTML_FONT },
{ "REPLACE_DEPRECATED_HTML_ISINDEX", REPLACE_DEPRECATED_HTML_ISINDEX },
{ "REPLACE_DEPRECATED_HTML_MENU", REPLACE_DEPRECATED_HTML_MENU },
{ "REPLACE_DEPRECATED_HTML_S", REPLACE_DEPRECATED_HTML_S },
{ "REPLACE_DEPRECATED_HTML_STRIKE", REPLACE_DEPRECATED_HTML_STRIKE },
{ "REPLACE_DEPRECATED_HTML_U", REPLACE_DEPRECATED_HTML_U },
{ "FRAME_MISSING_TITLE", FRAME_MISSING_TITLE },
{ "FRAME_TITLE_INVALID_NULL", FRAME_TITLE_INVALID_NULL },
{ "FRAME_TITLE_INVALID_SPACES", FRAME_TITLE_INVALID_SPACES },
{ "ASSOCIATE_LABELS_EXPLICITLY", ASSOCIATE_LABELS_EXPLICITLY },
{ "ASSOCIATE_LABELS_EXPLICITLY_FOR", ASSOCIATE_LABELS_EXPLICITLY_FOR },
{ "ASSOCIATE_LABELS_EXPLICITLY_ID", ASSOCIATE_LABELS_EXPLICITLY_ID },
{ "LINK_TEXT_NOT_MEANINGFUL", LINK_TEXT_NOT_MEANINGFUL },
{ "LINK_TEXT_MISSING", LINK_TEXT_MISSING },
{ "LINK_TEXT_TOO_LONG", LINK_TEXT_TOO_LONG },
{ "LINK_TEXT_NOT_MEANINGFUL_CLICK_HERE", LINK_TEXT_NOT_MEANINGFUL_CLICK_HERE },
{ "LINK_TEXT_NOT_MEANINGFUL_MORE", LINK_TEXT_NOT_MEANINGFUL_MORE },
{ "LINK_TEXT_NOT_MEANINGFUL_FOLLOW_THIS", LINK_TEXT_NOT_MEANINGFUL_FOLLOW_THIS },
{ "METADATA_MISSING", METADATA_MISSING },
{ "METADATA_MISSING_LINK", METADATA_MISSING_LINK },
{ "METADATA_MISSING_REDIRECT_AUTOREFRESH", METADATA_MISSING_REDIRECT_AUTOREFRESH },
{ "SKIPOVER_ASCII_ART", SKIPOVER_ASCII_ART },
{ "LAST_ACCESS_ERR", LAST_ACCESS_ERR },
#endif
/* This block of codes comes from the `tidyMessagesMisc` enum. */
{ "STRING_UNKNOWN_OPTION", STRING_UNKNOWN_OPTION },
{ "STRING_MISSING_MALFORMED", STRING_MISSING_MALFORMED },
{ "STRING_DOCTYPE_GIVEN", STRING_DOCTYPE_GIVEN },
{ "STRING_HTML_PROPRIETARY", STRING_HTML_PROPRIETARY },
{ "STRING_CONTENT_LOOKS", STRING_CONTENT_LOOKS },
{ "STRING_NO_SYSID", STRING_NO_SYSID },
{ NULL, 0 },
};
/**
 * Given an error code, return the key string associated with it.
 * The key table is searched linearly up to its NULL-key sentinel;
 * codes that are not listed yield the literal "UNDEFINED".
 */
ctmbstr tidyErrorCodeAsString(uint code)
{
    uint idx;

    for (idx = 0; tidyErrorFilterKeysStruct[idx].key; ++idx)
    {
        if ( tidyErrorFilterKeysStruct[idx].value != code )
            continue;
        return tidyErrorFilterKeysStruct[idx].key;
    }

    return "UNDEFINED";
}
/**
 * The real string lookup function.
 *
 * Searches the message dictionary of `definition` for the entry whose key
 * is `messageType` and whose plural form is the one that the language's
 * whichPluralForm() selects for the quantity `plural`. The search stops at
 * the first entry without a value.
 *
 * Returns the matching string, or NULL when the language has no entry.
 */
ctmbstr TY_(tidyLocalizedString)( uint messageType, languageDefinition *definition, uint plural )
{
    languageDictionary *dictionary = &definition->messages;
    uint pluralForm = definition->whichPluralForm(plural);
    int i = 0;

    while ( (*dictionary)[i].value )
    {
        if ( (*dictionary)[i].key == messageType
             && (*dictionary)[i].pluralForm == pluralForm )
        {
            return (*dictionary)[i].value;
        }
        ++i;
    }

    return NULL;
}
/**
 * Provides a string given `messageType` in the current
 * localization, returning the correct plural form given
 * `quantity`.
 *
 * Lookup order: the current language, then the fallback language (if one
 * is set), then built-in English for `quantity`, and finally built-in
 * English in the singular. The result is NULL only if all of them fail.
 *
 * This isn't currently highly optimized; rewriting some
 * of infrastructure to use hash lookups is a preferred
 * future optimization.
 */
ctmbstr tidyLocalizedStringN( uint messageType, uint quantity )
{
    ctmbstr found;

    found = TY_(tidyLocalizedString)( messageType, tidyLanguages.currentLanguage, quantity);
    if (found)
        return found;

    if ( tidyLanguages.fallbackLanguage )
    {
        found = TY_(tidyLocalizedString)( messageType, tidyLanguages.fallbackLanguage, quantity);
        if (found)
            return found;
    }

    /* Fallback to en which is built in. */
    found = TY_(tidyLocalizedString)( messageType, &language_en, quantity);
    if (found)
        return found;

    /* Last resort: Fallback to en singular which is built in. */
    return TY_(tidyLocalizedString)( messageType, &language_en, 1);
}
/**
 * Provides a string given `messageType` in the current
 * localization, in the non-plural form. This is a convenience
 * wrapper that asks tidyLocalizedStringN() for a quantity of 1.
 *
 * This isn't currently highly optimized; rewriting some
 * of infrastructure to use hash lookups is a preferred
 * future optimization.
 */
ctmbstr tidyLocalizedString( uint messageType )
{
    ctmbstr singular = tidyLocalizedStringN( messageType, 1 );

    return singular;
}
/**
 ** Determines the system (environment) locale without leaving the
 ** process locale changed: the locale that is active on entry is saved
 ** and put back before returning. If it cannot be saved, the "C" locale
 ** is restored instead, which is what Tidy has always run under.
 **
 ** @param result Ignored on input; kept for interface compatibility.
 ** @return A newly malloc'ed copy of the system locale name which the
 **         caller must release with free(), or NULL on failure.
 */
tmbstr tidySystemLocale(tmbstr result)
{
    ctmbstr temp;
    tmbstr saved = NULL;

    /* Remember the locale in force right now. The string returned by
       setlocale() may be overwritten by later calls, so copy it. */
    temp = setlocale( LC_ALL, NULL );
    if ( temp && ( saved = (tmbstr) malloc( strlen( temp ) + 1 ) ) )
        strcpy( saved, temp );

    result = NULL;

    /* This should set the OS locale. */
    setlocale( LC_ALL, "" );

    /* This should read the current locale. */
    temp = setlocale( LC_ALL, NULL );

    /* Make a new copy of the string, because temp
       always points to the current locale. */
    if ( temp && ( result = (tmbstr) malloc( strlen( temp ) + 1 ) ) )
        strcpy( result, temp );

    /* Put the caller's locale back (or "C" if it could not be saved). */
    setlocale( LC_ALL, saved ? saved : "C" );
    free( saved );

    return result;
}
/**
 * Retrieves the POSIX name for a string. Result is a static char so please
 * don't try to free it (and note that every call overwrites it). If the
 * name looks like a cc_ll identifier, we will return it if there's no
 * other match.
 *
 * The input is lower-cased and looked up among the legacy Windows names;
 * on a match the mapped POSIX name is used instead. The name is then
 * reduced to at most five characters:
 *   - five or more characters: chars 1-2, an underscore, chars 4-5
 *     (so "pt-br" becomes "pt_br");
 *   - two to four characters:  chars 1-2 only;
 *   - fewer than two:          whatever is there.
 * The result is always NUL terminated. An empty string is returned when
 * `locale` is NULL or memory cannot be obtained.
 */
tmbstr tidyNormalizedLocaleName( ctmbstr locale )
{
    uint i;
    uint len;
    uint keep;
    static char result[6] = "xx_yy";
    tmbstr search;
    ctmbstr name;

    if ( !locale || !( search = strdup(locale) ) )
    {
        result[0] = '\0';
        return result;
    }
    search = TY_(tmbstrtolower)(search);
    name = search;

    /* See if our string matches a Windows name. */
    for (i = 0; localeMappings[i].winName; ++i)
    {
        if ( strcmp( localeMappings[i].winName, search ) == 0 )
        {
            /* the table entry is static, so no copy is needed */
            name = localeMappings[i].POSIXName;
            break;
        }
    }

    /* We're going to be stupid about this and trust the user, and
       return just the first two characters if they exist and the
       4th and 5th if they exist. The worst that can happen is a
       junk language that doesn't exist and won't be set. */
    len = (uint) strlen( name );
    keep = ( len >= 5 ) ? 5 : ( len < 2 ? len : 2 );

    for ( i = 0; i < keep; i++ )
    {
        if ( i == 2 )
            result[i] = '_'; /* whatever separator was given becomes '_' */
        else
            result[i] = (char) tolower( (unsigned char) name[i] );
    }

    /* Always terminate: the buffer is static and may hold older text. */
    result[keep] = '\0';

    free( search );
    return result;
}
/**
 *  Looks up an installed language by its code.
 *
 *  Walks the NULL-terminated `tidyLanguages.languages` list and compares
 *  `languageCode` (exact, case-sensitive match) with the value of the
 *  first dictionary entry of each language, which holds that language's
 *  code.
 *
 *  @param languageCode The code to look for; must not be NULL.
 *  @return The matching languageDefinition, or NULL if none is installed.
 */
languageDefinition *TY_(tidyTestLanguage)( ctmbstr languageCode )
{
    uint slot = 0;
    languageDefinition *candidate;

    while ( ( candidate = tidyLanguages.languages[slot] ) != NULL )
    {
        if ( strcmp( candidate->messages[0].value, languageCode ) == 0 )
            return candidate;
        ++slot;
    }

    return NULL;
}
/**
 *  Tells Tidy to use a different language for output.
 *  @param languageCode A Windows or POSIX language code, and must match
 *         a TIDY_LANGUAGE for an installed language.
 *  @result Indicates that a setting was applied, but not necessarily the
 *         specific request, i.e., true indicates a language and/or region
 *         was applied. If es_mx is requested but not installed, and es is
 *         installed, then es will be selected and this function will return
 *         true. However the opposite is not true; if es is requested but
 *         not present, Tidy will not try to select from the es_XX variants.
 *         When nothing matches, the current settings are left untouched.
 */
Bool tidySetLanguage( ctmbstr languageCode )
{
    languageDefinition *exact = NULL;   /* the language as requested     */
    languageDefinition *base = NULL;    /* its two-letter base language  */
    tmbstr normalized = NULL;
    char baseCode[3] = "";

    if ( !languageCode )
        return no;

    normalized = tidyNormalizedLocaleName( languageCode );
    if ( !normalized )
        return no;

    /* The normalized code is either `ll` or `ll_cc`. Either or both
       might be installed, so test for each of them. */
    exact = TY_(tidyTestLanguage)( normalized );

    if ( strlen( normalized ) > 2 )
    {
        baseCode[0] = normalized[0];
        baseCode[1] = normalized[1];
        baseCode[2] = '\0';
        base = TY_(tidyTestLanguage)( baseCode );
    }

    if ( exact )
    {
        /* Use the request; the base language (possibly NULL) backs it up. */
        tidyLanguages.currentLanguage = exact;
        tidyLanguages.fallbackLanguage = base;
    }
    else if ( base )
    {
        /* Only the base language exists: use it, with no fallback. */
        tidyLanguages.currentLanguage = base;
        tidyLanguages.fallbackLanguage = NULL;
    }
    /* Otherwise: no change. */

    return exact || base;
}
/**
 *  Gets the code of the language currently used by Tidy, which is the
 *  value of the first dictionary entry of `tidyLanguages.currentLanguage`.
 */
ctmbstr tidyGetLanguage()
{
    return tidyLanguages.currentLanguage->messages[0].value;
}
/**
 *  Provides the string for `messageType` from the built-in default
 *  localization (`en`), looked up for a quantity of one. The current
 *  and fallback languages are not consulted.
 */
ctmbstr tidyDefaultString( uint messageType )
{
    const uint singleItem = 1;

    return TY_(tidyLocalizedString)( messageType, &language_en, singleItem );
}
/**
* Determines the true size of the `language_en` array indicating the
* number of items in the array, _not_ the highest index.
*/
const uint TY_(tidyStringKeyListSize)()
{
static uint array_size = 0;
if ( array_size == 0 )
{
while ( language_en.messages[array_size].value != NULL ) {
array_size++;
}
}
return array_size;
}
/*
* Initializes the TidyIterator to point to the first item
* in Tidy's list of localization string keys. Note that
* these are provided for documentation generation purposes
* and probably aren't useful for LibTidy implementors.
*/
TidyIterator getStringKeyList()
{
return (TidyIterator)(size_t)1;
}
/*
 *  Provides the key at the iterator's position in the `language_en`
 *  message array and advances the iterator. Once the last key has
 *  been delivered the iterator is set to 0; for an exhausted or
 *  out-of-range iterator the return value is 0. Note that this is
 *  provided for documentation generation purposes and probably isn't
 *  useful to libtidy implementors.
 */
uint getNextStringKey( TidyIterator* iter )
{
    uint key = 0;
    size_t position;
    size_t total;

    assert( iter != NULL );

    position = (size_t)*iter;
    total = TY_(tidyStringKeyListSize)();

    /* Positions are 1-based; array indexes are 0-based. */
    if ( position >= 1 && position <= total )
    {
        key = language_en.messages[ position - 1 ].key;
        ++position;
    }

    /* Anything beyond the end collapses to the end marker. */
    if ( position > total )
        position = 0;

    *iter = (TidyIterator)position;
    return key;
}
/**
* Determines the true size of the `localeMappings` array indicating the
* number of items in the array, _not_ the highest index.
*/
const uint TY_(tidyLanguageListSize)()
{
static uint array_size = 0;
if ( array_size == 0 )
{
while ( localeMappings[array_size].winName ) {
array_size++;
}
}
return array_size;
}
/**
* Initializes the TidyIterator to point to the first item
* in Tidy's structure of Windows<->POSIX local mapping.
* Items can be retrieved with getNextWindowsLanguage();
*/
TidyIterator getWindowsLanguageList()
{
return (TidyIterator)(size_t)1;
}
/**
 *  Returns the record of type `tidyLocaleMapItem` at the iterator's
 *  position in Tidy's structure of Windows<->POSIX locale mappings and
 *  advances the iterator. Once the last record has been delivered the
 *  iterator is set to 0; for an exhausted or out-of-range iterator
 *  the return value is NULL.
 */
const tidyLocaleMapItem *getNextWindowsLanguage( TidyIterator *iter )
{
    const tidyLocaleMapItem *record = NULL;
    size_t position;
    size_t total;

    assert( iter != NULL );

    position = (size_t)*iter;
    total = TY_(tidyLanguageListSize)();

    /* Positions are 1-based; array indexes are 0-based. */
    if ( position >= 1 && position <= total )
    {
        record = localeMappings + ( position - 1 );
        ++position;
    }

    /* Anything beyond the end collapses to the end marker. */
    if ( position > total )
        position = 0;

    *iter = (TidyIterator)position;
    return record;
}
/**
* Determines the number of languages installed in Tidy.
*/
const uint TY_(tidyInstalledLanguageListSize)()
{
static uint array_size = 0;
if ( array_size == 0 )
{
while ( tidyLanguages.languages[array_size] ) {
array_size++;
}
}
return array_size;
}
/**
* Initializes the TidyIterator to point to the first item
* in Tidy's list of installed language codes.
* Items can be retrieved with getNextInstalledLanguage();
*/
TidyIterator getInstalledLanguageList()
{
return (TidyIterator)(size_t)1;
}
/**
 *  Returns the code of the installed language at the iterator's
 *  position (the value of that language's first dictionary entry) and
 *  advances the iterator. Once the last language has been delivered
 *  the iterator is set to 0; for an exhausted or out-of-range
 *  iterator the return value is NULL.
 */
ctmbstr getNextInstalledLanguage( TidyIterator* iter )
{
    ctmbstr code = NULL;
    size_t position;
    size_t total;

    assert( iter != NULL );

    position = (size_t)*iter;
    total = TY_(tidyInstalledLanguageListSize)();

    /* Positions are 1-based; array indexes are 0-based. */
    if ( position >= 1 && position <= total )
    {
        code = tidyLanguages.languages[position - 1]->messages[0].value;
        ++position;
    }

    /* Anything beyond the end collapses to the end marker. */
    if ( position > total )
        position = 0;

    *iter = (TidyIterator)position;
    return code;
}
/**
* Determines the number of error codes used by Tidy.
*/
const uint TY_(tidyErrorCodeListSize)()
{
static uint array_size = 0;
if ( array_size == 0 )
{
while ( tidyErrorFilterKeysStruct[array_size].key ) {
array_size++;
}
}
return array_size;
}
/**
* Initializes the TidyIterator to point to the first item
* in Tidy's list of error codes that can be return with
* `TidyReportFilter3`.
* Items can be retrieved with getNextErrorCode();
*/
TidyIterator getErrorCodeList()
{
return (TidyIterator)(size_t)1;
}
/**
 *  Returns the error code record at the iterator's position in
 *  `tidyErrorFilterKeysStruct` and advances the iterator. Once the
 *  last record has been delivered the iterator is set to 0; for an
 *  exhausted or out-of-range iterator the return value is NULL.
 */
const tidyErrorFilterKeyItem *getNextErrorCode( TidyIterator* iter )
{
    const tidyErrorFilterKeyItem *record = NULL;
    size_t position;
    size_t total;

    assert( iter != NULL );

    position = (size_t)*iter;
    total = TY_(tidyErrorCodeListSize)();

    /* Positions are 1-based; array indexes are 0-based. */
    if ( position >= 1 && position <= total )
    {
        record = tidyErrorFilterKeysStruct + ( position - 1 );
        ++position;
    }

    /* Anything beyond the end collapses to the end marker. */
    if ( position > total )
        position = 0;

    *iter = (TidyIterator)position;
    return record;
}

332
src/language.h

@ -0,0 +1,332 @@
#ifndef language_h
#define language_h
/*
* language.h
* Localization support for HTML Tidy.
* This header provides the public (within libtidy) interface
* to basic localization support. To add your own localization
* create a new `language_xx.h` file and add it to the struct
* in `language.c`.
*
* (c) 2015 HTACG
* See tidy.h and access.h for the copyright notice.
*
* Created by Jim Derry on 11/28/15.
*/
#include "tidyplatform.h"
/** @name Exposed Data Structures */
/** @{ */
/**
* Describes a single record for a localization string.
* - key must correspond with one of Tidy's enums (see `tidyMessageTypes`
* below).
* - pluralForm is the gettext plural-form case that this string is for
* (a case index, not a count of items). Most entries should be case 0,
* which the language headers use for the singular form. See:
* https://www.gnu.org/software/gettext/manual/html_node/Plural-forms.html
* - value is the localized string itself. The language headers end each
* table with an entry whose value is NULL.
*/
typedef struct languageDictionaryEntry {
uint key;
uint pluralForm;
ctmbstr value;
} languageDictionaryEntry;
/**
* For now we'll just use an array to hold all of the dictionary
* entries. In the future we can convert this to a hash structure
* which will make looking up strings faster.
* The array has a fixed capacity of 600 constant entries, which must
* be enough to hold every string of a language plus the terminating
* entry.
*/
typedef languageDictionaryEntry const languageDictionary[600];
/**
* Finally, a complete language definition. The item `whichPluralForm`
* is a function pointer that will provide the correct plural
* form given the value `n`. The actual function is present in
* each language header and is language dependent.
* The item `messages` holds the language's strings; its first entry
* carries the language code (key `TIDY_LANGUAGE`).
*/
typedef struct languageDefinition {
uint (*whichPluralForm)(uint n);
languageDictionary messages;
} languageDefinition;
/**
* The function getNextWindowsLanguage() returns pointers to this type;
* it gives LibTidy implementors the ability to determine how Windows
* locale names are mapped to POSIX language codes.
* - winName is the Windows locale name.
* - POSIXName is the POSIX language code that it maps to.
*/
typedef struct tidyLocaleMapItem {
ctmbstr winName;
ctmbstr POSIXName;
} tidyLocaleMapItem;
/**
* The function getNextErrorCode() returns pointers to this type; it gives
* LibTidy implementors the ability to know what errors can be returned
* via `TidyReportFilter3`.
* Provides the mapping for LibTidy users to map between an opaque key
* and an error message value. See the table of these items in `language.c`
* (NOTE(review): this comment used to call it `tidyErrorFilterKeys[]`, while
* the iterator code there reads `tidyErrorFilterKeysStruct` - confirm the name).
* The `key` string is guaranteed by the API (unless deleted entirely). The
* `value` is suitable for use in looking up Tidy's strings, but its value
* is not guaranteed between releases.
*/
typedef struct tidyErrorFilterKeyItem {
ctmbstr key;
int value;
} tidyErrorFilterKeyItem;
/**
* Defines all of the possible dictionary keys.
* The starting value is arbitrary but must prevent overlaps
* with other enums that are used for retrieving strings. The
* comprehensive list of enums for which we provide strings
* is as follows:
* - `tidyMessageTypes` in this file, start == 4096.
* - `tidyErrorCodes` from `message.h`, start == 200.
* - `accessErrorCodes` from `access.h`, start == CODES_TIDY_ERROR_LAST+1.
* - `tidyMessagesMisc` from `message.h`, start == 2048.
* - `TidyOptionId` from `tidyEnum.h`, start == 0 (important!).
* - `TidyReportLevelKeys` from `tidyEnum.h`, start == 600.
* - ...
* You should never count on the value of a label being
* constant. Accordingly feel free to arrange new enum
* values in the most appropriate grouping below.
*/
typedef enum
{
/* This MUST be present and first. */
TIDY_MESSAGE_TYPE_FIRST = 4096,
/* Specify the code for this language. This is the key of the first
entry of every language's dictionary. */
TIDY_LANGUAGE,
/* Localization test strings. */
TEST_PRESENT_IN_BASE,
TEST_PRESENT_IN_REGION,
/* Strings for the console application. */
TC_CAT_DIAGNOSTICS,
TC_CAT_ENCODING,
TC_CAT_MARKUP,
TC_CAT_MISC,
TC_CAT_PRETTYPRINT,
TC_LABEL_COL,
TC_LABEL_FILE,
TC_LABEL_LANG,
TC_LABEL_LEVL,
TC_LABEL_OPT,
TC_MAIN_ERROR_LOAD_CONFIG,
TC_OPT_ACCESS,
TC_OPT_ASCII,
TC_OPT_ASHTML,
TC_OPT_ASXML,
TC_OPT_BARE,
TC_OPT_BIG5,
TC_OPT_CLEAN,
TC_OPT_CONFIG,
TC_OPT_ERRORS,
TC_OPT_FILE,
TC_OPT_GDOC,
TC_OPT_HELP,
TC_OPT_HELPCFG,
TC_OPT_HELPOPT,
TC_OPT_IBM858,
TC_OPT_INDENT,
TC_OPT_ISO2022,
TC_OPT_LANGUAGE,
TC_OPT_LATIN0,
TC_OPT_LATIN1,
TC_OPT_MAC,
TC_OPT_MODIFY,
TC_OPT_NUMERIC,
TC_OPT_OMIT,
TC_OPT_OUTPUT,
TC_OPT_QUIET,
TC_OPT_RAW,
TC_OPT_SHIFTJIS,
TC_OPT_SHOWCFG,
TC_OPT_UPPER,
TC_OPT_UTF16,
TC_OPT_UTF16BE,
TC_OPT_UTF16LE,
TC_OPT_UTF8,
TC_OPT_VERSION,
TC_OPT_WIN1252,
TC_OPT_WRAP,
TC_OPT_XML,
TC_OPT_XMLCFG,
TC_OPT_XMLSTRG,
TC_OPT_XMLERRS,
TC_OPT_XMLOPTS,
TC_OPT_XMLHELP,
TC_STRING_CONF_HEADER,
TC_STRING_CONF_NAME,
TC_STRING_CONF_TYPE,
TC_STRING_CONF_VALUE,
TC_STRING_CONF_NOTE,
TC_STRING_OPT_NOT_DOCUMENTED,
TC_STRING_OUT_OF_MEMORY,
TC_STRING_FATAL_ERROR,
TC_STRING_FILE_MANIP,
TC_STRING_LANG_MUST_SPECIFY,
TC_STRING_LANG_NOT_FOUND,
TC_STRING_MUST_SPECIFY,
TC_STRING_PROCESS_DIRECTIVES,
TC_STRING_CHAR_ENCODING,
TC_STRING_MISC,
TC_STRING_XML,
TC_STRING_UNKNOWN_OPTION,
TC_STRING_UNKNOWN_OPTION_B,
TC_STRING_VERS_A,
TC_STRING_VERS_B,
TC_TXT_HELP_1,
TC_TXT_HELP_2A,
TC_TXT_HELP_2B,
TC_TXT_HELP_3,
TC_TXT_HELP_CONFIG,
TC_TXT_HELP_CONFIG_NAME,
TC_TXT_HELP_CONFIG_TYPE,
TC_TXT_HELP_CONFIG_ALLW,
TC_TXT_HELP_LANG_1,
TC_TXT_HELP_LANG_2,
TC_TXT_HELP_LANG_3,
/* This MUST be present and last. It is also the key of the terminating
(NULL-valued) entry of every language's dictionary. */
TIDY_MESSAGE_TYPE_LAST
} tidyMessageTypes;
/**
* LibTidy users may want to use `TidyReportFilter3` to enable their own
* localization lookup features. Because Tidy's errors codes are enums the
* specific values can change over time. This function returns a string
* representing the enum value name that can be used as a lookup key
* independent of changing string values (TidyReportFilter2 is vulnerable
* to changing strings). `TidyReportFilter3` will return this general
* string as the error message indicator.
*/
ctmbstr tidyErrorCodeAsString(uint code);
/** @} */
/** @name Localization Related Functions */
/** @{ */
/**
** Determines the current locale without affecting the C locale.
** Tidy has always used the default C locale, and at this point
** in its development we're not going to tamper with that.
** @param result Ignored on input; the implementation in `language.c`
** replaces it with a newly `malloc`ed copy of the locale name.
** @return The newly allocated string, which the caller should `free`,
** or NULL on failure.
*/
tmbstr tidySystemLocale(tmbstr result);
/**
* Tells Tidy to use a different language for output.
* @param languageCode A Windows or POSIX language code, and must match
* a TIDY_LANGUAGE for an installed language.
* @result Indicates that a setting was applied, but not necessarily the
* specific request, i.e., true indicates a language and/or region
* was applied. If es_mx is requested but not installed, and es is
* installed, then es will be selected and this function will return
* true. However the opposite is not true; if es is requested but
* not present, Tidy will not try to select from the es_XX variants.
*/
Bool tidySetLanguage( ctmbstr languageCode );
/**
* Gets the current language used by Tidy.
*/
ctmbstr tidyGetLanguage();
/**
* Provides a string given `messageType` in the current
* localization for `quantity`.
*/
ctmbstr tidyLocalizedStringN( uint messageType, uint quantity );
/**
* Provides a string given `messageType` in the current
* localization for the single case.
*/
ctmbstr tidyLocalizedString( uint messageType );
/** @} */
/** @name Documentation Generation */
/** @{ */
/**
* Provides a string given `messageType` in the default
* localization (which is `en`).
*/
ctmbstr tidyDefaultString( uint messageType );
/*
* Initializes the TidyIterator to point to the first item
* in Tidy's list of localization string keys. Note that
* these are provided for documentation generation purposes
* and probably aren't useful for LibTidy implementors.
*/
TidyIterator getStringKeyList();
/*
* Provides the next key value in Tidy's list of localized
* strings. Note that these are provided for documentation
* generation purposes and probably aren't useful to
* libtidy implementors.
*/
uint getNextStringKey( TidyIterator* iter );
/**
* Initializes the TidyIterator to point to the first item
* in Tidy's structure of Windows<->POSIX locale mappings.
* Items can be retrieved with getNextWindowsLanguage();
*/
TidyIterator getWindowsLanguageList();
/**
* Returns the next record of type `localeMapItem` in
* Tidy's structure of Windows<->POSIX locale mappings.
*/
const tidyLocaleMapItem *getNextWindowsLanguage( TidyIterator* iter );
/**
* Initializes the TidyIterator to point to the first item
* in Tidy's list of installed language codes.
* Items can be retrieved with getNextInstalledLanguage();
*/
TidyIterator getInstalledLanguageList();
/**
* Returns the next installed language.
*/
ctmbstr getNextInstalledLanguage( TidyIterator* iter );
/**
* Initializes the TidyIterator to point to the first item
* in Tidy's list of error codes that can be returned with
* `TidyReportFilter3`.
* Items can be retrieved with getNextErrorCode();
*/
TidyIterator getErrorCodeList();
/**
* Returns the next error code.
*/
const tidyErrorFilterKeyItem *getNextErrorCode( TidyIterator* iter );
/** @} */
#endif /* language_h */

2353
src/language_en.h

File diff suppressed because it is too large

132
src/language_en_gb.h

@ -0,0 +1,132 @@
#ifndef language_en_gb_h
#define language_en_gb_h
/*
* language_en_gb.h
* Localization support for HTML Tidy.
*
*
* This file is a localization file for HTML Tidy. It will have been machine
* generated or created and/or edited by hand. Both are valid options, but
* please help keep our localization efforts simple to maintain by maintaining
* the structure of this file, and changing the check box below if you make
* changes (so others know the file origin):
*
* [X] THIS FILE IS MACHINE GENERATED. It is a localization file for the
* language (and maybe region) "en_gb". The source of
* these strings is a gettext PO file in Tidy's source, probably called
* "language_en_gb.po".
*
* [ ] THIS FILE WAS HAND MODIFIED. Translators, please feel free to edit this file
* directly (and check this box). If you prefer to edit PO files then use
* `poconvert.rb msgunfmt language_en_gb.h` (our own
* conversion tool) to generate a fresh PO from this file first!
*
* (c) 2015 HTACG
* See tidy.h and access.h for the copyright notice.
*
* Template Created by Jim Derry on 01/14/2016.
*
* Originating PO file metadata:
* PO_LAST_TRANSLATOR=jderry
* PO_REVISION_DATE=2016-03-24 10:59:55
*/
#ifdef _MSC_VER
#pragma execution_character_set("utf-8")
#endif
#include "language.h"
#include "access.h"
#include "message.h"
/**
 *  Selects the plural form for `n` items in this language: form 0
 *  when there is exactly one item and form 1 for every other count.
 *  It is used as the `whichPluralForm` member of the language
 *  definition below.
 */
static uint whichPluralForm_en_gb(uint n) {
    /* Plural-Forms: nplurals=2; */
    if ( n == 1 )
        return 0;
    return 1;
}
/**
* This structure specifies all of the strings needed by Tidy for a
* single language. Static definition in a header file makes it
* easy to include and exclude languages without tinkering with
* the build system.
*
* Each entry is { key, pluralForm, value }. Only some keys are
* present in this table; every entry here uses plural form 0.
*/
static languageDefinition language_en_gb = { whichPluralForm_en_gb, {
/***************************************
** This MUST be present and first.
** Specify the code for this language.
***************************************/
{/* Specify the ll or ll_cc language code here. */
TIDY_LANGUAGE, 0, "en_gb"
},
{ TEXT_USING_FONT, 0,
"You are recommended to use CSS to specify the font and\n"
"properties such as its size and colour. This will reduce\n"
"the size of HTML files and make them easier to maintain\n"
"compared with using <FONT> elements.\n\n"
},
{ TEXT_USING_BODY, 0, "You are recommended to use CSS to specify page and link colours\n" },
{ TEXT_GENERAL_INFO_PLEA, 0,
"\n"
"Would you like to see Tidy in proper, British English? Please consider \n"
"helping us to localise HTML Tidy. For details please see \n"
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md\n"
},
/* These keys are only compiled in when accessibility checks are supported. */
#if SUPPORT_ACCESSIBILITY_CHECKS
{ INFORMATION_NOT_CONVEYED_IMAGE, 0, "[2.1.1.1]: ensure information not conveyed through colour alone (image)." },
{ INFORMATION_NOT_CONVEYED_APPLET, 0, "[2.1.1.2]: ensure information not conveyed through colour alone (applet)." },
{ INFORMATION_NOT_CONVEYED_OBJECT, 0, "[2.1.1.3]: ensure information not conveyed through colour alone (object)." },
{ INFORMATION_NOT_CONVEYED_SCRIPT, 0, "[2.1.1.4]: ensure information not conveyed through colour alone (script)." },
{ INFORMATION_NOT_CONVEYED_INPUT, 0, "[2.1.1.5]: ensure information not conveyed through colour alone (input)." },
{ COLOR_CONTRAST_TEXT, 0, "[2.2.1.1]: poor colour contrast (text)." },
{ COLOR_CONTRAST_LINK, 0, "[2.2.1.2]: poor colour contrast (link)." },
{ COLOR_CONTRAST_ACTIVE_LINK, 0, "[2.2.1.3]: poor colour contrast (active link)." },
{ COLOR_CONTRAST_VISITED_LINK, 0, "[2.2.1.4]: poor colour contrast (visited link)." },
#endif /* SUPPORT_ACCESSIBILITY_CHECKS */
{ TidyMergeDivs, 0,
"This option can be used to modify the behaviour of <code>clean</code> when "
"set to <code>yes</code>."
"<br/>"
"This option specifies if Tidy should merge nested <code>&lt;div&gt;</code> "
"such as <code>&lt;div&gt;&lt;div&gt;...&lt;/div&gt;&lt;/div&gt;</code>. "
"<br/>"
"If set to <code>auto</code> the attributes of the inner "
"<code>&lt;div&gt;</code> are moved to the outer one. Nested "
"<code>&lt;div&gt;</code> with <code>id</code> attributes are <em>not</em> "
"merged. "
"<br/>"
"If set to <code>yes</code> the attributes of the inner "
"<code>&lt;div&gt;</code> are discarded with the exception of "
"<code>class</code> and <code>style</code>. "
},
{ TidyMergeSpans, 0,
"This option can be used to modify the behaviour of <code>clean</code> when "
"set to <code>yes</code>."
"<br/>"
"This option specifies if Tidy should merge nested <code>&lt;span&gt;</code> "
"such as <code>&lt;span&gt;&lt;span&gt;...&lt;/span&gt;&lt;/span&gt;</code>. "
"<br/>"
"The algorithm is identical to the one used by <code>merge-divs</code>. "
},
{ TidyReplaceColor, 0,
"This option specifies if Tidy should replace numeric values in colour "
"attributes with HTML/XHTML colour names where defined, e.g. replace "
"<code>#ffffff</code> with <code>white</code>. "
},
{/* This MUST be present and last. */
TIDY_MESSAGE_TYPE_LAST, 0, NULL
}
}};
#endif /* language_en_gb_h */

138
src/language_es.h

@ -0,0 +1,138 @@
#ifndef language_es_h
#define language_es_h
/*
* language_es.h
* Localization support for HTML Tidy.
*
*
* This file is a localization file for HTML Tidy. It will have been machine
* generated or created and/or edited by hand. Both are valid options, but
* please help keep our localization efforts simple to maintain by maintaining
* the structure of this file, and changing the check box below if you make
* changes (so others know the file origin):
*
* [X] THIS FILE IS MACHINE GENERATED. It is a localization file for the
* language (and maybe region) "es". The source of
* these strings is a gettext PO file in Tidy's source, probably called
* "language_es.po".
*
* [ ] THIS FILE WAS HAND MODIFIED. Translators, please feel free to edit this file
* directly (and check this box). If you prefer to edit PO files then use
* `poconvert.rb msgunfmt language_es.h` (our own
* conversion tool) to generate a fresh PO from this file first!
*
* (c) 2015 HTACG
* See tidy.h and access.h for the copyright notice.
*
* Template Created by Jim Derry on 01/14/2016.
*
* Originating PO file metadata:
* PO_LAST_TRANSLATOR=jderry
* PO_REVISION_DATE=2016-03-24 10:59:55
*/
#ifdef _MSC_VER
#pragma execution_character_set("utf-8")
#endif
#include "language.h"
#include "access.h"
#include "message.h"
/**
 *  Selects the plural form for `n` items in this language: form 0
 *  when there is exactly one item and form 1 for every other count.
 *  It is used as the `whichPluralForm` member of the language
 *  definition below.
 */
static uint whichPluralForm_es(uint n) {
    /* Plural-Forms: nplurals=2; */
    if ( n == 1 )
        return 0;
    return 1;
}
/**
* This structure specifies all of the strings needed by Tidy for a
* single language. Static definition in a header file makes it
* easy to include and exclude languages without tinkering with
* the build system.
*
* Each entry is { key, pluralForm, value }. Only some keys are
* present in this table; every entry here uses plural form 0.
*/
static languageDefinition language_es = { whichPluralForm_es, {
/***************************************
** This MUST be present and first.
** Specify the code for this language.
***************************************/
{/* Specify the ll or ll_cc language code here. */
TIDY_LANGUAGE, 0, "es"
},
{ TEXT_GENERAL_INFO_PLEA, 0,
"\n"
"¿Le gustaría ver Tidy en un español correcto? Por favor considere \n"
"ayudarnos a localizar HTML Tidy. Para más detalles consulte \n"
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md \n"
},
{ TidyMakeClean, 0,
"Esta opción especifica si Tidy debe realizar la limpieza de algún legado etiquetas de "
"presentación (actualmente <code>&lt;i&gt;</code>, <code>&lt;b&gt;</code>, <code>&lt;center&gt;</"
"code> cuando encerrados dentro de las etiquetas apropiadas en línea y <code>&lt;font&gt;</"
"code>). Si se establece en <code>yes</code>, entonces etiquetas existentes serán reemplazados "
"con CSS <code>&lt;style&gt;</code> y estructural markup según corresponda. "
},
/* This key is only compiled in when Asian encodings are supported. */
#if SUPPORT_ASIAN_ENCODINGS
{ TidyNCR, 0, "Esta opción especifica si Tidy debe permitir referencias de caracteres numéricos. " },
#endif /* SUPPORT_ASIAN_ENCODINGS */
{ TC_TXT_HELP_LANG_1, 0,
"\n"
"La opción --language (o --lang) indica el lenguaje Tidy debe \n"
"utilizar para comunicar su salida. Tenga en cuenta que esto no es \n"
"un servicio de traducción de documentos, y sólo afecta a los mensajes \n"
"que Tidy comunica a usted. \n"
"\n"
"Cuando se utiliza la línea de comandos el argumento --language debe \n"
"utilizarse antes de cualquier argumento que dan lugar a la producción, \n"
"de lo contrario Tidy producirá la salida antes de que se conozca el \n"
"idioma a utilizar. \n"
"\n"
"Además de los códigos de idioma estándar POSIX, Tidy es capaz de \n"
"entender códigos de idioma legados de Windows. Tenga en cuenta que \n"
"este lista indica los códigos Tidy entiende, y no indica que \n"
"actualmente el idioma está instalado. \n"
"\n"
"La columna más a la derecha indica cómo Tidy comprenderá el \n"
"legado nombre de Windows.\n"
"\n"
"Tidy está utilizando la configuración regional %s. \n"
"\n"
},
{ TC_TXT_HELP_LANG_2, 0,
"\n"
"Los siguientes idiomas están instalados actualmente en Tidy. Tenga \n"
"en cuenta que no hay garantía de que están completos; sólo quiere decir \n"
"que un desarrollador u otro comenzaron a añadir el idioma indicado. \n"
"\n"
"Localizaciones incompletas por defecto se usan \"en\" cuando sea \n"
"necesario. ¡Favor de informar los desarrolladores de estes casos! \n"
"\n"
},
{ TC_TXT_HELP_LANG_3, 0,
"\n"
"Si Tidy es capaz de determinar la configuración regional entonces \n"
"Tidy utilizará el lenguaje de forma automática de la configuración \n"
"regional. Por ejemplo los sistemas de tipo Unix utilizan los variables \n"
"$LANG y/o $LC_ALL. Consulte a su documentación del sistema para \n"
"obtener más información.\n"
"\n"
"Tidy está utilizando la configuración regional %s. \n"
"\n"
},
{/* This MUST be present and last. */
TIDY_MESSAGE_TYPE_LAST, 0, NULL
}
}};
#endif /* language_es_h */

82
src/language_es_mx.h

@ -0,0 +1,82 @@
#ifndef language_es_mx_h
#define language_es_mx_h
/*
* language_es_mx.h
* Localization support for HTML Tidy.
*
*
* This file is a localization file for HTML Tidy. It will have been machine
* generated or created and/or edited by hand. Both are valid options, but
* please help keep our localization efforts simple to maintain by maintaining
* the structure of this file, and changing the check box below if you make
* changes (so others know the file origin):
*
* [X] THIS FILE IS MACHINE GENERATED. It is a localization file for the
* language (and maybe region) "es_mx". The source of
* these strings is a gettext PO file in Tidy's source, probably called
* "language_es_mx.po".
*
* [ ] THIS FILE WAS HAND MODIFIED. Translators, please feel free to edit this file
* directly (and check this box). If you prefer to edit PO files then use
* `poconvert.rb msgunfmt language_es_mx.h` (our own
* conversion tool) to generate a fresh PO from this file first!
*
* (c) 2015 HTACG
* See tidy.h and access.h for the copyright notice.
*
* Template Created by Jim Derry on 01/14/2016.
*
* Originating PO file metadata:
* PO_LAST_TRANSLATOR=jderry
* PO_REVISION_DATE=2016-03-24 10:59:55
*/
#ifdef _MSC_VER
#pragma execution_character_set("utf-8")
#endif
#include "language.h"
#include "access.h"
#include "message.h"
/**
 *  Selects the plural form for `n` items in this language: form 0
 *  when there is exactly one item and form 1 for every other count.
 *  It is used as the `whichPluralForm` member of the language
 *  definition below.
 */
static uint whichPluralForm_es_mx(uint n) {
    /* Plural-Forms: nplurals=2; */
    if ( n == 1 )
        return 0;
    return 1;
}
/**
* This structure specifies all of the strings needed by Tidy for a
* single language. Static definition in a header file makes it
* easy to include and exclude languages without tinkering with
* the build system.
*
* Each entry is { key, pluralForm, value }. Apart from the mandatory
* first and last entries, this table provides a single string.
*/
static languageDefinition language_es_mx = { whichPluralForm_es_mx, {
/***************************************
** This MUST be present and first.
** Specify the code for this language.
***************************************/
{/* Specify the ll or ll_cc language code here. */
TIDY_LANGUAGE, 0, "es_mx"
},
{ TEXT_GENERAL_INFO_PLEA, 0,
"\n"
"¿Le gustaría ver Tidy en adecuada, español mexicano? Por favor considere \n"
"ayudarnos a localizar HTML Tidy. Para más detalles consulte \n"
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md \n"
},
{/* This MUST be present and last. */
TIDY_MESSAGE_TYPE_LAST, 0, NULL
}
}};
#endif /* language_es_mx_h */

573
src/language_fr.h

@ -0,0 +1,573 @@
#ifndef language_fr_h
#define language_fr_h
/*
* language_fr.h
* Localization support for HTML Tidy.
*
*
* This file is a localization file for HTML Tidy. It will have been machine
* generated or created and/or edited by hand. Both are valid options, but
* please help keep our localization efforts simple to maintain by maintaining
* the structure of this file, and changing the check box below if you make
* changes (so others know the file origin):
*
* [X] THIS FILE IS MACHINE GENERATED. It is a localization file for the
* language (and maybe region) "fr". The source of
* these strings is a gettext PO file in Tidy's source, probably called
* "language_fr.po".
*
* [ ] THIS FILE WAS HAND MODIFIED. Translators, please feel free to edit this file
* directly (and check this box). If you prefer to edit PO files then use
* `poconvert.rb msgunfmt language_fr.h` (our own
* conversion tool) to generate a fresh PO from this file first!
*
* (c) 2015 HTACG
* See tidy.h and access.h for the copyright notice.
*
* Template Created by Jim Derry on 01/14/2016.
*
* Originating PO file metadata:
* PO_LAST_TRANSLATOR=
* PO_REVISION_DATE=
*/
#ifdef _MSC_VER
#pragma execution_character_set("utf-8")
#endif
#include "language.h"
#include "access.h"
#include "message.h"
/**
 *  Selects the plural form for `n` items in this language: form 1
 *  when there is more than one item and form 0 otherwise (so zero
 *  items and one item both use form 0). It is used as the
 *  `whichPluralForm` member of the language definition below.
 */
static uint whichPluralForm_fr(uint n) {
    /* Plural-Forms: nplurals=2; */
    if ( n > 1 )
        return 1;
    return 0;
}
/**
* This structure specifies all of the strings needed by Tidy for a
* single language. Static definition in a header file makes it
* easy to include and exclude languages without tinkering with
* the build system.
*/
static languageDefinition language_fr = { whichPluralForm_fr, {
/***************************************
** This MUST be present and first.
** Specify the code for this language.
***************************************/
{/* Specify the ll or ll_cc language code here. */
TIDY_LANGUAGE, 0, "fr"
},
{ ACCESS_URL, 0, "http://www.w3.org/WAI/GL" },
{ ATRC_ACCESS_URL, 0, "http://www.html-tidy.org/Accessibility/" },
{ FILE_CANT_OPEN, 0, "Impossible d'ouvrir « %s »\n" },
{ LINE_COLUMN_STRING, 0, "Ligne: %d Col: %d - " },
{ STRING_CONTENT_LOOKS, 0, "Le contenu du document ressemble à %s" },
{ STRING_DISCARDING, 0, "dépose" },
{ STRING_DOCTYPE_GIVEN, 0, "DOCTYPE donné est «%s»" },
{ STRING_ERROR_COUNT, 0, "%u %s, %u %s trouvées!" },
{ STRING_ERROR_COUNT_ERROR, 0, "erreur" },
{ STRING_ERROR_COUNT_ERROR, 1, "erreurs" },
{ STRING_ERROR_COUNT_WARNING, 0, "alarme" },
{ STRING_ERROR_COUNT_WARNING, 1, "alarmes" },
{ STRING_HELLO_ACCESS, 0, "Contrôles d'accessibilité: version 0.1" },
{ STRING_HTML_PROPRIETARY, 0, "HTML Proprietary" },
{ STRING_MISSING_MALFORMED, 0, "argument manquant ou incorrect pour l'option: %s" },
{ STRING_NO_ERRORS, 0, "Aucun avertissement ou les erreurs ne trouvées." },
{ STRING_NO_SYSID, 0, "Aucun identificateur de système dans le doctype émis" },
{ STRING_NOT_ALL_SHOWN, 0, "Pas tous les avertissements/erreurs ont été présentés." },
{ STRING_PLAIN_TEXT, 0, "le texte brut" },
{ STRING_REPLACING, 0, "remplaçant" },
{ STRING_SPECIFIED, 0, "spécifié" },
{ STRING_UNKNOWN_FILE, 0, "%s: Impossible d'ouvrir le fichier \"%s\"\n" },
{ STRING_UNKNOWN_OPTION, 0, "option inconnue: %s" },
{ STRING_UNRECZD_OPTION, 0, "option non reconnue -%c utiliser -help pour lister les options\n" },
{ STRING_XML_DECLARATION, 0, "déclaration XML" },
{ TEXT_HTML_T_ALGORITHM, 0,
"\n"
" - D'abord, cherchez à gauche de la position de la cellule de trouver \n"
" des cellules d'en-tête de ligne.\n"
"       - Puis rechercher vers le haut pour trouver les cellules d'en-tête \n"
" de colonne.\n"
"       - La recherche dans une direction donnée arrête lorsque le bord \n"
" de la table est atteinte ou lorsque la cellule de données est \n"
" trouvé après une cellule d'en-tête.\n"
"       - Têtes de ligne sont insérés dans la liste dans l'ordre où ils \n"
" apparaissent dans la table.\n"
"       - Pour les tables de gauche à droite, en-têtes sont insérés de \n"
" gauche à droite.\n"
"       - Têtes de colonnes sont insérés après-têtes de ligne, dans\n"
"         l'ordre où ils apparaissent dans le tableau, de haut en bas.\n"
"       - Si une cellule d'en-tête a les têtes ensemble d'attributs, puis \n"
" les en-têtes référencée par cet attribut sont insérés dans la \n"
"         liste et le recherche arrête pour la direction du courant.\n"
"         TD cellules qui fixent l'attribut de l'axe sont également \n"
" traités comme des cellules d'en-tête.\n"
"\n"
},
{ TEXT_WINDOWS_CHARS, 0,
"Personnages codes pour les polices Microsoft Windows dans la gamme\n"
"128-159 ne pas être reconnus sur d'autres plateformes. Vous êtes\n"
"au lieu recommandé d'utiliser les entités nommées, par exemple &trade; \n"
"plutôt code que Windows de caractères 153 (0x2122 en Unicode). Notez que\n"
"à partir de Février 1998 quelques navigateurs supportent les nouvelles \n"
"entités.\n"
"\n"
},
/* Explanatory text with a single %s placeholder (the encoding name).
   The stray "de $" left in the sentence was placeholder residue and
   has been removed; the placeholder count is unchanged. */
{ TEXT_VENDOR_CHARS, 0,
"Il est peu probable que fournisseur spécifique, encodages qui dépendent du système\n"
"travailler assez largement sur le World Wide Web; vous devriez éviter d'utiliser le "
"%s codage de caractères, à la place il est recommandé \n"
"de utiliser entités nommées, par exemple &trade;.\n"
},
{ TEXT_SGML_CHARS, 0,
"Les codes de caractères 128 à 159 (U + 0080 à U + 009F) ne sont pas autorisés \n"
"en HTML; même si elles l'étaient, ils seraient probablement les \n"
"caractères non imprimables de contrôle.\n"
"Tidy supposé que vous vouliez faire référence à un personnage avec la même valeur "
"d'octet\n"
"l'encodage %s et remplacé cette référence avec l'équivalent Unicode.\n"
"\n"
},
{ TEXT_INVALID_UTF8, 0,
"Les codes de caractères UTF-8 doivent être dans la gamme: U + 0000 à U + 10FFFF.\n"
"La définition de l'UTF-8 à l'annexe D de la norme ISO / CEI 10646-1: 2000 a "
"également\n"
"permet l'utilisation de séquences de cinq et six octets pour coder\n"
"des personnages qui sont en dehors de la gamme de l'ensemble de caractères Unicode;\n"
"ces séquences de cinq et six octets sont illégales pour l'utilisation de\n"
"UTF-8 comme une transformation de caractères Unicode. ISO / IEC 10646\n"
"ne permet pas la cartographie des substituts non appariés, ni U + FFFE et U + FFFF\n"
"(mais il ne permet d'autres non-caractères). Pour plus d'informations s'il vous "
"plaît se référer à\n"
"http://www.unicode.org/ et http://www.cl.cam.ac.uk/~mgk25/unicode.html\n"
"\n"
},
{ TEXT_INVALID_UTF16, 0,
"Codes de caractères pour UTF-16 doit être dans la gamme: U + 0000 à U + 10FFFF.\n"
"La définition de UTF-16 dans l'annexe C de l'ISO/CEI 10646-1: 2000 n'autorise pas "
"le\n"
"mappage des substituts non appariés. Pour plus d'informations, veuillez vous "
"référer\n"
"à http://www.unicode.org/ et http://www.cl.cam.ac.uk/~mgk25/unicode.html\n"
"\n"
},
{ TEXT_INVALID_URI, 0,
"URI doit être correctement protégés, ils ne doivent pas contenir unescaped\n"
"caractères ci-dessous U + 0021, y compris le caractère d'espace et non\n"
"ci-dessus U + 007E. Tidy échappe à l'URI pour vous comme recommandé par\n"
"HTML 4.01 section B.2.1 et XML 1.0 section 4.2.2. Certains agents utilisateurs\n"
"utiliser un autre algorithme pour échapper à ces URI et un serveur-verso\n"
"scripts dépendent de cela. Si vous voulez compter sur cela, vous devez\n"
"échapper à l'URI sur votre propre. Pour plus d'informations s'il vous plaît se "
"référer à\n"
"http://www.w3.org/International/O-URL-and-ident.html\n"
"\n"
},
{ TEXT_BAD_FORM, 0,
"Vous devrez peut-être déplacer un ou deux de la<form>et</form>\n"
"tags. Éléments HTML doivent être correctement imbriquées et les éléments\n"
"de formulaire ne font pas exception. Par exemple, vous ne devez pas placer la\n"
"<form>dans une cellule et la </form>dans un autre. Si le <form>est placé\n"
"devant une table, le </form>ne peut pas être placé à l'intérieur de la table !\n"
"Notez qu'une forme ne peut pas être imbriquée dans un autre !\n"
"\n"
},
{ TEXT_BAD_MAIN, 0,
"Qu'un seul <main> élément est autorisé dans un document.\n"
"Les <main>éléments ont été jetées, qui peut invalider le document\n"
"\n"
},
{ TEXT_M_SUMMARY, 0,
"L'attribut summary table devrait servir à décrire la structure\n"
"de la table. Il est très utile pour les personnes utilisant des\n"
"navigateurs non visuels. Les attributs de portée et en-têtes\n"
"pour les cellules d'un tableau servent utiles pour spécifier les\n"
"en-têtes s'appliquent à chaque cellule du tableau, permettant\n"
"aux navigateurs non visuels fournir un contexte pour chaque cellule.\n"
"\n"
},
{ TEXT_M_IMAGE_ALT, 0,
"L'attribut alt devrait servir à donner une brève description d'une\n"
"image ; Il faudrait aussi des descriptions plus longues avec l'attribut\n"
"longdesc qui prend une URL liée à la description. Ces mesures sont\n"
"nécessaires pour les personnes utilisant des navigateurs textuels.\n"
"\n"
},
{ TEXT_M_IMAGE_MAP, 0,
"Utilisation côté client images interactives préférence cartes-images\n"
"côté serveur comme celui-ci est inaccessibles aux personnes utilisant\n"
"des navigateurs non graphiques. En outre, les cartes côté client sont\n"
"plus faciles à mettre en place et fournir une rétroaction immédiate\n"
"aux utilisateurs.\n"
"\n"
},
{ TEXT_M_LINK_ALT, 0,
"Liens hypertextes définie à l'aide d'une hyperimage côté client, vous\n"
"devez utiliser l'attribut alt pour fournir une description textuelle de la\n"
"liaison pour les personnes utilisant des navigateurs textuels.\n"
"\n"
},
{ TEXT_USING_FRAMES, 0,
"Pages conçues à l'aide de cadres pose des problèmes pour\n"
"les personnes qui sont aveugles ou utilisez un navigateur qui\n"
"ne supporte pas les frames. Une page de base de cadres doit\n"
"toujours inclure une disposition alternative à l'intérieur d'un\n"
"élément NOFRAMES.\n"
"\n"
},
{ TEXT_ACCESS_ADVICE1, 0,
"Pour plus d'informations sur la façon de rendre vos pages\n"
"accessibles, voir http://www.w3.org/WAI/GL"
},
{ TEXT_ACCESS_ADVICE2, 0, "et http://www.html-tidy.org/Accessibility/" },
{ TEXT_USING_LAYER, 0,
"Les Cascading Style Sheets (CSS) mécanisme de positionnement\n"
"Il est recommandé de préférence à la propriétaire <LAYER>\n"
"élément grâce à l'appui du fournisseur limitée pour la LAYER.\n"
"\n"
},
{ TEXT_USING_SPACER, 0,
"Il est recommandé d'utiliser les CSS pour contrôler blanc\n"
"espace (par exemple pour retrait, les marges et interlignes).\n"
"Le <SPACER> élément propriétaire a le soutien des fournisseurs limité.\n"
"\n"
},
{ TEXT_USING_FONT, 0,
"Il est recommandé d'utiliser les CSS pour spécifier la police et\n"
"propriétés telles que sa taille et sa couleur. Cela permettra de réduire\n"
"la taille des fichiers HTML et de les rendre plus faciles à entretenir\n"
"rapport à l'utilisation <FONT> éléments.\n"
"\n"
},
/* Advice text for TEXT_USING_NOBR. Tag names are literal markup:
   the end tag is written </NOBR>, without a space after the slash. */
{ TEXT_USING_NOBR, 0,
"Il est recommandé d'utiliser les CSS pour contrôler les sauts de ligne.\n"
"Utilisez \"white-space: nowrap\" pour inhiber emballage en place\n"
"d'insertion <NOBR> ... </NOBR> dans le balisage.\n"
"\n"
},
{ TEXT_USING_BODY, 0,
"Il est recommandé d'utiliser les CSS pour spécifier la page et de liaison des "
"couleurs\n"
},
{ TEXT_NEEDS_INTERVENTION, 0,
"Ce document contient des erreurs qui doivent être résolus avant\n"
"utilisant HTML Tidy pour générer une version rangé.\n"
"\n"
},
{ TEXT_GENERAL_INFO, 0,
"A propos de HTML Tidy: https://github.com/htacg/tidy-html5\n"
"Les rapports de bugs et commentaires: https://github.com/htacg/tidy-html5/issues\n"
"Liste officielle de diffusion: https://lists.w3.org/Archives/Public/public-htacg/\n"
"Spécification HTML dernière: http://dev.w3.org/html5/spec-author-view/\n"
"Validez vos documents HTML: http://validator.w3.org/nu/\n"
"Hall de votre entreprise à rejoindre le W3C: http://www.w3.org/Consortium\n"
"\n"
},
{ TEXT_GENERAL_INFO_PLEA, 0,
"\n"
"Parlez-vous une langue autre que l'anglais ou une autre variante de\n"
"Anglais? Considérez-nous aidant à localiser HTML Tidy. Pour plus de détails s'il "
"vous plaît voir\n"
"https://github.com/htacg/tidy-html5/blob/master/README/LOCALIZE.md\n"
},
{ TidyInfoString, 0, "Info:" },
{ TidyWarningString, 0, "Attention:" },
{ TidyConfigString, 0, "Config:" },
{ TidyAccessString, 0, "Accès:" },
{ TidyErrorString, 0, "Erreur:" },
{ TidyBadDocumentString, 0, "Document:" },
{ TidyFatalString, 0, "Panique:" },
{ ENCODING_MISMATCH, 0, "codage d'entrée spécifiée (%s) ne correspond pas réelle encodage d'entrée (%s)" },
{ VENDOR_SPECIFIC_CHARS, 0, "%s de code de caractère invalide l'%s" },
{ INVALID_SGML_CHARS, 0, "%s de code de caractère invalide l'%s" },
{ INVALID_UTF8, 0, "%s invalides octets UTF-8 de (char. codes %s)" },
{ INVALID_UTF16, 0, "paire de substitution non valide UTF-16 (code de caract. %s) %s" },
{ INVALID_NCR, 0, "Référence de caractère numérique non valide de %s %s" },
{ MISSING_SEMICOLON, 0, "entité « %s » ne s'arrête pas à «; »" },
{ MISSING_SEMICOLON_NCR, 0, "Référence de caractère numérique « %s » n'est pas se terminer par «; »" },
{ UNESCAPED_AMPERSAND, 0, "sans séquence d'échappement & qui devrait être écrit comme &amp;" },
{ UNKNOWN_ENTITY, 0, "sans séquence d'échappement & ou entité inconnue « %s »" },
/* The message has to name the entity itself (&apos;); a bare
   apostrophe does not tell the user which entity is meant. */
{ APOS_UNDEFINED, 0, "nommée l'entité &apos; seulement défini en XML/XHTML" },
{ INSERTING_ATTRIBUTE, 0, "%s insérer l'attribut « %s »" },
{ INSERTING_AUTO_ATTRIBUTE, 0, "%s insérer l'attribut « %s », à l'aide de la valeur « %s »" },
/* Attribute diagnostics. Each key reports a different condition, so each
   needs its own wording; previously four of them shared one "invalid
   value" sentence. Placeholder order follows the neighbouring attribute
   messages in this table: first %s, then the quoted %s.
   NOTE(review): presumably element first, attribute name second --
   confirm against the English strings. */
{ MISSING_ATTR_VALUE, 0, "%s attribut \"%s\" n'a pas de valeur" },
{ UNKNOWN_ATTRIBUTE, 0, "%s attribut inconnu \"%s\"" },
{ PROPRIETARY_ATTRIBUTE, 0, "%s attribut propriétaire \"%s\"" },
{ JOINING_ATTRIBUTE, 0, "%s rejoignant les valeurs d'attribut répétée « %s »" },
{ XML_ATTRIBUTE_VALUE, 0, "%s a l'attribut XML \"%s\"" },
{ XML_ID_SYNTAX, 0, "ID de %s « %s » utilise la syntaxe XML ID" },
{ ATTR_VALUE_NOT_LCASE, 0, "valeur d'attribut de %s « %s » doit être en minuscules pour XHTML" },
{ PROPRIETARY_ATTR_VALUE, 0, "valeur d'attribut propriétaire de %s « %s »" },
{ ANCHOR_NOT_UNIQUE, 0, "%s anchor \"%s\" déjà défini" },
{ BAD_ATTRIBUTE_VALUE, 0, "L'attribut %s \"%s\" a une valeur non valide \"%s\"" },
{ BAD_ATTRIBUTE_VALUE_REPLACED, 0, "%s attribut « %s » a une valeur non valide « %s » et a été remplacé" },
{ INVALID_ATTRIBUTE, 0, "nom d'attribut de %s « %s » (valeur = « %s ») n'est pas valide" },
{ REPEATED_ATTRIBUTE, 0, "%s laissant tomber la valeur « %s » pour l'attribut répétée « %s »" },
{ INVALID_XML_ID, 0, "%s ne peut pas copier le nom attribut id" },
{ UNEXPECTED_GT, 0, "manquant '>' pour tag: %s" },
{ UNEXPECTED_QUOTEMARK, 0, "%s inattendue ou double quote mark" },
{ MISSING_QUOTEMARK, 0, "%s attribut manquant apostrophe droite" },
{ UNEXPECTED_END_OF_FILE_ATTR, 0, "%s fin de fichier lors de l'analyse d'attributs" },
{ ID_NAME_MISMATCH, 0, "%s id et le nom valeur d'attribut mismatch" },
{ BACKSLASH_IN_URI, 0, "référence URI %s contient des anti-slash. Faute de frappe ?" },
{ FIXED_BACKSLASH, 0, "%s conversion de barre oblique inverse d'URI de slash" },
{ ILLEGAL_URI_REFERENCE, 0, "%s mal échappé référence URI" },
{ ESCAPED_ILLEGAL_URI, 0, "%s échapper malformé référence URI" },
{ NEWLINE_IN_URI, 0, "rejeter la nouvelle ligne de %s en référence URI" },
{ WHITE_IN_URI, 0, "jeter le espaces de %s en référence URI" },
{ UNEXPECTED_EQUALSIGN, 0, "%s unexpected '=', nom d'attribut attendu" },
{ MISSING_IMAGEMAP, 0, "%s doivent utiliser côté client image map" },
{ MISSING_ATTRIBUTE, 0, "%s manque attribut \"%s\"" },
{ NESTED_EMPHASIS, 0, "accent imbriquée %s" },
{ NESTED_QUOTATION, 0, "imbriqué \"q\" éléments, typo possible" },
{ OBSOLETE_ELEMENT, 0, "remplaçant élément obsolète %s avec %s" },
{ COERCE_TO_ENDTAG_WARN, 0, "<%s> est probablement destinée en tant que </%s>" },
{ REMOVED_HTML5, 0, "L'élément de %s retiré HTML5" },
{ BAD_SUMMARY_HTML5, 0, "L'attribut summary sur l'élément du %s est obsolète dans HTML5" },
{ TRIM_EMPTY_ELEMENT, 0, "rognage vide %s" },
{ REPLACING_ELEMENT, 0, "remplaçant %s avec %s" },
{ COERCE_TO_ENDTAG, 0, "<%s> est probablement destinée en tant que </%s>" },
{ REPLACING_UNEX_ELEMENT, 0, "remplacement inattendu %s avec %s" },
{ MISSING_ENDTAG_FOR, 0, "manquant </%s>" },
{ MISSING_ENDTAG_BEFORE, 0, "manquante </%s> avant %s" },
{ DISCARDING_UNEXPECTED, 0, "rejet inattendu %s" },
{ NON_MATCHING_ENDTAG, 0, "remplacement inattendu %s avec </%s>" },
{ TAG_NOT_ALLOWED_IN, 0, "%s n'est pas autorisé dans <%s> éléments" },
{ MISSING_STARTTAG, 0, "manquant <%s>" },
{ UNEXPECTED_ENDTAG, 0, "rejet inattendu </%s>" },
{ TOO_MANY_ELEMENTS, 0, "trop de %s éléments" },
{ USING_BR_INPLACE_OF, 0, "utilisant <br> à la place de %s" },
{ INSERTING_TAG, 0, "insertion implicite <%s>" },
{ CANT_BE_NESTED, 0, "%s ne peut pas être imbriquée" },
{ PROPRIETARY_ELEMENT, 0, "%s n'est pas approuvé par le W3C" },
{ ILLEGAL_NESTING, 0, "%s ne doivent pas être imbriqués" },
{ NOFRAMES_CONTENT, 0, "%s non à l'intérieur 'noframes'" },
{ UNEXPECTED_END_OF_FILE, 0, "fin inattendue du fichier %s" },
{ ELEMENT_NOT_EMPTY, 0, "%s élément non vide ou pas fermée" },
{ UNEXPECTED_ENDTAG_IN, 0, "inattendus </%s> dans <%s>" },
{ TOO_MANY_ELEMENTS_IN, 0, "trop de %s éléments dans <%s>" },
{ UNESCAPED_ELEMENT, 0, "unescaped %s dans le contenu pre" },
{ DOCTYPE_AFTER_TAGS, 0, "<! DOCTYPE> est pas autorisé après éléments" },
{ MISSING_TITLE_ELEMENT, 0, "insertion manquante élément 'title'" },
{ INCONSISTENT_VERSION, 0, "DOCTYPE HTML ne correspond pas à un contenu" },
{ MISSING_DOCTYPE, 0, "manquante <!DOCTYPE> déclaration" },
{ CONTENT_AFTER_BODY, 0, "contenu se produit après la fin du body" },
{ MALFORMED_COMMENT, 0, "tirets adjacents dans un commentaire" },
{ BAD_COMMENT_CHARS, 0, "attendre -- ou >" },
{ BAD_CDATA_CONTENT, 0, "'<' + '/' + lettre non permis ici" },
{ INCONSISTENT_NAMESPACE, 0, "le namespace HTML ne correspond pas au contenu" },
{ SPACE_PRECEDING_XMLDECL, 0, "supprimant l'espace blanc précédent Déclaration XML" },
{ MALFORMED_DOCTYPE, 0, "en rejetant malformé <!DOCTYPE>" },
{ BAD_XML_COMMENT, 0, "commentaires XML ne peut pas contenir --" },
{ DTYPE_NOT_UPPER_CASE, 0, "SYSTEM, PUBLIC, W3C, DTD, EN doit être en majuscules" },
{ ENCODING_IO_CONFLICT, 0, "encodage de sortie ne fonctionne pas avec la sortie standard" },
{ SUSPECTED_MISSING_QUOTE, 0, "manquant guillemet pour la valeur d'attribut" },
{ DUPLICATE_FRAMESET, 0, "élément répété FRAMESET" },
{ UNKNOWN_ELEMENT, 0, "%s n'est pas reconnue !" },
{ PREVIOUS_LOCATION, 0, "<%s> précédemment mentionnés" },
/* Description of the TidyXmlDecl option. The text is HTML, so the XML
   declaration example must be written with intact &lt; / &gt; entities. */
{ TidyXmlDecl, 0,
"Cette option spécifie si Tidy devrait ajouter la déclaration XML lors de la sortie "
"XML ou XHTML. <br/> Notez que si l'entrée comprend déjà un <code>&lt;?xml ... ?&gt;"
"</code> déclaration alors cette option sera ignorée. <br/> Si l'encodage pour la "
"sortie est différente de <var>ascii</var>, l'un des l'encodage <var>utf*</var> ou "
"<var>raw</var>, la déclaration est toujours ajouté que requis par la norme XML."
},
/* Description of the TidyXmlSpace option. The attribute xml:space="preserve"
   and the element names are literal markup and must not be translated;
   tags shown as examples are escaped so they are not read as real elements. */
{ TidyXmlSpace, 0,
"Cette option spécifie si tidy doit ajouter <code>xml:space=\"preserve\"</code> "
"pour des éléments tels que <code>&lt;pre&gt;</code>, <code>&lt;style&gt;</code> et "
"<code>&lt;script&gt;</code> lors de la génération de XML. <br />Il est nécessaire si "
"l'espace blanc dans ces éléments doit être analysée de manière appropriée sans avoir "
"accès à la DTD."
},
{ TidyAltText, 0,
"Cette option spécifie la valeur par défaut <code>alt=</code> utilise le texte Tidy "
"pour <code>&lt;img&gt;</code> attributs lorsque le <code>alt=</code> attribut est "
"absent. <br/> Utiliser avec précaution, car il est de votre responsabilité de rendre "
"vos documents accessibles aux personnes qui ne peuvent pas voir les images."
},
{ TidyXmlPIs, 0,
"Cette option spécifie si Tidy doit modifier l'analyse syntaxique des instructions de "
"traitement pour exiger <code>?&gt;</code> comme terminateur plutôt que <code>&gt;</"
"code>. <br/> Cette option est automatiquement activée si l'entrée est en XML."
},
{ TidyMakeBare, 0,
"Cette option spécifie si Tidy doit dépouiller Microsoft HTML spécifique à partir de "
"Word 2000 documents, et des espaces de sortie plutôt que des espaces insécables où "
"ils existent dans l'entrée."
},
{ TidyCSSPrefix, 0,
"Cette option spécifie le préfixe que Tidy utilise des règles de styles. <br/> Par "
"défaut, <var>c</var> sera utilisé."
},
/* Description of the TidyMakeClean option. Example tags are written
   with intact entities and their real element names (i, b, center, font). */
{ TidyMakeClean, 0,
"Cette option spécifie si Tidy doit effectuer le nettoyage de certains anciens tags "
"de présentation (actuellement de <code>&lt;i&gt;</code>, <code>&lt;b&gt;</code>, "
"<code>&lt;center&gt;</code> lorsque placé entre les balises inline appropriées, et "
"<code>&lt;font&gt;</code>). Si <var>yes</var> puis balises existantes seront "
"remplacées par CSS le <code>&lt;style&gt;</code> balises et le balisage structurel "
"selon le cas."
},
{ TidyGDocClean, 0,
"Cette option spécifie si Tidy doit permettre un comportement spécifique pour le "
"nettoyage HTML exporté à partir de Google Docs."
},
/* Description of the TidyDoctype option. Three corrections versus the
   previous text: the "omit" sentence now carries its negation (the output
   will NOT contain a DOCTYPE), the html5 example reads <!DOCTYPE html>
   without a stray space, and the FPI example is a well-formed public
   identifier instead of a locale-mangled one. */
{ TidyDoctype, 0,
"Cette option spécifie la déclaration DOCTYPE générée par Tidy. <br/> Si <var>omit</"
"var> la sortie ne contiendra pas de déclaration DOCTYPE. Notez que ce cela implique "
"aussi <code>numeric-entities</code> est réglé sur <var>yes</var>. <br/> Si "
"<code>html5</code> le DOCTYPE est réglé sur <code>&lt;!DOCTYPE html&gt;</code>. <br/"
"> Si <var>auto</var> (par défaut) Tidy utilisera une supposition basée sur le "
"contenu du document. <br/> Si elle est définie <var>strict</var>, Tidy établira le "
"DOCTYPE du HTML4 ou XHTML 1 DTD stricte. <br/> Si <var>loose</var>, le DOCTYPE est "
"réglé sur le HTML4 ou XHTML1 loose (transitional) DTD. <br/> Alternativement, vous "
"pouvez fournir une chaîne pour l'identificateur public formel (FPI).<br/> Par "
"exemple: <br/> <code>doctype: \"-//ACME//DTD HTML 3.14159//EN\"</code> <br/> "
"Si vous spécifiez le FPI pour un document XHTML, Tidy établira l'identifiant du "
"système à une chaîne vide. Pour un document HTML, Tidy ajoute un identificateur de "
"système que si l'on était déjà présent dans le but de préserver le mode de certains "
"navigateurs de traitement. Tidy quitte le DOCTYPE pour les documents XML génériques "
"inchangés. <br/> Cette option ne permet pas une validation du document de conformité."
},
{ TidyDropEmptyElems, 0, "Cette option spécifie si Tidy doit jeter des éléments vides." },
{ TidyDropEmptyParas, 0, "Cette option spécifie si Tidy doit jeter des paragraphes vides." },
{ TidyFixUri, 0,
"Cette option spécifie si Tidy doit vérifier les valeurs d'attributs qui portent URI "
"pour des caractères illégaux et si ce sont trouvés, leur échapper en HTML 4 "
"recommande."
},
{ TidyPPrintTabs, 0,
"Cette option spécifie si tidy doit Indenter avec tabulation au lieu des espaces, en "
"supposant <code>indent</code> est <var>yes</var>. <br/>Définir sur <var>yes</var> "
"pour indenter avec des tabulations au lieu de la valeur par défaut des espaces. <br /"
">Utilisez l'option <code>indent-spaces</code> pour contrôler le nombre d'onglets "
"Sortie par niveau de tiret. Notez que lorsque <code>indent-with-tabs</code> est "
"activée. La valeur par défaut de <code>indent-spaces</code> est réinitialisé à "
"<var>1</var>. <br/>Remarque <code>tab-size</code> contrôle la conversion des espaces "
"d'entrée. Définissez-le à zéro pour conserver onglets de saisie."
},
{ TidySkipNested, 0,
"Cette option spécifie que Tidy doit ignorer les balises imbriquées lors de l'analyse "
"des données de script et de style."
},
{ TC_CAT_DIAGNOSTICS, 0, "diagnostics" },
{ TC_CAT_ENCODING, 0, "encoding" },
{ TC_CAT_MARKUP, 0, "markup" },
{ TC_CAT_MISC, 0, "misc" },
{ TC_CAT_PRETTYPRINT, 0, "imprimer" },
{ TC_LABEL_COL, 0, "colonne" },
{ TC_LABEL_FILE, 0, "fichier" },
{ TC_LABEL_LANG, 0, "lang" },
{ TC_LABEL_LEVL, 0, "niveau" },
{ TC_LABEL_OPT, 0, "option" },
{ TC_MAIN_ERROR_LOAD_CONFIG, 0, "Chargement du fichier de configuration \"%s\" a échoué, err =%d" },
{ TC_OPT_ACCESS, 0,
"faire des vérifications d'accessibilité supplémentaires (<niveau> = 0, 1, 2, 3). 0 "
"est supposé si <niveau> est manquant."
},
{ TC_OPT_ASCII, 0, "utiliser ISO-8859-1 pour l'entrée, US-ASCII pour la sortie" },
{ TC_OPT_UPPER, 0, "balises de force en majuscules" },
/* Command-line help footer. Anything the user is expected to type stays
   literal: the "--option <value>" example is quoted with matching ASCII
   quotes, and the sample command uses the program name "tidy". */
{ TC_TXT_HELP_3, 0,
"\n"
"Options de configuration Tidy\n"
"==========================\n"
"Utilisez les options de configuration de Tidy comme arguments de ligne de commande "
"sous la forme de \"--option <value>\", par exemple, \"--indent-with-tabs yes\"\n"
"\n"
"Pour une liste de toutes les options de configuration, utiliser \"-help-config\"\n"
" ou consultez à la man page (si votre OS en a un).\n"
"\n"
"Si votre environnement a un ensemble de variables à un point de Tidy \n"
"$HTML_TIDY fichier de configuration puis Tidy va tenter de l'utiliser.\n"
"\n"
"Sur certaines plateformes Tidy tentera également d'utiliser une configuration "
"spécifiée dans /etc/tidy.conf ou ~/.tidy.conf.\n"
"\n"
"Autre\n"
"=====\n"
"Entrée/sortie par défaut utiliser stdin/stdout respectivement.\n"
"\n"
"Options de simple lettre en dehors de -f peuvent être combinés\n"
"comme dans: tidy -f errs.txt -imu foo.html\n"
"\n"
"renseignements\n"
"===========\n"
"Pour plus d'informations à propos de HTML Tidy, voir\n"
"  http://www.html-tidy.org/\n"
"\n"
"Pour plus d'informations sur le langage HTML, consultez les rubriques suivantes:\n"
"\n"
"  HTML: Edition pour les auteurs Web (de la dernière spécification de HTML)\n"
"  http://dev.w3.org/html5/spec-author-view\n"
"\n"
},
/* Header for the configuration help listing. The two examples show the
   same settings in file form and in command-line form, so the option
   names and values must be the literal ones (wrap, indent, no) and must
   agree between the two forms. */
{ TC_TXT_HELP_CONFIG, 0,
"\n"
"HTML Tidy paramètres de configuration\n"
"\n"
"Dans un fichier, utilisez le formulaire:\n"
"\n"
"wrap: 72\n"
"indent: no\n"
"\n"
"Quand il est spécifié sur la ligne de commande, utilisez le formulaire:\n"
"\n"
"--wrap 72 --indent no\n"
"\n"
},
{ TC_TXT_HELP_CONFIG_NAME, 0, "Nom" },
{ TC_TXT_HELP_CONFIG_TYPE, 0, "Type" },
{ TC_TXT_HELP_CONFIG_ALLW, 0, "Les valeurs autorisées" },
{ TC_TXT_HELP_LANG_1, 0,
"\n"
"L'option --language (ou --lang) indique la langue Tidy\n"
"doit utiliser pour communiquer sa sortie. S'il vous plaît noter que ce ne sont pas "
"un service de traduction de documents, et affecte uniquement les messages qui Tidy "
"communique à vous.\n"
"\n"
"Lorsqu'il est utilisé à partir de la ligne de commande de l'argument doit \n"
"--language être utilisé avant des arguments qui résultent de la production, sinon "
"Tidy\n"
"va produire une sortie avant qu'il connaît la langue à utiliser.\n"
"\n"
"En plus des codes de langue standard POSIX, Tidy est capable de\n"
"héritées compréhension codes de langue de Windows. S'il vous plaît noter que \n"
"cette liste indique les codes Tidy comprend, et ne signifie pas que\n"
"la langue est actuellement installé.\n"
"\n"
"La colonne de droite indique comment Tidy comprendra le\n"
"héritage nom Windows.\n"
"\n"
},
/* Note printed with the language list. The quoted token is a language
   code, not a word: "en", which had been rendered as the French
   conjunction "et". */
{ TC_TXT_HELP_LANG_2, 0,
"\n"
"Notez qu'il n'y a aucune garantie qu'ils sont complets; seulement ça\n"
"un développeur ou d'une autre ont commencé à ajouter la langue indiquée.\n"
"Localisations incomplètes ne seront par défaut \"en\" si nécessaire.\n"
"S'il vous plaît signaler les cas de chaînes incorrectes à l'équipe Tidy.\n"
"\n"
},
{ TC_TXT_HELP_LANG_3, 0,
"\n"
"Si Tidy est capable de déterminer votre localisation puis Tidy utilisera le\n"
"langue locale automatiquement. Par exemple les systèmes Unix-like utilisent un $LANG "
"et/ou $LC_ALL variable d'environnement. Consultez votre exploitation documentation "
"du système pour plus d'informations.\n"
"\n"
},
{/* This MUST be present and last. */
TIDY_MESSAGE_TYPE_LAST, 0, NULL
}
}};
#endif /* language_fr_h */

81
src/language_zh_cn.h

@ -0,0 +1,81 @@
#ifndef language_zh_cn_h
#define language_zh_cn_h
/*
* language_zh_cn.h
* Localization support for HTML Tidy.
*
*
* This file is a localization file for HTML Tidy. It will have been machine
* generated or created and/or edited by hand. Both are valid options, but
* please help keep our localization efforts simple to maintain by maintaining
* the structure of this file, and changing the check box below if you make
* changes (so others know the file origin):
*
* [X] THIS FILE IS MACHINE GENERATED. It is a localization file for the
* language (and maybe region) "zh_cn". The source of
* these strings is a gettext PO file in Tidy's source, probably called
* "language_zh_cn.po".
*
* [ ] THIS FILE WAS HAND MODIFIED. Translators, please feel free to edit this file
* directly (and check this box). If you prefer to edit PO files then use
* `poconvert.rb msgunfmt language_zh_cn.h` (our own
* conversion tool) to generate a fresh PO from this file first!
*
* (c) 2015 HTACG
* See tidy.h and access.h for the copyright notice.
*
* Template Created by Jim Derry on 01/14/2016.
*
* Originating PO file metadata:
* PO_LAST_TRANSLATOR=jderry
* PO_REVISION_DATE=2016-03-24 10:59:55
*/
#ifdef _MSC_VER
#pragma execution_character_set("utf-8")
#endif
#include "language.h"
#include "access.h"
#include "message.h"
/**
* Plural-form selector for Simplified Chinese, installed as the first
* member of the language definition that follows.
*
* @param n  number of items being described (not consulted)
* @return   always 0, the only plural form this language declares
*/
static uint whichPluralForm_zh_cn(uint n) {
    /* Plural-Forms: nplurals=1; */
    const uint onlyForm = 0;
    (void)n;
    return onlyForm;
}
/**
* String table for the "zh_cn" localization.
*
* Layout: the plural-form selector, then a list of
* { message key, plural form index, text } entries. The first entry
* must be TIDY_LANGUAGE (the language code) and the last must be the
* TIDY_MESSAGE_TYPE_LAST sentinel with a NULL string.
*
* Only a handful of keys are translated here.
* NOTE(review): keys absent from this table are presumably resolved
* from another language table by the lookup code -- confirm there.
*/
static languageDefinition language_zh_cn = { whichPluralForm_zh_cn, {
/***************************************
** This MUST be present and first.
** Specify the code for this language.
***************************************/
{/* Specify the ll or ll_cc language code here. */
TIDY_LANGUAGE, 0, "zh_cn"
},
/* The file name is wrapped in a matching pair of curly quotes
   (opening “, closing ”); it previously used the closing mark twice. */
{ FILE_CANT_OPEN, 0, "无法打开“%s”\n" },
{ LINE_COLUMN_STRING, 0, "行 %d 列 %d - " },
{ STRING_CONTENT_LOOKS, 0, "文档内容看起来像 %s" },
{ TC_STRING_VERS_A, 0, "HTML Tidy 用于 %s 版本 %s" },
{ TC_STRING_VERS_B, 0, "HTML Tidy 版本 %s" },
{/* This MUST be present and last. */
TIDY_MESSAGE_TYPE_LAST, 0, NULL
}
}};
#endif /* language_zh_cn_h */

4289
src/lexer.c

File diff suppressed because it is too large

620
src/lexer.h

@ -0,0 +1,620 @@
#ifndef __LEXER_H__
#define __LEXER_H__
/* lexer.h -- Lexer for html parser
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Given an input source, it returns a sequence of tokens.
GetToken(source) gets the next token
UngetToken(source) provides one level undo
The tags include an attribute list:
- linked list of attribute/value nodes
- each node has 2 NULL-terminated strings.
- entities are replaced in attribute values
white space is compacted if not in preformatted mode
If not in preformatted mode then leading white space
is discarded and subsequent white space sequences
compacted to single space characters.
If XmlTags is no then Tag names are folded to upper
case and attribute names to lower case.
Not yet done:
- Doctype subset and marked sections
*/
#ifdef __cplusplus
extern "C" {
#endif
#include "forward.h"
/* Lexer character classes.
   Every class is a distinct single bit of an unsigned value.
*/
#define digit (1u << 0)
#define letter (1u << 1)
#define namechar (1u << 2)
#define white (1u << 3)
#define newline (1u << 4)
#define lowercase (1u << 5)
#define uppercase (1u << 6)
#define digithex (1u << 7)
/* Node kinds: node->type holds exactly one of these.
   The numeric values are written out and equal the implicit
   zero-based numbering of the declaration order.
*/
typedef enum
{
    RootNode = 0,
    DocTypeTag = 1,
    CommentTag = 2,
    ProcInsTag = 3,
    TextNode = 4,
    StartTag = 5,
    EndTag = 6,
    StartEndTag = 7,
    CDATATag = 8,
    SectionTag = 9,
    AspTag = 10,
    JsteTag = 11,
    PhpTag = 12,
    XmlDecl = 13
} NodeType;
/* States used by the lexer's GetToken.
   The numeric values are written out and equal the implicit
   zero-based numbering of the declaration order.
*/
typedef enum
{
    LEX_CONTENT = 0,
    LEX_GT = 1,
    LEX_ENDTAG = 2,
    LEX_STARTTAG = 3,
    LEX_COMMENT = 4,
    LEX_DOCTYPE = 5,
    LEX_PROCINSTR = 6,
    LEX_CDATA = 7,
    LEX_SECTION = 8,
    LEX_ASP = 9,
    LEX_JSTE = 10,
    LEX_PHP = 11,
    LEX_XMLDECL = 12
} LexerState;
/* State constants for ParseDocTypeDecl.
   The numeric values are written out and equal the implicit
   zero-based numbering of the declaration order.
*/
typedef enum
{
    DT_INTERMEDIATE = 0,
    DT_DOCTYPENAME = 1,
    DT_PUBLICSYSTEM = 2,
    DT_QUOTEDSTRING = 3,
    DT_INTSUBSET = 4
} ParseDocTypeDeclState;
/* Content model shortcut encoding.
   Each CM_* flag below is a distinct single bit (1 << 0 through 1 << 21);
   CM_UNKNOWN is the value with no bit set.
   Descriptions are tentative.
*/
#define CM_UNKNOWN 0
/* Elements with no content. Map to HTML specification. */
#define CM_EMPTY (1 << 0)
/* Elements that appear outside of "BODY". */
#define CM_HTML (1 << 1)
/* Elements that can appear within HEAD. */
#define CM_HEAD (1 << 2)
/* HTML "block" elements. */
#define CM_BLOCK (1 << 3)
/* HTML "inline" elements. */
#define CM_INLINE (1 << 4)
/* Elements that mark list item ("LI"). */
#define CM_LIST (1 << 5)
/* Elements that mark definition list item ("DL", "DT"). */
#define CM_DEFLIST (1 << 6)
/* Elements that can appear inside TABLE. */
#define CM_TABLE (1 << 7)
/* Used for "THEAD", "TFOOT" or "TBODY". */
#define CM_ROWGRP (1 << 8)
/* Used for "TD", "TH" */
#define CM_ROW (1 << 9)
/* Elements whose content must be protected against white space movement.
Includes some elements that can be found in forms. */
#define CM_FIELD (1 << 10)
/* Used to avoid propagating inline emphasis inside some elements
such as OBJECT or APPLET. */
#define CM_OBJECT (1 << 11)
/* Elements that allow "PARAM". */
#define CM_PARAM (1 << 12)
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
#define CM_FRAMES (1 << 13)
/* Heading elements (h1, h2, ...). */
#define CM_HEADING (1 << 14)
/* Elements with an optional end tag. */
#define CM_OPT (1 << 15)
/* Elements that use "align" attribute for vertical position. */
#define CM_IMG (1 << 16)
/* Elements with inline and block model. Used to avoid calling InlineDup. */
#define CM_MIXED (1 << 17)
/* Elements whose content needs to be indented only if containing one
CM_BLOCK element. */
#define CM_NO_INDENT (1 << 18)
/* Elements that are obsolete (such as "dir", "menu"). */
#define CM_OBSOLETE (1 << 19)
/* User defined elements. Used to determine how attributes without value
should be printed. */
#define CM_NEW (1 << 20)
/* Elements that cannot be omitted. */
#define CM_OMITST (1 << 21)
/* If the document uses just HTML 2.0 tags and attributes, describe
** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
** If there are proprietary tags and attributes then describe it as
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
** flavors of Voyager (strict, loose or frameset).
*/
/* unknown */
#define xxxx 0u
/* W3C defined HTML/XHTML family document types */
#define HT20 1u
#define HT32 2u
#define H40S 4u
#define H40T 8u
#define H40F 16u
#define H41S 32u
#define H41T 64u
#define H41F 128u
#define X10S 256u
#define X10T 512u
#define X10F 1024u
#define XH11 2048u
#define XB10 4096u
/* proprietary stuff */
#define VERS_SUN 8192u
#define VERS_NETSCAPE 16384u
#define VERS_MICROSOFT 32768u
/* special flag */
#define VERS_XML 65536u
/* HTML5 */
#define HT50 131072u
#define XH50 262144u
/* compatibility symbols */
#define VERS_UNKNOWN (xxxx)
#define VERS_HTML20 (HT20)
#define VERS_HTML32 (HT32)
#define VERS_HTML40_STRICT (H40S|H41S|X10S)
#define VERS_HTML40_LOOSE (H40T|H41T|X10T)
#define VERS_FRAMESET (H40F|H41F|X10F)
#define VERS_XHTML11 (XH11)
#define VERS_BASIC (XB10)
/* HTML5 */
#define VERS_HTML5 (HT50|XH50)
/* meta symbols */
#define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)
#define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
#define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)
#define VERS_FROM32 (VERS_HTML32|VERS_HTML40)
#define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
#define VERS_XHTML (X10S|X10T|X10F|XH11|XB10|XH50)
/* strict */
#define VERS_STRICT (VERS_HTML5|VERS_HTML40_STRICT)
/* all W3C defined document types */
#define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)
/* all proprietary types */
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
/* Linked list of class names and styles.
** Each node carries three strings and a link to the following node.
** The Lexer keeps the head of such a list in its `styles` member
** ("used for cleaning up presentation markup").
*/
struct _Style;
typedef struct _Style TagStyle;
struct _Style
{
tmbstr tag; /* presumably the element name the style belongs to -- confirm */
tmbstr tag_class; /* presumably the class name generated for it -- confirm */
tmbstr properties; /* presumably the style property text -- confirm */
TagStyle *next; /* next node in the list */
};
/* Linked list of style properties.
** Each node is one name/value string pair plus a link to the next node.
*/
struct _StyleProp;
typedef struct _StyleProp StyleProp;
struct _StyleProp
{
tmbstr name; /* property name */
tmbstr value; /* property value */
StyleProp *next; /* next property in the list */
};
/* Attribute/Value linked list node.
** A node's attributes are represented as a linked list of these,
** each holding the strings for one attribute/value pair.
*/
struct _AttVal
{
AttVal* next; /* next attribute in the list */
const Attribute* dict; /* read-only Attribute entry; presumably the attribute's dictionary definition -- confirm */
Node* asp; /* NOTE(review): looks like an embedded ASP node attached to this attribute -- confirm */
Node* php; /* NOTE(review): looks like an embedded PHP node attached to this attribute -- confirm */
int delim; /* presumably the quote character delimiting the value -- confirm */
tmbstr attribute; /* attribute name string */
tmbstr value; /* attribute value string */
};
/*
Mosaic handles inlines via a separate stack from other elements
We duplicate this to recover from inline markup errors such as:
<i>italic text
<p>more italic text</b> normal text
which for compatibility with Mosaic is mapped to:
<i>italic text</i>
<p><i>more italic text</i> normal text
Note that any inline end tag pop's the effect of the current
inline start tag, so that </b> pop's <i> in the above example.
*/
/* One entry of the inline stack: a copy of an inline element's
** dictionary entry, name and attribute list.
*/
struct _IStack
{
IStack* next; /* link to another stack entry */
const Dict* tag; /* tag's dictionary definition */
tmbstr element; /* name (NULL for text nodes) */
AttVal* attributes; /* attribute list for the stacked element */
};
/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
** etc. etc.
*/
/* Parse tree node. `start` and `end` are offsets into the lexer's
** character buffer; `parent` and `content` allow traversal of the
** tree in any direction.
*/
struct _Node
{
Node* parent; /* tree structure */
Node* prev; /* previous peer node */
Node* next; /* next peer node */
Node* content; /* child nodes; presumably the first child -- confirm */
Node* last; /* presumably the last child -- confirm */
AttVal* attributes; /* linked list of attribute/value pairs */
const Dict* was; /* old tag when it was changed */
const Dict* tag; /* tag's dictionary definition */
tmbstr element; /* name (NULL for text nodes) */
uint start; /* start of span onto text array */
uint end; /* end of span onto text array */
NodeType type; /* TextNode, StartTag, EndTag etc. */
uint line; /* current line of document */
uint column; /* current column of document */
Bool closed; /* true if closed by explicit end tag */
Bool implicit; /* true if inferred */
Bool linebreak; /* true if followed by a line break */
#ifdef TIDY_STORE_ORIGINAL_TEXT
/* only present when TIDY_STORE_ORIGINAL_TEXT is defined */
tmbstr otext;
#endif
};
/*
The following are private to the lexer
Use NewLexer() to create a lexer, and
FreeLexer() to free it.
*/
/* Lexer state. The members inside the `#if 0` regions are compiled
** out; their marker comment says they were moved to TidyDocImpl.
*/
struct _Lexer
{
#if 0 /* Move to TidyDocImpl */
StreamIn* in; /* document content input */
StreamOut* errout; /* error output stream */
uint badAccess; /* for accessibility errors */
uint badLayout; /* for bad style errors */
uint badChars; /* for bad character encodings */
uint badForm; /* for mismatched/mispositioned form tags */
uint warnings; /* count of warnings in this document */
uint errors; /* count of errors */
#endif
uint lines; /* lines seen */
uint columns; /* at start of current token */
Bool waswhite; /* used to collapse contiguous white space */
Bool pushed; /* true after token has been pushed back */
Bool insertspace; /* when space is moved after end tag */
Bool excludeBlocks; /* Netscape compatibility */
Bool exiled; /* true if moved out of table */
Bool isvoyager; /* true if xmlns attribute on html element */
uint versions; /* bit vector of HTML versions */
uint doctype; /* version as given by doctype (if any) */
uint versionEmitted; /* version of doctype emitted */
Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
uint txtstart; /* start of current node */
uint txtend; /* end of current node */
LexerState state; /* state of lexer's finite state machine */
Node* token; /* last token returned by GetToken() */
Node* itoken; /* last duplicate inline returned by GetToken() */
Node* root; /* remember root node of the document */
Node* parent; /* remember parent node for CDATA elements */
Bool seenEndBody; /* true if a </body> tag has been encountered */
Bool seenEndHtml; /* true if a </html> tag has been encountered */
/*
Lexer character buffer
Parse tree nodes span onto this buffer
which contains the concatenated text
contents of all of the elements.
lexsize must be reset for each file.
*/
tmbstr lexbuf; /* MB character buffer */
uint lexlength; /* allocated */
uint lexsize; /* used */
/* Inline stack for compatibility with Mosaic */
Node* inode; /* for deferring text node */
IStack* insert; /* for inferring inline tags */
IStack* istack; /* the inline stack storage itself; sized by the three counters below */
uint istacklength; /* allocated */
uint istacksize; /* used */
uint istackbase; /* start of frame */
TagStyle *styles; /* used for cleaning up presentation markup */
TidyAllocator* allocator; /* allocator */
#if 0
TidyDocImpl* doc; /* Pointer back to doc for error reporting */
#endif
};
/* Lexer Functions
*/
/* choose what version to use for new doctype */
int TY_(HTMLVersion)( TidyDocImpl* doc );
/* everything is allowed in proprietary version of HTML */
/* this is handled here rather than in the tag/attr dicts */
void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
Bool TY_(IsWhite)(uint c);
Bool TY_(IsDigit)(uint c);
Bool TY_(IsLetter)(uint c);
Bool TY_(IsHTMLSpace)(uint c);
Bool TY_(IsNewline)(uint c);
Bool TY_(IsNamechar)(uint c);
Bool TY_(IsXMLLetter)(uint c);
Bool TY_(IsXMLNamechar)(uint c);
/* Bool IsLower(uint c); */
Bool TY_(IsUpper)(uint c);
uint TY_(ToLower)(uint c);
uint TY_(ToUpper)(uint c);
Lexer* TY_(NewLexer)( TidyDocImpl* doc );
void TY_(FreeLexer)( TidyDocImpl* doc );
/* store character c as UTF-8 encoded byte stream */
void TY_(AddCharToLexer)( Lexer *lexer, uint c );
/*
Used for elements and text nodes
element name is NULL for text nodes
start and end are offsets into lexbuf
which contains the textual content of
all elements in the parse tree.
parent and content allow traversal
of the parse tree in any direction.
attributes are represented as a linked
list of AttVal nodes which hold the
strings for attribute/value pairs.
*/
Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
/* used to clone heading nodes when split by an <HR> */
Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
/* free node's attributes */
void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
/* doesn't repair attribute list linkage */
void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
/* detach attribute from node */
void TY_(DetachAttribute)( Node *node, AttVal *attr );
/* detach attribute from node then free it
*/
void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
/*
Free document nodes by iterating through peers and recursing
through children. Set next to NULL before calling FreeNode()
to avoid freeing peer nodes. Doesn't patch up prev/next links.
*/
void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
Node* TY_(TextToken)( Lexer *lexer );
/* used for creating preformatted text from Word2000 */
Node* TY_(NewLineNode)( Lexer *lexer );
/* used for adding a &nbsp; for Word2000 */
Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
/* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
/* find element */
Node* TY_(FindDocType)( TidyDocImpl* doc );
Node* TY_(FindHTML)( TidyDocImpl* doc );
Node* TY_(FindHEAD)( TidyDocImpl* doc );
Node* TY_(FindTITLE)(TidyDocImpl* doc);
Node* TY_(FindBody)( TidyDocImpl* doc );
Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
/* Returns containing block element, if any */
Node* TY_(FindContainer)( Node* node );
/* add meta element for Tidy */
Bool TY_(AddGenerator)( TidyDocImpl* doc );
uint TY_(ApparentVersion)( TidyDocImpl* doc );
ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );
Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
/* fixup doctype if missing */
Bool TY_(FixDocType)( TidyDocImpl* doc );
/* ensure XML document starts with <?xml version="1.0"?> */
/* add encoding attribute if not using ASCII or UTF-8 output */
Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
void TY_(UngetToken)( TidyDocImpl* doc );
/*
modes for GetToken()
MixedContent -- for elements which don't accept PCDATA
Preformatted -- white space preserved as is
IgnoreMarkup -- for CDATA elements such as script, style
*/
/* Mode argument passed to GetToken(). */
typedef enum
{
IgnoreWhitespace, /* NOTE(review): not described in this header; presumably skips white space -- confirm */
MixedContent, /* for elements which don't accept PCDATA */
Preformatted, /* white space preserved as is */
IgnoreMarkup, /* for CDATA elements such as script, style */
OtherNamespace, /* NOTE(review): not described in this header -- confirm against lexer.c */
CdataContent /* NOTE(review): not described in this header -- confirm against lexer.c */
} GetTokenMode;
Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
void TY_(InitMap)(void);
/* create a new attribute */
AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
/* create a new attribute with given name and value */
AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
int delim );
/* insert attribute at the end of attribute list of a node */
void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
/* insert attribute at the start of attribute list of a node */
void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
/*************************************
In-line Stack functions
*************************************/
/* duplicate attributes */
AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
/*
push a copy of an inline node onto stack
but don't push if implicit or OBJECT or APPLET
(implicit tags are ones generated from the istack)
One issue arises with pushing inlines when
the tag is already pushed. For instance:
<p><em>text
<p><em>more text
Shouldn't be mapped to
<p><em>text</em></p>
<p><em><em>more text</em></em>
*/
void TY_(PushInline)( TidyDocImpl* doc, Node* node );
/* pop inline stack */
void TY_(PopInline)( TidyDocImpl* doc, Node* node );
Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
/*
This has the effect of inserting "missing" inline
elements around the contents of blocklevel elements
such as P, TD, TH, DIV, PRE etc. This procedure is
called at the start of ParseBlock. when the inline
stack is not empty, as will be the case in:
<i><h1>italic heading</h1></i>
which is then treated as equivalent to
<h1><i>italic heading</i></h1>
This is implemented by setting the lexer into a mode
where it gets tokens from the inline stack rather than
from the input stream.
*/
int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
/*
defer duplicates when entering a table or other
element where the inlines shouldn't be duplicated
*/
void TY_(DeferDup)( TidyDocImpl* doc );
Node* TY_(InsertedToken)( TidyDocImpl* doc );
/* stack manipulation for inline elements */
Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
#ifdef __cplusplus
}
#endif
#endif /* __LEXER_H__ */

343
src/mappedio.c

@ -0,0 +1,343 @@
/* Interface to mmap style I/O
(c) 2006-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Originally contributed by Cory Nelson and Nuno Lopes
*/
/* keep these here to keep file non-empty */
#include "forward.h"
#include "mappedio.h"
#if SUPPORT_POSIX_MAPPED_FILES
#include "fileio.h"
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/mman.h>
/* State of a memory-mapped input file (POSIX build).
** Stored in TidyInputSource.sourceData by initFileSource().
*/
typedef struct
{
TidyAllocator *allocator; /* allocator that owns this struct; used to free it */
const byte *base; /* start of the read-only mapping returned by mmap() */
size_t pos, size; /* current read offset and total mapped length, in bytes */
} MappedFileSource;
/* Return the next byte of the mapped file and advance the read offset.
** Once the offset has reached the end of the mapping, EndOfStream is
** returned and the offset is left unchanged, matching what the Win32
** implementation in this file reports at end of input.
*/
static int TIDY_CALL mapped_getByte( void* sourceData )
{
    MappedFileSource* fin = (MappedFileSource*) sourceData;

    /* Never index past the mapped region, even if the caller did not
       consult the eof callback first. */
    if ( fin->pos >= fin->size )
        return EndOfStream;

    return fin->base[fin->pos++];
}
/* End-of-input test: true once the read offset is no longer
** inside the mapped region.
*/
static Bool TIDY_CALL mapped_eof( void* sourceData )
{
    const MappedFileSource* src = (const MappedFileSource*) sourceData;

    return ( src->size <= src->pos );
}
/* Push the most recently read byte back by stepping the read offset
** back one position. The byte value itself is ignored because the
** mapping is read-only and still holds it.
*/
static void TIDY_CALL mapped_ungetByte( void* sourceData, byte ARG_UNUSED(bv) )
{
    MappedFileSource* fin = (MappedFileSource*) sourceData;

    /* pos is an unsigned size_t: decrementing it at 0 would wrap to a
       huge offset, so an unget with nothing read is ignored. */
    if ( fin->pos > 0 )
        fin->pos--;
}
/* Set up `inp` to read `fp` through a read-only memory mapping.
**
** When the file cannot be stat'ed, is empty, or cannot be mapped, the
** work is delegated to the standard I/O file source instead and its
** result is returned. On the mapped path the FILE is closed here and
** 0 is returned. Returns -1 if the source state cannot be allocated.
*/
int TY_(initFileSource)( TidyAllocator *allocator, TidyInputSource* inp, FILE* fp )
{
    MappedFileSource* src;
    struct stat info;
    int descriptor;
    int mapped = 0;

    src = (MappedFileSource*) TidyAlloc( allocator, sizeof(MappedFileSource) );
    if ( !src )
        return -1;

    descriptor = fileno(fp);
    if ( fstat(descriptor, &info) != -1 && info.st_size != 0 )
    {
        src->size = info.st_size;
        src->base = mmap(0, src->size, PROT_READ, MAP_SHARED, descriptor, 0);
        mapped = ( src->base != MAP_FAILED );
    }

    if ( !mapped )
    {
        TidyFree( allocator, src );
        /* Fallback on standard I/O */
        return TY_(initStdIOFileSource)( allocator, inp, fp );
    }

    src->allocator = allocator;
    src->pos = 0;
    fclose(fp);

    inp->sourceData = src;
    inp->getByte = mapped_getByte;
    inp->eof = mapped_eof;
    inp->ungetByte = mapped_ungetByte;
    return 0;
}
/* Release an input source created by initFileSource().
** A source whose getByte callback is not the mapped reader was set up
** by the standard I/O fallback and is handed to its own free routine;
** otherwise the mapping is removed and the state struct freed.
*/
void TY_(freeFileSource)( TidyInputSource* inp, Bool closeIt )
{
    MappedFileSource* src;

    if ( inp->getByte != mapped_getByte )
    {
        TY_(freeStdIOFileSource)( inp, closeIt );
        return;
    }

    src = (MappedFileSource*) inp->sourceData;
    munmap( (void*)src->base, src->size );
    TidyFree( src->allocator, src );
}
#endif
#if defined(_WIN32)
#if defined(_MSC_VER) && (_MSC_VER < 1300) /* less than msvc++ 7.0 */
#pragma warning(disable:4115) /* named type definition in parentheses in windows headers */
#endif
#include <windows.h>
#include <errno.h>
#include "streamio.h"
#include "tidy-int.h"
#include "message.h"
/* State of a memory-mapped input file (Win32 build).
** The file is read through a sliding view of at most `gran` bytes.
*/
typedef struct _fp_input_mapped_source
{
TidyAllocator *allocator; /* allocator that owns this struct */
LONGLONG size, pos; /* file size, and file offset at which the current view starts */
HANDLE file, map; /* file handle and its file-mapping object */
byte *view, *iter, *end; /* start of current view, read cursor, one past the view's last byte */
unsigned int gran; /* view step size; set from the system allocation granularity */
} MappedFileSource;
/* Map the part of the file that starts at data->pos, replacing any
** view that is currently open. The view covers `gran` bytes, or
** whatever remains of the file if that is less. On success the read
** cursor is placed at the start of the new view and 0 is returned;
** on failure -1 is returned and data->view is NULL.
*/
static int mapped_openView( MappedFileSource *data )
{
    LONGLONG remaining = data->size - data->pos;
    DWORD viewBytes;

    if ( remaining > data->gran )
        viewBytes = data->gran;
    else
        viewBytes = (DWORD) remaining;

    if ( data->view )
        UnmapViewOfFile( data->view );

    /* The 64-bit offset is passed as separate high and low halves. */
    data->view = MapViewOfFile( data->map, FILE_MAP_READ,
                                (DWORD)( data->pos >> 32 ),
                                (DWORD)data->pos, viewBytes );
    if ( data->view == NULL )
        return -1;

    data->iter = data->view;
    data->end = data->view + viewBytes;
    return 0;
}
/* Return the next byte of the file. When the current view is used up
** (or no view is open) the source steps forward by one granule and
** maps the next view; EndOfStream is returned when that step passes
** the end of the file or the view cannot be mapped.
*/
static int TIDY_CALL mapped_getByte( void *sourceData )
{
    MappedFileSource *src = sourceData;

    /* Fast path: bytes are still available in the current view. */
    if ( src->view && src->iter < src->end )
        return *( src->iter++ );

    src->pos += src->gran;
    if ( src->pos >= src->size || mapped_openView(src) != 0 )
        return EndOfStream;

    return *( src->iter++ );
}
/* End-of-input test: true once the view offset has moved to or past
** the file size.
*/
static Bool TIDY_CALL mapped_eof( void *sourceData )
{
    MappedFileSource *src = sourceData;

    return ( src->size <= src->pos );
}
/* Push the most recently read byte back. The byte value is ignored
** because the mapped data still holds it; only the read cursor moves.
**
** Inside a view the cursor simply steps back. At the very start of a
** view the previous view is mapped again and the cursor is placed on
** its last byte. An unget before anything was read (first view,
** cursor at its start) is a caller error and is ignored after
** asserting.
*/
static void TIDY_CALL mapped_ungetByte( void *sourceData, byte ARG_UNUSED(bt) )
{
    MappedFileSource *data = sourceData;

    /* Strictly greater: when iter == view there is no byte before the
       cursor in this view, so decrementing would leave the mapping. */
    if ( data->view && data->iter > data->view )
    {
        --data->iter;
        return;
    }
    if ( data->pos < data->gran )
    {
        assert(0);
        return;
    }
    data->pos -= data->gran;
    /* mapped_openView() leaves the cursor at the start of the view; the
       byte being pushed back is the last one of the previous view. */
    if ( mapped_openView( data ) == 0 )
        data->iter = data->end - 1;
}
/* Set up `inp` to read the open file handle `fp` through a file mapping.
**
** Installs the mapped_* callbacks, allocates the source state, determines
** the file size (by one of three compiler-specific methods), creates the
** file-mapping object and opens the first view.
**
** Returns 0 on success. Returns -1 on any failure, after freeing the
** state it allocated; `fp` itself is never closed here. Note that the
** callbacks are installed before the fallible steps, so on failure `inp`
** still points at them while inp->sourceData has not been set.
*/
static int initMappedFileSource( TidyAllocator *allocator, TidyInputSource* inp, HANDLE fp )
{
MappedFileSource* fin = NULL;
inp->getByte = mapped_getByte;
inp->eof = mapped_eof;
inp->ungetByte = mapped_ungetByte;
fin = (MappedFileSource*) TidyAlloc( allocator, sizeof(MappedFileSource) );
if ( !fin )
return -1;
#if defined(__MINGW32__)
{
/* MinGW: GetFileSize() reports the size as two 32-bit halves. */
DWORD lowVal, highVal;
lowVal = GetFileSize(fp, &highVal);
/* INVALID_FILE_SIZE can be a legitimate low half, so it only counts as
   an error when GetLastError() also reports one. */
if ((lowVal == INVALID_FILE_SIZE) && (GetLastError() != NO_ERROR))
{
TidyFree(allocator, fin);
return -1;
}
/* Combine the halves into the 64-bit size. */
fin->size = highVal;
fin->size = (fin->size << 32);
fin->size += lowVal;
}
#else /* NOT a MinGW build */
#if defined(_MSC_VER) && (_MSC_VER < 1300) /* less than msvc++ 7.0 */
{
/* Old MSVC: write the two halves directly into fin->size through a
   LARGE_INTEGER overlay. */
LARGE_INTEGER* pli = (LARGE_INTEGER *)&fin->size;
(DWORD)pli->LowPart = GetFileSize( fp, (DWORD *)&pli->HighPart );
if ( GetLastError() != NO_ERROR || fin->size <= 0 )
{
TidyFree(allocator, fin);
return -1;
}
}
#else
/* Other compilers: GetFileSizeEx() fills the 64-bit size in one call.
   An empty file is rejected here as well. */
if ( !GetFileSizeEx( fp, (LARGE_INTEGER*)&fin->size )
|| fin->size <= 0 )
{
TidyFree(allocator, fin);
return -1;
}
#endif
#endif /* MinGW y/n */
/* Read-only mapping object; the 0, 0 maximum size means "size of the file". */
fin->map = CreateFileMapping( fp, NULL, PAGE_READONLY, 0, 0, NULL );
if ( !fin->map )
{
TidyFree(allocator, fin);
return -1;
}
{
/* Views are opened in steps of the system allocation granularity. */
SYSTEM_INFO info;
GetSystemInfo( &info );
fin->gran = info.dwAllocationGranularity;
}
fin->allocator = allocator;
fin->pos = 0;
/* view must be NULL before the first mapped_openView(), which unmaps
   any non-NULL view it finds. */
fin->view = NULL;
fin->iter = NULL;
fin->end = NULL;
if ( mapped_openView( fin ) != 0 )
{
CloseHandle( fin->map );
TidyFree( allocator, fin );
return -1;
}
fin->file = fp;
inp->sourceData = fin;
return 0;
}
/* Release a source created by initMappedFileSource().
**
** The view and the file-mapping object belong to the source and are
** always released, since the state struct that tracks them is freed
** here. The underlying file handle is closed only when `closeIt` is
** set. A source whose sourceData is NULL is ignored.
*/
static void freeMappedFileSource( TidyInputSource* inp, Bool closeIt )
{
    MappedFileSource* fin = (MappedFileSource*) inp->sourceData;

    /* Nothing was attached, so there is nothing to release. */
    if ( !fin )
        return;

    if ( fin->view )
        UnmapViewOfFile( fin->view );
    if ( fin->map )
        CloseHandle( fin->map );
    if ( closeIt && fin->file != INVALID_HANDLE_VALUE )
        CloseHandle( fin->file );

    TidyFree( fin->allocator, fin );
}
/* Create an input stream for `doc` that reads the open file handle
** `fp` through a file mapping, using the given character encoding.
** Returns NULL (after freeing the stream) if the mapped source cannot
** be initialised.
*/
StreamIn* MappedFileInput ( TidyDocImpl* doc, HANDLE fp, int encoding )
{
    StreamIn *stream = TY_(initStreamIn)( doc, encoding );

    if ( initMappedFileSource( doc->allocator, &stream->source, fp ) == 0 )
    {
        stream->iotype = FileIO;
        return stream;
    }

    TY_(freeStreamIn)( stream );
    return NULL;
}
/* Parse the file named `filnam` into `doc`, reading it through a
** Win32 file mapping.
**
** Returns the status of TY_(DocParseStream) on success, -ENOENT (after
** reporting a file error) when the file cannot be opened, and -ENOMEM
** when the mapped input stream cannot be created.
*/
int TY_(DocParseFileWithMappedFile)( TidyDocImpl* doc, ctmbstr filnam ) {
int status = -ENOENT;
HANDLE fin = CreateFileA( filnam, GENERIC_READ, FILE_SHARE_READ, NULL,
OPEN_EXISTING, 0, NULL );
#if PRESERVE_FILE_TIMES
LONGLONG actime, modtime;
TidyClearMemory( &doc->filetimes, sizeof(doc->filetimes) );
/* Capture the file's last-access and last-write times (the creation
   time argument is NULL) when the TidyKeepFileTimes option is on. */
if ( fin != INVALID_HANDLE_VALUE && cfgBool(doc,TidyKeepFileTimes) &&
GetFileTime(fin, NULL, (FILETIME*)&actime, (FILETIME*)&modtime) )
{
/* TY_I64 appends the compiler's 64-bit literal suffix: LL by default,
   i64 for MSVC older than 7.0. */
#define TY_I64(str) TYDYAPPEND(str,LL)
#if _MSC_VER < 1300 && !defined(__GNUC__) /* less than msvc++ 7.0 */
# undef TY_I64
# define TY_I64(str) TYDYAPPEND(str,i64)
#endif
/* FILETIME counts 100-nanosecond ticks; subtracting the offset and
   dividing by 10000000 yields seconds as a time_t. The offset
   116444736000000000 is the FILETIME value of 1970-01-01 (the Unix
   epoch). */
doc->filetimes.actime =
(time_t)( ( actime - TY_I64(116444736000000000)) / 10000000 );
doc->filetimes.modtime =
(time_t)( ( modtime - TY_I64(116444736000000000)) / 10000000 );
}
#endif
if ( fin != INVALID_HANDLE_VALUE )
{
StreamIn* in = MappedFileInput( doc, fin,
cfg( doc, TidyInCharEncoding ) );
if ( !in )
{
/* The stream never took ownership of the handle; close it here. */
CloseHandle( fin );
return -ENOMEM;
}
status = TY_(DocParseStream)( doc, in );
/* `yes` also closes the file handle along with the mapping. */
freeMappedFileSource( &in->source, yes );
TY_(freeStreamIn)( in );
}
else /* Error message! */
TY_(FileError)( doc, filnam, TidyError );
return status;
}
#endif
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

15
src/mappedio.h

@ -0,0 +1,15 @@
#ifndef __TIDY_MAPPED_IO_H__
#define __TIDY_MAPPED_IO_H__
/* Interface to mmap style I/O
(c) 2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#if defined(_WIN32)
int TY_(DocParseFileWithMappedFile)( TidyDocImpl* doc, ctmbstr filnam );
#endif
#endif /* __TIDY_MAPPED_IO_H__ */

1102
src/message.c

File diff suppressed because it is too large

282
src/message.h

@ -0,0 +1,282 @@
#ifndef __MESSAGE_H__
#define __MESSAGE_H__
/* message.h -- general message writing routines
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
#include "tidy.h" /* For TidyReportLevel */
#include "language.h"
/* General message writing routines.
** Each message is a single warning, error, etc.
**
** These routines keep track of counts and,
** if the caller has set a filter, it will be
** called. The new preferred way of handling
** Tidy diagnostics output is either a) define
** a new output sink or b) install a message
** filter routine.
**
** Keep track of ShowWarnings, ShowErrors, etc.
*/
ctmbstr TY_(ReleaseDate)(void);
void TY_(ReportUnknownOption)( TidyDocImpl* doc, ctmbstr option );
void TY_(ReportBadArgument)( TidyDocImpl* doc, ctmbstr option );
void TY_(NeedsAuthorIntervention)( TidyDocImpl* doc );
void TY_(ReportMarkupVersion)( TidyDocImpl* doc );
void TY_(ReportNumWarnings)( TidyDocImpl* doc );
void TY_(GeneralInfo)( TidyDocImpl* doc );
/* void TY_(UnknownOption)( TidyDocImpl* doc, char c ); */
/* void TY_(UnknownFile)( TidyDocImpl* doc, ctmbstr program, ctmbstr file ); */
void TY_(FileError)( TidyDocImpl* doc, ctmbstr file, TidyReportLevel level );
void TY_(ErrorSummary)( TidyDocImpl* doc );
void TY_(ReportEncodingWarning)(TidyDocImpl* doc, uint code, uint encoding);
void TY_(ReportEncodingError)(TidyDocImpl* doc, uint code, uint c, Bool discarded);
void TY_(ReportEntityError)( TidyDocImpl* doc, uint code, ctmbstr entity, int c );
void TY_(ReportAttrError)( TidyDocImpl* doc, Node* node, AttVal* av, uint code );
void TY_(ReportMissingAttr)( TidyDocImpl* doc, Node* node, ctmbstr name );
#if SUPPORT_ACCESSIBILITY_CHECKS
void TY_(ReportAccessWarning)( TidyDocImpl* doc, Node* node, uint code );
void TY_(ReportAccessError)( TidyDocImpl* doc, Node* node, uint code );
#endif
void TY_(ReportNotice)(TidyDocImpl* doc, Node *element, Node *node, uint code);
void TY_(ReportWarning)(TidyDocImpl* doc, Node *element, Node *node, uint code);
void TY_(ReportError)(TidyDocImpl* doc, Node* element, Node* node, uint code);
void TY_(ReportFatal)(TidyDocImpl* doc, Node* element, Node* node, uint code);
/**
* These tidyErrorCodes are used throughout libtidy, and also
* have associated localized strings to describe them.
*
* IMPORTANT: to maintain compatibility with TidyMessageFilter3, if you add
* or remove keys from this enum, ALSO add/remove the corresponding key
* in language.c:tidyErrorFilterKeysStruct[]!
*/
typedef enum {
/* This MUST be present and first. */
CODES_TIDY_ERROR_FIRST = 200,
/* error codes for entities/numeric character references */
MISSING_SEMICOLON,
MISSING_SEMICOLON_NCR,
UNKNOWN_ENTITY,
UNESCAPED_AMPERSAND,
APOS_UNDEFINED,
/* error codes for element messages */
MISSING_ENDTAG_FOR,
MISSING_ENDTAG_BEFORE,
DISCARDING_UNEXPECTED,
NESTED_EMPHASIS,
NON_MATCHING_ENDTAG,
TAG_NOT_ALLOWED_IN,
MISSING_STARTTAG,
UNEXPECTED_ENDTAG,
USING_BR_INPLACE_OF,
INSERTING_TAG,
SUSPECTED_MISSING_QUOTE,
MISSING_TITLE_ELEMENT,
DUPLICATE_FRAMESET,
CANT_BE_NESTED,
OBSOLETE_ELEMENT,
PROPRIETARY_ELEMENT,
ELEMENT_VERS_MISMATCH_ERROR,
ELEMENT_VERS_MISMATCH_WARN,
UNKNOWN_ELEMENT,
TRIM_EMPTY_ELEMENT,
COERCE_TO_ENDTAG,
ILLEGAL_NESTING,
NOFRAMES_CONTENT,
CONTENT_AFTER_BODY,
INCONSISTENT_VERSION,
MALFORMED_COMMENT,
BAD_COMMENT_CHARS,
BAD_XML_COMMENT,
BAD_CDATA_CONTENT,
INCONSISTENT_NAMESPACE,
DOCTYPE_AFTER_TAGS,
MALFORMED_DOCTYPE,
UNEXPECTED_END_OF_FILE,
DTYPE_NOT_UPPER_CASE,
TOO_MANY_ELEMENTS,
UNESCAPED_ELEMENT,
NESTED_QUOTATION,
ELEMENT_NOT_EMPTY,
ENCODING_IO_CONFLICT,
MIXED_CONTENT_IN_BLOCK,
MISSING_DOCTYPE,
SPACE_PRECEDING_XMLDECL,
TOO_MANY_ELEMENTS_IN,
UNEXPECTED_ENDTAG_IN,
REPLACING_ELEMENT,
REPLACING_UNEX_ELEMENT,
COERCE_TO_ENDTAG_WARN,
/* error codes used for attribute messages */
UNKNOWN_ATTRIBUTE,
INSERTING_ATTRIBUTE,
INSERTING_AUTO_ATTRIBUTE,
MISSING_ATTR_VALUE,
BAD_ATTRIBUTE_VALUE,
UNEXPECTED_GT,
PROPRIETARY_ATTRIBUTE,
MISMATCHED_ATTRIBUTE_ERROR,
MISMATCHED_ATTRIBUTE_WARN,
PROPRIETARY_ATTR_VALUE,
REPEATED_ATTRIBUTE,
MISSING_IMAGEMAP,
XML_ATTRIBUTE_VALUE,
UNEXPECTED_QUOTEMARK,
MISSING_QUOTEMARK,
ID_NAME_MISMATCH,
BACKSLASH_IN_URI,
FIXED_BACKSLASH,
ILLEGAL_URI_REFERENCE,
ESCAPED_ILLEGAL_URI,
NEWLINE_IN_URI,
ANCHOR_NOT_UNIQUE,
JOINING_ATTRIBUTE,
UNEXPECTED_EQUALSIGN,
ATTR_VALUE_NOT_LCASE,
XML_ID_SYNTAX,
INVALID_ATTRIBUTE,
BAD_ATTRIBUTE_VALUE_REPLACED,
INVALID_XML_ID,
UNEXPECTED_END_OF_FILE_ATTR,
MISSING_ATTRIBUTE,
WHITE_IN_URI,
REMOVED_HTML5, /* this element removed from HTML5 */
BAD_SUMMARY_HTML5, /* use of summary attr removed from HTML5 */
PREVIOUS_LOCATION, /* last */
/* character encoding errors */
VENDOR_SPECIFIC_CHARS,
INVALID_SGML_CHARS,
INVALID_UTF8,
INVALID_UTF16,
ENCODING_MISMATCH,
INVALID_URI,
INVALID_NCR,
/* This MUST be present and last. */
CODES_TIDY_ERROR_LAST
} tidyErrorCodes;
/**
* These tidyMessagesMisc are used throughout libtidy, and also
* have associated localized strings to describe them.
*/
typedef enum {
ACCESS_URL = 2048, /* Used to point to Web Accessibility Guidelines. */
ATRC_ACCESS_URL, /* Points to Tidy's accessibility page. */
FILE_CANT_OPEN, /* For retrieving a string when a file can't be opened. */
LINE_COLUMN_STRING, /* For retrieving localized `line %d column %d` text. */
STRING_CONTENT_LOOKS, /* `Document content looks like %s`. */
STRING_DISCARDING, /* For `discarding`. */
STRING_DOCTYPE_GIVEN, /* `Doctype given is \"%s\"`. */
STRING_ERROR_COUNT, /* `%u %s, %u %s were found!`. */
STRING_ERROR_COUNT_ERROR, /* `error` and `errors`. */
STRING_ERROR_COUNT_WARNING, /* `warning` and `warnings`. */
STRING_HELLO_ACCESS, /* Accessibility hello message. */
STRING_HTML_PROPRIETARY, /* `HTML Proprietary`. */
STRING_MISSING_MALFORMED, /* For `missing or malformed argument for option: %s`. */
STRING_NO_ERRORS, /* `No warnings or errors were found.\n\n`. */
STRING_NO_SYSID, /* `No system identifier in emitted doctype`. */
STRING_NOT_ALL_SHOWN, /* ` Not all warnings/errors were shown.\n\n`. */
STRING_PLAIN_TEXT, /* For retrieving a string `plain text`. */
STRING_REPLACING, /* For `replacing`. */
STRING_SPECIFIED, /* For `specified`. */
STRING_UNKNOWN_FILE, /* `%s: can't open file \"%s\"\n`. */
STRING_UNKNOWN_OPTION, /* For retrieving a string `unknown option: %s`. */
STRING_UNRECZD_OPTION, /* `unrecognized option -%c use -help to list options\n`. */
STRING_XML_DECLARATION, /* For retrieving a string `XML declaration`. */
TEXT_ACCESS_ADVICE1, /* Explanatory text. */
TEXT_ACCESS_ADVICE2, /* Explanatory text. */
TEXT_BAD_FORM, /* Explanatory text. */
TEXT_BAD_MAIN, /* Explanatory text. */
TEXT_GENERAL_INFO, /* Explanatory text. */
TEXT_GENERAL_INFO_PLEA, /* Explanatory text. */
TEXT_HTML_T_ALGORITHM, /* Paragraph for describing the HTML table algorithm. */
TEXT_INVALID_URI, /* Explanatory text. */
TEXT_INVALID_UTF16, /* Explanatory text. */
TEXT_INVALID_UTF8, /* Explanatory text. */
TEXT_M_IMAGE_ALT, /* Explanatory text. */
TEXT_M_IMAGE_MAP, /* Explanatory text. */
TEXT_M_LINK_ALT, /* Explanatory text. */
TEXT_M_SUMMARY, /* Explanatory text. */
TEXT_NEEDS_INTERVENTION, /* Explanatory text. */
TEXT_SGML_CHARS, /* Explanatory text. */
TEXT_USING_BODY, /* Explanatory text. */
TEXT_USING_FONT, /* Explanatory text. */
TEXT_USING_FRAMES, /* Explanatory text. */
TEXT_USING_LAYER, /* Explanatory text. */
TEXT_USING_NOBR, /* Explanatory text. */
TEXT_USING_SPACER, /* Explanatory text. */
TEXT_VENDOR_CHARS, /* Explanatory text. */
TEXT_WINDOWS_CHARS /* Explanatory text. */
} tidyMessagesMisc;
/* accessibility flaws -- each value is a distinct bit, so flags can
   be OR-ed together in one word */
#define BA_MISSING_IMAGE_ALT 1
#define BA_MISSING_LINK_ALT 2
#define BA_MISSING_SUMMARY 4
#define BA_MISSING_IMAGE_MAP 8
#define BA_USING_FRAMES 16
#define BA_USING_NOFRAMES 32
#define BA_INVALID_LINK_NOFRAMES 64 /* WAI [6.5.1.4] */
/* Top bit of a 32-bit flag word. The operand is unsigned because
   shifting a signed 1 into the sign bit (1 << 31) is undefined
   behaviour in C; the resulting bit pattern is unchanged. */
#define BA_WAI (1u << 31)
/* presentation flaws */
#define USING_SPACER 1
#define USING_LAYER 2
#define USING_NOBR 4
#define USING_FONT 8
#define USING_BODY 16
/* badchar bit field */
#define BC_VENDOR_SPECIFIC_CHARS 1
#define BC_INVALID_SGML_CHARS 2
#define BC_INVALID_UTF8 4
#define BC_INVALID_UTF16 8
#define BC_ENCODING_MISMATCH 16 /* fatal error */
#define BC_INVALID_URI 32
#define BC_INVALID_NCR 64
/* Lexer and I/O Macros */
#define REPLACED_CHAR 0
#define DISCARDED_CHAR 1
#endif /* __MESSAGE_H__ */

5057
src/parser.c

File diff suppressed because it is too large

70
src/parser.h

@ -0,0 +1,70 @@
#ifndef __PARSER_H__
#define __PARSER_H__
/* parser.h -- HTML Parser
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
Bool TY_(CheckNodeIntegrity)(Node *node);
Bool TY_(TextNodeEndWithSpace)( Lexer *lexer, Node *node );
/*
used to determine how attributes
without values should be printed
this was introduced to deal with
user defined tags e.g. Cold Fusion
*/
Bool TY_(IsNewNode)(Node *node);
void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool expected);
/* extract a node and its children from a markup tree */
Node *TY_(RemoveNode)(Node *node);
/* remove node from markup tree and discard it */
Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element);
/* insert node into markup tree as the first element
of content of element */
void TY_(InsertNodeAtStart)(Node *element, Node *node);
/* insert node into markup tree as the last element
of content of "element" */
void TY_(InsertNodeAtEnd)(Node *element, Node *node);
/* insert node into markup tree before element */
void TY_(InsertNodeBeforeElement)(Node *element, Node *node);
/* insert node into markup tree after element */
void TY_(InsertNodeAfterElement)(Node *element, Node *node);
Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element );
Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node);
/* assumes node is a text node */
Bool TY_(IsBlank)(Lexer *lexer, Node *node);
Bool TY_(IsJavaScript)(Node *node);
/*
HTML is the top level element
*/
void TY_(ParseDocument)( TidyDocImpl* doc );
/*
XML documents
*/
Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element );
void TY_(ParseXMLDocument)( TidyDocImpl* doc );
#endif /* __PARSER_H__ */

6
src/platform.h

@ -0,0 +1,6 @@
#ifdef __GNUC__
#warning "FIXME: Using compatibility tidy header (platform.h) that will go away!"
#endif
#include "tidyplatform.h"

2564
src/pprint.c

File diff suppressed because it is too large

94
src/pprint.h

@ -0,0 +1,94 @@
#ifndef __PPRINT_H__
#define __PPRINT_H__
/* pprint.h -- pretty print parse tree
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
/*
Block-level and unknown elements are printed on
new lines and their contents indented 2 spaces.
Inline elements are printed inline.
Inline content is wrapped on spaces (except in
attribute values or preformatted text), after
start tags and before end tags.
*/
#define NORMAL 0u
#define PREFORMATTED 1u
#define COMMENT 2u
#define ATTRIBVALUE 4u
#define NOWRAP 8u
#define CDATA 16u
/* The pretty printer keeps at most two lines of text in the
** buffer before flushing output. We need to capture the
** indent state (indent level) at the _beginning_ of _each_
** line, not the end of just the second line.
**
** We must also keep track "In Attribute" and "In String"
** states at the _end_ of each line,
*/
typedef struct _TidyIndent
{
int spaces;
int attrValStart;
int attrStringStart;
} TidyIndent;
/* Pretty printer state: the line buffer plus the indent state of the
** (at most two) lines held in it.
*/
typedef struct _TidyPrintImpl
{
TidyAllocator *allocator; /* Allocator */
uint *linebuf; /* line buffer; one uint per element */
uint lbufsize; /* presumably the allocated size of linebuf -- confirm */
uint linelen; /* presumably the number of elements used in linebuf -- confirm */
uint wraphere; /* presumably the position at which the line may be wrapped -- confirm */
uint line; /* NOTE(review): looks like an output line counter -- confirm */
uint ixInd; /* presumably the index of the active entry in indent[] -- confirm */
TidyIndent indent[2]; /* Two lines worth of indent state */
} TidyPrintImpl;
#if 0 && SUPPORT_ASIAN_ENCODINGS
/* #431953 - start RJ Wraplen adjusted for smooth international ride */
uint CWrapLen( TidyDocImpl* doc, uint ind );
#endif
void TY_(InitPrintBuf)( TidyDocImpl* doc );
void TY_(FreePrintBuf)( TidyDocImpl* doc );
void TY_(PFlushLine)( TidyDocImpl* doc, uint indent );
/* print just the content of the body element.
** useful when you want to reuse material from
** other documents.
**
** -- Sebastiano Vigna <vigna@dsi.unimi.it>
*/
void TY_(PrintBody)( TidyDocImpl* doc ); /* you can print an entire document */
/* node as body using PPrintTree() */
void TY_(PPrintTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
void TY_(PPrintXMLTree)( TidyDocImpl* doc, uint mode, uint indent, Node *node );
/*\
* 20150515 - support using tabs instead of spaces
\*/
void TY_(PPrintTabs)(void);
void TY_(PPrintSpaces)(void);
#endif /* __PPRINT_H__ */

446
src/sprtf.c

@ -0,0 +1,446 @@
/*
* SPRTF - Log output utility
*
* Author: Geoff R. McLane <reports _at_ geoffair _dot_ info>
* License: GPL v2 (or later at your choice)
*
* Revision 1.0.1 2012/11/06 13:01:25 geoff
* Revision 1.0.0 2012/10/17 00:00:00 geoff
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, US
*
*/
#ifdef _MSC_VER
#pragma warning( disable : 4995 )
#endif
// Module: sprtf.cxx
// Debug log file output
#include <stdio.h> // fopen()...
#include <string.h> // strcpy
#include <stdarg.h> // va_start, va_end, ...
#ifdef _MSC_VER
#include <WinSock2.h>
#include <sys/timeb.h>
#if (defined(UNICODE) || defined(_UNICODE))
#include <Strsafe.h>
#endif
#else /* !_MSC_VER */
#include <sys/time.h> // gettimeoday(), struct timeval,...
#endif /* _MSC_VER y/n */
#include <time.h>
#include <stdlib.h> // for exit() in unix
#include "sprtf.h"
#ifdef _MSC_VER
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif // #ifndef _CRT_SECURE_NO_DEPRECATE
#pragma warning( disable:4996 )
#else
#define strcmpi strcasecmp
#endif
#ifndef MX_ONE_BUF
#define MX_ONE_BUF 1024
#endif
#ifndef MX_BUFFERS
#define MX_BUFFERS 1024
#endif
static char _s_strbufs[MX_ONE_BUF * MX_BUFFERS];
static int iNextBuf = 0;
// Return the next MX_ONE_BUF-byte slot from the static ring of scratch
// buffers. The index wraps back to slot 0 after MX_BUFFERS - 1, so a
// returned pointer is overwritten once the ring comes round again.
char *GetNxtBuf()
{
    const int candidate = iNextBuf + 1;
    iNextBuf = ( candidate >= MX_BUFFERS ) ? 0 : candidate;
    return _s_strbufs + ( MX_ONE_BUF * iNextBuf );
}
#define MXIO 512
#ifdef _MSC_VER // use local log
static char def_log[] = "tempex.txt";
#else
static char def_log[] = "ex.log";
#endif
static char logfile[264] = "\0";
static FILE * outfile = NULL;
static int addsystime = 0;
static int addsysdate = 0;
static int addstdout = 1;
static int addflush = 1;
static int add2screen = 0;
static int add2listview = 0;
static int append_to_log = 0;
#ifndef VFP
#define VFP(a) ( a && ( a != (FILE *)-1 ) )
#endif
// Set the list-view output flag to val and return the value it had before.
int add_list_out( int val )
{
    const int previous = add2listview;
    add2listview = val;
    return previous;
}
// Set the "also write to stdout" flag to val and return the value it had before.
int add_std_out( int val )
{
    const int previous = addstdout;
    addstdout = val;
    return previous;
}
// Set the screen-list output flag to val and return the value it had before.
int add_screen_out( int val )
{
    const int previous = add2screen;
    add2screen = val;
    return previous;
}
// Set the "prefix log lines with the time" flag to val and return the
// value it had before.
int add_sys_time( int val )
{
    const int previous = addsystime;
    addsystime = val;
    return previous;
}
// Set the "prefix log lines with date and time" flag to val and return
// the value it had before.
int add_sys_date( int val )
{
    const int previous = addsysdate;
    addsysdate = val;
    return previous;
}
// Set the "open the log in append mode" flag to val and return the value
// it had before. The flag is read by open_log_file().
int add_append_log( int val )
{
    const int previous = append_to_log;
    append_to_log = val;
    return previous;
}
#ifdef _MSC_VER
static const char *mode = "wb"; // in window sprtf looks after the line endings
#else
static const char *mode = "w";
#endif
// Open the log file named in logfile, using the default name if none has
// been set. When append_to_log is set the file-level mode string is
// switched to append; this function never switches it back.
// Returns 1 on success. On failure the log is marked disabled, an error
// is reported through sprtf() and the process exits with status 1.
int open_log_file( void )
{
    if ( !logfile[0] )
        strcpy( logfile, def_log );
#ifdef _MSC_VER
    if ( append_to_log )
        mode = "ab"; // in window sprtf looks after the line endings
#else
    if ( append_to_log )
        mode = "a";
#endif
    outfile = fopen( logfile, mode );
    if ( outfile != 0 )
        return 1; /* success */
    // mark the log as disabled so the sprtf() below does not try to reopen it
    outfile = (FILE *)-1;
    sprtf("ERROR: Failed to open log file [%s] ...\n", logfile);
    exit(1); /* failed */
    return 0; /* failed */
}
// Close the log file if one is open, and reset the handle to NULL so the
// next write reopens it. The "disabled" sentinel (FILE *)-1 is not
// passed to fclose(), but is reset to NULL as well.
void close_log_file( void )
{
    FILE *current = outfile;
    outfile = NULL;
    if ( VFP(current) )
        fclose( current );
}
// Return the current log file name, filling in the default name first if
// none has been set. Returns the literal "none" when logging is disabled.
char * get_log_file( void )
{
    if ( !logfile[0] )
        strcpy( logfile, def_log );
    return ( outfile == (FILE *)-1 ) ? (char *)"none" : logfile;
}
// Switch logging to the file named nf.
//   nf   - new log file name; NULL, empty, or a name equal (ignoring
//          case) to the current one leaves everything unchanged. The
//          special name "none" disables the log file.
//   open - non-zero opens the new file at once; zero defers opening
//          until the first write.
// A name that does not fit in the fixed-size logfile buffer is rejected
// with a warning; the original copied it unchecked and could overrun
// the buffer.
void set_log_file( char * nf, int open )
{
    if (logfile[0] == 0)
        strcpy(logfile,def_log);
    if ( !nf || !*nf || strcmpi(nf,logfile) == 0 )
        return; // nothing to change
    if ( strlen(nf) >= sizeof(logfile) ) {
        sprtf("WARNING: Log file name too long (max %u chars), ignored ...\n",
              (unsigned)(sizeof(logfile) - 1));
        return;
    }
    close_log_file(); // remove any previous
    strcpy(logfile,nf); // set new name - length checked above
    if (strcmp(logfile,"none") == 0) { // if equal 'none'
        outfile = (FILE *)-1; // disable the log file
    } else if (open) {
        open_log_file(); // and open it ... anything previous written is 'lost'
    } else {
        outfile = 0; // else set 0 to open on first write
    }
}
#ifdef _MSC_VER
// Minimal gettimeofday() replacement for MSVC builds.
//   tp  - receives the current time; millisecond resolution when WIN32
//         is defined, whole seconds otherwise.
//   tzp - not used.
// Always returns 0.
int gettimeofday(struct timeval *tp, void *tzp)
{
#ifdef WIN32
    struct _timeb now;
    _ftime(&now);
    tp->tv_sec = (long)now.time;
    tp->tv_usec = now.millitm * 1000; // milliseconds -> microseconds
#else
    tp->tv_sec = time(NULL);
    tp->tv_usec = 0;
#endif
    return 0;
}
#endif // _MSC_VER
// Append the local date of *ptv, formatted "YYYY/MM/DD", to the string
// already in ps. ps must have room for the appended text (strftime is
// given a 128-byte limit). Nothing is appended if localtime() fails.
void add_date_stg( char *ps, struct timeval *ptv )
{
    time_t secs = (ptv->tv_sec & 0xffffffff);
    struct tm * parts = localtime(&secs);
    if (!parts)
        return;
    strftime(EndBuf(ps),128,"%Y/%m/%d",parts);
}
// Append the local time of *ptv, formatted "HH:MM:SS", to the string
// already in ps. ps must have room for the appended text (strftime is
// given a 128-byte limit). Nothing is appended if localtime() fails.
void add_time_stg( char *ps, struct timeval *ptv )
{
    time_t secs = (ptv->tv_sec & 0xffffffff);
    struct tm * parts = localtime(&secs);
    if (!parts)
        return;
    strftime(EndBuf(ps),128,"%H:%M:%S",parts);
}
// Return the current local date as "YYYY/MM/DD" in a scratch buffer
// obtained from GetNxtBuf(); the buffer is reused by later calls.
char *get_date_stg()
{
    struct timeval now;
    char *out = GetNxtBuf();
    gettimeofday( &now, 0 );
    out[0] = 0; // start from an empty string; add_date_stg appends
    add_date_stg( out, &now );
    return out;
}
// Return the current local time as "HH:MM:SS" in a scratch buffer
// obtained from GetNxtBuf(); the buffer is reused by later calls.
char *get_time_stg()
{
    struct timeval now;
    char *out = GetNxtBuf();
    gettimeofday( &now, 0 );
    out[0] = 0; // start from an empty string; add_time_stg appends
    add_time_stg( out, &now );
    return out;
}
// Return the current local date and time as "YYYY/MM/DD HH:MM:SS" in a
// scratch buffer obtained from GetNxtBuf(); the buffer is reused by
// later calls. Both parts come from the same clock reading.
char *get_date_time_stg()
{
    struct timeval now;
    char *out = GetNxtBuf();
    gettimeofday( &now, 0 );
    out[0] = 0; // start from an empty string; the helpers append
    add_date_stg( out, &now );
    strcat( out, " " );
    add_time_stg( out, &now );
    return out;
}
static void oi( char * psin )
{
int len, w;
char * ps = psin;
if (!ps)
return;
len = (int)strlen(ps);
if (len) {
if( outfile == 0 ) {
open_log_file();
}
if( VFP(outfile) ) {
char *tb;
if (addsysdate) {
tb = GetNxtBuf();
len = sprintf( tb, "%s - %s", get_date_time_stg(), ps );
ps = tb;
} else if( addsystime ) {
tb = GetNxtBuf();
len = sprintf( tb, "%s - %s", get_time_stg(), ps );
ps = tb;
}
w = (int)fwrite( ps, 1, len, outfile );
if( w != len ) {
fclose(outfile);
outfile = (FILE *)-1;
sprtf("WARNING: Failed write to log file [%s] ...\n", logfile);
exit(1);
} else if (addflush) {
fflush( outfile );
}
}
if( addstdout ) {
fwrite( ps, 1, len, stdout );
}
#ifdef ADD_LISTVIEW
if (add2listview) {
LVInsertItem(ps);
}
#endif // ADD_LISTVIEW
#ifdef ADD_SCREENOUT
if (add2screen) {
Add_String(ps); // add string to screen list
}
#endif // #ifdef ADD_SCREENOUT
}
}
#ifdef _MSC_VER
// service to ensure line endings in windows only
// Copy ps into a static staging buffer while normalising line endings to
// CR/LF, and hand the staged text to oi() in chunks:
//   - a CR not followed by LF (including a CR at the very end) becomes CR LF
//   - an LF not preceded by CR becomes CR LF
//   - an existing CR LF pair passes through unchanged
// NOTE(review): ps goes straight to strlen(), so it must not be NULL.
static void prt( char * ps )
{
static char _s_buf[1024];
char * pb = _s_buf;
size_t i, j, k; // i = input length, j = input index, k = bytes staged in pb
char c, d; // c = current character, d = last character staged
i = strlen(ps);
k = 0;
d = 0;
if(i) {
k = 0;
d = 0;
for( j = 0; j < i; j++ ) {
c = ps[j];
if( c == 0x0d ) {
if( (j+1) < i ) {
if( ps[j+1] != 0x0a ) {
// lone CR inside the string: stage the CR, then let the LF be staged below
pb[k++] = c;
c = 0x0a;
}
} else {
// CR is the last character: complete it to CR LF
pb[k++] = c;
c = 0x0a;
}
} else if( c == 0x0a ) {
if( d != 0x0d ) {
// LF without a preceding CR: insert the CR first
pb[k++] = 0x0d;
}
}
pb[k++] = c;
d = c;
// flush once MXIO bytes are staged; at most two bytes are added per
// iteration, so k stays well inside the 1024-byte buffer
if( k >= MXIO ) {
pb[k] = 0;
oi(pb);
k = 0;
}
} // for length of string
if( k ) {
//if( ( gbCheckCrLf ) &&
// ( d != 0x0a ) ) {
// add Cr/Lf pair
//pb[k++] = 0x0d;
//pb[k++] = 0x0a;
//pb[k] = 0;
//}
// flush whatever is left
pb[k] = 0;
oi( pb );
}
}
}
#endif // #ifdef _MSC_VER
// Write an already-formatted string to the log (through prt() on MSVC so
// line endings are normalised, through oi() elsewhere).
//   cp - NUL-terminated text; NULL is accepted and writes nothing.
// Returns the length of cp, or 0 for NULL. The original passed NULL on
// to strlen() (and to prt() on MSVC) even though oi() tolerates it.
int direct_out_it( char *cp )
{
    if (!cp)
        return 0;
#ifdef _MSC_VER
    prt(cp);
#else
    oi(cp);
#endif
    return (int)strlen(cp);
}
// STDAPI StringCchVPrintf( OUT LPTSTR pszDest,
// IN size_t cchDest, IN LPCTSTR pszFormat, IN va_list argList );
// printf-style logging entry point: format the arguments into a static
// buffer (at most M_MAX_SPRTF bytes are written) and send the result to
// the log.
// Returns whatever vsnprintf() returned.
int MCDECL sprtf( const char *pf, ... )
{
    static char _s_sprtfbuf[M_MAX_SPRTF+4];
    va_list args;
    int written;
    va_start(args, pf);
    written = vsnprintf( _s_sprtfbuf, M_MAX_SPRTF, pf, args );
    va_end(args);
#ifdef _MSC_VER
    prt(_s_sprtfbuf); // ensure CR/LF
#else
    oi(_s_sprtfbuf);
#endif
    return written;
}
#ifdef UNICODE
// WIDE VARIETY
// Convert a wide string to the ANSI code page and send it to prt().
//   ps - NUL-terminated wide string; an empty string writes nothing.
// WideCharToMultiByte is given an explicit character count, so it does
// not write a terminating NUL. The converted text is terminated here
// from the returned byte count, with one byte of the buffer reserved for
// it. The original ignored the return value, so prt() could read stale
// bytes left by an earlier, longer message.
static void wprt( PTSTR ps )
{
    static char _s_woibuf[1024];
    char * cp = _s_woibuf;
    int len = (int)lstrlen(ps);
    if(len) {
        int ret = WideCharToMultiByte( CP_ACP, // UINT CodePage, // code page
            0, // DWORD dwFlags, // performance and mapping flags
            ps, // LPCWSTR lpWideCharStr, // wide-character string
            len, // int cchWideChar, // number of chars in string.
            cp, // LPSTR lpMultiByteStr, // buffer for new string
            (int)sizeof(_s_woibuf) - 1, // int cbMultiByte, // size of buffer, less the terminator
            NULL, // LPCSTR lpDefaultChar, // default for unmappable chars
            NULL ); // LPBOOL lpUsedDefaultChar // set when default char used
        if (ret <= 0)
            return; // conversion failed (e.g. text too long): nothing usable to print
        cp[ret] = 0;
        //oi(cp);
        prt(cp);
    }
}
// Wide-character counterpart of sprtf(): format into a static buffer of
// 1024 wide characters and send the result to the log through wprt().
// Always returns 1.
int MCDECL wsprtf( PTSTR pf, ... )
{
    static WCHAR _s_sprtfwbuf[1024];
    va_list args;
    _s_sprtfwbuf[0] = 0; // stays an empty string if formatting writes nothing
    va_start(args, pf);
    StringCchVPrintf(_s_sprtfwbuf,1024,pf,args);
    //i = vswprintf( pb, pf, arglist );
    va_end(args);
    wprt(_s_sprtfwbuf);
    return 1;
}
#endif // #ifdef UNICODE
// eof - sprtf.cxx

77
src/sprtf.h

@ -0,0 +1,77 @@
/*
* SPRTF - Log output utility
*
* Author: Geoff R. McLane <reports _at_ geoffair _dot_ info>
* License: GPL v2 (or later at your choice)
*
* Revision 1.0.1 2012/11/06 13:01:25 geoff
* Revision 1.0.0 2012/10/17 00:00:00 geoff
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, US
*
*/
// Module: sprtf.hxx
// Debug log file output
#ifndef _SPRTF_HXX_
#define _SPRTF_HXX_
#include "tidyplatform.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef _MSC_VER
#define MCDECL _cdecl
#else
#define MCDECL
#endif
TIDY_EXPORT int add_std_out( int val );
TIDY_EXPORT int add_sys_time( int val );
TIDY_EXPORT int add_sys_date( int val );
TIDY_EXPORT int add_screen_out( int val );
TIDY_EXPORT int add_list_out( int val );
TIDY_EXPORT int add_append_log( int val );
TIDY_EXPORT int open_log_file( void );
TIDY_EXPORT void close_log_file( void );
TIDY_EXPORT void set_log_file( char * nf, int open );
TIDY_EXPORT char * get_log_file( void );
TIDY_EXPORT int MCDECL sprtf( const char *pf, ... );
#define M_MAX_SPRTF 2048
TIDY_EXPORT int direct_out_it( char *cp );
TIDY_EXPORT char *GetNxtBuf();
#define EndBuf(a) ( a + strlen(a) )
TIDY_EXPORT char *get_date_stg();
TIDY_EXPORT char *get_time_stg();
TIDY_EXPORT char *get_date_time_stg();
#ifdef _MSC_VER
TIDY_EXPORT int gettimeofday(struct timeval *tp, void *tzp);
#endif
#ifndef SPRTF
#define SPRTF sprtf
#endif
#ifdef __cplusplus
}
#endif
#endif // #ifndef _SPRTF_HXX_
// eof - sprtf.hxx

1392
src/streamio.c

File diff suppressed because it is too large

210
src/streamio.h

@ -0,0 +1,210 @@
#ifndef __STREAMIO_H__
#define __STREAMIO_H__
/* streamio.h -- handles character stream I/O
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Wrapper around Tidy input source and output sink
that calls appropriate interfaces, and applies
necessary char encoding transformations: to/from
ISO-10646 and/or UTF-8.
*/
#include "forward.h"
#include "tidybuffio.h"
#include "fileio.h"
#ifdef __cplusplus
extern "C"
{
#endif
typedef enum
{
FileIO,
BufferIO,
UserIO
} IOType;
/* states for ISO 2022
A document in ISO-2022 based encoding uses some ESC sequences called
"designator" to switch character sets. The designators defined and
used in ISO-2022-JP are:
"ESC" + "(" + ? for ISO646 variants
"ESC" + "$" + ? and
"ESC" + "$" + "(" + ? for multibyte character sets
*/
/* States of the finite state machine used for ISO 2022 streams (see the
** designator sequences described above).
** NOTE(review): the D / P suffixes presumably stand for the "$" and "("
** bytes seen after ESC -- confirm against streamio.c.
*/
typedef enum
{
FSM_ASCII,
FSM_ESC,
FSM_ESCD,
FSM_ESCDP,
FSM_ESCP,
FSM_NONASCII
} ISO2022State;
/************************
** Source
************************/
enum
{
CHARBUF_SIZE=5,
LASTPOS_SIZE=64
};
/* non-raw input is cleaned up*/
/* Input stream state: decoder state, a character buffer, position
** tracking and the underlying input source.
*/
struct _StreamIn
{
ISO2022State state; /* FSM for ISO2022 */
Bool pushed; /* presumably set while charbuf holds pushed-back characters -- TODO confirm */
TidyAllocator *allocator;
tchar* charbuf; /* character buffer; see bufpos / bufsize */
uint bufpos; /* presumably the current position in charbuf -- TODO confirm */
uint bufsize; /* presumably the allocated size of charbuf, in tchars -- TODO confirm */
int tabs; /* NOTE(review): looks related to tab expansion -- confirm against streamio.c */
int lastcols[LASTPOS_SIZE]; /* saved column values, indexed via curlastpos / firstlastpos */
unsigned short curlastpos; /* current last position in lastcols */
unsigned short firstlastpos; /* first valid last position in lastcols */
int curcol; /* presumably the current column in the input -- TODO confirm */
int curline; /* presumably the current line in the input -- TODO confirm */
int encoding; /* character encoding id (the initStreamIn / *Input functions take one) */
IOType iotype; /* which kind of source backs this stream (FileIO, BufferIO or UserIO) */
TidyInputSource source; /* the underlying input source */
#ifdef TIDY_WIN32_MLANG_SUPPORT
void* mlang;
#endif
#ifdef TIDY_STORE_ORIGINAL_TEXT
tmbstr otextbuf;
size_t otextsize;
uint otextlen;
#endif
/* Pointer back to document for error reporting */
TidyDocImpl* doc;
};
StreamIn* TY_(initStreamIn)( TidyDocImpl* doc, int encoding );
void TY_(freeStreamIn)(StreamIn* in);
StreamIn* TY_(FileInput)( TidyDocImpl* doc, FILE* fp, int encoding );
StreamIn* TY_(BufferInput)( TidyDocImpl* doc, TidyBuffer* content, int encoding );
StreamIn* TY_(UserInput)( TidyDocImpl* doc, TidyInputSource* source, int encoding );
int TY_(ReadBOMEncoding)(StreamIn *in);
uint TY_(ReadChar)( StreamIn* in );
void TY_(UngetChar)( uint c, StreamIn* in );
Bool TY_(IsEOF)( StreamIn* in );
/************************
** Sink
************************/
/* Output stream state: target encoding, newline setting and the
** underlying output sink.
*/
struct _StreamOut
{
int encoding; /* character encoding id (the *Output functions take one) */
ISO2022State state; /* for ISO 2022 */
uint nl; /* newline setting; presumably one of the TidyLF / TidyCRLF / TidyCR values -- TODO confirm */
#ifdef TIDY_WIN32_MLANG_SUPPORT
void* mlang;
#endif
IOType iotype; /* which kind of sink backs this stream (FileIO, BufferIO or UserIO) */
TidyOutputSink sink; /* the underlying output sink */
};
StreamOut* TY_(FileOutput)( TidyDocImpl *doc, FILE* fp, int encoding, uint newln );
StreamOut* TY_(BufferOutput)( TidyDocImpl *doc, TidyBuffer* buf, int encoding, uint newln );
StreamOut* TY_(UserOutput)( TidyDocImpl *doc, TidyOutputSink* sink, int encoding, uint newln );
StreamOut* TY_(StdErrOutput)(void);
/* StreamOut* StdOutOutput(void); */
void TY_(ReleaseStreamOut)( TidyDocImpl *doc, StreamOut* out );
void TY_(WriteChar)( uint c, StreamOut* out );
void TY_(outBOM)( StreamOut *out );
ctmbstr TY_(GetEncodingNameFromTidyId)(uint id);
ctmbstr TY_(GetEncodingOptNameFromTidyId)(uint id);
int TY_(GetCharEncodingFromOptName)(ctmbstr charenc);
/************************
** Misc
************************/
/* character encodings
*/
#define RAW 0
#define ASCII 1
#define LATIN0 2
#define LATIN1 3
#define UTF8 4
#define ISO2022 5
#define MACROMAN 6
#define WIN1252 7
#define IBM858 8
#if SUPPORT_UTF16_ENCODINGS
#define UTF16LE 9
#define UTF16BE 10
#define UTF16 11
#endif
/* Note that Big5 and SHIFTJIS are not converted to ISO 10646 codepoints
** (i.e., to Unicode) before being recoded into UTF-8. This may be
** confusing: usually UTF-8 implies ISO10646 codepoints.
*/
#if SUPPORT_ASIAN_ENCODINGS
#if SUPPORT_UTF16_ENCODINGS
#define BIG5 12
#define SHIFTJIS 13
#else
#define BIG5 9
#define SHIFTJIS 10
#endif
#endif
#ifdef TIDY_WIN32_MLANG_SUPPORT
/* hack: windows code page numbers start at 37 */
#define WIN32MLANG 36
#endif
/* Function for conversion from Windows-1252 to Unicode */
uint TY_(DecodeWin1252)(uint c);
/* Function to convert from MacRoman to Unicode */
uint TY_(DecodeMacRoman)(uint c);
#ifdef __cplusplus
}
#endif
/* Use numeric constants as opposed to escape chars (\r, \n)
** to avoid conflict with Mac compilers that may re-define these.
*/
#define CR 0xD
#define LF 0xA
#if defined(MAC_OS_CLASSIC)
#define DEFAULT_NL_CONFIG TidyCR
#elif defined(_WIN32) || defined(OS2_OS)
#define DEFAULT_NL_CONFIG TidyCRLF
#else
#define DEFAULT_NL_CONFIG TidyLF
#endif
#endif /* __STREAMIO_H__ */

285
src/tagask.c

@ -0,0 +1,285 @@
/* tagask.c -- Interrogate node type
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy-int.h"
#include "tags.h"
#include "tidy.h"
/* Public wrapper: report whether the node behind the opaque handle is a
** text node, as decided by the internal nodeIsText().
*/
Bool TIDY_CALL tidyNodeIsText( TidyNode tnod )
{
    Bool isText = TY_(nodeIsText)( tidyNodeToImpl(tnod) );
    return isText;
}
/* Wrapper over the internal nodeCMIsBlock() for an opaque node handle.
** Not exported yet, hence the local prototype.
*/
Bool tidyNodeCMIsBlock( TidyNode tnod );
Bool tidyNodeCMIsBlock( TidyNode tnod )
{
    Bool isBlock = TY_(nodeCMIsBlock)( tidyNodeToImpl(tnod) );
    return isBlock;
}
/* Wrapper over the internal nodeCMIsInline() for an opaque node handle.
** Not exported yet, hence the local prototype.
*/
Bool tidyNodeCMIsInline( TidyNode tnod );
Bool tidyNodeCMIsInline( TidyNode tnod )
{
    Bool isInline = TY_(nodeCMIsInline)( tidyNodeToImpl(tnod) );
    return isInline;
}
/* Wrapper over the internal nodeCMIsEmpty() for an opaque node handle.
** Not exported yet, hence the local prototype.
*/
Bool tidyNodeCMIsEmpty( TidyNode tnod );
Bool tidyNodeCMIsEmpty( TidyNode tnod )
{
    Bool isEmpty = TY_(nodeCMIsEmpty)( tidyNodeToImpl(tnod) );
    return isEmpty;
}
/* Public wrapper: report whether the node behind the opaque handle is a
** heading element, as decided by the internal nodeIsHeader().
*/
Bool TIDY_CALL tidyNodeIsHeader( TidyNode tnod )
{
    Bool isHeader = TY_(nodeIsHeader)( tidyNodeToImpl(tnod) );
    return isHeader;
}
/* Per-element predicates of the public API.
** Each tidyNodeIsXXX() converts the opaque TidyNode handle with
** tidyNodeToImpl() and applies the matching nodeIsXXX() test from
** tags.h, which compares the node's tag id against TidyTag_XXX.
** Those tests are built on TagIsId(), which checks the node and its tag
** pointer before comparing, so a node without a tag yields a false result.
*/
Bool TIDY_CALL tidyNodeIsHTML( TidyNode tnod )
{ return nodeIsHTML( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsHEAD( TidyNode tnod )
{ return nodeIsHEAD( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsTITLE( TidyNode tnod )
{ return nodeIsTITLE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsBASE( TidyNode tnod )
{ return nodeIsBASE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsMETA( TidyNode tnod )
{ return nodeIsMETA( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsBODY( TidyNode tnod )
{ return nodeIsBODY( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsFRAMESET( TidyNode tnod )
{ return nodeIsFRAMESET( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsFRAME( TidyNode tnod )
{ return nodeIsFRAME( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsIFRAME( TidyNode tnod )
{ return nodeIsIFRAME( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsNOFRAMES( TidyNode tnod )
{ return nodeIsNOFRAMES( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsHR( TidyNode tnod )
{ return nodeIsHR( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsH1( TidyNode tnod )
{ return nodeIsH1( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsH2( TidyNode tnod )
{ return nodeIsH2( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsPRE( TidyNode tnod )
{ return nodeIsPRE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsLISTING( TidyNode tnod )
{ return nodeIsLISTING( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsP( TidyNode tnod )
{ return nodeIsP( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsUL( TidyNode tnod )
{ return nodeIsUL( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsOL( TidyNode tnod )
{ return nodeIsOL( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsDL( TidyNode tnod )
{ return nodeIsDL( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsDIR( TidyNode tnod )
{ return nodeIsDIR( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsLI( TidyNode tnod )
{ return nodeIsLI( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsDT( TidyNode tnod )
{ return nodeIsDT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsDD( TidyNode tnod )
{ return nodeIsDD( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsTABLE( TidyNode tnod )
{ return nodeIsTABLE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsCAPTION( TidyNode tnod )
{ return nodeIsCAPTION( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsTD( TidyNode tnod )
{ return nodeIsTD( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsTH( TidyNode tnod )
{ return nodeIsTH( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsTR( TidyNode tnod )
{ return nodeIsTR( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsCOL( TidyNode tnod )
{ return nodeIsCOL( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsCOLGROUP( TidyNode tnod )
{ return nodeIsCOLGROUP( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsBR( TidyNode tnod )
{ return nodeIsBR( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsA( TidyNode tnod )
{ return nodeIsA( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsLINK( TidyNode tnod )
{ return nodeIsLINK( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsB( TidyNode tnod )
{ return nodeIsB( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsI( TidyNode tnod )
{ return nodeIsI( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSTRONG( TidyNode tnod )
{ return nodeIsSTRONG( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsEM( TidyNode tnod )
{ return nodeIsEM( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsBIG( TidyNode tnod )
{ return nodeIsBIG( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSMALL( TidyNode tnod )
{ return nodeIsSMALL( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsPARAM( TidyNode tnod )
{ return nodeIsPARAM( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsOPTION( TidyNode tnod )
{ return nodeIsOPTION( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsOPTGROUP( TidyNode tnod )
{ return nodeIsOPTGROUP( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsIMG( TidyNode tnod )
{ return nodeIsIMG( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsMAP( TidyNode tnod )
{ return nodeIsMAP( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsAREA( TidyNode tnod )
{ return nodeIsAREA( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsNOBR( TidyNode tnod )
{ return nodeIsNOBR( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsWBR( TidyNode tnod )
{ return nodeIsWBR( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsFONT( TidyNode tnod )
{ return nodeIsFONT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsLAYER( TidyNode tnod )
{ return nodeIsLAYER( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSPACER( TidyNode tnod )
{ return nodeIsSPACER( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsCENTER( TidyNode tnod )
{ return nodeIsCENTER( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSTYLE( TidyNode tnod )
{ return nodeIsSTYLE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSCRIPT( TidyNode tnod )
{ return nodeIsSCRIPT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsNOSCRIPT( TidyNode tnod )
{ return nodeIsNOSCRIPT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsFORM( TidyNode tnod )
{ return nodeIsFORM( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsTEXTAREA( TidyNode tnod )
{ return nodeIsTEXTAREA( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsBLOCKQUOTE( TidyNode tnod )
{ return nodeIsBLOCKQUOTE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsAPPLET( TidyNode tnod )
{ return nodeIsAPPLET( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsOBJECT( TidyNode tnod )
{ return nodeIsOBJECT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsDIV( TidyNode tnod )
{ return nodeIsDIV( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSPAN( TidyNode tnod )
{ return nodeIsSPAN( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsINPUT( TidyNode tnod )
{ return nodeIsINPUT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsQ( TidyNode tnod )
{ return nodeIsQ( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsLABEL( TidyNode tnod )
{ return nodeIsLABEL( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsH3( TidyNode tnod )
{ return nodeIsH3( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsH4( TidyNode tnod )
{ return nodeIsH4( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsH5( TidyNode tnod )
{ return nodeIsH5( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsH6( TidyNode tnod )
{ return nodeIsH6( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsADDRESS( TidyNode tnod )
{ return nodeIsADDRESS( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsXMP( TidyNode tnod )
{ return nodeIsXMP( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSELECT( TidyNode tnod )
{ return nodeIsSELECT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsBLINK( TidyNode tnod )
{ return nodeIsBLINK( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsMARQUEE( TidyNode tnod )
{ return nodeIsMARQUEE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsEMBED( TidyNode tnod )
{ return nodeIsEMBED( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsBASEFONT( TidyNode tnod )
{ return nodeIsBASEFONT( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsISINDEX( TidyNode tnod )
{ return nodeIsISINDEX( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsS( TidyNode tnod )
{ return nodeIsS( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsSTRIKE( TidyNode tnod )
{ return nodeIsSTRIKE( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsU( TidyNode tnod )
{ return nodeIsU( tidyNodeToImpl(tnod) );
}
Bool TIDY_CALL tidyNodeIsMENU( TidyNode tnod )
{ return nodeIsMENU( tidyNodeToImpl(tnod) );
}
/* HTML5 */
Bool TIDY_CALL tidyNodeIsDATALIST( TidyNode tnod )
{ return nodeIsDATALIST( tidyNodeToImpl(tnod) );
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

1123
src/tags.c

File diff suppressed because it is too large

247
src/tags.h

@ -0,0 +1,247 @@
#ifndef __TAGS_H__
#define __TAGS_H__
/* tags.h -- recognize HTML tags
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
The HTML tags are stored as 8 bit ASCII strings.
Use lookupw() to find a tag given a wide char string.
*/
#include "forward.h"
#include "attrdict.h"
typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node );
/*
Tag dictionary node
*/
/* types of tags that the user can define */
/* Category of a user-declared tag. tagtype_null is also accepted by
** FreeDeclaredTags() to mean "free all".
** NOTE(review): the non-zero values are distinct bits, presumably so
** they can be combined or masked -- confirm against tags.c.
*/
typedef enum
{
tagtype_null = 0,
tagtype_empty = 1,
tagtype_inline = 2,
tagtype_block = 4,
tagtype_pre = 8
} UserTagType;
/* One entry of the tag dictionary. */
struct _Dict
{
TidyTagId id; /* tag identifier; compared by the TagId / TagIsId macros */
tmbstr name; /* tag name */
uint versions; /* presumably a bit mask of the HTML versions that define the tag -- TODO confirm */
AttrVersion const * attrvers; /* presumably the per-attribute version table for this tag -- TODO confirm */
uint model; /* presumably the content-model bits tested by nodeHasCM() -- TODO confirm */
Parser* parser; /* parser routine for this element (see the Parser typedef) */
CheckAttribs* chkattrs; /* attribute checking routine (see the CheckAttribs typedef) */
Dict* next; /* next entry in a linked list of entries */
};
#if !defined(ELEMENT_HASH_LOOKUP)
#define ELEMENT_HASH_LOOKUP 1
#endif
#if ELEMENT_HASH_LOOKUP
enum
{
ELEMENT_HASH_SIZE=178u
};
/* Node of a hash bucket chain used for element lookup; TidyTagImpl keeps
** ELEMENT_HASH_SIZE bucket heads.
*/
struct _DictHash
{
Dict const* tag; /* dictionary entry held by this node */
struct _DictHash* next; /* next node in the same chain */
};
typedef struct _DictHash DictHash;
#endif
/* Per-document tag state. */
struct _TidyTagImpl
{
Dict* xml_tags; /* placeholder for all xml tags */
Dict* declared_tag_list; /* User declared tags */
#if ELEMENT_HASH_LOOKUP
DictHash* hashtab[ELEMENT_HASH_SIZE]; /* bucket heads of the element lookup hash */
#endif
};
typedef struct _TidyTagImpl TidyTagImpl;
/* interface for finding tag by name */
const Dict* TY_(LookupTagDef)( TidyTagId tid );
Bool TY_(FindTag)( TidyDocImpl* doc, Node *node );
Parser* TY_(FindParser)( TidyDocImpl* doc, Node *node );
void TY_(DefineTag)( TidyDocImpl* doc, UserTagType tagType, ctmbstr name );
void TY_(FreeDeclaredTags)( TidyDocImpl* doc, UserTagType tagType ); /* tagtype_null to free all */
TidyIterator TY_(GetDeclaredTagList)( TidyDocImpl* doc );
ctmbstr TY_(GetNextDeclaredTag)( TidyDocImpl* doc, UserTagType tagType,
TidyIterator* iter );
void TY_(InitTags)( TidyDocImpl* doc );
void TY_(FreeTags)( TidyDocImpl* doc );
void TY_(AdjustTags)( TidyDocImpl *doc ); /* if NOT HTML5 DOCTYPE, fall back to HTML4 legacy mode */
void TY_(ResetTags)( TidyDocImpl *doc ); /* set table to HTML5 mode */
Bool TY_(IsHTML5Mode)( TidyDocImpl *doc );
/* Parser methods for tags */
Parser TY_(ParseHTML);
Parser TY_(ParseHead);
Parser TY_(ParseTitle);
Parser TY_(ParseScript);
Parser TY_(ParseFrameSet);
Parser TY_(ParseNoFrames);
Parser TY_(ParseBody);
Parser TY_(ParsePre);
Parser TY_(ParseList);
Parser TY_(ParseDefList);
Parser TY_(ParseBlock);
Parser TY_(ParseInline);
Parser TY_(ParseEmpty);
Parser TY_(ParseTableTag);
Parser TY_(ParseColGroup);
Parser TY_(ParseRowGroup);
Parser TY_(ParseRow);
Parser TY_(ParseSelect);
Parser TY_(ParseOptGroup);
Parser TY_(ParseText);
Parser TY_(ParseDatalist);
Parser TY_(ParseNamespace);
CheckAttribs TY_(CheckAttributes);
/* 0 == TidyTag_UNKNOWN */
/* TagId(node): tag id of node, or TidyTag_UNKNOWN when node is NULL or
** has no tag.
** TagIsId(node, tid): non-zero when node is non-NULL, has a tag, and
** that tag's id equals tid. tid is parenthesised so that an expression
** argument (e.g. a conditional) is compared as a whole instead of being
** split by operator precedence.
** Both macros may evaluate node more than once.
*/
#define TagId(node) ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN)
#define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == (tid))
Bool TY_(nodeIsText)( Node* node );
Bool TY_(nodeIsElement)( Node* node );
Bool TY_(nodeHasText)( TidyDocImpl* doc, Node* node );
#if 0
/* Compare & result to operand. If equal, then all bits
** requested are set.
*/
Bool nodeMatchCM( Node* node, uint contentModel );
#endif
/* True if any of the bits requested are set.
*/
Bool TY_(nodeHasCM)( Node* node, uint contentModel );
Bool TY_(nodeCMIsBlock)( Node* node );
Bool TY_(nodeCMIsInline)( Node* node );
Bool TY_(nodeCMIsEmpty)( Node* node );
Bool TY_(nodeIsHeader)( Node* node ); /* H1, H2, ..., H6 */
uint TY_(nodeHeaderLevel)( Node* node ); /* 1, 2, ..., 6 */
#define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML )
#define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD )
#define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE )
#define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE )
#define nodeIsMETA( node ) TagIsId( node, TidyTag_META )
#define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY )
#define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET )
#define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
#define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME )
#define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES )
#define nodeIsHR( node ) TagIsId( node, TidyTag_HR )
#define nodeIsH1( node ) TagIsId( node, TidyTag_H1 )
#define nodeIsH2( node ) TagIsId( node, TidyTag_H2 )
#define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE )
#define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING )
#define nodeIsP( node ) TagIsId( node, TidyTag_P )
#define nodeIsUL( node ) TagIsId( node, TidyTag_UL )
#define nodeIsOL( node ) TagIsId( node, TidyTag_OL )
#define nodeIsDL( node ) TagIsId( node, TidyTag_DL )
#define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR )
#define nodeIsLI( node ) TagIsId( node, TidyTag_LI )
#define nodeIsDT( node ) TagIsId( node, TidyTag_DT )
#define nodeIsDD( node ) TagIsId( node, TidyTag_DD )
#define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE )
#define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION )
#define nodeIsTD( node ) TagIsId( node, TidyTag_TD )
#define nodeIsTH( node ) TagIsId( node, TidyTag_TH )
#define nodeIsTR( node ) TagIsId( node, TidyTag_TR )
#define nodeIsCOL( node ) TagIsId( node, TidyTag_COL )
#define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP )
#define nodeIsBR( node ) TagIsId( node, TidyTag_BR )
#define nodeIsA( node ) TagIsId( node, TidyTag_A )
#define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK )
#define nodeIsB( node ) TagIsId( node, TidyTag_B )
#define nodeIsI( node ) TagIsId( node, TidyTag_I )
#define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG )
#define nodeIsEM( node ) TagIsId( node, TidyTag_EM )
#define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG )
#define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL )
#define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM )
#define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION )
#define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP )
#define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG )
#define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP )
#define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA )
#define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR )
#define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR )
#define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT )
#define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER )
#define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER )
#define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER )
#define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE )
#define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT )
#define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT )
#define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM )
#define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA )
#define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE )
#define nodeIsAPPLET( node ) TagIsId( node, TidyTag_APPLET )
#define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT )
#define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV )
#define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN )
#define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT )
#define nodeIsQ( node ) TagIsId( node, TidyTag_Q )
#define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL )
#define nodeIsH3( node ) TagIsId( node, TidyTag_H3 )
#define nodeIsH4( node ) TagIsId( node, TidyTag_H4 )
#define nodeIsH5( node ) TagIsId( node, TidyTag_H5 )
#define nodeIsH6( node ) TagIsId( node, TidyTag_H6 )
#define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS )
#define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP )
#define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT )
#define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK )
#define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE )
#define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED )
#define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT )
#define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX )
#define nodeIsS( node ) TagIsId( node, TidyTag_S )
#define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE )
#define nodeIsSUB( node ) TagIsId( node, TidyTag_SUB )
#define nodeIsSUP( node ) TagIsId( node, TidyTag_SUP )
#define nodeIsU( node ) TagIsId( node, TidyTag_U )
#define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU )
#define nodeIsMAIN( node ) TagIsId( node, TidyTag_MAIN )
#define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON )
#define nodeIsCANVAS( node ) TagIsId( node, TidyTag_CANVAS )
#define nodeIsPROGRESS( node ) TagIsId( node, TidyTag_PROGRESS )
#define nodeIsINS( node ) TagIsId( node, TidyTag_INS )
#define nodeIsDEL( node ) TagIsId( node, TidyTag_DEL )
/* HTML5 */
#define nodeIsDATALIST( node ) TagIsId( node, TidyTag_DATALIST )
#define nodeIsMATHML( node ) TagIsId( node, TidyTag_MATHML ) /* #130 MathML attr and entity fix! */
/* NOT in HTML 5 */
#define nodeIsACRONYM( node ) TagIsId( node, TidyTag_ACRONYM )
/* NOTE(review): this macro is spelled "nodesIsFRAME" (extra 's'), unlike every
   other nodeIsXXX helper in this group. Renaming it would break any caller
   that uses the current spelling, so check usages before changing it. */
#define nodesIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
#define nodeIsTT( node ) TagIsId( node, TidyTag_TT )
#endif /* __TAGS_H__ */

159
src/tidy-int.h

@ -0,0 +1,159 @@
#ifndef __TIDY_INT_H__
#define __TIDY_INT_H__
/* tidy-int.h -- internal library declarations
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidy.h"
#include "config.h"
#include "lexer.h"
#include "tags.h"
#include "attrs.h"
#include "pprint.h"
#include "access.h"
/* Larger of two values, unless the build already provides MAX.
   Function-like macro: an argument may be evaluated twice, so do not pass
   expressions with side effects. */
#if !defined(MAX)
#define MAX(lhs,rhs) ((lhs) > (rhs) ? (lhs) : (rhs))
#endif
/* Smaller of two values, unless the build already provides MIN.
   Function-like macro: an argument may be evaluated twice, so do not pass
   expressions with side effects. */
#if !defined(MIN)
#define MIN(lhs,rhs) ((lhs) < (rhs) ? (lhs) : (rhs))
#endif
/*
 * Issue #166 - repeated <main> element.
 * badForm used to be a plain on/off uint flag. It is now treated as a set of
 * bits so that problems with elements other than <form> can be recorded too.
 * More bits could be added here.
 */
#define flg_BadForm (1 << 0)
#define flg_BadMain (1 << 1)
/* Internal state of one Tidy document: the parsed tree, configuration,
   I/O streams, report callbacks and the counters filled in while parsing.
   The tidyDocToImpl()/tidyImplToDoc() macros in this header cast the public
   TidyDoc handle to and from a pointer to this structure.

   The member list depends on SUPPORT_ACCESSIBILITY_CHECKS,
   TIDY_STORE_ORIGINAL_TEXT and PRESERVE_FILE_TIMES, so every translation
   unit that touches this struct must be compiled with the same settings. */
struct _TidyDocImpl
{
/* The Document Tree (and backing store buffer) */
Node root; /* This MUST remain the first declared
variable in this structure */
/* NOTE(review): presumably `root` is first so that a document pointer can be
   used where a pointer to the root node is expected - confirm before
   reordering anything above it. */
Lexer* lexer;
/* Config + Markup Declarations */
TidyConfigImpl config;
TidyTagImpl tags;
TidyAttribImpl attribs;
#if SUPPORT_ACCESSIBILITY_CHECKS
/* Accessibility Checks state */
TidyAccessImpl access;
#endif
/* The Pretty Print buffer */
TidyPrintImpl pprint;
/* I/O */
StreamIn* docIn;
StreamOut* docOut;
StreamOut* errout;
/* Report filter / option / progress callbacks, one slot per callback type */
TidyReportFilter mssgFilt;
TidyReportFilter2 mssgFilt2;
TidyReportFilter3 mssgFilt3;
TidyOptCallback pOptCallback;
TidyPPProgress progressCallback;
/* Parse + Repair Results */
uint optionErrors;
uint errors;
uint warnings;
uint accessErrors;
uint infoMessages;
uint docErrors;
int parseStatus;
uint badAccess; /* for accessibility errors */
uint badLayout; /* for bad style errors */
uint badChars; /* for bad char encodings */
uint badForm; /* bit field (see flg_BadForm / flg_BadMain), for badly placed form tags, or other format errors */
Bool HTML5Mode; /* current mode is html5 */
/* Memory allocator */
TidyAllocator* allocator; /* used by the TidyDocAlloc/Realloc/Free/Panic wrappers in this header */
/* Miscellaneous */
void* appData;
uint nClassId;
Bool inputHadBOM;
#ifdef TIDY_STORE_ORIGINAL_TEXT
Bool storeText;
#endif
#if PRESERVE_FILE_TIMES
struct utimbuf filetimes;
#endif
tmbstr givenDoctype;
};
/* Twizzle internal/external types
**
** Converters between the public handle types (TidyDoc, TidyNode, TidyAttr,
** TidyOption) and the internal implementation types.
**
** Nothing visible in this header defines NEVER, so the #else branch is the
** one normally compiled: each converter is a plain cast with no checking.
** The prototypes in the NEVER branch document the intended signatures.
*/
#ifdef NEVER
TidyDocImpl* tidyDocToImpl( TidyDoc tdoc );
TidyDoc tidyImplToDoc( TidyDocImpl* impl );
Node* tidyNodeToImpl( TidyNode tnod );
TidyNode tidyImplToNode( Node* node );
AttVal* tidyAttrToImpl( TidyAttr tattr );
TidyAttr tidyImplToAttr( AttVal* attval );
const TidyOptionImpl* tidyOptionToImpl( TidyOption topt );
TidyOption tidyImplToOption( const TidyOptionImpl* option );
#else
/* Document handle <-> TidyDocImpl* */
#define tidyDocToImpl( tdoc ) ((TidyDocImpl*)(tdoc))
#define tidyImplToDoc( doc ) ((TidyDoc)(doc))
/* Node handle <-> Node* */
#define tidyNodeToImpl( tnod ) ((Node*)(tnod))
#define tidyImplToNode( node ) ((TidyNode)(node))
/* Attribute handle <-> AttVal* */
#define tidyAttrToImpl( tattr ) ((AttVal*)(tattr))
#define tidyImplToAttr( attval ) ((TidyAttr)(attval))
/* Option handle <-> const TidyOptionImpl* */
#define tidyOptionToImpl( topt ) ((const TidyOptionImpl*)(topt))
#define tidyImplToOption( option ) ((TidyOption)(option))
#endif
/** Allocation helpers that go through a document's own allocator: each one
    forwards to the matching TidyAlloc/TidyRealloc/TidyFree/TidyPanic call,
    passing the `allocator` member of the document given as first argument. */
#define TidyDocAlloc(d, nbytes) TidyAlloc((d)->allocator, nbytes)
#define TidyDocRealloc(d, ptr, nbytes) TidyRealloc((d)->allocator, ptr, nbytes)
#define TidyDocFree(d, ptr) TidyFree((d)->allocator, ptr)
#define TidyDocPanic(d, text) TidyPanic((d)->allocator, text)
/* Library-internal (TY_-prefixed) routine, defined elsewhere.
   impl - the document implementation to operate on
   in   - the input stream handed to the routine
   NOTE(review): presumably parses `in` into `impl`; the meaning of the int
   result is not visible in this header - confirm against the definition. */
int TY_(DocParseStream)( TidyDocImpl* impl, StreamIn* in );
/*
[i_a] generic node tree traversal code; used in several spots.
Define your own callback, which returns one of the NodeTraversalSignal values
to instruct the tree traversal routine TraverseNodeTree() what to do.
Pass custom data to/from the callback using the 'propagate' reference.
*/
typedef enum
{
ContinueTraversal, /* visit siblings and children */
SkipChildren, /* visit siblings of this node; ignore its children */
SkipSiblings, /* ignore subsequent siblings of this node and their children.
                 NOTE(review): presumably this node's own children are still
                 traversed - confirm in TraverseNodeTree() */
SkipChildrenAndSiblings, /* NOTE(review): the previous comment here was a copy of
                            SkipChildren's. By its name this ignores this node's
                            children AND its subsequent siblings - confirm in
                            TraverseNodeTree() */
VisitParent, /* REVERSE traversal: visit the parent of the current node */
ExitTraversal /* terminate traversal on the spot */
} NodeTraversalSignal;
/* Callback type: receives the document, the node being visited and the
   caller's `propagate` pointer, and returns the signal for the next step. */
typedef NodeTraversalSignal NodeTraversalCallBack(TidyDocImpl* doc, Node* node, void *propagate);
/* Traversal routine: takes the document, a node, the callback `cb` and the
   `propagate` pointer that is passed to the callback. Returns a
   NodeTraversalSignal. */
NodeTraversalSignal TY_(TraverseNodeTree)(TidyDocImpl* doc, Node* node, NodeTraversalCallBack *cb, void *propagate);
#endif /* __TIDY_INT_H__ */

1153
src/tidy.h

File diff suppressed because it is too large

112
src/tidybuffio.h

@ -0,0 +1,112 @@
#ifndef __TIDY_BUFFIO_H__
#define __TIDY_BUFFIO_H__
/** @file tidybuffio.h - Treat buffer as an I/O stream.
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Requires buffer to automatically grow as bytes are added.
Must keep track of current read and write points.
*/
#include "tidyplatform.h"
#include "tidy.h"
#ifdef __cplusplus
extern "C" {
#endif
/** TidyBuffer - A chunk of memory
**
** `size` counts the bytes in use and `allocated` the bytes reserved at `bp`.
** `next` is the read offset used when the buffer is consumed as input.
*/
TIDY_STRUCT
struct _TidyBuffer
{
TidyAllocator* allocator; /**< Memory allocator */
byte* bp; /**< Pointer to bytes */
uint size; /**< # bytes currently in use */
uint allocated; /**< # bytes allocated */
uint next; /**< Offset of current input position */
};
/** Initialize data structure using the default allocator */
TIDY_EXPORT void TIDY_CALL tidyBufInit( TidyBuffer* buf );
/** Initialize data structure using the given custom allocator */
TIDY_EXPORT void TIDY_CALL tidyBufInitWithAllocator( TidyBuffer* buf, TidyAllocator* allocator );
/** Free current buffer, allocate given amount, reset input pointer,
use the default allocator */
TIDY_EXPORT void TIDY_CALL tidyBufAlloc( TidyBuffer* buf, uint allocSize );
/** Free current buffer, allocate given amount, reset input pointer,
use the given custom allocator */
TIDY_EXPORT void TIDY_CALL tidyBufAllocWithAllocator( TidyBuffer* buf,
TidyAllocator* allocator,
uint allocSize );
/** Expand buffer to given size.
** Chunk size is minimum growth. Pass 0 for default of 256 bytes.
*/
TIDY_EXPORT void TIDY_CALL tidyBufCheckAlloc( TidyBuffer* buf,
uint allocSize, uint chunkSize );
/** Free current contents and zero out */
TIDY_EXPORT void TIDY_CALL tidyBufFree( TidyBuffer* buf );
/** Set buffer bytes to 0 */
TIDY_EXPORT void TIDY_CALL tidyBufClear( TidyBuffer* buf );
/** Attach to existing buffer */
TIDY_EXPORT void TIDY_CALL tidyBufAttach( TidyBuffer* buf, byte* bp, uint size );
/** Detach from buffer. Caller must free. */
TIDY_EXPORT void TIDY_CALL tidyBufDetach( TidyBuffer* buf );
/** Append bytes to buffer. Expand if necessary.
** NOTE(review): `vp` is not const-qualified; presumably it is only read -
** confirm against the implementation.
*/
TIDY_EXPORT void TIDY_CALL tidyBufAppend( TidyBuffer* buf, void* vp, uint size );
/** Append one byte to buffer. Expand if necessary. */
TIDY_EXPORT void TIDY_CALL tidyBufPutByte( TidyBuffer* buf, byte bv );
/** Get byte from end of buffer.
** NOTE(review): returns int rather than byte; presumably so that an
** "empty buffer" sentinel can be reported - confirm the value.
*/
TIDY_EXPORT int TIDY_CALL tidyBufPopByte( TidyBuffer* buf );
/** Get byte from front of buffer. Increment input offset.
** NOTE(review): returns int rather than byte; presumably so that an
** end-of-input sentinel can be reported - confirm the value.
*/
TIDY_EXPORT int TIDY_CALL tidyBufGetByte( TidyBuffer* buf );
/** At end of buffer? */
TIDY_EXPORT Bool TIDY_CALL tidyBufEndOfInput( TidyBuffer* buf );
/** Put a byte back into the buffer. Decrement input offset. */
TIDY_EXPORT void TIDY_CALL tidyBufUngetByte( TidyBuffer* buf, byte bv );
/**************
TIDY
**************/
/* Forward declarations
*/
/** Initialize a buffer input source: fills in `inp` for use with `buf` */
TIDY_EXPORT void TIDY_CALL tidyInitInputBuffer( TidyInputSource* inp, TidyBuffer* buf );
/** Initialize a buffer output sink: fills in `outp` for use with `buf` */
TIDY_EXPORT void TIDY_CALL tidyInitOutputBuffer( TidyOutputSink* outp, TidyBuffer* buf );
#ifdef __cplusplus
}
#endif
#endif /* __TIDY_BUFFIO_H__ */
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

858
src/tidyenum.h

@ -0,0 +1,858 @@
#ifndef __TIDYENUM_H__
#define __TIDYENUM_H__
/* @file tidyenum.h -- Split public enums into separate header
Simplifies enum re-use in various wrappers. e.g. SWIG
generated wrappers and COM IDL files.
Copyright (c) 1998-2008 World Wide Web Consortium
(Massachusetts Institute of Technology, European Research
Consortium for Informatics and Mathematics, Keio University).
All Rights Reserved.
Contributing Author(s):
Dave Raggett <dsr@w3.org>
The contributing author(s) would like to thank all those who
helped with testing, bug fixes and suggestions for improvements.
This wouldn't have been possible without your help.
COPYRIGHT NOTICE:
This software and documentation is provided "as is," and
the copyright holders and contributing author(s) make no
representations or warranties, express or implied, including
but not limited to, warranties of merchantability or fitness
for any particular purpose or that the use of the software or
documentation will not infringe any third party patents,
copyrights, trademarks or other rights.
The copyright holders and contributing author(s) will not be held
liable for any direct, indirect, special or consequential damages
arising out of any use of the software or documentation, even if
advised of the possibility of such damage.
Permission is hereby granted to use, copy, modify, and distribute
this source code, or portions hereof, documentation and executables,
for any purpose, without fee, subject to the following restrictions:
1. The origin of this source code must not be misrepresented.
2. Altered versions must be plainly marked as such and must
not be misrepresented as being the original source.
3. This Copyright notice may not be removed or altered from any
source or altered source distribution.
The copyright holders and contributing author(s) specifically
permit, without fee, and encourage the use of this source code
as a component for supporting the Hypertext Markup Language in
commercial products. If you use this source code in a product,
acknowledgment is not required but would be appreciated.
Created 2001-05-20 by Charles Reitzel
Updated 2002-07-01 by Charles Reitzel - 1st Implementation
*/
#ifdef __cplusplus
extern "C" {
#endif
/* Enumerate configuration options
*/
/** Grouping used to categorise Tidy configuration options.
 */
typedef enum
{
TidyMarkup = 0, /**< Markup options: (X)HTML version, etc */
TidyDiagnostics = 1, /**< Diagnostics */
TidyPrettyPrint = 2, /**< Output layout */
TidyEncoding = 3, /**< Character encodings */
TidyMiscellaneous = 4 /**< File handling, message format, etc. */
} TidyConfigCategory;
/** Option IDs Used to get/set option values.
These TidyOptionId are used throughout libtidy, and also
have associated localized strings to describe them.
Note this enum MUST start at zero due to historical design-time
decisions that make assumptions about this starting value.

Where a member depends on a SUPPORT_* build switch, the #else branch
declares the same number of "...NotUsed" placeholders, so the numeric
value of every later member is the same whichever branch is compiled.
NOTE(review): the SUPPORT_* macros get their defaults in tidyplatform.h;
if this header is seen first they are undefined, the #if tests evaluate
to 0 and the placeholder names are selected - confirm include order.
*/
typedef enum
{
TidyUnknownOption, /**< Unknown option! */
TidyIndentSpaces, /**< Indentation n spaces/tabs */
TidyWrapLen, /**< Wrap margin */
TidyTabSize, /**< Expand tabs to n spaces */
TidyCharEncoding, /**< In/out character encoding */
TidyInCharEncoding, /**< Input character encoding (if different) */
TidyOutCharEncoding, /**< Output character encoding (if different) */
TidyNewline, /**< Output line ending (default to platform) */
TidyDoctypeMode, /**< See doctype property */
TidyDoctype, /**< User specified doctype */
TidyDuplicateAttrs, /**< Keep first or last duplicate attribute */
TidyAltText, /**< Default text for alt attribute */
/* obsolete */
TidySlideStyle, /**< Style sheet for slides: not used for anything yet */
TidyErrFile, /**< File name to write errors to */
TidyOutFile, /**< File name to write markup to */
TidyWriteBack, /**< If true then output tidied markup */
TidyShowMarkup, /**< If false, normal output is suppressed */
TidyShowInfo, /**< If true, info-level messages are shown */
TidyShowWarnings, /**< However errors are always shown */
TidyQuiet, /**< No 'Parsing X', guessed DTD or summary */
TidyIndentContent, /**< Indent content of appropriate tags */
/**< "auto" does text/block level content indentation */
TidyCoerceEndTags, /**< Coerce end tags from start tags where probably intended */
TidyOmitOptionalTags,/**< Suppress optional start tags and end tags */
TidyHideEndTags, /**< Legacy name for TidyOmitOptionalTags */
TidyXmlTags, /**< Treat input as XML */
TidyXmlOut, /**< Create output as XML */
TidyXhtmlOut, /**< Output extensible HTML */
TidyHtmlOut, /**< Output plain HTML, even for XHTML input.
Yes means set explicitly. */
TidyXmlDecl, /**< Add <?xml?> for XML docs */
TidyUpperCaseTags, /**< Output tags in upper not lower case */
TidyUpperCaseAttrs, /**< Output attributes in upper not lower case */
TidyMakeBare, /**< Make bare HTML: remove Microsoft cruft */
TidyMakeClean, /**< Replace presentational clutter by style rules */
TidyGDocClean, /**< Clean up HTML exported from Google Docs */
TidyLogicalEmphasis, /**< Replace i by em and b by strong */
TidyDropPropAttrs, /**< Discard proprietary attributes */
TidyDropFontTags, /**< Discard presentation tags */
TidyDropEmptyElems, /**< Discard empty elements */
TidyDropEmptyParas, /**< Discard empty p elements */
TidyFixComments, /**< Fix comments with adjacent hyphens */
TidyBreakBeforeBR, /**< Output newline before <br> or not? */
/* obsolete */
TidyBurstSlides, /**< Create slides on each h2 element */
TidyNumEntities, /**< Use numeric entities */
TidyQuoteMarks, /**< Output " marks as &quot; */
TidyQuoteNbsp, /**< Output non-breaking space as entity */
TidyQuoteAmpersand, /**< Output naked ampersand as &amp; */
TidyWrapAttVals, /**< Wrap within attribute values */
TidyWrapScriptlets, /**< Wrap within JavaScript string literals */
TidyWrapSection, /**< Wrap within <![ ... ]> section tags */
TidyWrapAsp, /**< Wrap within ASP pseudo elements */
TidyWrapJste, /**< Wrap within JSTE pseudo elements */
TidyWrapPhp, /**< Wrap within PHP pseudo elements */
TidyFixBackslash, /**< Fix URLs by replacing \ with / */
TidyIndentAttributes,/**< Newline+indent before each attribute */
TidyXmlPIs, /**< If set to yes PIs must end with ?> */
TidyXmlSpace, /**< If set to yes adds xml:space attr as needed */
TidyEncloseBodyText, /**< If yes text at body is wrapped in P's */
TidyEncloseBlockText,/**< If yes text in blocks is wrapped in P's */
TidyKeepFileTimes, /**< If yes last modified time is preserved */
TidyWord2000, /**< Draconian cleaning for Word2000 */
TidyMark, /**< Add meta element indicating tidied doc */
TidyEmacs, /**< If true format error output for GNU Emacs */
TidyEmacsFile, /**< Name of current Emacs file */
TidyLiteralAttribs, /**< If true attributes may use newlines */
TidyBodyOnly, /**< Output BODY content only */
TidyFixUri, /**< Applies URI encoding if necessary */
TidyLowerLiterals, /**< Folds known attribute values to lower case */
TidyHideComments, /**< Hides all (real) comments in output */
TidyIndentCdata, /**< Indent <![CDATA[ ... ]]> section */
TidyForceOutput, /**< Output document even if errors were found */
TidyShowErrors, /**< Number of errors to put out */
TidyAsciiChars, /**< Convert quotes and dashes to nearest ASCII char */
TidyJoinClasses, /**< Join multiple class attributes */
TidyJoinStyles, /**< Join multiple style attributes */
TidyEscapeCdata, /**< Replace <![CDATA[]]> sections with escaped text */
#if SUPPORT_ASIAN_ENCODINGS
TidyLanguage, /**< Language property: not used for anything yet */
TidyNCR, /**< Allow numeric character references */
#else
TidyLanguageNotUsed, /**< Placeholder keeping later values stable */
TidyNCRNotUsed, /**< Placeholder keeping later values stable */
#endif
#if SUPPORT_UTF16_ENCODINGS
TidyOutputBOM, /**< Output a Byte Order Mark (BOM) for UTF-16 encodings */
/**< auto: if input stream has BOM, we output a BOM */
#else
TidyOutputBOMNotUsed, /**< Placeholder keeping later values stable */
#endif
TidyReplaceColor, /**< Replace hex color attribute values with names */
TidyCSSPrefix, /**< CSS class naming for -clean option */
TidyInlineTags, /**< Declared inline tags */
TidyBlockTags, /**< Declared block tags */
TidyEmptyTags, /**< Declared empty tags */
TidyPreTags, /**< Declared pre tags */
TidyAccessibilityCheckLevel, /**< Accessibility check level
0 (old style), or 1, 2, 3 */
TidyVertSpace, /**< degree to which markup is spread out vertically */
#if SUPPORT_ASIAN_ENCODINGS
TidyPunctWrap, /**< consider punctuation and breaking spaces for wrapping */
#else
TidyPunctWrapNotUsed, /**< Placeholder keeping later values stable */
#endif
TidyMergeEmphasis, /**< Merge nested B and I elements */
TidyMergeDivs, /**< Merge multiple DIVs */
TidyDecorateInferredUL, /**< Mark inferred UL elements with no indent CSS */
TidyPreserveEntities, /**< Preserve entities */
TidySortAttributes, /**< Sort attributes */
TidyMergeSpans, /**< Merge multiple SPANs */
TidyAnchorAsName, /**< Define anchors as name attributes */
TidyPPrintTabs, /**< Indent using tabs instead of spaces */
TidySkipNested, /**< Skip nested tags in script and style CDATA */
TidyStrictTagsAttr, /**< Ensure tags and attributes match output HTML version */
TidyEscapeScripts, /**< Escape items that look like closing tags in script tags */
N_TIDY_OPTIONS /**< Must be last */
} TidyOptionId;
/** Data type carried by a configuration option.
 */
typedef enum
{
TidyString = 0, /**< String */
TidyInteger = 1, /**< Integer or enumeration */
TidyBoolean = 2 /**< Boolean flag */
} TidyOptionType;
/** Three-way (AutoBool) setting, as used by ParseBool, ParseTriState,
 *  ParseIndent and ParseBOM.
 */
typedef enum
{
TidyNoState = 0, /**< maps to 'no' */
TidyYesState = 1, /**< maps to 'yes' */
TidyAutoState = 2 /**< Automatic */
} TidyTriState;
/** Values of the TidyNewline option, selecting the output line ending.
 */
typedef enum
{
TidyLF = 0, /**< Use Unix style: LF */
TidyCRLF = 1, /**< Use DOS/Windows style: CR+LF */
TidyCR = 2 /**< Use Macintosh style: CR */
} TidyLineEnding;
/** How the DOCTYPE declaration is treated.
 */
typedef enum
{
TidyDoctypeHtml5 = 0, /**< <!DOCTYPE html> */
TidyDoctypeOmit = 1, /**< Omit DOCTYPE altogether */
TidyDoctypeAuto = 2, /**< Keep DOCTYPE in input. Set version to content */
TidyDoctypeStrict = 3, /**< Convert document to HTML 4 strict content model */
TidyDoctypeLoose = 4, /**< Convert document to HTML 4 transitional content model */
TidyDoctypeUser = 5 /**< Set DOCTYPE FPI explicitly */
} TidyDoctypeModes;
/** How duplicate attributes are treated.
 */
typedef enum
{
TidyKeepFirst = 0,
TidyKeepLast = 1
} TidyDupAttrModes;
/** How attribute sorting is treated.
 */
typedef enum
{
TidySortAttrNone = 0,
TidySortAttrAlpha = 1
} TidyAttrSortStrategy;
/* I/O and Message handling interface
**
** By default, Tidy will define, create and use
** instances of input and output handlers for
** standard C buffered I/O (i.e. FILE* stdin,
** FILE* stdout and FILE* stderr for content
** input, content output and diagnostic output,
** respectively). A FILE* cfgFile input handler
** will be used for config files. Command line
** options will just be set directly.
*/
/** Message severity level.
 * Used throughout libtidy. These values have no localized strings of
 * their own: TidyReportLevel is externally facing, and changing the
 * integer the enum starts at can break existing APIs for poorly written
 * applications using libtidy. The matching string lookup keys are in
 * enum `TidyReportLevelKeys`.
 */
typedef enum
{
TidyInfo = 0, /**< Information about markup usage */
TidyWarning = 1, /**< Warning message */
TidyConfig = 2, /**< Configuration error */
TidyAccess = 3, /**< Accessibility message */
TidyError = 4, /**< Error message - output suppressed */
TidyBadDocument = 5, /**< I/O or file system error */
TidyFatal = 6 /**< Crash! */
} TidyReportLevel;
/** String lookup keys for the message severity levels.
 * Used throughout libtidy; each key has an associated localized string.
 * The keys correspond, in order, to the members of enum `TidyReportLevel`.
 */
typedef enum
{
TidyInfoString = 600,
TidyWarningString = 601,
TidyConfigString = 602,
TidyAccessString = 603,
TidyErrorString = 604,
TidyBadDocumentString = 605,
TidyFatalString = 606
} TidyReportLevelKeys;
/* Document tree traversal functions
*/
/** Kinds of node found in the document tree.
 */
typedef enum
{
TidyNode_Root = 0, /**< Root */
TidyNode_DocType = 1, /**< DOCTYPE */
TidyNode_Comment = 2, /**< Comment */
TidyNode_ProcIns = 3, /**< Processing Instruction */
TidyNode_Text = 4, /**< Text */
TidyNode_Start = 5, /**< Start Tag */
TidyNode_End = 6, /**< End Tag */
TidyNode_StartEnd = 7, /**< Start/End (empty) Tag */
TidyNode_CDATA = 8, /**< Unparsed Text */
TidyNode_Section = 9, /**< XML Section */
TidyNode_Asp = 10, /**< ASP Source */
TidyNode_Jste = 11, /**< JSTE Source */
TidyNode_Php = 12, /**< PHP Source */
TidyNode_XmlDecl = 13 /**< XML Declaration */
} TidyNodeType;
/** Known HTML element types
**
** Members take consecutive values starting at zero (TidyTag_UNKNOWN), and
** N_TIDY_TAGS, being last, equals the number of tag ids. Inserting or
** reordering members therefore changes the numeric value of every later id.
*/
typedef enum
{
TidyTag_UNKNOWN, /**< Unknown tag! */
TidyTag_A, /**< A */
TidyTag_ABBR, /**< ABBR */
TidyTag_ACRONYM, /**< ACRONYM */
TidyTag_ADDRESS, /**< ADDRESS */
TidyTag_ALIGN, /**< ALIGN */
TidyTag_APPLET, /**< APPLET */
TidyTag_AREA, /**< AREA */
TidyTag_B, /**< B */
TidyTag_BASE, /**< BASE */
TidyTag_BASEFONT, /**< BASEFONT */
TidyTag_BDO, /**< BDO */
TidyTag_BGSOUND, /**< BGSOUND */
TidyTag_BIG, /**< BIG */
TidyTag_BLINK, /**< BLINK */
TidyTag_BLOCKQUOTE, /**< BLOCKQUOTE */
TidyTag_BODY, /**< BODY */
TidyTag_BR, /**< BR */
TidyTag_BUTTON, /**< BUTTON */
TidyTag_CAPTION, /**< CAPTION */
TidyTag_CENTER, /**< CENTER */
TidyTag_CITE, /**< CITE */
TidyTag_CODE, /**< CODE */
TidyTag_COL, /**< COL */
TidyTag_COLGROUP, /**< COLGROUP */
TidyTag_COMMENT, /**< COMMENT */
TidyTag_DD, /**< DD */
TidyTag_DEL, /**< DEL */
TidyTag_DFN, /**< DFN */
TidyTag_DIR, /**< DIR */
TidyTag_DIV, /**< DIV */
TidyTag_DL, /**< DL */
TidyTag_DT, /**< DT */
TidyTag_EM, /**< EM */
TidyTag_EMBED, /**< EMBED */
TidyTag_FIELDSET, /**< FIELDSET */
TidyTag_FONT, /**< FONT */
TidyTag_FORM, /**< FORM */
TidyTag_FRAME, /**< FRAME */
TidyTag_FRAMESET, /**< FRAMESET */
TidyTag_H1, /**< H1 */
TidyTag_H2, /**< H2 */
TidyTag_H3, /**< H3 */
TidyTag_H4, /**< H4 */
TidyTag_H5, /**< H5 */
TidyTag_H6, /**< H6 */
TidyTag_HEAD, /**< HEAD */
TidyTag_HR, /**< HR */
TidyTag_HTML, /**< HTML */
TidyTag_I, /**< I */
TidyTag_IFRAME, /**< IFRAME */
TidyTag_ILAYER, /**< ILAYER */
TidyTag_IMG, /**< IMG */
TidyTag_INPUT, /**< INPUT */
TidyTag_INS, /**< INS */
TidyTag_ISINDEX, /**< ISINDEX */
TidyTag_KBD, /**< KBD */
TidyTag_KEYGEN, /**< KEYGEN */
TidyTag_LABEL, /**< LABEL */
TidyTag_LAYER, /**< LAYER */
TidyTag_LEGEND, /**< LEGEND */
TidyTag_LI, /**< LI */
TidyTag_LINK, /**< LINK */
TidyTag_LISTING, /**< LISTING */
TidyTag_MAP, /**< MAP */
TidyTag_MATHML, /**< MATH (HTML5) [i_a]2 MathML embedded in [X]HTML */
TidyTag_MARQUEE, /**< MARQUEE */
TidyTag_MENU, /**< MENU */
TidyTag_META, /**< META */
TidyTag_MULTICOL, /**< MULTICOL */
TidyTag_NOBR, /**< NOBR */
TidyTag_NOEMBED, /**< NOEMBED */
TidyTag_NOFRAMES, /**< NOFRAMES */
TidyTag_NOLAYER, /**< NOLAYER */
TidyTag_NOSAVE, /**< NOSAVE */
TidyTag_NOSCRIPT, /**< NOSCRIPT */
TidyTag_OBJECT, /**< OBJECT */
TidyTag_OL, /**< OL */
TidyTag_OPTGROUP, /**< OPTGROUP */
TidyTag_OPTION, /**< OPTION */
TidyTag_P, /**< P */
TidyTag_PARAM, /**< PARAM */
TidyTag_PICTURE, /**< PICTURE (HTML5) */
TidyTag_PLAINTEXT,/**< PLAINTEXT */
TidyTag_PRE, /**< PRE */
TidyTag_Q, /**< Q */
TidyTag_RB, /**< RB */
TidyTag_RBC, /**< RBC */
TidyTag_RP, /**< RP */
TidyTag_RT, /**< RT */
TidyTag_RTC, /**< RTC */
TidyTag_RUBY, /**< RUBY */
TidyTag_S, /**< S */
TidyTag_SAMP, /**< SAMP */
TidyTag_SCRIPT, /**< SCRIPT */
TidyTag_SELECT, /**< SELECT */
TidyTag_SERVER, /**< SERVER */
TidyTag_SERVLET, /**< SERVLET */
TidyTag_SMALL, /**< SMALL */
TidyTag_SPACER, /**< SPACER */
TidyTag_SPAN, /**< SPAN */
TidyTag_STRIKE, /**< STRIKE */
TidyTag_STRONG, /**< STRONG */
TidyTag_STYLE, /**< STYLE */
TidyTag_SUB, /**< SUB */
TidyTag_SUP, /**< SUP */
TidyTag_SVG, /**< SVG (HTML5) */
TidyTag_TABLE, /**< TABLE */
TidyTag_TBODY, /**< TBODY */
TidyTag_TD, /**< TD */
TidyTag_TEXTAREA, /**< TEXTAREA */
TidyTag_TFOOT, /**< TFOOT */
TidyTag_TH, /**< TH */
TidyTag_THEAD, /**< THEAD */
TidyTag_TITLE, /**< TITLE */
TidyTag_TR, /**< TR */
TidyTag_TT, /**< TT */
TidyTag_U, /**< U */
TidyTag_UL, /**< UL */
TidyTag_VAR, /**< VAR */
TidyTag_WBR, /**< WBR */
TidyTag_XMP, /**< XMP */
TidyTag_NEXTID, /**< NEXTID */
/* NOTE(review): the members below are appended after the alphabetical list
   above rather than merged into it; presumably later (HTML5) additions kept
   at the end so existing ids keep their values - confirm. */
TidyTag_ARTICLE, /**< ARTICLE */
TidyTag_ASIDE, /**< ASIDE */
TidyTag_AUDIO, /**< AUDIO */
TidyTag_BDI, /**< BDI */
TidyTag_CANVAS, /**< CANVAS */
TidyTag_COMMAND, /**< COMMAND */
TidyTag_DATALIST, /**< DATALIST */
TidyTag_DETAILS, /**< DETAILS */
TidyTag_DIALOG, /**< DIALOG */
TidyTag_FIGCAPTION, /**< FIGCAPTION */
TidyTag_FIGURE, /**< FIGURE */
TidyTag_FOOTER, /**< FOOTER */
TidyTag_HEADER, /**< HEADER */
TidyTag_HGROUP, /**< HGROUP */
TidyTag_MAIN, /**< MAIN */
TidyTag_MARK, /**< MARK */
TidyTag_MENUITEM, /**< MENUITEM */
TidyTag_METER, /**< METER */
TidyTag_NAV, /**< NAV */
TidyTag_OUTPUT, /**< OUTPUT */
TidyTag_PROGRESS, /**< PROGRESS */
TidyTag_SECTION, /**< SECTION */
TidyTag_SOURCE, /**< SOURCE */
TidyTag_SUMMARY, /**< SUMMARY */
TidyTag_TEMPLATE, /**< TEMPLATE */
TidyTag_TIME, /**< TIME */
TidyTag_TRACK, /**< TRACK */
TidyTag_VIDEO, /**< VIDEO */
N_TIDY_TAGS /**< Must be last */
} TidyTagId;
/* Attribute interrogation
*/
/** Known HTML attributes
**
** Members take consecutive values starting at zero (TidyAttr_UNKNOWN), and
** N_TIDY_ATTRIBS, being last, equals the number of attribute ids. Inserting
** or reordering members therefore changes the numeric value of every later
** id.
*/
typedef enum
{
TidyAttr_UNKNOWN, /**< UNKNOWN= */
TidyAttr_ABBR, /**< ABBR= */
TidyAttr_ACCEPT, /**< ACCEPT= */
TidyAttr_ACCEPT_CHARSET, /**< ACCEPT_CHARSET= */
TidyAttr_ACCESSKEY, /**< ACCESSKEY= */
TidyAttr_ACTION, /**< ACTION= */
TidyAttr_ADD_DATE, /**< ADD_DATE= */
TidyAttr_ALIGN, /**< ALIGN= */
TidyAttr_ALINK, /**< ALINK= */
TidyAttr_ALLOWFULLSCREEN, /**< ALLOWFULLSCREEN= */
TidyAttr_ALT, /**< ALT= */
TidyAttr_ARCHIVE, /**< ARCHIVE= */
TidyAttr_AXIS, /**< AXIS= */
TidyAttr_BACKGROUND, /**< BACKGROUND= */
TidyAttr_BGCOLOR, /**< BGCOLOR= */
TidyAttr_BGPROPERTIES, /**< BGPROPERTIES= */
TidyAttr_BORDER, /**< BORDER= */
TidyAttr_BORDERCOLOR, /**< BORDERCOLOR= */
TidyAttr_BOTTOMMARGIN, /**< BOTTOMMARGIN= */
TidyAttr_CELLPADDING, /**< CELLPADDING= */
TidyAttr_CELLSPACING, /**< CELLSPACING= */
TidyAttr_CHAR, /**< CHAR= */
TidyAttr_CHAROFF, /**< CHAROFF= */
TidyAttr_CHARSET, /**< CHARSET= */
TidyAttr_CHECKED, /**< CHECKED= */
TidyAttr_CITE, /**< CITE= */
TidyAttr_CLASS, /**< CLASS= */
TidyAttr_CLASSID, /**< CLASSID= */
TidyAttr_CLEAR, /**< CLEAR= */
TidyAttr_CODE, /**< CODE= */
TidyAttr_CODEBASE, /**< CODEBASE= */
TidyAttr_CODETYPE, /**< CODETYPE= */
TidyAttr_COLOR, /**< COLOR= */
TidyAttr_COLS, /**< COLS= */
TidyAttr_COLSPAN, /**< COLSPAN= */
TidyAttr_COMPACT, /**< COMPACT= */
TidyAttr_CONTENT, /**< CONTENT= */
TidyAttr_COORDS, /**< COORDS= */
TidyAttr_DATA, /**< DATA= */
TidyAttr_DATAFLD, /**< DATAFLD= */
TidyAttr_DATAFORMATAS, /**< DATAFORMATAS= */
TidyAttr_DATAPAGESIZE, /**< DATAPAGESIZE= */
TidyAttr_DATASRC, /**< DATASRC= */
TidyAttr_DATETIME, /**< DATETIME= */
TidyAttr_DECLARE, /**< DECLARE= */
TidyAttr_DEFER, /**< DEFER= */
TidyAttr_DIR, /**< DIR= */
TidyAttr_DISABLED, /**< DISABLED= */
TidyAttr_ENCODING, /**< ENCODING= */
TidyAttr_ENCTYPE, /**< ENCTYPE= */
TidyAttr_FACE, /**< FACE= */
TidyAttr_FOR, /**< FOR= */
TidyAttr_FRAME, /**< FRAME= */
TidyAttr_FRAMEBORDER, /**< FRAMEBORDER= */
TidyAttr_FRAMESPACING, /**< FRAMESPACING= */
TidyAttr_GRIDX, /**< GRIDX= */
TidyAttr_GRIDY, /**< GRIDY= */
TidyAttr_HEADERS, /**< HEADERS= */
TidyAttr_HEIGHT, /**< HEIGHT= */
TidyAttr_HREF, /**< HREF= */
TidyAttr_HREFLANG, /**< HREFLANG= */
TidyAttr_HSPACE, /**< HSPACE= */
TidyAttr_HTTP_EQUIV, /**< HTTP_EQUIV= */
TidyAttr_ID, /**< ID= */
TidyAttr_ISMAP, /**< ISMAP= */
TidyAttr_ITEMID, /**< ITEMID= */
TidyAttr_ITEMPROP, /**< ITEMPROP= */
TidyAttr_ITEMREF, /**< ITEMREF= */
TidyAttr_ITEMSCOPE, /**< ITEMSCOPE= */
TidyAttr_ITEMTYPE, /**< ITEMTYPE= */
TidyAttr_LABEL, /**< LABEL= */
TidyAttr_LANG, /**< LANG= */
TidyAttr_LANGUAGE, /**< LANGUAGE= */
TidyAttr_LAST_MODIFIED, /**< LAST_MODIFIED= */
TidyAttr_LAST_VISIT, /**< LAST_VISIT= */
TidyAttr_LEFTMARGIN, /**< LEFTMARGIN= */
TidyAttr_LINK, /**< LINK= */
TidyAttr_LONGDESC, /**< LONGDESC= */
TidyAttr_LOWSRC, /**< LOWSRC= */
TidyAttr_MARGINHEIGHT, /**< MARGINHEIGHT= */
TidyAttr_MARGINWIDTH, /**< MARGINWIDTH= */
TidyAttr_MAXLENGTH, /**< MAXLENGTH= */
TidyAttr_MEDIA, /**< MEDIA= */
TidyAttr_METHOD, /**< METHOD= */
TidyAttr_MULTIPLE, /**< MULTIPLE= */
TidyAttr_NAME, /**< NAME= */
TidyAttr_NOHREF, /**< NOHREF= */
TidyAttr_NORESIZE, /**< NORESIZE= */
TidyAttr_NOSHADE, /**< NOSHADE= */
TidyAttr_NOWRAP, /**< NOWRAP= */
TidyAttr_OBJECT, /**< OBJECT= */
TidyAttr_OnAFTERUPDATE, /**< OnAFTERUPDATE= */
TidyAttr_OnBEFOREUNLOAD, /**< OnBEFOREUNLOAD= */
TidyAttr_OnBEFOREUPDATE, /**< OnBEFOREUPDATE= */
TidyAttr_OnBLUR, /**< OnBLUR= */
TidyAttr_OnCHANGE, /**< OnCHANGE= */
TidyAttr_OnCLICK, /**< OnCLICK= */
TidyAttr_OnDATAAVAILABLE, /**< OnDATAAVAILABLE= */
TidyAttr_OnDATASETCHANGED, /**< OnDATASETCHANGED= */
TidyAttr_OnDATASETCOMPLETE, /**< OnDATASETCOMPLETE= */
TidyAttr_OnDBLCLICK, /**< OnDBLCLICK= */
TidyAttr_OnERRORUPDATE, /**< OnERRORUPDATE= */
TidyAttr_OnFOCUS, /**< OnFOCUS= */
TidyAttr_OnKEYDOWN, /**< OnKEYDOWN= */
TidyAttr_OnKEYPRESS, /**< OnKEYPRESS= */
TidyAttr_OnKEYUP, /**< OnKEYUP= */
TidyAttr_OnLOAD, /**< OnLOAD= */
TidyAttr_OnMOUSEDOWN, /**< OnMOUSEDOWN= */
TidyAttr_OnMOUSEMOVE, /**< OnMOUSEMOVE= */
TidyAttr_OnMOUSEOUT, /**< OnMOUSEOUT= */
TidyAttr_OnMOUSEOVER, /**< OnMOUSEOVER= */
TidyAttr_OnMOUSEUP, /**< OnMOUSEUP= */
TidyAttr_OnRESET, /**< OnRESET= */
TidyAttr_OnROWENTER, /**< OnROWENTER= */
TidyAttr_OnROWEXIT, /**< OnROWEXIT= */
TidyAttr_OnSELECT, /**< OnSELECT= */
TidyAttr_OnSUBMIT, /**< OnSUBMIT= */
TidyAttr_OnUNLOAD, /**< OnUNLOAD= */
TidyAttr_PROFILE, /**< PROFILE= */
TidyAttr_PROMPT, /**< PROMPT= */
TidyAttr_RBSPAN, /**< RBSPAN= */
TidyAttr_READONLY, /**< READONLY= */
TidyAttr_REL, /**< REL= */
TidyAttr_REV, /**< REV= */
TidyAttr_RIGHTMARGIN, /**< RIGHTMARGIN= */
TidyAttr_ROLE, /**< ROLE= */
TidyAttr_ROWS, /**< ROWS= */
TidyAttr_ROWSPAN, /**< ROWSPAN= */
TidyAttr_RULES, /**< RULES= */
TidyAttr_SCHEME, /**< SCHEME= */
TidyAttr_SCOPE, /**< SCOPE= */
TidyAttr_SCROLLING, /**< SCROLLING= */
TidyAttr_SELECTED, /**< SELECTED= */
TidyAttr_SHAPE, /**< SHAPE= */
TidyAttr_SHOWGRID, /**< SHOWGRID= */
TidyAttr_SHOWGRIDX, /**< SHOWGRIDX= */
TidyAttr_SHOWGRIDY, /**< SHOWGRIDY= */
TidyAttr_SIZE, /**< SIZE= */
TidyAttr_SPAN, /**< SPAN= */
TidyAttr_SRC, /**< SRC= */
TidyAttr_SRCSET, /**< SRCSET= (HTML5) */
TidyAttr_STANDBY, /**< STANDBY= */
TidyAttr_START, /**< START= */
TidyAttr_STYLE, /**< STYLE= */
TidyAttr_SUMMARY, /**< SUMMARY= */
TidyAttr_TABINDEX, /**< TABINDEX= */
TidyAttr_TARGET, /**< TARGET= */
TidyAttr_TEXT, /**< TEXT= */
TidyAttr_TITLE, /**< TITLE= */
TidyAttr_TOPMARGIN, /**< TOPMARGIN= */
TidyAttr_TRANSLATE, /**< TRANSLATE= */
TidyAttr_TYPE, /**< TYPE= */
TidyAttr_USEMAP, /**< USEMAP= */
TidyAttr_VALIGN, /**< VALIGN= */
TidyAttr_VALUE, /**< VALUE= */
TidyAttr_VALUETYPE, /**< VALUETYPE= */
TidyAttr_VERSION, /**< VERSION= */
TidyAttr_VLINK, /**< VLINK= */
TidyAttr_VSPACE, /**< VSPACE= */
TidyAttr_WIDTH, /**< WIDTH= */
TidyAttr_WRAP, /**< WRAP= */
TidyAttr_XML_LANG, /**< XML_LANG= */
TidyAttr_XML_SPACE, /**< XML_SPACE= */
TidyAttr_XMLNS, /**< XMLNS= */
TidyAttr_EVENT, /**< EVENT= */
TidyAttr_METHODS, /**< METHODS= */
TidyAttr_N, /**< N= */
TidyAttr_SDAFORM, /**< SDAFORM= */
TidyAttr_SDAPREF, /**< SDAPREF= */
TidyAttr_SDASUFF, /**< SDASUFF= */
TidyAttr_URN, /**< URN= */
/* NOTE(review): from here on a second alphabetical run starts; presumably
   later (HTML5) additions appended so existing ids keep their values -
   confirm. */
TidyAttr_ASYNC, /**< ASYNC= */
TidyAttr_AUTOCOMPLETE, /**< AUTOCOMPLETE= */
TidyAttr_AUTOFOCUS, /**< AUTOFOCUS= */
TidyAttr_AUTOPLAY, /**< AUTOPLAY= */
TidyAttr_CHALLENGE, /**< CHALLENGE= */
TidyAttr_CONTENTEDITABLE, /**< CONTENTEDITABLE= */
TidyAttr_CONTEXTMENU, /**< CONTEXTMENU= */
TidyAttr_CONTROLS, /**< CONTROLS= */
TidyAttr_CROSSORIGIN, /**< CROSSORIGIN= */
TidyAttr_DEFAULT, /**< DEFAULT= */
TidyAttr_DIRNAME, /**< DIRNAME= */
TidyAttr_DRAGGABLE, /**< DRAGGABLE= */
TidyAttr_DROPZONE, /**< DROPZONE= */
TidyAttr_FORM, /**< FORM= */
TidyAttr_FORMACTION, /**< FORMACTION= */
TidyAttr_FORMENCTYPE, /**< FORMENCTYPE= */
TidyAttr_FORMMETHOD, /**< FORMMETHOD= */
TidyAttr_FORMNOVALIDATE, /**< FORMNOVALIDATE= */
TidyAttr_FORMTARGET, /**< FORMTARGET= */
TidyAttr_HIDDEN, /**< HIDDEN= */
TidyAttr_HIGH, /**< HIGH= */
TidyAttr_ICON, /**< ICON= */
TidyAttr_KEYTYPE, /**< KEYTYPE= */
TidyAttr_KIND, /**< KIND= */
TidyAttr_LIST, /**< LIST= */
TidyAttr_LOOP, /**< LOOP= */
TidyAttr_LOW, /**< LOW= */
TidyAttr_MANIFEST, /**< MANIFEST= */
TidyAttr_MAX, /**< MAX= */
TidyAttr_MEDIAGROUP, /**< MEDIAGROUP= */
TidyAttr_MIN, /**< MIN= */
TidyAttr_NOVALIDATE, /**< NOVALIDATE= */
TidyAttr_OPEN, /**< OPEN= */
TidyAttr_OPTIMUM, /**< OPTIMUM= */
TidyAttr_OnABORT, /**< OnABORT= */
TidyAttr_OnAFTERPRINT, /**< OnAFTERPRINT= */
TidyAttr_OnBEFOREPRINT, /**< OnBEFOREPRINT= */
TidyAttr_OnCANPLAY, /**< OnCANPLAY= */
TidyAttr_OnCANPLAYTHROUGH, /**< OnCANPLAYTHROUGH= */
TidyAttr_OnCONTEXTMENU, /**< OnCONTEXTMENU= */
TidyAttr_OnCUECHANGE, /**< OnCUECHANGE= */
TidyAttr_OnDRAG, /**< OnDRAG= */
TidyAttr_OnDRAGEND, /**< OnDRAGEND= */
TidyAttr_OnDRAGENTER, /**< OnDRAGENTER= */
TidyAttr_OnDRAGLEAVE, /**< OnDRAGLEAVE= */
TidyAttr_OnDRAGOVER, /**< OnDRAGOVER= */
TidyAttr_OnDRAGSTART, /**< OnDRAGSTART= */
TidyAttr_OnDROP, /**< OnDROP= */
TidyAttr_OnDURATIONCHANGE, /**< OnDURATIONCHANGE= */
TidyAttr_OnEMPTIED, /**< OnEMPTIED= */
TidyAttr_OnENDED, /**< OnENDED= */
TidyAttr_OnERROR, /**< OnERROR= */
TidyAttr_OnHASHCHANGE, /**< OnHASHCHANGE= */
TidyAttr_OnINPUT, /**< OnINPUT= */
TidyAttr_OnINVALID, /**< OnINVALID= */
TidyAttr_OnLOADEDDATA, /**< OnLOADEDDATA= */
TidyAttr_OnLOADEDMETADATA, /**< OnLOADEDMETADATA= */
TidyAttr_OnLOADSTART, /**< OnLOADSTART= */
TidyAttr_OnMESSAGE, /**< OnMESSAGE= */
TidyAttr_OnMOUSEWHEEL, /**< OnMOUSEWHEEL= */
TidyAttr_OnOFFLINE, /**< OnOFFLINE= */
TidyAttr_OnONLINE, /**< OnONLINE= */
TidyAttr_OnPAGEHIDE, /**< OnPAGEHIDE= */
TidyAttr_OnPAGESHOW, /**< OnPAGESHOW= */
TidyAttr_OnPAUSE, /**< OnPAUSE= */
TidyAttr_OnPLAY, /**< OnPLAY= */
TidyAttr_OnPLAYING, /**< OnPLAYING= */
TidyAttr_OnPOPSTATE, /**< OnPOPSTATE= */
TidyAttr_OnPROGRESS, /**< OnPROGRESS= */
TidyAttr_OnRATECHANGE, /**< OnRATECHANGE= */
TidyAttr_OnREADYSTATECHANGE, /**< OnREADYSTATECHANGE= */
TidyAttr_OnREDO, /**< OnREDO= */
TidyAttr_OnRESIZE, /**< OnRESIZE= */
TidyAttr_OnSCROLL, /**< OnSCROLL= */
TidyAttr_OnSEEKED, /**< OnSEEKED= */
TidyAttr_OnSEEKING, /**< OnSEEKING= */
TidyAttr_OnSHOW, /**< OnSHOW= */
TidyAttr_OnSTALLED, /**< OnSTALLED= */
TidyAttr_OnSTORAGE, /**< OnSTORAGE= */
TidyAttr_OnSUSPEND, /**< OnSUSPEND= */
TidyAttr_OnTIMEUPDATE, /**< OnTIMEUPDATE= */
TidyAttr_OnUNDO, /**< OnUNDO= */
TidyAttr_OnVOLUMECHANGE, /**< OnVOLUMECHANGE= */
TidyAttr_OnWAITING, /**< OnWAITING= */
TidyAttr_PATTERN, /**< PATTERN= */
TidyAttr_PLACEHOLDER, /**< PLACEHOLDER= */
TidyAttr_POSTER, /**< POSTER= */
TidyAttr_PRELOAD, /**< PRELOAD= */
TidyAttr_PUBDATE, /**< PUBDATE= */
TidyAttr_RADIOGROUP, /**< RADIOGROUP= */
TidyAttr_REQUIRED, /**< REQUIRED= */
TidyAttr_REVERSED, /**< REVERSED= */
TidyAttr_SANDBOX, /**< SANDBOX= */
TidyAttr_SCOPED, /**< SCOPED= */
TidyAttr_SEAMLESS, /**< SEAMLESS= */
TidyAttr_SIZES, /**< SIZES= */
TidyAttr_SPELLCHECK, /**< SPELLCHECK= */
TidyAttr_SRCDOC, /**< SRCDOC= */
TidyAttr_SRCLANG, /**< SRCLANG= */
TidyAttr_STEP, /**< STEP= */
/* ARIA_* attribute ids */
TidyAttr_ARIA_ACTIVEDESCENDANT, /**< ARIA_ACTIVEDESCENDANT= */
TidyAttr_ARIA_ATOMIC, /**< ARIA_ATOMIC= */
TidyAttr_ARIA_AUTOCOMPLETE, /**< ARIA_AUTOCOMPLETE= */
TidyAttr_ARIA_BUSY, /**< ARIA_BUSY= */
TidyAttr_ARIA_CHECKED, /**< ARIA_CHECKED= */
TidyAttr_ARIA_CONTROLS, /**< ARIA_CONTROLS= */
TidyAttr_ARIA_DESCRIBEDBY, /**< ARIA_DESCRIBEDBY= */
TidyAttr_ARIA_DISABLED, /**< ARIA_DISABLED= */
TidyAttr_ARIA_DROPEFFECT, /**< ARIA_DROPEFFECT= */
TidyAttr_ARIA_EXPANDED, /**< ARIA_EXPANDED= */
TidyAttr_ARIA_FLOWTO, /**< ARIA_FLOWTO= */
TidyAttr_ARIA_GRABBED, /**< ARIA_GRABBED= */
TidyAttr_ARIA_HASPOPUP, /**< ARIA_HASPOPUP= */
TidyAttr_ARIA_HIDDEN, /**< ARIA_HIDDEN= */
TidyAttr_ARIA_INVALID, /**< ARIA_INVALID= */
TidyAttr_ARIA_LABEL, /**< ARIA_LABEL= */
TidyAttr_ARIA_LABELLEDBY, /**< ARIA_LABELLEDBY= */
TidyAttr_ARIA_LEVEL, /**< ARIA_LEVEL= */
TidyAttr_ARIA_LIVE, /**< ARIA_LIVE= */
TidyAttr_ARIA_MULTILINE, /**< ARIA_MULTILINE= */
TidyAttr_ARIA_MULTISELECTABLE, /**< ARIA_MULTISELECTABLE= */
TidyAttr_ARIA_ORIENTATION, /**< ARIA_ORIENTATION= */
TidyAttr_ARIA_OWNS, /**< ARIA_OWNS= */
TidyAttr_ARIA_POSINSET, /**< ARIA_POSINSET= */
TidyAttr_ARIA_PRESSED, /**< ARIA_PRESSED= */
TidyAttr_ARIA_READONLY, /**< ARIA_READONLY= */
TidyAttr_ARIA_RELEVANT, /**< ARIA_RELEVANT= */
TidyAttr_ARIA_REQUIRED, /**< ARIA_REQUIRED= */
TidyAttr_ARIA_SELECTED, /**< ARIA_SELECTED= */
TidyAttr_ARIA_SETSIZE, /**< ARIA_SETSIZE= */
TidyAttr_ARIA_SORT, /**< ARIA_SORT= */
TidyAttr_ARIA_VALUEMAX, /**< ARIA_VALUEMAX= */
TidyAttr_ARIA_VALUEMIN, /**< ARIA_VALUEMIN= */
TidyAttr_ARIA_VALUENOW, /**< ARIA_VALUENOW= */
TidyAttr_ARIA_VALUETEXT, /**< ARIA_VALUETEXT= */
/* SVG attributes (SVG 1.1) */
TidyAttr_X, /**< X= */
TidyAttr_Y, /**< Y= */
TidyAttr_VIEWBOX, /**< VIEWBOX= */
TidyAttr_PRESERVEASPECTRATIO, /**< PRESERVEASPECTRATIO= */
TidyAttr_ZOOMANDPAN, /**< ZOOMANDPAN= */
TidyAttr_BASEPROFILE, /**< BASEPROFILE= */
TidyAttr_CONTENTSCRIPTTYPE, /**< CONTENTSCRIPTTYPE= */
TidyAttr_CONTENTSTYLETYPE, /**< CONTENTSTYLETYPE= */
/* MathML <math> attributes */
TidyAttr_DISPLAY, /**< DISPLAY= (html5) */
/* RDFa global attributes */
TidyAttr_ABOUT, /**< ABOUT= */
TidyAttr_DATATYPE, /**< DATATYPE= */
TidyAttr_INLIST, /**< INLIST= */
TidyAttr_PREFIX, /**< PREFIX= */
TidyAttr_PROPERTY, /**< PROPERTY= */
TidyAttr_RESOURCE, /**< RESOURCE= */
TidyAttr_TYPEOF, /**< TYPEOF= */
TidyAttr_VOCAB, /**< VOCAB= */
TidyAttr_INTEGRITY, /**< INTEGRITY= */
N_TIDY_ATTRIBS /**< Must be last */
} TidyAttrId;
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* __TIDYENUM_H__ */

2356
src/tidylib.c

File diff suppressed because it is too large

635
src/tidyplatform.h

@ -0,0 +1,635 @@
#ifndef __TIDY_PLATFORM_H__
#define __TIDY_PLATFORM_H__
/* tidyplatform.h -- Platform specifics
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
Uncomment and edit one of the following #defines if you
want to specify the config file at compile-time.
*/
/* #define TIDY_CONFIG_FILE "/etc/tidy_config.txt" */ /* original */
/* #define TIDY_CONFIG_FILE "/etc/tidyrc" */
/* #define TIDY_CONFIG_FILE "/etc/tidy.conf" */
/*
Uncomment the following #define if you are on a system
supporting the HOME environment variable.
It enables tidy to find config files named ~/.tidyrc if
the HTML_TIDY environment variable is not set.
*/
/* #define TIDY_USER_CONFIG_FILE "~/.tidyrc" */
/*
Uncomment the following #define if your
system supports the call getpwnam().
E.g. Unix and Linux.
It enables tidy to find files named
~your/foo for use in the HTML_TIDY environment
variable or CONFIG_FILE or USER_CONFIGFILE or
on the command line: -config ~joebob/tidy.cfg
Contributed by Todd Lewis.
*/
/* #define SUPPORT_GETPWNAM */
/* Enable/disable support for Big5 and Shift_JIS character encodings */
#ifndef SUPPORT_ASIAN_ENCODINGS
#define SUPPORT_ASIAN_ENCODINGS 1
#endif
/* Enable/disable support for UTF-16 character encodings */
#ifndef SUPPORT_UTF16_ENCODINGS
#define SUPPORT_UTF16_ENCODINGS 1
#endif
/* Enable/disable support for additional accessibility checks */
#ifndef SUPPORT_ACCESSIBILITY_CHECKS
#define SUPPORT_ACCESSIBILITY_CHECKS 1
#endif
/* Enable/disable support for additional languages */
#ifndef SUPPORT_LOCALIZATIONS
#define SUPPORT_LOCALIZATIONS 1
#endif
/* Convenience defines for Mac platforms */
#if defined(macintosh)
/* Mac OS 6.x/7.x/8.x/9.x, with or without CarbonLib - MPW or Metrowerks 68K/PPC compilers */
#define MAC_OS_CLASSIC
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Mac OS"
#endif
/* needed for access() */
#if !defined(_POSIX) && !defined(NO_ACCESS_SUPPORT)
#define NO_ACCESS_SUPPORT
#endif
#ifdef SUPPORT_GETPWNAM
#undef SUPPORT_GETPWNAM
#endif
#elif defined(__APPLE__) && defined(__MACH__)
/* Mac OS X (client) 10.x (or server 1.x/10.x) - gcc or Metrowerks MachO compilers */
#define MAC_OS_X
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Mac OS X"
#endif
#endif
#if defined(MAC_OS_CLASSIC) || defined(MAC_OS_X)
/* Any OS on Mac platform */
#define MAC_OS
#define FILENAMES_CASE_SENSITIVE 0
#define strcasecmp strcmp
#endif
/* Convenience defines for BSD like platforms */
#if defined(__FreeBSD__)
#define BSD_BASED_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "FreeBSD"
#endif
#elif defined(__NetBSD__)
#define BSD_BASED_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "NetBSD"
#endif
#elif defined(__OpenBSD__)
#define BSD_BASED_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "OpenBSD"
#endif
#elif defined(__DragonFly__)
#define BSD_BASED_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "DragonFly"
#endif
#elif defined(__MINT__)
#define BSD_BASED_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "FreeMiNT"
#endif
#elif defined(__bsdi__)
#define BSD_BASED_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "BSD/OS"
#endif
#endif
/* Convenience defines for Windows platforms */
#if defined(WINDOWS) || defined(_WIN32)
#define WINDOWS_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Windows"
#endif
#if defined(__MWERKS__) || defined(__MSL__)
/* not available with Metrowerks Standard Library */
#ifdef SUPPORT_GETPWNAM
#undef SUPPORT_GETPWNAM
#endif
/* needed for setmode() */
#if !defined(NO_SETMODE_SUPPORT)
#define NO_SETMODE_SUPPORT
#endif
#define strcasecmp _stricmp
#endif
#if defined(__BORLANDC__)
#define strcasecmp stricmp
#endif
#define FILENAMES_CASE_SENSITIVE 0
#define SUPPORT_POSIX_MAPPED_FILES 0
#endif
/* Convenience defines for Linux platforms */
#if defined(linux) && defined(__alpha__)
/* Linux on Alpha - gcc compiler */
#define LINUX_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Linux/Alpha"
#endif
#elif defined(linux) && defined(__sparc__)
/* Linux on Sparc - gcc compiler */
#define LINUX_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Linux/Sparc"
#endif
#elif defined(linux) && (defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__))
/* Linux on x86 - gcc compiler */
#define LINUX_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Linux/x86"
#endif
#elif defined(linux) && defined(__powerpc__)
/* Linux on PPC - gcc compiler */
#define LINUX_OS
#if defined(__linux__) && defined(__powerpc__)
/* #if #system(linux) */
/* MkLinux on PPC - gcc (egcs) compiler */
/* #define MAC_OS_MKLINUX */
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "MkLinux"
#endif
#else
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Linux/PPC"
#endif
#endif
#elif defined(linux) || defined(__linux__)
/* generic Linux */
#define LINUX_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Linux"
#endif
#endif
/* Convenience defines for Solaris platforms */
#if defined(sun)
#define SOLARIS_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Solaris"
#endif
#endif
/* Convenience defines for HPUX + gcc platforms */
#if defined(__hpux)
#define HPUX_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "HPUX"
#endif
#endif
/* Convenience defines for RISCOS + gcc platforms */
#if defined(__riscos__)
#define RISC_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "RISC OS"
#endif
#endif
/* Convenience defines for OS/2 + icc/gcc platforms */
#if defined(__OS2__) || defined(__EMX__)
#define OS2_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "OS/2"
#endif
#define FILENAMES_CASE_SENSITIVE 0
#define strcasecmp stricmp
#endif
/* Convenience defines for IRIX */
#if defined(__sgi)
#define IRIX_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "SGI IRIX"
#endif
#endif
/* Convenience defines for AIX */
#if defined(_AIX)
#define AIX_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "IBM AIX"
#endif
#endif
/* Convenience defines for BeOS platforms */
#if defined(__BEOS__)
#define BE_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "BeOS"
#endif
#endif
/* Convenience defines for Cygwin platforms */
#if defined(__CYGWIN__)
#define CYGWIN_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Cygwin"
#endif
#define FILENAMES_CASE_SENSITIVE 0
#endif
/* Convenience defines for OpenVMS */
#if defined(__VMS)
#define OPENVMS_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "OpenVMS"
#endif
#define FILENAMES_CASE_SENSITIVE 0
#endif
/* Convenience defines for DEC Alpha OSF + gcc platforms */
#if defined(__osf__)
#define OSF_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "DEC Alpha OSF"
#endif
#endif
/* Convenience defines for ARM platforms */
#if defined(__arm)
#define ARM_OS
#if defined(forARM) && defined(__NEWTON_H)
/* Using Newton C++ Tools ARMCpp compiler */
#define NEWTON_OS
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "Newton"
#endif
#else
#ifndef PLATFORM_NAME
#define PLATFORM_NAME "ARM"
#endif
#endif
#endif
#include <ctype.h>
#include <stdio.h>
#include <setjmp.h> /* for longjmp on error exit */
#include <stdlib.h>
#include <stdarg.h> /* may need <varargs.h> for Unix V */
#include <string.h>
#include <assert.h>
#ifdef NEEDS_MALLOC_H
#include <malloc.h>
#endif
#ifdef SUPPORT_GETPWNAM
#include <pwd.h>
#endif
#ifdef NEEDS_UNISTD_H
#include <unistd.h> /* needed for unlink on some Unix systems */
#endif
/* By default, use case-sensitive filename comparison.
*/
#ifndef FILENAMES_CASE_SENSITIVE
#define FILENAMES_CASE_SENSITIVE 1
#endif
/*
Tidy preserves the last modified time for the files it
cleans up.
*/
/*
If your platform doesn't support <utime.h> and the
utime() function, or <sys/futime> and the futime()
function then set PRESERVE_FILE_TIMES to 0.
If your platform doesn't support <sys/utime.h> and the
futime() function, then set HAS_FUTIME to 0.
If your platform supports <utime.h> and the
utime() function requires the file to be
closed first, then set UTIME_NEEDS_CLOSED_FILE to 1.
*/
/* Keep old PRESERVEFILETIMES define for compatibility */
#ifdef PRESERVEFILETIMES
#undef PRESERVE_FILE_TIMES
#define PRESERVE_FILE_TIMES PRESERVEFILETIMES
#endif
#ifndef PRESERVE_FILE_TIMES
#if defined(RISC_OS) || defined(OPENVMS_OS) || defined(OSF_OS)
#define PRESERVE_FILE_TIMES 0
#else
#define PRESERVE_FILE_TIMES 1
#endif
#endif
#if PRESERVE_FILE_TIMES
#ifndef HAS_FUTIME
#if defined(CYGWIN_OS) || defined(BE_OS) || defined(OS2_OS) || defined(HPUX_OS) || defined(SOLARIS_OS) || defined(LINUX_OS) || defined(BSD_BASED_OS) || defined(MAC_OS) || defined(__MSL__) || defined(IRIX_OS) || defined(AIX_OS) || defined(__BORLANDC__) || defined(__GLIBC__)
#define HAS_FUTIME 0
#else
#define HAS_FUTIME 1
#endif
#endif
#ifndef UTIME_NEEDS_CLOSED_FILE
#if defined(SOLARIS_OS) || defined(BSD_BASED_OS) || defined(MAC_OS) || defined(__MSL__) || defined(LINUX_OS)
#define UTIME_NEEDS_CLOSED_FILE 1
#else
#define UTIME_NEEDS_CLOSED_FILE 0
#endif
#endif
#if defined(MAC_OS_X) || (!defined(MAC_OS_CLASSIC) && !defined(__MSL__))
#include <sys/types.h>
#include <sys/stat.h>
#else
#include <stat.h>
#endif
#if HAS_FUTIME
#include <sys/utime.h>
#else
#include <utime.h>
#endif /* HASFUTIME */
/*
MS Windows needs _ prefix for Unix file functions.
Not required by Metrowerks Standard Library (MSL).
Tidy uses following for preserving the last modified time.
WINDOWS automatically set by Win16 compilers.
_WIN32 automatically set by Win32 compilers.
*/
#if defined(_WIN32) && !defined(__MSL__) && !defined(__BORLANDC__)
#define futime _futime
#define fstat _fstat
#define utimbuf _utimbuf /* Windows seems to want utimbuf */
#define stat _stat
#define utime _utime
#define vsnprintf _vsnprintf
#endif /* _WIN32 */
#endif /* PRESERVE_FILE_TIMES */
/*
MS Windows needs _ prefix for Unix file functions.
Not required by Metrowerks Standard Library (MSL).
WINDOWS automatically set by Win16 compilers.
_WIN32 automatically set by Win32 compilers.
*/
#if defined(_WIN32) && !defined(__MSL__) && !defined(__BORLANDC__)
#if !(defined(__WATCOMC__) || defined(__MINGW32__))
#define fileno _fileno
#define setmode _setmode
#endif
#define access _access
#define strcasecmp _stricmp
#ifndef va_copy
#define va_copy(dest, src) (dest = src)
#endif
#if _MSC_VER > 1000
#pragma warning( disable : 4189 ) /* local variable is initialized but not referenced */
#pragma warning( disable : 4100 ) /* unreferenced formal parameter */
#pragma warning( disable : 4706 ) /* assignment within conditional expression */
#endif
#if _MSC_VER > 1300
#pragma warning( disable : 4996 ) /* disable deprecation warning */
#endif
#endif /* _WIN32 */
#if defined(_WIN32)
#if (defined(_USRDLL) || defined(_WINDLL) || defined(BUILD_SHARED_LIB)) && !defined(TIDY_EXPORT) && !defined(TIDY_STATIC)
#ifdef BUILDING_SHARED_LIB
#define TIDY_EXPORT __declspec( dllexport )
#else
#define TIDY_EXPORT __declspec( dllimport )
#endif
#else
#define TIDY_EXPORT extern
#endif
#ifndef TIDY_CALL
#ifdef _WIN64
# define TIDY_CALL __fastcall
#else
# define TIDY_CALL __stdcall
#endif
#endif
#endif /* _WIN32 */
/* hack for gnu sys/types.h file which defines uint and ulong */
#if defined(BE_OS) || defined(SOLARIS_OS) || defined(BSD_BASED_OS) || defined(OSF_OS) || defined(IRIX_OS) || defined(AIX_OS)
#include <sys/types.h>
#endif
#if !defined(HPUX_OS) && !defined(CYGWIN_OS) && !defined(MAC_OS_X) && !defined(BE_OS) && !defined(SOLARIS_OS) && !defined(BSD_BASED_OS) && !defined(OSF_OS) && !defined(IRIX_OS) && !defined(AIX_OS) && !defined(LINUX_OS)
# undef uint
typedef unsigned int uint;
#endif
#if defined(HPUX_OS) || defined(CYGWIN_OS) || defined(MAC_OS) || defined(BSD_BASED_OS) || defined(_WIN32)
# undef ulong
typedef unsigned long ulong;
#endif
/*
With GCC 4 and later, __attribute__ ((visibility("default"))) can be used when tidylib is
compiled with "-fvisibility=hidden". See http://gcc.gnu.org/wiki/Visibility and build/gmake/Makefile.
*/
/*
#if defined(__GNUC__) && __GNUC__ >= 4
#define TIDY_EXPORT __attribute__ ((visibility("default")))
#endif
*/
#ifndef TIDY_EXPORT /* Define it away for most builds */
#define TIDY_EXPORT
#endif
#ifndef TIDY_STRUCT
#define TIDY_STRUCT
#endif
typedef unsigned char byte;
typedef uint tchar; /* single, full character */
typedef char tmbchar; /* single, possibly partial character */
#ifndef TMBSTR_DEFINED
typedef tmbchar* tmbstr; /* pointer to buffer of possibly partial chars */
typedef const tmbchar* ctmbstr; /* Ditto, but const */
#define NULLSTR (tmbstr)""
#define TMBSTR_DEFINED
#endif
#ifndef TIDY_CALL
#define TIDY_CALL
#endif
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
# define ARG_UNUSED(x) x __attribute__((unused))
#else
# define ARG_UNUSED(x) x
#endif
/* HAS_VSNPRINTF selects the use of "vsnprintf", which bounds its output and
so guards against buffer overflow. It is therefore enabled by default unless
HAS_VSNPRINTF has already been defined. */
#ifndef HAS_VSNPRINTF
# define HAS_VSNPRINTF 1
#endif
#ifndef SUPPORT_POSIX_MAPPED_FILES
# define SUPPORT_POSIX_MAPPED_FILES 1
#endif
/*
bool is a reserved word in some but
not all C++ compilers depending on age
work around is to avoid bool altogether
by introducing a new enum called Bool
*/
/* We could use the C99 definition where supported
typedef _Bool Bool;
#define no (_Bool)0
#define yes (_Bool)1
*/
/* Two-valued boolean replacement. Being the first two enumerators without
** explicit values, `no` is 0 and `yes` is 1, so the result of a C comparison
** can be returned directly as a Bool.
*/
typedef enum
{
no,
yes
} Bool;
/* for NULL pointers
#define null ((const void*)0)
extern void* null;
*/
#if defined(DMALLOC)
#include "dmalloc.h"
#endif
/* Opaque data structure.
* Cast to implementation type struct within lib.
* This will reduce inter-dependencies/conflicts w/ application code.
*
* opaque_type( Foo ) declares a dummy struct `_Foo` with a single int member
* and makes `Foo` a pointer-to-const of that struct, so each opaque handle is
* a distinct pointer type. The disabled alternative would make every handle
* a plain `const void*`.
*/
#if 1
#define opaque_type( typenam )\
struct _##typenam { int _opaque; };\
typedef struct _##typenam const * typenam
#else
#define opaque_type(typenam) typedef const void* typenam
#endif
/* Opaque data structure used to pass back
** and forth to keep current position in a
** list or other collection.
*/
opaque_type( TidyIterator );
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* __TIDY_PLATFORM_H__ */
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

295
src/tmbstr.c

@ -0,0 +1,295 @@
/* tmbstr.c -- Tidy string utility functions
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "forward.h"
#include "tmbstr.h"
#include "lexer.h"
/* Duplicate a NUL-terminated string into storage obtained from `allocator`
** (like strdup, but with a caller-supplied allocator).
** Returns NULL when `str` is NULL; otherwise a newly allocated copy of
** tmbstrlen(str) + 1 bytes, including the terminator.
*/
tmbstr TY_(tmbstrdup)( TidyAllocator *allocator, ctmbstr str )
{
    tmbstr copy;
    tmbstr dst;
    uint nbytes;

    if ( !str )
        return NULL;

    nbytes = TY_(tmbstrlen)( str );
    copy = (tmbstr) TidyAlloc( allocator, 1+nbytes );
    dst = copy;

    /* copy every byte up to and including the terminating NUL */
    do
    {
        *dst = *str++;
    } while ( *dst++ != 0 );

    return copy;
}
/* Duplicate at most `len` bytes of `str` into storage obtained from
** `allocator` (like strndup, but with a caller-supplied allocator).
** Returns NULL when `str` is NULL or `len` is 0. The buffer is always
** 1+len bytes and the result is always NUL-terminated.
*/
tmbstr TY_(tmbstrndup)( TidyAllocator *allocator, ctmbstr str, uint len )
{
    tmbstr copy;
    tmbstr dst;
    uint remaining;

    if ( !str || len == 0 )
        return NULL;

    copy = dst = (tmbstr) TidyAlloc( allocator, 1+len );
    for ( remaining = len; remaining > 0; --remaining )
    {
        tmbchar ch = *str++;
        *dst++ = ch;
        if ( ch == 0 )      /* source ended before len bytes */
            break;
    }
    *dst = 0;
    return copy;
}
/* Bounded string copy. Unlike strncpy, the destination is always
** NUL-terminated (one byte of `size` is reserved for the terminator) and
** the remainder is not zero-padded.
**
** s1    destination buffer of at least `size` bytes
** s2    NUL-terminated source
** size  capacity of s1 in bytes, including the terminator
**
** Returns what is left of `size` after copying: 0 means the source was
** truncated (or size was 0). Nothing is written when s1 or s2 is NULL or
** when size is 0.
*/
uint TY_(tmbstrncpy)( tmbstr s1, ctmbstr s2, uint size )
{
    /* size == 0 must be rejected here: the pre-decrement below would wrap
    ** to UINT_MAX and turn the copy into an unbounded one.
    */
    if ( s1 != NULL && s2 != NULL && size > 0 )
    {
        tmbstr cp = s1;
        while ( *s2 && --size )  /* Predecrement: reserve byte */
            *cp++ = *s2++;       /* for NUL terminator. */
        *cp = 0;
    }
    return size;
}
/* Copy s2 (including its terminator) to s1 and return the number of
** bytes copied, not counting the terminator.
** Allows expressions like: cp += tmbstrcpy( cp, "joebob" );
*/
uint TY_(tmbstrcpy)( tmbstr s1, ctmbstr s2 )
{
    uint copied = 0;

    for ( ;; )
    {
        tmbchar ch = *s2++;
        *s1++ = ch;
        if ( ch == 0 )
            return copied;
        ++copied;
    }
}
/* Append s2 (including its terminator) to the end of s1 and return the
** number of bytes appended, not counting the terminator.
** Allows expressions like: cp += tmbstrcat( cp, "joebob" );
*/
uint TY_(tmbstrcat)( tmbstr s1, ctmbstr s2 )
{
    uint appended = 0;
    tmbstr dst = s1;

    /* find the existing terminator of s1 */
    while ( *dst != 0 )
        ++dst;

    for ( ; (*dst = *s2) != 0; ++dst, ++s2 )
        ++appended;

    return appended;
}
/* strcmp-style comparison: 0 when the strings are equal, otherwise 1 or -1
** according to the first differing pair. The differing pair is compared
** as plain `char` values, and only -1/0/1 are ever returned.
*/
int TY_(tmbstrcmp)( ctmbstr s1, ctmbstr s2 )
{
    for ( ; *s1 == *s2; ++s1, ++s2 )
    {
        if ( *s1 == '\0' )
            return 0;
    }
    return ( *s1 > *s2 ) ? 1 : -1;
}
/* Length of a NUL-terminated string in bytes (not characters).
** A NULL pointer is treated as an empty string and yields 0.
*/
uint TY_(tmbstrlen)( ctmbstr str )
{
    uint nbytes = 0;

    if ( !str )
        return 0;

    for ( ; *str != 0; ++str )
        ++nbytes;

    return nbytes;
}
/*
MS C 4.2 doesn't include strcasecmp, hence this replacement.
Characters are folded with ToLower() before being tested for equality;
per the original note neither tolower/toupper nor ToLower() work on
chars > 127.
Returns 0 when the strings match ignoring case. On a mismatch the
sign (1 or -1) is taken from the raw, unfolded bytes at that position.
*/
int TY_(tmbstrcasecmp)( ctmbstr s1, ctmbstr s2 )
{
    for ( ;; ++s1, ++s2 )
    {
        uint a = (uint)(*s1);
        uint b = (uint)(*s2);

        if ( TY_(ToLower)(a) != TY_(ToLower)(b) )
            break;
        if ( a == '\0' )
            return 0;
    }
    return ( *s1 > *s2 ) ? 1 : -1;
}
/* Compare at most `n` bytes of s1 and s2.
** Returns 0 when the first n bytes match (or both strings end earlier at
** the same position), otherwise 1 or -1 taken from the first differing
** pair compared as plain `char` values.
**
** The length is tested before each pair is read, so no byte at or beyond
** index n is ever touched; this matters for callers such as tmbsubstrn
** that pass buffers which are only guaranteed to hold n bytes.
*/
int TY_(tmbstrncmp)( ctmbstr s1, ctmbstr s2, uint n )
{
    for ( ; n > 0; ++s1, ++s2, --n )
    {
        if ( (byte)*s1 != (byte)*s2 )
            return (*s1 > *s2 ? 1 : -1);
        if ( *s1 == '\0' )      /* both strings ended together */
            return 0;
    }
    return 0;
}
/* Case-insensitive comparison of at most `n` bytes of s1 and s2.
** Bytes are folded with ToLower() before being tested for equality.
** Returns 0 when the first n bytes match ignoring case (or both strings
** end earlier at the same position); otherwise 1 or -1, with the sign
** taken from the raw, unfolded bytes at the first mismatch.
**
** The length is tested before each pair is read, so no byte at or beyond
** index n is ever touched.
*/
int TY_(tmbstrncasecmp)( ctmbstr s1, ctmbstr s2, uint n )
{
    for ( ; n > 0; ++s1, ++s2, --n )
    {
        uint c = (uint)(*s1);

        if ( TY_(ToLower)(c) != TY_(ToLower)((uint)(*s2)) )
            return (*s1 > *s2 ? 1 : -1);
        if ( c == '\0' )        /* both strings ended together */
            return 0;
    }
    return 0;
}
#if 0
/* Compiled out (#if 0); kept for reference only. */
/* return offset of cc from beginning of s1,
** -1 if not found.
** Scans exactly maxlen bytes and does not stop at a NUL terminator.
*/
int TY_(tmbstrnchr)( ctmbstr s1, uint maxlen, tmbchar cc )
{
int i;
ctmbstr cp = s1;
for ( i = 0; (uint)i < maxlen; ++i, ++cp )
{
if ( *cp == cc )
return i;
}
return -1;
}
#endif
/* Case-sensitive search for the NUL-terminated string s2 inside the first
** len1 bytes of s1.
** Returns a pointer to the first match within s1, or NULL when s2 does not
** occur (including when s2 is longer than len1).
*/
ctmbstr TY_(tmbsubstrn)( ctmbstr s1, uint len1, ctmbstr s2 )
{
    uint len2 = TY_(tmbstrlen)(s2);
    uint ix;

    /* Guard explicitly instead of relying on an unsigned difference being
    ** converted to a negative int.
    */
    if ( len2 > len1 )
        return NULL;

    for ( ix = 0; ix <= len1 - len2; ++ix )
    {
        if ( TY_(tmbstrncmp)(s1+ix, s2, len2) == 0 )
            return (ctmbstr) s1+ix;
    }
    return NULL;
}
#if 0
/* Compiled out (#if 0); kept for reference only.
** Case-insensitive variant of tmbsubstrn: looks for s2 within the first
** len1 bytes of s1 using tmbstrncasecmp.
*/
ctmbstr TY_(tmbsubstrncase)( ctmbstr s1, uint len1, ctmbstr s2 )
{
uint len2 = TY_(tmbstrlen)(s2);
int ix, diff = len1 - len2;
for ( ix = 0; ix <= diff; ++ix )
{
if ( TY_(tmbstrncasecmp)(s1+ix, s2, len2) == 0 )
return (ctmbstr) s1+ix;
}
return NULL;
}
#endif
/* Case-insensitive search for s2 inside s1 (both NUL-terminated); the
** comparison is done with tmbstrncasecmp.
** Returns a pointer to the first match within s1, or NULL when s2 does not
** occur (including when s2 is longer than s1).
*/
ctmbstr TY_(tmbsubstr)( ctmbstr s1, ctmbstr s2 )
{
    uint len1 = TY_(tmbstrlen)(s1), len2 = TY_(tmbstrlen)(s2);
    uint ix;

    /* Guard explicitly instead of relying on an unsigned difference being
    ** converted to a negative int.
    */
    if ( len2 > len1 )
        return NULL;

    for ( ix = 0; ix <= len1 - len2; ++ix )
    {
        if ( TY_(tmbstrncasecmp)(s1+ix, s2, len2) == 0 )
            return (ctmbstr) s1+ix;
    }
    return NULL;
}
/* Lower-case a string in place by passing every byte through ToLower().
** Returns its argument so calls can be nested.
*/
tmbstr TY_(tmbstrtolower)( tmbstr s )
{
    tmbstr p = s;

    while ( *p )
    {
        *p = (tmbchar) TY_(ToLower)( *p );
        ++p;
    }
    return s;
}
/* Upper-case a string in place by passing every byte through ToUpper().
** Returns its argument so calls can be nested.
*/
tmbstr TY_(tmbstrtoupper)(tmbstr s)
{
    tmbstr p = s;

    while ( *p )
    {
        *p = (tmbchar)TY_(ToUpper)(*p);
        ++p;
    }
    return s;
}
#if 0
/* Compiled out (#if 0); kept for reference only.
** Compares two file names, case-sensitively or not depending on
** FILENAMES_CASE_SENSITIVE.
*/
Bool TY_(tmbsamefile)( ctmbstr filename1, ctmbstr filename2 )
{
#if FILENAMES_CASE_SENSITIVE
return ( TY_(tmbstrcmp)( filename1, filename2 ) == 0 );
#else
return ( TY_(tmbstrcasecmp)( filename1, filename2 ) == 0 );
#endif
}
#endif
/* Format into `buffer` (capacity `count` bytes) from a va_list.
**
** With HAS_VSNPRINTF the output is bounded: vsnprintf is given count - 1
** and the last byte of the buffer is then forced to NUL, so the result is
** terminated even where the underlying vsnprintf does not terminate on
** truncation. Without HAS_VSNPRINTF the call falls back to the unbounded
** vsprintf and `count` is ignored.
**
** Returns whatever the underlying formatting function returns.
*/
int TY_(tmbvsnprintf)(tmbstr buffer, size_t count, ctmbstr format, va_list args)
{
    int retval;
#if HAS_VSNPRINTF
    /* count - 1 would wrap to SIZE_MAX for an empty buffer, so handle the
    ** zero case on its own: nothing may be written.
    */
    if ( count == 0 )
        return vsnprintf(buffer, 0, format, args);

    retval = vsnprintf(buffer, count - 1, format, args);
    /* todo: conditionally null-terminate the string? */
    buffer[count - 1] = 0;
#else
    retval = vsprintf(buffer, format, args);
#endif /* HAS_VSNPRINTF */
    return retval;
}
/* printf-style front end for tmbvsnprintf: packs the variable arguments
** into a va_list, forwards them, and returns tmbvsnprintf's result.
*/
int TY_(tmbsnprintf)(tmbstr buffer, size_t count, ctmbstr format, ...)
{
    va_list ap;
    int written;

    va_start(ap, format);
    written = TY_(tmbvsnprintf)(buffer, count, format, ap);
    va_end(ap);

    return written;
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

86
src/tmbstr.h

@ -0,0 +1,86 @@
#ifndef __TMBSTR_H__
#define __TMBSTR_H__
/* tmbstr.h - Tidy string utility functions
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidyplatform.h"
#ifdef __cplusplus
extern "C"
{
#endif
/* like strdup but using an allocator */
tmbstr TY_(tmbstrdup)( TidyAllocator *allocator, ctmbstr str );
/* like strndup but using an allocator */
tmbstr TY_(tmbstrndup)( TidyAllocator *allocator, ctmbstr str, uint len);
/* bounded copy that, unlike strncpy, always NUL-terminates; returns the unused remainder of size */
uint TY_(tmbstrncpy)( tmbstr s1, ctmbstr s2, uint size );
uint TY_(tmbstrcpy)( tmbstr s1, ctmbstr s2 );
uint TY_(tmbstrcat)( tmbstr s1, ctmbstr s2 );
/* exactly same as strcmp */
int TY_(tmbstrcmp)( ctmbstr s1, ctmbstr s2 );
/* returns byte count, not char count */
uint TY_(tmbstrlen)( ctmbstr str );
/*
MS C 4.2 doesn't include strcasecmp.
Note that tolower and toupper won't
work on chars > 127.
Neither do Lexer.ToLower() or Lexer.ToUpper()!
We get away with this because, except for XML tags,
we are always comparing to ascii element and
attribute names defined by HTML specs.
*/
int TY_(tmbstrcasecmp)( ctmbstr s1, ctmbstr s2 );
int TY_(tmbstrncmp)( ctmbstr s1, ctmbstr s2, uint n );
int TY_(tmbstrncasecmp)( ctmbstr s1, ctmbstr s2, uint n );
/* return offset of cc from beginning of s1,
** -1 if not found.
*/
/* int TY_(tmbstrnchr)( ctmbstr s1, uint len1, tmbchar cc ); */
ctmbstr TY_(tmbsubstrn)( ctmbstr s1, uint len1, ctmbstr s2 );
/* ctmbstr TY_(tmbsubstrncase)( ctmbstr s1, uint len1, ctmbstr s2 ); */
ctmbstr TY_(tmbsubstr)( ctmbstr s1, ctmbstr s2 );
/* transform string to lower case */
tmbstr TY_(tmbstrtolower)( tmbstr s );
/* Transform ASCII chars in string to upper case */
tmbstr TY_(tmbstrtoupper)( tmbstr s );
/* Bool TY_(tmbsamefile)( ctmbstr filename1, ctmbstr filename2 ); */
int TY_(tmbvsnprintf)(tmbstr buffer, size_t count, ctmbstr format, va_list args)
#ifdef __GNUC__
__attribute__((format(printf, 3, 0)))
#endif
;
int TY_(tmbsnprintf)(tmbstr buffer, size_t count, ctmbstr format, ...)
#ifdef __GNUC__
__attribute__((format(printf, 3, 4)))
#endif
;
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* __TMBSTR_H__ */

533
src/utf8.c

@ -0,0 +1,533 @@
/* utf8.c -- convert characters to/from UTF-8
(c) 1998-2007 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
Uses public interfaces to abstract input source and output
sink, which may be user supplied or either FILE* or memory
based Tidy implementations. Encoding support is uniform
regardless of I/O mechanism.
Note, UTF-8 encoding, by itself, does not affect the actual
"codepoints" of the underlying character encoding. In the
cases of ASCII, Latin1, Unicode (16-bit, BMP), these all
refer to ISO-10646 "codepoints". For anything else, they
refer to some other "codepoint" set.
Put another way, UTF-8 is a variable-length method to
represent any non-negative integer value. The glyph
that an integer value represents is unchanged and defined
externally (e.g. by ISO-10646, Big5, Win1252, MacRoman,
Latin2-9, and so on).
Put still another way, UTF-8 is more of a _transfer_ encoding
than a _character_ encoding, per se.
*/
#include "tidy.h"
#include "forward.h"
#include "utf8.h"
/*
UTF-8 encoding/decoding functions
Return # of bytes in UTF-8 sequence; result < 0 if illegal sequence
Also see below for UTF-16 encoding/decoding functions
References :
1) UCS Transformation Format 8 (UTF-8):
ISO/IEC 10646-1:1996 Amendment 2 or ISO/IEC 10646-1:2000 Annex D
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335>
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html>
Table 4 - Mapping from UCS-4 to UTF-8
2) Unicode standards:
<http://www.unicode.org/unicode/standard/standard.html>
3) Legal UTF-8 byte sequences:
<http://www.unicode.org/unicode/uni2errata/UTF-8_Corrigendum.html>
Code point 1st byte 2nd byte 3rd byte 4th byte
---------- -------- -------- -------- --------
U+0000..U+007F 00..7F
U+0080..U+07FF C2..DF 80..BF
U+0800..U+0FFF E0 A0..BF 80..BF
U+1000..U+FFFF E1..EF 80..BF 80..BF
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also
allows for the use of five- and six-byte sequences to encode
characters that are outside the range of the Unicode character
set; those five- and six-byte sequences are illegal for the use
of UTF-8 as a transformation of Unicode characters. ISO/IEC 10646
does not allow mapping of unpaired surrogates, nor U+FFFE and U+FFFF
(but it does allow other noncharacters).
4) RFC 2279: UTF-8, a transformation format of ISO 10646:
<http://www.ietf.org/rfc/rfc2279.txt>
5) UTF-8 and Unicode FAQ:
<http://www.cl.cam.ac.uk/~mgk25/unicode.html>
6) Markus Kuhn's UTF-8 decoder stress test file:
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>
7) UTF-8 Demo:
<http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-demo.txt>
8) UTF-8 Sampler:
<http://www.columbia.edu/kermit/utf8.html>
9) Transformation Format for 16 Planes of Group 00 (UTF-16):
ISO/IEC 10646-1:1996 Amendment 1 or ISO/IEC 10646-1:2000 Annex C
<http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n2005/n2005.pdf>
<http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-16.html>
10) RFC 2781: UTF-16, an encoding of ISO 10646:
<http://www.ietf.org/rfc/rfc2781.txt>
11) UTF-16 invalid surrogate pairs:
<http://www.unicode.org/unicode/faq/utf_bom.html#16>
UTF-16 UTF-8 UCS-4
D83F DFF* F0 9F BF B* 0001FFF*
D87F DFF* F0 AF BF B* 0002FFF*
D8BF DFF* F0 BF BF B* 0003FFF*
D8FF DFF* F1 8F BF B* 0004FFF*
D93F DFF* F1 9F BF B* 0005FFF*
D97F DFF* F1 AF BF B* 0006FFF*
...
DBBF DFF* F3 BF BF B* 000FFFF*
DBFF DFF* F4 8F BF B* 0010FFF*
* = E or F
1010 A
1011 B
1100 C
1101 D
1110 E
1111 F
*/
/* Number of rows in validUTF8[] and the longest sequence covered by it. */
#define kNumUTF8Sequences 7
#define kMaxUTF8Bytes 4
/* Code points rejected by the encoder/decoder below. */
#define kUTF8ByteSwapNotAChar 0xFFFE
#define kUTF8NotAChar 0xFFFF
#define kMaxUTF8FromUCS4 0x10FFFF
#define kUTF16SurrogatesBegin 0x10000
#define kMaxUTF16FromUCS4 0x10FFFF
/* UTF-16 surrogate pair areas */
/* NOTE: in this file "low" names the range D800..DBFF and "high" the range
** DC00..DFFF; the surrogate helpers at the end of the file follow the same
** convention.
*/
#define kUTF16LowSurrogateBegin 0xD800
#define kUTF16LowSurrogateEnd 0xDBFF
#define kUTF16HighSurrogateBegin 0xDC00
#define kUTF16HighSurrogateEnd 0xDFFF
/* offsets into validUTF8 table below */
/* Indexed by (sequence length - 1): entry k is the first validUTF8[] row
** for sequences of k+1 bytes, and entry k+1 is one past the last such row.
*/
static const int offsetUTF8Sequences[kMaxUTF8Bytes + 1] =
{
0, /* 1 byte */
1, /* 2 bytes */
2, /* 3 bytes */
4, /* 4 bytes */
kNumUTF8Sequences /* must be last */
};
/* One row per legal UTF-8 form: the code point range it encodes, the
** sequence length, and for each byte position p a (min, max) pair stored at
** validBytes[2*p] and validBytes[2*p + 1].
*/
static const struct validUTF8Sequence
{
uint lowChar;
uint highChar;
int numBytes;
byte validBytes[8];
} validUTF8[kNumUTF8Sequences] =
{
/* low high #bytes byte 1 byte 2 byte 3 byte 4 */
{0x0000, 0x007F, 1, {0x00, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
{0x0080, 0x07FF, 2, {0xC2, 0xDF, 0x80, 0xBF, 0x00, 0x00, 0x00, 0x00}},
{0x0800, 0x0FFF, 3, {0xE0, 0xE0, 0xA0, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
{0x1000, 0xFFFF, 3, {0xE1, 0xEF, 0x80, 0xBF, 0x80, 0xBF, 0x00, 0x00}},
{0x10000, 0x3FFFF, 4, {0xF0, 0xF0, 0x90, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
{0x40000, 0xFFFFF, 4, {0xF1, 0xF3, 0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF}},
{0x100000, 0x10FFFF, 4, {0xF4, 0xF4, 0x80, 0x8F, 0x80, 0xBF, 0x80, 0xBF}}
};
/* Decode one UTF-8 sequence into a code point.
**
** c               receives the decoded value (written even on error)
** firstByte       lead byte of the sequence, passed separately
** successorBytes  optional buffer holding the continuation bytes
** inp             optional input source to pull continuation bytes from
**                 when successorBytes is NULL
** count           receives the number of bytes in the sequence (for an
**                 error: the number examined up to the offending byte)
**
** Returns 0 on success and -1 on any decoding error. When firstByte equals
** EndOfStream it is passed through unchanged with *count = 1 and 0 returned.
*/
int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
TidyInputSource* inp, int* count )
{
byte tempbuf[10];
byte *buf = &tempbuf[0];
uint ch = 0, n = 0;
int i, bytes = 0;
Bool hasError = no;
if ( successorBytes )
buf = (byte*) successorBytes;
/* special check if we have been passed an EOF char */
if ( firstByte == EndOfStream )
{
/* at present */
*c = firstByte;
*count = 1;
return 0;
}
/* Classify the lead byte: it fixes the sequence length and contributes
** the top payload bits. Only the low 8 bits are tested by the masks.
*/
ch = firstByte; /* first byte is passed in separately */
if (ch <= 0x7F) /* 0XXX XXXX one byte */
{
n = ch;
bytes = 1;
}
else if ((ch & 0xE0) == 0xC0) /* 110X XXXX two bytes */
{
n = ch & 31;
bytes = 2;
}
else if ((ch & 0xF0) == 0xE0) /* 1110 XXXX three bytes */
{
n = ch & 15;
bytes = 3;
}
else if ((ch & 0xF8) == 0xF0) /* 1111 0XXX four bytes */
{
n = ch & 7;
bytes = 4;
}
/* Five- and six-byte forms are still parsed so that *count reflects
** their length, but they are always reported as errors.
*/
else if ((ch & 0xFC) == 0xF8) /* 1111 10XX five bytes */
{
n = ch & 3;
bytes = 5;
hasError = yes;
}
else if ((ch & 0xFE) == 0xFC) /* 1111 110X six bytes */
{
n = ch & 1;
bytes = 6;
hasError = yes;
}
else
{
/* not a valid first byte of a UTF-8 sequence */
n = ch;
bytes = 1;
hasError = yes;
}
/* successor bytes should have the form 10XX XXXX */
/* If caller supplied buffer, use it. Else see if caller
** supplied an input source, use that.
*/
if ( successorBytes )
{
for ( i=0; i < bytes-1; ++i )
{
/* a NUL or a non-continuation byte ends the sequence early */
if ( !buf[i] || (buf[i] & 0xC0) != 0x80 )
{
hasError = yes;
bytes = i+1;
break;
}
n = (n << 6) | (buf[i] & 0x3F);
}
}
else if ( inp )
{
/* NOTE(review): if eof() turns true before all continuation bytes have
** been read, this loop ends without setting hasError and with `bytes`
** unchanged; the lowChar range check below looks like it rejects the
** resulting partial value -- confirm.
*/
for ( i=0; i < bytes-1 && !inp->eof(inp->sourceData); ++i )
{
int b = inp->getByte( inp->sourceData );
buf[i] = (tmbchar) b;
/* End of data or illegal successor byte value */
if ( b == EOF || (buf[i] & 0xC0) != 0x80 )
{
hasError = yes;
bytes = i+1;
/* push the offending byte back so the caller can re-read it */
if ( b != EOF )
inp->ungetByte( inp->sourceData, buf[i] );
break;
}
n = (n << 6) | (buf[i] & 0x3F);
}
}
else if ( bytes > 1 )
{
/* multi-byte lead but nowhere to get the continuation bytes from */
hasError = yes;
bytes = 1;
}
if (!hasError && ((n == kUTF8ByteSwapNotAChar) || (n == kUTF8NotAChar)))
hasError = yes;
if (!hasError && (n > kMaxUTF8FromUCS4))
hasError = yes;
#if 0 /* Breaks Big5 D8 - DF */
if (!hasError && (n >= kUTF16LowSurrogateBegin) && (n <= kUTF16HighSurrogateEnd))
/* unpaired surrogates not allowed */
hasError = yes;
#endif
if (!hasError)
{
int lo, hi;
/* rows of validUTF8[] that describe sequences of this length */
lo = offsetUTF8Sequences[bytes - 1];
hi = offsetUTF8Sequences[bytes] - 1;
/* check for overlong sequences */
if ((n < validUTF8[lo].lowChar) || (n > validUTF8[hi].highChar))
hasError = yes;
else
{
/* NOTE(review): hasError is cleared as soon as any single byte falls
** inside a row's range and is never set again, so this loop does not
** require every byte of the sequence to match the same row -- confirm
** whether that is intended.
*/
hasError = yes; /* assume error until proven otherwise */
for (i = lo; i <= hi; i++)
{
int tempCount;
byte theByte;
for (tempCount = 0; tempCount < bytes; tempCount++)
{
if (!tempCount)
theByte = (tmbchar) firstByte;
else
theByte = buf[tempCount - 1];
if ( theByte >= validUTF8[i].validBytes[(tempCount * 2)] &&
theByte <= validUTF8[i].validBytes[(tempCount * 2) + 1] )
hasError = no;
if (hasError)
break;
}
}
}
}
#if 1 && defined(_DEBUG)
if ( hasError )
{
/* debug */
fprintf( stderr, "UTF-8 decoding error of %d bytes : ", bytes );
fprintf( stderr, "0x%02x ", firstByte );
for (i = 1; i < bytes; i++)
fprintf( stderr, "0x%02x ", buf[i - 1] );
/* NOTE(review): "%04ulx" is parsed as "%04u" followed by the literal
** text "lx", so the value prints in decimal -- probably meant hex.
*/
fprintf( stderr, " = U+%04ulx\n", n );
}
#endif
/* outputs are stored whether or not an error was detected */
*count = bytes;
*c = n;
if ( hasError )
return -1;
return 0;
}
/* Encode code point `c` as a UTF-8 byte sequence.
**
** c          value to encode
** encodebuf  optional destination buffer; when NULL a local scratch buffer
**            is used instead
** outp       optional output sink; the bytes are forwarded to it only when
**            the encoding is valid
** count      receives the number of bytes produced (0 when c is above
**            0x7FFFFFFF)
**
** Returns 0 on success and -1 on error. Errors are: U+FFFE, U+FFFF, any
** value above 0x10FFFF (five- and six-byte forms are still generated into
** the buffer, but flagged). Note that on error the bytes are still stored
** in encodebuf; they are only withheld from outp.
*/
int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
                                TidyOutputSink* outp, int* count )
{
    byte tempbuf[10] = {0};
    byte* buf = &tempbuf[0];
    int bytes = 0;
    Bool hasError = no;

    if ( encodebuf )
        buf = (byte*) encodebuf;

    if (c <= 0x7F)  /* 0XXX XXXX one byte */
    {
        buf[0] = (tmbchar) c;
        bytes = 1;
    }
    else if (c <= 0x7FF)  /* 110X XXXX two bytes */
    {
        buf[0] = (tmbchar) ( 0xC0 | (c >> 6) );
        buf[1] = (tmbchar) ( 0x80 | (c & 0x3F) );
        bytes = 2;
    }
    else if (c <= 0xFFFF)  /* 1110 XXXX three bytes */
    {
        buf[0] = (tmbchar) (0xE0 | (c >> 12));
        buf[1] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[2] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 3;
        if ( c == kUTF8ByteSwapNotAChar || c == kUTF8NotAChar )
            hasError = yes;
#if 0 /* Breaks Big5 D8 - DF */
        else if ( c >= kUTF16LowSurrogateBegin && c <= kUTF16HighSurrogateEnd )
            /* unpaired surrogates not allowed */
            hasError = yes;
#endif
    }
    else if (c <= 0x1FFFFF)  /* 1111 0XXX four bytes */
    {
        buf[0] = (tmbchar) (0xF0 | (c >> 18));
        buf[1] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[3] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 4;
        if (c > kMaxUTF8FromUCS4)
            hasError = yes;
    }
    else if (c <= 0x3FFFFFF)  /* 1111 10XX five bytes */
    {
        buf[0] = (tmbchar) (0xF8 | (c >> 24));
        /* mask to six payload bits like every other continuation byte */
        buf[1] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[4] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 5;
        hasError = yes;
    }
    else if (c <= 0x7FFFFFFF)  /* 1111 110X six bytes */
    {
        buf[0] = (tmbchar) (0xFC | (c >> 30));
        buf[1] = (tmbchar) (0x80 | ((c >> 24) & 0x3F));
        buf[2] = (tmbchar) (0x80 | ((c >> 18) & 0x3F));
        buf[3] = (tmbchar) (0x80 | ((c >> 12) & 0x3F));
        buf[4] = (tmbchar) (0x80 | ((c >> 6) & 0x3F));
        buf[5] = (tmbchar) (0x80 | (c & 0x3F));
        bytes = 6;
        hasError = yes;
    }
    else
        hasError = yes;

    /* don't output invalid UTF-8 byte sequence to a stream */
    if ( !hasError && outp != NULL )
    {
        int ix;
        for ( ix=0; ix < bytes; ++ix )
            outp->putByte( outp->sinkData, buf[ix] );
    }

#if 1 && defined(_DEBUG)
    if ( hasError )
    {
        int i;
        fprintf( stderr, "UTF-8 encoding error for U+%x : ", c );
        for (i = 0; i < bytes; i++)
            fprintf( stderr, "0x%02x ", buf[i] );
        fprintf( stderr, "\n" );
    }
#endif

    *count = bytes;
    if (hasError)
        return -1;
    return 0;
}
/* Decode the UTF-8 sequence that starts at `str`.
**
** str  points to the UTF-8 byte sequence
** ch   receives the decoded character, or U+FFFD (replacement character)
**      when the sequence cannot be decoded
**
** Returns one less than the number of bytes used by the sequence, so a
** caller that advances by one per iteration can add the result to skip
** the continuation bytes.
*/
uint TY_(GetUTF8)( ctmbstr str, uint *ch )
{
    uint n;
    int bytes = 0;
    int err;

    /* first byte "str[0]" is passed in separately from the */
    /* rest of the UTF-8 byte sequence starting at "str[1]" */
    /* The cast to byte keeps the lead byte in 0..255 even where plain char
    ** is signed; without it a byte >= 0x80 is sign-extended when converted
    ** to uint (0xFF would become 0xFFFFFFFF, which presumably collides
    ** with the EndOfStream sentinel -- confirm its value).
    */
    err = TY_(DecodeUTF8BytesToChar)( &n, (byte) str[0], str+1, NULL, &bytes );
    if (err)
    {
#if 1 && defined(_DEBUG)
        fprintf(stderr, "pprint UTF-8 decoding error for U+%x : ", n);
#endif
        n = 0xFFFD; /* replacement char */
    }
    *ch = n;
    return bytes - 1;
}
/* Store `c` at `buf` as a UTF-8 byte sequence and return the position just
** past the bytes written. When `c` cannot be encoded, the three-byte
** encoding of U+FFFD (replacement character) is stored instead.
*/
tmbstr TY_(PutUTF8)( tmbstr buf, uint c )
{
    int nbytes = 0;

    if ( TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &nbytes ) != 0 )
    {
#if 1 && defined(_DEBUG)
        fprintf(stderr, "pprint UTF-8 encoding error for U+%x : ", c);
#endif
        /* replacement char 0xFFFD encoded as UTF-8 */
        buf[0] = (byte) 0xEF;
        buf[1] = (byte) 0xBF;
        buf[2] = (byte) 0xBD;
        nbytes = 3;
    }
    return buf + nbytes;
}
/* yes when ucs4 does not exceed kMaxUTF16FromUCS4 */
Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 )
{
    if ( ucs4 > kMaxUTF16FromUCS4 )
        return no;
    return yes;
}
/* yes when ch lies in [kUTF16HighSurrogateBegin, kUTF16HighSurrogateEnd] */
Bool TY_(IsHighSurrogate)( tchar ch )
{
    if ( ch < kUTF16HighSurrogateBegin )
        return no;
    return ( ch <= kUTF16HighSurrogateEnd ) ? yes : no;
}
/* yes when ch lies in [kUTF16LowSurrogateBegin, kUTF16LowSurrogateEnd] */
Bool TY_(IsLowSurrogate)( tchar ch )
{
    if ( ch < kUTF16LowSurrogateBegin )
        return no;
    return ( ch <= kUTF16LowSurrogateEnd ) ? yes : no;
}
/* Merge a surrogate pair into a single code point at or above 0x10000.
**
** The offset of `low` within its range supplies the bits above the
** lowest ten, the offset of `high` supplies the lowest ten bits.
** NOTE(review): this means `low` is the unit scaled by 0x400 -- confirm
** against the kUTF16*Surrogate* constant definitions that this matches
** the naming used for the two ranges.
*/
tchar TY_(CombineSurrogatePair)( tchar high, tchar low )
{
    tchar scaledPart;
    tchar offsetPart;

    assert( TY_(IsHighSurrogate)(high) && TY_(IsLowSurrogate)(low) );

    scaledPart = (low - kUTF16LowSurrogateBegin) * 0x400;
    offsetPart = high - kUTF16HighSurrogateBegin;
    return scaledPart + offsetPart + 0x10000;
}
/* Split a code point into two surrogate units.
**
** Succeeds (returns yes) only when utf16 passes IsValidCombinedChar and
** both output pointers are non-NULL; otherwise nothing is written and
** no is returned.  *low receives the quotient part and *high the
** remainder part of the offset from kUTF16SurrogatesBegin.
*/
Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high )
{
    tchar offset;

    if ( !TY_(IsValidCombinedChar)( utf16 ) || !high || !low )
        return no;

    offset = utf16 - kUTF16SurrogatesBegin;
    *low  = offset / 0x400 + kUTF16LowSurrogateBegin;
    *high = offset % 0x400 + kUTF16HighSurrogateBegin;
    return yes;
}
/* yes when ch is at or above kUTF16SurrogatesBegin and its low 16 bits
** are neither 0xFFFE nor 0xFFFF.
*/
Bool TY_(IsValidCombinedChar)( tchar ch )
{
    tchar low16 = ch & 0x0000FFFF;

    if ( ch < kUTF16SurrogatesBegin )
        return no;
    if ( low16 == 0x0000FFFE || low16 == 0x0000FFFF )
        return no;
    return yes;
}
/* yes when ch is at or above kUTF16SurrogatesBegin */
Bool TY_(IsCombinedChar)( tchar ch )
{
    if ( ch < kUTF16SurrogatesBegin )
        return no;
    return yes;
}
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

46
src/utf8.h

@ -0,0 +1,46 @@
#ifndef __UTF8_H__
#define __UTF8_H__
/* utf8.h -- convert characters to/from UTF-8
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
#include "tidyplatform.h"
#include "tidybuffio.h"

/* UTF-8 encoding/decoding support
** Does not convert character "codepoints", i.e. to/from 10646.
*/

/* Decode one UTF-8 sequence.  The lead byte is passed separately from
** the successor bytes.  A non-zero return value signals a decoding
** error; *count receives the byte count reported for the sequence.
** NOTE(review): inp presumably is an alternative byte source used when
** successorBytes is NULL -- confirm against the definition.
*/
int TY_(DecodeUTF8BytesToChar)( uint* c, uint firstByte, ctmbstr successorBytes,
                                TidyInputSource* inp, int* count );

/* Encode c as UTF-8.  Returns 0 on success and -1 on error; *count
** receives the length of the byte sequence.  When outp is non-NULL the
** bytes are written to it, but only if no error occurred.
*/
int TY_(EncodeCharToUTF8Bytes)( uint c, tmbstr encodebuf,
                                TidyOutputSink* outp, int* count );

/* Decode the sequence at str into *ch (U+FFFD on error); returns one
** less than the number of bytes used by the sequence.
*/
uint  TY_(GetUTF8)( ctmbstr str, uint *ch );

/* Store c as UTF-8 at buf (U+FFFD on error); returns a pointer just
** past the bytes written.
*/
tmbstr TY_(PutUTF8)( tmbstr buf, uint c );

#define UNICODE_BOM_BE   0xFEFF   /* big-endian (default) UNICODE BOM */
#define UNICODE_BOM      UNICODE_BOM_BE
#define UNICODE_BOM_LE   0xFFFE   /* little-endian UNICODE BOM */
#define UNICODE_BOM_UTF8 0xEFBBBF /* UTF-8 UNICODE BOM */

Bool TY_(IsValidUTF16FromUCS4)( tchar ucs4 );
Bool TY_(IsHighSurrogate)( tchar ch );
Bool TY_(IsLowSurrogate)( tchar ch );
Bool TY_(IsCombinedChar)( tchar ch );
Bool TY_(IsValidCombinedChar)( tchar ch );
tchar TY_(CombineSurrogatePair)( tchar high, tchar low );

/* Parameter order matches the definition in utf8.c: the second argument
** receives the "low" unit and the third the "high" unit.
*/
Bool TY_(SplitSurrogatePair)( tchar utf16, tchar* low, tchar* high );

#endif /* __UTF8_H__ */

23
src/version.h

@ -0,0 +1,23 @@
/* version information
(c) 2007-2015 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/* Release date: taken from the build system when it supplies
** RELEASE_DATE, otherwise the built-in default.
*/
#if !defined(RELEASE_DATE)
static const char TY_(release_date)[] = "2015/01/22";
#else
static const char TY_(release_date)[] = RELEASE_DATE;
#endif

/* Library version: LIBTIDY_VERSION (with ".RC_NUMBER" appended when
** RC_NUMBER is also defined), otherwise the built-in default.
*/
#if !defined(LIBTIDY_VERSION)
static const char TY_(library_version)[] = "5.0.0";
#elif defined(RC_NUMBER)
static const char TY_(library_version)[] = LIBTIDY_VERSION "." RC_NUMBER;
#else
static const char TY_(library_version)[] = LIBTIDY_VERSION;
#endif
/* eof */

794
src/win32tc.c

@ -0,0 +1,794 @@
/* win32tc.c -- Interface to Win32 transcoding routines
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/* keep these here to keep file non-empty */
#include "tidy.h"
#include "forward.h"
#include "streamio.h"
#include "tmbstr.h"
#include "utf8.h"
#ifdef TIDY_WIN32_MLANG_SUPPORT
#define VC_EXTRALEAN
#define CINTERFACE
#define COBJMACROS
#include <windows.h>
#include <mlang.h>
#undef COBJMACROS
#undef CINTERFACE
#undef VC_EXTRALEAN
/* maximum number of bytes for a single character */
#define TC_INBUFSIZE 16
/* maximum number of characters per byte sequence */
#define TC_OUTBUFSIZE 16
/* Create an MLang charset converter and store the interface pointer in p.
** Expands to a single CoCreateInstance() call expression yielding an
** HRESULT.  The expansion deliberately has no trailing semicolon, so the
** macro behaves like a function call at every use site, and the argument
** is parenthesised so any lvalue expression can be passed.
*/
#define CreateMLangObject(p) \
    CoCreateInstance( \
        &CLSID_CMLangConvertCharset, \
        NULL, \
        CLSCTX_ALL, \
        &IID_IMLangConvertCharset, \
        (VOID **)&(p))
/* Character Set to Microsoft Windows Codepage Identifier map, */
/* from <rotor/sscli/clr/src/classlibnative/nls/encodingdata.cpp>. */
/* note: the 'safe' field indicates whether this encoding can be */
/* read/written character-by-character; this does not apply to */
/* various stateful encodings such as ISO-2022 or UTF-7, these */
/* must be read/written as a complete stream. It is possible that */
/* some 'unsafe' encodings are marked as 'safe'. */
/* todo: cleanup; Tidy should use only a single mapping table to */
/* circumvent unsupported aliases in other transcoding libraries, */
/* enable reverse lookup of encoding names and ease maintenance. */
/* All names are stored in lower case: the lookup routine lower-cases */
/* the requested name and compares with tmbstrcmp. The table ends */
/* with an entry whose name is NULL. */
static struct _nameWinCPMap
{
tmbstr name; /* character set name or alias, lower case */
uint wincp; /* Windows codepage identifier */
Bool safe; /* yes if usable character-by-character (see note above) */
} const NameWinCPMap[] = {
{ "cp037", 37, yes },
{ "csibm037", 37, yes },
{ "ebcdic-cp-ca", 37, yes },
{ "ebcdic-cp-nl", 37, yes },
{ "ebcdic-cp-us", 37, yes },
{ "ebcdic-cp-wt", 37, yes },
{ "ibm037", 37, yes },
{ "cp437", 437, yes },
{ "cspc8codepage437", 437, yes },
{ "ibm437", 437, yes },
{ "cp500", 500, yes },
{ "csibm500", 500, yes },
{ "ebcdic-cp-be", 500, yes },
{ "ebcdic-cp-ch", 500, yes },
{ "ibm500", 500, yes },
{ "asmo-708", 708, yes },
{ "dos-720", 720, yes },
{ "ibm737", 737, yes },
{ "ibm775", 775, yes },
{ "cp850", 850, yes },
{ "ibm850", 850, yes },
{ "cp852", 852, yes },
{ "ibm852", 852, yes },
{ "cp855", 855, yes },
{ "ibm855", 855, yes },
{ "cp857", 857, yes },
{ "ibm857", 857, yes },
{ "ccsid00858", 858, yes },
{ "cp00858", 858, yes },
{ "cp858", 858, yes },
{ "ibm00858", 858, yes },
{ "pc-multilingual-850+euro", 858, yes },
{ "cp860", 860, yes },
{ "ibm860", 860, yes },
{ "cp861", 861, yes },
{ "ibm861", 861, yes },
{ "cp862", 862, yes },
{ "dos-862", 862, yes },
{ "ibm862", 862, yes },
{ "cp863", 863, yes },
{ "ibm863", 863, yes },
{ "cp864", 864, yes },
{ "ibm864", 864, yes },
{ "cp865", 865, yes },
{ "ibm865", 865, yes },
{ "cp866", 866, yes },
{ "ibm866", 866, yes },
{ "cp869", 869, yes },
{ "ibm869", 869, yes },
{ "cp870", 870, yes },
{ "csibm870", 870, yes },
{ "ebcdic-cp-roece", 870, yes },
{ "ebcdic-cp-yu", 870, yes },
{ "ibm870", 870, yes },
{ "dos-874", 874, yes },
{ "iso-8859-11", 874, yes },
{ "tis-620", 874, yes },
{ "windows-874", 874, yes },
{ "cp875", 875, yes },
{ "csshiftjis", 932, yes },
{ "cswindows31j", 932, yes },
{ "ms_kanji", 932, yes },
{ "shift-jis", 932, yes },
{ "shift_jis", 932, yes },
{ "sjis", 932, yes },
{ "x-ms-cp932", 932, yes },
{ "x-sjis", 932, yes },
{ "chinese", 936, yes },
{ "cn-gb", 936, yes },
{ "csgb2312", 936, yes },
{ "csgb231280", 936, yes },
{ "csiso58gb231280", 936, yes },
{ "gb2312", 936, yes },
{ "gb2312-80", 936, yes },
{ "gb231280", 936, yes },
{ "gb_2312-80", 936, yes },
{ "gbk", 936, yes },
{ "iso-ir-58", 936, yes },
{ "csksc56011987", 949, yes },
{ "iso-ir-149", 949, yes },
{ "korean", 949, yes },
{ "ks-c-5601", 949, yes },
{ "ks-c5601", 949, yes },
{ "ks_c_5601", 949, yes },
{ "ks_c_5601-1987", 949, yes },
{ "ks_c_5601-1989", 949, yes },
{ "ks_c_5601_1987", 949, yes },
{ "ksc5601", 949, yes },
{ "ksc_5601", 949, yes },
{ "big5", 950, yes },
{ "big5-hkscs", 950, yes },
{ "cn-big5", 950, yes },
{ "csbig5", 950, yes },
{ "x-x-big5", 950, yes },
{ "cp1026", 1026, yes },
{ "csibm1026", 1026, yes },
{ "ibm1026", 1026, yes },
{ "ibm01047", 1047, yes },
{ "ccsid01140", 1140, yes },
{ "cp01140", 1140, yes },
{ "ebcdic-us-37+euro", 1140, yes },
{ "ibm01140", 1140, yes },
{ "ccsid01141", 1141, yes },
{ "cp01141", 1141, yes },
{ "ebcdic-de-273+euro", 1141, yes },
{ "ibm01141", 1141, yes },
{ "ccsid01142", 1142, yes },
{ "cp01142", 1142, yes },
{ "ebcdic-dk-277+euro", 1142, yes },
{ "ebcdic-no-277+euro", 1142, yes },
{ "ibm01142", 1142, yes },
{ "ccsid01143", 1143, yes },
{ "cp01143", 1143, yes },
{ "ebcdic-fi-278+euro", 1143, yes },
{ "ebcdic-se-278+euro", 1143, yes },
{ "ibm01143", 1143, yes },
{ "ccsid01144", 1144, yes },
{ "cp01144", 1144, yes },
{ "ebcdic-it-280+euro", 1144, yes },
{ "ibm01144", 1144, yes },
{ "ccsid01145", 1145, yes },
{ "cp01145", 1145, yes },
{ "ebcdic-es-284+euro", 1145, yes },
{ "ibm01145", 1145, yes },
{ "ccsid01146", 1146, yes },
{ "cp01146", 1146, yes },
{ "ebcdic-gb-285+euro", 1146, yes },
{ "ibm01146", 1146, yes },
{ "ccsid01147", 1147, yes },
{ "cp01147", 1147, yes },
{ "ebcdic-fr-297+euro", 1147, yes },
{ "ibm01147", 1147, yes },
{ "ccsid01148", 1148, yes },
{ "cp01148", 1148, yes },
{ "ebcdic-international-500+euro", 1148, yes },
{ "ibm01148", 1148, yes },
{ "ccsid01149", 1149, yes },
{ "cp01149", 1149, yes },
{ "ebcdic-is-871+euro", 1149, yes },
{ "ibm01149", 1149, yes },
{ "iso-10646-ucs-2", 1200, yes },
{ "ucs-2", 1200, yes },
{ "unicode", 1200, yes },
{ "utf-16", 1200, yes },
{ "utf-16le", 1200, yes },
{ "unicodefffe", 1201, yes },
{ "utf-16be", 1201, yes },
{ "windows-1250", 1250, yes },
{ "x-cp1250", 1250, yes },
{ "windows-1251", 1251, yes },
{ "x-cp1251", 1251, yes },
{ "windows-1252", 1252, yes },
{ "x-ansi", 1252, yes },
{ "windows-1253", 1253, yes },
{ "windows-1254", 1254, yes },
{ "windows-1255", 1255, yes },
{ "cp1256", 1256, yes },
{ "windows-1256", 1256, yes },
{ "windows-1257", 1257, yes },
{ "windows-1258", 1258, yes },
{ "johab", 1361, yes },
{ "macintosh", 10000, yes },
{ "x-mac-japanese", 10001, yes },
{ "x-mac-chinesetrad", 10002, yes },
{ "x-mac-korean", 10003, yes },
{ "x-mac-arabic", 10004, yes },
{ "x-mac-hebrew", 10005, yes },
{ "x-mac-greek", 10006, yes },
{ "x-mac-cyrillic", 10007, yes },
{ "x-mac-chinesesimp", 10008, yes },
{ "x-mac-romanian", 10010, yes },
{ "x-mac-ukrainian", 10017, yes },
{ "x-mac-thai", 10021, yes },
{ "x-mac-ce", 10029, yes },
{ "x-mac-icelandic", 10079, yes },
{ "x-mac-turkish", 10081, yes },
{ "x-mac-croatian", 10082, yes },
{ "x-chinese-cns", 20000, yes },
{ "x-cp20001", 20001, yes },
{ "x-chinese-eten", 20002, yes },
{ "x-cp20003", 20003, yes },
{ "x-cp20004", 20004, yes },
{ "x-cp20005", 20005, yes },
{ "irv", 20105, yes },
{ "x-ia5", 20105, yes },
{ "din_66003", 20106, yes },
{ "german", 20106, yes },
{ "x-ia5-german", 20106, yes },
{ "sen_850200_b", 20107, yes },
{ "swedish", 20107, yes },
{ "x-ia5-swedish", 20107, yes },
{ "norwegian", 20108, yes },
{ "ns_4551-1", 20108, yes },
{ "x-ia5-norwegian", 20108, yes },
{ "ansi_x3.4-1968", 20127, yes },
{ "ansi_x3.4-1986", 20127, yes },
{ "ascii", 20127, yes },
{ "cp367", 20127, yes },
{ "csascii", 20127, yes },
{ "ibm367", 20127, yes },
{ "iso-ir-6", 20127, yes },
{ "iso646-us", 20127, yes },
{ "iso_646.irv:1991", 20127, yes },
{ "us", 20127, yes },
{ "us-ascii", 20127, yes },
{ "x-cp20261", 20261, yes },
{ "x-cp20269", 20269, yes },
{ "cp273", 20273, yes },
{ "csibm273", 20273, yes },
{ "ibm273", 20273, yes },
{ "csibm277", 20277, yes },
{ "ebcdic-cp-dk", 20277, yes },
{ "ebcdic-cp-no", 20277, yes },
{ "ibm277", 20277, yes },
{ "cp278", 20278, yes },
{ "csibm278", 20278, yes },
{ "ebcdic-cp-fi", 20278, yes },
{ "ebcdic-cp-se", 20278, yes },
{ "ibm278", 20278, yes },
{ "cp280", 20280, yes },
{ "csibm280", 20280, yes },
{ "ebcdic-cp-it", 20280, yes },
{ "ibm280", 20280, yes },
{ "cp284", 20284, yes },
{ "csibm284", 20284, yes },
{ "ebcdic-cp-es", 20284, yes },
{ "ibm284", 20284, yes },
{ "cp285", 20285, yes },
{ "csibm285", 20285, yes },
{ "ebcdic-cp-gb", 20285, yes },
{ "ibm285", 20285, yes },
{ "cp290", 20290, yes },
{ "csibm290", 20290, yes },
{ "ebcdic-jp-kana", 20290, yes },
{ "ibm290", 20290, yes },
{ "cp297", 20297, yes },
{ "csibm297", 20297, yes },
{ "ebcdic-cp-fr", 20297, yes },
{ "ibm297", 20297, yes },
{ "cp420", 20420, yes },
{ "csibm420", 20420, yes },
{ "ebcdic-cp-ar1", 20420, yes },
{ "ibm420", 20420, yes },
{ "cp423", 20423, yes },
{ "csibm423", 20423, yes },
{ "ebcdic-cp-gr", 20423, yes },
{ "ibm423", 20423, yes },
{ "cp424", 20424, yes },
{ "csibm424", 20424, yes },
{ "ebcdic-cp-he", 20424, yes },
{ "ibm424", 20424, yes },
{ "x-ebcdic-koreanextended", 20833, yes },
{ "csibmthai", 20838, yes },
{ "ibm-thai", 20838, yes },
{ "cskoi8r", 20866, yes },
{ "koi", 20866, yes },
{ "koi8", 20866, yes },
{ "koi8-r", 20866, yes },
{ "koi8r", 20866, yes },
{ "cp871", 20871, yes },
{ "csibm871", 20871, yes },
{ "ebcdic-cp-is", 20871, yes },
{ "ibm871", 20871, yes },
{ "cp880", 20880, yes },
{ "csibm880", 20880, yes },
{ "ebcdic-cyrillic", 20880, yes },
{ "ibm880", 20880, yes },
{ "cp905", 20905, yes },
{ "csibm905", 20905, yes },
{ "ebcdic-cp-tr", 20905, yes },
{ "ibm905", 20905, yes },
{ "ccsid00924", 20924, yes },
{ "cp00924", 20924, yes },
{ "ebcdic-latin9--euro", 20924, yes },
{ "ibm00924", 20924, yes },
{ "x-cp20936", 20936, yes },
{ "x-cp20949", 20949, yes },
{ "cp1025", 21025, yes },
{ "x-cp21027", 21027, yes },
{ "koi8-ru", 21866, yes },
{ "koi8-u", 21866, yes },
{ "cp819", 28591, yes },
{ "csisolatin1", 28591, yes },
{ "ibm819", 28591, yes },
{ "iso-8859-1", 28591, yes },
{ "iso-ir-100", 28591, yes },
{ "iso8859-1", 28591, yes },
{ "iso_8859-1", 28591, yes },
{ "iso_8859-1:1987", 28591, yes },
{ "l1", 28591, yes },
{ "latin1", 28591, yes },
{ "csisolatin2", 28592, yes },
{ "iso-8859-2", 28592, yes },
{ "iso-ir-101", 28592, yes },
{ "iso8859-2", 28592, yes },
{ "iso_8859-2", 28592, yes },
{ "iso_8859-2:1987", 28592, yes },
{ "l2", 28592, yes },
{ "latin2", 28592, yes },
{ "csisolatin3", 28593, yes },
{ "iso-8859-3", 28593, yes },
{ "iso-ir-109", 28593, yes },
{ "iso_8859-3", 28593, yes },
{ "iso_8859-3:1988", 28593, yes },
{ "l3", 28593, yes },
{ "latin3", 28593, yes },
{ "csisolatin4", 28594, yes },
{ "iso-8859-4", 28594, yes },
{ "iso-ir-110", 28594, yes },
{ "iso_8859-4", 28594, yes },
{ "iso_8859-4:1988", 28594, yes },
{ "l4", 28594, yes },
{ "latin4", 28594, yes },
{ "csisolatincyrillic", 28595, yes },
{ "cyrillic", 28595, yes },
{ "iso-8859-5", 28595, yes },
{ "iso-ir-144", 28595, yes },
{ "iso_8859-5", 28595, yes },
{ "iso_8859-5:1988", 28595, yes },
{ "arabic", 28596, yes },
{ "csisolatinarabic", 28596, yes },
{ "ecma-114", 28596, yes },
{ "iso-8859-6", 28596, yes },
{ "iso-ir-127", 28596, yes },
{ "iso_8859-6", 28596, yes },
{ "iso_8859-6:1987", 28596, yes },
{ "csisolatingreek", 28597, yes },
{ "ecma-118", 28597, yes },
{ "elot_928", 28597, yes },
{ "greek", 28597, yes },
{ "greek8", 28597, yes },
{ "iso-8859-7", 28597, yes },
{ "iso-ir-126", 28597, yes },
{ "iso_8859-7", 28597, yes },
{ "iso_8859-7:1987", 28597, yes },
{ "csisolatinhebrew", 28598, yes },
{ "hebrew", 28598, yes },
{ "iso-8859-8", 28598, yes },
{ "iso-ir-138", 28598, yes },
{ "iso_8859-8", 28598, yes },
{ "iso_8859-8:1988", 28598, yes },
{ "logical", 28598, yes },
{ "visual", 28598, yes },
{ "csisolatin5", 28599, yes },
{ "iso-8859-9", 28599, yes },
{ "iso-ir-148", 28599, yes },
{ "iso_8859-9", 28599, yes },
{ "iso_8859-9:1989", 28599, yes },
{ "l5", 28599, yes },
{ "latin5", 28599, yes },
{ "iso-8859-13", 28603, yes },
{ "csisolatin9", 28605, yes },
{ "iso-8859-15", 28605, yes },
{ "iso_8859-15", 28605, yes },
{ "l9", 28605, yes },
{ "latin9", 28605, yes },
{ "x-europa", 29001, yes },
{ "iso-8859-8-i", 38598, yes },
{ "iso-2022-jp", 50220, no },
{ "csiso2022jp", 50221, no },
{ "csiso2022kr", 50225, no },
{ "iso-2022-kr", 50225, no },
{ "iso-2022-kr-7", 50225, no },
{ "iso-2022-kr-7bit", 50225, no },
{ "cp50227", 50227, no },
{ "x-cp50227", 50227, no },
{ "cp930", 50930, yes },
{ "x-ebcdic-japaneseanduscanada", 50931, yes },
{ "cp933", 50933, yes },
{ "cp935", 50935, yes },
{ "cp937", 50937, yes },
{ "cp939", 50939, yes },
{ "cseucpkdfmtjapanese", 51932, yes },
{ "euc-jp", 51932, yes },
{ "extended_unix_code_packed_format_for_japanese", 51932, yes },
{ "iso-2022-jpeuc", 51932, yes },
{ "x-euc", 51932, yes },
{ "x-euc-jp", 51932, yes },
{ "euc-cn", 51936, yes },
{ "x-euc-cn", 51936, yes },
{ "cseuckr", 51949, yes },
{ "euc-kr", 51949, yes },
{ "iso-2022-kr-8", 51949, yes },
{ "iso-2022-kr-8bit", 51949, yes },
{ "hz-gb-2312", 52936, no },
{ "gb18030", 54936, yes },
{ "x-iscii-de", 57002, yes },
{ "x-iscii-be", 57003, yes },
{ "x-iscii-ta", 57004, yes },
{ "x-iscii-te", 57005, yes },
{ "x-iscii-as", 57006, yes },
{ "x-iscii-or", 57007, yes },
{ "x-iscii-ka", 57008, yes },
{ "x-iscii-ma", 57009, yes },
{ "x-iscii-gu", 57010, yes },
{ "x-iscii-pa", 57011, yes },
{ "csunicode11utf7", 65000, no },
{ "unicode-1-1-utf-7", 65000, no },
{ "unicode-2-0-utf-7", 65000, no },
{ "utf-7", 65000, no },
{ "x-unicode-1-1-utf-7", 65000, no },
{ "x-unicode-2-0-utf-7", 65000, no },
{ "unicode-1-1-utf-8", 65001, yes },
{ "unicode-2-0-utf-8", 65001, yes },
{ "utf-8", 65001, yes },
{ "x-unicode-1-1-utf-8", 65001, yes },
{ "x-unicode-2-0-utf-8", 65001, yes },
/* final entry */
{ NULL, 0, no }
};
/* Map a character set name to a Windows codepage identifier.
**
** The name is compared case-insensitively (a lower-cased copy is
** matched against NameWinCPMap).  Returns the codepage on success and
** 0 when the name is unknown, the encoding is marked unsafe, or MLang
** cannot create/initialise a converter from that codepage to 1200.
*/
uint TY_(Win32MLangGetCPFromName)(TidyAllocator *allocator, ctmbstr encoding)
{
    uint i;
    tmbstr enc;

    /* ensure name is in lower case */
    enc = TY_(tmbstrdup)(allocator,encoding);
    enc = TY_(tmbstrtolower)(enc);

    for (i = 0; NameWinCPMap[i].name; ++i)
    {
        if (TY_(tmbstrcmp)(NameWinCPMap[i].name, enc) == 0)
        {
            IMLangConvertCharset * p = NULL;
            uint wincp = NameWinCPMap[i].wincp;
            HRESULT hr;
            HRESULT hrInit;

            TidyFree(allocator, enc);

            /* currently no support for unsafe encodings */
            if (!NameWinCPMap[i].safe)
                return 0;

            /* hack for config.c */
            /* Remember whether this call actually initialised COM: only */
            /* a successful CoInitialize (S_OK or S_FALSE) may be paired */
            /* with CoUninitialize.  If it failed we still try to create */
            /* the object, which works when the thread already has COM */
            /* initialised in a different mode. */
            hrInit = CoInitialize(NULL);

            hr = CreateMLangObject(p);
            if (hr != S_OK || !p)
            {
                wincp = 0;
            }
            else
            {
                /* probe only: check that the converter accepts the codepage */
                hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
                if (hr != S_OK)
                    wincp = 0;
                IMLangConvertCharset_Release(p);
                p = NULL;
            }

            if (SUCCEEDED(hrInit))
                CoUninitialize();

            return wincp;
        }
    }

    TidyFree(allocator, enc);
    return 0;
}
/* Attach an MLang converter (codepage wincp -> 1200) to the input stream.
**
** On success the converter is stored in in->mlang and yes is returned.
** On any failure no is returned and in->mlang is left untouched.
**
** NOTE(review): CoInitialize is called on every path, including the
** failing ones, and is only balanced by the CoUninitialize call in
** Win32MLangUninitInputTranscoder.  Whether callers invoke that after a
** failed init is not visible here -- confirm before changing the pairing.
*/
Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp)
{
    IMLangConvertCharset * p = NULL;
    HRESULT hr;

    assert( in != NULL );

    CoInitialize(NULL);

    if (wincp == 0)
    {
        /* no codepage found for this encoding */
        return no;
    }

    hr = CreateMLangObject(p);
    if (hr != S_OK || !p)
    {
        /* MLang not supported */
        return no;
    }

    hr = IMLangConvertCharset_Initialize(p, wincp, 1200, 0);
    if (hr != S_OK)
    {
        /* encoding not supported, insufficient memory, etc. */
        /* the object is never handed to the stream, so release it here */
        IMLangConvertCharset_Release(p);
        return no;
    }

    in->mlang = p;
    return yes;
}
/* Release the MLang converter attached to the input stream, if any,
** clear in->mlang, and call CoUninitialize (unconditionally).
*/
void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in)
{
    IMLangConvertCharset * transcoder;

    assert( in != NULL );

    transcoder = (IMLangConvertCharset *)in->mlang;
    if (transcoder != NULL)
    {
        IMLangConvertCharset_Release(transcoder);
        in->mlang = NULL;
    }

    CoUninitialize();
}
#if 0
/* Attach an MLang converter (1200 -> codepage of `encoding`) to the
** output stream.  Returns yes and stores the converter in out->mlang on
** success; returns no otherwise.  (This code is currently compiled out
** by the surrounding #if 0.)
*/
Bool Win32MLangInitOutputTranscoder(TidyAllocator *allocator, StreamOut * out, tmbstr encoding)
{
    IMLangConvertCharset * p = NULL;
    HRESULT hr;
    uint wincp;

    assert( out != NULL );

    CoInitialize(NULL);

    wincp = TY_(Win32MLangGetCPFromName)(allocator, encoding);
    if (wincp == 0)
    {
        /* no codepage found for this encoding */
        return no;
    }

    hr = CreateMLangObject(p);
    if (hr != S_OK || !p)
    {
        /* MLang not supported */
        return no;
    }

    /* capture the result: the check below must test this call, not the */
    /* earlier object creation */
    hr = IMLangConvertCharset_Initialize(p, 1200, wincp, MLCONVCHARF_NOBESTFITCHARS);
    if (hr != S_OK)
    {
        /* encoding not supported, insufficient memory, etc. */
        /* the object is never handed to the stream, so release it here */
        IMLangConvertCharset_Release(p);
        return no;
    }

    out->mlang = p;
    return yes;
}
/* Release the MLang converter attached to the output stream, if any,
** clear out->mlang, and call CoUninitialize (unconditionally).
*/
void Win32MLangUninitOutputTranscoder(StreamOut * out)
{
    IMLangConvertCharset * transcoder;

    assert( out != NULL );

    transcoder = (IMLangConvertCharset *)out->mlang;
    if (transcoder != NULL)
    {
        IMLangConvertCharset_Release(transcoder);
        out->mlang = NULL;
    }

    CoUninitialize();
}
#endif
/* Read one character from the stream through the MLang converter.
**
** firstByte is the byte the caller has already read.  Further bytes are
** pulled from in->source one at a time until the converter yields one
** UTF-16 unit (returned as is) or two units (combined into a single
** code point).  *bytesRead is set on every return path.  Returns
** EndOfStream when the source runs dry or when no character emerges
** within TC_INBUFSIZE bytes.
*/
int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead)
{
    IMLangConvertCharset * p;
    TidyInputSource * source;
    CHAR inbuf[TC_INBUFSIZE] = { 0 };
    WCHAR outbuf[TC_OUTBUFSIZE] = { 0 };
    HRESULT hr = S_OK;
    size_t inbufsize = 0;

    assert( in != NULL );
    assert( bytesRead != NULL );
    assert( in->mlang != NULL );

    p = (IMLangConvertCharset *)in->mlang;
    source = &in->source;

    inbuf[inbufsize++] = (CHAR)firstByte;

    while(inbufsize < TC_INBUFSIZE)
    {
        UINT outbufsize = TC_OUTBUFSIZE;
        UINT readNow = inbufsize;
        int nextByte = EndOfStream;

        hr = IMLangConvertCharset_DoConversionToUnicode(p, inbuf, &readNow, outbuf, &outbufsize);

        assert( hr == S_OK );
        assert( outbufsize <= 2 );

        if (outbufsize == 2)
        {
            /* U+10000-U+10FFFF are returned as a pair of surrogates */
            tchar m = (tchar)outbuf[0];
            tchar n = (tchar)outbuf[1];
            assert( TY_(IsHighSurrogate)(n) && TY_(IsLowSurrogate)(m) );
            *bytesRead = readNow;
            return (int)TY_(CombineSurrogatePair)(n, m);
        }

        if (outbufsize == 1)
        {
            /* we found the character */
            /* set bytesRead and return */
            *bytesRead = readNow;
            return (int)outbuf[0];
        }

        /* we need more bytes */
        nextByte = source->getByte(source->sourceData);

        if (nextByte == EndOfStream)
        {
            /* todo: error message for broken stream? */
            *bytesRead = readNow;
            return EndOfStream;
        }

        inbuf[inbufsize++] = (CHAR)nextByte;
    }

    /* No full character found after reading TC_INBUFSIZE bytes, */
    /* give up to read this stream, it's obviously unreadable. */
    /* todo: error message for broken stream? */
    /* report what was consumed so the caller never reads an unset value */
    *bytesRead = (uint)inbufsize;
    return EndOfStream;
}
/* Ask the output stream's MLang converter whether character c can be
** converted.  Characters above 0xFFFF are submitted as two UTF-16 units.
** Returns yes only when the converter reports S_OK; returns no when c
** cannot be split into a surrogate pair.
*/
Bool Win32MLangIsConvertible(tchar c, StreamOut * out)
{
    IMLangConvertCharset * p;
    HRESULT hr;
    WCHAR inbuf[2] = { 0 };
    UINT inbufsize = 0;

    assert( c != 0 );
    assert( c <= 0x10FFFF );
    assert( out != NULL );
    assert( out->mlang != NULL );

    if (c > 0xFFFF)
    {
        tchar high = 0;
        tchar low = 0;

        /* a failed split leaves low/high at 0; do not test two NULs */
        if (!TY_(SplitSurrogatePair)(c, &low, &high))
            return no;

        inbuf[inbufsize++] = (WCHAR)low;
        inbuf[inbufsize++] = (WCHAR)high;
    }
    else
        inbuf[inbufsize++] = (WCHAR)c;

    p = (IMLangConvertCharset *)out->mlang;
    /* no output buffer: only the result code is of interest */
    hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, NULL, NULL);

    return hr == S_OK ? yes : no;
}
/* Convert character c with the output stream's MLang converter and
** write the resulting bytes to out->sink.  *bytesWritten receives the
** number of bytes written; it is 0 when the conversion does not report
** S_OK or reports an implausible output length, in which case nothing
** is written.
*/
void Win32MLangPutChar(tchar c, StreamOut * out, uint * bytesWritten)
{
    IMLangConvertCharset * p;
    TidyOutputSink * sink;
    CHAR outbuf[TC_OUTBUFSIZE] = { 0 };
    UINT outbufsize = TC_OUTBUFSIZE;
    HRESULT hr = S_OK;
    WCHAR inbuf[2] = { 0 };
    UINT inbufsize = 0;
    uint i;

    assert( c != 0 );
    assert( c <= 0x10FFFF );
    assert( bytesWritten != NULL );
    assert( out != NULL );
    assert( out->mlang != NULL );

    p = (IMLangConvertCharset *)out->mlang;
    sink = &out->sink;

    if (c > 0xFFFF)
    {
        tchar high = 0;
        tchar low = 0;

        TY_(SplitSurrogatePair)(c, &low, &high);

        inbuf[inbufsize++] = (WCHAR)low;
        inbuf[inbufsize++] = (WCHAR)high;
    }
    else
        inbuf[inbufsize++] = (WCHAR)c;

    hr = IMLangConvertCharset_DoConversionFromUnicode(p, inbuf, &inbufsize, outbuf, &outbufsize);

    assert( hr == S_OK );
    assert( outbufsize > 0 );
    assert( inbufsize == 1 || inbufsize == 2 );

    /* With assertions compiled out, a failed conversion must not push */
    /* stale buffer contents to the sink or read past outbuf. */
    if (hr != S_OK || outbufsize > TC_OUTBUFSIZE)
    {
        *bytesWritten = 0;
        return;
    }

    for (i = 0; i < outbufsize; ++i)
        sink->putByte(sink->sinkData, (byte)(outbuf[i]));

    *bytesWritten = outbufsize;

    return;
}
#endif /* TIDY_WIN32_MLANG_SUPPORT */
/*
* local variables:
* mode: c
* indent-tabs-mode: nil
* c-basic-offset: 4
* eval: (c-set-offset 'substatement-open 0)
* end:
*/

18
src/win32tc.h

@ -0,0 +1,18 @@
#ifndef __WIN32TC_H__
#define __WIN32TC_H__
#ifdef TIDY_WIN32_MLANG_SUPPORT
/* win32tc.h -- Interface to Win32 transcoding routines
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
See tidy.h for the copyright notice.
*/
/* Map a character set name to a Windows codepage identifier; */
/* returns 0 when no usable codepage is found. */
uint TY_(Win32MLangGetCPFromName)(TidyAllocator *allocator,ctmbstr encoding);
/* Attach an MLang converter for codepage wincp to the input stream; */
/* returns yes on success, no otherwise. */
Bool TY_(Win32MLangInitInputTranscoder)(StreamIn * in, uint wincp);
/* Release the converter attached to the input stream, if any. */
void TY_(Win32MLangUninitInputTranscoder)(StreamIn * in);
/* Read one character, starting with the already-read firstByte; */
/* returns the character or EndOfStream and sets *bytesRead. */
int TY_(Win32MLangGetChar)(byte firstByte, StreamIn * in, uint * bytesRead);
#endif /* TIDY_WIN32_MLANG_SUPPORT */
#endif /* __WIN32TC_H__ */

6
tests/testthat/test-htmltidy.R

@ -1,7 +1,7 @@
context("basic functionality")
test_that("we can do something", {
test_that("tidying works", {
expect_gte(nchar(tidy("<b><p><a href='http://google.com'>google &gt</a></p></b>")),
256)
expect_gte(nchar(tidy_html("<b><p><a href='http://google.com'>google &gt</a></p></b>")),
249)
})

Loading…
Cancel
Save