diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..f096636 Binary files /dev/null and b/.DS_Store differ diff --git a/DESCRIPTION b/DESCRIPTION index 955a5b2..b7f5b08 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,20 @@ Package: htmltidy -Title: Clean up gnarly HTML/XML -Version: 0.1.0.9000 -Authors@R: c(person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre"))) -Description: Clean up gnarly HTML/XML +Title: Clean Up Gnarly HTML/XML +Version: 0.2.0.9000 +Authors@R: c( + person("Bob", "Rudis", email = "bob@rudis.net", role = c("aut", "cre")), + person("Dave", "Dave", email = "dsr@w3.org", role = c("ctb", "aut"), + comment="HTML Tidy library") + ) +Maintainer: Bob Rudis +Description: HTML and XML documents can be beautiful and pristine. They can also be + wretched, evil, malformed hellspawn. Now, you can tidy up that HTML and XML before + processing it with your favorite angle-bracket parsing tools. Depends: R (>= 3.3.0) License: AGPL + file LICENSE LazyData: true +NeedsCompilation: yes Suggests: testthat, xml2 diff --git a/NAMESPACE b/NAMESPACE index 5443f4e..6cc908c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(tidy) +export(tidy_html) importFrom(Rcpp,sourceCpp) useDynLib(htmltidy) diff --git a/NEWS.md b/NEWS.md index 3a0f792..54fd144 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# htmltidy 0.2.0.9000 + +* Bundled tidy-html5 library with the package +* Modified tests + + # htmltidy 0.1.0.9000 * Added a `NEWS.md` file to track changes to the package. 
diff --git a/R/RcppExports.R b/R/RcppExports.R index eee8ac8..23e0ec4 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,11 +1,11 @@ -# This file was generated by Rcpp::compileAttributes +# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #' Tidy HTML/XML #' #' @param source length 1 character vetor containing the HTML/XML source to process #' @export -tidy <- function(source) { - .Call('htmltidy_tidy', PACKAGE = 'htmltidy', source) +tidy_html <- function(source) { + .Call('htmltidy_tidy_html', PACKAGE = 'htmltidy', source) } diff --git a/README.Rmd b/README.Rmd index c447bd8..058a670 100644 --- a/README.Rmd +++ b/README.Rmd @@ -21,20 +21,16 @@ knitr::opts_chunk$set( Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data. -NOTE: Requires [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML) - -You'll need to first do a `brew install tidy-html5` on MacOS or `apt-get install libtidy-dev` on Ubuntu/Debian to get this to work. NOTE that the linux libraries may be older and return slightly different (but no less tidy) HTML. - -**SEEKING COLLABORATORS** +It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML) This works enough for me to use in a pinch. It should be straightforward (but tedious) to: - enable passing options in a `list` -- bundle `libtidy` _with the package_ and get it to work on Windows, linux & MacOS as the library compiles on all three with the necessary tools. +- Getting it to work on Windows. 
The following functions are implemented: -- `tidy` : Clean up gnarly HTML/XML +- `tidy_html` : Clean up gnarly HTML/XML ### Installation @@ -54,7 +50,7 @@ library(htmltidy) # current verison packageVersion("htmltidy") -cat(tidy("

google >

")) +cat(tidy_html("

google >

")) ``` ### Code of Conduct diff --git a/README.md b/README.md index a6b452d..bdff89e 100644 --- a/README.md +++ b/README.md @@ -6,20 +6,16 @@ Inspired by [this SO question](http://stackoverflow.com/questions/37061873/identify-a-weblink-in-bold-in-r) and because there's a great deal of cruddy HTML out there that needs fixing to use properly when scraping data. -NOTE: Requires [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML) - -You'll need to first do a `brew install tidy-html5` on MacOS or `apt-get install libtidy-dev` on Ubuntu/Debian to get this to work. NOTE that the linux libraries may be older and return slightly different (but no less tidy) HTML. - -**SEEKING COLLABORATORS** +It relies on a locally included version of [`libtidy`](http://www.html-tidy.org/) and presently is super-basic (no way to set options and pretty much only does HTML) This works enough for me to use in a pinch. It should be straightforward (but tedious) to: - enable passing options in a `list` -- bundle `libtidy` *with the package* and get it to work on Windows, linux & MacOS as the library compiles on all three with the necessary tools. +- Getting it to work on Windows. The following functions are implemented: -- `tidy` : Clean up gnarly HTML/XML +- `tidy_html` : Clean up gnarly HTML/XML ### Installation @@ -34,14 +30,14 @@ library(htmltidy) # current verison packageVersion("htmltidy") -#> [1] '0.1.0.9000' +#> [1] '0.2.0.9000' -cat(tidy("

google >

")) +cat(tidy_html("

google >

")) #> #> #> #> "HTML Tidy for HTML5 for Mac OS X version 5.2.0" /> +#> "HTML Tidy for HTML5 for R version 5.0.0" /> #> #> #> diff --git a/man/tidy.Rd b/man/tidy_html.Rd similarity index 82% rename from man/tidy.Rd rename to man/tidy_html.Rd index b759c70..7753e5c 100644 --- a/man/tidy.Rd +++ b/man/tidy_html.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R -\name{tidy} -\alias{tidy} +\name{tidy_html} +\alias{tidy_html} \title{Tidy HTML/XML} \usage{ -tidy(source) +tidy_html(source) } \arguments{ \item{source}{length 1 character vetor containing the HTML/XML source to process} diff --git a/src/.DS_Store b/src/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/src/.DS_Store differ diff --git a/src/Makevars b/src/Makevars index c6616f4..c1e29e5 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1 +1,2 @@ -PKG_LIBS=-ltidy +PKG_CPPFLAGS = -I. +PKG_CXXFLAGS = -I. diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index a8a2e95..d02a8bf 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,18 +1,18 @@ -// This file was generated by Rcpp::compileAttributes +// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; -// tidy -std::string tidy(std::string source); -RcppExport SEXP htmltidy_tidy(SEXP sourceSEXP) { +// tidy_html +std::string tidy_html(std::string source); +RcppExport SEXP htmltidy_tidy_html(SEXP sourceSEXP) { BEGIN_RCPP - Rcpp::RObject __result; - Rcpp::RNGScope __rngScope; + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string >::type source(sourceSEXP); - __result = Rcpp::wrap(tidy(source)); - return __result; + rcpp_result_gen = Rcpp::wrap(tidy_html(source)); + return rcpp_result_gen; END_RCPP } diff --git a/src/access.c b/src/access.c new file mode 100644 index 0000000..72ad27e --- /dev/null +++ 
b/src/access.c @@ -0,0 +1,3305 @@ +/* access.c -- carry out accessibility checks + + Copyright University of Toronto + Portions (c) 1998-2009 (W3C) MIT, ERCIM, Keio University + See tidy.h for the copyright notice. + +*/ + +/********************************************************************* +* AccessibilityChecks +* +* Carries out processes for all accessibility checks. Traverses +* through all the content within the tree and evaluates the tags for +* accessibility. +* +* To perform the following checks, 'AccessibilityChecks' must be +* called AFTER the tree structure has been formed. +* +* If, in the command prompt, there is no specification of which +* accessibility priorities to check, no accessibility checks will be +* performed. (ie. '1' for priority 1, '2' for priorities 1 and 2, +* and '3') for priorities 1, 2 and 3.) +* +* Copyright University of Toronto +* Programmed by: Mike Lam and Chris Ridpath +* Modifications by : Terry Teague (TRT) +* +* Reference document: http://www.w3.org/TR/WAI-WEBCONTENT/ +*********************************************************************/ + + +#include "tidy-int.h" + +#if SUPPORT_ACCESSIBILITY_CHECKS + +#include "access.h" +#include "message.h" +#include "tags.h" +#include "attrs.h" +#include "tmbstr.h" + + +/* + The accessibility checks to perform depending on user's desire. + + 1. priority 1 + 2. priority 1 & 2 + 3. 
priority 1, 2, & 3
+*/
+
+/* List of possible image types */
+static const ctmbstr imageExtensions[] =
+{".jpg", ".gif", ".tif", ".pct", ".pic", ".iff", ".dib",
+ ".tga", ".pcx", ".png", ".jpeg", ".tiff", ".bmp"};
+
+#define N_IMAGE_EXTS (sizeof(imageExtensions)/sizeof(ctmbstr))
+
+/* List of possible sound file types */
+static const ctmbstr soundExtensions[] =
+{".wav", ".au", ".aiff", ".snd", ".ra", ".rm"};
+
+/* Warning codes; entry N belongs to soundExtensions[N] (see IsSoundFile) */
+static const int soundExtErrCodes[] =
+{
+    AUDIO_MISSING_TEXT_WAV,
+    AUDIO_MISSING_TEXT_AU,
+    AUDIO_MISSING_TEXT_AIFF,
+    AUDIO_MISSING_TEXT_SND,
+    AUDIO_MISSING_TEXT_RA,
+    AUDIO_MISSING_TEXT_RM
+};
+
+#define N_AUDIO_EXTS (sizeof(soundExtensions)/sizeof(ctmbstr))
+
+/* List of possible media extensions */
+static const ctmbstr mediaExtensions[] =
+{".mpg", ".mov", ".asx", ".avi", ".ivf", ".m1v", ".mmm", ".mp2v",
+ ".mpa", ".mpe", ".mpeg", ".ram", ".smi", ".smil", ".swf",
+ ".wm", ".wma", ".wmv"};
+
+#define N_MEDIA_EXTS (sizeof(mediaExtensions)/sizeof(ctmbstr))
+
+/* List of possible frame sources */
+static const ctmbstr frameExtensions[] =
+{".htm", ".html", ".shtm", ".shtml", ".cfm", ".cfml",
+".asp", ".cgi", ".pl", ".smil"};
+
+#define N_FRAME_EXTS (sizeof(frameExtensions)/sizeof(ctmbstr))
+
+/*
+   List of possible colour values.
+
+   Entry N is the RGB triple of colorNames[N] below; GetRgb indexes
+   both tables with the same subscript, so the two lists must be kept
+   in the same order.  Values are the HTML 4 named colours.
+*/
+static const int colorValues[][3] =
+{
+    {  0,  0,  0},  /* black   */
+    {192,192,192},  /* silver  */
+    {128,128,128},  /* grey    */
+    {255,255,255},  /* white   */
+    {128,  0,  0},  /* maroon  */
+    {255,  0,  0},  /* red     */
+    {128,  0,128},  /* purple  */
+    {255,  0,255},  /* fuchsia */
+    {  0,128,  0},  /* green   */
+    {  0,255,  0},  /* lime    */
+    {128,128,  0},  /* olive   */
+    {255,255,  0},  /* yellow  */
+    {  0,  0,128},  /* navy    */
+    {  0,  0,255},  /* blue    */
+    {  0,128,128},  /* teal    */
+    {  0,255,255}   /* aqua    */
+};
+
+#define N_COLOR_VALS (sizeof(colorValues)/sizeof(colorValues[0]))
+
+/* These arrays are used to convert color names to their RGB values */
+static const ctmbstr colorNames[] =
+{
+    "black",
+    "silver",
+    "grey",
+    "white",
+    "maroon",
+    "red",
+    "purple",
+    "fuchsia",
+    "green",
+    "lime",
+    "olive",
+    "yellow",
+    "navy",
+    "blue",
+    "teal",
+    "aqua"
+};
+
+#define N_COLOR_NAMES (sizeof(colorNames)/sizeof(ctmbstr))
+#define N_COLORS N_COLOR_NAMES
+
+
+/* function prototypes */
+static void InitAccessibilityChecks( TidyDocImpl* doc, int level123 );
+static void FreeAccessibilityChecks( TidyDocImpl* doc );
+
+static Bool GetRgb( ctmbstr color, int rgb[3] );
+static Bool CompareColors( const int rgbBG[3], const int rgbFG[3] );
+static int ctox( tmbchar ch );
+
+/*
+static void CheckMapAccess( TidyDocImpl* doc, Node* node, Node* front);
+static void GetMapLinks( TidyDocImpl* doc, Node* node, Node* front);
+static void CompareAnchorLinks( TidyDocImpl* doc, Node* front, int counter);
+static void FindMissingLinks( TidyDocImpl* doc, Node* node, int counter);
+*/
+static void CheckFormControls( TidyDocImpl* doc, Node* node );
+static void MetaDataPresent( TidyDocImpl* doc, Node* node );
+static void CheckEmbed( TidyDocImpl* doc, Node* node );
+static void CheckListUsage( TidyDocImpl* doc, Node* node );
+
+/*
+   GetFileExtension takes a path and returns the extension
+   portion of the path (if any).
+
+   path    string to scan; NULL or "" yields an empty extension
+   ext     receives the extension including its leading '.', or ""
+           when the last path component has no '.'
+   maxExt  size limit handed to tmbstrncpy for the copy into ext
+
+   The scan runs backwards from the last character and stops at the
+   first '/' or '\\'.  For paths longer than one character index 0 is
+   never examined (the loop condition is "--i > 0").
+*/
+
+static void GetFileExtension( ctmbstr path, tmbchar *ext, uint maxExt )
+{
+    int i;
+
+    ext[0] = '\0';
+
+    /*
+       An empty path would start the scan at index -1 and read in
+       front of the buffer (reachable through IsImage with alt="").
+    */
+    if ( path == NULL || path[0] == '\0' )
+        return;
+
+    i = TY_(tmbstrlen)(path) - 1;
+
+    do {
+        if ( path[i] == '/' || path[i] == '\\' )
+            break;
+        else if ( path[i] == '.' )
+        {
+            TY_(tmbstrncpy)( ext, path+i, maxExt );
+            break;
+        }
+    } while ( --i > 0 );
+}
+
+/************************************************************************
+* IsImage
+*
+* Checks if the given filename is an image file.
+* Returns 'yes' if it is, 'no' if it's not.
+************************************************************************/
+
+static Bool IsImage( ctmbstr iType )
+{
+    tmbchar suffix[20];
+    uint n = 0;
+
+    /* Isolate the extension, then look it up in the image table */
+    GetFileExtension( iType, suffix, sizeof(suffix) );
+
+    while ( n < N_IMAGE_EXTS )
+    {
+        if ( TY_(tmbstrcasecmp)(suffix, imageExtensions[n]) == 0 )
+            return yes;
+        ++n;
+    }
+
+    return no;
+}
+
+
+/***********************************************************************
+* IsSoundFile
+*
+* Looks up the extension of the given filename in the table of
+* known sound file types.
+* Returns the matching AUDIO_MISSING_TEXT_* code from
+* soundExtErrCodes[], or 0 when the extension is not a sound type.
+***********************************************************************/
+
+static int IsSoundFile( ctmbstr sType )
+{
+    tmbchar suffix[ 20 ];
+    uint n = 0;
+
+    GetFileExtension( sType, suffix, sizeof(suffix) );
+
+    while ( n < N_AUDIO_EXTS )
+    {
+        /* Both tables share the index, so the hit selects the code */
+        if ( TY_(tmbstrcasecmp)(suffix, soundExtensions[n]) == 0 )
+            return soundExtErrCodes[n];
+        ++n;
+    }
+
+    return 0;
+}
+
+
+/***********************************************************************
+* IsValidSrcExtension
+*
+* Checks if the 'SRC' value within the FRAME element is valid
+* The 'SRC' extension must end in ".htm", ".html", ".shtm", ".shtml",
+* ".cfm", ".cfml", ".asp", ".cgi", ".pl", or ".smil"
+*
+* Returns yes if it is, returns no otherwise.
+***********************************************************************/
+
+static Bool IsValidSrcExtension( ctmbstr sType )
+{
+    tmbchar suffix[20];
+    uint n = 0;
+
+    GetFileExtension( sType, suffix, sizeof(suffix) );
+
+    while ( n < N_FRAME_EXTS )
+    {
+        if ( TY_(tmbstrcasecmp)(suffix, frameExtensions[n]) == 0 )
+            return yes;
+        ++n;
+    }
+
+    return no;
+}
+
+
+/*********************************************************************
+* IsValidMediaExtension
+*
+* Checks to warn the user that synchronized text equivalents are
+* required if multimedia is used.
+*********************************************************************/
+
+static Bool IsValidMediaExtension( ctmbstr sType )
+{
+    tmbchar suffix[20];
+    uint n = 0;
+
+    GetFileExtension( sType, suffix, sizeof(suffix) );
+
+    while ( n < N_MEDIA_EXTS )
+    {
+        if ( TY_(tmbstrcasecmp)(suffix, mediaExtensions[n]) == 0 )
+            return yes;
+        ++n;
+    }
+
+    return no;
+}
+
+
+/************************************************************************
+* IsWhitespace
+*
+* Checks if the given string is all whitespace.
+* Returns 'yes' if it is, 'no' if it's not.
+* A NULL pointer and the empty string both count as all whitespace.
+************************************************************************/
+
+static Bool IsWhitespace( ctmbstr pString )
+{
+    ctmbstr cursor = pString;
+
+    if ( cursor == NULL )
+        return yes;
+
+    /* The first non-white character settles the answer */
+    while ( *cursor )
+    {
+        if ( !TY_(IsWhite)( *cursor ) )
+            return no;
+        ++cursor;
+    }
+
+    return yes;
+}
+
+/* True when the attribute exists and its value is not blank */
+static Bool hasValue( AttVal* av )
+{
+    if ( !av )
+        return no;
+
+    return ( ! IsWhitespace(av->value) );
+}
+
+/***********************************************************************
+* IsPlaceholderAlt
+*
+* Checks whether the ALT text contains the substring "image" or
+* "photo" (case-sensitive), i.e. looks like a placeholder.
+*
+* Returns 'yes' if there is, 'no' if not.
+***********************************************************************/
+
+static Bool IsPlaceholderAlt( ctmbstr txt )
+{
+    if ( strstr(txt, "image") != NULL )
+        return yes;
+
+    return ( strstr(txt, "photo") != NULL );
+}
+
+
+/***********************************************************************
+* IsPlaceholderTitle
+*
+* Checks to see if there is an TITLE place holder contained
+* in the 'ALT' text.
+*
+* Returns 'yes' if there is, 'no' if not.
+
+static Bool IsPlaceHolderTitle( ctmbstr txt )
+{
+    return ( strstr(txt, "title") != NULL );
+}
+***********************************************************************/
+
+
+/***********************************************************************
+* IsPlaceHolderObject
+*
+* Checks to see if there is an OBJECT place holder contained
+* in the 'ALT' text.
+*
+* Returns 'yes' if there is, 'no' if not.
+***********************************************************************/
+
+static Bool IsPlaceHolderObject( ctmbstr txt )
+{
+    ctmbstr hit = strstr(txt, "object");
+    return ( hit != NULL );
+}
+
+
+/**********************************************************
+* EndsWithBytes
+*
+* Checks to see if the ALT text ends with 'bytes'
+* Returns 'yes', if true, 'no' otherwise.
+**********************************************************/
+
+static Bool EndsWithBytes( ctmbstr txt )
+{
+    uint total = TY_(tmbstrlen)( txt );
+
+    /* Too short to hold the five-character suffix */
+    if ( total < 5 )
+        return no;
+
+    return ( TY_(tmbstrcmp)(txt+total-5, "bytes") == 0 );
+}
+
+
+/*******************************************************
+* textFromOneNode
+*
+* Copies the lexer-buffer characters of one node
+* (node->start up to node->end) into doc->access.text
+* and returns that buffer, NUL-terminated.  A NULL node
+* yields the empty string.  The copy stops once the
+* buffer is full, so long text is truncated.
+*******************************************************/
+
+static ctmbstr textFromOneNode( TidyDocImpl* doc, Node* node )
+{
+    tmbstr buf = doc->access.text;
+    uint pos = 0;
+
+    if ( node )
+    {
+        uint src = node->start;
+
+        /* Copy contents of a text node */
+        while ( src < node->end )
+        {
+            buf[pos] = doc->lexer->lexbuf[src];
+
+            /* Check buffer overflow: the last slot is reused for the NUL */
+            if ( pos >= sizeof(doc->access.text)-1 )
+                break;
+
+            ++src;
+            ++pos;
+        }
+    }
+
+    buf[pos] = '\0';
+    return buf;
+}
+
+
+/*********************************************************
+* getTextNode
+*
+* Locates text nodes within a container element.
+* Retrieves text that are found contained within
+* text nodes, and concatenates the text.
+*
+* Appends to doc->access.textNode at doc->access.counter.
+* Copying stops silently once TEXTBUF_SIZE-1 characters
+* have been collected.  A NULL node is ignored.
+*********************************************************/
+
+static void getTextNode( TidyDocImpl* doc, Node* node )
+{
+    tmbstr txtnod = doc->access.textNode;
+
+    /* getTextNodeClear hands over node->content, NULL for an empty element */
+    if ( node == NULL )
+        return;
+
+    /* A text node: grab the text within the node */
+    if ( TY_(nodeIsText)(node) )
+    {
+        uint i;
+
+        /* Retrieves each character found within the text node */
+        for (i = node->start; i < node->end; i++)
+        {
+            /* The text must not exceed buffer */
+            if ( doc->access.counter >= TEXTBUF_SIZE-1 )
+                return;
+
+            txtnod[ doc->access.counter++ ] = doc->lexer->lexbuf[i];
+        }
+        return;
+    }
+
+    /*
+       Not a text node: traverse the contents of the container.
+       This loop used to sit inside the text-node branch above, where
+       it could never run, so text nested in child elements was never
+       collected.
+    */
+    for ( node = node->content; node != NULL; node = node->next )
+        getTextNode( doc, node );
+}
+
+
+/**********************************************************
+* getTextNodeClear
+*
+* Clears the current 'textNode' and reloads it with new
+* text.  The textNode must be cleared before use.
+*
+* Only the subtree of the element's first child
+* (node->content) is handed to getTextNode.
+* Returns doc->access.textNode.
+**********************************************************/
+
+static tmbstr getTextNodeClear( TidyDocImpl* doc, Node* node )
+{
+    /* Clears list */
+    TidyClearMemory( doc->access.textNode, TEXTBUF_SIZE );
+    doc->access.counter = 0;
+
+    getTextNode( doc, node->content );
+    return doc->access.textNode;
+}
+
+/**********************************************************
+* LevelX_Enabled
+*
+* Tell whether access "X" is enabled.
+*
+* Level N is enabled when doc->access.PRIORITYCHK is N or
+* any higher priority level up to 3.
+**********************************************************/
+
+static Bool Level1_Enabled( TidyDocImpl* doc )
+{
+    if ( doc->access.PRIORITYCHK == 1 )
+        return yes;
+    if ( doc->access.PRIORITYCHK == 2 )
+        return yes;
+    return doc->access.PRIORITYCHK == 3;
+}
+static Bool Level2_Enabled( TidyDocImpl* doc )
+{
+    if ( doc->access.PRIORITYCHK == 2 )
+        return yes;
+    return doc->access.PRIORITYCHK == 3;
+}
+static Bool Level3_Enabled( TidyDocImpl* doc )
+{
+    return ( doc->access.PRIORITYCHK == 3 );
+}
+
+/********************************************************
+* CheckColorAvailable
+*
+* Verify that information conveyed with color is
+* available without color.
+*
+* At priority 1 and above, emits the element-specific
+* INFORMATION_NOT_CONVEYED_* warning for IMG, APPLET,
+* OBJECT, SCRIPT and INPUT nodes; other nodes are ignored.
+********************************************************/
+
+static void CheckColorAvailable( TidyDocImpl* doc, Node* node )
+{
+    if ( !Level1_Enabled( doc ) )
+        return;
+
+    /* At most one warning per node: first matching element type wins */
+    if ( nodeIsIMG(node) )
+    {
+        TY_(ReportAccessWarning)( doc, node, INFORMATION_NOT_CONVEYED_IMAGE );
+        return;
+    }
+
+    if ( nodeIsAPPLET(node) )
+    {
+        TY_(ReportAccessWarning)( doc, node, INFORMATION_NOT_CONVEYED_APPLET );
+        return;
+    }
+
+    if ( nodeIsOBJECT(node) )
+    {
+        TY_(ReportAccessWarning)( doc, node, INFORMATION_NOT_CONVEYED_OBJECT );
+        return;
+    }
+
+    if ( nodeIsSCRIPT(node) )
+    {
+        TY_(ReportAccessWarning)( doc, node, INFORMATION_NOT_CONVEYED_SCRIPT );
+        return;
+    }
+
+    if ( nodeIsINPUT(node) )
+        TY_(ReportAccessWarning)( doc, node, INFORMATION_NOT_CONVEYED_INPUT );
+}
+
+/*********************************************************************
+* CheckColorContrast
+*
+* Checks elements for color contrast.  Must have valid contrast for
+* valid visibility.
+*
+* This logic is extremely fragile as it does not recognize
+* the fact that color is inherited by many components and
+* that BG and FG colors are often set separately.  E.g. the
+* background color may be set by for the body or a table
+* or a cell.  The foreground color may be set by any text
+* element (p, h1, h2, input, textarea), either explicitly
+* or by style.  Ergo, this test will not handle most real
+* world cases.  It's a start, however.
+*********************************************************************/
+
+static void CheckColorContrast( TidyDocImpl* doc, Node* node )
+{
+    int rgbBG[3] = {255,255,255};   /* Black text on white BG */
+    Bool haveBG = yes;
+    AttVal* attr;
+
+    /* Contrast is a priority 3 check only */
+    if ( !Level3_Enabled( doc ) )
+        return;
+
+    /* Check for 'BGCOLOR' first to compare with other color attributes */
+    for ( attr = node->attributes; attr != NULL; attr = attr->next )
+    {
+        if ( attrIsBGCOLOR(attr) && hasValue(attr) )
+            haveBG = GetRgb( attr->value, rgbBG );
+    }
+
+    /* An unparseable background leaves nothing to compare against */
+    if ( !haveBG )
+        return;
+
+    /*
+       Search for COLOR attributes to compare with background color
+       Must have valid colour contrast
+    */
+    for ( attr = node->attributes; attr != NULL; attr = attr->next )
+    {
+        int rgbFG[3] = {0, 0, 0};   /* Black text */
+        uint code = 0;
+
+        if ( attrIsTEXT(attr) )
+            code = COLOR_CONTRAST_TEXT;
+        else if ( attrIsLINK(attr) )
+            code = COLOR_CONTRAST_LINK;
+        else if ( attrIsALINK(attr) )
+            code = COLOR_CONTRAST_ACTIVE_LINK;
+        else if ( attrIsVLINK(attr) )
+            code = COLOR_CONTRAST_VISITED_LINK;
+
+        /* Not a foreground colour attribute, or it has no value */
+        if ( !code || !hasValue(attr) )
+            continue;
+
+        if ( GetRgb(attr->value, rgbFG) &&
+             !CompareColors(rgbBG, rgbFG) )
+        {
+            TY_(ReportAccessWarning)( doc, node, code );
+        }
+    }
+}
+
+
+/**************************************************************
+* CompareColors
+*
+* Compares two RGB colors for good contrast.
+*
+* Returns yes only when the brightness difference exceeds 180
+* and the summed per-channel difference exceeds 500.
+**************************************************************/
+
+/* Absolute difference of two ints */
+static int minmax( int i1, int i2 )
+{
+    return MAX(i1, i2) - MIN(i1,i2);
+}
+
+/* Weighted brightness of an RGB triple: (299 R + 587 G + 114 B) / 1000 */
+static int brightness( const int rgb[3] )
+{
+    return ((rgb[0]*299) + (rgb[1]*587) + (rgb[2]*114)) / 1000;
+}
+
+static Bool CompareColors( const int rgbBG[3], const int rgbFG[3] )
+{
+    int brightBG = brightness( rgbBG );
+    int brightFG = brightness( rgbFG );
+
+    int diffBright = minmax( brightBG, brightFG );
+
+    int diffColor = minmax( rgbBG[0], rgbFG[0] )
+                  + minmax( rgbBG[1], rgbFG[1] )
+                  + minmax( rgbBG[2], rgbFG[2] );
+
+    return ( diffBright > 180 &&
+             diffColor > 500 );
+}
+
+
+/*********************************************************************
+* GetRgb
+*
+* Gets the red, green and blue values for a colour attribute value.
+*
+* color  either one of the names in colorNames[] (compared without
+*        regard to case) or a 7 character "#rrggbb" hex value
+* rgb    receives red, green, blue; left untouched on failure
+*
+* Returns yes when the value was recognised, no otherwise (including
+* a NULL value or a hex value containing a non-hex character).
+*
+* Example: If attribute is BGCOLOR="#121005" then red = 18, green = 16,
+* blue = 5.
+*********************************************************************/
+
+static Bool GetRgb( ctmbstr color, int rgb[] )
+{
+    uint x;
+
+    if ( color == NULL )
+        return no;
+
+    /*
+       Check if we have a color name.  The whole value must equal the
+       name: the former strstr(name, value) test accepted any fragment
+       of a name, so "re" resolved to grey and "blu" to blue.
+    */
+    for (x = 0; x < N_COLORS; x++)
+    {
+        if ( TY_(tmbstrcasecmp)(colorNames[x], color) == 0 )
+        {
+            rgb[0] = colorValues[x][0];
+            rgb[1] = colorValues[x][1];
+            rgb[2] = colorValues[x][2];
+            return yes;
+        }
+    }
+
+    /*
+       No color name so must be hex values
+       Is this a number in hexadecimal format?
+    */
+
+    /* Must be 7 characters in the RGB value (including '#') */
+    if ( TY_(tmbstrlen)(color) == 7 && color[0] == '#' )
+    {
+        int digit[6];
+        uint i;
+
+        /* ctox yields -1 for a non-hex character: reject the value */
+        for (i = 0; i < 6; i++)
+        {
+            digit[i] = ctox( color[i+1] );
+            if ( digit[i] < 0 )
+                return no;
+        }
+
+        rgb[0] = (digit[0] * 16) + digit[1];
+        rgb[1] = (digit[2] * 16) + digit[3];
+        rgb[2] = (digit[4] * 16) + digit[5];
+        return yes;
+    }
+    return no;
+}
+
+
+
+/*******************************************************************
+* ctox
+*
+* Converts a character to a number.
+* Example: if given character is 'A' then returns 10.
+*
+* Returns the number that the character represents. Returns -1 if not a
+* valid number.
+*******************************************************************/
+
+static int ctox( tmbchar ch )
+{
+    if ( ch >= '0' && ch <= '9' )
+    {
+        return ch - '0';
+    }
+    else if ( ch >= 'a' && ch <= 'f' )
+    {
+        return ch - 'a' + 10;
+    }
+    else if ( ch >= 'A' && ch <= 'F' )
+    {
+        return ch - 'A' + 10;
+    }
+    return -1;
+}
+
+
+/***********************************************************
+* CheckImage
+*
+* Checks all image attributes for specific elements to
+* check for validity of the values contained within
+* the attributes.  An appropriate warning message is displayed
+* to indicate the error.
+*
+* Runs at priority 1 and above.  Reports on the ALT text
+* (missing, too long, file name, placeholder, file size),
+* on LONGDESC / d-link combinations and on server-side
+* image maps (ISMAP).  A USEMAP attribute with a value sets
+* doc->access.HasUseMap.
+***********************************************************/
+
+static void CheckImage( TidyDocImpl* doc, Node* node )
+{
+    Bool HasAlt = no;
+    Bool HasIsMap = no;
+    Bool HasLongDesc = no;
+    Bool HasDLINK = no;
+    Bool HasValidHeight = no;
+    Bool HasValidWidthBullet = no;
+    Bool HasValidWidthHR = no;
+    Bool HasTriggeredMissingLongDesc = no;
+
+    AttVal* av;
+
+    if (Level1_Enabled( doc ))
+    {
+        /* Checks all image attributes for invalid values within attributes */
+        for (av = node->attributes; av != NULL; av = av->next)
+        {
+            /*
+               Checks for valid ALT attribute.
+               The length of the alt text must be less than 150 characters
+               long.
+            */
+            if ( attrIsALT(av) )
+            {
+                if (av->value != NULL)
+                {
+                    if ((TY_(tmbstrlen)(av->value) < 150) &&
+                        (IsPlaceholderAlt (av->value) == no) &&
+                        (IsPlaceHolderObject (av->value) == no) &&
+                        (EndsWithBytes (av->value) == no) &&
+                        (IsImage (av->value) == no))
+                    {
+                        HasAlt = yes;
+                    }
+
+                    /*
+                       ">=" rather than ">": a text of exactly 150
+                       characters matched neither this test nor the
+                       "< 150" test above and ended up being reported
+                       as a missing ALT.
+                    */
+                    else if (TY_(tmbstrlen)(av->value) >= 150)
+                    {
+                        HasAlt = yes;
+                        TY_(ReportAccessWarning)( doc, node, IMG_ALT_SUSPICIOUS_TOO_LONG );
+                    }
+
+                    else if (IsImage (av->value) == yes)
+                    {
+                        HasAlt = yes;
+                        TY_(ReportAccessWarning)( doc, node, IMG_ALT_SUSPICIOUS_FILENAME);
+                    }
+
+                    else if (IsPlaceholderAlt (av->value) == yes)
+                    {
+                        HasAlt = yes;
+                        TY_(ReportAccessWarning)( doc, node, IMG_ALT_SUSPICIOUS_PLACEHOLDER);
+                    }
+
+                    else if (EndsWithBytes (av->value) == yes)
+                    {
+                        HasAlt = yes;
+                        TY_(ReportAccessWarning)( doc, node, IMG_ALT_SUSPICIOUS_FILE_SIZE);
+                    }
+                }
+            }
+
+            /*
+               Checks for width values of 'bullets' and 'horizontal
+               rules' for validity.
+
+               Valid pixel width for 'bullets' must be < 30, and > 150 for
+               horizontal rules.
+            */
+            else if ( attrIsWIDTH(av) )
+            {
+                /* Longdesc attribute needed if width attribute is not present. */
+                if ( hasValue(av) )
+                {
+                    int width = atoi( av->value );
+                    if ( width < 30 )
+                        HasValidWidthBullet = yes;
+
+                    if ( width > 150 )
+                        HasValidWidthHR = yes;
+                }
+            }
+
+            /*
+               Checks for height values of 'bullets' and horizontal
+               rules for validity.
+
+               Valid pixel height for 'bullets' and horizontal rules
+               must be < 30.
+            */
+            else if ( attrIsHEIGHT(av) )
+            {
+                /* Longdesc attribute needed if height attribute not present. */
+                if ( hasValue(av) && atoi(av->value) < 30 )
+                    HasValidHeight = yes;
+            }
+
+            /*
+               Checks for longdesc and determines validity.
+               The length of the 'longdesc' must be > 1
+            */
+            else if ( attrIsLONGDESC(av) )
+            {
+                if ( hasValue(av) && TY_(tmbstrlen)(av->value) > 1 )
+                    HasLongDesc = yes;
+            }
+
+            /*
+               Checks for 'USEMAP' attribute.  Ensures that
+               text links are provided for client-side image maps
+            */
+            else if ( attrIsUSEMAP(av) )
+            {
+                if ( hasValue(av) )
+                    doc->access.HasUseMap = yes;
+            }
+
+            else if ( attrIsISMAP(av) )
+            {
+                HasIsMap = yes;
+            }
+        }
+
+
+        /*
+           Check to see if a dLINK is present.  The ANCHOR element must
+           be present following the IMG element.  The text of the
+           ANCHOR must be exactly "d" or "D".
+
+           NOTE(review): 'node' is advanced to the anchor (and below to
+           the whitespace node) here, so every report issued further
+           down is attached to that node rather than to the IMG --
+           confirm whether that is intended.
+        */
+        if ( nodeIsA(node->next) )
+        {
+            node = node->next;
+
+            /*
+               Node following the anchor must be a text node
+               for dLINK to exist
+            */
+
+            if (node->content != NULL && (node->content)->tag == NULL)
+            {
+                /* Text found within the text node */
+                ctmbstr word = textFromOneNode( doc, node->content);
+
+                if ((TY_(tmbstrcmp)(word,"d") == 0)||
+                    (TY_(tmbstrcmp)(word,"D") == 0))
+                {
+                    HasDLINK = yes;
+                }
+            }
+        }
+
+        /*
+           Special case check for dLINK.  This will occur if there is
+           whitespace between the IMG and ANCHOR elements.  Ignores
+           whitespace and continues check for dLINK.
+        */
+
+        if ( node->next && !node->next->tag )
+        {
+            node = node->next;
+
+            if ( nodeIsA(node->next) )
+            {
+                node = node->next;
+
+                /*
+                   Node following the ANCHOR must be a text node
+                   for dLINK to exist
+                */
+                if (node->content != NULL && node->content->tag == NULL)
+                {
+                    /* Text found within the text node */
+                    ctmbstr word = textFromOneNode( doc, node->content );
+
+                    if ((TY_(tmbstrcmp)(word, "d") == 0)||
+                        (TY_(tmbstrcmp)(word, "D") == 0))
+                    {
+                        HasDLINK = yes;
+                    }
+                }
+            }
+        }
+
+        /*
+           Two empty "if" statements for ALT-less bullet / horizontal
+           rule images stood here; they had no effect and were dropped.
+        */
+
+        if (HasAlt == no)
+        {
+            TY_(ReportAccessError)( doc, node, IMG_MISSING_ALT);
+        }
+
+        /* Small bullet / rule images are exempt from the LONGDESC checks */
+        if ((HasLongDesc == no)&&
+            (HasValidHeight ==yes)&&
+            ((HasValidWidthHR == yes)||
+             (HasValidWidthBullet == yes)))
+        {
+            HasTriggeredMissingLongDesc = yes;
+        }
+
+        if (HasTriggeredMissingLongDesc == no)
+        {
+            if ((HasDLINK == yes)&&
+                (HasLongDesc == no))
+            {
+                TY_(ReportAccessWarning)( doc, node, IMG_MISSING_LONGDESC);
+            }
+
+            if ((HasLongDesc == yes)&&
+                (HasDLINK == no))
+            {
+                TY_(ReportAccessWarning)( doc, node, IMG_MISSING_DLINK);
+            }
+
+            if ((HasLongDesc == no)&&
+                (HasDLINK == no))
+            {
+                TY_(ReportAccessWarning)( doc, node, IMG_MISSING_LONGDESC_DLINK);
+            }
+        }
+
+        if (HasIsMap == yes)
+        {
+            TY_(ReportAccessError)( doc, node, IMAGE_MAP_SERVER_SIDE_REQUIRES_CONVERSION);
+
+            TY_(ReportAccessWarning)( doc, node, IMG_MAP_SERVER_REQUIRES_TEXT_LINKS);
+        }
+    }
+}
+
+
+/***********************************************************
+* CheckApplet
+*
+* Checks APPLET element to check for validity pertaining
+* the 'ALT' attribute.  An appropriate warning message is
+* displayed to indicate the error.  If no 'ALT'
+* text is present, then there must be alternate content
+* within the APPLET element.
+***********************************************************/
+
+static void CheckApplet( TidyDocImpl* doc, Node* node )
+{
+    Bool altSeen = no;
+    Bool textSeen = no;
+    AttVal* attr;
+    Node* child;
+
+    if ( !Level1_Enabled( doc ) )
+        return;
+
+    /* An ALT attribute with any (even empty) value is accepted */
+    for ( attr = node->attributes; attr != NULL; attr = attr->next )
+    {
+        if ( attrIsALT(attr) && attr->value != NULL )
+            altSeen = yes;
+    }
+
+    if ( altSeen )
+        return;
+
+    /* Must have alternate text representation for that element */
+    child = node->content;
+    if ( child != NULL )
+    {
+        ctmbstr word = NULL;
+
+        /* Text directly inside the APPLET ... */
+        if ( child->tag == NULL )
+            word = textFromOneNode( doc, child );
+
+        /* ... or inside its first child; this one wins when both exist */
+        if ( child->content != NULL &&
+             child->content->tag == NULL )
+        {
+            word = textFromOneNode( doc, child->content );
+        }
+
+        if ( word != NULL && !IsWhitespace(word) )
+            textSeen = yes;
+    }
+
+    if ( !textSeen )
+    {
+        TY_(ReportAccessError)( doc, node, APPLET_MISSING_ALT );
+    }
+}
+
+
+/*******************************************************************
+* CheckObject
+*
+* Checks to verify whether the OBJECT element contains
+* 'ALT' text, and to see that the sound file selected is
+* of a valid sound file type.  OBJECT must have an alternate text
+* representation.
+*******************************************************************/
+
+static void CheckObject( TidyDocImpl* doc, Node* node )
+{
+    Node* child;
+    Bool altFound = no;
+    Bool textFound = no;
+
+    /* OBJECT is only examined at priority level 1 */
+    if ( !Level1_Enabled( doc ) )
+        return;
+
+    child = node->content;
+    if ( child != NULL )
+    {
+        /* A non-text first child may carry the ALT attribute itself */
+        if ( child->type != TextNode )
+        {
+            AttVal* attr = child->attributes;
+
+            while ( attr != NULL && !attrIsALT(attr) )
+                attr = attr->next;
+
+            if ( attr != NULL )
+                altFound = yes;
+        }
+
+        /* Otherwise look for text in the first child, then its first child */
+        if ( !altFound )
+        {
+            ctmbstr text = NULL;
+
+            if ( TY_(nodeIsText)(child) )
+                text = textFromOneNode( doc, child );
+
+            if ( text == NULL && TY_(nodeIsText)(child->content) )
+                text = textFromOneNode( doc, child->content );
+
+            if ( text != NULL && !IsWhitespace(text) )
+                textFound = yes;
+        }
+    }
+
+    if ( !altFound && !textFound )
+        TY_(ReportAccessError)( doc, node, OBJECT_MISSING_ALT );
+}
+
+
+/***************************************************************
+* CheckMissingStyleSheets
+*
+* Walks the subtree below 'node' depth-first and returns yes as
+* soon as it meets a LINK, STYLE, FONT or BASEFONT element, one
+* of the STYLE/TEXT/VLINK/ALINK/LINK attributes, or a REL
+* attribute whose value is "stylesheet".  Returns no otherwise.
+***************************************************************/
+
+static Bool CheckMissingStyleSheets( TidyDocImpl* doc, Node* node )
+{
+    Node* child;
+
+    for ( child = node->content; child != NULL; child = child->next )
+    {
+        AttVal* attr;
+
+        /* Elements that imply presentation control on their own */
+        if ( nodeIsLINK(child) || nodeIsSTYLE(child) ||
+             nodeIsFONT(child) || nodeIsBASEFONT(child) )
+            return yes;
+
+        /* Presentation-related attributes on any element */
+        for ( attr = child->attributes; attr != NULL; attr = attr->next )
+        {
+            if ( attrIsSTYLE(attr) || attrIsTEXT(attr) ||
+                 attrIsVLINK(attr) || attrIsALINK(attr) ||
+                 attrIsLINK(attr) )
+                return yes;
+
+            if ( attrIsREL(attr) && AttrValueIs(attr, "stylesheet") )
+                return yes;
+        }
+
+        /* Nothing at this level: descend into the child's own content */
+        if ( CheckMissingStyleSheets( doc, child ) )
+            return yes;
+    }
+
+    return no;
+}
+
+
+/*******************************************************************
+* CheckFrame
+*
+* Counts the FRAME in doc->access.numFrames and, at priority 1,
+* inspects its 'LONGDESC', 'SRC' and 'TITLE' attributes.  A missing,
+* empty or all-whitespace 'TITLE' and a 'SRC' rejected by
+* IsValidSrcExtension() are reported as errors.  Once the frame
+* counter reaches 3 with fewer than 3 accepted 'LONGDESC' values,
+* the counter is reset and FRAME_MISSING_LONGDESC is warned.
+*******************************************************************/
+
+static void CheckFrame( TidyDocImpl* doc, Node* node )
+{
+    Bool titleSeen = no;
+    AttVal* attr;
+
+    /* The frame is counted even when level 1 checks are disabled */
+    doc->access.numFrames++;
+
+    if ( !Level1_Enabled( doc ) )
+        return;
+
+    for ( attr = node->attributes; attr != NULL; attr = attr->next )
+    {
+        /* 'LONGDESC' is only counted when its value is longer than 1 */
+        if ( attrIsLONGDESC(attr) )
+        {
+            if ( hasValue(attr) && TY_(tmbstrlen)(attr->value) > 1 )
+                doc->access.HasCheckedLongDesc++;
+            continue;
+        }
+
+        /* 'SRC' must pass the extension check when it has a value */
+        if ( attrIsSRC(attr) )
+        {
+            if ( hasValue(attr) && !IsValidSrcExtension(attr->value) )
+                TY_(ReportAccessError)( doc, node, FRAME_SRC_INVALID );
+            continue;
+        }
+
+        if ( !attrIsTITLE(attr) )
+            continue;
+
+        if ( hasValue(attr) )
+            titleSeen = yes;
+
+        /* A title accepted earlier suppresses the diagnostics below */
+        if ( titleSeen )
+            continue;
+
+        if ( attr->value == NULL || TY_(tmbstrlen)(attr->value) == 0 )
+        {
+            titleSeen = yes;
+            TY_(ReportAccessError)( doc, node, FRAME_TITLE_INVALID_NULL);
+        }
+        else if ( IsWhitespace(attr->value) && TY_(tmbstrlen)(attr->value) > 0 )
+        {
+            titleSeen = yes;
+            TY_(ReportAccessError)( doc, node, FRAME_TITLE_INVALID_SPACES );
+        }
+    }
+
+    /* No 'TITLE' attribute was present at all */
+    if ( !titleSeen )
+        TY_(ReportAccessError)( doc, node, FRAME_MISSING_TITLE);
+
+    if ( doc->access.numFrames==3 && doc->access.HasCheckedLongDesc<3 )
+    {
+        doc->access.numFrames = 0;
+        TY_(ReportAccessWarning)( doc, node, FRAME_MISSING_LONGDESC );
+    }
+}
+
+
+/****************************************************************
+* CheckIFrame
+*
+* At priority 1, reports FRAME_SRC_INVALID when the IFRAME has a
+* 'SRC' value that IsValidSrcExtension() rejects.
+****************************************************************/
+
+static void CheckIFrame( TidyDocImpl* doc, Node* node )
+{
+    AttVal* src;
+
+    if ( !Level1_Enabled( doc ) )
+        return;
+
+    src = attrGetSRC( node );
+
+    if ( hasValue(src) && !IsValidSrcExtension(src->value) )
+        TY_(ReportAccessError)( doc, node, FRAME_SRC_INVALID );
+}
+
+
+/**********************************************************************
+* CheckAnchorAccess
+*
+* Priority 1: for each 'HREF' with a value, reports multimedia
+* targets and sound files that are not followed by a describing
+* text node.
+* Priority 2: warns about 'TARGET' values '_new' and '_blank', and
+* about link text that is missing, is "click here", is shorter
+* than 6 characters (unless it is "more") or longer than 60.
+**********************************************************************/
+
+static void CheckAnchorAccess( TidyDocImpl* doc, Node* node )
+{
+    AttVal* attr;
+    Bool described = no;
+    Bool isMoreLink = no;
+
+    for ( attr = node->attributes; attr != NULL; attr = attr->next )
+    {
+        if ( Level1_Enabled( doc ) && attrIsHREF(attr) && hasValue(attr) )
+        {
+            tmbchar ext[ 20 ];
+            uint extLen;
+
+            GetFileExtension( attr->value, ext, sizeof(ext) );
+
+            /* Multimedia targets always need a text equivalent */
+            if ( IsValidMediaExtension(attr->value) )
+                TY_(ReportAccessError)( doc, node, MULTIMEDIA_REQUIRES_TEXT );
+
+            /*
+               Only extensions of 1 to 5 characters are tested for
+               being a sound file.
+            */
+            extLen = TY_(tmbstrlen)(ext);
+            if ( extLen > 0 && extLen < 6 )
+            {
+                int errcode = IsSoundFile( attr->value );
+
+                if ( errcode )
+                {
+                    Node* follower = node->next;
+
+                    /* Non-blank text right after the anchor describes it */
+                    if ( follower != NULL && follower->tag == NULL )
+                    {
+                        ctmbstr text = textFromOneNode( doc, follower );
+
+                        if ( IsWhitespace(text) == no )
+                            described = yes;
+                    }
+
+                    if ( !described )
+                        TY_(ReportAccessError)( doc, node, errcode );
+                }
+            }
+        }
+
+        /* 'TARGET' values that open a new window need a warning */
+        if ( Level2_Enabled( doc ) && attrIsTARGET(attr) )
+        {
+            if ( AttrValueIs(attr, "_new") )
+                TY_(ReportAccessWarning)( doc, node, NEW_WINDOWS_REQUIRE_WARNING_NEW);
+            else if ( AttrValueIs(attr, "_blank") )
+                TY_(ReportAccessWarning)( doc, node, NEW_WINDOWS_REQUIRE_WARNING_BLANK);
+        }
+    }
+
+    if ( !Level2_Enabled( doc ) )
+        return;
+
+    /* Link text checks apply when the first child has no tag */
+    if ( node->content != NULL && node->content->tag == NULL )
+    {
+        ctmbstr text = textFromOneNode( doc, node->content );
+
+        if ( text != NULL && IsWhitespace(text) == no )
+        {
+            /* "more" is exempt from the too-short warning below */
+            if ( TY_(tmbstrcmp)(text, "more") == 0 )
+                isMoreLink = yes;
+
+            if ( TY_(tmbstrcmp)(text, "click here") == 0 )
+                TY_(ReportAccessWarning)( doc, node, LINK_TEXT_NOT_MEANINGFUL_CLICK_HERE);
+
+            if ( isMoreLink == no && TY_(tmbstrlen)(text) < 6 )
+                TY_(ReportAccessWarning)( doc, node, LINK_TEXT_NOT_MEANINGFUL);
+
+            if ( TY_(tmbstrlen)(text) > 60 )
+                TY_(ReportAccessWarning)( doc, node, LINK_TEXT_TOO_LONG);
+        }
+    }
+
+    /* An anchor without any content has no link text at all */
+    if ( node->content == NULL )
+        TY_(ReportAccessWarning)( doc, node, LINK_TEXT_MISSING);
+}
+
+
+/************************************************************
+* CheckArea
+*
+* Checks attributes within the AREA element to
+* determine if the 'ALT' text and 'HREF' values are valid.
+* Also checks to ensure that the 'TARGET' attribute
+* (if it exists) is not NULL and does not contain '_new'
+* or '_blank'.
+************************************************************/
+
+static void CheckArea( TidyDocImpl* doc, Node* node )
+{
+    Bool altFound = no;
+    AttVal* attr;
+
+    for ( attr = node->attributes; attr != NULL; attr = attr->next )
+    {
+        /* Priority 1: an ALT attribute counts when its value is set */
+        if ( Level1_Enabled( doc ) && attrIsALT(attr) && attr->value != NULL )
+            altFound = yes;
+
+        /* Priority 2: warn about targets that open a new window */
+        if ( Level2_Enabled( doc ) && attrIsTARGET(attr) )
+        {
+            if ( AttrValueIs(attr, "_new") )
+                TY_(ReportAccessWarning)( doc, node, NEW_WINDOWS_REQUIRE_WARNING_NEW);
+            else if ( AttrValueIs(attr, "_blank") )
+                TY_(ReportAccessWarning)( doc, node, NEW_WINDOWS_REQUIRE_WARNING_BLANK);
+        }
+    }
+
+    /* AREA must contain alt text */
+    if ( Level1_Enabled( doc ) && altFound == no )
+        TY_(ReportAccessError)( doc, node, AREA_MISSING_ALT);
+}
+
+
+/***************************************************
+* CheckScriptAcc
+*
+* At priority 1, reports SCRIPT_MISSING_NOSCRIPT
+* unless the node immediately following the SCRIPT
+* element is a NOSCRIPT element.
+***************************************************/
+
+static void CheckScriptAcc( TidyDocImpl* doc, Node* node )
+{
+    if ( !Level1_Enabled( doc ) )
+        return;
+
+    /* A NOSCRIPT sibling directly after SCRIPT satisfies the check */
+    if ( node->next != NULL && nodeIsNOSCRIPT(node->next) )
+        return;
+
+    TY_(ReportAccessError)( doc, node, SCRIPT_MISSING_NOSCRIPT);
+}
+
+
+/**********************************************************
+* CheckRows
+*
+* Check to see that each table has a row of headers if
+* a column of columns doesn't exist.
+**********************************************************/
+
+static void CheckRows( TidyDocImpl* doc, Node* node )
+{
+    Node* row = node;
+    int rowCount = 0;
+    int headedRows = 0;
+
+    doc->access.CheckedHeaders++;
+
+    /* Every sibling from 'node' onwards is counted as one row */
+    while ( row != NULL )
+    {
+        rowCount++;
+
+        if ( nodeIsTH(row->content) )
+        {
+            doc->access.HasTH = yes;
+
+            /* The leading TH only counts when it holds non-blank text */
+            if ( TY_(nodeIsText)(row->content->content) )
+            {
+                ctmbstr text = textFromOneNode( doc, row->content->content );
+
+                if ( !IsWhitespace(text) )
+                    headedRows++;
+            }
+        }
+
+        row = row->next;
+    }
+
+    /* The two outcomes below are mutually exclusive */
+    if ( rowCount == headedRows )
+    {
+        doc->access.HasValidRowHeaders = yes;
+    }
+    else if ( rowCount >= 2 &&
+              rowCount > headedRows &&
+              headedRows >= 2 &&
+              doc->access.HasTH == yes )
+    {
+        doc->access.HasInvalidRowHeader = yes;
+    }
+}
+
+
+/**********************************************************
+* CheckColumns
+*
+* Examines the cells of the row 'node'.  When the row starts
+* with a TH, every TH holding non-blank text is counted and
+* any other cell marks the header row as incomplete.  The
+* result is stored in doc->access.HasValidColumnHeaders or
+* doc->access.HasInvalidColumnHeader.
+**********************************************************/
+
+static void CheckColumns( TidyDocImpl* doc, Node* node )
+{
+    Node* cell;
+    int headerCount = 0;
+    Bool gapFound = no;
+
+    doc->access.CheckedHeaders++;
+
+    /* Only a row whose first cell is a TH is treated as a header row */
+    if ( nodeIsTH(node->content) )
+    {
+        doc->access.HasTH = yes;
+
+        for ( cell = node->content; cell != NULL; cell = cell->next )
+        {
+            if ( !nodeIsTH(cell) )
+            {
+                gapFound = yes;
+                continue;
+            }
+
+            if ( TY_(nodeIsText)(cell->content) )
+            {
+                ctmbstr text = textFromOneNode( doc, cell->content );
+
+                if ( !IsWhitespace(text) )
+                    headerCount++;
+            }
+        }
+    }
+
+    if ( gapFound )
+    {
+        if ( headerCount >= 2 )
+            doc->access.HasInvalidColumnHeader = yes;
+    }
+    else if ( headerCount > 0 )
+    {
+        doc->access.HasValidColumnHeaders = yes;
+    }
+}
+
+
+/*****************************************************
+* CheckTH
+*
+* Checks to see if the header provided for a table
+* requires an abbreviation.
(only required if the
+* length of the header is greater than 15 characters)
+*****************************************************/
+
+static void CheckTH( TidyDocImpl* doc, Node* node )
+{
+    Bool abbrSeen = no;
+    ctmbstr headerText = NULL;
+    AttVal* attr;
+
+    /* The abbreviation check belongs to priority level 3 */
+    if ( !Level3_Enabled( doc ) )
+        return;
+
+    for ( attr = node->attributes; attr != NULL; attr = attr->next )
+    {
+        if ( !attrIsABBR(attr) )
+            continue;
+
+        /* A non-blank value is accepted silently */
+        if ( attr->value != NULL && IsWhitespace(attr->value) == no )
+            abbrSeen = yes;
+
+        /* A missing or empty value is recorded but warned about */
+        if ( attr->value == NULL || TY_(tmbstrlen)(attr->value) == 0 )
+        {
+            abbrSeen = yes;
+            TY_(ReportAccessWarning)( doc, node, TABLE_MAY_REQUIRE_HEADER_ABBR_NULL);
+        }
+
+        /* So is a value made only of whitespace */
+        if ( IsWhitespace(attr->value) == yes && TY_(tmbstrlen)(attr->value) > 0 )
+        {
+            abbrSeen = yes;
+            TY_(ReportAccessWarning)( doc, node, TABLE_MAY_REQUIRE_HEADER_ABBR_SPACES);
+        }
+    }
+
+    headerText = textFromOneNode( doc, node->content );
+
+    if ( headerText == NULL || IsWhitespace(headerText) != no )
+        return;
+
+    /* Must have 'ABBR' attribute if header is > 15 characters */
+    if ( abbrSeen == no && TY_(tmbstrlen)(headerText) > 15 )
+        TY_(ReportAccessWarning)( doc, node, TABLE_MAY_REQUIRE_HEADER_ABBR);
+}
+
+
+/*****************************************************************
+* CheckMultiHeaders
+*
+* Layout tables should make sense when linearized.
+* TABLE must contain at least one TH element.
+* This technique applies only to tables used for layout purposes,
+* not to data tables. Checks for column of multiple headers.
+*****************************************************************/
+
+static void CheckMultiHeaders( TidyDocImpl* doc, Node* node )
+{
+    Node* row;
+    Node* cell;
+    AttVal* av;
+
+    /* Cleared when a TH with ROWSPAN > 1 / COLSPAN > 1 is found */
+    Bool validColSpanRows = yes;
+    Bool validColSpanColumns = yes;
+
+    /* Priority 1 check; a table without content has nothing to scan */
+    if ( !Level1_Enabled( doc ) || node->content == NULL )
+        return;
+
+    /*
+       Scan the TH cells of every TR that is a direct child of the
+       table (all rows are examined, see issue #168 - access test
+       5-2-1-2) for headers that span several columns or rows.
+    */
+    for ( row = node->content; row != NULL; row = row->next )
+    {
+        if ( !nodeIsTR(row) )
+            continue;
+
+        for ( cell = row->content; cell != NULL; cell = cell->next )
+        {
+            if ( !nodeIsTH(cell) )
+                continue;
+
+            for ( av = cell->attributes; av != NULL; av = av->next )
+            {
+                /*
+                   An attribute written without a value (e.g.
+                   <th colspan>) has a NULL value pointer, and
+                   passing NULL to atoi() is undefined behaviour.
+                   Such an attribute cannot span more than one
+                   cell, so it is skipped.
+                */
+                if ( av->value == NULL )
+                    continue;
+
+                if ( attrIsCOLSPAN(av) && atoi(av->value) > 1 )
+                    validColSpanColumns = no;
+
+                if ( attrIsROWSPAN(av) && atoi(av->value) > 1 )
+                    validColSpanRows = no;
+            }
+        }
+    }
+
+    /* Displays HTML 4 Table Algorithm when multiple column of headers used */
+    if ( validColSpanRows == no )
+    {
+        TY_(ReportAccessWarning)( doc, node, DATA_TABLE_REQUIRE_MARKUP_ROW_HEADERS );
+        TY_(DisplayHTMLTableAlgorithm)( doc );
+    }
+
+    if ( validColSpanColumns == no )
+    {
+        TY_(ReportAccessWarning)( doc, node, DATA_TABLE_REQUIRE_MARKUP_COLUMN_HEADERS );
+        TY_(DisplayHTMLTableAlgorithm)( doc );
+    }
+}
+
+
+/****************************************************
+* CheckTable
+*
+* Checks the TABLE element to ensure that the
+* table is not missing any headers. Must have either
+* a row or column of headers.
+****************************************************/
+
+static void CheckTable( TidyDocImpl* doc, Node* node )
+{
+    Node* cursor;
+    Node* headerRow = NULL;
+
+    tmbstr captionText = NULL;
+
+    int rowCount = 0;
+
+    Bool summarySeen = no;
+    Bool captionSeen = no;
+
+    if ( Level3_Enabled( doc ) )
+    {
+        AttVal* attr;
+
+        /* Table must have a 'SUMMARY' describing the purpose of the table */
+        for ( attr = node->attributes; attr != NULL; attr = attr->next )
+        {
+            if ( !attrIsSUMMARY(attr) )
+                continue;
+
+            if ( hasValue(attr) )
+            {
+                summarySeen = yes;
+
+                /* A value naming both words is treated as a placeholder */
+                if ( AttrContains(attr, "summary") &&
+                     AttrContains(attr, "table") )
+                {
+                    TY_(ReportAccessError)( doc, node, TABLE_SUMMARY_INVALID_PLACEHOLDER );
+                }
+            }
+
+            if ( attr->value == NULL || TY_(tmbstrlen)(attr->value) == 0 )
+            {
+                summarySeen = yes;
+                TY_(ReportAccessError)( doc, node, TABLE_SUMMARY_INVALID_NULL );
+            }
+            else if ( IsWhitespace(attr->value) && TY_(tmbstrlen)(attr->value) > 0 )
+            {
+                summarySeen = yes;
+                TY_(ReportAccessError)( doc, node, TABLE_SUMMARY_INVALID_SPACES );
+            }
+        }
+
+        /* TABLE must have content; an empty table ends all checks here */
+        if ( node->content == NULL )
+        {
+            TY_(ReportAccessError)( doc, node, DATA_TABLE_MISSING_HEADERS);
+            return;
+        }
+    }
+
+    /* Checks for multiple headers */
+    if ( Level1_Enabled( doc ) )
+        CheckMultiHeaders( doc, node );
+
+    if ( Level2_Enabled( doc ) )
+    {
+        /* Table must have a CAPTION describing the purpose of the table */
+        if ( nodeIsCAPTION(node->content) )
+        {
+            Node* caption = node->content;
+
+            if ( caption->content && caption->content->tag == NULL )
+                captionText = getTextNodeClear( doc, caption );
+
+            if ( !IsWhitespace(captionText) )
+                captionSeen = yes;
+        }
+
+        if ( captionSeen == no )
+            TY_(ReportAccessError)( doc, node, TABLE_MISSING_CAPTION);
+    }
+
+    /*
+       The header row is the TR right after a leading CAPTION or,
+       failing that, a TR that is the first child of the table.
+    */
+    if ( node->content != NULL )
+    {
+        if ( nodeIsCAPTION(node->content) && nodeIsTR(node->content->next) )
+            headerRow = node->content->next;
+        else if ( nodeIsTR(node->content) )
+            headerRow = node->content;
+    }
+
+    if ( headerRow != NULL )
+        CheckColumns( doc, headerRow );
+
+    /* Row headers are only examined when no valid column headers exist */
+    if ( headerRow != NULL && !doc->access.HasValidColumnHeaders )
+        CheckRows( doc, headerRow );
+
+    /* A table without any 'SUMMARY' attribute is a priority 3 error */
+    if ( Level3_Enabled( doc ) && summarySeen == no )
+        TY_(ReportAccessError)( doc, node, TABLE_MISSING_SUMMARY);
+
+    if ( Level2_Enabled( doc ) )
+    {
+        /* Count the TR elements that are direct children of the table */
+        for ( cursor = node->content; cursor != NULL; cursor = cursor->next )
+        {
+            if ( nodeIsTR(cursor) )
+                rowCount++;
+        }
+
+        if ( rowCount == 1 )
+            TY_(ReportAccessWarning)( doc, node, LAYOUT_TABLES_LINEARIZE_PROPERLY);
+
+        if ( doc->access.HasTH )
+            TY_(ReportAccessWarning)( doc, node, LAYOUT_TABLE_INVALID_MARKUP);
+    }
+
+    /* Header diagnostics require CheckedHeaders to be exactly 2 */
+    if ( Level1_Enabled( doc ) && doc->access.CheckedHeaders == 2 )
+    {
+        if ( !doc->access.HasValidRowHeaders &&
+             !doc->access.HasValidColumnHeaders &&
+             !doc->access.HasInvalidRowHeader &&
+             !doc->access.HasInvalidColumnHeader )
+        {
+            TY_(ReportAccessError)( doc, node, DATA_TABLE_MISSING_HEADERS);
+        }
+
+        if ( !doc->access.HasValidRowHeaders &&
+             doc->access.HasInvalidRowHeader )
+        {
+            TY_(ReportAccessError)( doc, node, DATA_TABLE_MISSING_HEADERS_ROW);
+        }
+
+        if ( !doc->access.HasValidColumnHeaders &&
+             doc->access.HasInvalidColumnHeader )
+        {
+            TY_(ReportAccessError)( doc, node, DATA_TABLE_MISSING_HEADERS_COLUMN);
+        }
+    }
+}
+
+
+/***************************************************
+* CheckASCII
+*
+* Checks for valid text equivalents for XMP and PRE
+* elements for ASCII art. Ensures that there is
+* a skip over link to skip multi-lined ASCII art.
+***************************************************/
+
+static void CheckASCII( TidyDocImpl* doc, Node* node )
+{
+    Node* temp1;
+    Node* temp2;
+
+    tmbstr skipOver = NULL;
+    Bool IsAscii = no;
+    int HasSkipOverLink = 0;
+
+    uint i, x;
+    int newLines = -1;
+    tmbchar compareLetter;
+    int matchingCount = 0;
+    AttVal* av;
+
+    if (Level1_Enabled( doc ) && node->content)
+    {
+        /* One past the last lexer-buffer index of the text node */
+        uint textEnd = node->content->end;
+
+        /*
+           Checks the text within the PRE and XMP tags to see if ascii
+           art is present
+        */
+        for (i = node->content->start + 1; i < textEnd; i++)
+        {
+            matchingCount = 0;
+
+            /* Counts the number of lines of text */
+            if (doc->lexer->lexbuf[i] == '\n')
+            {
+                newLines++;
+            }
+
+            compareLetter = doc->lexer->lexbuf[i];
+
+            /*
+               Counts consecutive character matches.  The scan stops
+               at the end of the text node so that bytes beyond it
+               are never read or counted as part of a run.
+            */
+            for (x = i; x < i + 5 && x < textEnd; x++)
+            {
+                if (doc->lexer->lexbuf[x] == compareLetter)
+                {
+                    matchingCount++;
+                }
+                else
+                {
+                    break;
+                }
+            }
+
+            /* Must have at least 5 consecutive character matches */
+            if (matchingCount >= 5)
+            {
+                break;
+            }
+        }
+
+        /*
+           The counter starts at -1, so at least 7 newline characters
+           OR 5 or more consecutive identical characters are needed
+           for the text to be treated as ascii art
+        */
+        if (newLines >= 6 || matchingCount >= 5)
+        {
+            IsAscii = yes;
+        }
+
+        /* Checks for skip over link if ASCII art is present */
+        if (IsAscii == yes)
+        {
+            if (node->prev != NULL && node->prev->prev != NULL)
+            {
+                temp1 = node->prev->prev;
+
+                /* Checks for 'HREF' attribute */
+                for (av = temp1->attributes; av != NULL; av = av->next)
+                {
+                    if ( attrIsHREF(av) && hasValue(av) )
+                    {
+                        skipOver = av->value;
+                        HasSkipOverLink++;
+                    }
+                }
+            }
+        }
+    }
+
+    if (Level2_Enabled( doc ))
+    {
+        /*
+           Checks for A element following PRE to ensure proper skipover link
+           only if there is an A element preceding PRE.
+           HasSkipOverLink is only incremented together with an
+           assignment to skipOver, so skipOver is set whenever the
+           counter equals 1.
+        */
+        if (HasSkipOverLink == 1)
+        {
+            if ( nodeIsA(node->next) )
+            {
+                temp2 = node->next;
+
+                /* Checks for 'NAME' attribute */
+                for (av = temp2->attributes; av != NULL; av = av->next)
+                {
+                    if ( attrIsNAME(av) && hasValue(av) )
+                    {
+                        /*
+                           The 'NAME' value must occur within the
+                           'HREF' value for a valid skipover.
+                        */
+                        if ( strstr(skipOver, av->value) != NULL )
+                        {
+                            HasSkipOverLink++;
+                        }
+                    }
+                }
+            }
+        }
+
+        if (IsAscii == yes)
+        {
+            TY_(ReportAccessError)( doc, node, ASCII_REQUIRES_DESCRIPTION);
+            if (Level3_Enabled( doc ) && (HasSkipOverLink < 2))
+                TY_(ReportAccessError)( doc, node, SKIPOVER_ASCII_ART);
+        }
+
+    }
+}
+
+
+/***********************************************************
+* CheckFormControls
+*
+*
must have valid 'FOR' attribute, and