// Modified HTML grammar file for ANTLR

/*
        Original HTML ANTLR grammar file edited by Yannick Fortin
        for ECE 4894 project.

        The original file was much more comprehensive, which made
        it extremely strict in its parsing. I removed many of the
        token definitions and watered the spec down to accept most
        HTML tags as generic tokens, which avoids some of the errors
        listed below by the authors. This greatly aided in the full
        parsing of web pages without encountering an error, but the
        tradeoff is reduced functionality: with so few recognized
        tokens, the parser is basically only good for ripping the text out.
*/

/*      
        Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
        Alexander Hinds & Terence Parr
        Magelang Institute, Ltd.
        Send comments to:  parrt@jguru.com

        v1.2    Fixed a bug APARAM->APARM in APPLET tag.

        v1.1    Terence Parr (updated to 2.6.0)

        Fixed CCYTE->CCITE
        Fixed def of COMMENT_DATA so it scarfs stuff correctly.
        Also, fixed refs to (PCDATA)? -> (PCDATA)* because a comment
                between PCDATA returns 2 PCDATA--ya need the loop not optional.

        v1.0    Terence John Parr (version 2.5.0 of ANTLR required)

        Fixed how whitespace is handled, removing some ambiguities; some
        because of ANTLR lexical filtering in 2.5.0.

        Changed (PCDATA)* loops to (PCDATA)? general since PCDATA matches
        everything between valid tags (how could there be more than one
        between tags?)

        Made the DOCTYPE optional.

        Reduced lookahead from k=5 to k=1 on the parser and number
        of parser ambiguities to 2.  Reduced lexer lookahead from 6
        to 4; had to left factor a bunch of stuff.

        List items couldn't contain nested lists...fixed it.

        Fixed def of WORD so it can't be an INT.  Removed '-' from WORD.

        Fixed HEXNUM so it will allow letters A..F.

        KNOWN ISSUES:

        1.  Does not handle "staggered" tags, eg: <p> <i> <p> <i>

        2.  Adheres somewhat strictly to the HTML spec, so many pages
        won't parse without errors.

        3.  Doesn't convert &(a signifier) to its proper single char
        representation

        4.  Checks only the syntax of element attributes, not the semantics,
        e.g. won't verify that a base element's attribute is actually
        called "href"

        5.  Tags split across lines, for example, <A (NEWLINE) some text >
        won't be properly recognized.  TJP: I think I fixed this.

        7.  Lines not counted properly due to the def'n of PCDATA - see the
        alternate def'n for a possible fix.  TJP: I think I fixed this.

*/



header
{
//#ifndef _UNICODE
//#define _UNICODE
//#endif

#include <iostream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <tchar.h>
#include <windows.h>

}

// File-level option: emit the generated parser and lexer as C++ code.
options
{
        language = "Cpp";
}

// Parser over the token stream produced by HTMLLexer.  The rules that follow
// recognize only text, comments, generic tags and anchors.
class HTMLParser extends Parser;

options 
{
//      buildAST = true;
        exportVocab=HTML;       // token vocabulary is exported under the name HTML
        k = 1;                  // one token of lookahead
}

// Members added to the generated parser class; shared by the rule actions.
{
        HANDLE NewFile;         // output file handle; opened and closed in "document"
        DWORD BytesWritten;     // receives the byte count from each WriteFile call
        DWORD BytesRead;        // not referenced by any rule visible in this grammar
}


/*      Entry rule.  Creates (or truncates) output.txt, parses the input as
        one or more text/anchor elements, and finally closes the file.

        If the file cannot be created an error is printed and parsing still
        proceeds; the WriteFile calls in "text" and "anchor" will then fail
        and report their own errors.
 */
document
        :
                {
                        // The file name is a narrow string literal, so call the ANSI
                        // entry point explicitly.  Plain CreateFile expands to
                        // CreateFileW when UNICODE is defined and would not compile.
                        NewFile = CreateFileA("output.txt", GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, 0);

                        if (NewFile == INVALID_HANDLE_VALUE)
                        {
                                std::cout << "Error creating new file \n";
                        }
                }

                (text | anchor)+

                {
                        // Only close a handle that was really opened; calling
                        // CloseHandle on INVALID_HANDLE_VALUE is an error in itself.
                        if (NewFile != INVALID_HANDLE_VALUE)
                        {
                                if(!CloseHandle(NewFile))
                                {
                                        std::cout << "failed in file closing! \n";
                                }

                                // Mark the handle as closed so it cannot be reused.
                                NewFile = INVALID_HANDLE_VALUE;
                        }
                }

        ;


/*      One piece of document content: character data, a comment, or a
        generic tag.  Only PCDATA is copied to the output file; COMMENT and
        GENERIC_TAG tokens are consumed and dropped.
 */
text:   txt:PCDATA
        {
                // Fetch the token text once instead of evaluating getText()
                // separately for the buffer and for the length.
                const std::string pcdata = txt->getText();

                // WriteFile takes a DWORD byte count; make the narrowing explicit.
                const DWORD pcdataLen = static_cast<DWORD>(pcdata.length());

                // Treat a short write as a failure too.
                if(!WriteFile(NewFile, pcdata.data(), pcdataLen, &BytesWritten, NULL)
                        || BytesWritten != pcdataLen)
                {
                        std::cout << "Error writing text to file \n";
                }
        }
        | COMMENT
        | GENERIC_TAG
        ;


/*      special text level elements*/
/*      An anchor element: the opening tag, any amount of nested text, and a
        closing tag in either spelling ("</a>" or "</A>").  The complete text
        of the opening tag is copied to the output file.
 */
anchor
        :       link:OANCHOR
        {
                // Fetch the token text once instead of evaluating getText()
                // separately for the buffer and for the length.
                const std::string openTag = link->getText();

                // WriteFile takes a DWORD byte count; make the narrowing explicit.
                const DWORD openTagLen = static_cast<DWORD>(openTag.length());

                // Treat a short write as a failure too.
                if(!WriteFile(NewFile, openTag.data(), openTagLen, &BytesWritten, NULL)
                        || BytesWritten != openTagLen)
                {
                        std::cout << "Error writing link text to file \n";
                }
        }
         (text)* (CLANCHOR | CUANCHOR)
        ;


// Lexer that turns raw HTML into the tokens consumed by HTMLParser.
class HTMLLexer extends Lexer;
options {       
        k = 4;                                  // four characters of lookahead
        exportVocab=HTML;                       // same vocabulary name as the parser
//      charVocabulary = '\3'..'\377';
        charVocabulary = '\u0000'..'\u00FF';    // accept every 8-bit character
        caseSensitive=false;                    // rules below only spell out lowercase

}


// Literal tokens for the two spellings of the closing anchor tag; the
// parser's "anchor" rule accepts either one.
tokens
{
        CLANCHOR = "</a>";
        CUANCHOR = "</A>";
}

// Any tag: '<', an optional "ap"/"ar"/"ad" prefix, then everything up to and
// including the next '>'.
// NOTE(review): the prefix alternatives presumably exist to separate tags that
// start with "<a" from OANCHOR within the lexer lookahead -- confirm.
// NOTE: line endings inside a tag are consumed by (~'>')* without a newline()
// call, so they are not counted; the commented-out alternatives below look
// like an earlier attempt at handling that.
GENERIC_TAG
        :       '<' ("ap" | "ar" | "ad")? (~'>')* '>'
//      :       '<' ~('a' | '>') (~'>')* '>'
//              (
//                      (       /* the usual newline hassle: \r\n can be matched in alt 1
//                               * or by matching alt 2 followed by alt 3 in another iteration.
//                               */
//                               options {
//                                      generateAmbigWarnings=false;
//                              }
//                      :       "\r\n" | '\r' | '\n'
//                      )
//                      { newline();}
//              )*
//      |       ( "\r\n" | '\r' | '\n' ) {newline();}
//      |       .
        ;


//LINK
//      :       "<link" WS (ATTR)+ '>'  
//      ;



/* special text level elements*/

// Opening anchor tag: "<a", whitespace, one or more attributes, then '>'.
// The parser's "anchor" rule writes the text of this token to the output file.
OANCHOR
        :       "<a" WS (ATTR)+ '>'
        ;


//CANCHOR
//      :       "</a>"
//      ;       


//BR
//      :       "<br" (WS ATTR)? '>'
//      ;


/*MISC STUFF*/

/*      Character data between tags: one or more characters other than '<',
        with every flavor of line ending counted through newline().

        Only '<' (plus the line-ending characters handled by the explicit
        alternatives) is excluded from the catch-all alternative.  No other
        token rule can start with '"' or '>', so excluding them as well would
        make a quotation mark or a stray '>' in ordinary body text a lexer
        error.
 */
PCDATA
        :       (
                        /* See comment in WS.  Language for combining any flavor
                         * newline is ambiguous.  Shutting off the warning.
                         */
                        options {
                                generateAmbigWarnings=false;
                        }
                :       '\r' '\n'               {newline();}
                |       '\r'                    {newline();}
                |       '\n'                    {newline();}
                |       ~('<'|'\n'|'\r')
                )+
        ;

// multiple-line comments
/*      Body of an HTML comment: everything between "<!--" and "-->".
        Line endings are counted through newline().  Only used by COMMENT.
 */
protected
COMMENT_DATA
        :       (       /*      '\r' '\n' can be matched in one alternative or by matching
                                '\r' in one iteration and '\n' in another.  I am trying to
                                handle any flavor of newline that comes in, but the language
                                that allows both "\r\n" and "\r" and "\n" to all be valid
                                newline is ambiguous.  Consequently, the resulting grammar
                                must be ambiguous.  I'm shutting this warning off.
                         */
                        options {
                                generateAmbigWarnings=false;
                        }
                :
                        // A '-' is part of the comment body unless it starts the
                        // "-->" terminator, i.e. unless BOTH of the next two
                        // characters are '-' and '>'.  Testing the two characters
                        // independently (LA(2)!='-' && LA(3)!='>') also rejected
                        // harmless sequences such as "--x" and "-x>".
                        {!(LA(2)=='-' && LA(3)=='>')}? '-'
                |       '\r' '\n'               {newline();}
                |       '\r'                    {newline();}
                |       '\n'                    {newline();}
                |       ~('-'|'\n'|'\r')
                )*
        ;


// A complete HTML comment: "<!--", the comment body, "-->".  The token is
// passed on to the parser (the skip action is commented out); the parser's
// "text" rule accepts it without writing anything.
COMMENT
        :       "<!--" COMMENT_DATA "-->"       //{ $setType(Token.SKIP); }
        ;

/*
        PROTECTED LEXER RULES
*/

// One or more whitespace characters (space, tab, or any flavor of line
// ending); line endings are counted through newline().  Protected: only
// used from other lexer rules (OANCHOR, and ATTR via its ignore option).
protected
WS      :       (
                        /*      '\r' '\n' can be matched in one alternative or by matching
                                '\r' in one iteration and '\n' in another.  I am trying to
                                handle any flavor of newline that comes in, but the language
                                that allows both "\r\n" and "\r" and "\n" to all be valid
                                newline is ambiguous.  Consequently, the resulting grammar
                                must be ambiguous.  I'm shutting this warning off.
                         */
                        options {
                                generateAmbigWarnings=false;
                        }
                :       ' '
                |       '\t'
                |       '\n'    { newline(); }
                |       "\r\n"  { newline(); }
                |       '\r'    { newline(); }
                )+
        ;

// A tag attribute: a name, optionally followed by '=' and a value.  The value
// is a word (with an optional trailing '%'), an optionally negative integer,
// a quoted string, or a '#'-prefixed hex number.  The ignore option names WS
// as the rule to skip between the elements of this rule.
protected
ATTR
options {
ignore=WS;
}
        :       WORD ('=' (WORD ('%')? | ('-')? INT | STRING | HEXNUM))?
        ;

//don't need uppercase for case-insen.
//the '.' is for words like "image.gif"
// An attribute name or unquoted value.  The first character is a letter, '.'
// or '/'; it must be followed by at least one more letter, digit, '.' or '/'
// (the trailing loop is ")+"), so a WORD is always two or more characters.
protected
WORD:   (       LCLETTER
                |       '.'
                |       '/'
                )

                (
                        /*      In reality, a WORD must be followed by whitespace, '=', or
                                what can follow an ATTR such as '>'.  In writing this grammar,
                                however, we just list all the possibilities as optional
                                elements.  This is loose, allowing the case where nothing is
                                matched after a WORD and then the (ATTR)* loop means the
                                grammar would allow "widthheight" as WORD WORD or WORD, hence,
                                an ambiguity.  Naturally, ANTLR will consume the input as soon
                                as possible, combining "widthheight" into one WORD.

                                I am shutting off the ambiguity here because ANTLR does the
                                right thing.  The exit path is ambiguous with every
                                alternative.  The only solution would be to write an unnatural
                                grammar (lots of extra productions) that laid out the
                                possibilities explicitly, preventing the bogus WORD followed
                                immediately by WORD without whitespace etc...
                         */
                        options {
                                generateAmbigWarnings=false;
                        }
                :       LCLETTER
                |       DIGIT
                |       '.'
                |       '/'
                )+
        ;

// A quoted attribute value, delimited by double or single quotes.  Any
// character other than the delimiter (including line endings) is accepted
// between the quotes; there is no escape mechanism.
protected
STRING
        :       '"' (~'"')* '"'
        |       '\'' (~'\'')* '\''
        ;

// A single whitespace character.  Not referenced by any rule visible in
// this grammar.
protected
WSCHARS
        :       ' ' | '\t' | '\n' | '\r'
        ;

// A single '<' or '~' character.  Not referenced by any rule visible in
// this grammar.
protected 
SPECIAL
        :       '<' | '~'
        ;
        
// A '#'-prefixed hexadecimal number; one of the value forms accepted by ATTR.
protected
HEXNUM
        :       '#' HEXINT
        ;

// An unsigned decimal integer: one or more digits.
protected
INT     :       (DIGIT)+
        ;

// One or more hexadecimal digits (the part of a HEXNUM after the '#').
protected
HEXINT
        :       (
                        /*      Technically, HEXINT cannot be followed by a..f, but due to our
                                loose grammar, the whitespace that normally would follow this
                                rule is optional.  ANTLR reports that #4FACE could parse as
                                HEXINT "#4" followed by WORD "FACE", which is clearly bogus.
                                ANTLR does the right thing by consuming as much input as
                                possible here.  I shut the warning off.
                         */
                         options {
                                generateAmbigWarnings=false;
                        }
                :       HEXDIGIT
                )+
        ;

// A single decimal digit.
protected
DIGIT
        :       '0'..'9'
        ;

// A single hexadecimal digit.  Only lowercase a..f is listed because the
// lexer is declared with caseSensitive=false.
protected
HEXDIGIT
        :       '0'..'9'
        |       'a'..'f'
        ;

// A single letter.  Only lowercase a..z is listed because the lexer is
// declared with caseSensitive=false.
protected
LCLETTER
        :       'a'..'z'
        ;