Wednesday, September 14, 2011

ANTLR: Global member function in ‘C’ parser & custom error printing

Technorati Tags: ,,

Writing a Java parser using ANTLR is a breeze. And why not? ANTLR itself is written in Java, the default IDE that comes with it is written in Java, and so on. But recently I needed a tool for writing C/C++ parsers. My first choice was to go ahead with YACC or Bison, but managing the generated parser is hard (at least for people who are new to them), so I started with ANTLR.
After a few rounds of testing, ANTLR seemed OK to me. The real problem came when I had to provide my own custom handler for error processing. After some rounds of googling and digging through the documentation, I found the solution.

First create a generic handler: exceptionhandler.h
// exceptionhandler.h
// Declares the custom recognition-error reporter that the grammar installs
// in place of the ANTLR3 C runtime's default displayRecognitionError.
#pragma once  
// Generated parser header; it is what makes the pANTLR3_* types below visible.
#include "R2SParser.h"  
// The declaration is given C linkage when compiled as C++ so that the same
// symbol is seen by both C and C++ translation units including this header.
#ifdef __cplusplus
extern "C" {
#endif  
// recognizer : the recognizer whose state->exception describes the error.
// tokenNames : table of token names indexed by token type; the implementation
//              accepts NULL for the single-token exception kinds.
void myDisplayRecognitionError (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_UINT8 * tokenNames);  
#ifdef __cplusplus
}
#endif  
Its sample implementation: exceptionhandler.cpp (taken from antlr3baserecognizer.c)
#include "exceptionhandler.h"
#include <string>  
 
void myDisplayRecognitionError (pANTLR3_BASE_RECOGNIZER recognizer, pANTLR3_UINT8 * tokenNames) 
{
    //====================================  
    pANTLR3_PARSER            parser;
    pANTLR3_TREE_PARSER        tparser;
    pANTLR3_INT_STREAM        is;
    pANTLR3_STRING            ttext;
    pANTLR3_STRING            ftext;
    pANTLR3_EXCEPTION        ex;
    pANTLR3_COMMON_TOKEN    theToken;
    pANTLR3_BASE_TREE        theBaseTree;
    pANTLR3_COMMON_TREE        theCommonTree;  
    // Retrieve some info for easy reading.
    //
    ex        =        recognizer->state->exception;
    ttext   =        NULL;  
    std::string error;  
    // See if there is a 'filename' we can use
    //
    /*if    (ex->streamName == NULL)
    {
        if    (((pANTLR3_COMMON_TOKEN)(ex->token))->type == ANTLR3_TOKEN_EOF)
        {
            ANTLR3_FPRINTF(stderr, "-end of input-(");
        }
        else
        {
            ANTLR3_FPRINTF(stderr, "-unknown source-(");
        }
    }
    else
    {
        ftext = ex->streamName->to8(ex->streamName);
        ANTLR3_FPRINTF(stderr, "%s(", ftext->chars);
    }*/  
    // Next comes the line number
    //  
    ANTLR3_FPRINTF(stderr, "%d) ", recognizer->state->exception->line);
    ANTLR3_FPRINTF(stderr, " : error %d : %s", 
        recognizer->state->exception->type,
        (pANTLR3_UINT8)       (recognizer->state->exception->message));  
    // How we determine the next piece is dependent on which thing raised the
    // error.
    //
    switch    (recognizer->type)
    {
    case    ANTLR3_TYPE_PARSER:  
        // Prepare the knowledge we know we have
        //
        parser        = (pANTLR3_PARSER) (recognizer->super);
        if(parser->super == NULL)
        {
            fprintf(stdout, "I think i can use it");
        }else
        {
            fprintf(stdout, "BAD LUCK");
        }  
        tparser        = NULL;
        is            = parser->tstream->istream;
        theToken    = (pANTLR3_COMMON_TOKEN)(recognizer->state->exception->token);
        ttext        = theToken->toString(theToken);  
        ANTLR3_FPRINTF(stderr, ", at offset %d", recognizer->state->exception->charPositionInLine);
        if  (theToken != NULL)
        {
            if (theToken->type == ANTLR3_TOKEN_EOF)
            {
                ANTLR3_FPRINTF(stderr, ", at <EOF>");
            }
            else
            {
                // Guard against null text in a token
                //
                ANTLR3_FPRINTF(stderr, "\n    near %s\n    ", ttext == NULL ? (pANTLR3_UINT8)"<no text for the token>" : ttext->chars);
            }
        }
        break;  
    case    ANTLR3_TYPE_TREE_PARSER:  
        tparser        = (pANTLR3_TREE_PARSER) (recognizer->super);
        parser        = NULL;
        is            = tparser->ctnstream->tnstream->istream;
        theBaseTree    = (pANTLR3_BASE_TREE)(recognizer->state->exception->token);
        ttext        = theBaseTree->toStringTree(theBaseTree);  
        if  (theBaseTree != NULL)
        {
            theCommonTree    = (pANTLR3_COMMON_TREE)        theBaseTree->super;  
            if    (theCommonTree != NULL)
            {
                theToken    = (pANTLR3_COMMON_TOKEN)    theBaseTree->getToken(theBaseTree);
            }
            ANTLR3_FPRINTF(stderr, ", at offset %d", theBaseTree->getCharPositionInLine(theBaseTree));
            ANTLR3_FPRINTF(stderr, ", near %s", ttext->chars);
        }
        break;  
    default:  
        ANTLR3_FPRINTF(stderr, "Base recognizer function displayRecognitionError called by unknown parser type - provide override for this function\n");
        return;
        break;
    }  
     switch  (ex->type)
    {
    case    ANTLR3_UNWANTED_TOKEN_EXCEPTION:  
        if    (tokenNames == NULL)
        {
            ANTLR3_FPRINTF(stderr, " : Extraneous input...");
        }
        else
        {
            if    (ex->expecting == ANTLR3_TOKEN_EOF)
            {
                ANTLR3_FPRINTF(stderr, " : Extraneous input - expected <EOF>\n");
            }
            else
            {
                ANTLR3_FPRINTF(stderr, " : Extraneous input - expected %s ...\n", tokenNames[ex->expecting]);
            }
        }
        break;  
    case    ANTLR3_MISSING_TOKEN_EXCEPTION:  
        if    (tokenNames == NULL)
        {
            ANTLR3_FPRINTF(stderr, " : Missing token (%d)...\n", ex->expecting);
        }
        else
        {
            if    (ex->expecting == ANTLR3_TOKEN_EOF)
            {
                ANTLR3_FPRINTF(stderr, " : Missing <EOF>\n");
            }
            else
            {
                ANTLR3_FPRINTF(stderr, " : Missing %s \n", tokenNames[ex->expecting]);
            }
        }
        break;  
    case    ANTLR3_RECOGNITION_EXCEPTION:  
        ANTLR3_FPRINTF(stderr, " : syntax error...\n");    
        break;  
    case    ANTLR3_MISMATCHED_TOKEN_EXCEPTION:  
        if    (tokenNames == NULL)
        {
            ANTLR3_FPRINTF(stderr, " : syntax error...\n");
        }
        else
        {
            if    (ex->expecting == ANTLR3_TOKEN_EOF)
            {
                ANTLR3_FPRINTF(stderr, " : expected <EOF>\n");
            }
            else
            {
                ANTLR3_FPRINTF(stderr, " : expected %s ...\n", tokenNames[ex->expecting]);
            }
        }
        break;  
    case    ANTLR3_NO_VIABLE_ALT_EXCEPTION:  
        ANTLR3_FPRINTF(stderr, " : cannot match to any predicted input...\n");  
        break;  
    case    ANTLR3_MISMATCHED_SET_EXCEPTION:  
        {
            ANTLR3_UINT32      count;
            ANTLR3_UINT32      bit;
            ANTLR3_UINT32      size;
            ANTLR3_UINT32      numbits;
            pANTLR3_BITSET      errBits;  
            ANTLR3_FPRINTF(stderr, " : unexpected input...\n  expected one of : ");  
            count   = 0;
            errBits = antlr3BitsetLoad        (ex->expectingSet);
            numbits = errBits->numBits        (errBits);
            size    = errBits->size            (errBits);  
            if  (size > 0)
            {
                for    (bit = 1; bit < numbits && count < 8 && count < size; bit++)
                {
                    // TODO: This doesn;t look right - should be asking if the bit is set!!
                    //
                    if  (tokenNames[bit])
                    {
                        ANTLR3_FPRINTF(stderr, "%s%s", count > 0 ? ", " : "", tokenNames[bit]); 
                        count++;
                    }
                }
                ANTLR3_FPRINTF(stderr, "\n");
            }
            else
            {
                ANTLR3_FPRINTF(stderr, "Actually dude, we didn't seem to be expecting anything here, or at least\n");
                ANTLR3_FPRINTF(stderr, "I could not work out what I was expecting, like so many of us these days!\n");
            }
        }
        break;  
    case    ANTLR3_EARLY_EXIT_EXCEPTION:  
        ANTLR3_FPRINTF(stderr, " : missing elements...\n");
        break;  
    default:  
        ANTLR3_FPRINTF(stderr, " : syntax not recognized...\n");
        break;
    }  
    //====================================
}  
Then create a place holder for error message (this is a trimmed down version): errorstruct.h
/* errorstruct.h
 * Holder for a parser error message. The grammar embeds one ErrorMessage
 * in the generated parser context.
 */
#pragma once
/* Guard name avoids double underscores, which are reserved identifiers. */
#ifndef ERRORSTRUCT_H
#define ERRORSTRUCT_H

struct errormessage_struct
{
    /* Text of the last error, or NULL when none has been recorded.
     * NOTE(review): ownership/lifetime of this buffer is not defined here;
     * whoever assigns it must decide who frees it.
     */
    char* message;
};

/* C-compatible aliases: the value type and a pointer to it. */
typedef struct errormessage_struct ErrorMessage;
typedef ErrorMessage* pErrorMessage;

#endif /* ERRORSTRUCT_H */
Now at this point your ANTLR grammar should have this:
// Pull the error-message struct and the custom handler declaration into the
// generated parser, and define ERRORMESSAGE as shorthand for the context's
// errorMessage member.
@parser::header {
   #include "errorstruct.h"
   #include "exceptionhandler.h" 
   #define ERRORMESSAGE CTX->errorMessage  
}  
// Add an ErrorMessage member to the generated parser context.
@parser::context
{
    ErrorMessage errorMessage;
}  
// Install the custom error reporter on the recognizer and start with no
// recorded message.
@parser::apifuncs {
    RECOGNIZER->displayRecognitionError = myDisplayRecognitionError;
    ERRORMESSAGE.message = NULL;
}
And that’s it. Now you can access your error message anywhere like this: parser->errorMessage.message. Similarly, you can add member functions to the structure (errormessage_struct in the above example) and then use them anywhere.
One point worth noting is that with the above approach you get free threading, which is built into the code generation and the runtime: each parser context carries its own errorMessage, so you get one errorMessage per thread.

References:
  1. http://www.antlr.org/pipermail/antlr-interest/2009-May/034567.html
  2. http://groups.google.com/group/il-antlr-interest/browse_thread/thread/80ec25032e9af7a8?pli=1