You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
513 lines
21 KiB
513 lines
21 KiB
/** \file
|
|
* Defines the basic structure to support recognizing by either a lexer,
|
|
* parser, or tree parser.
|
|
* \addtogroup BaseRecognizer
|
|
* @{
|
|
*/
|
|
#ifndef _ANTLR3_BASERECOGNIZER_HPP
|
|
#define _ANTLR3_BASERECOGNIZER_HPP
|
|
|
|
// [The "BSD licence"]
|
|
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
|
|
|
|
//
|
|
// All rights reserved.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions
|
|
// are met:
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
// 2. Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
// 3. The name of the author may not be used to endorse or promote products
|
|
// derived from this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
|
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
|
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
|
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
|
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#include "antlr3defs.hpp"
|
|
#include "antlr3collections.hpp"
|
|
|
|
ANTLR_BEGIN_NAMESPACE()
|
|
|
|
/** \brief Base tracking context structure for all types of
|
|
* recognizers.
|
|
*/
|
|
template< class ImplTraits, class StreamType >
|
|
class BaseRecognizer : public ImplTraits::AllocPolicyType
|
|
{
|
|
public:
|
|
typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
|
|
typedef typename StreamType::IntStreamType IntStreamType;
|
|
typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType SuperType;
|
|
typedef typename StreamType::UnitType UnitType;
|
|
typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType;
|
|
typedef typename ImplTraits::BitsetType BitsetType;
|
|
typedef typename ImplTraits::BitsetListType BitsetListType;
|
|
typedef typename ImplTraits::StringType StringType;
|
|
typedef typename ImplTraits::template RecognizerSharedStateType<StreamType> RecognizerSharedStateType;
|
|
typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType;
|
|
typedef typename ImplTraits::LexerType LexerType;
|
|
typedef typename ImplTraits::ParserType ParserType;
|
|
typedef typename ImplTraits::TreeParserType TreeParserType;
|
|
|
|
typedef typename AllocPolicyType::template StackType<StringType> StringStackType;
|
|
typedef typename AllocPolicyType::template ListType<StringType> StringListType;
|
|
|
|
private:
|
|
/// A pointer to the shared recognizer state, such that multiple
|
|
/// recognizers can use the same inputs streams and so on (in
|
|
/// the case of grammar inheritance for instance.
|
|
///
|
|
RecognizerSharedStateType* m_state;
|
|
|
|
/// If set to something other than NULL, then this structure is
|
|
/// points to an instance of the debugger interface. In general, the
|
|
/// debugger is only referenced internally in recovery/error operations
|
|
/// so that it does not cause overhead by having to check this pointer
|
|
/// in every function/method
|
|
///
|
|
DebugEventListenerType* m_debugger;
|
|
|
|
|
|
public:
|
|
BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
|
|
|
|
SuperType* get_super();
|
|
RecognizerSharedStateType* get_state() const;
|
|
DebugEventListenerType* get_debugger() const;
|
|
void set_state( RecognizerSharedStateType* state );
|
|
void set_debugger( DebugEventListenerType* debugger );
|
|
|
|
/// Match current input symbol against ttype. Upon error, do one token
|
|
/// insertion or deletion if possible.
|
|
/// To turn off single token insertion or deletion error
|
|
/// recovery, override mismatchRecover() and have it call
|
|
/// plain mismatch(), which does not recover. Then any error
|
|
/// in a rule will cause an exception and immediate exit from
|
|
/// rule. Rule would recover by resynchronizing to the set of
|
|
/// symbols that can follow rule ref.
|
|
///
|
|
const UnitType* match(ANTLR_UINT32 ttype, BitsetListType* follow);
|
|
|
|
/// Consumes the next token, whatever it is, and resets the recognizer state
|
|
/// so that it is not in error.
|
|
///
|
|
/// \param recognizer
|
|
/// Recognizer context pointer
|
|
///
|
|
void matchAny();
|
|
|
|
/// function that decides if the token ahead of the current one is the
|
|
/// one we were loking for, in which case the curernt one is very likely extraneous
|
|
/// and can be reported that way.
|
|
///
|
|
bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype);
|
|
|
|
/// function that decides if the current token is one that can logically
|
|
/// follow the one we were looking for, in which case the one we were looking for is
|
|
/// probably missing from the input.
|
|
///
|
|
bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow);
|
|
|
|
/// Factor out what to do upon token mismatch so tree parsers can behave
|
|
/// differently. Override and call mismatchRecover(input, ttype, follow)
|
|
/// to get single token insertion and deletion. Use this to turn off
|
|
/// single token insertion and deletion. Override mismatchRecover
|
|
/// to call this instead.
|
|
///
|
|
/// \remark mismatch only works for parsers and must be overridden for anything else.
|
|
///
|
|
void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow);
|
|
|
|
/// Report a recognition problem.
|
|
///
|
|
/// This method sets errorRecovery to indicate the parser is recovering
|
|
/// not parsing. Once in recovery mode, no errors are generated.
|
|
/// To get out of recovery mode, the parser must successfully match
|
|
/// a token (after a resync). So it will go:
|
|
///
|
|
/// 1. error occurs
|
|
/// 2. enter recovery mode, report error
|
|
/// 3. consume until token found in resynch set
|
|
/// 4. try to resume parsing
|
|
/// 5. next match() will reset errorRecovery mode
|
|
///
|
|
/// If you override, make sure to update errorCount if you care about that.
|
|
///
|
|
void reportError();
|
|
void reportError( ClassForwarder<LexerType> );
|
|
template<typename CompType>
|
|
void reportError( ClassForwarder<CompType> );
|
|
|
|
/** Function that is called to display a recognition error message. You may
|
|
* override this function independently of (*reportError)() above as that function calls
|
|
* this one to do the actual exception printing.
|
|
*/
|
|
void displayRecognitionError(ANTLR_UINT8** tokenNames);
|
|
|
|
/// Get number of recognition errors (lexer, parser, tree parser). Each
|
|
/// recognizer tracks its own number. So parser and lexer each have
|
|
/// separate count. Does not count the spurious errors found between
|
|
/// an error and next valid token match
|
|
///
|
|
/// \see reportError()
|
|
///
|
|
ANTLR_UINT32 getNumberOfSyntaxErrors();
|
|
|
|
/** Function that recovers from an error found in the input stream.
|
|
* Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
|
|
* be from a mismatched token that the (*match)() could not recover from.
|
|
*/
|
|
void recover();
|
|
|
|
/** function that is a hook to listen to token consumption during error recovery.
|
|
* This is mainly used by the debug parser to send events to the listener.
|
|
*/
|
|
void beginResync();
|
|
|
|
/** function that is a hook to listen to token consumption during error recovery.
|
|
* This is mainly used by the debug parser to send events to the listener.
|
|
*/
|
|
void endResync();
|
|
|
|
/** function that is a hook to listen to token consumption during error recovery.
|
|
* This is mainly used by the debug parser to send events to the listener.
|
|
*/
|
|
void beginBacktrack(ANTLR_UINT32 level);
|
|
|
|
/** function that is a hook to listen to token consumption during error recovery.
|
|
* This is mainly used by the debug parser to send events to the listener.
|
|
*/
|
|
void endBacktrack(ANTLR_UINT32 level, bool successful);
|
|
|
|
/// Compute the error recovery set for the current rule.
|
|
/// Documentation below is from the Java implementation.
|
|
///
|
|
/// During rule invocation, the parser pushes the set of tokens that can
|
|
/// follow that rule reference on the stack; this amounts to
|
|
/// computing FIRST of what follows the rule reference in the
|
|
/// enclosing rule. This local follow set only includes tokens
|
|
/// from within the rule; i.e., the FIRST computation done by
|
|
/// ANTLR stops at the end of a rule.
|
|
//
|
|
/// EXAMPLE
|
|
//
|
|
/// When you find a "no viable alt exception", the input is not
|
|
/// consistent with any of the alternatives for rule r. The best
|
|
/// thing to do is to consume tokens until you see something that
|
|
/// can legally follow a call to r *or* any rule that called r.
|
|
/// You don't want the exact set of viable next tokens because the
|
|
/// input might just be missing a token--you might consume the
|
|
/// rest of the input looking for one of the missing tokens.
|
|
///
|
|
/// Consider grammar:
|
|
///
|
|
/// a : '[' b ']'
|
|
/// | '(' b ')'
|
|
/// ;
|
|
/// b : c '^' INT ;
|
|
/// c : ID
|
|
/// | INT
|
|
/// ;
|
|
///
|
|
/// At each rule invocation, the set of tokens that could follow
|
|
/// that rule is pushed on a stack. Here are the various "local"
|
|
/// follow sets:
|
|
///
|
|
/// FOLLOW(b1_in_a) = FIRST(']') = ']'
|
|
/// FOLLOW(b2_in_a) = FIRST(')') = ')'
|
|
/// FOLLOW(c_in_b) = FIRST('^') = '^'
|
|
///
|
|
/// Upon erroneous input "[]", the call chain is
|
|
///
|
|
/// a -> b -> c
|
|
///
|
|
/// and, hence, the follow context stack is:
|
|
///
|
|
/// depth local follow set after call to rule
|
|
/// 0 <EOF> a (from main())
|
|
/// 1 ']' b
|
|
/// 3 '^' c
|
|
///
|
|
/// Notice that ')' is not included, because b would have to have
|
|
/// been called from a different context in rule a for ')' to be
|
|
/// included.
|
|
///
|
|
/// For error recovery, we cannot consider FOLLOW(c)
|
|
/// (context-sensitive or otherwise). We need the combined set of
|
|
/// all context-sensitive FOLLOW sets--the set of all tokens that
|
|
/// could follow any reference in the call chain. We need to
|
|
/// resync to one of those tokens. Note that FOLLOW(c)='^' and if
|
|
/// we resync'd to that token, we'd consume until EOF. We need to
|
|
/// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
|
|
/// In this case, for input "[]", LA(1) is in this set so we would
|
|
/// not consume anything and after printing an error rule c would
|
|
/// return normally. It would not find the required '^' though.
|
|
/// At this point, it gets a mismatched token error and throws an
|
|
/// exception (since LA(1) is not in the viable following token
|
|
/// set). The rule exception handler tries to recover, but finds
|
|
/// the same recovery set and doesn't consume anything. Rule b
|
|
/// exits normally returning to rule a. Now it finds the ']' (and
|
|
/// with the successful match exits errorRecovery mode).
|
|
///
|
|
/// So, you can see that the parser walks up call chain looking
|
|
/// for the token that was a member of the recovery set.
|
|
///
|
|
/// Errors are not generated in errorRecovery mode.
|
|
///
|
|
/// ANTLR's error recovery mechanism is based upon original ideas:
|
|
///
|
|
/// "Algorithms + Data Structures = Programs" by Niklaus Wirth
|
|
///
|
|
/// and
|
|
///
|
|
/// "A note on error recovery in recursive descent parsers":
|
|
/// http://portal.acm.org/citation.cfm?id=947902.947905
|
|
///
|
|
/// Later, Josef Grosch had some good ideas:
|
|
///
|
|
/// "Efficient and Comfortable Error Recovery in Recursive Descent
|
|
/// Parsers":
|
|
/// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
|
|
///
|
|
/// Like Grosch I implemented local FOLLOW sets that are combined
|
|
/// at run-time upon error to avoid overhead during parsing.
|
|
///
|
|
BitsetType* computeErrorRecoverySet();
|
|
|
|
/// Compute the context-sensitive FOLLOW set for current rule.
|
|
/// Documentation below is from the Java runtime.
|
|
///
|
|
/// This is the set of token types that can follow a specific rule
|
|
/// reference given a specific call chain. You get the set of
|
|
/// viable tokens that can possibly come next (look ahead depth 1)
|
|
/// given the current call chain. Contrast this with the
|
|
/// definition of plain FOLLOW for rule r:
|
|
///
|
|
/// FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
|
|
///
|
|
/// where x in T* and alpha, beta in V*; T is set of terminals and
|
|
/// V is the set of terminals and non terminals. In other words,
|
|
/// FOLLOW(r) is the set of all tokens that can possibly follow
|
|
/// references to r in///any* sentential form (context). At
|
|
/// runtime, however, we know precisely which context applies as
|
|
/// we have the call chain. We may compute the exact (rather
|
|
/// than covering superset) set of following tokens.
|
|
///
|
|
/// For example, consider grammar:
|
|
///
|
|
/// stat : ID '=' expr ';' // FOLLOW(stat)=={EOF}
|
|
/// | "return" expr '.'
|
|
/// ;
|
|
/// expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'}
|
|
/// atom : INT // FOLLOW(atom)=={'+',')',';','.'}
|
|
/// | '(' expr ')'
|
|
/// ;
|
|
///
|
|
/// The FOLLOW sets are all inclusive whereas context-sensitive
|
|
/// FOLLOW sets are precisely what could follow a rule reference.
|
|
/// For input input "i=(3);", here is the derivation:
|
|
///
|
|
/// stat => ID '=' expr ';'
|
|
/// => ID '=' atom ('+' atom)* ';'
|
|
/// => ID '=' '(' expr ')' ('+' atom)* ';'
|
|
/// => ID '=' '(' atom ')' ('+' atom)* ';'
|
|
/// => ID '=' '(' INT ')' ('+' atom)* ';'
|
|
/// => ID '=' '(' INT ')' ';'
|
|
///
|
|
/// At the "3" token, you'd have a call chain of
|
|
///
|
|
/// stat -> expr -> atom -> expr -> atom
|
|
///
|
|
/// What can follow that specific nested ref to atom? Exactly ')'
|
|
/// as you can see by looking at the derivation of this specific
|
|
/// input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
|
|
///
|
|
/// You want the exact viable token set when recovering from a
|
|
/// token mismatch. Upon token mismatch, if LA(1) is member of
|
|
/// the viable next token set, then you know there is most likely
|
|
/// a missing token in the input stream. "Insert" one by just not
|
|
/// throwing an exception.
|
|
///
|
|
BitsetType* computeCSRuleFollow();
|
|
|
|
/// Compute the current followset for the input stream.
|
|
///
|
|
BitsetType* combineFollows(bool exact);
|
|
|
|
/// Attempt to recover from a single missing or extra token.
|
|
///
|
|
/// EXTRA TOKEN
|
|
///
|
|
/// LA(1) is not what we are looking for. If LA(2) has the right token,
|
|
/// however, then assume LA(1) is some extra spurious token. Delete it
|
|
/// and LA(2) as if we were doing a normal match(), which advances the
|
|
/// input.
|
|
///
|
|
/// MISSING TOKEN
|
|
///
|
|
/// If current token is consistent with what could come after
|
|
/// ttype then it is ok to "insert" the missing token, else throw
|
|
/// exception For example, Input "i=(3;" is clearly missing the
|
|
/// ')'. When the parser returns from the nested call to expr, it
|
|
/// will have call chain:
|
|
///
|
|
/// stat -> expr -> atom
|
|
///
|
|
/// and it will be trying to match the ')' at this point in the
|
|
/// derivation:
|
|
///
|
|
/// => ID '=' '(' INT ')' ('+' atom)* ';'
|
|
/// ^
|
|
/// match() will see that ';' doesn't match ')' and report a
|
|
/// mismatched token error. To recover, it sees that LA(1)==';'
|
|
/// is in the set of tokens that can follow the ')' token
|
|
/// reference in rule atom. It can assume that you forgot the ')'.
|
|
///
|
|
/// The exception that was passed in, in the java implementation is
|
|
/// sorted in the recognizer exception stack in the C version. To 'throw' it we set the
|
|
/// error flag and rules cascade back when this is set.
|
|
///
|
|
const UnitType* recoverFromMismatchedToken( ANTLR_UINT32 ttype, BitsetListType* follow);
|
|
|
|
/** Function that recovers from a mismatched set in the token stream, in a similar manner
|
|
* to (*recoverFromMismatchedToken)
|
|
*/
|
|
const UnitType* recoverFromMismatchedSet(BitsetListType* follow);
|
|
|
|
/** common routine to handle single token insertion for recovery functions.
|
|
*/
|
|
/// This code is factored out from mismatched token and mismatched set
|
|
/// recovery. It handles "single token insertion" error recovery for
|
|
/// both. No tokens are consumed to recover from insertions. Return
|
|
/// true if recovery was possible else return false.
|
|
///
|
|
bool recoverFromMismatchedElement(BitsetListType* follow);
|
|
|
|
/** function that consumes input until the next token matches
|
|
* the given token.
|
|
*/
|
|
void consumeUntil(ANTLR_UINT32 tokenType);
|
|
|
|
/** function that consumes input until the next token matches
|
|
* one in the given set.
|
|
*/
|
|
void consumeUntilSet(BitsetType* set);
|
|
|
|
/** function that returns an ANTLR3_LIST of the strings that identify
|
|
* the rules in the parser that got you to this point. Can be overridden by installing your
|
|
* own function set.
|
|
*
|
|
* \todo Document how to override invocation stack functions.
|
|
*/
|
|
StringStackType getRuleInvocationStack();
|
|
StringStackType getRuleInvocationStackNamed(ANTLR_UINT8* name);
|
|
|
|
/** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of
|
|
* string token names. As this is mostly used in string template processing it may not be useful
|
|
* in the C runtime.
|
|
*/
|
|
StringListType toStrings( const StringListType& );
|
|
|
|
/** function to return whether the rule has parsed input starting at the supplied
|
|
* start index before. If the rule has not parsed input starting from the supplied start index,
|
|
* then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point
|
|
* then it will return the point where it last stopped parsing after that start point.
|
|
*/
|
|
ANTLR_MARKER getRuleMemoization( ANTLR_INTKEY ruleIndex,
|
|
ANTLR_MARKER ruleParseStart);
|
|
|
|
/** function that determines whether the rule has parsed input at the current index
|
|
* in the input stream
|
|
*/
|
|
bool alreadyParsedRule(ANTLR_MARKER ruleIndex);
|
|
|
|
/** Function that records whether the rule has parsed the input at a
|
|
* current position successfully or not.
|
|
*/
|
|
void memoize(ANTLR_MARKER ruleIndex,
|
|
ANTLR_MARKER ruleParseStart);
|
|
|
|
/// Function that returns the current input symbol.
|
|
/// The is placed into any label for the associated token ref; e.g., x=ID. Token
|
|
/// and tree parsers need to return different objects. Rather than test
|
|
/// for input stream type or change the IntStream interface, I use
|
|
/// a simple method to ask the recognizer to tell me what the current
|
|
/// input symbol is.
|
|
///
|
|
/// This is ignored for lexers and the lexer implementation of this
|
|
/// function should return NULL.
|
|
///
|
|
const UnitType* getCurrentInputSymbol(IntStreamType* istream);
|
|
const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>);
|
|
const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>);
|
|
const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>);
|
|
|
|
/// Conjure up a missing token during error recovery.
|
|
///
|
|
/// The recognizer attempts to recover from single missing
|
|
/// symbols. But, actions might refer to that missing symbol.
|
|
/// For example, x=ID {f($x);}. The action clearly assumes
|
|
/// that there has been an identifier matched previously and that
|
|
/// $x points at that token. If that token is missing, but
|
|
/// the next token in the stream is what we want we assume that
|
|
/// this token is missing and we keep going. Because we
|
|
/// have to return some token to replace the missing token,
|
|
/// we have to conjure one up. This method gives the user control
|
|
/// over the tokens returned for missing tokens. Mostly,
|
|
/// you will want to create something special for identifier
|
|
/// tokens. For literals such as '{' and ',', the default
|
|
/// action in the parser or tree parser works. It simply creates
|
|
/// a CommonToken of the appropriate type. The text will be the token.
|
|
/// If you change what tokens must be created by the lexer,
|
|
/// override this method to create the appropriate tokens.
|
|
///
|
|
UnitType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
|
|
ANTLR_UINT32 expectedTokenType,
|
|
BitsetListType* follow);
|
|
|
|
/** Function that returns whether the supplied grammar function
|
|
* will parse the current input stream or not. This is the way that syntactic
|
|
* predicates are evaluated. Unlike java, C is perfectly happy to invoke code
|
|
* via a pointer to a function (hence that's what all the ANTLR3 C interfaces
|
|
* do.
|
|
*/
|
|
template<typename Predicate>
|
|
bool synpred( ClassForwarder<Predicate> );
|
|
|
|
//In place of exConstruct, just directly instantiate the Exception Object
|
|
|
|
/** Reset the recognizer
|
|
*/
|
|
void reset();
|
|
void reset( ClassForwarder<LexerType> );
|
|
template<typename CompType>
|
|
void reset( ClassForwarder<CompType> );
|
|
|
|
void exConstruct();
|
|
|
|
~BaseRecognizer();
|
|
|
|
};
|
|
|
|
ANTLR_END_NAMESPACE()
|
|
|
|
#include "antlr3baserecognizer.inl"
|
|
|
|
/// @}
|
|
///
|
|
|
|
#endif /* _ANTLR3_BASERECOGNIZER_H */
|
|
|