2014-08-22 13:10:48 +02:00

1029 lines
21 KiB
ANTLR

/*
[The "BSD licence"]
Copyright (c) 2014 Terence Parr, Sam Harwell
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** A Java 8 grammar for ANTLR v4 derived from ANTLR v3 Java grammar.
* Follows syntax from spec:
* http://docs.oracle.com/javase/specs/jls/se8/html/jls-19.html
* Uses ANTLR v4's left-recursive expression notation.
*
* You can test with
*
* $ antlr4 Java.g4
* $ javac *.java
* $ grun Java compilationUnit *.java
*
* Or,
~/antlr/code/grammars-v4/java8 $ j Test .
/Users/parrt/antlr/code/grammars-v4/java8/./Java8BaseListener.java
/Users/parrt/antlr/code/grammars-v4/java8/./Java8Lexer.java
/Users/parrt/antlr/code/grammars-v4/java8/./Java8Listener.java
/Users/parrt/antlr/code/grammars-v4/java8/./Java8Parser.java
/Users/parrt/antlr/code/grammars-v4/java8/./Test.java
Total lexer+parser time 384ms.
*/
grammar Java8;
// starting point for parsing a java file
compilationUnit
: packageDeclaration? importDeclaration* typeDeclaration* EOF
;
packageDeclaration
: annotation* 'package' qualifiedName ';'
;
importDeclaration
: 'import' 'static'? qualifiedName ('.' '*')? ';'
;
typeDeclaration
: classOrInterfaceModifier* classDeclaration
| classOrInterfaceModifier* enumDeclaration
| classOrInterfaceModifier* interfaceDeclaration
| classOrInterfaceModifier* annotationTypeDeclaration
| ';'
;
modifier
: classOrInterfaceModifier
| ( 'native'
| 'synchronized'
| 'transient'
| 'volatile'
)
;
classOrInterfaceModifier
: annotation // class or interface
| ( 'public' // class or interface
| 'protected' // class or interface
| 'private' // class or interface
| 'static' // class or interface
| 'abstract' // class or interface
| 'final' // class only -- does not apply to interfaces
| 'strictfp' // class or interface
)
;
variableModifier
: 'final'
| annotation
;
classDeclaration
: 'class' Identifier typeParameters?
('extends' type)?
('implements' typeList)?
classBody
;
typeParameters
: '<' typeParameter (',' typeParameter)* '>'
;
typeParameter
: Identifier ('extends' typeBound)?
;
typeBound
: type ('&' type)*
;
enumDeclaration
: ENUM Identifier ('implements' typeList)?
'{' enumConstants? ','? enumBodyDeclarations? '}'
;
enumConstants
: enumConstant (',' enumConstant)*
;
enumConstant
: annotation* Identifier arguments? classBody?
;
enumBodyDeclarations
: ';' classBodyDeclaration*
;
interfaceDeclaration
: 'interface' Identifier typeParameters? ('extends' typeList)? interfaceBody
;
typeList
: type (',' type)*
;
classBody
: '{' classBodyDeclaration* '}'
;
interfaceBody
: '{' interfaceBodyDeclaration* '}'
;
classBodyDeclaration
: ';'
| 'static'? block
| modifier* memberDeclaration
;
memberDeclaration
: methodDeclaration
| genericMethodDeclaration
| fieldDeclaration
| constructorDeclaration
| genericConstructorDeclaration
| interfaceDeclaration
| annotationTypeDeclaration
| classDeclaration
| enumDeclaration
;
/* We use rule this even for void methods which cannot have [] after parameters.
This simplifies grammar and we can consider void to be a type, which
renders the [] matching as a context-sensitive issue or a semantic check
for invalid return type after parsing.
*/
methodDeclaration
: (type|'void') Identifier formalParameters ('[' ']')*
('throws' qualifiedNameList)?
( methodBody
| ';'
)
;
genericMethodDeclaration
: typeParameters methodDeclaration
;
constructorDeclaration
: Identifier formalParameters ('throws' qualifiedNameList)?
constructorBody
;
genericConstructorDeclaration
: typeParameters constructorDeclaration
;
fieldDeclaration
: type variableDeclarators ';'
;
interfaceBodyDeclaration
: modifier* interfaceMemberDeclaration
| ';'
;
interfaceMemberDeclaration
: constDeclaration
| interfaceMethodDeclaration
| genericInterfaceMethodDeclaration
| interfaceDeclaration
| annotationTypeDeclaration
| classDeclaration
| enumDeclaration
;
constDeclaration
: type constantDeclarator (',' constantDeclarator)* ';'
;
constantDeclarator
: Identifier ('[' ']')* '=' variableInitializer
;
// see matching of [] comment in methodDeclaratorRest
interfaceMethodDeclaration
: (type|'void') Identifier formalParameters ('[' ']')*
('throws' qualifiedNameList)?
';'
;
genericInterfaceMethodDeclaration
: typeParameters interfaceMethodDeclaration
;
variableDeclarators
: variableDeclarator (',' variableDeclarator)*
;
variableDeclarator
: variableDeclaratorId ('=' variableInitializer)?
;
variableDeclaratorId
: Identifier ('[' ']')*
;
variableInitializer
: arrayInitializer
| expression
;
arrayInitializer
: '{' (variableInitializer (',' variableInitializer)* (',')? )? '}'
;
enumConstantName
: Identifier
;
type
: classOrInterfaceType ('[' ']')*
| primitiveType ('[' ']')*
;
classOrInterfaceType
: Identifier typeArguments? ('.' Identifier typeArguments? )*
;
primitiveType
: 'boolean'
| 'char'
| 'byte'
| 'short'
| 'int'
| 'long'
| 'float'
| 'double'
;
typeArguments
: '<' typeArgument (',' typeArgument)* '>'
;
typeArgument
: type
| '?' (('extends' | 'super') type)?
;
qualifiedNameList
: qualifiedName (',' qualifiedName)*
;
formalParameters
: '(' formalParameterList? ')'
;
formalParameterList
: formalParameter (',' formalParameter)* (',' lastFormalParameter)?
| lastFormalParameter
;
formalParameter
: variableModifier* type variableDeclaratorId
;
lastFormalParameter
: variableModifier* type '...' variableDeclaratorId
;
methodBody
: block
;
constructorBody
: block
;
qualifiedName
: Identifier ('.' Identifier)*
;
literal
: IntegerLiteral
| FloatingPointLiteral
| CharacterLiteral
| StringLiteral
| BooleanLiteral
| 'null'
;
// ANNOTATIONS
annotation
: '@' annotationName ( '(' ( elementValuePairs | elementValue )? ')' )?
;
annotationName : qualifiedName ;
elementValuePairs
: elementValuePair (',' elementValuePair)*
;
elementValuePair
: Identifier '=' elementValue
;
elementValue
: expression
| annotation
| elementValueArrayInitializer
;
elementValueArrayInitializer
: '{' (elementValue (',' elementValue)*)? (',')? '}'
;
annotationTypeDeclaration
: '@' 'interface' Identifier annotationTypeBody
;
annotationTypeBody
: '{' (annotationTypeElementDeclaration)* '}'
;
annotationTypeElementDeclaration
: modifier* annotationTypeElementRest
| ';' // this is not allowed by the grammar, but apparently allowed by the actual compiler
;
annotationTypeElementRest
: type annotationMethodOrConstantRest ';'
| classDeclaration ';'?
| interfaceDeclaration ';'?
| enumDeclaration ';'?
| annotationTypeDeclaration ';'?
;
annotationMethodOrConstantRest
: annotationMethodRest
| annotationConstantRest
;
annotationMethodRest
: Identifier '(' ')' defaultValue?
;
annotationConstantRest
: variableDeclarators
;
defaultValue
: 'default' elementValue
;
// STATEMENTS / BLOCKS
block
: '{' blockStatement* '}'
;
blockStatement
: localVariableDeclarationStatement
| statement
| typeDeclaration
;
localVariableDeclarationStatement
: localVariableDeclaration ';'
;
localVariableDeclaration
: variableModifier* type variableDeclarators
;
statement
: block
| ASSERT expression (':' expression)? ';'
| 'if' parExpression statement ('else' statement)?
| 'for' '(' forControl ')' statement
| 'while' parExpression statement
| 'do' statement 'while' parExpression ';'
| 'try' block (catchClause+ finallyBlock? | finallyBlock)
| 'try' resourceSpecification block catchClause* finallyBlock?
| 'switch' parExpression '{' switchBlockStatementGroup* switchLabel* '}'
| 'synchronized' parExpression block
| 'return' expression? ';'
| 'throw' expression ';'
| 'break' Identifier? ';'
| 'continue' Identifier? ';'
| ';'
| statementExpression ';'
| Identifier ':' statement
;
catchClause
: 'catch' '(' variableModifier* catchType Identifier ')' block
;
catchType
: qualifiedName ('|' qualifiedName)*
;
finallyBlock
: 'finally' block
;
resourceSpecification
: '(' resources ';'? ')'
;
resources
: resource (';' resource)*
;
resource
: variableModifier* classOrInterfaceType variableDeclaratorId '=' expression
;
/** Matches cases then statements, both of which are mandatory.
* To handle empty cases at the end, we add switchLabel* to statement.
*/
switchBlockStatementGroup
: switchLabel+ blockStatement+
;
switchLabel
: 'case' constantExpression ':'
| 'case' enumConstantName ':'
| 'default' ':'
;
forControl
: enhancedForControl
| forInit? ';' expression? ';' forUpdate?
;
forInit
: localVariableDeclaration
| expressionList
;
enhancedForControl
: variableModifier* type Identifier ':' expression
;
forUpdate
: expressionList
;
// EXPRESSIONS
parExpression
: '(' expression ')'
;
expressionList
: expression (',' expression)*
;
statementExpression
: expression
;
constantExpression
: expression
;
expression
: primary
| expression '.' Identifier
| expression '.' 'this'
| expression '.' 'new' nonWildcardTypeArguments? innerCreator
| expression '.' 'super' superSuffix
| expression '.' explicitGenericInvocation
| expression '[' expression ']'
| expression '(' expressionList? ')'
| 'new' creator
| '(' type ')' expression
| expression ('++' | '--')
| ('+'|'-'|'++'|'--') expression
| ('~'|'!') expression
| expression ('*'|'/'|'%') expression
| expression ('+'|'-') expression
| expression ('<' '<' | '>' '>' '>' | '>' '>') expression
| expression ('<=' | '>=' | '>' | '<') expression
| expression 'instanceof' type
| expression ('==' | '!=') expression
| expression '&' expression
| expression '^' expression
| expression '|' expression
| expression '&&' expression
| expression '||' expression
| expression '?' expression ':' expression
| expression
( '='<assoc=right>
| '+='<assoc=right>
| '-='<assoc=right>
| '*='<assoc=right>
| '/='<assoc=right>
| '&='<assoc=right>
| '|='<assoc=right>
| '^='<assoc=right>
| '>>='<assoc=right>
| '>>>='<assoc=right>
| '<<='<assoc=right>
| '%='<assoc=right>
)
expression
;
primary
: '(' expression ')'
| 'this'
| 'super'
| literal
| Identifier
| type '.' 'class'
| 'void' '.' 'class'
| nonWildcardTypeArguments (explicitGenericInvocationSuffix | 'this' arguments)
;
creator
: nonWildcardTypeArguments createdName classCreatorRest
| createdName (arrayCreatorRest | classCreatorRest)
;
createdName
: Identifier typeArgumentsOrDiamond? ('.' Identifier typeArgumentsOrDiamond?)*
| primitiveType
;
innerCreator
: Identifier nonWildcardTypeArgumentsOrDiamond? classCreatorRest
;
arrayCreatorRest
: '['
( ']' ('[' ']')* arrayInitializer
| expression ']' ('[' expression ']')* ('[' ']')*
)
;
classCreatorRest
: arguments classBody?
;
explicitGenericInvocation
: nonWildcardTypeArguments explicitGenericInvocationSuffix
;
nonWildcardTypeArguments
: '<' typeList '>'
;
typeArgumentsOrDiamond
: '<' '>'
| typeArguments
;
nonWildcardTypeArgumentsOrDiamond
: '<' '>'
| nonWildcardTypeArguments
;
superSuffix
: arguments
| '.' Identifier arguments?
;
explicitGenericInvocationSuffix
: 'super' superSuffix
| Identifier arguments
;
arguments
: '(' expressionList? ')'
;
// LEXER
// §3.9 Keywords
ABSTRACT : 'abstract';
ASSERT : 'assert';
BOOLEAN : 'boolean';
BREAK : 'break';
BYTE : 'byte';
CASE : 'case';
CATCH : 'catch';
CHAR : 'char';
CLASS : 'class';
CONST : 'const';
CONTINUE : 'continue';
DEFAULT : 'default';
DO : 'do';
DOUBLE : 'double';
ELSE : 'else';
ENUM : 'enum';
EXTENDS : 'extends';
FINAL : 'final';
FINALLY : 'finally';
FLOAT : 'float';
FOR : 'for';
IF : 'if';
GOTO : 'goto';
IMPLEMENTS : 'implements';
IMPORT : 'import';
INSTANCEOF : 'instanceof';
INT : 'int';
INTERFACE : 'interface';
LONG : 'long';
NATIVE : 'native';
NEW : 'new';
PACKAGE : 'package';
PRIVATE : 'private';
PROTECTED : 'protected';
PUBLIC : 'public';
RETURN : 'return';
SHORT : 'short';
STATIC : 'static';
STRICTFP : 'strictfp';
SUPER : 'super';
SWITCH : 'switch';
SYNCHRONIZED : 'synchronized';
THIS : 'this';
THROW : 'throw';
THROWS : 'throws';
TRANSIENT : 'transient';
TRY : 'try';
VOID : 'void';
VOLATILE : 'volatile';
WHILE : 'while';
// §3.10.1 Integer Literals
IntegerLiteral
: DecimalIntegerLiteral
| HexIntegerLiteral
| OctalIntegerLiteral
| BinaryIntegerLiteral
;
fragment
DecimalIntegerLiteral
: DecimalNumeral IntegerTypeSuffix?
;
fragment
HexIntegerLiteral
: HexNumeral IntegerTypeSuffix?
;
fragment
OctalIntegerLiteral
: OctalNumeral IntegerTypeSuffix?
;
fragment
BinaryIntegerLiteral
: BinaryNumeral IntegerTypeSuffix?
;
fragment
IntegerTypeSuffix
: [lL]
;
fragment
DecimalNumeral
: '0'
| NonZeroDigit (Digits? | Underscores Digits)
;
fragment
Digits
: Digit (DigitOrUnderscore* Digit)?
;
fragment
Digit
: '0'
| NonZeroDigit
;
fragment
NonZeroDigit
: [1-9]
;
fragment
DigitOrUnderscore
: Digit
| '_'
;
fragment
Underscores
: '_'+
;
fragment
HexNumeral
: '0' [xX] HexDigits
;
fragment
HexDigits
: HexDigit (HexDigitOrUnderscore* HexDigit)?
;
fragment
HexDigit
: [0-9a-fA-F]
;
fragment
HexDigitOrUnderscore
: HexDigit
| '_'
;
fragment
OctalNumeral
: '0' Underscores? OctalDigits
;
fragment
OctalDigits
: OctalDigit (OctalDigitOrUnderscore* OctalDigit)?
;
fragment
OctalDigit
: [0-7]
;
fragment
OctalDigitOrUnderscore
: OctalDigit
| '_'
;
fragment
BinaryNumeral
: '0' [bB] BinaryDigits
;
fragment
BinaryDigits
: BinaryDigit (BinaryDigitOrUnderscore* BinaryDigit)?
;
fragment
BinaryDigit
: [01]
;
fragment
BinaryDigitOrUnderscore
: BinaryDigit
| '_'
;
// §3.10.2 Floating-Point Literals
FloatingPointLiteral
: DecimalFloatingPointLiteral
| HexadecimalFloatingPointLiteral
;
fragment
DecimalFloatingPointLiteral
: Digits '.' Digits? ExponentPart? FloatTypeSuffix?
| '.' Digits ExponentPart? FloatTypeSuffix?
| Digits ExponentPart FloatTypeSuffix?
| Digits FloatTypeSuffix
;
fragment
ExponentPart
: ExponentIndicator SignedInteger
;
fragment
ExponentIndicator
: [eE]
;
fragment
SignedInteger
: Sign? Digits
;
fragment
Sign
: [+-]
;
fragment
FloatTypeSuffix
: [fFdD]
;
fragment
HexadecimalFloatingPointLiteral
: HexSignificand BinaryExponent FloatTypeSuffix?
;
fragment
HexSignificand
: HexNumeral '.'?
| '0' [xX] HexDigits? '.' HexDigits
;
fragment
BinaryExponent
: BinaryExponentIndicator SignedInteger
;
fragment
BinaryExponentIndicator
: [pP]
;
// §3.10.3 Boolean Literals
BooleanLiteral
: 'true'
| 'false'
;
// §3.10.4 Character Literals
CharacterLiteral
: '\'' SingleCharacter '\''
| '\'' EscapeSequence '\''
;
fragment
SingleCharacter
: ~['\\]
;
// §3.10.5 String Literals
StringLiteral
: '"' StringCharacters? '"'
;
fragment
StringCharacters
: StringCharacter+
;
fragment
StringCharacter
: ~["\\]
| EscapeSequence
;
// §3.10.6 Escape Sequences for Character and String Literals
fragment
EscapeSequence
: '\\' [btnfr"'\\]
| OctalEscape
| UnicodeEscape
;
fragment
OctalEscape
: '\\' OctalDigit
| '\\' OctalDigit OctalDigit
| '\\' ZeroToThree OctalDigit OctalDigit
;
fragment
UnicodeEscape
: '\\' 'u' HexDigit HexDigit HexDigit HexDigit
;
fragment
ZeroToThree
: [0-3]
;
// §3.10.7 The Null Literal
NullLiteral
: 'null'
;
// §3.11 Separators
LPAREN : '(';
RPAREN : ')';
LBRACE : '{';
RBRACE : '}';
LBRACK : '[';
RBRACK : ']';
SEMI : ';';
COMMA : ',';
DOT : '.';
// §3.12 Operators
ASSIGN : '=';
GT : '>';
LT : '<';
BANG : '!';
TILDE : '~';
QUESTION : '?';
COLON : ':';
EQUAL : '==';
LE : '<=';
GE : '>=';
NOTEQUAL : '!=';
AND : '&&';
OR : '||';
INC : '++';
DEC : '--';
ADD : '+';
SUB : '-';
MUL : '*';
DIV : '/';
BITAND : '&';
BITOR : '|';
CARET : '^';
MOD : '%';
ADD_ASSIGN : '+=';
SUB_ASSIGN : '-=';
MUL_ASSIGN : '*=';
DIV_ASSIGN : '/=';
AND_ASSIGN : '&=';
OR_ASSIGN : '|=';
XOR_ASSIGN : '^=';
MOD_ASSIGN : '%=';
LSHIFT_ASSIGN : '<<=';
RSHIFT_ASSIGN : '>>=';
URSHIFT_ASSIGN : '>>>=';
// §3.8 Identifiers (must appear after all keywords in the grammar)
Identifier
: JavaLetter JavaLetterOrDigit*
;
fragment
JavaLetter
: [a-zA-Z$_] // these are the "java letters" below 0xFF
| // covers all characters above 0xFF which are not a surrogate
~[\u0000-\u00FF\uD800-\uDBFF]
{Character.isJavaIdentifierStart(_input.LA(-1))}?
| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF
[\uD800-\uDBFF] [\uDC00-\uDFFF]
{Character.isJavaIdentifierStart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}?
;
fragment
JavaLetterOrDigit
: [a-zA-Z0-9$_] // these are the "java letters or digits" below 0xFF
| // covers all characters above 0xFF which are not a surrogate
~[\u0000-\u00FF\uD800-\uDBFF]
{Character.isJavaIdentifierPart(_input.LA(-1))}?
| // covers UTF-16 surrogate pairs encodings for U+10000 to U+10FFFF
[\uD800-\uDBFF] [\uDC00-\uDFFF]
{Character.isJavaIdentifierPart(Character.toCodePoint((char)_input.LA(-2), (char)_input.LA(-1)))}?
;
//
// Additional symbols not defined in the lexical specification
//
AT : '@';
ELLIPSIS : '...';
//
// Whitespace and comments
//
WS : [ \t\r\n\u000C]+ -> skip
;
COMMENT
: '/*' .*? '*/' -> skip
;
LINE_COMMENT
: '//' ~[\r\n]* -> skip
;