Added PCRE.g4

2022-12-14 18:31:54 +03:00 · 2022-12-14 18:31:54 +03:00 · 2c5ddac650
parent aecb860397
commit 2c5ddac650
1 changed files with 754 additions and 0 deletions
--- a/text-processing/src/main/antlr4/ru/roboswag/textprocessing/pcre/PCRE.g4
+++ b/text-processing/src/main/antlr4/ru/roboswag/textprocessing/pcre/PCRE.g4
@ -0,0 +1,754 @@
 /*
 * Copyright (c) 2014-2022 by Bart Kiers
 *
 * The MIT license.
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 * Project      : PCRE Parser, an ANTLR 4 grammar for PCRE
 * Developed by : Bart Kiers, bart@big-o.nl
 * Also see     : https://github.com/bkiers/pcre-parser
 */
 grammar PCRE;
 // Most single line comments above the lexer- and  parser rules 
 // are copied from the official PCRE man pages (last updated: 
 // 10 January 2012): http://www.pcre.org/pcre.txt
 // Entry rule: a whole pattern is one alternation that must reach end of input.
 parse : alternation EOF ;
 // ALTERNATION
 //
 //         expr|expr|expr...
 // One or more branches separated by `|`; every branch is an `expr`.
 alternation : expr ( '|' expr )* ;
 // A single branch: zero or more elements, so an empty branch is allowed.
 expr : element* ;
 // An atom together with its optional repetition quantifier.
 element : atom quantifier? ;
 // QUANTIFIERS
 //
 //         ?           0 or 1, greedy
 //         ?+          0 or 1, possessive
 //         ??          0 or 1, lazy
 //         *           0 or more, greedy
 //         *+          0 or more, possessive
 //         *?          0 or more, lazy
 //         +           1 or more, greedy
 //         ++          1 or more, possessive
 //         +?          1 or more, lazy
 //         {n}         exactly n
 //         {n,m}       at least n, no more than m, greedy
 //         {n,m}+      at least n, no more than m, possessive
 //         {n,m}?      at least n, no more than m, lazy
 //         {n,}        n or more, greedy
 //         {n,}+       n or more, possessive
 //         {n,}?       n or more, lazy
 // A repetition operator followed by its greediness modifier
 // (`quantifier_type`, which may be empty).
 quantifier
  : ( '?' | '+' | '*' ) quantifier_type               // ?  +  *
  | '{' number ( ',' number? )? '}' quantifier_type   // {n}  {n,}  {n,m}
  ;
 // Optional modifier after a quantifier: `+` (possessive), `?` (lazy),
 // or nothing at all (greedy).
 quantifier_type
  : ( '+' | '?' )?
  ;
 // CHARACTER CLASSES
 //
 //         [...]       positive character class
 //         [^...]      negative character class
 //         [x-y]       range (can be used for hex characters)
 //         [[:xxx:]]   positive POSIX named set
 //         [[:^xxx:]]  negative POSIX named set
 //
 //         alnum       alphanumeric
 //         alpha       alphabetic
 //         ascii       0-127
 //         blank       space or tab
 //         cntrl       control character
 //         digit       decimal digit
 //         graph       printing, excluding space
 //         lower       lower case letter
 //         print       printing, including space
 //         punct       printing, excluding alphanumeric
 //         space       white space
 //         upper       upper case letter
 //         word        same as \w
 //         xdigit      hexadecimal digit
 //
 //       In PCRE, POSIX character set names recognize only ASCII  characters  by
 //       default,  but  some  of them use Unicode properties if PCRE_UCP is set.
 //       You can use \Q...\E inside a character class.
 // One bracketed character class. The first three alternatives are the
 // negated forms (`[^...]`), the last three the positive forms. Within each
 // group, a `]` (CharacterClassEnd) appearing directly after the opening is
 // accepted as a member of the class instead of closing it.
 character_class
  : '[' '^' CharacterClassEnd Hyphen cc_atom+ ']'   // [^]-...]  leading `]`, explicit hyphen, then atoms
  | '[' '^' CharacterClassEnd cc_atom* ']'          // [^]...]   leading `]`, then zero or more atoms
  | '[' '^' cc_atom+ ']'                            // [^...]    at least one atom
  | '[' CharacterClassEnd Hyphen cc_atom+ ']'       // []-...]   leading `]`, explicit hyphen, then atoms
  | '[' CharacterClassEnd cc_atom* ']'              // []...]    leading `]`, then zero or more atoms
  | '[' cc_atom+ ']'                                // [...]     at least one atom
  ;
 // BACKREFERENCES
 //
 //         \n              reference by number (can be ambiguous)
 //         \gn             reference by number
 //         \g{n}           reference by number
 //         \g{-n}          relative reference by number
 //         \k<name>        reference by name (Perl)
 //         \k'name'        reference by name (Perl)
 //         \g{name}        reference by name (Perl)
 //         \k{name}        reference by name (.NET)
 //         (?P=name)       reference by name (Python)
 // All supported back-reference spellings. `\g` and `\k` arrive as their own
 // lexer tokens; the braces, angle brackets and quotes around them are
 // separate tokens.
 backreference
  : backreference_or_octal        // \n (may also be an octal escape)
  | '\\g' number                  // \gn
  | '\\g' '{' number '}'          // \g{n}
  | '\\g' '{' '-' number '}'      // \g{-n}
  | '\\k' '<' name '>'            // \k<name>
  | '\\k' '\'' name '\''          // \k'name'
  | '\\g' '{' name '}'            // \g{name}
  | '\\k' '{' name '}'            // \k{name}
  | '(' '?' 'P' '=' name ')'      // (?P=name)
  ;
 // Either an octal escape (see `octal_char`) or a backslash followed by a
 // single decimal digit.
 backreference_or_octal : octal_char | Backslash digit ;
 // CAPTURING
 //
 //         (...)           capturing group
 //         (?<name>...)    named capturing group (Perl)
 //         (?'name'...)    named capturing group (Perl)
 //         (?P<name>...)   named capturing group (Python)
 //         (?:...)         non-capturing group
 //         (?|...)         non-capturing group; reset group numbers for
 //                          capturing groups in each alternative
 //
 // ATOMIC GROUPS
 //
 //         (?>...)         atomic, non-capturing group
 // Capturing groups: three named spellings followed by the plain,
 // unnamed group.
 capture
  : '(' '?' '<' name '>' alternation ')'       // (?<name>...)
  | '(' '?''\'' name '\'' alternation ')'      // (?'name'...)
  | '(' '?' 'P' '<' name '>' alternation ')'   // (?P<name>...)
  | '(' alternation ')'                        // (...)
  ;
 // Groups that do not capture: `(?:...)`, `(?|...)`, the atomic `(?>...)`,
 // and the flag-prefixed form such as `(?i:...)`.
 non_capture
  : '(' '?' ( ':' | '|' | '>' | option_flags ':' ) alternation ')'
  ;
 // COMMENT
 //
 //         (?#....)        comment (not nestable)
 // Inline comment: `(?#` followed by anything up to the next `)`.
 // The body is optional so that the empty comment `(?#)` is accepted too.
 comment
  : '(' '?' '#' non_close_parens? ')'
  ;
 // OPTION SETTING
 //
 //         (?i)            caseless
 //         (?J)            allow duplicate names
 //         (?m)            multiline
 //         (?s)            single line (dotall)
 //         (?U)            default ungreedy (lazy)
 //         (?x)            extended (ignore white space)
 //         (?-...)         unset option(s)
 //
 //       The following are recognized only at the start of a  pattern  or  after
 //       one of the newline-setting options with similar syntax:
 //
 //         (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE)
 //         (*UTF8)         set UTF-8 mode: 8-bit library (PCRE_UTF8)
 //         (*UTF16)        set UTF-16 mode: 16-bit library (PCRE_UTF16)
 //         (*UCP)          set PCRE_UCP (use Unicode properties for \d etc)
 // Inline option settings. The `(*...)` keywords are written one character
 // at a time because every letter and digit is its own lexer token.
 option
  : '(' '?' option_flags '-' option_flags ')'                       // set some flags, unset others
  | '(' '?' option_flags ')'                                        // set flags
  | '(' '?' '-' option_flags ')'                                    // unset flags
  | '(' '*' 'N' 'O' '_' 'S' 'T' 'A' 'R' 'T' '_' 'O' 'P' 'T' ')'     // (*NO_START_OPT)
  | '(' '*' 'U' 'T' 'F' '8' ')'                                     // (*UTF8)
  | '(' '*' 'U' 'T' 'F' '1' '6' ')'                                 // (*UTF16)
  | '(' '*' 'U' 'C' 'P' ')'                                         // (*UCP)
  ;
 // A non-empty run of option letters.
 option_flags : option_flag+ ;
 // The individual option letters accepted inside `(?...)`.
 option_flag : 'i' | 'J' | 'm' | 's' | 'U' | 'x' ;
 // LOOKAHEAD AND LOOKBEHIND ASSERTIONS
 //
 //         (?=...)         positive look ahead
 //         (?!...)         negative look ahead
 //         (?<=...)        positive look behind
 //         (?<!...)        negative look behind
 //
 //       Each top-level branch of a look behind must be of a fixed length.
 // Lookahead `(?=...)` / `(?!...)` and, with a `<` in front of the sign,
 // lookbehind `(?<=...)` / `(?<!...)`.
 look_around
  : '(' '?' '<'? ( '=' | '!' ) alternation ')'
  ;
 // SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
 //
 //         (?R)            recurse whole pattern
 //         (?n)            call subpattern by absolute number
 //         (?+n)           call subpattern by relative number
 //         (?-n)           call subpattern by relative number
 //         (?&name)        call subpattern by name (Perl)
 //         (?P>name)       call subpattern by name (Python)
 //         \g<name>        call subpattern by name (Oniguruma)
 //         \g'name'        call subpattern by name (Oniguruma)
 //         \g<n>           call subpattern by absolute number (Oniguruma)
 //         \g'n'           call subpattern by absolute number (Oniguruma)
 //         \g<+n>          call subpattern by relative number (PCRE extension)
 //         \g'+n'          call subpattern by relative number (PCRE extension)
 //         \g<-n>          call subpattern by relative number (PCRE extension)
 //         \g'-n'          call subpattern by relative number (PCRE extension)
 // Subroutine calls, in both the parenthesised `(?...)` spellings and the
 // `\g<...>` / `\g'...'` spellings. `\g` arrives as a single lexer token.
 subroutine_reference
  : '(' '?' 'R' ')'                  // (?R)
  | '(' '?' number ')'               // (?n)
  | '(' '?' '+' number ')'           // (?+n)
  | '(' '?' '-' number ')'           // (?-n)
  | '(' '?' '&' name ')'             // (?&name)
  | '(' '?' 'P' '>' name ')'         // (?P>name)
  | '\\g' '<' name '>'               // \g<name>
  | '\\g' '\'' name '\''             // \g'name'
  | '\\g' '<' number '>'             // \g<n>
  | '\\g' '\'' number '\''           // \g'n'
  | '\\g' '<' '+' number '>'         // \g<+n>
  | '\\g' '\'' '+' number '\''       // \g'+n'
  | '\\g' '<' '-' number '>'         // \g<-n>
  | '\\g' '\'' '-' number '\''       // \g'-n'
  ;
 // CONDITIONAL PATTERNS
 //
 //         (?(condition)yes-pattern)
 //         (?(condition)yes-pattern|no-pattern)
 //
 //         (?(n)...        absolute reference condition
 //         (?(+n)...       relative reference condition
 //         (?(-n)...       relative reference condition
 //         (?(<name>)...   named reference condition (Perl)
 //         (?('name')...   named reference condition (Perl)
 //         (?(name)...     named reference condition (PCRE)
 //         (?(R)...        overall recursion condition
 //         (?(Rn)...       specific group recursion condition
 //         (?(R&name)...   specific recursion condition
 //         (?(DEFINE)...   define subpattern for reference
 //         (?(assert)...   assertion condition
 // Conditional group: `(?(condition)yes-pattern)` with an optional
 // `|no-pattern`. The condition is a group reference, a recursion check,
 // DEFINE, or a lookaround assertion.
 // NOTE(review): `alternation` itself accepts `|`, so the yes/no branches
 // appear to end up inside the first `alternation` child rather than in the
 // optional `('|' alternation)` part -- confirm before relying on the latter.
 conditional
  : '(' '?' '(' number ')' alternation ('|' alternation)? ')'                    // (?(n)...
  | '(' '?' '(' '+' number ')' alternation ('|' alternation)? ')'                // (?(+n)...
  | '(' '?' '(' '-' number ')' alternation ('|' alternation)? ')'                // (?(-n)...
  | '(' '?' '(' '<' name '>' ')' alternation ('|' alternation)? ')'              // (?(<name>)...
  | '(' '?' '(' '\'' name '\'' ')' alternation ('|' alternation)? ')'            // (?('name')...
  | '(' '?' '(' 'R' number ')' alternation ('|' alternation)? ')'                // (?(Rn)...
  | '(' '?' '(' 'R' ')' alternation ('|' alternation)? ')'                       // (?(R)...
  | '(' '?' '(' 'R' '&' name ')' alternation ('|' alternation)? ')'              // (?(R&name)...
  | '(' '?' '(' 'D' 'E' 'F' 'I' 'N' 'E' ')' alternation ('|' alternation)? ')'   // (?(DEFINE)...
  | '(' '?' '(' 'a' 's' 's' 'e' 'r' 't' ')' alternation ('|' alternation)? ')'   // literal word "assert", kept for compatibility
  | '(' '?' '(' name ')' alternation ('|' alternation)? ')'                      // (?(name)...
  // Assertion condition: the condition is itself a lookaround group,
  // e.g. (?(?=x)yes|no) or (?(?<!x)yes|no).
  | '(' '?' look_around alternation ('|' alternation)? ')'
  ;
 // BACKTRACKING CONTROL
 //
 //       The following act immediately they are reached:
 //
 //         (*ACCEPT)       force successful match
 //         (*FAIL)         force backtrack; synonym (*F)
 //         (*MARK:NAME)    set name to be passed back; synonym (*:NAME)
 //
 //       The  following  act only when a subsequent match failure causes a back-
 //       track to reach them. They all force a match failure, but they differ in
 //       what happens afterwards. Those that advance the start-of-match point do
 //       so only if the pattern is not anchored.
 //
 //         (*COMMIT)       overall failure, no advance of starting point
 //         (*PRUNE)        advance to next starting character
 //         (*PRUNE:NAME)   equivalent to (*MARK:NAME)(*PRUNE)
 //         (*SKIP)         advance to current matching position
 //         (*SKIP:NAME)    advance to position corresponding to an earlier
 //                         (*MARK:NAME); if not found, the (*SKIP) is ignored
 //         (*THEN)         local failure, backtrack to next alternation
 //         (*THEN:NAME)    equivalent to (*MARK:NAME)(*THEN)
 // Backtracking control verbs. Verb keywords are written one character at a
 // time because every letter is its own lexer token. For the `:NAME` forms
 // the name is any run of tokens up to the closing parenthesis, so
 // `(*MARK:foo)` is accepted and not only the literal text `NAME`.
 backtrack_control
  : '(' '*' 'A' 'C' 'C' 'E' 'P' 'T' ')'                     // (*ACCEPT)
  | '(' '*' 'F' ('A' 'I' 'L')? ')'                          // (*F) / (*FAIL)
  | '(' '*' ('M' 'A' 'R' 'K')? ':' non_close_parens ')'     // (*MARK:name) / (*:name)
  | '(' '*' 'C' 'O' 'M' 'M' 'I' 'T' ')'                     // (*COMMIT)
  | '(' '*' 'P' 'R' 'U' 'N' 'E' ')'                         // (*PRUNE)
  | '(' '*' 'P' 'R' 'U' 'N' 'E' ':' non_close_parens ')'    // (*PRUNE:name)
  | '(' '*' 'S' 'K' 'I' 'P' ')'                             // (*SKIP)
  | '(' '*' 'S' 'K' 'I' 'P' ':' non_close_parens ')'        // (*SKIP:name)
  | '(' '*' 'T' 'H' 'E' 'N' ')'                             // (*THEN)
  | '(' '*' 'T' 'H' 'E' 'N' ':' non_close_parens ')'        // (*THEN:name)
  ;
 // NEWLINE CONVENTIONS
 //
 //       These are recognized only at the very start of the pattern or  after  a
 //       (*BSR_...), (*UTF8), (*UTF16) or (*UCP) option.
 //
 //         (*CR)           carriage return only
 //         (*LF)           linefeed only
 //         (*CRLF)         carriage return followed by linefeed
 //         (*ANYCRLF)      all three of the above
 //         (*ANY)          any Unicode newline sequence
 //
 // WHAT \R MATCHES
 //
 //       These  are  recognized only at the very start of the pattern or after a
 //       (*...) option that sets the newline convention or a UTF or UCP mode.
 //
 //         (*BSR_ANYCRLF)  CR, LF, or CRLF
 //         (*BSR_UNICODE)  any Unicode newline sequence
 // Newline-convention and `\R`-behaviour settings, written one character at
 // a time because every letter is its own lexer token.
 newline_convention
  : '(' '*' 'C' 'R' ')'                                          // (*CR)
  | '(' '*' 'L' 'F' ')'                                          // (*LF)
  | '(' '*' 'C' 'R' 'L' 'F' ')'                                  // (*CRLF)
  | '(' '*' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'                      // (*ANYCRLF)
  | '(' '*' 'A' 'N' 'Y' ')'                                      // (*ANY)
  | '(' '*' 'B' 'S' 'R' '_' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'      // (*BSR_ANYCRLF)
  | '(' '*' 'B' 'S' 'R' '_' 'U' 'N' 'I' 'C' 'O' 'D' 'E' ')'      // (*BSR_UNICODE)
  ;
 // CALLOUTS
 //
 //         (?C)      callout
 //         (?Cn)     callout with data n
 // Callout point `(?C)`, optionally carrying a number: `(?Cn)`.
 callout
  : '(' '?' 'C' number? ')'
  ;
 // One matchable unit outside a character class. The alternatives are kept
 // in their original order; only the layout is grouped.
 atom
  // composite constructs, delegated to their own rules
  : subroutine_reference | shared_atom | literal | character_class
  | capture | non_capture | comment | option | look_around
  | backreference | conditional | backtrack_control
  | newline_convention | callout
  // single-token atoms
  | Dot | Caret
  | StartOfSubject | WordBoundary | NonWordBoundary
  | EndOfSubjectOrLine | EndOfSubjectOrLineEndOfSubject | EndOfSubject
  | PreviousMatchInSubject | ResetStartMatch
  | OneDataUnit | ExtendedUnicodeChar
  ;
 // One member of a character class. The range form is listed first, ahead
 // of the single-literal form that shares its prefix.
 cc_atom
  : cc_literal Hyphen cc_literal   // range, e.g. a-z
  | shared_atom                    // escapes and sets also usable outside a class
  | cc_literal                     // single literal character
  | backreference_or_octal // only octal is valid in a cc
  ;
 // Atoms accepted both inside and outside a character class (referenced
 // from `atom` and `cc_atom`). Original alternative order is preserved.
 shared_atom
  : POSIXNamedSet | POSIXNegatedNamedSet
  | ControlChar
  | DecimalDigit | NotDecimalDigit
  | HorizontalWhiteSpace | NotHorizontalWhiteSpace
  | NotNewLine
  | CharWithProperty | CharWithoutProperty
  | NewLineSequence
  | WhiteSpace | NotWhiteSpace
  | VerticalWhiteSpace | NotVerticalWhiteSpace
  | WordChar | NotWordChar
  | Backslash . // will match "unfinished" escape sequences, like `\x`
  ;
 // Literal outside a character class: any shared literal, plus a bare `]`.
 literal : shared_literal | CharacterClassEnd ;
 // Literal inside a character class: any shared literal, plus the tokens
 // listed here, which `cc_atom` treats as ordinary class members.
 cc_literal
  : shared_literal
  | Dot | CharacterClassStart | Caret
  | QuestionMark | Plus | Star
  | WordBoundary | EndOfSubjectOrLine
  | Pipe | OpenParen | CloseParen
  ;
 // Literals valid both inside and outside a character class. Original
 // alternative order is preserved; only the layout is grouped.
 shared_literal
  // alphanumerics and octal escapes
  : octal_char | letter | digit
  // single-character escapes and quoting
  | BellChar | EscapeChar | FormFeed | NewLine | CarriageReturn | Tab
  | HexChar | Quoted | BlockQuoted
  // punctuation tokens taken literally
  | OpenBrace | CloseBrace | Comma | Hyphen
  | LessThan | GreaterThan | SingleQuote | Underscore
  | Colon | Hash | Equals | Exclamation | Ampersand
  // everything else
  | OtherChar
  ;
 // A decimal number; thin wrapper around `digits`.
 number : digits ;
 // Octal escape: a backslash followed either by three octal digits whose
 // first digit is 0-3, or by exactly two octal digits.
 octal_char
  : ( Backslash (D0 | D1 | D2 | D3) octal_digit octal_digit
    | Backslash octal_digit octal_digit                     
    )
  ;
 // Any one of the digit tokens 0 through 7.
 octal_digit
  : D0 | D1 | D2 | D3
  | D4 | D5 | D6 | D7
  ;
 // One or more decimal digits.
 digits : digit+ ;
 // Any one of the digit tokens 0 through 9.
 digit
  : D0 | D1 | D2 | D3 | D4
  | D5 | D6 | D7 | D8 | D9
  ;
 // A group name; thin wrapper around `alpha_nums`.
 name : alpha_nums ;
 // Identifier shape: starts with a letter or underscore, then any mix of
 // letters, underscores and digits.
 alpha_nums
  : ( letter | Underscore )
    ( letter | Underscore | digit )*
  ;
 // One or more tokens, none of which is a closing parenthesis.
 non_close_parens : non_close_paren+ ;
 // Any single token except `)`.
 non_close_paren : ~CloseParen ;
 // Any single ASCII letter token: lower case first, then upper case.
 letter
  : ALC | BLC | CLC | DLC | ELC | FLC | GLC | HLC | ILC
  | JLC | KLC | LLC | MLC | NLC | OLC | PLC | QLC | RLC
  | SLC | TLC | ULC | VLC | WLC | XLC | YLC | ZLC
  | AUC | BUC | CUC | DUC | EUC | FUC | GUC | HUC | IUC
  | JUC | KUC | LUC | MUC | NUC | OUC | PUC | QUC | RUC
  | SUC | TUC | UUC | VUC | WUC | XUC | YUC | ZUC
  ;
 // QUOTING
 //
 //         \x         where x is non-alphanumeric is a literal x
 //         \Q...\E    treat enclosed characters as literal
 // Quoted: a backslash followed by exactly one non-alphanumeric character.
 Quoted      : '\\' NonAlphaNumeric;
 // BlockQuoted: everything from `\Q` up to the nearest `\E` (non-greedy).
 BlockQuoted : '\\Q' .*? '\\E';
 // CHARACTERS
 //
 //         \a         alarm, that is, the BEL character (hex 07)
 //         \cx        "control-x", where x is any ASCII character
 //         \e         escape (hex 1B)
 //         \f         form feed (hex 0C)
 //         \n         newline (hex 0A)
 //         \r         carriage return (hex 0D)
 //         \t         tab (hex 09)
 //         \ddd       character with octal code ddd, or backreference
 //         \xhh       character with hex code hh
 //         \x{hhh..}  character with hex code hhh..
 // Single-character escape tokens. Each is the two-character sequence
 // shown in its literal.
 BellChar       : '\\a';
 ControlChar    : '\\c' ASCII?;   // the character after \c is optional in this rule
 EscapeChar     : '\\e';
 FormFeed       : '\\f';
 NewLine        : '\\n';
 CarriageReturn : '\\r';
 Tab            : '\\t';
 // A lone backslash; parser rules such as octal_char and shared_atom build on it.
 Backslash      : '\\';
 // Hex escape: `\xhh` with exactly two hex digits, or the braced form
 // `\x{h..}` with one or more hex digits, so `\x{A}` and `\x{41}` are
 // tokenised as hex characters as well as longer code points.
 HexChar        : '\\x' ( HexDigit HexDigit
                        | '{' HexDigit+ '}'
                        )
                ;
 // CHARACTER TYPES
 //
 //         .          any character except newline;
 //                      in dotall mode, any character whatsoever
 //         \C         one data unit, even in UTF mode (best avoided)
 //         \d         a decimal digit
 //         \D         a character that is not a decimal digit
 //         \h         a horizontal white space character
 //         \H         a character that is not a horizontal white space character
 //         \N         a character that is not a newline
 //         \p{xx}     a character with the xx property
 //         \P{xx}     a character without the xx property
 //         \R         a newline sequence
 //         \s         a white space character
 //         \S         a character that is not a white space character
 //         \v         a vertical white space character
 //         \V         a character that is not a vertical white space character
 //         \w         a "word" character
 //         \W         a "non-word" character
 //         \X         an extended Unicode sequence
 //
 //       In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
 //       characters, even in a UTF mode. However, this can be changed by setting
 //       the PCRE_UCP option.
 // Character-type tokens. Each is a fixed two-character escape, except Dot
 // (a single `.`) and the two property tokens, which also consume the
 // braces and a property name made of letters, digits and underscores.
 Dot                     : '.';
 OneDataUnit             : '\\C';
 DecimalDigit            : '\\d';
 NotDecimalDigit         : '\\D';
 HorizontalWhiteSpace    : '\\h';
 NotHorizontalWhiteSpace : '\\H';
 NotNewLine              : '\\N';
 CharWithProperty        : '\\p{' UnderscoreAlphaNumerics '}';
 CharWithoutProperty     : '\\P{' UnderscoreAlphaNumerics '}';
 NewLineSequence         : '\\R';
 WhiteSpace              : '\\s';
 NotWhiteSpace           : '\\S';
 VerticalWhiteSpace      : '\\v';
 NotVerticalWhiteSpace   : '\\V';
 WordChar                : '\\w';
 NotWordChar             : '\\W';
 ExtendedUnicodeChar     : '\\X';
 // CHARACTER CLASSES
 //
 //         [...]       positive character class
 //         [^...]      negative character class
 //         [x-y]       range (can be used for hex characters)
 //         [[:xxx:]]   positive POSIX named set
 //         [[:^xxx:]]  negative POSIX named set
 //
 //         alnum       alphanumeric
 //         alpha       alphabetic
 //         ascii       0-127
 //         blank       space or tab
 //         cntrl       control character
 //         digit       decimal digit
 //         graph       printing, excluding space
 //         lower       lower case letter
 //         print       printing, including space
 //         punct       printing, excluding alphanumeric
 //         space       white space
 //         upper       upper case letter
 //         word        same as \w
 //         xdigit      hexadecimal digit
 //
 //       In PCRE, POSIX character set names recognize only ASCII  characters  by
 //       default,  but  some  of them use Unicode properties if PCRE_UCP is set.
 //       You can use \Q...\E inside a character class.
 // Character-class punctuation. The two POSIX tokens each match a complete
 // `[[:name:]]` / `[[:^name:]]` sequence, outer brackets included, as one
 // token.
 CharacterClassStart  : '[';
 CharacterClassEnd    : ']';
 Caret                : '^';
 Hyphen               : '-';
 POSIXNamedSet        : '[[:' AlphaNumerics ':]]';
 POSIXNegatedNamedSet : '[[:^' AlphaNumerics ':]]';
 // Quantifier punctuation.
 QuestionMark : '?';
 Plus         : '+';
 Star         : '*';
 OpenBrace    : '{';
 CloseBrace   : '}';
 Comma        : ',';
 // ANCHORS AND SIMPLE ASSERTIONS
 //
 //         \b          word boundary
 //         \B          not a word boundary
 //         ^           start of subject
 //                      also after internal newline in multiline mode
 //         \A          start of subject
 //         $           end of subject
 //                      also before newline at end of subject
 //                      also before internal newline in multiline mode
 //         \Z          end of subject
 //                      also before newline at end of subject
 //         \z          end of subject
 //         \G          first matching position in subject
 // Anchor and simple-assertion tokens: fixed two-character escapes, plus `$`.
 WordBoundary                   : '\\b';
 NonWordBoundary                : '\\B';
 StartOfSubject                 : '\\A'; 
 EndOfSubjectOrLine             : '$';
 EndOfSubjectOrLineEndOfSubject : '\\Z'; 
 EndOfSubject                   : '\\z'; 
 PreviousMatchInSubject         : '\\G';
 // MATCH POINT RESET
 //
 //         \K          reset start of match
 ResetStartMatch : '\\K';
 // `\g` and `\k` prefixes, used by the backreference and
 // subroutine_reference parser rules.
 SubroutineOrNamedReferenceStartG : '\\g';
 NamedReferenceStartK             : '\\k';
 // Single-character punctuation tokens.
 Pipe        : '|';
 OpenParen   : '(';
 CloseParen  : ')';
 LessThan    : '<';
 GreaterThan : '>';
 SingleQuote : '\'';
 Underscore  : '_';
 Colon       : ':';
 Hash        : '#';
 Equals      : '=';
 Exclamation : '!';
 Ampersand   : '&';
 // One token per ASCII letter: xLC = lower case, xUC = upper case.
 // The `letter` parser rule combines them, and parser rules that spell
 // keywords letter by letter (e.g. 'U' 'T' 'F') rely on these tokens.
 ALC : 'a';
 BLC : 'b';
 CLC : 'c';
 DLC : 'd';
 ELC : 'e';
 FLC : 'f';
 GLC : 'g';
 HLC : 'h';
 ILC : 'i';
 JLC : 'j';
 KLC : 'k';
 LLC : 'l';
 MLC : 'm';
 NLC : 'n';
 OLC : 'o';
 PLC : 'p';
 QLC : 'q';
 RLC : 'r';
 SLC : 's';
 TLC : 't';
 ULC : 'u';
 VLC : 'v';
 WLC : 'w';
 XLC : 'x';
 YLC : 'y';
 ZLC : 'z';
 AUC : 'A';
 BUC : 'B';
 CUC : 'C';
 DUC : 'D';
 EUC : 'E';
 FUC : 'F';
 GUC : 'G';
 HUC : 'H';
 IUC : 'I';
 JUC : 'J';
 KUC : 'K';
 LUC : 'L';
 MUC : 'M';
 NUC : 'N';
 OUC : 'O';
 PUC : 'P';
 QUC : 'Q';
 RUC : 'R';
 SUC : 'S';
 TUC : 'T';
 UUC : 'U';
 VUC : 'V';
 WUC : 'W';
 XUC : 'X';
 YUC : 'Y';
 ZUC : 'Z';
 // One token per decimal digit (D0 is declared last). The `digit` and
 // `octal_digit` parser rules combine them.
 D1 : '1';
 D2 : '2';
 D3 : '3';
 D4 : '4';
 D5 : '5';
 D6 : '6';
 D7 : '7';
 D8 : '8';
 D9 : '9';
 D0 : '0';
 // Catch-all: matches any single character. It is declared after every
 // other token rule, so the single-character tokens above win on ties.
 OtherChar : . ;
 // fragments
 // Helper fragments: usable only inside other lexer rules, never emitted
 // as tokens themselves.
 fragment UnderscoreAlphaNumerics : ('_' | AlphaNumeric)+;   // one or more of [a-zA-Z0-9_]
 fragment AlphaNumerics           : AlphaNumeric+;           // one or more of [a-zA-Z0-9]
 fragment AlphaNumeric            : [a-zA-Z0-9];
 fragment NonAlphaNumeric         : ~[a-zA-Z0-9];            // any character that is not an ASCII letter or digit
 fragment HexDigit                : [0-9a-fA-F];
 fragment ASCII                   : [\u0000-\u007F];         // code points 0-127