Added PCRE.g4

2022-12-14 18:31:54 +03:00 · 2022-12-14 18:31:54 +03:00 · 2c5ddac650
parent aecb860397
commit 2c5ddac650
1 changed files with 754 additions and 0 deletions
--- a/text-processing/src/main/antlr4/ru/roboswag/textprocessing/pcre/PCRE.g4
+++ b/text-processing/src/main/antlr4/ru/roboswag/textprocessing/pcre/PCRE.g4
@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2014-2022 by Bart Kiers
+ *
+ * The MIT license.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Project      : PCRE Parser, an ANTLR 4 grammar for PCRE
+ * Developed by : Bart Kiers, bart@big-o.nl
+ * Also see     : https://github.com/bkiers/pcre-parser
+ */
+grammar PCRE;
+
+// Most single-line comments above the lexer and parser rules
+// are copied from the official PCRE man pages (last updated:
+// 10 January 2012): http://www.pcre.org/pcre.txt
+// Entry rule: a complete pattern is a single alternation that must be
+// followed by EOF, so input with unparsed trailing tokens is rejected.
+parse
+ : alternation EOF
+ ;
+
+// ALTERNATION
+//
+//         expr|expr|expr...
+// One or more branches separated by '|'. Each branch is an `expr`, which
+// may be empty, so patterns such as `a|` and `|b` are accepted.
+alternation
+ : expr ('|' expr)*
+ ;
+
+// A single branch of an alternation: zero or more elements in sequence.
+expr
+ : element*
+ ;
+
+// One atom, optionally followed by a quantifier.
+element
+ : atom quantifier?
+ ;
+
+// QUANTIFIERS
+//
+//         ?           0 or 1, greedy
+//         ?+          0 or 1, possessive
+//         ??          0 or 1, lazy
+//         *           0 or more, greedy
+//         *+          0 or more, possessive
+//         *?          0 or more, lazy
+//         +           1 or more, greedy
+//         ++          1 or more, possessive
+//         +?          1 or more, lazy
+//         {n}         exactly n
+//         {n,m}       at least n, no more than m, greedy
+//         {n,m}+      at least n, no more than m, possessive
+//         {n,m}?      at least n, no more than m, lazy
+//         {n,}        n or more, greedy
+//         {n,}+       n or more, possessive
+//         {n,}?       n or more, lazy
+// A repetition operator followed by its greediness marker.
+//   - first alternative : the single-character operators ?, + and *
+//   - second alternative: the counted forms {n}, {n,} and {n,m}
+// In every form `quantifier_type` supplies the trailing '+' (possessive),
+// '?' (lazy) or nothing (greedy).
+quantifier
+ : ('?' | '+' | '*') quantifier_type
+ | '{' number (',' number?)? '}' quantifier_type
+ ;
+
+// Greediness marker that follows a quantifier: '+' makes it possessive,
+// '?' makes it lazy, and the empty alternative leaves it greedy.
+quantifier_type
+ : '+'
+ | '?'
+ | /* nothing */
+ ;
+
+// CHARACTER CLASSES
+//
+//         [...]       positive character class
+//         [^...]      negative character class
+//         [x-y]       range (can be used for hex characters)
+//         [[:xxx:]]   positive POSIX named set
+//         [[:^xxx:]]  negative POSIX named set
+//
+//         alnum       alphanumeric
+//         alpha       alphabetic
+//         ascii       0-127
+//         blank       space or tab
+//         cntrl       control character
+//         digit       decimal digit
+//         graph       printing, excluding space
+//         lower       lower case letter
+//         print       printing, including space
+//         punct       printing, excluding alphanumeric
+//         space       white space
+//         upper       upper case letter
+//         word        same as \w
+//         xdigit      hexadecimal digit
+//
+//       In PCRE, POSIX character set names recognize only ASCII  characters  by
+//       default,  but  some  of them use Unicode properties if PCRE_UCP is set.
+//       You can use \Q...\E inside a character class.
+// The six alternatives are the negated ('[' '^') and non-negated ('[')
+// variants of three shapes:
+//   1. a leading ']' followed by '-' and at least one more member,
+//   2. a leading ']' followed by zero or more members,
+//   3. one or more members with no leading ']'.
+// A ']' that comes directly after '[' or '[^' is therefore consumed as a
+// member of the class instead of closing it. An empty class `[]` is not
+// matched by any alternative.
+character_class
+ : '[' '^' CharacterClassEnd Hyphen cc_atom+ ']'
+ | '[' '^' CharacterClassEnd cc_atom* ']'
+ | '[' '^' cc_atom+ ']'
+ | '[' CharacterClassEnd Hyphen cc_atom+ ']'
+ | '[' CharacterClassEnd cc_atom* ']'
+ | '[' cc_atom+ ']'
+ ;
+
+// BACKREFERENCES
+//
+//         \n              reference by number (can be ambiguous)
+//         \gn             reference by number
+//         \g{n}           reference by number
+//         \g{-n}          relative reference by number
+//         \k<name>        reference by name (Perl)
+//         \k'name'        reference by name (Perl)
+//         \g{name}        reference by name (Perl)
+//         \k{name}        reference by name (.NET)
+//         (?P=name)       reference by name (Python)
+// The first alternative delegates to `backreference_or_octal`, because a
+// backslash followed by digits is matched by one shared rule whether it is
+// meant as a numeric backreference or as an octal escape. The remaining
+// alternatives are the \g, \k and (?P=name) forms from the table above.
+backreference
+ : backreference_or_octal
+ | '\\g' number
+ | '\\g' '{' number '}'
+ | '\\g' '{' '-' number '}'
+ | '\\k' '<' name '>'
+ | '\\k' '\'' name '\''
+ | '\\g' '{' name '}'
+ | '\\k' '{' name '}'
+ | '(' '?' 'P' '=' name ')'
+ ;
+
+// Either an octal escape (`octal_char`: backslash plus two or three octal
+// digits) or a backslash followed by exactly one decimal digit. The rule is
+// also referenced from `cc_atom`.
+backreference_or_octal
+ : octal_char
+ | Backslash digit
+ ;
+
+// CAPTURING
+//
+//         (...)           capturing group
+//         (?<name>...)    named capturing group (Perl)
+//         (?'name'...)    named capturing group (Perl)
+//         (?P<name>...)   named capturing group (Python)
+//         (?:...)         non-capturing group
+//         (?|...)         non-capturing group; reset group numbers for
+//                          capturing groups in each alternative
+//
+// ATOMIC GROUPS
+//
+//         (?>...)         atomic, non-capturing group
+// Capturing groups. The three named forms, (?<name>...), (?'name'...) and
+// (?P<name>...), are listed before the plain, unnamed (...) group.
+capture
+ : '(' '?' '<' name '>' alternation ')'
+ | '(' '?''\'' name '\'' alternation ')'
+ | '(' '?' 'P' '<' name '>' alternation ')'
+ | '(' alternation ')'
+ ;
+
+// Groups that do not capture: (?:...), the group-number-reset form (?|...),
+// the atomic group (?>...), and a group that starts with option flags
+// followed by ':' (for example (?i:...)).
+non_capture
+ : '(' '?' ':' alternation ')'
+ | '(' '?' '|' alternation ')'
+ | '(' '?' '>' alternation ')'
+ | '(' '?' option_flags ':' alternation ')'
+ ;
+
+// COMMENT
+//
+//         (?#....)        comment (not nestable)
+// An inline comment: "(?#" followed by any tokens up to the first ')'.
+// The comment text is optional so that the empty comment "(?#)" parses too;
+// `non_close_parens` on its own requires at least one token.
+comment
+ : '(' '?' '#' non_close_parens? ')'
+ ;
+
+// OPTION SETTING
+//
+//         (?i)            caseless
+//         (?J)            allow duplicate names
+//         (?m)            multiline
+//         (?s)            single line (dotall)
+//         (?U)            default ungreedy (lazy)
+//         (?x)            extended (ignore white space)
+//         (?-...)         unset option(s)
+//
+//       The following are recognized only at the start of a  pattern  or  after
+//       one of the newline-setting options with similar syntax:
+//
+//         (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE)
+//         (*UTF8)         set UTF-8 mode: 8-bit library (PCRE_UTF8)
+//         (*UTF16)        set UTF-16 mode: 16-bit library (PCRE_UTF16)
+//         (*UCP)          set PCRE_UCP (use Unicode properties for \d etc)
+// Inline option settings. The first three alternatives cover "set flags,
+// then unset flags after '-'", "set flags only" and "unset flags only".
+// The remaining alternatives match the (*NO_START_OPT), (*UTF8), (*UTF16)
+// and (*UCP) verbs one character at a time, because every letter, digit
+// and '_' is a separate token in this grammar.
+option
+ : '(' '?' option_flags '-' option_flags ')'
+ | '(' '?' option_flags ')'
+ | '(' '?' '-' option_flags ')'
+ | '(' '*' 'N' 'O' '_' 'S' 'T' 'A' 'R' 'T' '_' 'O' 'P' 'T' ')'
+ | '(' '*' 'U' 'T' 'F' '8' ')'
+ | '(' '*' 'U' 'T' 'F' '1' '6' ')'
+ | '(' '*' 'U' 'C' 'P' ')'
+ ;
+
+// One or more option letters in a row, e.g. the "im" in (?im).
+option_flags
+ : option_flag+
+ ;
+
+// A single option letter; the meaning of each letter is listed in the
+// OPTION SETTING table above the `option` rule.
+option_flag
+ : 'i'
+ | 'J'
+ | 'm'
+ | 's'
+ | 'U'
+ | 'x'
+ ;
+
+// LOOKAHEAD AND LOOKBEHIND ASSERTIONS
+//
+//         (?=...)         positive look ahead
+//         (?!...)         negative look ahead
+//         (?<=...)        positive look behind
+//         (?<!...)        negative look behind
+//
+//       Each top-level branch of a look behind must be of a fixed length.
+// Zero-width assertions: look-ahead (?= / ?!) and look-behind (?<= / ?<!).
+// The body is an unrestricted `alternation`; the fixed-length requirement
+// for look-behind branches noted above is not enforced by this grammar.
+look_around
+ : '(' '?' '=' alternation ')'
+ | '(' '?' '!' alternation ')'
+ | '(' '?' '<' '=' alternation ')'
+ | '(' '?' '<' '!' alternation ')'
+ ;
+
+// SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
+//
+//         (?R)            recurse whole pattern
+//         (?n)            call subpattern by absolute number
+//         (?+n)           call subpattern by relative number
+//         (?-n)           call subpattern by relative number
+//         (?&name)        call subpattern by name (Perl)
+//         (?P>name)       call subpattern by name (Python)
+//         \g<name>        call subpattern by name (Oniguruma)
+//         \g'name'        call subpattern by name (Oniguruma)
+//         \g<n>           call subpattern by absolute number (Oniguruma)
+//         \g'n'           call subpattern by absolute number (Oniguruma)
+//         \g<+n>          call subpattern by relative number (PCRE extension)
+//         \g'+n'          call subpattern by relative number (PCRE extension)
+//         \g<-n>          call subpattern by relative number (PCRE extension)
+//         \g'-n'          call subpattern by relative number (PCRE extension)
+// Subroutine calls. The first six alternatives are the parenthesised
+// "(?...)" forms; the remaining eight are the \g forms, delimited either by
+// <...> or by '...', each taking a name, a number, or a signed number.
+subroutine_reference
+ : '(' '?' 'R' ')'
+ | '(' '?' number ')'
+ | '(' '?' '+' number ')'
+ | '(' '?' '-' number ')'
+ | '(' '?' '&' name ')'
+ | '(' '?' 'P' '>' name ')'
+ | '\\g' '<' name '>'
+ | '\\g' '\'' name '\''
+ | '\\g' '<' number '>'
+ | '\\g' '\'' number '\''
+ | '\\g' '<' '+' number '>'
+ | '\\g' '\'' '+' number '\''
+ | '\\g' '<' '-' number '>'
+ | '\\g' '\'' '-' number '\''
+ ;
+
+// CONDITIONAL PATTERNS
+//
+//         (?(condition)yes-pattern)
+//         (?(condition)yes-pattern|no-pattern)
+//
+//         (?(n)...        absolute reference condition
+//         (?(+n)...       relative reference condition
+//         (?(-n)...       relative reference condition
+//         (?(<name>)...   named reference condition (Perl)
+//         (?('name')...   named reference condition (Perl)
+//         (?(name)...     named reference condition (PCRE)
+//         (?(R)...        overall recursion condition
+//         (?(Rn)...       specific group recursion condition
+//         (?(R&name)...   specific recursion condition
+//         (?(DEFINE)...   define subpattern for reference
+//         (?(assert)...   assertion condition
+// A conditional group: "(?" condition yes-pattern, optionally followed by
+// '|' and a no-pattern, then ')'.
+//
+// The "(assert)" row in the table above stands for a look-around assertion
+// used as the condition, e.g. (?(?=x)a|b). That form is matched by the last
+// alternative, which reuses `look_around` (it brings its own parentheses).
+// The alternative that spells out the letters a-s-s-e-r-t is kept, and the
+// new alternative is appended at the end, so existing alternatives keep
+// their positions.
+//
+// Note: `alternation` itself accepts '|', so the grammar does not limit a
+// conditional to two branches.
+conditional
+ : '(' '?' '(' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '+' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '-' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '<' name '>' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '\'' name '\'' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'R' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'R' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'R' '&' name ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'D' 'E' 'F' 'I' 'N' 'E' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'a' 's' 's' 'e' 'r' 't' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' name ')' alternation ('|' alternation)? ')'
+ | '(' '?' look_around alternation ('|' alternation)? ')'
+ ;
+
+// BACKTRACKING CONTROL
+//
+//       The following act immediately they are reached:
+//
+//         (*ACCEPT)       force successful match
+//         (*FAIL)         force backtrack; synonym (*F)
+//         (*MARK:NAME)    set name to be passed back; synonym (*:NAME)
+//
+//       The  following  act only when a subsequent match failure causes a back-
+//       track to reach them. They all force a match failure, but they differ in
+//       what happens afterwards. Those that advance the start-of-match point do
+//       so only if the pattern is not anchored.
+//
+//         (*COMMIT)       overall failure, no advance of starting point
+//         (*PRUNE)        advance to next starting character
+//         (*PRUNE:NAME)   equivalent to (*MARK:NAME)(*PRUNE)
+//         (*SKIP)         advance to current matching position
+//         (*SKIP:NAME)    advance to position corresponding to an earlier
+//                         (*MARK:NAME); if not found, the (*SKIP) is ignored
+//         (*THEN)         local failure, backtrack to next alternation
+//         (*THEN:NAME)    equivalent to (*MARK:NAME)(*THEN)
+// Backtracking control verbs. Verb keywords are matched one letter at a
+// time because every letter is its own token.
+//
+// "NAME" in the table above is a placeholder for a user-chosen label, so
+// the label after ':' is parsed with the `name` rule instead of the four
+// literal letters N-A-M-E. `name` still accepts the text "NAME", so input
+// that parsed before continues to parse.
+backtrack_control
+ : '(' '*' 'A' 'C' 'C' 'E' 'P' 'T' ')'
+ | '(' '*' 'F' ('A' 'I' 'L')? ')'
+ | '(' '*' ('M' 'A' 'R' 'K')? ':' name ')'
+ | '(' '*' 'C' 'O' 'M' 'M' 'I' 'T' ')'
+ | '(' '*' 'P' 'R' 'U' 'N' 'E' ')'
+ | '(' '*' 'P' 'R' 'U' 'N' 'E' ':' name ')'
+ | '(' '*' 'S' 'K' 'I' 'P' ')'
+ | '(' '*' 'S' 'K' 'I' 'P' ':' name ')'
+ | '(' '*' 'T' 'H' 'E' 'N' ')'
+ | '(' '*' 'T' 'H' 'E' 'N' ':' name ')'
+ ;
+
+// NEWLINE CONVENTIONS
+//
+//       These are recognized only at the very start of the pattern or  after  a
+//       (*BSR_...), (*UTF8), (*UTF16) or (*UCP) option.
+//
+//         (*CR)           carriage return only
+//         (*LF)           linefeed only
+//         (*CRLF)         carriage return followed by linefeed
+//         (*ANYCRLF)      all three of the above
+//         (*ANY)          any Unicode newline sequence
+//
+// WHAT \R MATCHES
+//
+//       These  are  recognized only at the very start of the pattern or after a
+//       (*...) option that sets the newline convention or a UTF or UCP mode.
+//
+//         (*BSR_ANYCRLF)  CR, LF, or CRLF
+//         (*BSR_UNICODE)  any Unicode newline sequence
+// Newline-convention verbs and the two (*BSR_...) verbs, matched one
+// character at a time. The "only at the very start of the pattern"
+// restriction described above is not enforced here: the rule is reachable
+// from `atom`, so these verbs are accepted anywhere an atom may appear.
+newline_convention
+ : '(' '*' 'C' 'R' ')'
+ | '(' '*' 'L' 'F' ')'
+ | '(' '*' 'C' 'R' 'L' 'F' ')'
+ | '(' '*' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'
+ | '(' '*' 'A' 'N' 'Y' ')'
+ | '(' '*' 'B' 'S' 'R' '_' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'
+ | '(' '*' 'B' 'S' 'R' '_' 'U' 'N' 'I' 'C' 'O' 'D' 'E' ')'
+ ;
+
+// CALLOUTS
+//
+//         (?C)      callout
+//         (?Cn)     callout with data n
+// A callout point: "(?C" with an optional numeric argument, then ')'.
+callout
+ : '(' '?' 'C' number? ')'
+ ;
+
+// Anything that can stand as a single unit outside a character class:
+// groups and other parenthesised constructs, character classes, literals,
+// escapes shared with character classes (`shared_atom`), and the anchor /
+// assertion tokens listed at the end.
+// NOTE(review): when several alternatives can match the same input, ANTLR
+// prefers the one listed first, so the order below should not be changed
+// without checking for overlaps.
+atom
+ : subroutine_reference
+ | shared_atom
+ | literal
+ | character_class
+ | capture
+ | non_capture
+ | comment
+ | option
+ | look_around
+ | backreference
+ | conditional
+ | backtrack_control
+ | newline_convention
+ | callout
+ | Dot
+ | Caret
+ | StartOfSubject
+ | WordBoundary
+ | NonWordBoundary
+ | EndOfSubjectOrLine
+ | EndOfSubjectOrLineEndOfSubject
+ | EndOfSubject
+ | PreviousMatchInSubject
+ | ResetStartMatch
+ | OneDataUnit
+ | ExtendedUnicodeChar
+ ;
+
+// One member of a character class: a range "x-y", an escape shared with
+// the top level, a single literal, or a backslash-digit sequence.
+// The range alternative comes first; since Hyphen is also a valid
+// `cc_literal` (via `shared_literal`), "a-z" could otherwise be read as
+// three separate literals.
+cc_atom
+ : cc_literal Hyphen cc_literal
+ | shared_atom
+ | cc_literal
+ | backreference_or_octal // only octal is valid in a cc
+ ;
+
+// Escape tokens that are accepted both at the top level (via `atom`) and
+// inside a character class (via `cc_atom`). The final alternative is a
+// fallback: a lone Backslash token followed by any single token.
+shared_atom
+ : POSIXNamedSet
+ | POSIXNegatedNamedSet
+ | ControlChar
+ | DecimalDigit
+ | NotDecimalDigit
+ | HorizontalWhiteSpace
+ | NotHorizontalWhiteSpace
+ | NotNewLine
+ | CharWithProperty
+ | CharWithoutProperty
+ | NewLineSequence
+ | WhiteSpace
+ | NotWhiteSpace
+ | VerticalWhiteSpace
+ | NotVerticalWhiteSpace
+ | WordChar
+ | NotWordChar
+ | Backslash . // will match "unfinished" escape sequences, like `\x`
+ ;
+
+// A literal outside a character class: anything in `shared_literal`, plus
+// ']' (CharacterClassEnd), which is accepted as an ordinary character when
+// it appears outside a class.
+literal
+ : shared_literal
+ | CharacterClassEnd
+ ;
+
+// A literal inside a character class: anything in `shared_literal`, plus
+// the tokens that are operators or anchors at the top level but are
+// accepted as plain members here ('.', '[', '^', '?', '+', '*', \b, '$',
+// '|', '(' and ')').
+cc_literal
+ : shared_literal
+ | Dot
+ | CharacterClassStart
+ | Caret
+ | QuestionMark
+ | Plus
+ | Star
+ | WordBoundary
+ | EndOfSubjectOrLine
+ | Pipe
+ | OpenParen
+ | CloseParen
+ ;
+
+// Literals valid both inside and outside a character class: octal and hex
+// escapes, letters, digits, the single-character escapes (\a, \e, \f, \n,
+// \r, \t), quoted text, the punctuation tokens that have no special meaning
+// on their own, and OtherChar as the catch-all for any remaining character.
+shared_literal
+ : octal_char
+ | letter
+ | digit
+ | BellChar
+ | EscapeChar
+ | FormFeed
+ | NewLine
+ | CarriageReturn
+ | Tab
+ | HexChar
+ | Quoted
+ | BlockQuoted
+ | OpenBrace
+ | CloseBrace
+ | Comma
+ | Hyphen
+ | LessThan
+ | GreaterThan
+ | SingleQuote
+ | Underscore
+ | Colon
+ | Hash
+ | Equals
+ | Exclamation
+ | Ampersand
+ | OtherChar
+ ;
+
+// An unsigned decimal number (one or more digits); signs, where allowed,
+// are written explicitly in the rules that use `number`.
+number
+ : digits
+ ;
+
+// An octal escape: a backslash followed by three octal digits whose first
+// digit is 0-3, or a backslash followed by exactly two octal digits.
+octal_char
+ : Backslash (D0 | D1 | D2 | D3) octal_digit octal_digit
+ | Backslash octal_digit octal_digit
+ ;
+
+// A single digit token in the range 0-7.
+octal_digit
+ : D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7
+ ;
+ 
+// One or more decimal digit tokens in a row.
+digits
+ : digit+
+ ;
+
+// A single decimal digit token (0-9). Each digit is its own lexer token.
+digit
+ : D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9
+ ;
+
+// A group or label name; see `alpha_nums` for the accepted characters.
+name
+ : alpha_nums
+ ;
+
+// An identifier: starts with a letter or '_', followed by any number of
+// letters, '_' or digits. A leading digit is not allowed.
+alpha_nums
+ : (letter | Underscore) (letter | Underscore | digit)*
+ ;
+
+// One or more tokens, none of which is ')'. Referenced by `comment` for
+// the comment text.
+non_close_parens
+ : non_close_paren+
+ ;
+
+// Any single token except CloseParen.
+non_close_paren
+ : ~CloseParen
+ ;
+
+// Any single ASCII letter token: the 26 lower-case tokens (ALC..ZLC)
+// followed by the 26 upper-case tokens (AUC..ZUC).
+letter
+ : ALC | BLC | CLC | DLC | ELC | FLC | GLC | HLC | ILC | JLC | KLC | LLC | MLC | NLC | OLC | PLC | QLC | RLC | SLC | TLC | ULC | VLC | WLC | XLC | YLC | ZLC |
+   AUC | BUC | CUC | DUC | EUC | FUC | GUC | HUC | IUC | JUC | KUC | LUC | MUC | NUC | OUC | PUC | QUC | RUC | SUC | TUC | UUC | VUC | WUC | XUC | YUC | ZUC
+ ;
+
+// QUOTING
+//
+//         \x         where x is non-alphanumeric is a literal x
+//         \Q...\E    treat enclosed characters as literal
+// Quoted      : a backslash followed by one character that is not an ASCII
+//               letter or digit.
+// BlockQuoted : \Q ... \E; the non-greedy `.*?` stops at the first \E.
+Quoted      : '\\' NonAlphaNumeric;
+BlockQuoted : '\\Q' .*? '\\E';
+
+// CHARACTERS
+//
+//         \a         alarm, that is, the BEL character (hex 07)
+//         \cx        "control-x", where x is any ASCII character
+//         \e         escape (hex 1B)
+//         \f         form feed (hex 0C)
+//         \n         newline (hex 0A)
+//         \r         carriage return (hex 0D)
+//         \t         tab (hex 09)
+//         \ddd       character with octal code ddd, or backreference
+//         \xhh       character with hex code hh
+//         \x{hhh..}  character with hex code hhh..
+// Single-character escapes. ControlChar's operand is optional, so a bare
+// "\c" still lexes as ControlChar. Backslash on its own is produced only
+// when no longer escape token matches at that position (the ANTLR lexer
+// prefers the longest match).
+BellChar       : '\\a';
+ControlChar    : '\\c' ASCII?;
+EscapeChar     : '\\e';
+FormFeed       : '\\f';
+NewLine        : '\\n';
+CarriageReturn : '\\r';
+Tab            : '\\t';
+Backslash      : '\\';
+// \xhh with exactly two hex digits, or \x{h...} with one or more hex digits
+// between the braces. Requiring at least three digits in the braced form
+// would make "\x{41}" fall apart into "\x" followed by "{41}", which the
+// parser then reads as a quantifier.
+HexChar        : '\\x' ( HexDigit HexDigit
+                       | '{' HexDigit+ '}'
+                       )
+               ;
+
+// CHARACTER TYPES
+//
+//         .          any character except newline;
+//                      in dotall mode, any character whatsoever
+//         \C         one data unit, even in UTF mode (best avoided)
+//         \d         a decimal digit
+//         \D         a character that is not a decimal digit
+//         \h         a horizontal white space character
+//         \H         a character that is not a horizontal white space character
+//         \N         a character that is not a newline
+//         \p{xx}     a character with the xx property
+//         \P{xx}     a character without the xx property
+//         \R         a newline sequence
+//         \s         a white space character
+//         \S         a character that is not a white space character
+//         \v         a vertical white space character
+//         \V         a character that is not a vertical white space character
+//         \w         a "word" character
+//         \W         a "non-word" character
+//         \X         an extended Unicode sequence
+//
+//       In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
+//       characters, even in a UTF mode. However, this can be changed by setting
+//       the PCRE_UCP option.
+// One token per character-type escape from the table above. The property
+// escapes \p{..} and \P{..} include the braces and the property name (made
+// of letters, digits and '_') in a single token.
+Dot                     : '.';
+OneDataUnit             : '\\C';
+DecimalDigit            : '\\d';
+NotDecimalDigit         : '\\D';
+HorizontalWhiteSpace    : '\\h';
+NotHorizontalWhiteSpace : '\\H';
+NotNewLine              : '\\N';
+CharWithProperty        : '\\p{' UnderscoreAlphaNumerics '}';
+CharWithoutProperty     : '\\P{' UnderscoreAlphaNumerics '}';
+NewLineSequence         : '\\R';
+WhiteSpace              : '\\s';
+NotWhiteSpace           : '\\S';
+VerticalWhiteSpace      : '\\v';
+NotVerticalWhiteSpace   : '\\V';
+WordChar                : '\\w';
+NotWordChar             : '\\W';
+ExtendedUnicodeChar     : '\\X';
+
+// CHARACTER CLASSES
+//
+//         [...]       positive character class
+//         [^...]      negative character class
+//         [x-y]       range (can be used for hex characters)
+//         [[:xxx:]]   positive POSIX named set
+//         [[:^xxx:]]  negative POSIX named set
+//
+//         alnum       alphanumeric
+//         alpha       alphabetic
+//         ascii       0-127
+//         blank       space or tab
+//         cntrl       control character
+//         digit       decimal digit
+//         graph       printing, excluding space
+//         lower       lower case letter
+//         print       printing, including space
+//         punct       printing, excluding alphanumeric
+//         space       white space
+//         upper       upper case letter
+//         word        same as \w
+//         xdigit      hexadecimal digit
+//
+//       In PCRE, POSIX character set names recognize only ASCII  characters  by
+//       default,  but  some  of them use Unicode properties if PCRE_UCP is set.
+//       You can use \Q...\E inside a character class.
+// Character-class punctuation. The two POSIX tokens match the complete
+// text including both pairs of brackets, e.g. "[[:alpha:]]" or
+// "[[:^alpha:]]", as one token; the set name is any run of ASCII letters
+// and digits and is not validated against the list above.
+CharacterClassStart  : '[';
+CharacterClassEnd    : ']';
+Caret                : '^';
+Hyphen               : '-';
+POSIXNamedSet        : '[[:' AlphaNumerics ':]]';
+POSIXNegatedNamedSet : '[[:^' AlphaNumerics ':]]';
+
+// Punctuation used by quantifiers; the literals '?', '+', '*', '{', '}'
+// and ',' in the parser rules refer to these tokens.
+QuestionMark : '?';
+Plus         : '+';
+Star         : '*';
+OpenBrace    : '{';
+CloseBrace   : '}';
+Comma        : ',';
+
+// ANCHORS AND SIMPLE ASSERTIONS
+//
+//         \b          word boundary
+//         \B          not a word boundary
+//         ^           start of subject
+//                      also after internal newline in multiline mode
+//         \A          start of subject
+//         $           end of subject
+//                      also before newline at end of subject
+//                      also before internal newline in multiline mode
+//         \Z          end of subject
+//                      also before newline at end of subject
+//         \z          end of subject
+//         \G          first matching position in subject
+// One token per anchor / simple assertion from the table above
+// ('^' is defined as Caret with the character-class tokens).
+WordBoundary                   : '\\b';
+NonWordBoundary                : '\\B';
+StartOfSubject                 : '\\A'; 
+EndOfSubjectOrLine             : '$';
+EndOfSubjectOrLineEndOfSubject : '\\Z'; 
+EndOfSubject                   : '\\z'; 
+PreviousMatchInSubject         : '\\G';
+
+// MATCH POINT RESET
+//
+//         \K          reset start of match
+ResetStartMatch : '\\K';
+
+// Tokens for the "\g" and "\k" prefixes; the '\\g' and '\\k' literals in
+// `backreference` and `subroutine_reference` refer to these tokens.
+SubroutineOrNamedReferenceStartG : '\\g';
+NamedReferenceStartK             : '\\k';
+
+// Single-character punctuation tokens. Parser rules mostly refer to them
+// through the equivalent quoted literal (e.g. '(' for OpenParen).
+Pipe        : '|';
+OpenParen   : '(';
+CloseParen  : ')';
+LessThan    : '<';
+GreaterThan : '>';
+SingleQuote : '\'';
+Underscore  : '_';
+Colon       : ':';
+Hash        : '#';
+Equals      : '=';
+Exclamation : '!';
+Ampersand   : '&';
+
+// One token per ASCII letter: xLC is the lower-case letter and xUC the
+// upper-case one. Keywords such as DEFINE or UTF8 are spelled out in the
+// parser rules as sequences of these single-letter tokens.
+ALC : 'a';
+BLC : 'b';
+CLC : 'c';
+DLC : 'd';
+ELC : 'e';
+FLC : 'f';
+GLC : 'g';
+HLC : 'h';
+ILC : 'i';
+JLC : 'j';
+KLC : 'k';
+LLC : 'l';
+MLC : 'm';
+NLC : 'n';
+OLC : 'o';
+PLC : 'p';
+QLC : 'q';
+RLC : 'r';
+SLC : 's';
+TLC : 't';
+ULC : 'u';
+VLC : 'v';
+WLC : 'w';
+XLC : 'x';
+YLC : 'y';
+ZLC : 'z';
+
+AUC : 'A';
+BUC : 'B';
+CUC : 'C';
+DUC : 'D';
+EUC : 'E';
+FUC : 'F';
+GUC : 'G';
+HUC : 'H';
+IUC : 'I';
+JUC : 'J';
+KUC : 'K';
+LUC : 'L';
+MUC : 'M';
+NUC : 'N';
+OUC : 'O';
+PUC : 'P';
+QUC : 'Q';
+RUC : 'R';
+SUC : 'S';
+TUC : 'T';
+UUC : 'U';
+VUC : 'V';
+WUC : 'W';
+XUC : 'X';
+YUC : 'Y';
+ZUC : 'Z';
+
+// One token per decimal digit. D0 is declared last; the digit tokens match
+// distinct characters, so their relative order has no effect on lexing.
+D1 : '1';
+D2 : '2';
+D3 : '3';
+D4 : '4';
+D5 : '5';
+D6 : '6';
+D7 : '7';
+D8 : '8';
+D9 : '9';
+D0 : '0';
+
+// Catch-all for any single character not matched by a rule above. It must
+// stay after every other single-character token: when two lexer rules
+// match the same length, ANTLR picks the one declared first.
+OtherChar : . ;
+
+// fragments
+// Helper fragments: they never produce tokens themselves and exist only to
+// be referenced from the lexer rules above. All character sets are
+// ASCII-only.
+fragment UnderscoreAlphaNumerics : ('_' | AlphaNumeric)+;
+fragment AlphaNumerics           : AlphaNumeric+;
+fragment AlphaNumeric            : [a-zA-Z0-9];
+fragment NonAlphaNumeric         : ~[a-zA-Z0-9];
+fragment HexDigit                : [0-9a-fA-F];
+fragment ASCII                   : [\u0000-\u007F];