Added PCRE.g4
This commit is contained in:
parent
aecb860397
commit
2c5ddac650
|
|
@ -0,0 +1,754 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2014-2022 by Bart Kiers
|
||||||
|
*
|
||||||
|
* The MIT license.
|
||||||
|
*
|
||||||
|
* Permission is hereby granted, free of charge, to any person
|
||||||
|
* obtaining a copy of this software and associated documentation
|
||||||
|
* files (the "Software"), to deal in the Software without
|
||||||
|
* restriction, including without limitation the rights to use,
|
||||||
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the
|
||||||
|
* Software is furnished to do so, subject to the following
|
||||||
|
* conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be
|
||||||
|
* included in all copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||||
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||||
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||||
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||||
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||||
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
* OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
*
|
||||||
|
* Project : PCRE Parser, an ANTLR 4 grammar for PCRE
|
||||||
|
* Developed by : Bart Kiers, bart@big-o.nl
|
||||||
|
* Also see : https://github.com/bkiers/pcre-parser
|
||||||
|
*/
|
||||||
|
grammar PCRE;
|
||||||
|
|
||||||
|
// Most single line comments above the lexer- and parser rules
|
||||||
|
// are copied from the official PCRE man pages (last updated:
|
||||||
|
// 10 January 2012): http://www.pcre.org/pcre.txt
|
||||||
|
// Entry rule: a complete pattern is one alternation followed by end of input,
// so trailing tokens that do not belong to the pattern are a syntax error.
parse
|
||||||
|
: alternation EOF
|
||||||
|
;
|
||||||
|
|
||||||
|
// ALTERNATION
|
||||||
|
//
|
||||||
|
// expr|expr|expr...
|
||||||
|
alternation
|
||||||
|
: expr ('|' expr)*
|
||||||
|
;
|
||||||
|
|
||||||
|
// One branch of an alternation: zero or more elements in sequence.
// Because the repetition is `*`, a branch may be empty (e.g. the pattern
// `a|` or an entirely empty pattern is accepted).
expr
|
||||||
|
: element*
|
||||||
|
;
|
||||||
|
|
||||||
|
// A single atom, optionally followed by one quantifier.
element
|
||||||
|
: atom quantifier?
|
||||||
|
;
|
||||||
|
|
||||||
|
// QUANTIFIERS
|
||||||
|
//
|
||||||
|
// ? 0 or 1, greedy
|
||||||
|
// ?+ 0 or 1, possessive
|
||||||
|
// ?? 0 or 1, lazy
|
||||||
|
// * 0 or more, greedy
|
||||||
|
// *+ 0 or more, possessive
|
||||||
|
// *? 0 or more, lazy
|
||||||
|
// + 1 or more, greedy
|
||||||
|
// ++ 1 or more, possessive
|
||||||
|
// +? 1 or more, lazy
|
||||||
|
// {n} exactly n
|
||||||
|
// {n,m} at least n, no more than m, greedy
|
||||||
|
// {n,m}+ at least n, no more than m, possessive
|
||||||
|
// {n,m}? at least n, no more than m, lazy
|
||||||
|
// {n,} n or more, greedy
|
||||||
|
// {n,}+ n or more, possessive
|
||||||
|
// {n,}? n or more, lazy
|
||||||
|
quantifier
|
||||||
|
: '?' quantifier_type
|
||||||
|
| '+' quantifier_type
|
||||||
|
| '*' quantifier_type
|
||||||
|
| '{' number '}' quantifier_type
|
||||||
|
| '{' number ',' '}' quantifier_type
|
||||||
|
| '{' number ',' number '}' quantifier_type
|
||||||
|
;
|
||||||
|
|
||||||
|
quantifier_type
|
||||||
|
: '+'
|
||||||
|
| '?'
|
||||||
|
| /* nothing */
|
||||||
|
;
|
||||||
|
|
||||||
|
// CHARACTER CLASSES
|
||||||
|
//
|
||||||
|
// [...] positive character class
|
||||||
|
// [^...] negative character class
|
||||||
|
// [x-y] range (can be used for hex characters)
|
||||||
|
// [[:xxx:]] positive POSIX named set
|
||||||
|
// [[:^xxx:]] negative POSIX named set
|
||||||
|
//
|
||||||
|
// alnum alphanumeric
|
||||||
|
// alpha alphabetic
|
||||||
|
// ascii 0-127
|
||||||
|
// blank space or tab
|
||||||
|
// cntrl control character
|
||||||
|
// digit decimal digit
|
||||||
|
// graph printing, excluding space
|
||||||
|
// lower lower case letter
|
||||||
|
// print printing, including space
|
||||||
|
// punct printing, excluding alphanumeric
|
||||||
|
// space white space
|
||||||
|
// upper upper case letter
|
||||||
|
// word same as \w
|
||||||
|
// xdigit hexadecimal digit
|
||||||
|
//
|
||||||
|
// In PCRE, POSIX character set names recognize only ASCII characters by
|
||||||
|
// default, but some of them use Unicode properties if PCRE_UCP is set.
|
||||||
|
// You can use \Q...\E inside a character class.
|
||||||
|
character_class
|
||||||
|
: '[' '^' CharacterClassEnd Hyphen cc_atom+ ']'
|
||||||
|
| '[' '^' CharacterClassEnd cc_atom* ']'
|
||||||
|
| '[' '^' cc_atom+ ']'
|
||||||
|
| '[' CharacterClassEnd Hyphen cc_atom+ ']'
|
||||||
|
| '[' CharacterClassEnd cc_atom* ']'
|
||||||
|
| '[' cc_atom+ ']'
|
||||||
|
;
|
||||||
|
|
||||||
|
// BACKREFERENCES
|
||||||
|
//
|
||||||
|
// \n reference by number (can be ambiguous)
|
||||||
|
// \gn reference by number
|
||||||
|
// \g{n} reference by number
|
||||||
|
// \g{-n} relative reference by number
|
||||||
|
// \k<name> reference by name (Perl)
|
||||||
|
// \k'name' reference by name (Perl)
|
||||||
|
// \g{name} reference by name (Perl)
|
||||||
|
// \k{name} reference by name (.NET)
|
||||||
|
// (?P=name) reference by name (Python)
|
||||||
|
backreference
|
||||||
|
: backreference_or_octal
|
||||||
|
| '\\g' number
|
||||||
|
| '\\g' '{' number '}'
|
||||||
|
| '\\g' '{' '-' number '}'
|
||||||
|
| '\\k' '<' name '>'
|
||||||
|
| '\\k' '\'' name '\''
|
||||||
|
| '\\g' '{' name '}'
|
||||||
|
| '\\k' '{' name '}'
|
||||||
|
| '(' '?' 'P' '=' name ')'
|
||||||
|
;
|
||||||
|
|
||||||
|
backreference_or_octal
|
||||||
|
: octal_char
|
||||||
|
| Backslash digit
|
||||||
|
;
|
||||||
|
|
||||||
|
// CAPTURING
|
||||||
|
//
|
||||||
|
// (...) capturing group
|
||||||
|
// (?<name>...) named capturing group (Perl)
|
||||||
|
// (?'name'...) named capturing group (Perl)
|
||||||
|
// (?P<name>...) named capturing group (Python)
|
||||||
|
// (?:...) non-capturing group
|
||||||
|
// (?|...) non-capturing group; reset group numbers for
|
||||||
|
// capturing groups in each alternative
|
||||||
|
//
|
||||||
|
// ATOMIC GROUPS
|
||||||
|
//
|
||||||
|
// (?>...) atomic, non-capturing group
|
||||||
|
capture
|
||||||
|
: '(' '?' '<' name '>' alternation ')'
|
||||||
|
| '(' '?''\'' name '\'' alternation ')'
|
||||||
|
| '(' '?' 'P' '<' name '>' alternation ')'
|
||||||
|
| '(' alternation ')'
|
||||||
|
;
|
||||||
|
|
||||||
|
// Non-capturing groups:
//   (?:...)        plain non-capturing group
//   (?|...)        non-capturing group that resets group numbers per branch
//   (?>...)        atomic group
//   (?flags:...)   non-capturing group with options set
//   (?f-f:...)     non-capturing group with options set and unset
//   (?-flags:...)  non-capturing group with options unset
// The last three mirror the set / set-and-unset / unset forms accepted by
// the `option` rule for the bare "(?flags)" syntax.
non_capture
 : '(' '?' ':' alternation ')'
 | '(' '?' '|' alternation ')'
 | '(' '?' '>' alternation ')'
 | '(' '?' option_flags ':' alternation ')'
 | '(' '?' option_flags '-' option_flags ':' alternation ')'
 | '(' '?' '-' option_flags ':' alternation ')'
 ;
|
||||||
|
|
||||||
|
// COMMENT
|
||||||
|
//
|
||||||
|
// (?#....) comment (not nestable)
|
||||||
|
// An inline comment: "(?#" up to the next CloseParen token.
// The comment text is optional so that the empty comment "(?#)" parses.
// Note: the lexer turns "\)" into a single Quoted token, so an escaped
// parenthesis inside the comment text does not terminate this rule.
comment
 : '(' '?' '#' non_close_parens? ')'
 ;
|
||||||
|
|
||||||
|
// OPTION SETTING
|
||||||
|
//
|
||||||
|
// (?i) caseless
|
||||||
|
// (?J) allow duplicate names
|
||||||
|
// (?m) multiline
|
||||||
|
// (?s) single line (dotall)
|
||||||
|
// (?U) default ungreedy (lazy)
|
||||||
|
// (?x) extended (ignore white space)
|
||||||
|
// (?-...) unset option(s)
|
||||||
|
//
|
||||||
|
// The following are recognized only at the start of a pattern or after
|
||||||
|
// one of the newline-setting options with similar syntax:
|
||||||
|
//
|
||||||
|
// (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE)
|
||||||
|
// (*UTF8) set UTF-8 mode: 8-bit library (PCRE_UTF8)
|
||||||
|
// (*UTF16) set UTF-16 mode: 16-bit library (PCRE_UTF16)
|
||||||
|
// (*UCP) set PCRE_UCP (use Unicode properties for \d etc)
|
||||||
|
option
|
||||||
|
: '(' '?' option_flags '-' option_flags ')'
|
||||||
|
| '(' '?' option_flags ')'
|
||||||
|
| '(' '?' '-' option_flags ')'
|
||||||
|
| '(' '*' 'N' 'O' '_' 'S' 'T' 'A' 'R' 'T' '_' 'O' 'P' 'T' ')'
|
||||||
|
| '(' '*' 'U' 'T' 'F' '8' ')'
|
||||||
|
| '(' '*' 'U' 'T' 'F' '1' '6' ')'
|
||||||
|
| '(' '*' 'U' 'C' 'P' ')'
|
||||||
|
;
|
||||||
|
|
||||||
|
option_flags
|
||||||
|
: option_flag+
|
||||||
|
;
|
||||||
|
|
||||||
|
option_flag
|
||||||
|
: 'i'
|
||||||
|
| 'J'
|
||||||
|
| 'm'
|
||||||
|
| 's'
|
||||||
|
| 'U'
|
||||||
|
| 'x'
|
||||||
|
;
|
||||||
|
|
||||||
|
// LOOKAHEAD AND LOOKBEHIND ASSERTIONS
|
||||||
|
//
|
||||||
|
// (?=...) positive look ahead
|
||||||
|
// (?!...) negative look ahead
|
||||||
|
// (?<=...) positive look behind
|
||||||
|
// (?<!...) negative look behind
|
||||||
|
//
|
||||||
|
// Each top-level branch of a look behind must be of a fixed length.
|
||||||
|
look_around
|
||||||
|
: '(' '?' '=' alternation ')'
|
||||||
|
| '(' '?' '!' alternation ')'
|
||||||
|
| '(' '?' '<' '=' alternation ')'
|
||||||
|
| '(' '?' '<' '!' alternation ')'
|
||||||
|
;
|
||||||
|
|
||||||
|
// SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
|
||||||
|
//
|
||||||
|
// (?R) recurse whole pattern
|
||||||
|
// (?n) call subpattern by absolute number
|
||||||
|
// (?+n) call subpattern by relative number
|
||||||
|
// (?-n) call subpattern by relative number
|
||||||
|
// (?&name) call subpattern by name (Perl)
|
||||||
|
// (?P>name) call subpattern by name (Python)
|
||||||
|
// \g<name> call subpattern by name (Oniguruma)
|
||||||
|
// \g'name' call subpattern by name (Oniguruma)
|
||||||
|
// \g<n> call subpattern by absolute number (Oniguruma)
|
||||||
|
// \g'n' call subpattern by absolute number (Oniguruma)
|
||||||
|
// \g<+n> call subpattern by relative number (PCRE extension)
|
||||||
|
// \g'+n' call subpattern by relative number (PCRE extension)
|
||||||
|
// \g<-n> call subpattern by relative number (PCRE extension)
|
||||||
|
// \g'-n' call subpattern by relative number (PCRE extension)
|
||||||
|
subroutine_reference
|
||||||
|
: '(' '?' 'R' ')'
|
||||||
|
| '(' '?' number ')'
|
||||||
|
| '(' '?' '+' number ')'
|
||||||
|
| '(' '?' '-' number ')'
|
||||||
|
| '(' '?' '&' name ')'
|
||||||
|
| '(' '?' 'P' '>' name ')'
|
||||||
|
| '\\g' '<' name '>'
|
||||||
|
| '\\g' '\'' name '\''
|
||||||
|
| '\\g' '<' number '>'
|
||||||
|
| '\\g' '\'' number '\''
|
||||||
|
| '\\g' '<' '+' number '>'
|
||||||
|
| '\\g' '\'' '+' number '\''
|
||||||
|
| '\\g' '<' '-' number '>'
|
||||||
|
| '\\g' '\'' '-' number '\''
|
||||||
|
;
|
||||||
|
|
||||||
|
// CONDITIONAL PATTERNS
|
||||||
|
//
|
||||||
|
// (?(condition)yes-pattern)
|
||||||
|
// (?(condition)yes-pattern|no-pattern)
|
||||||
|
//
|
||||||
|
// (?(n)... absolute reference condition
|
||||||
|
// (?(+n)... relative reference condition
|
||||||
|
// (?(-n)... relative reference condition
|
||||||
|
// (?(<name>)... named reference condition (Perl)
|
||||||
|
// (?('name')... named reference condition (Perl)
|
||||||
|
// (?(name)... named reference condition (PCRE)
|
||||||
|
// (?(R)... overall recursion condition
|
||||||
|
// (?(Rn)... specific group recursion condition
|
||||||
|
// (?(R&name)... specific recursion condition
|
||||||
|
// (?(DEFINE)... define subpattern for reference
|
||||||
|
// (?(assert)... assertion condition
|
||||||
|
// A conditional group: "(?" condition yes-pattern ("|" no-pattern)? ")".
// The condition is either a parenthesised reference (absolute or relative
// group number, group name, recursion check, DEFINE) or a look-around
// assertion written directly after "(?", e.g. (?(?=\d)a|b).
conditional
 : '(' '?' '(' number ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' '+' number ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' '-' number ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' '<' name '>' ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' '\'' name '\'' ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' 'R' number ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' 'R' ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' 'R' '&' name ')' alternation ('|' alternation)? ')'
 | '(' '?' '(' 'D' 'E' 'F' 'I' 'N' 'E' ')' alternation ('|' alternation)? ')'
 // "(?(assert)..." in the man page is a placeholder for a look-around
 // assertion, not the literal word "assert". A group that really is named
 // "assert" is still accepted by the `name` alternative below.
 | '(' '?' look_around alternation ('|' alternation)? ')'
 | '(' '?' '(' name ')' alternation ('|' alternation)? ')'
 ;
|
||||||
|
|
||||||
|
// BACKTRACKING CONTROL
|
||||||
|
//
|
||||||
|
// The following act immediately they are reached:
|
||||||
|
//
|
||||||
|
// (*ACCEPT) force successful match
|
||||||
|
// (*FAIL) force backtrack; synonym (*F)
|
||||||
|
// (*MARK:NAME) set name to be passed back; synonym (*:NAME)
|
||||||
|
//
|
||||||
|
// The following act only when a subsequent match failure causes a back-
|
||||||
|
// track to reach them. They all force a match failure, but they differ in
|
||||||
|
// what happens afterwards. Those that advance the start-of-match point do
|
||||||
|
// so only if the pattern is not anchored.
|
||||||
|
//
|
||||||
|
// (*COMMIT) overall failure, no advance of starting point
|
||||||
|
// (*PRUNE) advance to next starting character
|
||||||
|
// (*PRUNE:NAME) equivalent to (*MARK:NAME)(*PRUNE)
|
||||||
|
// (*SKIP) advance to current matching position
|
||||||
|
// (*SKIP:NAME) advance to position corresponding to an earlier
|
||||||
|
// (*MARK:NAME); if not found, the (*SKIP) is ignored
|
||||||
|
// (*THEN) local failure, backtrack to next alternation
|
||||||
|
// (*THEN:NAME) equivalent to (*MARK:NAME)(*THEN)
|
||||||
|
// Backtracking control verbs. "NAME" in the man page is a placeholder for a
// user-chosen mark name, so the named forms accept any run of tokens up to
// the closing parenthesis (the literal text "NAME" is still accepted).
backtrack_control
 : '(' '*' 'A' 'C' 'C' 'E' 'P' 'T' ')'
 | '(' '*' 'F' ('A' 'I' 'L')? ')'
 | '(' '*' ('M' 'A' 'R' 'K')? ':' non_close_parens ')'
 | '(' '*' 'C' 'O' 'M' 'M' 'I' 'T' ')'
 | '(' '*' 'P' 'R' 'U' 'N' 'E' ')'
 | '(' '*' 'P' 'R' 'U' 'N' 'E' ':' non_close_parens ')'
 | '(' '*' 'S' 'K' 'I' 'P' ')'
 | '(' '*' 'S' 'K' 'I' 'P' ':' non_close_parens ')'
 | '(' '*' 'T' 'H' 'E' 'N' ')'
 | '(' '*' 'T' 'H' 'E' 'N' ':' non_close_parens ')'
 ;
|
||||||
|
|
||||||
|
// NEWLINE CONVENTIONS
|
||||||
|
//
|
||||||
|
// These are recognized only at the very start of the pattern or after a
|
||||||
|
// (*BSR_...), (*UTF8), (*UTF16) or (*UCP) option.
|
||||||
|
//
|
||||||
|
// (*CR) carriage return only
|
||||||
|
// (*LF) linefeed only
|
||||||
|
// (*CRLF) carriage return followed by linefeed
|
||||||
|
// (*ANYCRLF) all three of the above
|
||||||
|
// (*ANY) any Unicode newline sequence
|
||||||
|
//
|
||||||
|
// WHAT \R MATCHES
|
||||||
|
//
|
||||||
|
// These are recognized only at the very start of the pattern or after a
|
||||||
|
// (*...) option that sets the newline convention or a UTF or UCP mode.
|
||||||
|
//
|
||||||
|
// (*BSR_ANYCRLF) CR, LF, or CRLF
|
||||||
|
// (*BSR_UNICODE) any Unicode newline sequence
|
||||||
|
newline_convention
|
||||||
|
: '(' '*' 'C' 'R' ')'
|
||||||
|
| '(' '*' 'L' 'F' ')'
|
||||||
|
| '(' '*' 'C' 'R' 'L' 'F' ')'
|
||||||
|
| '(' '*' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'
|
||||||
|
| '(' '*' 'A' 'N' 'Y' ')'
|
||||||
|
| '(' '*' 'B' 'S' 'R' '_' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'
|
||||||
|
| '(' '*' 'B' 'S' 'R' '_' 'U' 'N' 'I' 'C' 'O' 'D' 'E' ')'
|
||||||
|
;
|
||||||
|
|
||||||
|
// CALLOUTS
|
||||||
|
//
|
||||||
|
// (?C) callout
|
||||||
|
// (?Cn) callout with data n
|
||||||
|
callout
|
||||||
|
: '(' '?' 'C' ')'
|
||||||
|
| '(' '?' 'C' number ')'
|
||||||
|
;
|
||||||
|
|
||||||
|
atom
|
||||||
|
: subroutine_reference
|
||||||
|
| shared_atom
|
||||||
|
| literal
|
||||||
|
| character_class
|
||||||
|
| capture
|
||||||
|
| non_capture
|
||||||
|
| comment
|
||||||
|
| option
|
||||||
|
| look_around
|
||||||
|
| backreference
|
||||||
|
| conditional
|
||||||
|
| backtrack_control
|
||||||
|
| newline_convention
|
||||||
|
| callout
|
||||||
|
| Dot
|
||||||
|
| Caret
|
||||||
|
| StartOfSubject
|
||||||
|
| WordBoundary
|
||||||
|
| NonWordBoundary
|
||||||
|
| EndOfSubjectOrLine
|
||||||
|
| EndOfSubjectOrLineEndOfSubject
|
||||||
|
| EndOfSubject
|
||||||
|
| PreviousMatchInSubject
|
||||||
|
| ResetStartMatch
|
||||||
|
| OneDataUnit
|
||||||
|
| ExtendedUnicodeChar
|
||||||
|
;
|
||||||
|
|
||||||
|
cc_atom
|
||||||
|
: cc_literal Hyphen cc_literal
|
||||||
|
| shared_atom
|
||||||
|
| cc_literal
|
||||||
|
| backreference_or_octal // only octal is valid in a cc
|
||||||
|
;
|
||||||
|
|
||||||
|
shared_atom
|
||||||
|
: POSIXNamedSet
|
||||||
|
| POSIXNegatedNamedSet
|
||||||
|
| ControlChar
|
||||||
|
| DecimalDigit
|
||||||
|
| NotDecimalDigit
|
||||||
|
| HorizontalWhiteSpace
|
||||||
|
| NotHorizontalWhiteSpace
|
||||||
|
| NotNewLine
|
||||||
|
| CharWithProperty
|
||||||
|
| CharWithoutProperty
|
||||||
|
| NewLineSequence
|
||||||
|
| WhiteSpace
|
||||||
|
| NotWhiteSpace
|
||||||
|
| VerticalWhiteSpace
|
||||||
|
| NotVerticalWhiteSpace
|
||||||
|
| WordChar
|
||||||
|
| NotWordChar
|
||||||
|
| Backslash . // will match "unfinished" escape sequences, like `\x`
|
||||||
|
;
|
||||||
|
|
||||||
|
literal
|
||||||
|
: shared_literal
|
||||||
|
| CharacterClassEnd
|
||||||
|
;
|
||||||
|
|
||||||
|
cc_literal
|
||||||
|
: shared_literal
|
||||||
|
| Dot
|
||||||
|
| CharacterClassStart
|
||||||
|
| Caret
|
||||||
|
| QuestionMark
|
||||||
|
| Plus
|
||||||
|
| Star
|
||||||
|
| WordBoundary
|
||||||
|
| EndOfSubjectOrLine
|
||||||
|
| Pipe
|
||||||
|
| OpenParen
|
||||||
|
| CloseParen
|
||||||
|
;
|
||||||
|
|
||||||
|
shared_literal
|
||||||
|
: octal_char
|
||||||
|
| letter
|
||||||
|
| digit
|
||||||
|
| BellChar
|
||||||
|
| EscapeChar
|
||||||
|
| FormFeed
|
||||||
|
| NewLine
|
||||||
|
| CarriageReturn
|
||||||
|
| Tab
|
||||||
|
| HexChar
|
||||||
|
| Quoted
|
||||||
|
| BlockQuoted
|
||||||
|
| OpenBrace
|
||||||
|
| CloseBrace
|
||||||
|
| Comma
|
||||||
|
| Hyphen
|
||||||
|
| LessThan
|
||||||
|
| GreaterThan
|
||||||
|
| SingleQuote
|
||||||
|
| Underscore
|
||||||
|
| Colon
|
||||||
|
| Hash
|
||||||
|
| Equals
|
||||||
|
| Exclamation
|
||||||
|
| Ampersand
|
||||||
|
| OtherChar
|
||||||
|
;
|
||||||
|
|
||||||
|
number
|
||||||
|
: digits
|
||||||
|
;
|
||||||
|
|
||||||
|
octal_char
|
||||||
|
: ( Backslash (D0 | D1 | D2 | D3) octal_digit octal_digit
|
||||||
|
| Backslash octal_digit octal_digit
|
||||||
|
)
|
||||||
|
|
||||||
|
;
|
||||||
|
|
||||||
|
octal_digit
|
||||||
|
: D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7
|
||||||
|
;
|
||||||
|
|
||||||
|
digits
|
||||||
|
: digit+
|
||||||
|
;
|
||||||
|
|
||||||
|
digit
|
||||||
|
: D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9
|
||||||
|
;
|
||||||
|
|
||||||
|
name
|
||||||
|
: alpha_nums
|
||||||
|
;
|
||||||
|
|
||||||
|
alpha_nums
|
||||||
|
: (letter | Underscore) (letter | Underscore | digit)*
|
||||||
|
;
|
||||||
|
|
||||||
|
non_close_parens
|
||||||
|
: non_close_paren+
|
||||||
|
;
|
||||||
|
|
||||||
|
non_close_paren
|
||||||
|
: ~CloseParen
|
||||||
|
;
|
||||||
|
|
||||||
|
letter
|
||||||
|
: ALC | BLC | CLC | DLC | ELC | FLC | GLC | HLC | ILC | JLC | KLC | LLC | MLC | NLC | OLC | PLC | QLC | RLC | SLC | TLC | ULC | VLC | WLC | XLC | YLC | ZLC |
|
||||||
|
AUC | BUC | CUC | DUC | EUC | FUC | GUC | HUC | IUC | JUC | KUC | LUC | MUC | NUC | OUC | PUC | QUC | RUC | SUC | TUC | UUC | VUC | WUC | XUC | YUC | ZUC
|
||||||
|
;
|
||||||
|
|
||||||
|
// QUOTING
|
||||||
|
//
|
||||||
|
// \x where x is non-alphanumeric is a literal x
|
||||||
|
// \Q...\E treat enclosed characters as literal
|
||||||
|
Quoted : '\\' NonAlphaNumeric;
|
||||||
|
BlockQuoted : '\\Q' .*? '\\E';
|
||||||
|
|
||||||
|
// CHARACTERS
|
||||||
|
//
|
||||||
|
// \a alarm, that is, the BEL character (hex 07)
|
||||||
|
// \cx "control-x", where x is any ASCII character
|
||||||
|
// \e escape (hex 1B)
|
||||||
|
// \f form feed (hex 0C)
|
||||||
|
// \n newline (hex 0A)
|
||||||
|
// \r carriage return (hex 0D)
|
||||||
|
// \t tab (hex 09)
|
||||||
|
// \ddd character with octal code ddd, or backreference
|
||||||
|
// \xhh character with hex code hh
|
||||||
|
// \x{hhh..} character with hex code hhh..
|
||||||
|
BellChar : '\\a';
|
||||||
|
// The character after "\c" is optional, so a bare "\c" also lexes as ControlChar.
ControlChar : '\\c' ASCII?;
|
||||||
|
EscapeChar : '\\e';
|
||||||
|
FormFeed : '\\f';
|
||||||
|
NewLine : '\\n';
|
||||||
|
CarriageReturn : '\\r';
|
||||||
|
Tab : '\\t';
|
||||||
|
Backslash : '\\';
|
||||||
|
// Hex escape: "\xhh" (exactly two hex digits) or "\x{h..}" (one or more hex
// digits in braces). The braced form must accept short codes such as
// "\x{41}"; otherwise the input falls through to Backslash + 'x' and the
// braces are then read by the parser as a "{41}" quantifier.
HexChar : '\\x' ( HexDigit HexDigit
                | '{' HexDigit+ '}'
                )
        ;
|
||||||
|
|
||||||
|
// CHARACTER TYPES
|
||||||
|
//
|
||||||
|
// . any character except newline;
|
||||||
|
// in dotall mode, any character whatsoever
|
||||||
|
// \C one data unit, even in UTF mode (best avoided)
|
||||||
|
// \d a decimal digit
|
||||||
|
// \D a character that is not a decimal digit
|
||||||
|
// \h a horizontal white space character
|
||||||
|
// \H a character that is not a horizontal white space character
|
||||||
|
// \N a character that is not a newline
|
||||||
|
// \p{xx} a character with the xx property
|
||||||
|
// \P{xx} a character without the xx property
|
||||||
|
// \R a newline sequence
|
||||||
|
// \s a white space character
|
||||||
|
// \S a character that is not a white space character
|
||||||
|
// \v a vertical white space character
|
||||||
|
// \V a character that is not a vertical white space character
|
||||||
|
// \w a "word" character
|
||||||
|
// \W a "non-word" character
|
||||||
|
// \X an extended Unicode sequence
|
||||||
|
//
|
||||||
|
// In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
|
||||||
|
// characters, even in a UTF mode. However, this can be changed by setting
|
||||||
|
// the PCRE_UCP option.
|
||||||
|
Dot : '.';
|
||||||
|
OneDataUnit : '\\C';
|
||||||
|
DecimalDigit : '\\d';
|
||||||
|
NotDecimalDigit : '\\D';
|
||||||
|
HorizontalWhiteSpace : '\\h';
|
||||||
|
NotHorizontalWhiteSpace : '\\H';
|
||||||
|
NotNewLine : '\\N';
|
||||||
|
CharWithProperty : '\\p{' UnderscoreAlphaNumerics '}';
|
||||||
|
CharWithoutProperty : '\\P{' UnderscoreAlphaNumerics '}';
|
||||||
|
NewLineSequence : '\\R';
|
||||||
|
WhiteSpace : '\\s';
|
||||||
|
NotWhiteSpace : '\\S';
|
||||||
|
VerticalWhiteSpace : '\\v';
|
||||||
|
NotVerticalWhiteSpace : '\\V';
|
||||||
|
WordChar : '\\w';
|
||||||
|
NotWordChar : '\\W';
|
||||||
|
ExtendedUnicodeChar : '\\X';
|
||||||
|
|
||||||
|
// CHARACTER CLASSES
|
||||||
|
//
|
||||||
|
// [...] positive character class
|
||||||
|
// [^...] negative character class
|
||||||
|
// [x-y] range (can be used for hex characters)
|
||||||
|
// [[:xxx:]] positive POSIX named set
|
||||||
|
// [[:^xxx:]] negative POSIX named set
|
||||||
|
//
|
||||||
|
// alnum alphanumeric
|
||||||
|
// alpha alphabetic
|
||||||
|
// ascii 0-127
|
||||||
|
// blank space or tab
|
||||||
|
// cntrl control character
|
||||||
|
// digit decimal digit
|
||||||
|
// graph printing, excluding space
|
||||||
|
// lower lower case letter
|
||||||
|
// print printing, including space
|
||||||
|
// punct printing, excluding alphanumeric
|
||||||
|
// space white space
|
||||||
|
// upper upper case letter
|
||||||
|
// word same as \w
|
||||||
|
// xdigit hexadecimal digit
|
||||||
|
//
|
||||||
|
// In PCRE, POSIX character set names recognize only ASCII characters by
|
||||||
|
// default, but some of them use Unicode properties if PCRE_UCP is set.
|
||||||
|
// You can use \Q...\E inside a character class.
|
||||||
|
CharacterClassStart : '[';
|
||||||
|
CharacterClassEnd : ']';
|
||||||
|
Caret : '^';
|
||||||
|
Hyphen : '-';
|
||||||
|
POSIXNamedSet : '[[:' AlphaNumerics ':]]';
|
||||||
|
POSIXNegatedNamedSet : '[[:^' AlphaNumerics ':]]';
|
||||||
|
|
||||||
|
QuestionMark : '?';
|
||||||
|
Plus : '+';
|
||||||
|
Star : '*';
|
||||||
|
OpenBrace : '{';
|
||||||
|
CloseBrace : '}';
|
||||||
|
Comma : ',';
|
||||||
|
|
||||||
|
// ANCHORS AND SIMPLE ASSERTIONS
|
||||||
|
//
|
||||||
|
// \b word boundary
|
||||||
|
// \B not a word boundary
|
||||||
|
// ^ start of subject
|
||||||
|
// also after internal newline in multiline mode
|
||||||
|
// \A start of subject
|
||||||
|
// $ end of subject
|
||||||
|
// also before newline at end of subject
|
||||||
|
// also before internal newline in multiline mode
|
||||||
|
// \Z end of subject
|
||||||
|
// also before newline at end of subject
|
||||||
|
// \z end of subject
|
||||||
|
// \G first matching position in subject
|
||||||
|
WordBoundary : '\\b';
|
||||||
|
NonWordBoundary : '\\B';
|
||||||
|
StartOfSubject : '\\A';
|
||||||
|
EndOfSubjectOrLine : '$';
|
||||||
|
EndOfSubjectOrLineEndOfSubject : '\\Z';
|
||||||
|
EndOfSubject : '\\z';
|
||||||
|
PreviousMatchInSubject : '\\G';
|
||||||
|
|
||||||
|
// MATCH POINT RESET
|
||||||
|
//
|
||||||
|
// \K reset start of match
|
||||||
|
ResetStartMatch : '\\K';
|
||||||
|
|
||||||
|
// The "\g" prefix; referenced as the literal '\\g' in the parser rules
// `backreference` and `subroutine_reference`.
SubroutineOrNamedReferenceStartG : '\\g';
|
||||||
|
// The "\k" prefix; referenced as the literal '\\k' in the parser rule
// `backreference`.
NamedReferenceStartK : '\\k';
|
||||||
|
|
||||||
|
Pipe : '|';
|
||||||
|
OpenParen : '(';
|
||||||
|
CloseParen : ')';
|
||||||
|
LessThan : '<';
|
||||||
|
GreaterThan : '>';
|
||||||
|
SingleQuote : '\'';
|
||||||
|
Underscore : '_';
|
||||||
|
Colon : ':';
|
||||||
|
Hash : '#';
|
||||||
|
Equals : '=';
|
||||||
|
Exclamation : '!';
|
||||||
|
Ampersand : '&';
|
||||||
|
|
||||||
|
ALC : 'a';
|
||||||
|
BLC : 'b';
|
||||||
|
CLC : 'c';
|
||||||
|
DLC : 'd';
|
||||||
|
ELC : 'e';
|
||||||
|
FLC : 'f';
|
||||||
|
GLC : 'g';
|
||||||
|
HLC : 'h';
|
||||||
|
ILC : 'i';
|
||||||
|
JLC : 'j';
|
||||||
|
KLC : 'k';
|
||||||
|
LLC : 'l';
|
||||||
|
MLC : 'm';
|
||||||
|
NLC : 'n';
|
||||||
|
OLC : 'o';
|
||||||
|
PLC : 'p';
|
||||||
|
QLC : 'q';
|
||||||
|
RLC : 'r';
|
||||||
|
SLC : 's';
|
||||||
|
TLC : 't';
|
||||||
|
ULC : 'u';
|
||||||
|
VLC : 'v';
|
||||||
|
WLC : 'w';
|
||||||
|
XLC : 'x';
|
||||||
|
YLC : 'y';
|
||||||
|
ZLC : 'z';
|
||||||
|
|
||||||
|
AUC : 'A';
|
||||||
|
BUC : 'B';
|
||||||
|
CUC : 'C';
|
||||||
|
DUC : 'D';
|
||||||
|
EUC : 'E';
|
||||||
|
FUC : 'F';
|
||||||
|
GUC : 'G';
|
||||||
|
HUC : 'H';
|
||||||
|
IUC : 'I';
|
||||||
|
JUC : 'J';
|
||||||
|
KUC : 'K';
|
||||||
|
LUC : 'L';
|
||||||
|
MUC : 'M';
|
||||||
|
NUC : 'N';
|
||||||
|
OUC : 'O';
|
||||||
|
PUC : 'P';
|
||||||
|
QUC : 'Q';
|
||||||
|
RUC : 'R';
|
||||||
|
SUC : 'S';
|
||||||
|
TUC : 'T';
|
||||||
|
UUC : 'U';
|
||||||
|
VUC : 'V';
|
||||||
|
WUC : 'W';
|
||||||
|
XUC : 'X';
|
||||||
|
YUC : 'Y';
|
||||||
|
ZUC : 'Z';
|
||||||
|
|
||||||
|
D1 : '1';
|
||||||
|
D2 : '2';
|
||||||
|
D3 : '3';
|
||||||
|
D4 : '4';
|
||||||
|
D5 : '5';
|
||||||
|
D6 : '6';
|
||||||
|
D7 : '7';
|
||||||
|
D8 : '8';
|
||||||
|
D9 : '9';
|
||||||
|
D0 : '0';
|
||||||
|
|
||||||
|
// Catch-all: any single character not claimed by one of the lexer rules
// declared before this one. Must stay after those rules, since ANTLR breaks
// equal-length ties in favour of the rule declared first.
OtherChar : . ;
|
||||||
|
|
||||||
|
// fragments
|
||||||
|
fragment UnderscoreAlphaNumerics : ('_' | AlphaNumeric)+;
|
||||||
|
fragment AlphaNumerics : AlphaNumeric+;
|
||||||
|
fragment AlphaNumeric : [a-zA-Z0-9];
|
||||||
|
fragment NonAlphaNumeric : ~[a-zA-Z0-9];
|
||||||
|
fragment HexDigit : [0-9a-fA-F];
|
||||||
|
fragment ASCII : [\u0000-\u007F];
|
||||||
Loading…
Reference in New Issue