diff --git a/text-processing/src/main/antlr4/ru/roboswag/textprocessing/pcre/PCRE.g4 b/text-processing/src/main/antlr4/ru/roboswag/textprocessing/pcre/PCRE.g4
new file mode 100644
index 0000000..b95406c
--- /dev/null
+++ b/text-processing/src/main/antlr4/ru/roboswag/textprocessing/pcre/PCRE.g4
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2014-2022 by Bart Kiers
+ *
+ * The MIT license.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Project      : PCRE Parser, an ANTLR 4 grammar for PCRE
+ * Developed by : Bart Kiers, bart@big-o.nl
+ * Also see     : https://github.com/bkiers/pcre-parser
+ */
+grammar PCRE;
+
+// Most single line comments above the lexer- and parser rules
+// are copied from the official PCRE man pages (last updated:
+// 10 January 2012): http://www.pcre.org/pcre.txt
+//
+// Entry rule: a complete pattern is a single alternation that must be
+// followed by end of input (EOF), so trailing unparsed text is an error.
+parse
+ : alternation EOF
+ ;
+
+// ALTERNATION
+//
+//         expr|expr|expr...
+// One or more branches separated by '|'. A branch may be empty because
+// `expr` matches zero elements.
+alternation
+ : expr ( '|' expr )*
+ ;
+
+// A single branch: any number of elements in sequence (possibly none).
+expr
+ : element*
+ ;
+
+// One atom, optionally followed by a repetition specifier.
+element
+ : atom quantifier?
+ ;
+
+// QUANTIFIERS
+//
+//   base form   meaning                          greedy   possessive   lazy
+//   ---------   ------------------------------   ------   ----------   ------
+//   ?           0 or 1                           ?        ?+           ??
+//   *           0 or more                        *        *+           *?
+//   +           1 or more                        +        ++           +?
+//   {n}         exactly n                        {n}
+//   {n,m}       at least n, no more than m       {n,m}    {n,m}+       {n,m}?
+//   {n,}        n or more                        {n,}     {n,}+        {n,}?
+//
+// The base form is matched first; the trailing `quantifier_type` carries the
+// optional '+' (possessive) or '?' (lazy) suffix and is empty for greedy.
+quantifier
+ : ( '?'
+   | '+'
+   | '*'
+   | '{' number ( ',' number? )? '}'
+   )
+   quantifier_type
+ ;
+
+// Optional suffix of a quantifier: '+', '?', or nothing at all.
+quantifier_type
+ : ( '+' | '?' )?
+ ;
+
+// CHARACTER CLASSES
+//
+//         [...]       positive character class
+//         [^...]      negative character class
+//         [x-y]       range (can be used for hex characters)
+//         [[:xxx:]]   positive POSIX named set
+//         [[:^xxx:]]  negative POSIX named set
+//
+//         alnum       alphanumeric
+//         alpha       alphabetic
+//         ascii       0-127
+//         blank       space or tab
+//         cntrl       control character
+//         digit       decimal digit
+//         graph       printing, excluding space
+//         lower       lower case letter
+//         print       printing, including space
+//         punct       printing, excluding alphanumeric
+//         space       white space
+//         upper       upper case letter
+//         word        same as \w
+//         xdigit      hexadecimal digit
+//
+//       In PCRE, POSIX character set names recognize only ASCII characters by
+//       default, but some of them use Unicode properties if PCRE_UCP is set.
+//       You can use \Q...\E inside a character class.
+// A bracketed character class. The first three alternatives are the negated
+// forms ('[' '^' ...), the last three the positive forms. In both groups a
+// ']' placed directly after the opening (CharacterClassEnd) is accepted as a
+// member of the class, optionally followed by a Hyphen and further atoms.
+character_class
+ : '[' '^' CharacterClassEnd Hyphen cc_atom+ ']'
+ | '[' '^' CharacterClassEnd cc_atom* ']'
+ | '[' '^' cc_atom+ ']'
+ | '[' CharacterClassEnd Hyphen cc_atom+ ']'
+ | '[' CharacterClassEnd cc_atom* ']'
+ | '[' cc_atom+ ']'
+ ;
+
+// BACKREFERENCES
+//
+//         \n              reference by number (can be ambiguous)
+//         \gn             reference by number
+//         \g{n}           reference by number
+//         \g{-n}          relative reference by number
+//         \k<name>        reference by name (Perl)
+//         \k'name'        reference by name (Perl)
+//         \g{name}        reference by name (Perl)
+//         \k{name}        reference by name (.NET)
+//         (?P=name)       reference by name (Python)
+backreference
+ : backreference_or_octal
+ | '\\g' number
+ | '\\g' '{' number '}'
+ | '\\g' '{' '-' number '}'
+ | '\\k' '<' name '>'
+ | '\\k' '\'' name '\''
+ | '\\g' '{' name '}'
+ | '\\k' '{' name '}'
+ | '(' '?' 'P' '=' name ')'
+ ;
+
+// Either a two/three digit octal escape (octal_char) or a backslash followed
+// by a single digit; the grammar does not decide which meaning applies.
+backreference_or_octal
+ : octal_char
+ | Backslash digit
+ ;
+
+// CAPTURING
+//
+//         (...)           capturing group
+//         (?<name>...)    named capturing group (Perl)
+//         (?'name'...)    named capturing group (Perl)
+//         (?P<name>...)   named capturing group (Python)
+//         (?:...)         non-capturing group
+//         (?|...)         non-capturing group; reset group numbers for
+//                          capturing groups in each alternative
+//
+// ATOMIC GROUPS
+//
+//         (?>...)         atomic, non-capturing group
+capture
+ : '(' '?' '<' name '>' alternation ')'
+ | '(' '?' '\'' name '\'' alternation ')'
+ | '(' '?' 'P' '<' name '>' alternation ')'
+ | '(' alternation ')'
+ ;
+
+// Groups that do not capture: plain (?:...), branch-reset (?|...), atomic
+// (?>...), and a group carrying inline option flags, e.g. (?i:...).
+non_capture
+ : '(' '?' ':' alternation ')'
+ | '(' '?' '|' alternation ')'
+ | '(' '?' '>' alternation ')'
+ | '(' '?' option_flags ':' alternation ')'
+ ;
+
+// COMMENT
+//
+//         (?#....)        comment (not nestable)
+comment
+ : '(' '?' '#' non_close_parens ')'
+ ;
+
+// OPTION SETTING
+//
+//         (?i)            caseless
+//         (?J)            allow duplicate names
+//         (?m)            multiline
+//         (?s)            single line (dotall)
+//         (?U)            default ungreedy (lazy)
+//         (?x)            extended (ignore white space)
+//         (?-...)         unset option(s)
+//
+//       The following are recognized only at the start of a pattern or after
+//       one of the newline-setting options with similar syntax:
+//
+//         (*NO_START_OPT) no start-match optimization (PCRE_NO_START_OPTIMIZE)
+//         (*UTF8)         set UTF-8 mode: 8-bit library (PCRE_UTF8)
+//         (*UTF16)        set UTF-16 mode: 16-bit library (PCRE_UTF16)
+//         (*UCP)          set PCRE_UCP (use Unicode properties for \d etc)
+//
+// Note: this rule is reachable from `atom`, so the grammar itself does not
+// enforce the "only at the start of a pattern" restriction quoted above.
+option
+ : '(' '?' option_flags '-' option_flags ')'
+ | '(' '?' option_flags ')'
+ | '(' '?' '-' option_flags ')'
+ | '(' '*' 'N' 'O' '_' 'S' 'T' 'A' 'R' 'T' '_' 'O' 'P' 'T' ')'
+ | '(' '*' 'U' 'T' 'F' '8' ')'
+ | '(' '*' 'U' 'T' 'F' '1' '6' ')'
+ | '(' '*' 'U' 'C' 'P' ')'
+ ;
+
+// One or more option letters in a row, e.g. "im".
+option_flags
+ : option_flag+
+ ;
+
+// A single option letter; see the OPTION SETTING table for meanings.
+option_flag
+ : 'i'
+ | 'J'
+ | 'm'
+ | 's'
+ | 'U'
+ | 'x'
+ ;
+
+// LOOKAHEAD AND LOOKBEHIND ASSERTIONS
+//
+//         (?=...)         positive look ahead
+//         (?!...)         negative look ahead
+//         (?<=...)        positive look behind
+//         (?<!...)        negative look behind
+//
+//       Each top-level branch of a look behind must be of a fixed length.
+//
+// Note: the fixed-length requirement is not checked by this rule; any
+// alternation is accepted inside a look behind.
+look_around
+ : '(' '?' '=' alternation ')'
+ | '(' '?' '!' alternation ')'
+ | '(' '?' '<' '=' alternation ')'
+ | '(' '?' '<' '!' alternation ')'
+ ;
+
+// SUBROUTINE REFERENCES (POSSIBLY RECURSIVE)
+//
+//         (?R)            recurse whole pattern
+//         (?n)            call subpattern by absolute number
+//         (?+n)           call subpattern by relative number
+//         (?-n)           call subpattern by relative number
+//         (?&name)        call subpattern by name (Perl)
+//         (?P>name)       call subpattern by name (Python)
+//         \g<name>        call subpattern by name (Oniguruma)
+//         \g'name'        call subpattern by name (Oniguruma)
+//         \g<n>           call subpattern by absolute number (Oniguruma)
+//         \g'n'           call subpattern by absolute number (Oniguruma)
+//         \g<+n>          call subpattern by relative number (PCRE extension)
+//         \g'+n'          call subpattern by relative number (PCRE extension)
+//         \g<-n>          call subpattern by relative number (PCRE extension)
+//         \g'-n'          call subpattern by relative number (PCRE extension)
+subroutine_reference
+ : '(' '?' 'R' ')'
+ | '(' '?' number ')'
+ | '(' '?' '+' number ')'
+ | '(' '?' '-' number ')'
+ | '(' '?' '&' name ')'
+ | '(' '?' 'P' '>' name ')'
+ | '\\g' '<' name '>'
+ | '\\g' '\'' name '\''
+ | '\\g' '<' number '>'
+ | '\\g' '\'' number '\''
+ | '\\g' '<' '+' number '>'
+ | '\\g' '\'' '+' number '\''
+ | '\\g' '<' '-' number '>'
+ | '\\g' '\'' '-' number '\''
+ ;
+
+// CONDITIONAL PATTERNS
+//
+//         (?(condition)yes-pattern)
+//         (?(condition)yes-pattern|no-pattern)
+//
+//         (?(n)...        absolute reference condition
+//         (?(+n)...       relative reference condition
+//         (?(-n)...       relative reference condition
+//         (?(<name>)...   named reference condition (Perl)
+//         (?('name')...   named reference condition (Perl)
+//         (?(name)...     named reference condition (PCRE)
+//         (?(R)...        overall recursion condition
+//         (?(Rn)...       specific group recursion condition
+//         (?(R&name)...   specific recursion condition
+//         (?(DEFINE)...   define subpattern for reference
+//         (?(assert)...   assertion condition
+//
+// Note: the "assert" alternative below matches the literal letters
+// a-s-s-e-r-t, and the bare-name alternative is listed last so the more
+// specific R / DEFINE / assert spellings are tried before it.
+conditional
+ : '(' '?' '(' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '+' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '-' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '<' name '>' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' '\'' name '\'' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'R' number ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'R' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'R' '&' name ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'D' 'E' 'F' 'I' 'N' 'E' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' 'a' 's' 's' 'e' 'r' 't' ')' alternation ('|' alternation)? ')'
+ | '(' '?' '(' name ')' alternation ('|' alternation)? ')'
+ ;
+
+// BACKTRACKING CONTROL
+//
+//       The following act immediately they are reached:
+//
+//         (*ACCEPT)       force successful match
+//         (*FAIL)         force backtrack; synonym (*F)
+//         (*MARK:NAME)    set name to be passed back; synonym (*:NAME)
+//
+//       The following act only when a subsequent match failure causes a back-
+//       track to reach them. They all force a match failure, but they differ in
+//       what happens afterwards. Those that advance the start-of-match point do
+//       so only if the pattern is not anchored.
+//
+//         (*COMMIT)       overall failure, no advance of starting point
+//         (*PRUNE)        advance to next starting character
+//         (*PRUNE:NAME)   equivalent to (*MARK:NAME)(*PRUNE)
+//         (*SKIP)         advance to current matching position
+//         (*SKIP:NAME)    advance to position corresponding to an earlier
+//                         (*MARK:NAME); if not found, the (*SKIP) is ignored
+//         (*THEN)         local failure, backtrack to next alternation
+//         (*THEN:NAME)    equivalent to (*MARK:NAME)(*THEN)
+//
+// NAME in the table above is a placeholder, so the part after ':' is parsed
+// with the `name` rule (letters, digits and underscores, not starting with a
+// digit) instead of the literal letters N-A-M-E. The literal text "NAME" is
+// still accepted because it is itself a valid `name`.
+// NOTE(review): PCRE may allow a wider character set in verb names than
+// `name` does - confirm before relying on this for arbitrary mark names.
+backtrack_control
+ : '(' '*' 'A' 'C' 'C' 'E' 'P' 'T' ')'
+ | '(' '*' 'F' ('A' 'I' 'L')? ')'
+ | '(' '*' ('M' 'A' 'R' 'K')? ':' name ')'
+ | '(' '*' 'C' 'O' 'M' 'M' 'I' 'T' ')'
+ | '(' '*' 'P' 'R' 'U' 'N' 'E' ')'
+ | '(' '*' 'P' 'R' 'U' 'N' 'E' ':' name ')'
+ | '(' '*' 'S' 'K' 'I' 'P' ')'
+ | '(' '*' 'S' 'K' 'I' 'P' ':' name ')'
+ | '(' '*' 'T' 'H' 'E' 'N' ')'
+ | '(' '*' 'T' 'H' 'E' 'N' ':' name ')'
+ ;
+
+// NEWLINE CONVENTIONS
+//
+//       These are recognized only at the very start of the pattern or after a
+//       (*BSR_...), (*UTF8), (*UTF16) or (*UCP) option.
+//
+//         (*CR)           carriage return only
+//         (*LF)           linefeed only
+//         (*CRLF)         carriage return followed by linefeed
+//         (*ANYCRLF)      all three of the above
+//         (*ANY)          any Unicode newline sequence
+//
+// WHAT \R MATCHES
+//
+//       These are recognized only at the very start of the pattern or after a
+//       (*...) option that sets the newline convention or a UTF or UCP mode.
+//
+//         (*BSR_ANYCRLF)  CR, LF, or CRLF
+//         (*BSR_UNICODE)  any Unicode newline sequence
+newline_convention
+ : '(' '*' 'C' 'R' ')'
+ | '(' '*' 'L' 'F' ')'
+ | '(' '*' 'C' 'R' 'L' 'F' ')'
+ | '(' '*' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'
+ | '(' '*' 'A' 'N' 'Y' ')'
+ | '(' '*' 'B' 'S' 'R' '_' 'A' 'N' 'Y' 'C' 'R' 'L' 'F' ')'
+ | '(' '*' 'B' 'S' 'R' '_' 'U' 'N' 'I' 'C' 'O' 'D' 'E' ')'
+ ;
+
+// CALLOUTS
+//
+//         (?C)      callout
+//         (?Cn)     callout with data n
+callout
+ : '(' '?' 'C' ')'
+ | '(' '?' 'C' number ')'
+ ;
+
+// Everything that can appear as one element of a branch outside a character
+// class: groups and other parenthesised constructs, escapes, literals,
+// anchors and the single-token assertions.
+atom
+ : subroutine_reference
+ | shared_atom
+ | literal
+ | character_class
+ | capture
+ | non_capture
+ | comment
+ | option
+ | look_around
+ | backreference
+ | conditional
+ | backtrack_control
+ | newline_convention
+ | callout
+ | Dot
+ | Caret
+ | StartOfSubject
+ | WordBoundary
+ | NonWordBoundary
+ | EndOfSubjectOrLine
+ | EndOfSubjectOrLineEndOfSubject
+ | EndOfSubject
+ | PreviousMatchInSubject
+ | ResetStartMatch
+ | OneDataUnit
+ | ExtendedUnicodeChar
+ ;
+
+// One member of a character class. The range form (x-y) is listed first so
+// "a-z" is taken as a range rather than three separate literals.
+cc_atom
+ : cc_literal Hyphen cc_literal
+ | shared_atom
+ | cc_literal
+ | backreference_or_octal // only octal is valid in a cc
+ ;
+
+// Atoms that are valid both inside and outside a character class.
+shared_atom
+ : POSIXNamedSet
+ | POSIXNegatedNamedSet
+ | ControlChar
+ | DecimalDigit
+ | NotDecimalDigit
+ | HorizontalWhiteSpace
+ | NotHorizontalWhiteSpace
+ | NotNewLine
+ | CharWithProperty
+ | CharWithoutProperty
+ | NewLineSequence
+ | WhiteSpace
+ | NotWhiteSpace
+ | VerticalWhiteSpace
+ | NotVerticalWhiteSpace
+ | WordChar
+ | NotWordChar
+ | Backslash . // will match "unfinished" escape sequences, like `\x`
+ ;
+
+// A literal outside a character class: any shared literal, plus ']' which
+// has no special meaning there.
+literal
+ : shared_literal
+ | CharacterClassEnd
+ ;
+
+// A literal inside a character class: any shared literal, plus the tokens
+// listed here, which are accepted as plain class members.
+cc_literal
+ : shared_literal
+ | Dot
+ | CharacterClassStart
+ | Caret
+ | QuestionMark
+ | Plus
+ | Star
+ | WordBoundary
+ | EndOfSubjectOrLine
+ | Pipe
+ | OpenParen
+ | CloseParen
+ ;
+
+// Literals accepted both inside and outside a character class.
+shared_literal
+ : octal_char
+ | letter
+ | digit
+ | BellChar
+ | EscapeChar
+ | FormFeed
+ | NewLine
+ | CarriageReturn
+ | Tab
+ | HexChar
+ | Quoted
+ | BlockQuoted
+ | OpenBrace
+ | CloseBrace
+ | Comma
+ | Hyphen
+ | LessThan
+ | GreaterThan
+ | SingleQuote
+ | Underscore
+ | Colon
+ | Hash
+ | Equals
+ | Exclamation
+ | Ampersand
+ | OtherChar
+ ;
+
+// A non-empty run of decimal digits.
+number
+ : digits
+ ;
+
+// A backslash followed by either three octal digits whose first digit is
+// 0-3, or exactly two octal digits.
+octal_char
+ : ( Backslash (D0 | D1 | D2 | D3) octal_digit octal_digit
+   | Backslash octal_digit octal_digit
+   )
+
+ ;
+
+octal_digit
+ : D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7
+ ;
+
+digits
+ : digit+
+ ;
+
+digit
+ : D0 | D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 | D9
+ ;
+
+// A group or mark name; see alpha_nums for the accepted shape.
+name
+ : alpha_nums
+ ;
+
+// Identifier shape: starts with a letter or underscore, continues with
+// letters, underscores or digits.
+alpha_nums
+ : (letter | Underscore) (letter | Underscore | digit)*
+ ;
+
+// Body of a (?#...) comment: one or more tokens, none of which is ')'.
+non_close_parens
+ : non_close_paren+
+ ;
+
+non_close_paren
+ : ~CloseParen
+ ;
+
+// Any single ASCII letter. Each letter is its own token (see the ALC..ZUC
+// lexer rules) so keywords can be spelled letter by letter in parser rules.
+letter
+ : ALC | BLC | CLC | DLC | ELC | FLC | GLC | HLC | ILC | JLC | KLC | LLC | MLC | NLC | OLC | PLC | QLC | RLC | SLC | TLC | ULC | VLC | WLC | XLC | YLC | ZLC |
+   AUC | BUC | CUC | DUC | EUC | FUC | GUC | HUC | IUC | JUC | KUC | LUC | MUC | NUC | OUC | PUC | QUC | RUC | SUC | TUC | UUC | VUC | WUC | XUC | YUC | ZUC
+ ;
+
+// QUOTING
+//
+//         \x         where x is non-alphanumeric is a literal x
+//         \Q...\E    treat enclosed characters as literal
+Quoted      : '\\' NonAlphaNumeric;
+BlockQuoted : '\\Q' .*? '\\E';
+
+// CHARACTERS
+//
+//         \a         alarm, that is, the BEL character (hex 07)
+//         \cx        "control-x", where x is any ASCII character
+//         \e         escape (hex 1B)
+//         \f         form feed (hex 0C)
+//         \n         newline (hex 0A)
+//         \r         carriage return (hex 0D)
+//         \t         tab (hex 09)
+//         \ddd       character with octal code ddd, or backreference
+//         \xhh       character with hex code hh
+//         \x{hhh..}  character with hex code hhh..
+BellChar       : '\\a';
+ControlChar    : '\\c' ASCII?;
+EscapeChar     : '\\e';
+FormFeed       : '\\f';
+NewLine        : '\\n';
+CarriageReturn : '\\r';
+Tab            : '\\t';
+Backslash      : '\\';
+// The braced form takes one or more hex digits (e.g. \x{41}); with fewer
+// than three digits it would otherwise lex as Backslash, 'x' and a {n}
+// quantifier.
+HexChar        : '\\x' ( HexDigit HexDigit
+                       | '{' HexDigit+ '}'
+                       )
+               ;
+
+// CHARACTER TYPES
+//
+//         .          any character except newline;
+//                      in dotall mode, any character whatsoever
+//         \C         one data unit, even in UTF mode (best avoided)
+//         \d         a decimal digit
+//         \D         a character that is not a decimal digit
+//         \h         a horizontal white space character
+//         \H         a character that is not a horizontal white space character
+//         \N         a character that is not a newline
+//         \p{xx}     a character with the xx property
+//         \P{xx}     a character without the xx property
+//         \R         a newline sequence
+//         \s         a white space character
+//         \S         a character that is not a white space character
+//         \v         a vertical white space character
+//         \V         a character that is not a vertical white space character
+//         \w         a "word" character
+//         \W         a "non-word" character
+//         \X         an extended Unicode sequence
+//
+//       In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
+//       characters, even in a UTF mode. However, this can be changed by setting
+//       the PCRE_UCP option.
+// Each rule below matches one character-type escape (or the bare '.') as a
+// single token. Only the braced forms \p{...} and \P{...} are tokenized by
+// CharWithProperty / CharWithoutProperty; the property name is limited to
+// letters, digits and underscores.
+Dot                     : '.';
+OneDataUnit             : '\\C';
+DecimalDigit            : '\\d';
+NotDecimalDigit         : '\\D';
+HorizontalWhiteSpace    : '\\h';
+NotHorizontalWhiteSpace : '\\H';
+NotNewLine              : '\\N';
+CharWithProperty        : '\\p{' UnderscoreAlphaNumerics '}';
+CharWithoutProperty     : '\\P{' UnderscoreAlphaNumerics '}';
+NewLineSequence         : '\\R';
+WhiteSpace              : '\\s';
+NotWhiteSpace           : '\\S';
+VerticalWhiteSpace      : '\\v';
+NotVerticalWhiteSpace   : '\\V';
+WordChar                : '\\w';
+NotWordChar             : '\\W';
+ExtendedUnicodeChar     : '\\X';
+
+// CHARACTER CLASSES
+//
+//         [...]       positive character class
+//         [^...]      negative character class
+//         [x-y]       range (can be used for hex characters)
+//         [[:xxx:]]   positive POSIX named set
+//         [[:^xxx:]]  negative POSIX named set
+//
+//         alnum       alphanumeric
+//         alpha       alphabetic
+//         ascii       0-127
+//         blank       space or tab
+//         cntrl       control character
+//         digit       decimal digit
+//         graph       printing, excluding space
+//         lower       lower case letter
+//         print       printing, including space
+//         punct       printing, excluding alphanumeric
+//         space       white space
+//         upper       upper case letter
+//         word        same as \w
+//         xdigit      hexadecimal digit
+//
+//       In PCRE, POSIX character set names recognize only ASCII characters by
+//       default, but some of them use Unicode properties if PCRE_UCP is set.
+//       You can use \Q...\E inside a character class.
+// Bracket-related tokens. The two POSIX rules match the complete outer and
+// inner brackets ("[[:" name ":]]") as one token; the name is any run of
+// letters and digits and is not checked against the list of known set names.
+CharacterClassStart  : '[';
+CharacterClassEnd    : ']';
+Caret                : '^';
+Hyphen               : '-';
+POSIXNamedSet        : '[[:' AlphaNumerics ':]]';
+POSIXNegatedNamedSet : '[[:^' AlphaNumerics ':]]';
+
+// Single-character tokens used by quantifiers.
+QuestionMark : '?';
+Plus         : '+';
+Star         : '*';
+OpenBrace    : '{';
+CloseBrace   : '}';
+Comma        : ',';
+
+// ANCHORS AND SIMPLE ASSERTIONS
+//
+//         \b          word boundary
+//         \B          not a word boundary
+//         ^           start of subject
+//                      also after internal newline in multiline mode
+//         \A          start of subject
+//         $           end of subject
+//                      also before newline at end of subject
+//                      also before internal newline in multiline mode
+//         \Z          end of subject
+//                      also before newline at end of subject
+//         \z          end of subject
+//         \G          first matching position in subject
+WordBoundary                   : '\\b';
+NonWordBoundary                : '\\B';
+StartOfSubject                 : '\\A';
+EndOfSubjectOrLine             : '$';
+EndOfSubjectOrLineEndOfSubject : '\\Z';
+EndOfSubject                   : '\\z';
+PreviousMatchInSubject         : '\\G';
+
+// MATCH POINT RESET
+//
+//         \K          reset start of match
+ResetStartMatch : '\\K';
+
+// Prefix tokens for the '\\g' and '\\k' literals used in parser rules.
+SubroutineOrNamedReferenceStartG : '\\g';
+NamedReferenceStartK             : '\\k';
+
+// Punctuation tokens, one character each.
+Pipe        : '|';
+OpenParen   : '(';
+CloseParen  : ')';
+LessThan    : '<';
+GreaterThan : '>';
+SingleQuote : '\'';
+Underscore  : '_';
+Colon       : ':';
+Hash        : '#';
+Equals      : '=';
+Exclamation : '!';
+Ampersand   : '&';
+
+// One token per lower case ASCII letter (xLC = letter x, Lower Case).
+ALC : 'a';
+BLC : 'b';
+CLC : 'c';
+DLC : 'd';
+ELC : 'e';
+FLC : 'f';
+GLC : 'g';
+HLC : 'h';
+ILC : 'i';
+JLC : 'j';
+KLC : 'k';
+LLC : 'l';
+MLC : 'm';
+NLC : 'n';
+OLC : 'o';
+PLC : 'p';
+QLC : 'q';
+RLC : 'r';
+SLC : 's';
+TLC : 't';
+ULC : 'u';
+VLC : 'v';
+WLC : 'w';
+XLC : 'x';
+YLC : 'y';
+ZLC : 'z';
+
+// One token per upper case ASCII letter (xUC = letter x, Upper Case).
+AUC : 'A';
+BUC : 'B';
+CUC : 'C';
+DUC : 'D';
+EUC : 'E';
+FUC : 'F';
+GUC : 'G';
+HUC : 'H';
+IUC : 'I';
+JUC : 'J';
+KUC : 'K';
+LUC : 'L';
+MUC : 'M';
+NUC : 'N';
+OUC : 'O';
+PUC : 'P';
+QUC : 'Q';
+RUC : 'R';
+SUC : 'S';
+TUC : 'T';
+UUC : 'U';
+VUC : 'V';
+WUC : 'W';
+XUC : 'X';
+YUC : 'Y';
+ZUC : 'Z';
+
+// One token per decimal digit.
+D1 : '1';
+D2 : '2';
+D3 : '3';
+D4 : '4';
+D5 : '5';
+D6 : '6';
+D7 : '7';
+D8 : '8';
+D9 : '9';
+D0 : '0';
+
+// Catch-all for any single character. It must stay after every other token
+// rule: when several rules match the same length of input the lexer takes
+// the one defined first, so this rule only fires when nothing above does.
+OtherChar : . ;
+
+// fragments
+// Helper character sets; fragments never produce tokens of their own.
+fragment UnderscoreAlphaNumerics : ('_' | AlphaNumeric)+;
+fragment AlphaNumerics           : AlphaNumeric+;
+fragment AlphaNumeric            : [a-zA-Z0-9];
+fragment NonAlphaNumeric         : ~[a-zA-Z0-9];
+fragment HexDigit                : [0-9a-fA-F];
+fragment ASCII                   : [\u0000-\u007F];