resyntax.h 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651
  1. /////////////////////////////////////////////////////////////////////////////
  2. // Name: resyntax.h
  3. // Purpose: topic overview
  4. // Author: wxWidgets team
  5. // Licence: wxWindows licence
  6. /////////////////////////////////////////////////////////////////////////////
  7. /**
  8. @page overview_resyntax Regular Expressions
  9. @tableofcontents
  10. A <em>regular expression</em> describes strings of characters. It's a pattern
  11. that matches certain strings and doesn't match others.
  12. @see wxRegEx
  13. @section overview_resyntax_differentflavors Different Flavors of Regular Expressions
  14. Regular expressions (RE), as defined by POSIX, come in two flavors:
  15. <em>extended regular expressions</em> (ERE) and <em>basic regular
  16. expressions</em> (BRE). EREs are roughly those of the traditional @e egrep,
  17. while BREs are roughly those of the traditional @e ed. This implementation
  18. adds a third flavor: <em>advanced regular expressions</em> (ARE), basically
  19. EREs with some significant extensions.
  20. This manual page primarily describes AREs. BREs mostly exist for backward
  21. compatibility in some old programs. POSIX EREs are almost an exact subset of
  22. AREs. Features of AREs that are not present in EREs will be indicated.
  23. @section overview_resyntax_syntax Regular Expression Syntax
  24. These regular expressions are implemented using the package written by Henry
  25. Spencer, based on the 1003.2 spec and some (not quite all) of the Perl5
  26. extensions (thanks, Henry!). Much of the description of regular expressions
  27. below is copied verbatim from his manual entry.
  28. An ARE is one or more @e branches, separated by "|", matching anything that
  29. matches any of the branches.
  30. A branch is zero or more @e constraints or @e quantified atoms, concatenated.
  31. It matches a match for the first, followed by a match for the second, etc; an
  32. empty branch matches the empty string.
  33. A quantified atom is an @e atom possibly followed by a single @e quantifier.
  34. Without a quantifier, it matches a match for the atom. The quantifiers, and
  35. what a so-quantified atom matches, are:
  36. @beginTable
  37. @row2col{ <tt>*</tt> ,
  38. A sequence of 0 or more matches of the atom. }
  39. @row2col{ <tt>+</tt> ,
  40. A sequence of 1 or more matches of the atom. }
  41. @row2col{ <tt>?</tt> ,
  42. A sequence of 0 or 1 matches of the atom. }
  43. @row2col{ <tt>{m}</tt> ,
  44. A sequence of exactly @e m matches of the atom. }
  45. @row2col{ <tt>{m\,}</tt> ,
  46. A sequence of @e m or more matches of the atom. }
  47. @row2col{ <tt>{m\,n}</tt> ,
  48. A sequence of @e m through @e n (inclusive) matches of the atom; @e m may
  49. not exceed @e n. }
  50. @row2col{ <tt>*? +? ?? {m}? {m\,}? {m\,n}?</tt> ,
  51. @e Non-greedy quantifiers, which match the same possibilities, but prefer
  52. the smallest number rather than the largest number of matches (see
  53. @ref overview_resyntax_matching). }
  54. @endTable
  55. The forms using @b { and @b } are known as @e bounds. The numbers @e m and
  56. @e n are unsigned decimal integers with permissible values from 0 to 255
  57. inclusive. An atom is one of:
  58. @beginTable
  59. @row2col{ <tt>(re)</tt> ,
  60. Where @e re is any regular expression, matches for @e re, with the match
  61. captured for possible reporting. }
  62. @row2col{ <tt>(?:re)</tt> ,
  63. As previous, but does no reporting (a "non-capturing" set of
  64. parentheses). }
  65. @row2col{ <tt>()</tt> ,
  66. Matches an empty string, captured for possible reporting. }
  67. @row2col{ <tt>(?:)</tt> ,
  68. Matches an empty string, without reporting. }
  69. @row2col{ <tt>[chars]</tt> ,
  70. A <em>bracket expression</em>, matching any one of the @e chars (see
  71. @ref overview_resyntax_bracket for more details). }
  72. @row2col{ <tt>.</tt> ,
  73. Matches any single character. }
  74. @row2col{ <tt>@\k</tt> ,
  75. Where @e k is a non-alphanumeric character, matches that character taken
  76. as an ordinary character, e.g. @\@\ matches a backslash character. }
  77. @row2col{ <tt>@\c</tt> ,
  78. Where @e c is alphanumeric (possibly followed by other characters), an
  79. @e escape (AREs only), see @ref overview_resyntax_escapes below. }
  80. @row2col{ <tt>@leftCurly</tt> ,
  81. When followed by a character other than a digit, matches the left-brace
  82. character "@leftCurly"; when followed by a digit, it is the beginning of a
  83. @e bound (see above). }
  84. @row2col{ <tt>x</tt> ,
  85. Where @e x is a single character with no other significance, matches that
  86. character. }
  87. @endTable
  88. A @e constraint matches an empty string when specific conditions are met. A
  89. constraint may not be followed by a quantifier. The simple constraints are as
  90. follows; some more constraints are described later, under
  91. @ref overview_resyntax_escapes.
  92. @beginTable
  93. @row2col{ <tt>^</tt> ,
  94. Matches at the beginning of a line. }
  95. @row2col{ <tt>@$</tt> ,
  96. Matches at the end of a line. }
  97. @row2col{ <tt>(?=re)</tt> ,
  98. @e Positive lookahead (AREs only), matches at any point where a substring
  99. matching @e re begins. }
  100. @row2col{ <tt>(?!re)</tt> ,
  101. @e Negative lookahead (AREs only), matches at any point where no substring
  102. matching @e re begins. }
  103. @endTable
  104. The lookahead constraints may not contain back references (see later), and all
  105. parentheses within them are considered non-capturing. An RE may not end with
  106. "@\".
  107. @section overview_resyntax_bracket Bracket Expressions
  108. A <em>bracket expression</em> is a list of characters enclosed in <tt>[]</tt>.
  109. It normally matches any single character from the list (but see below). If the
  110. list begins with @c ^, it matches any single character (but see below) @e not
  111. from the rest of the list.
  112. If two characters in the list are separated by <tt>-</tt>, this is shorthand
  113. for the full @e range of characters between those two (inclusive) in the
  114. collating sequence, e.g. <tt>[0-9]</tt> in ASCII matches any decimal digit.
  115. Two ranges may not share an endpoint, so e.g. <tt>a-c-e</tt> is illegal.
  116. Ranges are very collating-sequence-dependent, and portable programs should
  117. avoid relying on them.
  118. To include a literal <tt>]</tt> or <tt>-</tt> in the list, the simplest method
  119. is to enclose it in <tt>[.</tt> and <tt>.]</tt> to make it a collating element
  120. (see below). Alternatively, make it the first character (following a possible
  121. <tt>^</tt>), or (AREs only) precede it with <tt>@\</tt>. Alternatively, for
  122. <tt>-</tt>, make it the last character, or the second endpoint of a range. To
  123. use a literal <tt>-</tt> as the first endpoint of a range, make it a collating
  124. element or (AREs only) precede it with <tt>@\</tt>. With the exception of
  125. these, some combinations using <tt>[</tt> (see next paragraphs), and escapes,
  126. all other special characters lose their special significance within a bracket
  127. expression.
  128. Within a bracket expression, a collating element (a character, a
  129. multi-character sequence that collates as if it were a single character, or a
  130. collating-sequence name for either) enclosed in <tt>[.</tt> and <tt>.]</tt>
  131. stands for the sequence of characters of that collating element.
  132. @e wxWidgets: Currently no multi-character collating elements are defined. So
  133. in <tt>[.X.]</tt>, @c X can either be a single character literal or the name
  134. of a character. For example, the following are identical:
  135. <tt>[[.0.]-[.9.]]</tt> and <tt>[[.zero.]-[.nine.]]</tt>; both mean the same as
  136. <tt>[0-9]</tt>. See @ref overview_resyntax_characters.
  137. Within a bracket expression, a collating element enclosed in <tt>[=</tt> and
  138. <tt>=]</tt> is an equivalence class, standing for the sequences of characters
  139. of all collating elements equivalent to that one, including itself. An
  140. equivalence class may not be an endpoint of a range.
  141. @e wxWidgets: Currently no equivalence classes are defined, so <tt>[=X=]</tt>
  142. stands for just the single character @c X. @c X can either be a single
  143. character literal or the name of a character, see
  144. @ref overview_resyntax_characters.
  145. Within a bracket expression, the name of a @e character class enclosed in
  146. <tt>[:</tt> and <tt>:]</tt> stands for the list of all characters (not all
  147. collating elements!) belonging to that class. Standard character classes are:
  148. @beginTable
  149. @row2col{ <tt>alpha</tt> , A letter. }
  150. @row2col{ <tt>upper</tt> , An upper-case letter. }
  151. @row2col{ <tt>lower</tt> , A lower-case letter. }
  152. @row2col{ <tt>digit</tt> , A decimal digit. }
  153. @row2col{ <tt>xdigit</tt> , A hexadecimal digit. }
  154. @row2col{ <tt>alnum</tt> , An alphanumeric (letter or digit). }
  155. @row2col{ <tt>print</tt> , An alphanumeric (same as alnum). }
  156. @row2col{ <tt>blank</tt> , A space or tab character. }
  157. @row2col{ <tt>space</tt> , A character producing white space in displayed text. }
  158. @row2col{ <tt>punct</tt> , A punctuation character. }
  159. @row2col{ <tt>graph</tt> , A character with a visible representation. }
  160. @row2col{ <tt>cntrl</tt> , A control character. }
  161. @endTable
  162. A character class may not be used as an endpoint of a range.
  163. @e wxWidgets: In a non-Unicode build, these character classifications depend on
  164. the current locale, and correspond to the values returned by the ANSI C "is"
  165. functions: <tt>isalpha</tt>, <tt>isupper</tt>, etc. In Unicode mode they are
  166. based on Unicode classifications, and are not affected by the current locale.
  167. There are two special cases of bracket expressions: the bracket expressions
  168. <tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> are constraints, matching empty strings at
  169. the beginning and end of a word respectively. A word is defined as a sequence
  170. of word characters that is neither preceded nor followed by word characters. A
  171. word character is an @e alnum character or an underscore (_). These special
  172. bracket expressions are deprecated; users of AREs should use constraint escapes
  173. instead (see escapes below).
  174. @section overview_resyntax_escapes Escapes
  175. Escapes (AREs only), which begin with a <tt>@\</tt> followed by an alphanumeric
  176. character, come in several varieties: character entry, class shorthands,
  177. constraint escapes, and back references. A <tt>@\</tt> followed by an
  178. alphanumeric character but not constituting a valid escape is illegal in AREs.
  179. In EREs, there are no escapes: outside a bracket expression, a <tt>@\</tt>
  180. followed by an alphanumeric character merely stands for that character as an
  181. ordinary character, and inside a bracket expression, <tt>@\</tt> is an ordinary
  182. character. (The latter is the one actual incompatibility between EREs and
  183. AREs.)
  184. Character-entry escapes (AREs only) exist to make it easier to specify
  185. non-printing and otherwise inconvenient characters in REs:
  186. @beginTable
  187. @row2col{ <tt>@\a</tt> , Alert (bell) character, as in C. }
  188. @row2col{ <tt>@\b</tt> , Backspace, as in C. }
  189. @row2col{ <tt>@\B</tt> ,
  190. Synonym for <tt>@\</tt> to help reduce backslash doubling in some
  191. applications where there are multiple levels of backslash processing. }
  192. @row2col{ <tt>@\cX</tt> ,
  193. The character whose low-order 5 bits are the same as those of @e X, and
  194. whose other bits are all zero, where @e X is any character. }
  195. @row2col{ <tt>@\e</tt> ,
  196. The character whose collating-sequence name is @c ESC, or failing that,
  197. the character with octal value 033. }
  198. @row2col{ <tt>@\f</tt> , Formfeed, as in C. }
  199. @row2col{ <tt>@\n</tt> , Newline, as in C. }
  200. @row2col{ <tt>@\r</tt> , Carriage return, as in C. }
  201. @row2col{ <tt>@\t</tt> , Horizontal tab, as in C. }
  202. @row2col{ <tt>@\uwxyz</tt> ,
  203. The Unicode character <tt>U+wxyz</tt> in the local byte ordering, where
  204. @e wxyz is exactly four hexadecimal digits. }
  205. @row2col{ <tt>@\Ustuvwxyz</tt> ,
  206. Reserved for a somewhat-hypothetical Unicode extension to 32 bits, where
  207. @e stuvwxyz is exactly eight hexadecimal digits. }
  208. @row2col{ <tt>@\v</tt> , Vertical tab, as in C. }
  209. @row2col{ <tt>@\xhhh</tt> ,
  210. The single character whose hexadecimal value is @e 0xhhh, where @e hhh is
  211. any sequence of hexadecimal digits. }
  212. @row2col{ <tt>@\0</tt> , The character whose value is 0. }
  213. @row2col{ <tt>@\xy</tt> ,
  214. The character whose octal value is @e 0xy, where @e xy is exactly two octal
  215. digits, and is not a <em>back reference</em> (see below). }
  216. @row2col{ <tt>@\xyz</tt> ,
  217. The character whose octal value is @e 0xyz, where @e xyz is exactly three
  218. octal digits, and is not a <em>back reference</em> (see below). }
  219. @endTable
  220. Hexadecimal digits are 0-9, a-f, and A-F. Octal digits are 0-7.
  221. The character-entry escapes are always taken as ordinary characters. For
  222. example, <tt>@\135</tt> is <tt>]</tt> in ASCII, but <tt>@\135</tt> does not
  223. terminate a bracket expression. Beware, however, that some applications (e.g.,
  224. C compilers) interpret such sequences themselves before the regular-expression
  225. package gets to see them, which may require doubling (quadrupling, etc.) the
  226. '<tt>@\</tt>'.
  227. Class-shorthand escapes (AREs only) provide shorthands for certain
  228. commonly-used character classes:
  229. @beginTable
  230. @row2col{ <tt>@\d</tt> , <tt>[[:digit:]]</tt> }
  231. @row2col{ <tt>@\s</tt> , <tt>[[:space:]]</tt> }
  232. @row2col{ <tt>@\w</tt> , <tt>[[:alnum:]_]</tt> (note underscore) }
  233. @row2col{ <tt>@\D</tt> , <tt>[^[:digit:]]</tt> }
  234. @row2col{ <tt>@\S</tt> , <tt>[^[:space:]]</tt> }
  235. @row2col{ <tt>@\W</tt> , <tt>[^[:alnum:]_]</tt> (note underscore) }
  236. @endTable
  237. Within bracket expressions, <tt>@\d</tt>, <tt>@\s</tt>, and <tt>@\w</tt> lose
  238. their outer brackets, and <tt>@\D</tt>, <tt>@\S</tt>, <tt>@\W</tt> are illegal.
  239. So, for example, <tt>[a-c@\d]</tt> is equivalent to <tt>[a-c[:digit:]]</tt>.
  240. Also, <tt>[a-c@\D]</tt>, which is equivalent to <tt>[a-c^[:digit:]]</tt>, is
  241. illegal.
  242. A constraint escape (AREs only) is a constraint, matching the empty string if
  243. specific conditions are met, written as an escape:
  244. @beginTable
  245. @row2col{ <tt>@\A</tt> , Matches only at the beginning of the string, see
  246. @ref overview_resyntax_matching for how this differs
  247. from <tt>^</tt>. }
  248. @row2col{ <tt>@\m</tt> , Matches only at the beginning of a word. }
  249. @row2col{ <tt>@\M</tt> , Matches only at the end of a word. }
  250. @row2col{ <tt>@\y</tt> , Matches only at the beginning or end of a word. }
  251. @row2col{ <tt>@\Y</tt> , Matches only at a point that is not the beginning or
  252. end of a word. }
  253. @row2col{ <tt>@\Z</tt> , Matches only at the end of the string, see
  254. @ref overview_resyntax_matching for how this differs
  255. from <tt>@$</tt>. }
  256. @row2col{ <tt>@\m</tt> , A <em>back reference</em>, where @e m is a non-zero
  257. digit. See below. }
  258. @row2col{ <tt>@\mnn</tt> ,
  259. A <em>back reference</em>, where @e m is a nonzero digit, and @e nn is some
  260. more digits, and the decimal value @e mnn is not greater than the number of
  261. closing capturing parentheses seen so far. See below. }
  262. @endTable
  263. A word is defined as in the specification of <tt>[[:@<:]]</tt> and
  264. <tt>[[:@>:]]</tt> above. Constraint escapes are illegal within bracket
  265. expressions.
  266. A back reference (AREs only) matches the same string matched by the
  267. parenthesized subexpression specified by the number. For example, "([bc])@\1"
  268. matches "bb" or "cc" but not "bc". The subexpression must entirely precede the
  269. back reference in the RE. Subexpressions are numbered in the order of their
  270. leading parentheses. Non-capturing parentheses do not define subexpressions.
  271. There is an inherent historical ambiguity between octal character-entry escapes
  272. and back references, which is resolved by heuristics, as hinted at above. A
  273. leading zero always indicates an octal escape. A single non-zero digit, not
  274. followed by another digit, is always taken as a back reference. A multi-digit
  275. sequence not starting with a zero is taken as a back reference if it comes
  276. after a suitable subexpression (i.e. the number is in the legal range for a
  277. back reference), and otherwise is taken as octal.
  278. @section overview_resyntax_metasyntax Metasyntax
  279. In addition to the main syntax described above, there are some special forms
  280. and miscellaneous syntactic facilities available.
  281. Normally the flavor of RE being used is specified by application-dependent
  282. means. However, this can be overridden by a @e director. If an RE of any flavor
  283. begins with <tt>***:</tt>, the rest of the RE is an ARE. If an RE of any
  284. flavor begins with <tt>***=</tt>, the rest of the RE is taken to be a literal
  285. string, with all characters considered ordinary characters.
  286. An ARE may begin with <em>embedded options</em>: a sequence <tt>(?xyz)</tt>
  287. (where @e xyz is one or more alphabetic characters) specifies options affecting
  288. the rest of the RE. These supplement, and can override, any options specified
  289. by the application. The available option letters are:
  290. @beginTable
  291. @row2col{ <tt>b</tt> , Rest of RE is a BRE. }
  292. @row2col{ <tt>c</tt> , Case-sensitive matching (usual default). }
  293. @row2col{ <tt>e</tt> , Rest of RE is an ERE. }
  294. @row2col{ <tt>i</tt> , Case-insensitive matching (see
  295. @ref overview_resyntax_matching, below). }
  296. @row2col{ <tt>m</tt> , Historical synonym for @e n. }
  297. @row2col{ <tt>n</tt> , Newline-sensitive matching (see
  298. @ref overview_resyntax_matching, below). }
  299. @row2col{ <tt>p</tt> , Partial newline-sensitive matching (see
  300. @ref overview_resyntax_matching, below). }
  301. @row2col{ <tt>q</tt> , Rest of RE is a literal ("quoted") string, all ordinary
  302. characters. }
  303. @row2col{ <tt>s</tt> , Non-newline-sensitive matching (usual default). }
  304. @row2col{ <tt>t</tt> , Tight syntax (usual default; see below). }
  305. @row2col{ <tt>w</tt> , Inverse partial newline-sensitive ("weird") matching
  306. (see @ref overview_resyntax_matching, below). }
  307. @row2col{ <tt>x</tt> , Expanded syntax (see below). }
  308. @endTable
  309. Embedded options take effect at the <tt>)</tt> terminating the sequence. They
  310. are available only at the start of an ARE, and may not be used later within it.
  311. In addition to the usual (@e tight) RE syntax, in which all characters are
  312. significant, there is an @e expanded syntax, available in AREs with the
  313. embedded x option. In the expanded syntax, white-space characters are ignored
  314. and all characters between a <tt>@#</tt> and the following newline (or the end
  315. of the RE) are ignored, permitting paragraphing and commenting a complex RE.
  316. There are three exceptions to that basic rule:
  317. @li A white-space character or <tt>@#</tt> preceded by <tt>@\</tt> is retained.
  318. @li White space or <tt>@#</tt> within a bracket expression is retained.
  319. @li White space and comments are illegal within multi-character symbols like
  320. the ARE <tt>(?:</tt> or the BRE <tt>@\(</tt>.
  321. Expanded-syntax white-space characters are blank, tab, newline, and any
  322. character that belongs to the @e space character class.
  323. Finally, in an ARE, outside bracket expressions, the sequence <tt>(?@#ttt)</tt>
  324. (where @e ttt is any text not containing a <tt>)</tt>) is a comment, completely
  325. ignored. Again, this is not allowed between the characters of multi-character
  326. symbols like <tt>(?:</tt>. Such comments are more a historical artifact than a
  327. useful facility, and their use is deprecated; use the expanded syntax instead.
  328. @e None of these metasyntax extensions is available if the application (or an
  329. initial <tt>***=</tt> director) has specified that the user's input be treated
  330. as a literal string rather than as an RE.
  331. @section overview_resyntax_matching Matching
  332. In the event that an RE could match more than one substring of a given string,
  333. the RE matches the one starting earliest in the string. If the RE could match
  334. more than one substring starting at that point, the choice is determined by
  335. its @e preference: either the longest substring, or the shortest.
  336. Most atoms, and all constraints, have no preference. A parenthesized RE has the
  337. same preference (possibly none) as the RE. A quantified atom with quantifier
  338. <tt>{m}</tt> or <tt>{m}?</tt> has the same preference (possibly none) as the
  339. atom itself. A quantified atom with other normal quantifiers (including
  340. <tt>{m,n}</tt> with @e m equal to @e n) prefers longest match. A quantified
  341. atom with other non-greedy quantifiers (including <tt>{m,n}?</tt> with @e m
  342. equal to @e n) prefers shortest match. A branch has the same preference as the
  343. first quantified atom in it which has a preference. An RE consisting of two or
  344. more branches connected by the @c | operator prefers longest match.
  345. Subject to the constraints imposed by the rules for matching the whole RE,
  346. subexpressions also match the longest or shortest possible substrings, based on
  347. their preferences, with subexpressions starting earlier in the RE taking
  348. priority over ones starting later. Note that outer subexpressions thus take
  349. priority over their component subexpressions.
  350. Note that the quantifiers <tt>{1,1}</tt> and <tt>{1,1}?</tt> can be used to
  351. force longest and shortest preference, respectively, on a subexpression or a
  352. whole RE.
  353. Match lengths are measured in characters, not collating elements. An empty
  354. string is considered longer than no match at all. For example, <tt>bb*</tt>
  355. matches the three middle characters of "abbbc",
  356. <tt>(week|wee)(night|knights)</tt> matches all ten characters of "weeknights",
  357. when <tt>(.*).*</tt> is matched against "abc" the parenthesized subexpression
  358. matches all three characters, and when <tt>(a*)*</tt> is matched against "bc"
  359. both the whole RE and the parenthesized subexpression match an empty string.
  360. If case-independent matching is specified, the effect is much as if all case
  361. distinctions had vanished from the alphabet. When an alphabetic that exists in
  362. multiple cases appears as an ordinary character outside a bracket expression,
  363. it is effectively transformed into a bracket expression containing both cases,
  364. so that @c x becomes @c [xX]. When it appears inside a bracket expression, all
  365. case counterparts of it are added to the bracket expression, so that @c [x]
  366. becomes @c [xX] and @c [^x] becomes @c [^xX].
  367. If newline-sensitive matching is specified, "." and bracket expressions using
  368. "^" will never match the newline character (so that matches will never cross
  369. newlines unless the RE explicitly arranges it) and "^" and "$" will match the
  370. empty string after and before a newline respectively, in addition to matching
  371. at beginning and end of string respectively. ARE <tt>@\A</tt> and <tt>@\Z</tt>
  372. continue to match beginning or end of string @e only.
  373. If partial newline-sensitive matching is specified, this affects "." and
  374. bracket expressions as with newline-sensitive matching, but not "^" and "$".
  375. If inverse partial newline-sensitive matching is specified, this affects "^"
  376. and "$" as with newline-sensitive matching, but not "." and bracket
  377. expressions. This isn't very useful but is provided for symmetry.
  378. @section overview_resyntax_limits Limits and Compatibility
  379. No particular limit is imposed on the length of REs. Programs intended to be
  380. highly portable should not employ REs longer than 256 bytes, as a
  381. POSIX-compliant implementation can refuse to accept such REs.
  382. The only feature of AREs that is actually incompatible with POSIX EREs is that
  383. <tt>@\</tt> does not lose its special significance inside bracket expressions.
  384. All other ARE features use syntax which is illegal or has undefined or
  385. unspecified effects in POSIX EREs; the <tt>***</tt> syntax of directors
  386. likewise is outside the POSIX syntax for both BREs and EREs.
  387. Many of the ARE extensions are borrowed from Perl, but some have been changed
  388. to clean them up, and a few Perl extensions are not present. Incompatibilities
  389. of note include <tt>@\b</tt>, <tt>@\B</tt>, the lack of special treatment for a
  390. trailing newline, the addition of complemented bracket expressions to the
  391. things affected by newline-sensitive matching, the restrictions on parentheses
  392. and back references in lookahead constraints, and the longest/shortest-match
  393. (rather than first-match) matching semantics.
  394. The matching rules for REs containing both normal and non-greedy quantifiers
  395. have changed since early beta-test versions of this package. The new rules are
  396. much simpler and cleaner, but don't work as hard at guessing the user's real
  397. intentions.
  398. Henry Spencer's original 1986 @e regexp package, still in widespread use,
  399. implemented an early version of today's EREs. There are four incompatibilities
  400. between @e regexp's near-EREs (RREs for short) and AREs. In roughly increasing
  401. order of significance:
  402. @li In AREs, <tt>@\</tt> followed by an alphanumeric character is either an
  403. escape or an error, while in RREs, it was just another way of writing the
  404. alphanumeric. This should not be a problem because there was no reason to
  405. write such a sequence in RREs.
  406. @li @c { followed by a digit in an ARE is the beginning of a bound, while in
  407. RREs, @c { was always an ordinary character. Such sequences should be rare,
  408. and will often result in an error because following characters will not
  409. look like a valid bound.
  410. @li In AREs, @c @\ remains a special character within @c [], so a literal @c @\
  411. within @c [] must be written as <tt>@\@\</tt>. <tt>@\@\</tt> also gives a
  412. literal @c @\ within @c [] in RREs, but only truly paranoid programmers
  413. routinely doubled the backslash.
  414. @li AREs report the longest/shortest match for the RE, rather than the first
  415. found in a specified search order. This may affect some RREs which were
  416. written in the expectation that the first match would be reported. The
  417. careful crafting of RREs to optimize the search order for fast matching is
  418. obsolete (AREs examine all possible matches in parallel, and their
  419. performance is largely insensitive to their complexity) but cases where the
  420. search order was exploited to deliberately find a match which was @e not
  421. the longest/shortest will need rewriting.
  422. @section overview_resyntax_bre Basic Regular Expressions
  423. BREs differ from EREs in several respects. @c |, @c +, and @c ? are ordinary
  424. characters and there is no equivalent for their functionality. The delimiters
  425. for bounds are @c @\{ and @c @\}, with @c { and @c } by themselves ordinary
  426. characters. The parentheses for nested subexpressions are @c @\( and @c @\),
  427. with @c ( and @c ) by themselves ordinary characters. @c ^ is an ordinary
  428. character except at the beginning of the RE or the beginning of a parenthesized
  429. subexpression, @c $ is an ordinary character except at the end of the RE or the
  430. end of a parenthesized subexpression, and @c * is an ordinary character if it
  431. appears at the beginning of the RE or the beginning of a parenthesized
  432. subexpression (after a possible leading <tt>^</tt>). Finally, single-digit back
  433. references are available, and @c @\@< and @c @\@> are synonyms for
  434. <tt>[[:@<:]]</tt> and <tt>[[:@>:]]</tt> respectively; no other escapes are
  435. available.
  436. @section overview_resyntax_characters Regular Expression Character Names
  437. Note that the character names are case sensitive.
  438. <center><table class='doctable' border='0' cellspacing='5' cellpadding='4'><tr>
  439. <td>
  440. @beginTable
  441. @row2col{ <tt>NUL</tt> , @\0 }
  442. @row2col{ <tt>SOH</tt> , @\001 }
  443. @row2col{ <tt>STX</tt> , @\002 }
  444. @row2col{ <tt>ETX</tt> , @\003 }
  445. @row2col{ <tt>EOT</tt> , @\004 }
  446. @row2col{ <tt>ENQ</tt> , @\005 }
  447. @row2col{ <tt>ACK</tt> , @\006 }
  448. @row2col{ <tt>BEL</tt> , @\007 }
  449. @row2col{ <tt>alert</tt> , @\007 }
  450. @row2col{ <tt>BS</tt> , @\010 }
  451. @row2col{ <tt>backspace</tt> , @\b }
  452. @row2col{ <tt>HT</tt> , @\011 }
  453. @row2col{ <tt>tab</tt> , @\t }
  454. @row2col{ <tt>LF</tt> , @\012 }
  455. @row2col{ <tt>newline</tt> , @\n }
  456. @row2col{ <tt>VT</tt> , @\013 }
  457. @row2col{ <tt>vertical-tab</tt> , @\v }
  458. @row2col{ <tt>FF</tt> , @\014 }
  459. @row2col{ <tt>form-feed</tt> , @\f }
  460. @endTable
  461. </td>
  462. <td>
  463. @beginTable
  464. @row2col{ <tt>CR</tt> , @\015 }
  465. @row2col{ <tt>carriage-return</tt> , @\r }
  466. @row2col{ <tt>SO</tt> , @\016 }
  467. @row2col{ <tt>SI</tt> , @\017 }
  468. @row2col{ <tt>DLE</tt> , @\020 }
  469. @row2col{ <tt>DC1</tt> , @\021 }
  470. @row2col{ <tt>DC2</tt> , @\022 }
  471. @row2col{ <tt>DC3</tt> , @\023 }
  472. @row2col{ <tt>DC4</tt> , @\024 }
  473. @row2col{ <tt>NAK</tt> , @\025 }
  474. @row2col{ <tt>SYN</tt> , @\026 }
  475. @row2col{ <tt>ETB</tt> , @\027 }
  476. @row2col{ <tt>CAN</tt> , @\030 }
  477. @row2col{ <tt>EM</tt> , @\031 }
  478. @row2col{ <tt>SUB</tt> , @\032 }
  479. @row2col{ <tt>ESC</tt> , @\033 }
  480. @row2col{ <tt>IS4</tt> , @\034 }
  481. @row2col{ <tt>FS</tt> , @\034 }
  482. @row2col{ <tt>IS3</tt> , @\035 }
  483. @endTable
  484. </td>
  485. <td>
  486. @beginTable
  487. @row2col{ <tt>GS</tt> , @\035 }
  488. @row2col{ <tt>IS2</tt> , @\036 }
  489. @row2col{ <tt>RS</tt> , @\036 }
  490. @row2col{ <tt>IS1</tt> , @\037 }
  491. @row2col{ <tt>US</tt> , @\037 }
  492. @row2col{ <tt>space</tt> , " " (space) }
  493. @row2col{ <tt>exclamation-mark</tt> , ! }
  494. @row2col{ <tt>quotation-mark</tt> , " }
  495. @row2col{ <tt>number-sign</tt> , @# }
  496. @row2col{ <tt>dollar-sign</tt> , @$ }
  497. @row2col{ <tt>percent-sign</tt> , @% }
  498. @row2col{ <tt>ampersand</tt> , @& }
  499. @row2col{ <tt>apostrophe</tt> , ' }
  500. @row2col{ <tt>left-parenthesis</tt> , ( }
  501. @row2col{ <tt>right-parenthesis</tt> , ) }
  502. @row2col{ <tt>asterisk</tt> , * }
  503. @row2col{ <tt>plus-sign</tt> , + }
  504. @row2col{ <tt>comma</tt> , \, }
  505. @row2col{ <tt>hyphen</tt> , - }
  506. @endTable
  507. </td>
  508. <td>
  509. @beginTable
  510. @row2col{ <tt>hyphen-minus</tt> , - }
  511. @row2col{ <tt>period</tt> , . }
  512. @row2col{ <tt>full-stop</tt> , . }
  513. @row2col{ <tt>slash</tt> , / }
  514. @row2col{ <tt>solidus</tt> , / }
  515. @row2col{ <tt>zero</tt> , 0 }
  516. @row2col{ <tt>one</tt> , 1 }
  517. @row2col{ <tt>two</tt> , 2 }
  518. @row2col{ <tt>three</tt> , 3 }
  519. @row2col{ <tt>four</tt> , 4 }
  520. @row2col{ <tt>five</tt> , 5 }
  521. @row2col{ <tt>six</tt> , 6 }
  522. @row2col{ <tt>seven</tt> , 7 }
  523. @row2col{ <tt>eight</tt> , 8 }
  524. @row2col{ <tt>nine</tt> , 9 }
  525. @row2col{ <tt>colon</tt> , : }
  526. @row2col{ <tt>semicolon</tt> , ; }
  527. @row2col{ <tt>less-than-sign</tt> , @< }
  528. @row2col{ <tt>equals-sign</tt> , = }
  529. @endTable
  530. </td>
  531. <td>
  532. @beginTable
  533. @row2col{ <tt>greater-than-sign</tt> , @> }
  534. @row2col{ <tt>question-mark</tt> , ? }
  535. @row2col{ <tt>commercial-at</tt> , @@ }
  536. @row2col{ <tt>left-square-bracket</tt> , [ }
  537. @row2col{ <tt>backslash</tt> , @\ }
  538. @row2col{ <tt>reverse-solidus</tt> , @\ }
  539. @row2col{ <tt>right-square-bracket</tt> , ] }
  540. @row2col{ <tt>circumflex</tt> , ^ }
  541. @row2col{ <tt>circumflex-accent</tt> , ^ }
  542. @row2col{ <tt>underscore</tt> , _ }
  543. @row2col{ <tt>low-line</tt> , _ }
  544. @row2col{ <tt>grave-accent</tt> , ` }
  545. @row2col{ <tt>left-brace</tt> , @leftCurly }
  546. @row2col{ <tt>left-curly-bracket</tt> , @leftCurly }
  547. @row2col{ <tt>vertical-line</tt> , | }
  548. @row2col{ <tt>right-brace</tt> , @rightCurly }
  549. @row2col{ <tt>right-curly-bracket</tt> , @rightCurly }
  550. @row2col{ <tt>tilde</tt> , ~ }
  551. @row2col{ <tt>DEL</tt> , @\177 }
  552. @endTable
  553. </td>
  554. </tr></table></center>
  555. */