regex.pl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. #!/usr/bin/env perl
  2. #############################################################################
  3. # Name: regex.pl
  4. # Purpose: Generate test code for wxRegEx from 'reg.test'
  5. # Author: Mike Wetherell
  6. # Copyright: (c) Mike Wetherell
  7. # Licence: wxWindows licence
  8. #############################################################################
  9. #
  10. # Notes:
  11. # See './regex.pl -h' for usage
  12. #
  13. # Output at the moment is C++ using the cppunit testing framework. The
  14. # language/framework specifics are separated, with the following 5
  15. # subs as an interface: 'begin_output', 'begin_section', 'write_test',
  16. # 'end_section' and 'end_output'. So for a different language/framework,
  17. # implement 5 new similar subs.
  18. #
  19. # I've avoided using 'use encoding "UTF-8"', since this wasn't available
  20. # in perl 5.6.x. Instead I've used some hacks like 'pack "U0C*"'. Versions
  21. # earler than perl 5.6.0 aren't going to work.
  22. #
  23. use strict;
  24. use warnings;
  25. use File::Basename;
  26. #use encoding "UTF-8"; # enable in the future when perl 5.6.x is just a memory
  27. # if 0 output is wide characters, if 1 output is utf8 encoded
  28. my $utf = 1;
  29. # quote a parameter (C++ helper)
  30. #
  31. sub quotecxx {
  32. my %esc = ( "\a" => "a", "\b" => "b", "\f" => "f",
  33. "\n" => "n", "\r" => "r", "\t" => "t",
  34. "\013" => "v", '"' => '"', "\\" => "\\" );
  35. # working around lack of 'use encoding'
  36. if (!$utf) {
  37. $_ = pack "U0C*", unpack "C*", $_;
  38. use utf8;
  39. }
  40. s/[\000-\037"\\\177-\x{ffff}]/
  41. if ($esc{$&}) {
  42. "\\$esc{$&}";
  43. } elsif (ord($&) > 0x9f && !$utf) {
  44. sprintf "\\u%04x", ord($&);
  45. } else {
  46. sprintf "\\%03o", ord($&);
  47. }
  48. /ge;
  49. # working around lack of 'use encoding'
  50. if (!$utf) {
  51. no utf8;
  52. $_ = pack "C*", unpack "C*", $_;
  53. }
  54. return ($utf ? '"' : 'L"') . $_ . '"'
  55. }
  56. # start writing the output code (C++ interface)
  57. #
  58. sub begin_output {
  59. my ($from, $instructions) = @_;
  60. # embed it in the comment
  61. $from = "\n$from";
  62. $from =~ s/^(?: )?/ * /mg;
  63. # $instructions contains information about the flags etc.
  64. if ($instructions) {
  65. $instructions = "\n$instructions";
  66. $instructions =~ s/^(?: )?/ * /mg;
  67. }
  68. my $u = $utf ? " (UTF-8 encoded)" : "";
  69. print <<EOT;
  70. /*
  71. * Test data for wxRegEx$u
  72. $from$instructions */
  73. EOT
  74. }
  75. my @classes;
  76. # start a new section (C++ interface)
  77. #
  78. sub begin_section {
  79. my ($id, $title) = @_;
  80. my $class = "regextest_$id";
  81. $class =~ s/\W/_/g;
  82. push @classes, [$id, $class];
  83. print <<EOT;
  84. /*
  85. * $id $title
  86. */
  87. class $class : public RegExTestSuite
  88. {
  89. public:
  90. $class() : RegExTestSuite("regex.$id") { }
  91. static Test *suite();
  92. };
  93. Test *$class\::suite()
  94. {
  95. RegExTestSuite *suite = new $class;
  96. EOT
  97. }
  98. # output a test line (C++ interface)
  99. #
  100. sub write_test {
  101. my @args = @_;
  102. $_ = quotecxx for @args;
  103. print " suite->add(" . (join ', ', @args) . ", NULL);\n";
  104. }
  105. # end a section (C++ interface)
  106. #
  107. sub end_section {
  108. my ($id, $class) = @{$classes[$#classes]};
  109. print <<EOT;
  110. return suite;
  111. }
  112. CPPUNIT_TEST_SUITE_NAMED_REGISTRATION($class, "regex.$id");
  113. EOT
  114. }
  115. # finish off the output (C++ interface)
  116. #
  117. sub end_output {
  118. print <<EOT;
  119. /*
  120. * A suite containing all the above suites
  121. */
  122. class regextest : public TestSuite
  123. {
  124. public:
  125. regextest() : TestSuite("regex") { }
  126. static Test *suite();
  127. };
  128. Test *regextest::suite()
  129. {
  130. TestSuite *suite = new regextest;
  131. EOT
  132. print " suite->addTest(".$_->[1]."::suite());\n" for @classes;
  133. print <<EOT;
  134. return suite;
  135. }
  136. CPPUNIT_TEST_SUITE_NAMED_REGISTRATION(regextest, "regex");
  137. CPPUNIT_TEST_SUITE_REGISTRATION(regextest);
  138. EOT
  139. }
  140. # Parse a tcl string. Handles curly quoting and double quoting.
  141. #
  142. sub parsetcl {
  143. my ($curly, $quote);
  144. # recursively defined expression that can parse balanced braces
  145. # warning: uses experimental features of perl, see perlop(1)
  146. $curly = qr/\{(?:(?>(?:\\[{}]|[^{}])+)|(??{$curly}))*\}/;
  147. $quote = qr/"(?:\\"|[^"])*"/;
  148. my @tokens = shift =~ /($curly|$quote|\S+)/g;
  149. # now remove braces/quotes and unescape any escapes
  150. for (@tokens) {
  151. if (s/^{(.*)}$/$1/) {
  152. # for curly quoting, only unescape \{ and \}
  153. s/\\([{}])/$1/g;
  154. } else {
  155. s/^"(.*)"$/$1/;
  156. # unescape any escapes
  157. my %esc = ( "a" => "\a", "b" => "\b", "f" => "\f",
  158. "n" => "\n", "r" => "\r", "t" => "\t",
  159. "v" => "\013" );
  160. my $x = qr/[[:xdigit:]]/;
  161. s/\\([0-7]{1,3}|x$x+|u$x{1,4}|.)/
  162. if ($1 =~ m{^([0-7]+)}) {
  163. chr(oct($1));
  164. } elsif ($1 =~ m{^x($x+)}) {
  165. pack("C0U", hex($1) & 0xff);
  166. } elsif ($1 =~ m{^u($x+)}) {
  167. pack("C0U", hex($1));
  168. } elsif ($esc{$1}) {
  169. $esc{$1};
  170. } else {
  171. $1;
  172. }
  173. /ge;
  174. }
  175. }
  176. return @tokens;
  177. }
  178. # helpers which keep track of whether begin_section has been called, so that
  179. # end_section can be called when appropriate
  180. #
  181. my @doing = ("0", "");
  182. my $in_section = 0;
  183. sub handle_doing {
  184. end_section if $in_section;
  185. $in_section = 0;
  186. @doing = @_;
  187. }
  188. sub handle_test {
  189. begin_section(@doing) if !$in_section;
  190. $in_section = 1;
  191. write_test @_;
  192. }
  193. sub handle_end {
  194. end_section if $in_section;
  195. $in_section = 0;
  196. end_output;
  197. }
  198. # 'main' - start by parsing the command lines options.
  199. #
  200. my $badoption = !@ARGV;
  201. my $utfdefault = $utf;
  202. my $outputname;
  203. for (my $i = 0; $i < @ARGV; ) {
  204. if ($ARGV[$i] !~ m{^-.}) {
  205. $i++;
  206. next;
  207. }
  208. if ($ARGV[$i] eq '--') {
  209. splice @ARGV, $i, 1;
  210. last;
  211. }
  212. if ($ARGV[$i] =~ s{^-(.*)o(.*)$}{-$1}i) { # -o : output file
  213. $outputname = $2 || splice @ARGV, $i + 1, 1;
  214. }
  215. for (split //, substr($ARGV[$i], 1)) {
  216. if (/u/i) { # -u : utf-8 output
  217. $utf = 1;
  218. } elsif (/w/i) { # -w : wide char output
  219. $utf = 0;
  220. } else {
  221. $badoption = 1;
  222. }
  223. }
  224. splice @ARGV, $i, 1;
  225. }
  226. # Display help
  227. #
  228. if ($badoption) {
  229. my $prog = basename $0;
  230. my ($w, $u) = (" (default)", " ");
  231. ($w, $u) = ($u, $w) if $utfdefault;
  232. print <<EOT;
  233. Usage: $prog [-u|-w] [-o OUTPUT] [FILE...]
  234. Generate test code for wxRegEx from 'reg.test'
  235. Example: $prog -o regex.inc reg.test wxreg.test
  236. -w$w Output will be wide characters.
  237. -u$u Output will be UTF-8 encoded.
  238. Input files should be in UTF-8. If no input files are specified input is
  239. read from stdin. If no output file is specified output is written to stdout.
  240. See the comments in reg.test for details of the input file format.
  241. EOT
  242. exit 0;
  243. }
  244. # Open the output file
  245. #
  246. open STDOUT, ">$outputname" if $outputname;
  247. # Read in the files and initially parse just the comments for copyright
  248. # information and instructions on the tests
  249. #
  250. my @input; # slurped input files stripped of comments
  251. my $files = ""; # copyright info from the input comments
  252. my $instructions = ""; # test instructions from the input comments
  253. do {
  254. my $inputname = basename $ARGV[0] if @ARGV;
  255. # slurp input
  256. undef $/;
  257. my $in = <>;
  258. # remove escaped newlines
  259. $in =~ s/(?<!\\)\\\n//g;
  260. # record the copyrights of the input files
  261. for ($in =~ /^#[\t ]*(.*copyright.*)$/mig) {
  262. s/[\s:]+/ /g;
  263. $files .= " ";
  264. $files .= $inputname . ": " if $inputname && $inputname ne '-';
  265. $files .= "$_\n";
  266. }
  267. # Parse the comments for instructions on the tests, which look like this:
  268. # i successful match with -indices (used in checking things like
  269. # nonparticipating subexpressions)
  270. if (!$instructions) {
  271. my $sp = qr{\t| +}; # tab or three or more spaces
  272. my @instructions = $in =~
  273. /\n(
  274. (?:
  275. \#$sp\S?$sp\S[^\n]+\n # instruction line
  276. (?:\#$sp$sp\S[^\n]+\n)* # continuation lines (if any)
  277. )+
  278. )/gx;
  279. if (@instructions) {
  280. $instructions[0] = "Test types:\n$instructions[0]";
  281. if (@instructions > 1) {
  282. $instructions[1] = "Flag characters:\n$instructions[1]";
  283. }
  284. $instructions = join "\n", @instructions;
  285. $instructions =~ s/^#([^\t]?)/ $1/mg;
  286. }
  287. }
  288. # @input is the input of all files (stipped of comments)
  289. $in =~ s/^#.*$//mg;
  290. push @input, $in;
  291. } while $ARGV[0];
  292. # Make a string naming the generator, the input files and copyright info
  293. #
  294. my $from = "Generated " . localtime() . " by " . basename $0;
  295. $from =~ s/[\s]+/ /g;
  296. if ($files) {
  297. if ($files =~ /:/) {
  298. $from .= " from the following files:";
  299. } else {
  300. $from .= " from work with the following copyright:";
  301. }
  302. }
  303. $from = join("\n", $from =~ /(.{0,76}(?:\s|$))/g); # word-wrap
  304. $from .= "\n$files" if $files;
  305. # Now start to print the code
  306. #
  307. begin_output $from, $instructions;
  308. # numbers for 'extra' sections
  309. my $extra = 1;
  310. for (@input)
  311. {
  312. # Print the main tests
  313. #
  314. # Test lines look like this:
  315. # m 3 b {\(a\)b} ab ab a
  316. #
  317. # Also looks for heading lines, e.g.:
  318. # doing 4 "parentheses"
  319. #
  320. for (split "\n") {
  321. if (/^doing\s+(\S+)\s+(\S.*)/) {
  322. handle_doing parsetcl "$1 $2";
  323. } elsif (/^[efimp]\s/) {
  324. handle_test parsetcl $_;
  325. }
  326. }
  327. # Extra tests
  328. #
  329. # The expression below matches something like this:
  330. # test reg-33.8 {Bug 505048} {
  331. # regexp -inline {\A\s*[^b]*b} ab
  332. # } ab
  333. #
  334. # The three subexpressions then return these parts:
  335. # $extras[$i] = '{Bug 505048}',
  336. # $extras[$i + 1] = '-inline {\A\s*[^b]*b} ab'
  337. # $extras[$i + 2] = 'ab'
  338. #
  339. my @extras = /\ntest\s+\S+\s*(\{.*?\})\s*\{\n # line 1
  340. \s*regexp\s+([^\n]+)\n # line 2
  341. \}\s*(\S[^\n]*)/gx; # line 3
  342. handle_doing "extra_" . $extra++, "checks for bug fixes" if @extras;
  343. for (my $i = 0; $i < @extras; $i += 3) {
  344. my $id = $extras[$i];
  345. # further parse the middle line into options and the rest (i.e. $args)
  346. my ($opts, $args) = $extras[$i + 1] =~ /^\s*((?:-\S+\s+)*)([^\s-].*)/;
  347. my @args = parsetcl $args;
  348. $#args = 1; # only want the first two
  349. # now handle the options
  350. my $test = $opts =~ /-indices/ ? 'i' : $extras[$i + 2] ? 'm' : 'f';
  351. my $results = $opts =~ /-inline/ && $test ne 'f' ? $extras[$i+2] : '';
  352. # get them all in the right order and print
  353. unshift @args, $test, parsetcl($id), $results ? '-' : 'o';
  354. push @args, parsetcl(parsetcl($results)) if $results;
  355. handle_test @args;
  356. }
  357. }
  358. # finish
  359. #
  360. handle_end;