もう flex 使わない、これからは re2c
flex & bison をやめて re2c & bison にしたらパーサが速くなった。re2c は UTF32 の入力に対応しているので非常にすっきり書けるし設計も歪まない。
re2c is a tool for writing very fast and very flexible scanners. Unlike any other such tool, re2c focuses on generating high efficient code for regular expression matching.
re2c は flex とは互換がなく独自のマクロなどを定義しなくてはいけないので注意。サンプルとドキュメントの少ないのもマイナスポイント。
それこそ UTF32 を利用している例なんて見つけられなかった。
re2c で多分難しいのは YYFILL (flex でいうところの YYIN)を定義するところなのだけど、下に貼ったコードをほぼそのまま使えば良いと思う。(間違っていたらごめんなさい)
でも何だかんだで flex の不満点がいくつも解消されているし、速いし re2c 最高。
速度
速度ですが
re2c > 手書き(Gauche の read) > flex
という印象。
手書き (read) と、re2c を利用した (read) で速度比較してみました。
手書き(micro sec) | re2c(micro sec) | ファイル名 |
---|---|---|
443 | 237 | ./scripts/analyze-profile.scm |
2458 | 754 | ./scripts/bench.scm |
22944 | 555 | ./scripts/count-insn.scm |
1087 | 378 | ./scripts/detect-repeat.scm |
1059 | 341 | ./scripts/gen-compiler.scm |
608 | 532 | ./scripts/gen-cproc.scm |
24658 | 616 | ./scripts/gen-doc-index.scm |
15483 | 285 | ./scripts/gen-doc.scm |
16217 | 235 | ./scripts/gen-free-vars-decl.scm |
10746 | 143 | ./scripts/gen-insn.scm |
866 | 371 | ./scripts/gen-label.scm |
5458 | 362 | ./scripts/gen-pre-compiled-cpp.scm |
2056 | 1120 | ./scripts/gen-pre-compiled-faster-cpp.scm |
2334 | 3650 | ./scripts/gen-profiler.scm |
1648 | 553 | ./scripts/gen-short-insn.scm |
792 | 364 | ./scripts/gen-test.scm |
579 | 446 | ./scripts/insn-bridge.scm |
426 | 117 | ./scripts/log-analyzer.scm |
1010 | 453 | ./scripts/pp.scm |
3565 | 2313 | ./scripts/test-compiler.scm |
2040 | 879 | ./test/reader.scm |
253066 | 81535 | ./bench/apply.scm |
302 | 84 | ./bench/array1.scm |
850 | 335 | ./bench/case.scm |
479 | 86 | ./bench/cpstak.scm |
605 | 175 | ./bench/fib.scm |
525 | 98 | ./bench/let.scm |
528 | 579 | ./bench/sum.scm |
509 | 185 | ./bench/tak.scm |
926 | 196 | ./bench/takl.scm |
617 | 241 | ./bench/triangl.scm |
3817 | 502 | ./compiler-gauche.scm |
36055 | 35933 | ./compiler-vm-cpp.scm |
42516 | 22165 | ./compiler-vm.scm |
36207 | 36680 | ./compiler-with-library.scm |
31136 | 33579 | ./dynamic-wind-test.scm |
342 | 394 | ./example/grass.scm |
1652 | 5062 | ./free-vars-decl.scm |
588 | 533 | ./free-vars.scm |
1718 | 1762 | ./hage.scm |
82 | 35 | ./hige.scm |
2391 | 995 | ./hoge.scm |
19020 | 13131 | ./instruction.scm |
4315 | 821 | ./library.scm |
Parsing R6RS lexical syntax (almost)
いつか検索で誰かがたどり着けるようにコード貼っておきます。
R6RS Lexical syntax.
#include <stdio.h> #include "Object.h" #include "Pair.h" #include "StringProcedures.h" #include "TextualInputPort.h" #include "TextualOutputPort.h" #include "ucs4string.h" #include "ScannerHelper.h" #include "Scanner.h" #include "reader.h" #include "reader.tab.hpp" #include "VM.h" #define YYCTYPE ucs4char #define YYCURSOR cursor_ #define YYMARKER marker_ #define YYLIMIT limit_ #define YYTOKEN token_ #define YYDEBUG(state, ch) yydebug(state, ch) #define YYFILL(n) fill(n) using namespace scheme; extern VM* theVM; extern TextualInputPort* parser_port(); extern YYSTYPE yylval; Scanner::Scanner() : buffer_(NULL), cursor_(buffer_), token_(buffer_), limit_(buffer_), marker_(buffer_), bufferSize_(32) { } Scanner::~Scanner() { } static void yydebug(int state, ucs4char ch) { #if 0 TextualOutputPort* const port = theVM->getOutputPort().toTextualOutputPort(); port->format(UC("state=~d ch=[~a]\n"), Pair::list2(Object::makeInt(state), Object::makeChar(ch))); #endif } void Scanner::fill(int n) { TextualInputPort* const inputPort = parser_port(); const int restCharCount = limit_ - token_; const int tokenOffset = token_ - buffer_; if (buffer_ == NULL) { buffer_ = new(GC) ucs4char[bufferSize_]; cursor_ = buffer_; limit_ = buffer_; token_ = buffer_; marker_ = buffer_; } if ((restCharCount + n) > bufferSize_) { ucs4char* newBuffer = new(GC) ucs4char[restCharCount + n + 1]; bufferSize_ = restCharCount + n + 1; memmove(newBuffer, token_, restCharCount * sizeof(ucs4char)); cursor_ = &newBuffer[cursor_ - buffer_]; token_ = &newBuffer[token_ - buffer_]; limit_ = &newBuffer[limit_ - buffer_]; marker_ = &newBuffer[marker_ - buffer_]; buffer_ = newBuffer; } else if (restCharCount > 0) { memmove(buffer_, token_, restCharCount * sizeof(ucs4char)); } int i; for (i = 0; i < n; i++) { const ucs4char ch = inputPort->getChar(); if (ch == EOF) { buffer_[i + restCharCount] = '\0'; i++; break; } else { buffer_[i + restCharCount] = ch; } } const int readSize = i; cursor_ = cursor_ - tokenOffset; token_ = buffer_; marker_ = marker_ - tokenOffset; limit_ = limit_ - tokenOffset + readSize; } int yylex() { return parser_port()->scanner()->scan(); } int Scanner::scan() { /*!re2c LINE_FEED = "\n"; CHARACTER_TABULATION = "\X0009"; LINE_TABULATION = "\X000B"; LINE_SEPARATOR = "\X2028"; FORM_FEED = "\X000C"; CARRIGE_RETURN = "\r"; NEXT_LINE = "\X0085"; UNICODE_ZL_ZP = [\X2028-\x2029]; UNICODE_ZS = "\X0020" | "\X00A0" | "\X1680" | "\X180E" | [\X2000-\X200A] | "\X202F" | "\X205F" | "\X3000"; LINE_ENDING = LINE_FEED | CARRIGE_RETURN | (CARRIGE_RETURN LINE_FEED) | NEXT_LINE | (CARRIGE_RETURN NEXT_LINE) | LINE_SEPARATOR; WHITE_SPACE = CHARACTER_TABULATION | LINE_FEED | LINE_TABULATION | FORM_FEED | CARRIGE_RETURN | NEXT_LINE | UNICODE_ZL_ZP | UNICODE_ZS; DELMITER = [\(\)\[\]\";#]|WHITE_SPACE; ANY_CHARACTER = [^]; DIGIT = [0-9]; HEX_DIGIT = DIGIT | [a-f] | [A-F]; HEX_SCALAR_VALUE = HEX_DIGIT +; INTRA_LINE_WHITE_SPACE = "\t" | UNICODE_ZS; INLINE_HEX_ESCAPE = "\\x" HEX_SCALAR_VALUE ";"; STRING_ELEMENT = [^\"\\] | ("\\" [abtnvfr\"\\]) | ("\\" INTRA_LINE_WHITE_SPACE LINE_ENDING INTRA_LINE_WHITE_SPACE) | INLINE_HEX_ESCAPE; REGEXP_ELEMENT = "\\\/" | [^/]; DIGIT_10 = DIGIT; UINTEGER_10 = DIGIT_10 +; NAN_INF = "nan.0" | "inf.0"; SIGN = [\+\-]?; EXPONENT_MARKER = [eEsSfFdDlL]; MANTISSA_WIDTH = ("|" (DIGIT_10)+)?; SUFFIX = (EXPONENT_MARKER SIGN (DIGIT_10)+)?; DECIMAL_10 = (UINTEGER_10 SUFFIX) | ("." (DIGIT_10)+ SUFFIX) | ((DIGIT_10)+ "." (DIGIT_10)* SUFFIX) | ((DIGIT_10)+ "." SUFFIX); UREAL_10 = UINTEGER_10 | (UINTEGER_10 "/" UINTEGER_10); REAL_10 = (SIGN UREAL_10) | ([\+\-] NAN_INF); COMPLEX_10 = REAL_10 | (REAL_10 "@" REAL_10) | (REAL_10 [\+\-] UREAL_10 "i") | (REAL_10 [\+\-] NAN_INF "i") | (REAL_10 [\+\-] "i") | ([\+-\] UREAL_10 "i") | ([\+\-] NAN_INF "i") | ([\+\-] "i"); RADIX_10 = ("#"[dD])?; EXACTNESS = ("#"[iIeE])?; PREFIX_10 = (RADIX_10 EXACTNESS) | (EXACTNESS RADIX_10); NUM_10 = PREFIX_10 COMPLEX_10; SPECIAL_INITIAL = [!\$%&\*\/\:\<=\>\?\^\_~]; LETTER = [a-z] | [A-Z]; CONSTITUENT = LETTER | [\X0080-\XFFFF]; /* todo: replace \X0080-\XFFFF to Unicode category */ INITIAL = CONSTITUENT | SPECIAL_INITIAL | INLINE_HEX_ESCAPE; SUBSEQUENT = INITIAL | DIGIT | [\+\-\.@]; /* todo: Add Unicode category Nd, Mc and Me */ PECULIAR_IDENTIFIER = [\+\-] | "..." | ("->" (SUBSEQUENT)*) | "@"; /* "@" is not R6RS match.scm required it. */ IDENTIFIER = (INITIAL (SUBSEQUENT)*) | PECULIAR_IDENTIFIER; COMMENT = (";"[^\n\X0000]* (LINE_ENDING | "\X0000")) | ("#!"[^\n]*); */ for(;;) { /*!re2c "#"[tT] DELMITER { yylval.boolValue = true; YYCURSOR--; YYTOKEN = YYCURSOR; return BOOLEAN; } "#"[fF] DELMITER { yylval.boolValue = false; YYCURSOR--; YYTOKEN = YYCURSOR; return BOOLEAN; } "#\\space" DELMITER { yylval.charValue = ' '; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\newline" DELMITER { yylval.charValue = '\n'; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\nul" DELMITER { yylval.charValue = 0x00; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\alerm" DELMITER { yylval.charValue = 0x07; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\backspace" DELMITER { yylval.charValue = 0x08; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\tab" DELMITER { yylval.charValue = 0x09; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\linefeed" DELMITER { yylval.charValue = 0x0A; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\vtab" DELMITER { yylval.charValue = 0x0B; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\page" DELMITER { yylval.charValue = 0x0C; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\return" DELMITER { yylval.charValue = 0x0D; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\esc" DELMITER { yylval.charValue = 0x1B; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\delete" DELMITER { yylval.charValue = 0x7F; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\" ANY_CHARACTER DELMITER { yylval.charValue = YYTOKEN[2]; YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } "#\\x" HEX_SCALAR_VALUE DELMITER { yylval.charValue = ScannerHelper::hexStringToUCS4Char(YYTOKEN + 3, YYCURSOR - 1); YYCURSOR--; YYTOKEN = YYCURSOR; return CHARACTER; } /* we now omit "(DECIMAL_10 MANTISSA_WIDTH)" for UREAL_10. */ /* it causes infinite loop. */ NUM_10 DELMITER { yylval.intValue = ScannerHelper::num10StringToInt(YYTOKEN, YYCURSOR - 1); YYCURSOR--; YYTOKEN = YYCURSOR; return NUMBER; } IDENTIFIER DELMITER { yylval.stringValue = ucs4string(YYTOKEN, (YYCURSOR - YYTOKEN) - 1); YYCURSOR--; YYTOKEN = YYCURSOR; return IDENTIFIER; } "#/" REGEXP_ELEMENT* "/" DELMITER { yylval.stringValue = ucs4string(YYTOKEN + 2, (YYCURSOR - YYTOKEN) - 4); YYCURSOR--; YYTOKEN = YYCURSOR; return REGEXP; } "\"" STRING_ELEMENT* "\"" DELMITER { yylval.stringValue = ucs4string(YYTOKEN + 1, (YYCURSOR - YYTOKEN) - 3); YYCURSOR--; YYTOKEN = YYCURSOR; return STRING; } "." { return DOT; } "`" { YYTOKEN = YYCURSOR; return ABBV_QUASIQUOTE; } "'" { YYTOKEN = YYCURSOR; return ABBV_QUOTE; } ",@" { YYTOKEN = YYCURSOR; return ABBV_UNQUOTESPLICING; } "," { YYTOKEN = YYCURSOR; return ABBV_UNQUOTE; } "#'" { YYTOKEN = YYCURSOR; return ABBV_SYNTAX; } "#`" { YYTOKEN = YYCURSOR; return ABBV_QUASISYNTAX; } "#," { YYTOKEN = YYCURSOR; return ABBV_UNSYNTAX; } "#,@" { YYTOKEN = YYCURSOR; return ABBV_UNSYNTAXSPLICING; } COMMENT { YYTOKEN = YYCURSOR; continue; } [\]\)] { YYTOKEN = YYCURSOR; return RIGHT_PAREN; } [\(\[] { YYTOKEN = YYCURSOR; return LEFT_PAREN; } "#(" { YYTOKEN = YYCURSOR; return VECTOR_START; } "#vu8(" { YYTOKEN = YYCURSOR; return BYTE_VECTOR_START; } WHITE_SPACE { YYTOKEN = YYCURSOR; continue; } "\X0000" { YYTOKEN = YYCURSOR; return END_OF_FILE; } "#|" { goto comment; } */ comment: YYTOKEN = YYCURSOR; /*!re2c "|#" { YYTOKEN = YYCURSOR; continue; } ANY_CHARACTER { goto comment; } */ } } ucs4char* Scanner::currentToken() const { return token_; }