もう flex 使わない、これからは re2c

flex & bison をやめて re2c & bison にしたらパーサが速くなった。re2c は UTF32 の入力に対応しているので非常にすっきり書けるし設計も歪まない。

re2c is a tool for writing very fast and very flexible scanners. Unlike any other such tool, re2c focuses on generating high efficient code for regular expression matching.

re2c は flex とは互換がなく独自のマクロなどを定義しなくてはいけないので注意。サンプルとドキュメントの少ないのもマイナスポイント。
それこそ UTF32 を利用している例なんて見つけられなかった。
re2c で多分難しいのは YYFILL (flex でいうところの YYIN)を定義するところなのだけど、下に貼ったコードをほぼそのまま使えば良いと思う。(間違っていたらごめんなさい)


でも何だかんだで flex の不満点がいくつも解消されているし、速いし re2c 最高。

速度

速度ですが

re2c > 手書き(Gauche の read) > flex

という印象。


手書き (read) と、re2c を利用した (read) で速度比較してみました。

手書き(micro sec) re2c(micro sec) ファイル名
443 237 ./scripts/analyze-profile.scm
2458 754 ./scripts/bench.scm
22944 555 ./scripts/count-insn.scm
1087 378 ./scripts/detect-repeat.scm
1059 341 ./scripts/gen-compiler.scm
608 532 ./scripts/gen-cproc.scm
24658 616 ./scripts/gen-doc-index.scm
15483 285 ./scripts/gen-doc.scm
16217 235 ./scripts/gen-free-vars-decl.scm
10746 143 ./scripts/gen-insn.scm
866 371 ./scripts/gen-label.scm
5458 362 ./scripts/gen-pre-compiled-cpp.scm
2056 1120 ./scripts/gen-pre-compiled-faster-cpp.scm
2334 3650 ./scripts/gen-profiler.scm
1648 553 ./scripts/gen-short-insn.scm
792 364 ./scripts/gen-test.scm
579 446 ./scripts/insn-bridge.scm
426 117 ./scripts/log-analyzer.scm
1010 453 ./scripts/pp.scm
3565 2313 ./scripts/test-compiler.scm
2040 879 ./test/reader.scm
253066 81535 ./bench/apply.scm
302 84 ./bench/array1.scm
850 335 ./bench/case.scm
479 86 ./bench/cpstak.scm
605 175 ./bench/fib.scm
525 98 ./bench/let.scm
528 579 ./bench/sum.scm
509 185 ./bench/tak.scm
926 196 ./bench/takl.scm
617 241 ./bench/triangl.scm
3817 502 ./compiler-gauche.scm
36055 35933 ./compiler-vm-cpp.scm
42516 22165 ./compiler-vm.scm
36207 36680 ./compiler-with-library.scm
31136 33579 ./dynamic-wind-test.scm
342 394 ./example/grass.scm
1652 5062 ./free-vars-decl.scm
588 533 ./free-vars.scm
1718 1762 ./hage.scm
82 35 ./hige.scm
2391 995 ./hoge.scm
19020 13131 ./instruction.scm
4315 821 ./library.scm

Parsing R6RS lexical syntax (almost)

いつか検索で誰かがたどり着けるようにコード貼っておきます。
R6RS Lexical syntax.

#include <stdio.h>
#include "Object.h"
#include "Pair.h"
#include "StringProcedures.h"
#include "TextualInputPort.h"
#include "TextualOutputPort.h"
#include "ucs4string.h"
#include "ScannerHelper.h"
#include "Scanner.h"

#include "reader.h"
#include "reader.tab.hpp"
#include "VM.h"

#define YYCTYPE ucs4char
#define YYCURSOR cursor_
#define YYMARKER marker_
#define YYLIMIT limit_
#define YYTOKEN token_
#define YYDEBUG(state, ch)  yydebug(state, ch)
#define YYFILL(n) fill(n)

using namespace scheme;
extern VM* theVM;
extern TextualInputPort* parser_port();
extern YYSTYPE yylval;

Scanner::Scanner() : buffer_(NULL),
                     cursor_(buffer_),
                     token_(buffer_),
                     limit_(buffer_),
                     marker_(buffer_),
                     bufferSize_(32)
{
}

Scanner::~Scanner()
{
}


static void yydebug(int state, ucs4char ch)
{
#if 0
    TextualOutputPort* const port = theVM->getOutputPort().toTextualOutputPort();
    port->format(UC("state=~d ch=[~a]\n"), Pair::list2(Object::makeInt(state), Object::makeChar(ch)));
#endif
}


void Scanner::fill(int n)
{
    TextualInputPort* const inputPort = parser_port();
    const int restCharCount = limit_ - token_;
    const int tokenOffset = token_ - buffer_;

    if (buffer_ == NULL) {
        buffer_ = new(GC) ucs4char[bufferSize_];
        cursor_ = buffer_;
        limit_ = buffer_;
        token_ = buffer_;
        marker_ = buffer_;
    }

    if ((restCharCount + n) > bufferSize_) {
        ucs4char* newBuffer = new(GC) ucs4char[restCharCount + n + 1];
        bufferSize_ = restCharCount + n + 1;
        memmove(newBuffer, token_, restCharCount * sizeof(ucs4char));
        cursor_ = &newBuffer[cursor_ - buffer_];
        token_ = &newBuffer[token_ - buffer_];
        limit_ = &newBuffer[limit_ - buffer_];
        marker_ = &newBuffer[marker_ - buffer_];
        buffer_ = newBuffer;
    } else if (restCharCount > 0) {
        memmove(buffer_, token_, restCharCount * sizeof(ucs4char));
    }

    int i;
    for (i = 0; i < n; i++) {
        const ucs4char ch = inputPort->getChar();
        if (ch == EOF) {
            buffer_[i + restCharCount] = '\0';
            i++;
            break;
        } else {
            buffer_[i + restCharCount] = ch;
        }
    }
    const int readSize = i;
    cursor_ = cursor_ - tokenOffset;
    token_ = buffer_;
    marker_ = marker_ - tokenOffset;
    limit_ = limit_ - tokenOffset + readSize;
}

int yylex()
{
    return parser_port()->scanner()->scan();
}

int Scanner::scan()
{
/*!re2c
  LINE_FEED              = "\n";
  CHARACTER_TABULATION   = "\X0009";
  LINE_TABULATION        = "\X000B";
  LINE_SEPARATOR         = "\X2028";
  FORM_FEED              = "\X000C";
  CARRIGE_RETURN         = "\r";
  NEXT_LINE              = "\X0085";
  UNICODE_ZL_ZP          = [\X2028-\x2029];
  UNICODE_ZS             = "\X0020" | "\X00A0" | "\X1680" | "\X180E" | [\X2000-\X200A] | "\X202F" | "\X205F" | "\X3000";
  LINE_ENDING            = LINE_FEED | CARRIGE_RETURN | (CARRIGE_RETURN LINE_FEED) | NEXT_LINE | (CARRIGE_RETURN NEXT_LINE) | LINE_SEPARATOR;
  WHITE_SPACE            = CHARACTER_TABULATION | LINE_FEED | LINE_TABULATION | FORM_FEED | CARRIGE_RETURN | NEXT_LINE | UNICODE_ZL_ZP | UNICODE_ZS;
  DELMITER               = [\(\)\[\]\";#]|WHITE_SPACE;
  ANY_CHARACTER          = [^];
  DIGIT                  = [0-9];
  HEX_DIGIT              = DIGIT | [a-f] | [A-F];
  HEX_SCALAR_VALUE       = HEX_DIGIT +;
  INTRA_LINE_WHITE_SPACE = "\t" | UNICODE_ZS;
  INLINE_HEX_ESCAPE      = "\\x" HEX_SCALAR_VALUE ";";
  STRING_ELEMENT         = [^\"\\] | ("\\" [abtnvfr\"\\]) | ("\\" INTRA_LINE_WHITE_SPACE LINE_ENDING INTRA_LINE_WHITE_SPACE) | INLINE_HEX_ESCAPE;
  REGEXP_ELEMENT         = "\\\/" | [^/];
  DIGIT_10               = DIGIT;
  UINTEGER_10            = DIGIT_10 +;
  NAN_INF                = "nan.0" | "inf.0";
  SIGN                   = [\+\-]?;
  EXPONENT_MARKER        = [eEsSfFdDlL];
  MANTISSA_WIDTH         = ("|" (DIGIT_10)+)?;
  SUFFIX                 = (EXPONENT_MARKER SIGN (DIGIT_10)+)?;
  DECIMAL_10             = (UINTEGER_10 SUFFIX) | ("." (DIGIT_10)+ SUFFIX) | ((DIGIT_10)+ "." (DIGIT_10)* SUFFIX) | ((DIGIT_10)+ "." SUFFIX);
  UREAL_10               = UINTEGER_10 | (UINTEGER_10 "/" UINTEGER_10);
  REAL_10                = (SIGN UREAL_10) | ([\+\-] NAN_INF);
  COMPLEX_10             = REAL_10 | (REAL_10 "@" REAL_10) | (REAL_10 [\+\-] UREAL_10 "i") | (REAL_10 [\+\-] NAN_INF "i") | (REAL_10 [\+\-] "i") | ([\+-\] UREAL_10 "i") | ([\+\-] NAN_INF "i") | ([\+\-] "i");
  RADIX_10               = ("#"[dD])?;
  EXACTNESS              = ("#"[iIeE])?;
  PREFIX_10              = (RADIX_10 EXACTNESS) | (EXACTNESS RADIX_10);
  NUM_10                 = PREFIX_10 COMPLEX_10;
  SPECIAL_INITIAL        = [!\$%&\*\/\:\<=\>\?\^\_~];
  LETTER                 = [a-z] | [A-Z];
  CONSTITUENT            = LETTER | [\X0080-\XFFFF]; /* todo: replace \X0080-\XFFFF to Unicode category */
  INITIAL                = CONSTITUENT | SPECIAL_INITIAL | INLINE_HEX_ESCAPE;
  SUBSEQUENT             = INITIAL | DIGIT | [\+\-\.@]; /* todo: Add Unicode category Nd, Mc and Me */
  PECULIAR_IDENTIFIER    = [\+\-] | "..." | ("->" (SUBSEQUENT)*) | "@"; /* "@" is not R6RS match.scm required it. */
  IDENTIFIER             = (INITIAL (SUBSEQUENT)*) | PECULIAR_IDENTIFIER;
  COMMENT                = (";"[^\n\X0000]* (LINE_ENDING | "\X0000")) | ("#!"[^\n]*);
*/

    for(;;)
    {
/*!re2c
        "#"[tT] DELMITER {
            yylval.boolValue = true;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return BOOLEAN;
        }
        "#"[fF] DELMITER {
            yylval.boolValue = false;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return BOOLEAN;
        }
        "#\\space" DELMITER {
            yylval.charValue = ' ';
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\newline" DELMITER {
            yylval.charValue = '\n';
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\nul" DELMITER {
            yylval.charValue = 0x00;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\alerm" DELMITER {
            yylval.charValue = 0x07;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\backspace" DELMITER {
            yylval.charValue = 0x08;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\tab" DELMITER {
            yylval.charValue = 0x09;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\linefeed" DELMITER {
            yylval.charValue = 0x0A;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\vtab" DELMITER {
            yylval.charValue = 0x0B;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\page" DELMITER {
            yylval.charValue = 0x0C;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\return" DELMITER {
            yylval.charValue = 0x0D;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\esc" DELMITER {
            yylval.charValue = 0x1B;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\delete" DELMITER {
            yylval.charValue = 0x7F;
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\" ANY_CHARACTER DELMITER {
            yylval.charValue = YYTOKEN[2];
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        "#\\x" HEX_SCALAR_VALUE DELMITER {
            yylval.charValue = ScannerHelper::hexStringToUCS4Char(YYTOKEN + 3, YYCURSOR - 1);
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return CHARACTER;
        }
        /* we now omit "(DECIMAL_10 MANTISSA_WIDTH)" for UREAL_10. */
        /* it causes infinite loop. */
        NUM_10 DELMITER {
            yylval.intValue = ScannerHelper::num10StringToInt(YYTOKEN, YYCURSOR - 1);
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return NUMBER;
        }
        IDENTIFIER DELMITER {
            yylval.stringValue = ucs4string(YYTOKEN, (YYCURSOR - YYTOKEN) - 1);
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return IDENTIFIER;
        }
        "#/" REGEXP_ELEMENT* "/" DELMITER {
            yylval.stringValue = ucs4string(YYTOKEN + 2, (YYCURSOR - YYTOKEN) - 4);
            YYCURSOR--;
            YYTOKEN = YYCURSOR;

            return REGEXP;
        }
        "\"" STRING_ELEMENT* "\"" DELMITER {
            yylval.stringValue = ucs4string(YYTOKEN + 1, (YYCURSOR - YYTOKEN) - 3);
            YYCURSOR--;
            YYTOKEN = YYCURSOR;
            return STRING;
        }
        "." {
            return DOT;
        }
        "`"   {
            YYTOKEN = YYCURSOR;
            return ABBV_QUASIQUOTE;
        }
        "'"   {
            YYTOKEN = YYCURSOR;
            return ABBV_QUOTE;
        }
        ",@"  {
            YYTOKEN = YYCURSOR;
            return ABBV_UNQUOTESPLICING;
        }
        ","   {
            YYTOKEN = YYCURSOR;
            return ABBV_UNQUOTE; }
        "#'"  {
            YYTOKEN = YYCURSOR;
            return ABBV_SYNTAX; }
        "#`"  {
            YYTOKEN = YYCURSOR;
            return ABBV_QUASISYNTAX; }
        "#,"  {
            YYTOKEN = YYCURSOR;
            return ABBV_UNSYNTAX;
        }
        "#,@" {
            YYTOKEN = YYCURSOR;
            return ABBV_UNSYNTAXSPLICING;
        }
        COMMENT {
            YYTOKEN = YYCURSOR;
            continue;
        }
        [\]\)] {
            YYTOKEN = YYCURSOR;
            return RIGHT_PAREN;
        }
        [\(\[] {
            YYTOKEN = YYCURSOR;
            return LEFT_PAREN;
        }
        "#(" {
            YYTOKEN = YYCURSOR;
            return VECTOR_START;
        }
        "#vu8(" {
            YYTOKEN = YYCURSOR;
            return BYTE_VECTOR_START;
        }
        WHITE_SPACE {
            YYTOKEN = YYCURSOR;
            continue;
        }
        "\X0000" {
            YYTOKEN = YYCURSOR;
            return END_OF_FILE;
        }
        "#|" {
            goto comment;
        }
*/

comment:
        YYTOKEN = YYCURSOR;
/*!re2c
        "|#" {
            YYTOKEN = YYCURSOR;
            continue;
        }
        ANY_CHARACTER
        {
            goto comment;
        }
*/
    }
}

ucs4char* Scanner::currentToken() const
{
    return token_;
}