Regex.cpp

/*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/

#include "Regex.h"

#include "RegexExpression.h"

#include "RegexPure.h"

#include "RegexRich.h"

namespace

{

namespace

regex

{

using

namespace

collections

;

using

namespace

regex_internal

;

/*********************************************************************** RegexString ***********************************************************************/

RegexString

(

vint

_start

) :start(

_start

) ,length(

) { }

RegexString

(

const

WString

_string

vint

_start

vint

_length

) :value(

_length

L""

_string

Sub

(

_start

_length

)) ,start(

_start

) ,length(

_length

) { }

vint

RegexString

Start

()

const

{

return

start

; }

vint

RegexString

Length

()

const

{

return

length

; }

const

WString

RegexString

Value

()

const

{

return

value

; }

bool

RegexString

operator

(

const

RegexString

string

)

const

{

return

start

string

start

length

string

length

value

string

value

; }

/*********************************************************************** RegexMatch ***********************************************************************/

RegexMatch

(

const

WString

_string

PureResult

_result

) :success(

true

) ,result(

_string

_result

start

_result

length

) { }

RegexMatch

(

const

WString

_string

RichResult

_result

RichInterpretor

_rich

) :success(

true

) ,result(

_string

_result

start

_result

length

) {

for

(

vint

;

_result

captures

Count

();

++) {

CaptureRecord

capture

_result

captures

[

];

(

capture

==-

) {

captures

Add

(

RegexString

(

_string

capture

start

capture

length

)); }

else

{

groups

Add

(

_rich

CaptureNames

().

Get

(

capture

RegexString

(

_string

capture

start

capture

length

)); } } }

RegexMatch

(

const

RegexString

_result

) :success(

false

) ,result(

_result

) { }

bool

RegexMatch

Success

()

const

{

return

success

; }

const

RegexString

RegexMatch

Result

()

const

{

return

result

; }

const

RegexMatch

CaptureList

RegexMatch

Captures

()

const

{

return

captures

; }

const

RegexMatch

CaptureGroup

RegexMatch

Groups

()

const

{

return

groups

; }

/*********************************************************************** Regex ***********************************************************************/

void

Regex

Process

(

const

WString

text

bool

keepEmpty

bool

keepSuccess

bool

keepFail

RegexMatch

List

matches

)

const

{

(

rich

) {

const

wchar_t

start

text

Buffer

();

const

wchar_t

input

start

;

RichResult

result

;

while

(

rich

Match

(

input

start

result

)) {

vint

offset

input

start

;

(

keepFail

) {

(

result

start

offset

keepEmpty

) {

matches

Add

(

new

RegexMatch

(

RegexString

(

text

offset

result

start

offset

))); } }

(

keepSuccess

) {

matches

Add

(

new

RegexMatch

(

text

, &

result

rich

)); }

input

start

result

start

result

length

; }

(

keepFail

) {

vint

remain

input

start

;

vint

length

text

Length

()-

remain

;

(

length

keepEmpty

) {

matches

Add

(

new

RegexMatch

(

RegexString

(

text

remain

length

))); } } }

else

{

const

wchar_t

start

text

Buffer

();

const

wchar_t

input

start

;

PureResult

result

;

while

(

pure

Match

(

input

start

result

)) {

vint

offset

input

start

;

(

keepFail

) {

(

result

start

offset

keepEmpty

) {

matches

Add

(

new

RegexMatch

(

RegexString

(

text

offset

result

start

offset

))); } }

(

keepSuccess

) {

matches

Add

(

new

RegexMatch

(

text

, &

result

)); }

input

start

result

start

result

length

; }

(

keepFail

) {

vint

remain

input

start

;

vint

length

text

Length

()-

remain

;

(

length

keepEmpty

) {

matches

Add

(

new

RegexMatch

(

RegexString

(

text

remain

length

))); } } } }

Regex

(

const

WString

code

bool

preferPure

) {

CharRange

List

subsets

;

RegexExpression

Ref

regex

ParseRegexExpression

(

code

);

Expression

Ref

expression

regex

Merge

();

expression

NormalizeCharSet

(

subsets

);

bool

pureRequired

false

;

bool

richRequired

false

;

(

preferPure

) {

(

expression

HasNoExtension

()) {

pureRequired

true

; }

else

{

(

expression

CanTreatAsPure

()) {

pureRequired

true

;

richRequired

true

; }

else

{

richRequired

true

; } } }

else

{

richRequired

true

; }

try

{

(

pureRequired

) {

Dictionary

State

nfaStateMap

;

Group

State

dfaStateMap

;

Automaton

Ref

eNfa

expression

GenerateEpsilonNfa

();

Automaton

Ref

nfa

EpsilonNfaToNfa

(

eNfa

PureEpsilonChecker

nfaStateMap

);

Automaton

Ref

dfa

NfaToDfa

(

nfa

dfaStateMap

);

pure

new

PureInterpretor

(

dfa

subsets

); }

(

richRequired

) {

Dictionary

State

nfaStateMap

;

Group

State

dfaStateMap

;

Automaton

Ref

eNfa

expression

GenerateEpsilonNfa

();

Automaton

Ref

nfa

EpsilonNfaToNfa

(

eNfa

RichEpsilonChecker

nfaStateMap

);

Automaton

Ref

dfa

NfaToDfa

(

nfa

dfaStateMap

);

rich

new

RichInterpretor

(

dfa

); } }

catch

(...) {

(

pure

)

delete

pure

;

(

rich

)

delete

rich

;

throw

; } }

Regex

() {

(

pure

)

delete

pure

;

(

rich

)

delete

rich

; }

bool

Regex

IsPureMatch

()

const

{

return

rich

false

true

; }

bool

Regex

IsPureTest

()

const

{

return

pure

true

false

; }

RegexMatch

Ref

Regex

MatchHead

(

const

WString

text

)

const

{

(

rich

) {

RichResult

result

;

(

rich

MatchHead

(

text

Buffer

(),

text

Buffer

(),

result

)) {

return

new

RegexMatch

(

text

, &

result

rich

); }

else

{

return

; } }

else

{

PureResult

result

;

(

pure

MatchHead

(

text

Buffer

(),

text

Buffer

(),

result

)) {

return

new

RegexMatch

(

text

, &

result

); }

else

{

return

; } } }

RegexMatch

Ref

Regex

Match

(

const

WString

text

)

const

{

(

rich

) {

RichResult

result

;

(

rich

Match

(

text

Buffer

(),

text

Buffer

(),

result

)) {

return

new

RegexMatch

(

text

, &

result

rich

); }

else

{

return

; } }

else

{

PureResult

result

;

(

pure

Match

(

text

Buffer

(),

text

Buffer

(),

result

)) {

return

new

RegexMatch

(

text

, &

result

); }

else

{

return

; } } }

bool

Regex

TestHead

(

const

WString

text

)

const

{

(

pure

) {

PureResult

result

;

return

pure

MatchHead

(

text

Buffer

(),

text

Buffer

(),

result

); }

else

{

RichResult

result

;

return

rich

MatchHead

(

text

Buffer

(),

text

Buffer

(),

result

); } }

bool

Regex

Test

(

const

WString

text

)

const

{

(

pure

) {

PureResult

result

;

return

pure

Match

(

text

Buffer

(),

text

Buffer

(),

result

); }

else

{

RichResult

result

;

return

rich

Match

(

text

Buffer

(),

text

Buffer

(),

result

); } }

void

Regex

(

const

WString

text

RegexMatch

List

matches

)

const

{

Process

(

text

false

true

false

matches

); }

void

Regex

Split

(

const

WString

text

bool

keepEmptyMatch

RegexMatch

List

matches

)

const

{

Process

(

text

keepEmptyMatch

false

true

matches

); }

void

Regex

Cut

(

const

WString

text

bool

keepEmptyMatch

RegexMatch

List

matches

)

const

{

Process

(

text

keepEmptyMatch

true

matches

); }

/*********************************************************************** RegexTokens ***********************************************************************/

bool

RegexToken

operator

(

const

RegexToken

_token

)

const

{

return

length

_token

length

token

_token

token

reading

_token

reading

; }

bool

RegexToken

operator

(

const

wchar_t

_token

)

const

{

return

wcslen

(

_token

)==

length

wcsncmp

(

reading

_token

length

)==

; }

class

RegexTokenEnumerator

public

Object

public

IEnumerator

RegexToken

> {

protected

RegexToken

token

;

vint

index

= -

;

PureInterpretor

pure

;

const

Array

vint

stateTokens

;

const

wchar_t

start

;

vint

codeIndex

;

RegexProc

proc

;

const

wchar_t

reading

;

vint

rowStart

;

vint

columnStart

;

bool

cacheAvailable

false

;

RegexToken

cacheToken

;

public

RegexTokenEnumerator

(

const

RegexTokenEnumerator

enumerator

) :token(

enumerator

token

) , index(

enumerator

index

) , pure(

enumerator

pure

) , stateTokens(

enumerator

stateTokens

) , proc(

enumerator

proc

) , reading(

enumerator

reading

) , start(

enumerator

start

) , rowStart(

enumerator

rowStart

) , columnStart(

enumerator

columnStart

) , codeIndex(

enumerator

codeIndex

) , cacheAvailable(

enumerator

cacheAvailable

) , cacheToken(

enumerator

cacheToken

) { }

RegexTokenEnumerator

(

PureInterpretor

_pure

const

Array

vint

_stateTokens

const

wchar_t

_start

vint

_codeIndex

RegexProc

_proc

) :index(-

) , pure(

_pure

) , stateTokens(

_stateTokens

) , start(

_start

) , codeIndex(

_codeIndex

) , proc(

_proc

) , reading(

_start

) { }

IEnumerator

RegexToken

Clone

()

const

{

return

new

RegexTokenEnumerator

this

); }

const

RegexToken

Current

()

const

{

return

token

; }

vint

Index

()

const

{

return

index

; }

bool

() {

cacheAvailable

&& !*

reading

)

return

false

;

(

cacheAvailable

) {

token

cacheToken

;

cacheAvailable

false

; }

else

{

token

reading

;

token

start

;

token

length

;

token

= -

;

token

completeToken

true

; }

token

rowStart

;

token

columnStart

;

token

rowEnd

rowStart

;

token

columnEnd

columnStart

;

token

codeIndex

;

PureResult

result

;

while

reading

) {

vint

= -

;

bool

completeToken

true

;

pure

MatchHead

(

reading

start

result

)) {

result

start

reading

start

;

(

== -

result

terminateState

!= -

) {

vint

state

pure

GetRelatedFinalState

(

result

terminateState

);

(

state

!= -

) {

stateTokens

[

state

]; } }

(

== -

) {

result

length

; }

else

{

completeToken

false

; } }

else

{

stateTokens

Get

(

result

finalState

); }

(

!= -

proc

extendProc

) {

RegexProcessingToken

token

(

result

start

result

length

completeToken

nullptr

);

proc

extendProc

(

proc

argument

reading

, -

true

token

);

#if _DEBUG

CHECK_ERROR(token.interTokenState == nullptr, L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer."); #endif

{

(!(

token

interTokenState

nullptr

))

throw

Error

(

L"RegexTokenEnumerator::Next()#The extendProc is only allowed to create interTokenState in RegexLexerColorizer."

);}

while

(

);

result

length

token

length

;

token

;

completeToken

token

completeToken

; }

(

token

== -

) {

token

start

result

start

;

token

length

result

length

;

token

;

token

completeToken

; }

else

(

token

== -

) {

token

length

result

length

; }

else

{

cacheAvailable

true

;

cacheToken

reading

;

cacheToken

start

result

start

;

cacheToken

length

result

length

;

cacheToken

codeIndex

;

cacheToken

token

;

cacheToken

completeToken

; }

reading

result

length

;

(

cacheAvailable

) {

break

; } }

index

++;

for

(

vint

;

token

length

;

++) {

token

rowEnd

rowStart

;

token

columnEnd

columnStart

;

(

token

reading

[

] ==

L'\n'

) {

rowStart

++;

columnStart

; }

else

{

columnStart

++; } }

return

true

; }

void

Reset

() {

index

= -

;

reading

start

;

cacheAvailable

false

; }

void

ReadToEnd

(

List

RegexToken

tokens

bool

discard

)(

vint

)) {

while

(

()) {

discard

(

token

)) {

tokens

Add

(

token

); } } } };

RegexTokens

(

PureInterpretor

_pure

const

Array

vint

_stateTokens

const

WString

_code

vint

_codeIndex

RegexProc

_proc

) :pure(

_pure

) , stateTokens(

_stateTokens

) , code(

_code

) , codeIndex(

_codeIndex

) , proc(

_proc

) { }

RegexTokens

(

const

RegexTokens

tokens

) :pure(

tokens

pure

) , stateTokens(

tokens

stateTokens

) , code(

tokens

code

) , codeIndex(

tokens

codeIndex

) , proc(

tokens

proc

) { }

IEnumerator

RegexToken

RegexTokens

CreateEnumerator

()

const

{

return

new

RegexTokenEnumerator

(

pure

stateTokens

code

Buffer

(),

codeIndex

proc

); }

bool

DefaultDiscard

(

vint

token

) {

return

false

; }

void

RegexTokens

ReadToEnd

(

collections

List

RegexToken

tokens

bool

discard

)(

vint

))

const

{

(

discard

) {

discard

DefaultDiscard

; }

RegexTokenEnumerator

(

pure

stateTokens

code

Buffer

(),

codeIndex

proc

ReadToEnd

(

tokens

discard

); }

/*********************************************************************** RegexLexerWalker ***********************************************************************/

RegexLexerWalker

(

PureInterpretor

_pure

const

Array

vint

_stateTokens

) :pure(

_pure

) , stateTokens(

_stateTokens

) { }

RegexLexerWalker

(

const

RegexLexerWalker

tokens

) : pure(

tokens

pure

) , stateTokens(

tokens

stateTokens

) { }

RegexLexerWalker

() { }

RegexTokens

() { }

vint

RegexLexerWalker

GetStartState

()

const

{

return

pure

GetStartState

(); }

vint

RegexLexerWalker

GetRelatedToken

(

vint

state

)

const

{

vint

finalState

state

== -

? -

pure

GetRelatedFinalState

(

state

);

return

finalState

== -

? -

stateTokens

Get

(

finalState

); }

void

RegexLexerWalker

Walk

(

wchar_t

input

vint

state

vint

token

bool

finalState

bool

previousTokenStop

)

const

{

vint

previousState

state

;

token

;

finalState

false

;

previousTokenStop

false

;

(

state

==-

) {

state

pure

GetStartState

();

previousTokenStop

true

; }

state

pure

Transit

(

input

state

);

(

state

==-

) {

previousTokenStop

true

;

(

previousState

==-

) {

finalState

true

;

return

; }

else

(

pure

IsFinalState

(

previousState

)) {

state

pure

Transit

(

input

pure

GetStartState

()); } }

(

pure

IsFinalState

(

state

)) {

token

stateTokens

Get

(

state

);

finalState

true

;

return

; }

else

{

finalState

state

==-

;

return

; } }

vint

RegexLexerWalker

Walk

(

wchar_t

input

vint

state

)

const

{

vint

token

;

bool

finalState

false

;

bool

previousTokenStop

false

;

Walk

(

input

state

token

finalState

previousTokenStop

);

return

state

; }

bool

RegexLexerWalker

IsClosedToken

(

const

wchar_t

input

vint

length

)

const

{

vint

state

pure

GetStartState

();

for

(

vint

;

length

;

++) {

state

pure

Transit

(

input

[

state

);

(

state

==-

)

return

true

;

(

pure

IsDeadState

(

state

))

return

true

; }

return

false

; }

bool

RegexLexerWalker

IsClosedToken

(

const

WString

input

)

const

{

return

IsClosedToken

(

input

Buffer

(),

input

Length

()); }

/*********************************************************************** RegexLexerColorizer ***********************************************************************/

RegexLexerColorizer

(

const

RegexLexerWalker

_walker

RegexProc

_proc

) :walker(

_walker

) , proc(

_proc

) {

internalState

currentState

walker

GetStartState

(); }

RegexLexerColorizer

(

const

RegexLexerColorizer

colorizer

) :walker(

colorizer

walker

) , proc(

colorizer

proc

) , internalState(

colorizer

internalState

) { }

RegexLexerColorizer

() { }

RegexLexerColorizer

InternalState

RegexLexerColorizer

GetInternalState

() {

return

internalState

; }

void

RegexLexerColorizer

SetInternalState

(

InternalState

state

) {

internalState

state

; }

void

RegexLexerColorizer

Pass

(

wchar_t

input

) {

WalkOneToken

input

false

); }

vint

RegexLexerColorizer

GetStartState

()

const

{

return

walker

GetStartState

(); }

void

RegexLexerColorizer

CallExtendProcAndColorizeProc

(

const

wchar_t

input

vint

length

RegexProcessingToken

token

bool

colorize

) {

vint

oldTokenLength

token

length

;

proc

extendProc

(

proc

argument

input

token

start

length

token

start

false

token

);

#if _DEBUG

{

bool

pausedAtTheEnd

token

start

token

length

&& !

token

completeToken

;

CHECK_ERROR( token.completeToken || pausedAtTheEnd, L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input." );

{

(!(

token

completeToken

pausedAtTheEnd

))

throw

Error

(

L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed pause before the end of the input."

);}

while

(

);

CHECK_ERROR( token.completeToken || token.token != -1, L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id." );

{

(!(

token

completeToken

token

!= -

))

throw

Error

(

L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause without a valid token id."

);}

while

(

);

CHECK_ERROR( oldTokenLength <= token.length, L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length." );

{

(!(

oldTokenLength

token

length

))

throw

Error

(

L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to decrease the token length."

);}

while

(

);

CHECK_ERROR( (token.interTokenState == nullptr) == !pausedAtTheEnd, L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input." );

{

(!((

token

interTokenState

nullptr

) == !

pausedAtTheEnd

))

throw

Error

(

L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."

);}

while

(

);

}

#endif

((

internalState

interTokenState

token

interTokenState

)) {

internalState

interTokenId

token

; }

(

colorize

) {

proc

colorizeProc

(

proc

argument

token

start

token

length

token

); } }

vint

RegexLexerColorizer

WalkOneToken

(

const

wchar_t

input

vint

length

vint

start

bool

colorize

) {

(

internalState

interTokenState

) {

RegexProcessingToken

token

, -

internalState

interTokenId

false

internalState

interTokenState

);

proc

extendProc

(

proc

argument

input

length

false

token

);

#if _DEBUG

{

bool

pausedAtTheEnd

token

length

&& !

token

completeToken

;

CHECK_ERROR( token.completeToken || pausedAtTheEnd, L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input." );

{

(!(

token

completeToken

pausedAtTheEnd

))

throw

Error

(

L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to pause before the end of the input."

);}

while

(

);

CHECK_ERROR( token.completeToken || token.token == internalState.interTokenId, L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id." );

{

(!(

token

completeToken

token

internalState

interTokenId

))

throw

Error

(

L"RegexLexerColorizer::WalkOneToken(const wchar_t*, vint, vint, bool)#The extendProc is not allowed to continue pausing with a different token id."

);}

while

(

);

{

(!((

token

interTokenState

nullptr

) == !

pausedAtTheEnd

))

throw

Error

(

L"RegexLexerColorizer::Colorize(const wchar_t*, vint, void*)#The extendProc should return an inter token state object if and only if a valid token does not end at the end of the input."

);}

while

(

);

}

#endif

(

colorize

) {

proc

colorizeProc

(

proc

argument

token

length

token

); }

(!(

internalState

interTokenState

token

interTokenState

)) {

internalState

interTokenId

= -

; }

return

token

length

; }

vint

lastFinalStateLength

;

vint

lastFinalStateToken

= -

;

vint

lastFinalStateState

= -

;

vint

tokenStartState

internalState

currentState

;

for

(

vint

start

;

length

;

++) {

vint

currentToken

= -

;

bool

finalState

false

;

bool

previousTokenStop

false

;

walker

Walk

(

input

[

internalState

currentState

currentToken

finalState

previousTokenStop

);

(

previousTokenStop

) {

(

proc

extendProc

lastFinalStateToken

!= -

) {

RegexProcessingToken

token

(

start

lastFinalStateLength

lastFinalStateToken

true

nullptr

);

CallExtendProcAndColorizeProc

(

input

length

token

colorize

);

(

token

completeToken

) {

internalState

currentState

walker

GetStartState

(); }

return

start

token

length

; }

else

(

start

) {

(

tokenStartState

GetStartState

()) {

(

colorize

) {

proc

colorizeProc

(

proc

argument

start

, -

); }

internalState

currentState

walker

GetStartState

();

return

; } }

else

{

(

colorize

) {

proc

colorizeProc

(

proc

argument

start

lastFinalStateLength

lastFinalStateToken

); }

internalState

currentState

lastFinalStateState

;

return

start

lastFinalStateLength

; } }

(

finalState

) {

lastFinalStateLength

start

;

lastFinalStateToken

currentToken

;

lastFinalStateState

internalState

currentState

; } }

(

lastFinalStateToken

!= -

start

lastFinalStateLength

length

) {

(

proc

extendProc

) {

RegexProcessingToken

token

(

start

lastFinalStateLength

lastFinalStateToken

true

nullptr

);

CallExtendProcAndColorizeProc

(

input

length

token

colorize

); }

else

(

colorize

) {

proc

colorizeProc

(

proc

argument

start

lastFinalStateLength

lastFinalStateToken

); } }

else

(

colorize

) {

proc

colorizeProc

(

proc

argument

start

length

start

walker

GetRelatedToken

(

internalState

currentState

)); }

return

length

; }

void

RegexLexerColorizer

Colorize

(

const

wchar_t

input

vint

length

) {

vint

index

;

while

(

index

length

) {

index

WalkOneToken

(

input

length

index

true

); }

return

internalState

interTokenState

; }

/*********************************************************************** RegexLexer ***********************************************************************/

RegexLexer

(

const

collections

IEnumerable

WString

tokens

RegexProc

_proc

) :proc(

_proc

) {

// Build DFA for all tokens

List

Expression

Ref

expressions

;

List

Automaton

Ref

dfas

;

CharRange

List

subsets

;

Ptr

IEnumerator

WString

enumerator

tokens

CreateEnumerator

();

while

(

enumerator

()) {

const

WString

code

enumerator

Current

();

RegexExpression

Ref

regex

ParseRegexExpression

(

code

);

Expression

Ref

expression

regex

Merge

();

expression

CollectCharSet

(

subsets

);

expressions

Add

(

expression

); }

for

(

vint

;

expressions

Count

();

++) {

Dictionary

State

nfaStateMap

;

Group

State

dfaStateMap

;

Expression

Ref

expression

expressions

[

];

expression

ApplyCharSet

(

subsets

);

Automaton

Ref

eNfa

expression

GenerateEpsilonNfa

();

Automaton

Ref

nfa

EpsilonNfaToNfa

(

eNfa

PureEpsilonChecker

nfaStateMap

);

Automaton

Ref

dfa

NfaToDfa

(

nfa

dfaStateMap

);

dfas

Add

(

dfa

); }

// Mark all states in DFAs

for

(

vint

;

dfas

Count

();

++) {

Automaton

Ref

dfa

dfas

[

];

for

(

vint

;

dfa

states

Count

();

++) {

(

dfa

states

[

]

finalState

) {

dfa

states

[

]

userData

= (

void

; }

else

{

dfa

states

[

]

userData

= (

void

dfas

Count

(); } } }

// Connect all DFAs to an e-NFA

Automaton

Ref

bigEnfa

new

Automaton

;

for

(

vint

;

dfas

Count

();

++) {

CopyFrom

(

bigEnfa

states

dfas

[

]

states

);

CopyFrom

(

bigEnfa

transitions

dfas

[

]

transitions

); }

bigEnfa

startState

bigEnfa

NewState

();

for

(

vint

;

dfas

Count

();

++) {

bigEnfa

NewEpsilon

(

bigEnfa

startState

dfas

[

]

startState

); }

// Build a single DFA out of the e-NFA

Dictionary

State

nfaStateMap

;

Group

State

dfaStateMap

;

Automaton

Ref

bigNfa

EpsilonNfaToNfa

(

bigEnfa

PureEpsilonChecker

nfaStateMap

);

for

(

vint

;

nfaStateMap

Keys

().

Count

();

++) {

void

userData

nfaStateMap

Values

().

Get

(

)->

userData

;

nfaStateMap

Keys

()

[

]->

userData

; }

Automaton

Ref

bigDfa

NfaToDfa

(

bigNfa

dfaStateMap

);

for

(

vint

;

dfaStateMap

Keys

().

Count

();

++) {

void

userData

dfaStateMap

GetByIndex

(

Get

(

)->

userData

;

for

(

vint

;

dfaStateMap

GetByIndex

(

Count

();

++) {

void

newData

dfaStateMap

GetByIndex

(

Get

(

)->

userData

;

(

userData

newData

) {

userData

newData

; } }

dfaStateMap

Keys

()

[

]->

userData

; }

// Build state machine

pure

new

PureInterpretor

(

bigDfa

subsets

);

stateTokens

Resize

(

bigDfa

states

Count

());

for

(

vint

;

stateTokens

Count

();

++) {

void

userData

bigDfa

states

[

]

userData

;

stateTokens

[

] = (

vint

)

userData

; } }

RegexLexer

() {

(

pure

)

delete

pure

; }

RegexTokens

RegexLexer

Parse

(

const

WString

code

vint

codeIndex

)

const

{

pure

PrepareForRelatedFinalStateTable

();

return

RegexTokens

(

pure

stateTokens

code

codeIndex

proc

); }

RegexLexerWalker

RegexLexer

Walk

()

const

{

pure

PrepareForRelatedFinalStateTable

();

return

RegexLexerWalker

(

pure

stateTokens

); }

RegexLexerColorizer

RegexLexer

Colorize

()

const

{

return

RegexLexerColorizer

(

Walk

(),

proc

); } } }