Regex.h

/*********************************************************************** Author: Zihan Chen (vczh) Licensed under https://github.com/vczh-libraries/License ***********************************************************************/

#ifndef VCZH_REGEX_REGEX

#define VCZH_REGEX_REGEX

#include <Vlpp.h>

namespace

{

// Forward declarations only: the public regex classes in this header refer to
// these implementation types by pointer, so their definitions are not needed here.
namespace regex_internal
{
    // Types named after the "pure" mode of the regular expression engine.
    class PureResult;
    class PureInterpretor;

    // Types named after the "rich" mode of the regular expression engine.
    class RichResult;
    class RichInterpretor;
}

namespace

regex

{

/*********************************************************************** Data Structure ***********************************************************************/

/// <summary>A sub string of the string that a <see cref="Regex"/> is matched against.</summary>

class

RegexString

public

Object

{

protected

WString

value

;

vint

start

;

vint

length

;

public

RegexString

(

vint

_start

);

RegexString

(

const

WString

_string

vint

_start

vint

_length

);

/// <summary>The position of the input string in characters.</summary>

/// <returns>The position.</returns>

vint

Start

()

const

;

/// <summary>The size of the sub string in characters.</summary>

/// <returns>The size.</returns>

vint

Length

()

const

;

/// <summary>Get the sub string as a <see cref="WString"/>.</summary>

/// <returns>The sub string.</returns>

const

WString

Value

()

const

;

bool

operator

(

const

RegexString

string

)

const

; };

/// <summary>A match produces by a <see cref="Regex"/>.</summary>

class

RegexMatch

public

Object

private

NotCopyable

{

friend

class

Regex

;

public

typedef

Ptr

RegexMatch

Ref

;

typedef

collections

List

Ref

List

;

typedef

collections

List

RegexString

CaptureList

;

typedef

collections

Group

WString

RegexString

CaptureGroup

;

protected

collections

List

RegexString

captures

;

collections

Group

WString

RegexString

groups

;

bool

success

;

RegexString

result

;

RegexMatch

(

const

WString

_string

regex_internal

PureResult

_result

);

RegexMatch

(

const

WString

_string

regex_internal

RichResult

_result

regex_internal

RichInterpretor

_rich

);

RegexMatch

(

const

RegexString

_result

);

public

/// <summary>

/// Test if this match is a succeeded match or a failed match.

/// A failed match will only appear when calling [M:vl.regex.Regex.Split] or [M:vl.regex.Regex.Cut].

/// In other cases, failed matches are either not included in the result.

/// </summary>

/// <returns>Returns true if this match is a succeeded match.</returns>

bool

Success

()

const

;

/// <summary>Get the matched sub string.</summary>

/// <returns>The matched sub string.</returns>

const

RegexString

Result

()

const

;

/// <summary>Get all sub strings that are captured anonymously.</summary>

/// <returns>All sub strings that are captured anonymously.</returns>

/// <example><![CDATA[

/// int main()

/// {

/// Regex regex(L"^/.*?((?C/S+)(/.*?))+$");

/// auto match = regex.MatchHead(L"C++ and C# are my favorite programing languages");

/// FOREACH(RegexString, capture, match->Captures())

/// {

/// Console::WriteLine(capture.Value());

/// }

/// ]]></example>

const

CaptureList

Captures

()

const

;

/// <summary>Get all sub strings that are captured by named groups.</summary>

/// <returns>All sub strings that are captured by named groups.</returns>

/// <example><![CDATA[

/// int main()

/// {

/// Regex regex(L"^/.*?((<lang>C/S+)(/.*?))+$");

/// auto match = regex.MatchHead(L"C++ and C# are my favorite programing languages");

/// FOREACH(RegexString, capture, match->Groups().Get(L"lang"))

/// {

/// Console::WriteLine(capture.Value());

/// }

/// ]]></example>

const

CaptureGroup

Groups

()

const

; };

/*********************************************************************** Regex ***********************************************************************/

/// <summary>

///

/// Regular Expression. Here is a brief description of the regular expression grammar.

///

///

/// <ul>

/// <li>

/// Charset:

/// <ul>

/// <li>a, [a-z], [^a-z]</li>

/// </ul>

/// </li>

/// <li>

/// Functional characters:

/// <ul>

/// <li>^: the beginning of the input (DFA incompatible)</li>

/// <li>$: the end of the input (DFA incompatible)</li>

/// <li>regex1|regex2: match either regex1 or regex2</li>

/// </ul>

/// </li>

/// <li>

/// Escaping (both \ and / mean the next character is escaped):

/// <ul>

/// <li>

/// Escaped characters:

/// <ul>

/// <li>\r: the CR character</li>

/// <li>\n: the LF character</li>

/// <li>\t: the tab character</li>

/// <li>\s: spacing characters (including space, \r, \n, \t)</li>

/// <li>\S: non-spacing characters</li>

/// <li>\d: [0-9]</li>

/// <li>\D: [^0-9]</li>

/// <li>\l: [a-zA-Z]</li>

/// <li>\L: [^a-zA-Z]</li>

/// <li>\w: [a-zA-Z0-9_]</li>

/// <li>\W: [^a-zA-Z0-9_]</li>

/// <li>\.: any character (this is the main different from other regex, which treat "." as any characters and "\." as the dot character)</li>

/// <li>\\, \/, $, $, \+, \*, \?, \{, \}, \[, \], \<, \>, \^, \$, \!, \=: represents itself</li>

/// </ul>

/// </li>

/// <li>

/// Escaped characters in charset defined in a square bracket:

/// <ul>

/// <li>\r: the CR character</li>

/// <li>\n: the LF character</li>

/// <li>\t: the tab character</li>

/// <li>\-, \[, \], \\, \/, \^, \$: represents itself</li>

/// </ul>

/// </li>

/// </ul>

/// </li>

/// <li>

/// Loops:

/// <ul>

/// <li>regex{3}: repeats 3 times</li>

/// <li>regex{3,}: repeats 3 or more times</li>

/// <li>regex{1,3}: repeats 1 to 3 times</li>

/// <li>regex?: repeats 0 or 1 times</li>

/// <li>regex*: repeats 0 or more times</li>

/// <li>regex+: repeats 1 or more times</li>

/// </ul>

/// if you add an additional ? right after a loop, it means repeating as less as possible (DFA incompatible)

/// </li>

/// <li>

/// Capturing: (DFA incompatible)

/// <ul>

/// <li>(regex): No capturing, just change the operators' association</li>

/// <li>(?regex): Capture matched fragment</li>

/// <li>(<name>regex): Capture matched fragment in a named group called "name"</li>

/// <li>(<$i>): Match the i-th captured fragment, begins from 0</li>

/// <li>(<$name;i>): Match the i-th captured fragment in the named group called "name", begins from 0</li>

/// <li>(<$name>): Match any captured fragment in the named group called "name"</li>

/// </ul>

/// </li>

/// <li>

/// MISC

/// <ul>

/// <li>(=regex): The prefix of the following text should match the regex, but it is not counted in the whole match (DFA incompatible)</li>

/// <li>(!regex): Any prefix of the following text should not match the regex, and it is not counted in the whole match (DFA incompatible)</li>

/// <li>(<#name>regex): Name the regex "name", and it applies here</li>

/// <li>(<&name>): Copy the named regex "name" here and apply</li>

/// </ul>

/// </li>

/// </ul>

///

///

/// The regular expression has pupre mode and rich mode.

/// Pure mode means the regular expression is driven by a DFA, while the rich mode is not.

///

///

/// The regular expression can test a string instead of matching.

/// Testing only returns a bool very indicating success or failure.

///

/// </summary>

class

Regex

public

Object

private

NotCopyable

{

protected

regex_internal

PureInterpretor

pure

nullptr

;

regex_internal

RichInterpretor

rich

nullptr

;

void

Process

(

const

WString

text

bool

keepEmpty

bool

keepSuccess

bool

keepFail

RegexMatch

List

matches

)

const

;

public

/// <summary>Create a regular expression. It will crash if the regular expression produces syntax error.</summary>

/// <param name="code">The regular expression in a string.</param>

/// <param name="preferPure">Set to true to use DFA if possible.</param>

Regex

(

const

WString

code

bool

preferPure

true

);

Regex

();

/// <summary>Test is a DFA used to match a string.</summary>

/// <returns>Returns true if a DFA is used.</returns>

bool

IsPureMatch

()

const

;

/// <summary>Test is a DFA used to test a string. It ignores all capturing.</summary>

/// <returns>Returns true if a DFA is used.</returns>

bool

IsPureTest

()

const

;

/// <summary>Match a prefix of the text.</summary>

/// <returns>Returns the match. Returns null if failed.</returns>

/// <param name="text">The text to match.</param>

/// <example><![CDATA[

/// int main()

/// {

/// Regex regex(L"C/S+");

/// auto match = regex.MatchHead(L"C++ and C# are my favorite programing languages");

/// Console::WriteLine(match->Result().Value());

/// }

/// ]]></example>

RegexMatch

Ref

MatchHead

(

const

WString

text

)

const

;

/// <summary>Match a sub string of the text.</summary>

/// <returns>Returns the first match. Returns null if failed.</returns>

/// <param name="text">The text to match.</param>

/// <example><![CDATA[

/// int main()

/// {

/// Regex regex(L"C/S+");

/// auto match = regex.Match(L"C++ and C# are my favorite programing languages");

/// Console::WriteLine(match->Result().Value());

/// }

/// ]]></example>

RegexMatch

Ref

Match

(

const

WString

text

)

const

;

/// <summary>Match a prefix of the text, ignoring all capturing.</summary>

/// <returns>Returns true if it succeeded.</returns>

/// <param name="text">The text to match.</param>

bool

TestHead

(

const

WString

text

)

const

;

/// <summary>Match a sub string of the text, ignoring all capturing.</summary>

/// <returns>Returns true if succeeded.</returns>

/// <param name="text">The text to match.</param>

bool

Test

(

const

WString

text

)

const

;

/// <summary>Find all matched fragments in the given text, returning all matched sub strings.</summary>

/// <param name="text">The text to match.</param>

/// <param name="matches">Returns all succeeded matches.</param>

/// <example><![CDATA[

/// int main()

/// {

/// Regex regex(L"C/S+");

/// RegexMatch::List matches;

/// regex.Search(L"C++ and C# are my favorite programing languages", matches);

/// FOREACH(Ptr<RegexMatch>, match, matches)

/// {

/// Console::WriteLine(match->Result().Value());

/// }

/// ]]></example>

void

(

const

WString

text

RegexMatch

List

matches

)

const

;

/// <summary>Split the text by matched sub strings, returning all unmatched sub strings.</summary>

/// <param name="text">The text to match.</param>

/// <param name="keepEmptyMatch">Set to true to keep all empty unmatched sub strings. This could happen when there is nothing between two matched sub strings.</param>

/// <param name="matches">Returns all failed matches.</param>

/// <example><![CDATA[

/// int main()

/// {

/// Regex regex(L"C/S+");

/// RegexMatch::List matches;

/// regex.Split(L"C++ and C# are my favorite programing languages", false, matches);

/// FOREACH(Ptr<RegexMatch>, match, matches)

/// {

/// Console::WriteLine(match->Result().Value());

/// }

/// ]]></example>

void

Split

(

const

WString

text

bool

keepEmptyMatch

RegexMatch

List

matches

)

const

;

/// <summary>Cut the text by matched sub strings, returning all matched and unmatched sub strings.</summary>

/// <param name="text">The text to match.</param>

/// <param name="keepEmptyMatch">Set to true to keep all empty matches. This could happen when there is nothing between two matched sub strings.</param>

/// <param name="matches">Returns all succeeded and failed matches.</param>

/// <example><![CDATA[

/// int main()

/// {

/// Regex regex(L"C/S+");

/// RegexMatch::List matches;

/// regex.Cut(L"C++ and C# are my favorite programing languages", false, matches);

/// FOREACH(Ptr<RegexMatch>, match, matches)

/// {

/// Console::WriteLine(match->Result().Value());

/// }

/// ]]></example>

void

Cut

(

const

WString

text

bool

keepEmptyMatch

RegexMatch

List

matches

)

const

; };

/*********************************************************************** Tokenizer ***********************************************************************/

/// <summary>A token.</summary>
struct RegexToken
{
    /// <summary>Position in the input string in characters.</summary>
    vint start;

    /// <summary>Size of this token in characters.</summary>
    vint length;

    /// <summary>The token id, begins at 0, represents the regular expression in the list (the first argument in the constructor of <see cref="RegexLexer"/>) that matches this token. -1 means this token is produced by an error.</summary>
    vint token;

    /// <summary>The pointer to where this token starts in the input string.</summary>
    /// <remarks>This pointer comes from a <see cref="WString"/> that used to be analyzed. You should keep a variable to that string alive, so that to keep this pointer alive.</remarks>
    const wchar_t* reading;

    /// <summary>The "codeIndex" argument from [M:vl.regex.RegexLexer.Parse].</summary>
    vint codeIndex;

    /// <summary>True if this token is complete. False if this token does not end here. This could happen when colorizing a text line by line.</summary>
    bool completeToken;

    /// <summary>Row number of the first character, begins at 0.</summary>
    vint rowStart;

    /// <summary>Column number of the first character, begins at 0.</summary>
    vint columnStart;

    /// <summary>Row number of the last character, begins at 0.</summary>
    vint rowEnd;

    /// <summary>Column number of the last character, begins at 0.</summary>
    vint columnEnd;

    /// <summary>Compare this token with another token.</summary>
    /// <returns>Returns true if the two tokens are considered equal.</returns>
    /// <param name="_token">The token to compare with.</param>
    bool operator==(const RegexToken& _token) const;

    /// <summary>Compare this token with a text.</summary>
    /// <returns>Returns true if this token is considered equal to the text.</returns>
    /// <param name="_token">The text to compare with. NOTE(review): presumably zero-terminated, as no length is passed; confirm.</param>
    bool operator==(const wchar_t* _token) const;
};

/// <summary>Token information for <see cref="RegexProc::extendProc"/>.</summary>
struct RegexProcessingToken
{
    /// <summary>
    /// The read only start position of the token.
    /// This value will be -1 if <see cref="interTokenState"/> is not null.
    /// </summary>
    const vint start;

    /// <summary>
    /// The length of the token, allowing to be updated by the callback.
    /// When the callback returns, the length is not allowed to be decreased.
    /// This value will be -1 if <see cref="interTokenState"/> is not null.
    /// </summary>
    vint length;

    /// <summary>
    /// The id of the token, allowing to be updated by the callback.
    /// </summary>
    vint token;

    /// <summary>
    /// The flag indicating if this token is completed, allowing to be updated by the callback.
    /// </summary>
    bool completeToken;

    /// <summary>
    /// The inter token state object, allowing to be updated by the callback.
    /// When the callback returns:
    /// <ul>
    ///     <li>if the completeText parameter is true in <see cref="RegexProc::extendProc"/>, it should be nullptr.</li>
    ///     <li>if the token does not end at the end of the input, it should not be nullptr.</li>
    ///     <li>if a token is completed in one attempt of extending, it should be nullptr.</li>
    /// </ul>
    /// </summary>
    void* interTokenState;

    /// <summary>Create a token by giving a value to every field.</summary>
    /// <param name="_start">The value for <see cref="start"/>.</param>
    /// <param name="_length">The value for <see cref="length"/>.</param>
    /// <param name="_token">The value for <see cref="token"/>.</param>
    /// <param name="_completeToken">The value for <see cref="completeToken"/>.</param>
    /// <param name="_interTokenState">The value for <see cref="interTokenState"/>.</param>
    RegexProcessingToken(vint _start, vint _length, vint _token, bool _completeToken, void* _interTokenState)
        : start(_start)
        , length(_length)
        , token(_token)
        , completeToken(_completeToken)
        , interTokenState(_interTokenState)
    {
    }
};

// Function pointer type of a callback that receives an inter token state object as an untyped pointer.
using RegexInterTokenStateDeleter = void(*)(void* interTokenState);

using

RegexTokenExtendProc

void

(*)(

void

* argument,

const

wchar_t

* reading,

vint

length,

bool

completeText,

RegexProcessingToken

& processingToken);

using

RegexTokenColorizeProc

void

(*)(

void

* argument,

vint

start,

vint

length,

vint

token);

/// <summary>Callback procedures</summary>
struct RegexProc
{
    /// <summary>
    /// The deleter which deletes <see cref="RegexProcessingToken::interTokenState"/> created by <see cref="extendProc"/>.
    /// This callback is not called automatically.
    /// It is here to make the maintenance convenient for the caller.
    /// </summary>
    RegexInterTokenStateDeleter deleter = nullptr;

    /// <summary>
    /// The token extend callback. It is called after recognizing any token, and run a customized procedure to modify the token based on the given context.
    /// If the length parameter is -1, it means the caller does not measure the incoming text buffer, which automatically indicates that the buffer is null-terminated.
    /// If the length parameter is not -1, it means the number of available characters in the buffer.
    /// The completeText parameter could be true or false. When it is false, it means that the buffer does not contain all the text.
    /// </summary>
    /// <remarks>
    /// This is very useful to recognize any token that cannot be expressed using a regular expression.
    /// For example, a C++ literal string R"tag(the content)tag".
    /// It is recommended to add a token for R"tag(,
    /// and then use this extend proc to search for a )tag" to complete the token.
    ///
    /// Important:
    /// when colorizing a text line by line,
    /// a cross-line token could be incomplete at the end of the line.
    /// Because a given buffer ends at the end of that line,
    /// the extend proc is not able to know right now about what is going on in the future.
    /// Here is what <see cref="RegexProcessingToken::interTokenState"/> is designed for,
    /// the extend proc can store anything it wants using that pointer.
    ///
    /// The caller can get this pointer from the return value of <see cref="RegexLexerColorizer::Colorize"/>.
    /// This pointer is only available for cross-line tokens, it is obvious that one line produces at most one such pointer.
    /// Then the caller keeps calling that function to walk through the whole string.
    /// When the return value is changed, the pointer is no longer used, and it can be deleted by calling <see cref="deleter"/> manually.
    ///
    /// The first argument is <see cref="argument"/>.
    ///
    /// The second argument is a pointer to the buffer of the first character in this token.
    /// If the previous token is incomplete, then the buffer begins at the first character of the new buffer.
    ///
    /// The third argument is the length of the recognized token in characters.
    ///
    /// The fourth argument indicates if the token is completed.
    /// Even if a token is completed, but the extend proc found that, the extend exceeds the end of the buffer,
    /// then it can update the value to make it incomplete.
    ///
    /// The fifth contains the context for this token. Fields except "start" are allowed to be updated by the extend proc.
    /// </remarks>
    /// <example><![CDATA[
    /// int main()
    /// {
    ///     List<WString> tokenDefs;
    ///     tokenDefs.Add(L"/d+");
    ///     tokenDefs.Add(L"[a-zA-Z_]/w*");
    ///     tokenDefs.Add(L"\"([^\"/\\]|/\\/.)*\"");
    ///     tokenDefs.Add(L"R\"[^(]*/(");
    ///     tokenDefs.Add(L"[(){};]");
    ///     tokenDefs.Add(L"/s+");
    ///     tokenDefs.Add(L"///*+([^//*]|/*+[^//])*/*+//");
    ///
    ///     const wchar_t* lines[] = {
    ///         L"/*********************",
    ///         L"MAIN.CPP",
    ///         L"*********************/",
    ///         L"",
    ///         L"int main()",
    ///         L"{",
    ///         L" printf(\"This is a \\\"simple\\\" text.\");",
    ///         L" printf(R\"____(This is a",
    ///         L"\"multiple lined\"",
    ///         L"literal text)____\");",
    ///         L" return 0;",
    ///         L"}",
    ///     };
    ///
    ///     struct Argument
    ///     {
    ///         // for a real colorizer, you can put a color buffer here.
    ///         // the buffer is reused for every line of code.
    ///         // but for the demo, I put the current processing text instead.
    ///         // so that I am able to print what is processed.
    ///         const wchar_t* processingText = nullptr;
    ///     } argument;
    ///
    ///     struct InterTokenState
    ///     {
    ///         WString postfix;
    ///     };
    ///
    ///     RegexProc proc;
    ///     proc.argument = &argument;
    ///     proc.colorizeProc = [](void* argument, vint start, vint length, vint token)
    ///     {
    ///         // this is guaranteed by "proc.argument = &argument;"
    ///         auto text = reinterpret_cast<Argument*>(argument)->processingText;
    ///         Console::WriteLine(itow(token) + L": <" + WString(text + start, length) + L">");
    ///     };
    ///     proc.deleter = [](void* interTokenState)
    ///     {
    ///         delete reinterpret_cast<InterTokenState*>(interTokenState);
    ///     };
    ///     proc.extendProc = [](void* argument, const wchar_t* reading, vint length, bool completeText, RegexProcessingToken& processingToken)
    ///     {
    ///         // 3 is R"[^(]*/(
    ///         // 7 is not used in tokenDefs, it is occupied to represent an extended literal string
    ///         if (processingToken.token == 3 || processingToken.token == 7)
    ///         {
    ///             // for calling wcsstr, create a buffer that is zero terminated
    ///             WString readingBuffer = length == -1 ? WString(reading, false) : WString(reading, length);
    ///             reading = readingBuffer.Buffer();
    ///
    ///             // get the postfix, which is )____" in this case
    ///             WString postfix;
    ///             if (processingToken.interTokenState)
    ///             {
    ///                 postfix = reinterpret_cast<InterTokenState*>(processingToken.interTokenState)->postfix;
    ///             }
    ///             else
    ///             {
    ///                 postfix = L")" + WString(reading + 2, processingToken.length - 3) + L"\"";
    ///             }
    ///
    ///             // try to find if the postfix, which is )____" in this case, appear in the given buffer
    ///             auto find = wcsstr(reading, postfix.Buffer());
    ///             if (find)
    ///             {
    ///                 // if we find the postfix, it means we find the end of the literal string
    ///                 // here processingToken.token automatically becomes 7
    ///                 // interTokenState needs to be nullptr to indicate this
    ///                 processingToken.length = (vint)(find - reading) + postfix.Length();
    ///                 processingToken.completeToken = true;
    ///                 processingToken.interTokenState = nullptr;
    ///             }
    ///             else
    ///             {
    ///                 // if we don't find the postfix, it means the end of the literal string is in future lines
    ///                 // we need to set the token to 7, which is the real token id for literal strings
    ///                 // since we change any token from 3 to 7, 3 will never be passed to colorizeProc in "token" argument
    ///                 processingToken.length = readingBuffer.Length();
    ///                 processingToken.token = 7;
    ///                 processingToken.completeToken = false;
    ///
    ///                 // we need to ensure that interTokenState is not nullptr, and we can save the postfix here
    ///                 if (!completeText && !processingToken.interTokenState)
    ///                 {
    ///                     auto state = new InterTokenState;
    ///                     state->postfix = postfix;
    ///                     processingToken.interTokenState = state;
    ///                 }
    ///             }
    ///         }
    ///     };
    ///
    ///     RegexLexer lexer(tokenDefs, proc);
    ///     RegexLexerColorizer colorizer = lexer.Colorize();
    ///
    ///     void* lastInterTokenState = nullptr;
    ///     FOREACH_INDEXER(const wchar_t*, line, index, From(lines))
    ///     {
    ///         Console::WriteLine(L"Begin line " + itow(index));
    ///         argument.processingText = line;
    ///         void* interTokenState = colorizer.Colorize(line, wcslen(line));
    ///
    ///         if (lastInterTokenState && lastInterTokenState != interTokenState)
    ///         {
    ///             // call the deleter manually
    ///             proc.deleter(lastInterTokenState);
    ///         }
    ///         lastInterTokenState = interTokenState;
    ///
    ///         argument.processingText = nullptr;
    ///         colorizer.Pass(L'\r');
    ///         colorizer.Pass(L'\n');
    ///         Console::WriteLine(L"");
    ///     }
    /// }
    /// ]]></example>
    RegexTokenExtendProc extendProc = nullptr;

    /// <summary>
    /// The colorizer callback. It is called when a token is recognized.
    ///
    /// The first argument is <see cref="argument"/>.
    ///
    /// The second argument is the position of the first character of the token in characters.
    ///
    /// The third argument is the length of the recognized token in characters.
    ///
    /// The fourth argument is the index of the regular expression in the list (the first argument in the constructor of <see cref="RegexLexer"/>) that matches this token.
    /// </summary>
    RegexTokenColorizeProc colorizeProc = nullptr;

    /// <summary>
    /// The argument object that is the first argument for <see cref="extendProc"/> and <see cref="colorizeProc"/>.
    /// </summary>
    void* argument = nullptr;
};

/// <summary>Token collection representing the result from the lexical analyzer. Call <see cref="RegexLexer::Parse"/> to create this object.</summary>

/// <example><![CDATA[

/// int main()

/// {

/// List<WString> tokenDefs;

/// tokenDefs.Add(L"/d+");

/// tokenDefs.Add(L"/w+");

/// tokenDefs.Add(L"/s+");

///

/// RegexLexer lexer(tokenDefs, {});

/// WString input = L"I have 2 books.";

/// auto tokenResult = lexer.Parse(input);

///

/// FOREACH(RegexToken, token, tokenResult)

/// {

/// // input must be in a variable

/// // because token.reading points to a position from input.Buffer();

/// Console::WriteLine(itow(token.token) + L": <" + WString(token.reading, token.length) + L">");

/// }

/// ]]></example>

class

RegexTokens

public

Object

public

collections

IEnumerable

RegexToken

> {

friend

class

RegexLexer

;

protected

regex_internal

PureInterpretor

pure

;

const

collections

Array

vint

stateTokens

;

WString

code

;

vint

codeIndex

;

RegexProc

proc

;

RegexTokens

(

regex_internal

PureInterpretor

_pure

const

collections

Array

vint

_stateTokens

const

WString

_code

vint

_codeIndex

RegexProc

_proc

);

public

RegexTokens

(

const

RegexTokens

tokens

);

RegexTokens

();

collections

IEnumerator

RegexToken

CreateEnumerator

()

const

;

/// <summary>Copy all tokens.</summary>

/// <param name="tokens">Returns all tokens.</param>

/// <param name="discard">A callback to decide which kind of tokens to discard. The input is [F:vl.regex.RegexToken.token]. Returns true to discard this kind of tokens.</param>

/// <example><![CDATA[

/// int main()

/// {

/// List<WString> tokenDefs;

/// tokenDefs.Add(L"/d+");

/// tokenDefs.Add(L"/w+");

/// tokenDefs.Add(L"/s+");

///

/// RegexLexer lexer(tokenDefs, {});

/// WString input = L"I have 2 books.";

/// auto tokenResult = lexer.Parse(input);

///

/// List<RegexToken> filtered;

/// tokenResult.ReadToEnd(filtered, [](vint token) { return token < 0 || token == 2; });

///

/// FOREACH(RegexToken, token, tokenResult)

/// {

/// // input must be in a variable

/// // because token.reading points to a position from input.Buffer();

/// Console::WriteLine(itow(token.token) + L": <" + WString(token.reading, token.length) + L">");

/// }

/// ]]></example>

void

ReadToEnd

(

collections

List

RegexToken

tokens

bool

discard

)(

vint

)

const

; };

/// <summary>A type for walking through a text against a <see cref="RegexLexer"/>. Call <see cref="RegexLexer::Walk"/> to create this object.</summary>

/// <example><![CDATA[

/// int main()

/// {

/// List<WString> tokenDefs;

/// tokenDefs.Add(L"/d+./d+");

/// tokenDefs.Add(L"/d+");

/// tokenDefs.Add(L"/w+");

/// tokenDefs.Add(L"/s+");

///

/// RegexLexer lexer(tokenDefs, {});

/// RegexLexerWalker walker = lexer.Walk();

///

/// WString input = L"This book costs 2.5. That book costs 2.";

/// const wchar_t* reading = input.Buffer();

///

/// const wchar_t* tokenBegin = reading;

/// const wchar_t* tokenEnd = nullptr;

/// vint tokenId = -1;

///

/// vint state = walker.GetStartState();

/// while (*reading)

/// {

/// vint token = -1;

/// bool finalState = false;

/// bool previousTokenStop = false;

/// walker.Walk(*reading++, state, token, finalState, previousTokenStop);

///

/// if (previousTokenStop || !*reading)

/// {

/// if (tokenEnd)

/// {

/// if (tokenBegin == tokenEnd)

/// {

/// Console::WriteLine(L"Recognized token: " + itow(tokenId) + L": <" + WString(*tokenBegin) + L">");

/// tokenBegin = reading;

/// tokenEnd = nullptr;

/// tokenId = -1;

/// state = walker.GetStartState();

/// }

/// else

/// {

/// Console::WriteLine(L"Recognized token: " + itow(tokenId) + L": <" + WString(tokenBegin, tokenEnd - tokenBegin) + L">");

/// tokenBegin = reading = tokenEnd;

/// tokenEnd = nullptr;

/// tokenId = -1;

/// state = walker.GetStartState();

/// }

/// else

/// {

/// Console::WriteLine(L"Unrecognized character: <" + WString(*tokenBegin) + L">");

/// tokenBegin++;

/// state = walker.GetStartState();

/// }

/// else if (finalState)

/// {

/// tokenEnd = reading;

/// tokenId = token;

/// }

/// ]]></example>

class

RegexLexerWalker

public

Object

{

friend

class

RegexLexer

;

protected

regex_internal

PureInterpretor

pure

;

const

collections

Array

vint

stateTokens

;

RegexLexerWalker

(

regex_internal

PureInterpretor

_pure

const

collections

Array

vint

_stateTokens

);

public

RegexLexerWalker

(

const

RegexLexerWalker

tokens

);

RegexLexerWalker

();

/// <summary>Get the start DFA state number, which represents the correct state before parsing any input.</summary>

/// <returns>The DFA state number.</returns>

/// <remarks>When calling <see cref="Walk"/> for the first character, the return value should be passed to the second parameter.</remarks>

vint

GetStartState

()

const

;

/// <summary>Test if this state can only lead to the end of one kind of token.</summary>

/// <returns>Returns the token index if this state can only lead to the end of one kind of token. Returns -1 if not.</returns>

/// <param name="state">The DFA state number.</param>

vint

GetRelatedToken

(

vint

state

)

const

;

/// <summary>Step forward by one character.</summary>

/// <param name="input">The input character.</param>

/// <param name="state">The current state. Returns the new current state when this function returns.</param>

/// <param name="token">Returns the token index at the end of the token.</param>

/// <param name="finalState">Returns true if it reach the end of the token.</param>

/// <param name="previousTokenStop">Returns true if the previous character is the end of the token.</param>

/// <remarks>

///

/// The "finalState" argument is important.

/// When "previousTokenStop" becomes true,

/// it tells you that this character can no longer form a token with previous consumed characters.

/// But it does not mean that the recognized token ends at the previous token.

/// The recognized token could end eariler,

/// which is indiated at the last time when "finalState" becomes true.

///

///

/// See the example for <see cref="RegexLexerWalker"/> about how to use this function.

///

/// </remarks>

void

Walk

(

wchar_t

input

vint

state

vint

token

bool

finalState

bool

previousTokenStop

)

const

;

/// <summary>Step forward by one character.</summary>

/// <returns>Returns the new current state. It is used to walk the next character.</returns>

/// <param name="input">The input character.</param>

/// <param name="state">The current state.</param>

vint

Walk

(

wchar_t

input

vint

state

)

const

;

/// <summary>Test if the input text is a closed token.</summary>

/// <returns>Returns true if the input text is a closed token.</returns>

/// <param name="input">The input text.</param>

/// <param name="length">Size of the input text in characters.</param>

/// <remarks>

///

/// A closed token means that,

/// there is a prefix that is a recognized token.

/// At the same time, the input string itself could not be a token, or a prefix of any token.

/// the recognized token has ended before reaching the end of the string.

///

///

/// An unrecognized token is also considered as closed.

///

///

/// For example, assume we have a token defined by "/d+./d+":

/// <ul>

/// <li>"2" is not a closed token, because it has not ended.</li>

/// <li>

/// "2.5." is a closed token, because it has ended at "2.5",

/// and "2.5." could never be a prefix of any token,

/// unless we have another token defined by "/d+./d+./d+".

/// </li>

/// </ul>

///

/// </remarks>

/// <example><![CDATA[

/// int main()

/// {

/// List<WString> tokenDefs;

/// tokenDefs.Add(L"/d+./d+");

/// tokenDefs.Add(L"/d+");

///

/// RegexLexer lexer(tokenDefs, {});

/// RegexLexerWalker walker = lexer.Walk();

///

/// WString tests[] = { L".", L"2", L"2.", L"2.5", L"2.5." };

/// FOREACH(WString, test, From(tests))

/// {

/// if (walker.IsClosedToken(test.Buffer(), test.Length()))

/// {

/// Console::WriteLine(test + L" is a closed token.");

/// }

/// else

/// {

/// Console::WriteLine(test + L" is not a closed token.");

/// }

/// ]]></example>

bool

IsClosedToken

(

const

wchar_t

input

vint

length

)

const

;

/// <summary>Test if the input is a closed token.</summary>

/// <returns>Returns true if the input text is a closed token.</returns>

/// <param name="input">The input text.</param>

/// <remarks>

/// <p>

/// A closed token means that

/// there is a prefix that is a recognized token.

/// At the same time, the input string itself could not be a token, or a prefix of any token.

/// In other words, the recognized token has ended before reaching the end of the string.

/// </p>

/// <p>

/// An unrecognized token is also considered as closed.

/// </p>

/// <p>

/// For example, assume we have a token defined by "/d+./d+":

/// <ul>

///     <li>"2" is not a closed token, because it has not ended.</li>

///     <li>

///         "2.5." is a closed token, because it has ended at "2.5",

///         and "2.5." could never be a prefix of any token,

///         unless we have another token defined by "/d+./d+./d+".

///     </li>

/// </ul>

/// </p>

/// </remarks>

/// <example><![CDATA[

/// int main()

/// {

///     List<WString> tokenDefs;

///     tokenDefs.Add(L"/d+./d+");

///     tokenDefs.Add(L"/d+");

///

///     RegexLexer lexer(tokenDefs, {});

///     RegexLexerWalker walker = lexer.Walk();

///

///     WString tests[] = { L".", L"2", L"2.", L"2.5", L"2.5." };

///     FOREACH(WString, test, From(tests))

///     {

///         if (walker.IsClosedToken(test))

///         {

///             Console::WriteLine(test + L" is a closed token.");

///         }

///         else

///         {

///             Console::WriteLine(test + L" is not a closed token.");

///         }

///     }

/// }

/// ]]></example>

bool

IsClosedToken

(

const

WString

input

)

const

; };

/// <summary>Lexical colorizer. Call <see cref="RegexLexer::Colorize"/> to create this object.</summary>

/// <example><![CDATA[

/// int main()

/// {

///     List<WString> tokenDefs;

///     tokenDefs.Add(L"/d+");

///     tokenDefs.Add(L"[a-zA-Z_]/w*");

///     tokenDefs.Add(L"[(){};]");

///     tokenDefs.Add(L"/s+");

///     tokenDefs.Add(L"///*+([^//*]|/*+[^//])*/*+//");

///

///     const wchar_t* lines[] = {

///         L"/*********************",

///         L"MAIN.CPP",

///         L"*********************/",

///         L"",

///         L"int main()",

///         L"{",

///         L"    return 0;",

///         L"}",

///     };

///

///     struct Argument

///     {

///         // for a real colorizer, you can put a color buffer here.

///         // the buffer is reused for every line of code.

///         // but for the demo, I put the current processing text instead.

///         // so that I am able to print what is processed.

///         const wchar_t* processingText = nullptr;

///     } argument;

///

///     RegexProc proc;

///     proc.argument = &argument;

///     proc.colorizeProc = [](void* argument, vint start, vint length, vint token)

///     {

///         // this is guaranteed by "proc.argument = &argument;"

///         auto text = reinterpret_cast<Argument*>(argument)->processingText;

///         Console::WriteLine(itow(token) + L": <" + WString(text + start, length) + L">");

///     };

///

///     RegexLexer lexer(tokenDefs, proc);

///     RegexLexerColorizer colorizer = lexer.Colorize();

///

///     FOREACH_INDEXER(const wchar_t*, line, index, From(lines))

///     {

///         Console::WriteLine(L"Begin line " + itow(index));

///         argument.processingText = line;

///         colorizer.Colorize(line, wcslen(line));

///

///         argument.processingText = nullptr;

///         colorizer.Pass(L'\r');

///         colorizer.Pass(L'\n');

///         Console::WriteLine(L"");

///     }

/// }

/// ]]></example>

class

RegexLexerColorizer

public

Object

{

// RegexLexer is a friend so that it can reach the protected constructor below.

friend

class

RegexLexer

;

public

/// <summary>The state of a colorizer, as returned by <see cref="GetInternalState"/> and accepted by <see cref="SetInternalState"/>.</summary>

struct

InternalState

{

// NOTE(review): presumably the DFA state number the colorizer is currently in (see GetStartState);

// the initializer reads "= -" in this listing, likely -1 -- confirm.

vint

currentState

= -

;

// NOTE(review): interTokenId / interTokenState presumably track a token that is still open

// at the end of the processed text (see RegexProcessingToken::interTokenState) -- confirm.

vint

interTokenId

= -

;

void

interTokenState

nullptr

; };

protected

// NOTE(review): walker and proc presumably hold the arguments of the protected constructor -- confirm.

RegexLexerWalker

walker

;

RegexProc

proc

;

// The value exchanged through GetInternalState / SetInternalState is of this type.

InternalState

internalState

;

// NOTE(review): judging by the name, this invokes extendProc of RegexProc and,

// when colorize is true, colorizeProc for the given token -- confirm in the implementation.

void

CallExtendProcAndColorizeProc

(

const

wchar_t

input

vint

length

RegexProcessingToken

token

bool

colorize

);

// NOTE(review): presumably processes a single token of input beginning at start;

// the meaning of the returned vint is not visible from this declaration -- confirm.

vint

WalkOneToken

(

const

wchar_t

input

vint

length

vint

start

bool

colorize

);

// Not public: a colorizer is created by calling RegexLexer::Colorize.

RegexLexerColorizer

(

const

RegexLexerWalker

_walker

RegexProc

_proc

);

public

/// <summary>Copy a colorizer.</summary>

/// <param name="colorizer">The colorizer to copy.</param>

RegexLexerColorizer

(

const

RegexLexerColorizer

colorizer

);

// NOTE(review): declared without arguments right after the copy constructor;

// presumably the destructor whose leading '~' is missing from this listing -- confirm.

RegexLexerColorizer

();

/// <summary>Get the internal state.</summary>

/// <returns>The internal state.</returns>

/// <remarks>

/// <p>

/// If <see cref="Colorize"/> has not been called, the return value of this function is the start state.

/// </p>

/// <p>

/// If a text is multi-lined, <see cref="Colorize"/> could be called line by line, and the internal state is changed.

/// </p>

/// <p>

/// In order to colorize another piece of multi-lined text,

/// you can either save the start state and call <see cref="SetInternalState"/> to reset the state,

/// or call <see cref="RegexLexer::Colorize"/> for a new colorizer.

/// </p>

/// </remarks>

InternalState

GetInternalState

();

/// <summary>Restore the colorizer to a specified state.</summary>

/// <param name="state">The state to restore.</param>

void

SetInternalState

(

InternalState

state

);

/// <summary>Step forward by one character.</summary>

/// <param name="input">The input character.</param>

/// <remarks>Callbacks in <see cref="RegexProc"/>, which is the second argument of the constructor of <see cref="RegexLexer"/>, will be called, except colorizeProc.</remarks>

void

Pass

(

wchar_t

input

);

/// <summary>Get the start DFA state number, which represents the correct state before colorizing any characters.</summary>

/// <returns>The DFA state number.</returns>

vint

GetStartState

()

const

;

/// <summary>Colorize a text.</summary>

/// <returns>An inter token state at the end of this line. It could be the same object that was returned from the previous call.</returns>

/// <param name="input">The text to colorize.</param>

/// <param name="length">Size of the text in characters.</param>

/// <remarks>

/// See <see cref="RegexProcessingToken::interTokenState"/> and <see cref="RegexProc::extendProc"/> for more information about the return value.

/// Callbacks in <see cref="RegexProc"/>, which is the second argument of the constructor of <see cref="RegexLexer"/>, will be called.

/// </remarks>

// NOTE(review): the return type reads "void" in this listing although a return value is documented;

// presumably "void*" -- confirm.

void

Colorize

(

const

wchar_t

input

vint

length

); };

/// <summary>Lexical analyzer.</summary>

class

RegexLexer

public

Object

private

NotCopyable

{

protected

// NOTE(review): presumably the interpretor built from the token definitions

// (a pointer, given the nullptr initializer) -- confirm ownership in the implementation.

regex_internal

PureInterpretor

pure

nullptr

;

// NOTE(review): ids and stateTokens are lookup tables of vint; their exact

// indexing is not visible from this declaration -- confirm in the implementation.

collections

Array

vint

ids

;

collections

Array

vint

stateTokens

;

// NOTE(review): presumably the callback configuration passed to the constructor as _proc -- confirm.

RegexProc

proc

;

public

/// <summary>Create a lexical analyzer by a set of regular expressions. [F:vl.regex.RegexToken.token] will be the index of the matched regular expression in the first argument.</summary>

/// <param name="tokens">All regular expressions, each one represents a kind of token.</param>

/// <param name="_proc">Configuration of all callbacks.</param>

RegexLexer

(

const

collections

IEnumerable

WString

tokens

RegexProc

_proc

);

// NOTE(review): declared without arguments right after the constructor;

// presumably the destructor whose leading '~' is missing from this listing -- confirm.

RegexLexer

();

/// <summary>Tokenize an input text.</summary>

/// <returns>All tokens, including both recognized and unrecognized tokens. For unrecognized tokens, [F:vl.regex.RegexToken.token] will be -1.</returns>

/// <param name="code">The text to tokenize.</param>

/// <param name="codeIndex">Extra information that will be copied to [F:vl.regex.RegexToken.codeIndex].</param>

/// <remarks>Callbacks in <see cref="RegexProc"/>, which is the second argument of the constructor of <see cref="RegexLexer"/>, will be called when iterating through tokens.</remarks>

RegexTokens

Parse

(

const

WString

code

vint

codeIndex

)

const

;

/// <summary>Create an equivalent walker from this lexical analyzer. A walker enables you to walk through characters one by one.</summary>

/// <returns>The walker.</returns>

RegexLexerWalker

Walk

()

const

;

/// <summary>Create an equivalent colorizer from this lexical analyzer.</summary>

/// <returns>The colorizer.</returns>

RegexLexerColorizer

Colorize

()

const

; }; } }

#endif