我最初在此作为答案发布的版本存在一个问题:只有当存在多个与当前表达式匹配的 Regex 时,它才能正常工作。也就是说,一旦只剩一个 Regex 匹配,它就会立刻返回一个令牌——
而大多数人希望 Regex 是“贪婪的”。对于“带引号的字符串”这类情况尤其如此。
建立在 Regex 之上的唯一解决方案是逐行读取输入(这意味着令牌不能跨越多行)。这一点我可以接受——
毕竟,这只是一个“穷人版”的词法分析器(lexer)!此外,无论如何,能从 Lexer 中获取行号信息通常都很有用。
因此,下面是一个解决了这些问题的新版本。同时也要感谢这个(原文此处的链接已丢失):
/// <summary>
/// Matches a token pattern against the beginning of a piece of text.
/// </summary>
public interface IMatcher
{
    /// <summary>
    /// Return the number of characters that this "regex" or equivalent
    /// matches.
    /// </summary>
    /// <param name="text">The text to be matched</param>
    /// <returns>The number of characters that matched</returns>
    int Match(string text);
}

/// <summary>
/// An <see cref="IMatcher"/> backed by a regular expression that is anchored
/// to the start of the text.
/// </summary>
sealed class RegexMatcher : IMatcher
{
    private readonly Regex regex;

    /// <summary>Builds a matcher for <paramref name="regex"/>.</summary>
    /// <param name="regex">The (unanchored) regular expression pattern.</param>
    public RegexMatcher(string regex)
    {
        // Wrap the pattern in a non-capturing group so the leading '^' applies
        // to every alternative ("^a|b" would anchor only "a"). A non-capturing
        // group leaves numbered back-references such as \1 unchanged.
        this.regex = new Regex("^(?:" + regex + ")");
    }

    /// <summary>
    /// Returns the length of the match at the start of <paramref name="text"/>,
    /// or 0 when the pattern does not match there.
    /// </summary>
    public int Match(string text)
    {
        var m = regex.Match(text);
        return m.Success ? m.Length : 0;
    }

    public override string ToString() => regex.ToString();
}

/// <summary>
/// Associates a regular expression with the token value reported when it matches.
/// </summary>
public sealed class TokenDefinition
{
    /// <summary>The matcher built from the regular expression.</summary>
    public readonly IMatcher Matcher;

    /// <summary>The value exposed as <see cref="Lexer.Token"/> on a match.</summary>
    public readonly object Token;

    /// <param name="regex">Pattern to match at the current input position.</param>
    /// <param name="token">Value reported when the pattern matches.</param>
    public TokenDefinition(string regex, object token)
    {
        this.Matcher = new RegexMatcher(regex);
        this.Token = token;
    }
}

/// <summary>
/// A line-oriented lexer: the input is read one line at a time and, at each
/// position, the first <see cref="TokenDefinition"/> (in array order) that
/// matches a non-empty prefix wins. Tokens therefore cannot span lines.
/// </summary>
public sealed class Lexer : IDisposable
{
    private readonly TextReader reader;
    private readonly TokenDefinition[] tokenDefinitions;

    // Unconsumed tail of the current line; null once the reader is exhausted.
    private string lineRemaining;

    /// <summary>Creates a lexer and reads the first non-empty line.</summary>
    /// <param name="reader">Input source; disposed by <see cref="Dispose"/>.</param>
    /// <param name="tokenDefinitions">Token rules, tried in order.</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public Lexer(TextReader reader, TokenDefinition[] tokenDefinitions)
    {
        this.reader = reader ?? throw new ArgumentNullException(nameof(reader));
        this.tokenDefinitions = tokenDefinitions ?? throw new ArgumentNullException(nameof(tokenDefinitions));
        nextLine();
    }

    // Reads lines until a non-empty one (or end of input) is found, counting
    // every line read and resetting the in-line position.
    private void nextLine()
    {
        do
        {
            lineRemaining = reader.ReadLine();
            ++LineNumber;
            Position = 0;
        } while (lineRemaining != null && lineRemaining.Length == 0);
    }

    /// <summary>
    /// Advances to the next token and publishes it through <see cref="Token"/>
    /// and <see cref="TokenContents"/>.
    /// </summary>
    /// <returns>true when a token was read; false at end of input.</returns>
    /// <exception cref="FormatException">
    /// No token definition matches at the current position.
    /// </exception>
    public bool Next()
    {
        // The previous token consumed the rest of its line. Moving to the next
        // line is deferred until now so that LineNumber and Position kept
        // describing that token after Next() returned it.
        if (lineRemaining != null && lineRemaining.Length == 0)
        {
            nextLine();
        }

        if (lineRemaining == null)
        {
            return false;
        }

        foreach (var def in tokenDefinitions)
        {
            var matched = def.Matcher.Match(lineRemaining);
            if (matched > 0)
            {
                Position += matched;
                Token = def.Token;
                TokenContents = lineRemaining.Substring(0, matched);
                lineRemaining = lineRemaining.Substring(matched);
                return true;
            }
        }

        throw new FormatException(
            $"Unable to match against any tokens at line {LineNumber} position {Position} \"{lineRemaining}\"");
    }

    /// <summary>Text of the most recently matched token.</summary>
    public string TokenContents { get; private set; }

    /// <summary>Token value of the most recently matched definition.</summary>
    public object Token { get; private set; }

    /// <summary>1-based number of the line the lexer is currently on.</summary>
    public int LineNumber { get; private set; }

    /// <summary>Offset in the current line just past the last returned token.</summary>
    public int Position { get; private set; }

    /// <summary>Disposes the underlying reader.</summary>
    public void Dispose() => reader.Dispose();
}
示例程序:
// Sample input. The backslash-escaped quote inside the quoted string
// exercises the escape handling of the QUOTED-STRING rule below.
string sample = @"( one (two 456 -43.2 "" \"" quoted"" ))";

// Rules are tried in order and the first one that matches wins, so FLOAT
// must come before INT, and QUOTED-STRING before everything else.
var defs = new TokenDefinition[]
{
    // Thanks to Steven Levithan for this great quoted string regex:
    // an opening quote, then lazily either a backslash-escaped quote of the
    // same kind or any character, up to the matching closing quote.
    new TokenDefinition(@"([""'])(?:\\\1|.)*?\1", "QUOTED-STRING"),
    // Thanks to http://www.regular-expressions.info/floatingpoint.html
    new TokenDefinition(@"[-+]?\d*\.\d+([eE][-+]?\d+)?", "FLOAT"),
    new TokenDefinition(@"[-+]?\d+", "INT"),
    new TokenDefinition(@"#t", "TRUE"),
    new TokenDefinition(@"#f", "FALSE"),
    new TokenDefinition(@"[*<>\?\-+/A-Za-z->!]+", "SYMBOL"),
    new TokenDefinition(@"\.", "DOT"),
    new TokenDefinition(@"\(", "LEFT"),
    new TokenDefinition(@"\)", "RIGHT"),
    new TokenDefinition(@"\s", "SPACE")
};

// The lexer disposes the reader it is given, so disposing the lexer is enough.
using (var lexer = new Lexer(new StringReader(sample), defs))
{
    while (lexer.Next())
    {
        Console.WriteLine("Token: {0} Contents: {1}", lexer.Token, lexer.TokenContents);
    }
}
输出:
Token: LEFT Contents: (
Token: SPACE Contents:
Token: SYMBOL Contents: one
Token: SPACE Contents:
Token: LEFT Contents: (
Token: SYMBOL Contents: two
Token: SPACE Contents:
Token: INT Contents: 456
Token: SPACE Contents:
Token: FLOAT Contents: -43.2
Token: SPACE Contents:
Token: QUOTED-STRING Contents: " \" quoted"
Token: SPACE Contents:
Token: RIGHT Contents: )
Token: RIGHT Contents: )



