feat: enhance RegexParser with group handling and capture functionality

This commit is contained in:
monoid 2025-06-29 14:15:21 +09:00
parent 78472511e7
commit 5536b872b4
3 changed files with 95 additions and 48 deletions

View file

@ -7,8 +7,11 @@ import com.github.h0tk3y.betterParse.lexer.literalToken
import com.github.h0tk3y.betterParse.lexer.regexToken
import com.github.h0tk3y.betterParse.lexer.token
import com.github.h0tk3y.betterParse.parser.Parser
import com.github.h0tk3y.betterParse.grammar.parseToEnd
class RegexParser : Grammar<RegexItem>() {
private var groupCounter = 0
// val bracketContent by regexToken("[^\\]]*")
val escapedCharacter by regexToken("\\\\[+*?.()|\\[\\]]")
val postfixOperator by regexToken("[+*?]")
@ -49,7 +52,13 @@ class RegexParser : Grammar<RegexItem>() {
(dot asJust DotItem()) or
(escapedCharacter map { CharItem(it.text.substring(1)) }) or
(bracketContent map { BracketItem(it.text.substring(1, it.text.length - 1)) }) or
(skip(openParenSymbol) and (parser(::rootParser)) and skip(closeParenSymbol))
(skip(openParenSymbol) and
(parser(::rootParser)) and
skip(closeParenSymbol) map
{
val groupName = "${groupCounter++}"
GroupItem(it, groupName)
})
val term: Parser<RegexItem> by
(item and optional(postfixOperator)) map
@ -76,3 +85,7 @@ class RegexParser : Grammar<RegexItem>() {
override val rootParser: Parser<RegexItem> by termWithAlternation
}
/**
 * Parses the regular-expression source [input] into a [RegexItem] tree.
 *
 * A new [RegexParser] is created for every call, so the parser's internal
 * group counter starts from zero for each compiled pattern.
 *
 * @param input the pattern text to parse; the whole string must be consumed.
 * @return the root item produced by the grammar's root parser.
 */
fun compileRegex(input: String): RegexItem = RegexParser().parseToEnd(input)

View file

@ -3,6 +3,7 @@ package org.example
/**
 * One candidate result of matching a pattern against a string.
 *
 * @property matched the text consumed by the match so far.
 * @property remaining the text left over after [matched].
 * @property captures group name to captured text; empty when nothing was captured.
 */
data class State(
    val matched: String,
    val remaining: String,
    val captures: Map<String, String> = emptyMap(),
)
data class AvailableState(val seq: Sequence<State> = emptySequence()) : Sequence<State> by seq {
@ -38,6 +39,11 @@ fun RegexItem.match(item: String): MatchResult {
return MatchResult(this.findMatch(item))
}
/**
 * Helper that reports whether matching [item] against this pattern succeeds.
 *
 * @param item the text to match.
 * @return the `isSuccess` flag of the result returned by `match`.
 */
fun RegexItem.test(item: String): Boolean = this.match(item).isSuccess
class AndThenItem(val left: RegexItem, val right: RegexItem) : RegexItem {
override fun toString(): String = "${left}${right}"
override fun findMatch(str: String): AvailableState {
@ -54,7 +60,11 @@ class AndThenItem(val left: RegexItem, val right: RegexItem) : RegexItem {
if (!rightMatch.isEmpty) {
// If right match is successful, combine the matched parts
rightMatch.map { rightState ->
State(state.matched + rightState.matched, rightState.remaining)
State(
state.matched + rightState.matched,
rightState.remaining,
state.captures + rightState.captures
) // Combine captures
}
} else {
// If right match fails, return an empty sequence
@ -107,6 +117,21 @@ class BracketItem(val content: String) : RegexItem {
}
}
/**
 * A parenthesised group that matches exactly like its inner [item] and
 * additionally records the matched text under [name] in each state's captures.
 */
class GroupItem(val item: RegexItem, val name: String) : RegexItem {
    override fun toString(): String = "(${item})"

    override fun findMatch(str: String): AvailableState {
        // The group itself adds no matching rules: delegate to the wrapped item.
        val inner = item.findMatch(str)

        // Tag every candidate state with this group's capture. An existing
        // entry under the same name is replaced by the newer matched text.
        val tagged = inner.seq.map { candidate ->
            val updatedCaptures = candidate.captures + (name to candidate.matched)
            State(candidate.matched, candidate.remaining, updatedCaptures)
        }
        return AvailableState(tagged)
    }
}
fun matchMany(
str: String,
item: RegexItem,
@ -191,7 +216,7 @@ class DotItem : RegexItem {
}
class AlternationItem(val left: RegexItem, val right: RegexItem) : RegexItem {
override fun toString(): String = "(${left}|${right})"
override fun toString(): String = "${left}|${right}"
override fun findMatch(str: String): AvailableState {
// Alternation은 왼쪽 또는 오른쪽 항목 중 하나와 매칭되므로, 각각 시도해보고 성공하는 경우를 반환
val leftMatch = left.findMatch(str)

View file

@ -2,21 +2,19 @@ package org.example
import kotlin.test.Test
import kotlin.test.assertEquals
import com.github.h0tk3y.betterParse.grammar.parseToEnd
class ParserTest {
@Test
fun testSimpleCharacter() {
val input = "a"
val result = RegexParser().parseToEnd(input)
assertEquals("a",result.toString())
val result = compileRegex(input)
assertEquals("a", result.toString())
}
@Test
fun testCharacterWithPlus() {
val input = "a+"
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals("a+", result.toString())
assert(result.match("a").isSuccess)
assert(result.match("aa").isSuccess)
@ -25,8 +23,7 @@ class ParserTest {
@Test
fun testCharacterWithStar() {
val input = "b*"
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals("b*", result.toString())
assert(result.match("").isSuccess)
assert(result.match("b").isSuccess)
@ -36,8 +33,7 @@ class ParserTest {
@Test
fun testCharacterWithQuestion() {
val input = "c?"
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals("c?", result.toString())
assert(result.match("").isSuccess)
assert(result.match("c").isSuccess)
@ -46,8 +42,7 @@ class ParserTest {
@Test
fun testDot() {
val input = "."
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals(".", result.toString())
assert(result.match("a").isSuccess)
assert(result.match("1").isSuccess)
@ -57,9 +52,8 @@ class ParserTest {
@Test
fun testAlternation() {
val input = "a|b"
val parser = RegexParser()
val result = parser.parseToEnd(input)
assertEquals("(a|b)", result.toString())
val result = compileRegex(input)
assertEquals("a|b", result.toString())
assert(result.match("a").isSuccess)
assert(result.match("b").isSuccess)
assert(!result.match("c").isSuccess)
@ -67,9 +61,8 @@ class ParserTest {
@Test
fun testParentheses() {
val input = "(d)"
val parser = RegexParser()
val result = parser.parseToEnd(input)
assertEquals("d", result.toString())
val result = compileRegex(input)
assertEquals("(d)", result.toString())
assert(result.match("d").isSuccess)
assert(!result.match("e").isSuccess)
}
@ -77,8 +70,7 @@ class ParserTest {
@Test
fun testComplexExpression() {
val input = "a(b|c)*d+"
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals("a(b|c)*d+", result.toString())
assert(result.match("ad").isSuccess)
assert(!result.match("ab").isSuccess)
@ -90,8 +82,7 @@ class ParserTest {
@Test
fun testAndThen() {
val input = "ab"
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals("ab", result.toString())
assert(result.match("ab").isSuccess)
assert(!result.match("a").isSuccess)
@ -99,9 +90,8 @@ class ParserTest {
}
@Test
fun testDotAndPlus() {
val input = ".+a";
val parser = RegexParser()
val result = parser.parseToEnd(input)
val input = ".+a"
val result = compileRegex(input)
assertEquals(".+a", result.toString())
assert(!result.match("a").isSuccess)
assert(result.match("ba").isSuccess)
@ -110,8 +100,7 @@ class ParserTest {
@Test
fun testEscapedCharacter() {
val input = "\\+"
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals("\\+", result.toString())
assert(result.match("+").isSuccess)
assert(!result.match("a").isSuccess)
@ -119,12 +108,32 @@ class ParserTest {
@Test
fun testBracketContent() {
val input = "[abc]"
val parser = RegexParser()
val result = parser.parseToEnd(input)
val result = compileRegex(input)
assertEquals("[abc]", result.toString())
assert(result.match("a").isSuccess)
assert(result.match("b").isSuccess)
assert(result.match("c").isSuccess)
assert(!result.match("d").isSuccess)
}
// A group nested inside a repeated group: round-trips through toString and
// accepts/rejects the sample inputs below.
@Test
fun testNestedGroups() {
    val pattern = "(a(b|c)d)+"
    val regex = compileRegex(pattern)
    assertEquals("(a(b|c)d)+", regex.toString())

    // Each pair is (candidate text, whether the match is expected to succeed),
    // checked in the listed order.
    val expectations = listOf(
        "ad" to false,
        "abd" to true,
        "acd" to true,
        "a" to false,
    )
    expectations.forEach { (text, shouldMatch) ->
        assert(regex.match(text).isSuccess == shouldMatch)
    }
}
// Two sibling groups: each one's matched text must be retrievable from the
// first available state under the keys "0" and "1".
@Test
fun testCaptureGroups() {
    val regex = compileRegex("(a)(b)")
    assertEquals("(a)(b)", regex.toString())

    val outcome = regex.match("ab")
    assert(outcome.isSuccess)

    // Inspect only the first candidate state produced by the match.
    val firstState = outcome.available.first()
    assertEquals("a", firstState.captures["0"])
    assertEquals("b", firstState.captures["1"])
}
}