From 5536b872b4863a5e1c269b94ba05e34e7861a093 Mon Sep 17 00:00:00 2001 From: monoid Date: Sun, 29 Jun 2025 14:15:21 +0900 Subject: [PATCH] feat: enhance RegexParser with group handling and capture functionality --- lib/src/main/kotlin/org/example/Parser.kt | 15 ++++- lib/src/main/kotlin/org/example/RegexItem.kt | 61 ++++++++++++----- lib/src/test/kotlin/org/example/ParserTest.kt | 67 +++++++++++-------- 3 files changed, 95 insertions(+), 48 deletions(-) diff --git a/lib/src/main/kotlin/org/example/Parser.kt b/lib/src/main/kotlin/org/example/Parser.kt index bceac9f..c97c5cc 100644 --- a/lib/src/main/kotlin/org/example/Parser.kt +++ b/lib/src/main/kotlin/org/example/Parser.kt @@ -7,8 +7,11 @@ import com.github.h0tk3y.betterParse.lexer.literalToken import com.github.h0tk3y.betterParse.lexer.regexToken import com.github.h0tk3y.betterParse.lexer.token import com.github.h0tk3y.betterParse.parser.Parser +import com.github.h0tk3y.betterParse.grammar.parseToEnd class RegexParser : Grammar() { + private var groupCounter = 0 + // val bracketContent by regexToken("[^\\]]*") val escapedCharacter by regexToken("\\\\[+*?.()|\\[\\]]") val postfixOperator by regexToken("[+*?]") @@ -49,7 +52,13 @@ class RegexParser : Grammar() { (dot asJust DotItem()) or (escapedCharacter map { CharItem(it.text.substring(1)) }) or (bracketContent map { BracketItem(it.text.substring(1, it.text.length - 1)) }) or - (skip(openParenSymbol) and (parser(::rootParser)) and skip(closeParenSymbol)) + (skip(openParenSymbol) and + (parser(::rootParser)) and + skip(closeParenSymbol) map + { + val groupName = "${groupCounter++}" + GroupItem(it, groupName) + }) val term: Parser by (item and optional(postfixOperator)) map @@ -76,3 +85,7 @@ class RegexParser : Grammar() { override val rootParser: Parser by termWithAlternation } + +fun compileRegex(input: String): RegexItem { + return RegexParser().parseToEnd(input) +} \ No newline at end of file diff --git a/lib/src/main/kotlin/org/example/RegexItem.kt b/lib/src/main/kotlin/org/example/RegexItem.kt index a85fff0..ac6500a 100644 --- a/lib/src/main/kotlin/org/example/RegexItem.kt +++ b/lib/src/main/kotlin/org/example/RegexItem.kt @@ -3,6 +3,7 @@ package org.example data class State( val matched: String, val remaining: String, + val captures: Map = emptyMap() ) data class AvailableState(val seq: Sequence = emptySequence()) : Sequence by seq { @@ -38,6 +39,11 @@ fun RegexItem.match(item: String): MatchResult { return MatchResult(this.findMatch(item)) } +fun RegexItem.test(item: String): Boolean { + // 매칭 결과가 성공인지 확인하는 헬퍼 함수 + return this.match(item).isSuccess +} + class AndThenItem(val left: RegexItem, val right: RegexItem) : RegexItem { override fun toString(): String = "${left}${right}" override fun findMatch(str: String): AvailableState { @@ -54,7 +60,11 @@ class AndThenItem(val left: RegexItem, val right: RegexItem) : RegexItem { if (!rightMatch.isEmpty) { // If right match is successful, combine the matched parts rightMatch.map { rightState -> - State(state.matched + rightState.matched, rightState.remaining) + State( + state.matched + rightState.matched, + rightState.remaining, + state.captures + rightState.captures + ) // Combine captures } } else { // If right match fails, return an empty sequence @@ -66,21 +76,21 @@ class AndThenItem(val left: RegexItem, val right: RegexItem) : RegexItem { } class CharItem(val value: String) : RegexItem { - override fun toString(): String = - // escape 특수 문자를 처리하여 출력 - when (value) { - "+" -> "\\+" - "*" -> "\\*" - "?" -> "\\?" - "." -> "\\." - "(" -> "\\(" - ")" -> "\\)" - "|" -> "\\|" - "[" -> "\\[" - "]" -> "\\]" - else -> value // 일반 문자 그대로 반환 - } - + override fun toString(): String = + // escape 특수 문자를 처리하여 출력 + when (value) { + "+" -> "\\+" + "*" -> "\\*" + "?" -> "\\?" + "." -> "\\." + "(" -> "\\(" + ")" -> "\\)" + "|" -> "\\|" + "[" -> "\\[" + "]" -> "\\]" + else -> value // 일반 문자 그대로 반환 + } + override fun findMatch(str: String): AvailableState { return when { // 첫번째 문자가 value와 일치하는지 확인 @@ -94,7 +104,7 @@ class CharItem(val value: String) : RegexItem { class BracketItem(val content: String) : RegexItem { override fun toString(): String = "[$content]" - + // TODO: 범위 처리 override fun findMatch(str: String): AvailableState { // 대괄호 안의 내용과 일치하는 첫 문자를 찾음 @@ -107,6 +117,21 @@ class BracketItem(val content: String) : RegexItem { } } +class GroupItem(val item: RegexItem, val name: String) : RegexItem { + override fun toString(): String = "(${item})" + + override fun findMatch(str: String): AvailableState { + // 그룹은 내부 아이템과 동일하게 매칭을 시도 + val ret = item.findMatch(str) + // 매칭된 상태에 그룹 이름을 추가하여 반환 + return AvailableState( + ret.seq.map { state -> + State(state.matched, state.remaining, state.captures + (name to state.matched)) + } + ) + } +} + fun matchMany( str: String, item: RegexItem, @@ -191,7 +216,7 @@ class DotItem : RegexItem { } class AlternationItem(val left: RegexItem, val right: RegexItem) : RegexItem { - override fun toString(): String = "(${left}|${right})" + override fun toString(): String = "${left}|${right}" override fun findMatch(str: String): AvailableState { // Alternation은 왼쪽 또는 오른쪽 항목 중 하나와 매칭되므로, 각각 시도해보고 성공하는 경우를 반환 val leftMatch = left.findMatch(str) diff --git a/lib/src/test/kotlin/org/example/ParserTest.kt b/lib/src/test/kotlin/org/example/ParserTest.kt index 9183754..0c10bcb 100644 --- a/lib/src/test/kotlin/org/example/ParserTest.kt +++ b/lib/src/test/kotlin/org/example/ParserTest.kt @@ -2,21 +2,19 @@ package org.example import kotlin.test.Test import kotlin.test.assertEquals -import com.github.h0tk3y.betterParse.grammar.parseToEnd class ParserTest { @Test fun testSimpleCharacter() { val input = "a" - val result = RegexParser().parseToEnd(input) - assertEquals("a",result.toString()) + val result = compileRegex(input) + assertEquals("a", result.toString()) } @Test fun testCharacterWithPlus() { val input = "a+" - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals("a+", result.toString()) assert(result.match("a").isSuccess) assert(result.match("aa").isSuccess) @@ -25,8 +23,7 @@ class ParserTest { @Test fun testCharacterWithStar() { val input = "b*" - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals("b*", result.toString()) assert(result.match("").isSuccess) assert(result.match("b").isSuccess) @@ -36,8 +33,7 @@ class ParserTest { @Test fun testCharacterWithQuestion() { val input = "c?" - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals("c?", result.toString()) assert(result.match("").isSuccess) assert(result.match("c").isSuccess) @@ -46,8 +42,7 @@ class ParserTest { @Test fun testDot() { val input = "." - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals(".", result.toString()) assert(result.match("a").isSuccess) assert(result.match("1").isSuccess) @@ -57,9 +52,8 @@ class ParserTest { @Test fun testAlternation() { val input = "a|b" - val parser = RegexParser() - val result = parser.parseToEnd(input) - assertEquals("(a|b)", result.toString()) + val result = compileRegex(input) + assertEquals("a|b", result.toString()) assert(result.match("a").isSuccess) assert(result.match("b").isSuccess) assert(!result.match("c").isSuccess) @@ -67,9 +61,8 @@ class ParserTest { @Test fun testParentheses() { val input = "(d)" - val parser = RegexParser() - val result = parser.parseToEnd(input) - assertEquals("d", result.toString()) + val result = compileRegex(input) + assertEquals("(d)", result.toString()) assert(result.match("d").isSuccess) assert(!result.match("e").isSuccess) } @@ -77,8 +70,7 @@ class ParserTest { @Test fun testComplexExpression() { val input = "a(b|c)*d+" - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals("a(b|c)*d+", result.toString()) assert(result.match("ad").isSuccess) assert(!result.match("ab").isSuccess) @@ -90,8 +82,7 @@ class ParserTest { @Test fun testAndThen() { val input = "ab" - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals("ab", result.toString()) assert(result.match("ab").isSuccess) assert(!result.match("a").isSuccess) @@ -99,9 +90,8 @@ class ParserTest { } @Test fun testDotAndPlus() { - val input = ".+a"; - val parser = RegexParser() - val result = parser.parseToEnd(input) + val input = ".+a" + val result = compileRegex(input) assertEquals(".+a", result.toString()) assert(!result.match("a").isSuccess) assert(result.match("ba").isSuccess) @@ -110,8 +100,7 @@ class ParserTest { @Test fun testEscapedCharacter() { val input = "\\+" - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals("\\+", result.toString()) assert(result.match("+").isSuccess) assert(!result.match("a").isSuccess) @@ -119,12 +108,32 @@ class ParserTest { @Test fun testBracketContent() { val input = "[abc]" - val parser = RegexParser() - val result = parser.parseToEnd(input) + val result = compileRegex(input) assertEquals("[abc]", result.toString()) assert(result.match("a").isSuccess) assert(result.match("b").isSuccess) assert(result.match("c").isSuccess) assert(!result.match("d").isSuccess) } -} \ No newline at end of file + @Test + fun testNestedGroups() { + val input = "(a(b|c)d)+" + val result = compileRegex(input) + assertEquals("(a(b|c)d)+", result.toString()) + assert(!result.match("ad").isSuccess) + assert(result.match("abd").isSuccess) + assert(result.match("acd").isSuccess) + assert(!result.match("a").isSuccess) + } + @Test + fun testCaptureGroups() { + val input = "(a)(b)" + val result = compileRegex(input) + assertEquals("(a)(b)", result.toString()) + val matchResult = result.match("ab") + assert(matchResult.isSuccess) + val captures = matchResult.available.first(); + assertEquals("a", captures.captures.get("0")) + assertEquals("b", captures.captures.get("1")) + } +}