Compare commits

...

10 commits

3 changed files with 329 additions and 198 deletions

View file

@ -2,16 +2,45 @@ package org.example
import com.github.h0tk3y.betterParse.combinators.* import com.github.h0tk3y.betterParse.combinators.*
import com.github.h0tk3y.betterParse.grammar.Grammar import com.github.h0tk3y.betterParse.grammar.Grammar
import com.github.h0tk3y.betterParse.grammar.parseToEnd
import com.github.h0tk3y.betterParse.grammar.parser import com.github.h0tk3y.betterParse.grammar.parser
import com.github.h0tk3y.betterParse.lexer.literalToken import com.github.h0tk3y.betterParse.lexer.literalToken
import com.github.h0tk3y.betterParse.lexer.regexToken import com.github.h0tk3y.betterParse.lexer.regexToken
import com.github.h0tk3y.betterParse.lexer.token
import com.github.h0tk3y.betterParse.parser.Parser import com.github.h0tk3y.betterParse.parser.Parser
class RegexParser : Grammar<RegexItem>() { class RegexParser : Grammar<RegexItem>() {
private var groupCounter = 0
val escapedCharacter by regexToken("\\\\[+*?.$^()|\\[\\]]")
val postfixOperator by regexToken("[+*?]") val postfixOperator by regexToken("[+*?]")
val anchorOperator by regexToken("[$^]")
val alternationSymbol by literalToken("|") val alternationSymbol by literalToken("|")
val openParenSymbol by literalToken("(") val openParenSymbol by literalToken("(")
val closeParenSymbol by literalToken(")") val closeParenSymbol by literalToken(")")
/**
 * Token covering a whole bracket expression such as `[abc]`, both brackets included.
 *
 * The matcher returns the number of characters consumed starting at `from`, or 0 to signal
 * "no match" (no leading '[', no unescaped closing ']', or an empty "[]").
 */
val bracketContent by
    token(
        name = "bracketContent",
        matcher = { seq, from ->
            if (seq[from] != '[') {
                0 // Does not start with '[': not a bracket expression.
            } else {
                // Look for the first ']' that is not escaped. A ']' is escaped only when it is
                // preceded by an ODD number of backslashes; an even run (e.g. "\\\\]") is a
                // sequence of escaped backslashes followed by the real closing bracket.
                var to = seq.indexOf(']', from)
                while (to >= 0) {
                    var backslashes = 0
                    var i = to - 1
                    while (i > from && seq[i] == '\\') {
                        backslashes++
                        i--
                    }
                    if (backslashes % 2 == 0) break
                    to = seq.indexOf(']', to + 1)
                }
                when {
                    to < 0 -> 0 // No closing bracket found.
                    to == from + 1 -> 0 // Empty brackets are not allowed.
                    else -> to - from + 1 // Length from '[' through ']' inclusive.
                }
            }
        }
    )
val dot by literalToken(".") val dot by literalToken(".")
val charToken by regexToken("[a-zA-Z0-9]") val charToken by regexToken("[a-zA-Z0-9]")
@ -20,24 +49,27 @@ class RegexParser : Grammar<RegexItem>() {
val item: Parser<RegexItem> by val item: Parser<RegexItem> by
char or char or
(anchorOperator map { AnchorItem(it.text) }) or
(dot asJust DotItem()) or (dot asJust DotItem()) or
(skip(openParenSymbol) and (parser(::rootParser)) and skip(closeParenSymbol)) (escapedCharacter map { CharItem(it.text.substring(1)) }) or
(bracketContent map { BracketItem(it.text.substring(1, it.text.length - 1)) }) or
(skip(openParenSymbol) and
(parser(::rootParser)) and
skip(closeParenSymbol) map
{
val groupName = "${groupCounter++}"
GroupItem(it, groupName)
})
val term: Parser<RegexItem> by val term: Parser<RegexItem> by
(item and optional(postfixOperator)) map (item and optional(postfixOperator)) map { (item, op) ->
{ result -> when (op?.text) {
result.t1.let { first -> "+" -> PlusItem(item)
result.t2?.let { "*" -> StarItem(item)
when (it.text) { "?" -> QuestionItem(item)
"+" -> PlusItem(first) else -> item
"*" -> StarItem(first) }
"?" -> QuestionItem(first) }
else -> first
}
}
?: first
}
}
val andThen: Parser<RegexItem> by val andThen: Parser<RegexItem> by
oneOrMore(term) map { items -> items.reduce { left, right -> AndThenItem(left, right) } } oneOrMore(term) map { items -> items.reduce { left, right -> AndThenItem(left, right) } }
val termWithAlternation: Parser<RegexItem> by val termWithAlternation: Parser<RegexItem> by
@ -48,3 +80,7 @@ class RegexParser : Grammar<RegexItem>() {
override val rootParser: Parser<RegexItem> by termWithAlternation override val rootParser: Parser<RegexItem> by termWithAlternation
} }
/**
 * Parses [input] with a fresh [RegexParser] and returns the resulting [RegexItem] tree.
 */
fun compileRegex(input: String): RegexItem = RegexParser().parseToEnd(input)

View file

@ -1,175 +1,208 @@
package org.example package org.example
data class State( data class State(
val matched: String, val input: String,
val remaining: String, val startIndex: Int,
) val endIndex: Int,
val captures: Map<String, String> = emptyMap()
data class AvailableState(val seq: Sequence<State> = emptySequence()) : Sequence<State> by seq { ) {
val isEmpty: Boolean val matched: String
get() = seq.none() get() = input.substring(startIndex, endIndex)
val remaining: String
get() = input.substring(endIndex)
} }
class MatchResult(val available: AvailableState) { typealias AvailableState = Sequence<State>
val isSuccess: Boolean // 재귀 하향 분석기.
get() = !this.available.isEmpty // 백트랙킹 기반.
interface RegexItem {
override fun toString(): String
fun findMatch(str: String, position: Int = 0): AvailableState
}
val isFailure: Boolean fun RegexItem.findAll(item: String): Sequence<State> {
get() = this.available.isEmpty return sequence {
var position = 0
// 문자열의 끝까지 반복합니다. 비어있어도 한번은 시도합니다.
while (position <= item.length) {
// findMatch 메서드를 호출하여 매칭을 시도합니다.
val matchResult = findMatch(item, position).firstOrNull()
if (matchResult == null) {
// 매칭이 실패하면 position을 증가시키고 다시 시도합니다.
position++
continue
}
// 매칭이 성공하면 MatchResult를 생성하여 반환합니다.
yield(matchResult)
override fun toString(): String { // 다음 위치로 이동합니다.
return if (isSuccess) { position =
"MatchResult(success)" if (matchResult.startIndex == matchResult.endIndex) {
} else { position + 1
"MatchResult(failure)" } else {
matchResult.endIndex
}
} }
} }
} }
// 재귀 하향 분석기. fun RegexItem.find(item: String): State? {
interface RegexItem { // findAll 에서 첫 번째 매칭 결과를 반환합니다.
override fun toString(): String return this.findAll(item).firstOrNull()
fun findMatch(str: String): AvailableState
} }
fun RegexItem.match(item: String): MatchResult { fun RegexItem.containsMatchIn(item: String): Boolean {
// 기본 매칭 함수. AvailableState를 MatchResult로 변환 // 매칭 결과가 성공인지 확인하는 헬퍼 함수
return MatchResult(this.findMatch(item)) return this.find(item) != null
} }
class AndThenItem(val left: RegexItem, val right: RegexItem) : RegexItem { class AndThenItem(val left: RegexItem, val right: RegexItem) : RegexItem {
override fun toString(): String = "${left}${right}" override fun toString(): String = "${left}${right}"
override fun findMatch(str: String): AvailableState { override fun findMatch(str: String, position: Int): AvailableState {
val leftMatch = left.findMatch(str) val leftMatch = left.findMatch(str, position)
if (leftMatch.isEmpty) { return leftMatch.flatMap { leftState ->
return AvailableState() // If left match fails, return empty sequence right.findMatch(str, leftState.endIndex).map { rightState ->
// If right match is successful, combine the matched parts
State(
str,
leftState.startIndex,
rightState.endIndex,
leftState.captures + rightState.captures // Combine captures
)
}
} }
// If left match is successful, try to match the right item with the remaining string
// from the left match.
return AvailableState(
leftMatch.flatMap { state ->
val rightMatch = right.findMatch(state.remaining)
if (!rightMatch.isEmpty) {
// If right match is successful, combine the matched parts
rightMatch.map { rightState ->
State(state.matched + rightState.matched, rightState.remaining)
}
} else {
// If right match fails, return an empty sequence
emptySequence()
}
}
)
} }
} }
class CharItem(val value: String) : RegexItem { class CharItem(val value: String) : RegexItem {
override fun toString(): String = value companion object {
override fun findMatch(str: String): AvailableState { private val META_CHARS = setOf("\\", "+", "*", "?", ".", "(", ")", "|", "[", "]", "^", "$")
}
override fun toString(): String =
if (value in META_CHARS) "\\$value" else value
override fun findMatch(str: String, position: Int): AvailableState {
return when { return when {
// 첫번째 문자가 value와 일치하는지 확인 // 첫번째 문자가 value와 일치하는지 확인
str.isNotEmpty() && str[0].toString() == value -> { position < str.length && str[position].toString() == value -> {
AvailableState(sequenceOf(State(value, str.substring(1)))) sequenceOf(State(str, position, position + 1))
} }
else -> AvailableState() else -> emptySequence() // 일치하지 않으면 빈 시퀀스 반환
} }
} }
} }
fun matchMany( class BracketItem(val content: String) : RegexItem {
str: String, override fun toString(): String = "[$content]"
item: RegexItem,
): Sequence<State> { // TODO: 범위 처리
override fun findMatch(str: String, position: Int): AvailableState {
// 대괄호 안의 내용과 일치하는 첫 문자를 찾음
return when {
position < str.length && content.contains(str[position]) -> {
sequenceOf(State(str, position, position + 1))
}
else -> emptySequence() // 일치하지 않으면 빈 시퀀스 반환
}
}
}
/**
 * A parenthesised group around [item].
 *
 * Matching is delegated to [item]; each resulting state additionally records the text it
 * matched in its captures under [name].
 */
class GroupItem(val item: RegexItem, val name: String) : RegexItem {
    override fun toString(): String = "($item)"

    override fun findMatch(str: String, position: Int): AvailableState =
        item.findMatch(str, position).map { inner ->
            // Keep the captures of the wrapped item and add this group's own entry.
            inner.copy(captures = inner.captures + (name to inner.matched))
        }
}
/**
 * A zero-width anchor: "^" matches only at index 0, "$" only at the end of the input.
 *
 * @throws IllegalArgumentException from [findMatch] when [anchor] is neither "^" nor "$".
 */
class AnchorItem(val anchor: String) : RegexItem {
    override fun toString(): String = anchor

    override fun findMatch(str: String, position: Int): AvailableState {
        // The single index at which this anchor is allowed to succeed.
        val anchoredAt =
            when (anchor) {
                "^" -> 0
                "$" -> str.length
                // No other anchors are supported.
                else -> throw IllegalArgumentException("Unknown anchor: $anchor")
            }
        return if (position == anchoredAt) {
            sequenceOf(State(str, anchoredAt, anchoredAt))
        } else {
            emptySequence() // Wrong position: the anchor fails.
        }
    }
}
fun matchMany(str: String, item: RegexItem, position: Int): Sequence<State> {
// 욕심쟁이 매칭을 위한 헬퍼 함수 // 욕심쟁이 매칭을 위한 헬퍼 함수
return item.findMatch(str).seq.flatMap { state -> return item.findMatch(str, position).flatMap { state ->
if (state.remaining.isEmpty()) { if (state.endIndex == str.length) {
sequenceOf(state) // If remaining is empty, return the matched state sequenceOf(state) // If remaining is empty, return the matched state
} else { } else {
// Otherwise, continue matching with the remaining string // Otherwise, continue matching with the remaining string
matchMany(state.remaining, item).map { nextState -> matchMany(str, item, state.endIndex).map { nextState ->
State(state.matched + nextState.matched, nextState.remaining) State(str, state.startIndex, nextState.endIndex)
} + sequenceOf(state) // Include the current state as well } + sequenceOf(state) // Include the current state as well
} }
} }
} }
// fun matchMany(str: String, item: RegexItem): Sequence<State> = sequence {
// val stack = mutableListOf(item.findMatch(str).seq)
// while (stack.isNotEmpty()) {
// val current = stack.removeAt(stack.lastIndex)
// for (state in current) {
// yield(state) // Yield the current state
// if (state.remaining.isNotEmpty()) {
// // If there is remaining string, continue matching
// stack.add(item.findMatch(state.remaining).seq.map { nextState ->
// State(state.matched + nextState.matched, nextState.remaining)
// })
// }
// }
// }
// }
class PlusItem(val item: RegexItem) : RegexItem { class PlusItem(val item: RegexItem) : RegexItem {
override fun toString(): String = "${item}+" override fun toString(): String = "${item}+"
override fun findMatch(str: String): AvailableState { override fun findMatch(str: String, position: Int): AvailableState {
return AvailableState(matchMany(str, item)) return matchMany(str, item, position)
} }
} }
class StarItem(val item: RegexItem) : RegexItem { class StarItem(val item: RegexItem) : RegexItem {
override fun toString(): String = "${item}*" override fun toString(): String = "${item}*"
override fun findMatch(str: String): AvailableState { override fun findMatch(str: String, position: Int): AvailableState {
// *는 0개 이상의 매칭을 의미하므로, 먼저 시도해보고 실패하면 빈 시퀀스를 반환 // *는 0번 또는 1번 이상 일치합니다.
val matchResult = this.item.findMatch(str) // 욕심쟁이(greedy) 방식으로 구현하기 위해, 가장 긴 매치(1번 이상)를 먼저 찾고, 그 다음에 0번 매치를 추가합니다.
if (matchResult.isEmpty) { val oneOrMoreMatches = matchMany(str, item, position)
// If the item does not match, return an empty sequence val zeroMatch = sequenceOf(State(str, position, position))
return AvailableState(sequenceOf(State("", str))) return oneOrMoreMatches + zeroMatch
}
// If it matches, return the successful match and continue matching with the remaining string
return AvailableState(
matchResult.flatMap { state ->
sequenceOf(state) + matchMany(state.remaining, this.item)
}
)
} }
} }
class QuestionItem(val item: RegexItem) : RegexItem { class QuestionItem(val item: RegexItem) : RegexItem {
override fun toString(): String = "${item}?" override fun toString(): String = "${item}?"
override fun findMatch(str: String): AvailableState { override fun findMatch(str: String, position: Int): AvailableState {
// ?는 0개 또는 1개 매칭을 의미하므로, 먼저 시도해보고 실패하면 빈 시퀀스를 반환 // ?는 0번 또는 1번 일치합니다.
val matchResult = this.item.findMatch(str) val oneMatch = item.findMatch(str, position)
if (matchResult.isEmpty) { val zeroMatch = sequenceOf(State(str, position, position))
// If the item does not match, return an empty sequence // 1번 매치를 0번 매치보다 우선합니다.
return AvailableState(sequenceOf(State("", str))) return oneMatch + zeroMatch
}
// If it matches, return the successful match
return AvailableState(matchResult.map { State(it.matched, it.remaining) })
} }
} }
class DotItem : RegexItem { class DotItem : RegexItem {
override fun toString(): String = "." override fun toString(): String = "."
override fun findMatch(str: String): AvailableState = override fun findMatch(str: String, position: Int): AvailableState =
// .은 임의의 한 문자와 매칭되므로, 첫 문자가 존재하면 매칭 성공 // .은 임의의 한 문자와 매칭되므로, 첫 문자가 존재하면 매칭 성공
when { when {
str.isNotEmpty() -> position < str.length -> sequenceOf(State(str, position, position + 1))
AvailableState(sequenceOf(State(str[0].toString(), str.substring(1)))) else -> emptySequence() // 빈 문자열에 대해서는 매칭 실패
else -> AvailableState() // 빈 문자열에 대해서는 매칭 실패 }
}
} }
class AlternationItem(val left: RegexItem, val right: RegexItem) : RegexItem { class AlternationItem(val left: RegexItem, val right: RegexItem) : RegexItem {
override fun toString(): String = "(${left}|${right})" override fun toString(): String = "${left}|${right}"
override fun findMatch(str: String): AvailableState { override fun findMatch(str: String, position: Int): AvailableState {
// Alternation은 왼쪽 또는 오른쪽 항목 중 하나와 매칭되므로, 각각 시도해보고 성공하는 경우를 반환 // Alternation은 왼쪽 또는 오른쪽 항목 중 하나와 매칭되므로, 각각 시도해보고 성공하는 경우를 반환
val leftMatch = left.findMatch(str) val leftMatch = left.findMatch(str, position)
val rightMatch = right.findMatch(str) val rightMatch = right.findMatch(str, position)
return AvailableState( return leftMatch + rightMatch // 두 매칭 결과를 합쳐서 반환
(leftMatch + rightMatch) // 두 매칭 결과를 합쳐서 반환
)
} }
} }

View file

@ -2,109 +2,171 @@ package org.example
import kotlin.test.Test import kotlin.test.Test
import kotlin.test.assertEquals import kotlin.test.assertEquals
import com.github.h0tk3y.betterParse.grammar.parseToEnd import kotlin.test.assertNotNull
/**
 * Compiles [pattern], asserts that the compiled item prints back as [pattern], and then runs
 * [block] with a [RegexTestAsserter] for the compiled item as its receiver.
 */
private fun checkRegex(
    pattern: String,
    block: RegexTestAsserter.() -> Unit
) {
    val compiled = compileRegex(pattern)
    // The printed form may diverge from the pattern later; for now they are assumed equal.
    assertEquals(pattern, compiled.toString())
    RegexTestAsserter(compiled).block()
}
/**
 * Small assertion DSL around a compiled [regex], used as the receiver of test blocks.
 *
 * Uses kotlin.test assertions rather than Kotlin's `assert`, because `assert` is only
 * evaluated when JVM assertions (-ea) are enabled and would otherwise let tests pass vacuously.
 */
private class RegexTestAsserter(private val regex: RegexItem) {
    /** Fails unless [regex] finds a match somewhere in the receiver string. */
    fun String.shouldMatch() {
        kotlin.test.assertTrue(regex.containsMatchIn(this), "Expected '$this' to match")
    }

    /** Fails if [regex] finds a match anywhere in the receiver string. */
    fun String.shouldNotMatch() {
        kotlin.test.assertFalse(regex.containsMatchIn(this), "Expected '$this' not to match")
    }
}
class ParserTest { class ParserTest {
@Test @Test
fun testSimpleCharacter() { fun testSimpleCharacter() {
val input = "a" checkRegex("a") {
val result = RegexParser().parseToEnd(input) "a".shouldMatch()
assertEquals("a",result.toString()) "b".shouldNotMatch()
"".shouldNotMatch()
}
} }
@Test @Test
fun testCharacterWithPlus() { fun testCharacterWithPlus() {
val input = "a+" checkRegex("a+") {
val parser = RegexParser() "a".shouldMatch()
val result = parser.parseToEnd(input) "aa".shouldMatch()
assertEquals("a+", result.toString()) "b".shouldNotMatch()
assert(result.match("a").isSuccess) "".shouldNotMatch()
assert(result.match("aa").isSuccess) }
assert(!result.match("b").isSuccess)
} }
@Test @Test
fun testCharacterWithStar() { fun testCharacterWithStar() {
val input = "b*" checkRegex("b*") {
val parser = RegexParser() "b".shouldMatch()
val result = parser.parseToEnd(input) "bb".shouldMatch()
assertEquals("b*", result.toString()) "".shouldMatch() // 빈 문자열도 매칭됨
assert(result.match("").isSuccess) }
assert(result.match("b").isSuccess)
assert(result.match("bb").isSuccess)
assert(result.match("a").isSuccess)
} }
@Test @Test
fun testCharacterWithQuestion() { fun testCharacterWithQuestion() {
val input = "c?" checkRegex("c?") {
val parser = RegexParser() "c".shouldMatch()
val result = parser.parseToEnd(input) "".shouldMatch() // 빈 문자열도 매칭됨
assertEquals("c?", result.toString()) }
assert(result.match("").isSuccess)
assert(result.match("c").isSuccess)
assert(result.match("cc").isSuccess)
} }
@Test @Test
fun testDot() { fun testDot() {
val input = "." checkRegex(".") {
val parser = RegexParser() "a".shouldMatch()
val result = parser.parseToEnd(input) "1".shouldMatch()
assertEquals(".", result.toString()) "".shouldNotMatch() // 빈 문자열은 매칭되지 않음
assert(result.match("a").isSuccess) }
assert(result.match("1").isSuccess)
assert(!result.match("").isSuccess)
} }
@Test @Test
fun testAlternation() { fun testAlternation() {
val input = "a|b" checkRegex("a|b") {
val parser = RegexParser() "a".shouldMatch()
val result = parser.parseToEnd(input) "b".shouldMatch()
assertEquals("(a|b)", result.toString()) "c".shouldNotMatch()
assert(result.match("a").isSuccess) "".shouldNotMatch() // 빈 문자열은 매칭되지 않음
assert(result.match("b").isSuccess) }
assert(!result.match("c").isSuccess)
} }
@Test @Test
fun testParentheses() { fun testParentheses() {
val input = "(d)" checkRegex("(d)") {
val parser = RegexParser() "d".shouldMatch()
val result = parser.parseToEnd(input) "e".shouldNotMatch()
assertEquals("d", result.toString()) "".shouldNotMatch() // 빈 문자열은 매칭되지 않음
assert(result.match("d").isSuccess) }
assert(!result.match("e").isSuccess)
} }
@Test @Test
fun testComplexExpression() { fun testComplexExpression() {
val input = "a(b|c)*d+" checkRegex("a(b|c)*d+") {
val parser = RegexParser() "ad".shouldMatch()
val result = parser.parseToEnd(input) "ab".shouldNotMatch()
assertEquals("a(b|c)*d+", result.toString()) "acd".shouldMatch()
assert(result.match("ad").isSuccess) "abbbd".shouldMatch()
assert(!result.match("ab").isSuccess) "a".shouldNotMatch()
assert(result.match("acd").isSuccess) "b".shouldNotMatch()
assert(result.match("abbbd").isSuccess) }
assert(!result.match("a").isSuccess)
assert(!result.match("b").isSuccess)
} }
@Test @Test
fun testAndThen() { fun testAndThen() {
val input = "ab" checkRegex("ab") {
val parser = RegexParser() "ab".shouldMatch()
val result = parser.parseToEnd(input) "a".shouldNotMatch()
assertEquals("ab", result.toString()) "b".shouldNotMatch()
assert(result.match("ab").isSuccess) }
assert(!result.match("a").isSuccess)
assert(!result.match("b").isSuccess)
} }
@Test @Test
fun testDotAndPlus() { fun testDotAndPlus() {
val input = ".+a"; checkRegex(".+a") {
val parser = RegexParser() "a".shouldNotMatch()
val result = parser.parseToEnd(input) "ba".shouldMatch()
assertEquals(".+a", result.toString()) "bca".shouldMatch()
assert(!result.match("a").isSuccess) }
assert(result.match("ba").isSuccess) }
assert(result.match("bca").isSuccess) @Test
// An escaped '+' must match a literal "+" and nothing else.
fun testEscapedCharacter() =
    checkRegex("\\+") {
        "+".shouldMatch()
        "a".shouldNotMatch()
    }
@Test
// A bracket expression matches any one of the listed characters, and no others.
fun testBracketContent() =
    checkRegex("[abc]") {
        listOf("a", "b", "c").forEach { member -> member.shouldMatch() }
        "d".shouldNotMatch()
    }
@Test
// A repeated outer group containing an inner alternation group.
fun testNestedGroups() =
    checkRegex("(a(b|c)d)+") {
        "ad".shouldNotMatch() // The inner group requires exactly one of b or c.
        "abd".shouldMatch()
        "acd".shouldMatch()
        "a".shouldNotMatch()
    }
@Test
// With both anchors present, only the exact string "abc" may match.
fun testAnchorOperators() =
    checkRegex("^abc$") {
        "abc".shouldMatch()
        listOf("ab", "abcd", "xabc").forEach { other -> other.shouldNotMatch() }
    }
@Test
// Two sibling groups are expected to be captured under the names "0" and "1".
fun testCaptureGroups() {
    val regex = compileRegex("(a)(b)")
    assertEquals("(a)(b)", regex.toString())

    // assertNotNull returns its argument as a non-null value, so no smart cast is needed.
    val found = assertNotNull(regex.find("ab"), "Expected match result to be non-null")
    assertEquals("ab", found.matched)

    val captures = found.captures
    assertEquals(2, captures.size)
    assertEquals("a", captures["0"])
    assertEquals("b", captures["1"])
}
@Test
fun testMatchAll() {
val input = "a+"
val regex = compileRegex(input)
val matches = regex.findAll("aaabaaa")
val results = matches.toList()
println("Matches found: ${results}")
// assertEquals(2, results.size)
assertEquals("aaa", results[0].matched)
assertEquals("aaa", results[1].matched)
} }
} }