-
Notifications
You must be signed in to change notification settings - Fork 75
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
基于 levenshteinDistance, 相似度达到 80% 判定为匹配
- Loading branch information
Showing 3 changed files with 172 additions and 1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
69 changes: 69 additions & 0 deletions
69
app/shared/app-data/src/commonMain/kotlin/domain/mediasource/StringMatcher.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
/* | ||
* Copyright (C) 2024 OpenAni and contributors. | ||
* | ||
* 此源代码的使用受 GNU AFFERO GENERAL PUBLIC LICENSE version 3 许可证的约束, 可以在以下链接找到该许可证. | ||
* Use of this source code is governed by the GNU AGPLv3 license, which can be found at the following link. | ||
* | ||
* https://github.com/open-ani/ani/blob/main/LICENSE | ||
*/ | ||
|
||
package me.him188.ani.app.domain.mediasource | ||
|
||
/**
 * Fuzzy string comparison based on the Levenshtein (edit) distance.
 *
 * Strings are compared [Char] by [Char], i.e. per UTF-16 code unit, so a character
 * encoded as a surrogate pair counts as two positions.
 */
object StringMatcher {
    /**
     * Calculates the match rate between two strings as a whole percentage in `0..100`.
     *
     * The rate is `(maxLen - distance) / maxLen`, where `distance` is the Levenshtein
     * distance between [a] and [b] and `maxLen` is the length of the longer string.
     * The percentage is rounded down. Two empty strings are defined to match at 100.
     *
     * @param a the first string
     * @param b the second string
     * @return the similarity percentage, from 0 (nothing in common) to 100 (identical)
     */
    fun calculateMatchRate(a: String, b: String): Int {
        val maxLen = maxOf(a.length, b.length)
        if (maxLen == 0) {
            // Both strings are empty; avoid dividing by zero and treat them as identical.
            return 100
        }

        val distance = levenshteinDistance(a, b)

        // Use exact integer arithmetic instead of `1 - distance.toDouble() / maxLen`:
        // binary floating point turns e.g. 1 - 4/5 into 0.19999999999999996, which would
        // be truncated to 19 rather than 20. The distance never exceeds maxLen, so the
        // result is always within 0..100. The Long factor guards the multiplication
        // against Int overflow.
        return ((maxLen - distance) * 100L / maxLen).toInt()
    }

    /**
     * Computes the Levenshtein distance between two strings.
     *
     * The Levenshtein distance is the minimum number of single-character edits
     * (insertions, deletions, or substitutions) required to change one string
     * into the other.
     *
     * Only two rows of the dynamic-programming table are kept at any time, so the
     * extra memory is proportional to the length of the shorter string.
     */
    private fun levenshteinDistance(s1: String, s2: String): Int {
        // The distance is symmetric, so the shorter string can always be used for the
        // columns, which keeps the two row buffers as small as possible.
        val longer: String
        val shorter: String
        if (s1.length >= s2.length) {
            longer = s1
            shorter = s2
        } else {
            longer = s2
            shorter = s1
        }
        if (shorter.isEmpty()) {
            // Turning an empty string into `longer` takes one insertion per character.
            return longer.length
        }

        // previous[j] = distance between the first (i - 1) chars of `longer`
        // and the first j chars of `shorter`. Row 0 is the base case 0, 1, 2, ...
        var previous = IntArray(shorter.length + 1) { it }
        var current = IntArray(shorter.length + 1)

        for (i in 1..longer.length) {
            // Base case: removing the first i chars of `longer` yields the empty prefix.
            current[0] = i
            for (j in 1..shorter.length) {
                val cost = if (longer[i - 1] == shorter[j - 1]) 0 else 1
                current[j] = minOf(
                    previous[j] + 1, // deletion
                    current[j - 1] + 1, // insertion
                    previous[j - 1] + cost, // substitution
                )
            }
            // Reuse the two buffers instead of allocating a new row per iteration.
            val finished = current
            current = previous
            previous = finished
        }

        return previous[shorter.length]
    }
}
97 changes: 97 additions & 0 deletions
97
app/shared/app-data/src/commonTest/kotlin/domain/mediasource/StringMatcherTest.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
/* | ||
* Copyright (C) 2024 OpenAni and contributors. | ||
* | ||
* 此源代码的使用受 GNU AFFERO GENERAL PUBLIC LICENSE version 3 许可证的约束, 可以在以下链接找到该许可证. | ||
* Use of this source code is governed by the GNU AGPLv3 license, which can be found at the following link. | ||
* | ||
* https://github.com/open-ani/ani/blob/main/LICENSE | ||
*/ | ||
|
||
package me.him188.ani.app.domain.mediasource | ||
|
||
import me.him188.ani.test.TestFactory | ||
import me.him188.ani.test.runDynamicTests | ||
import kotlin.test.Test | ||
import kotlin.test.assertEquals | ||
|
||
|
||
/**
 * Tests for [StringMatcher.calculateMatchRate].
 *
 * Every expected value is the similarity percentage rounded down to a whole number.
 */
class StringMatcherTest {

    /** Two empty inputs count as a perfect match. */
    @Test
    fun `test empty strings`() {
        assertEquals(
            100,
            StringMatcher.calculateMatchRate("", ""),
            "Empty strings should have 100% match",
        )
    }

    /** Equal inputs have distance 0 and therefore match at 100%. */
    @Test
    fun `test identical strings`() {
        assertEquals(
            100,
            StringMatcher.calculateMatchRate("kotlin", "kotlin"),
            "Identical strings should have 100% match",
        )
    }

    /** Dropping the trailing 'n' is one edit out of six characters: 5/6 = 83.33..., reported as 83. */
    @Test
    fun `test single character difference`() {
        assertEquals(
            83,
            StringMatcher.calculateMatchRate("kotlin", "kotli"),
            "Strings with one character difference should have around 83% match",
        )
    }

    /** Replacing 'e' with 'a' is one edit out of five characters: 4/5 = 80%. */
    @Test
    fun `test partial match`() {
        assertEquals(
            80,
            StringMatcher.calculateMatchRate("hello", "hallo"),
            "Expected around 80% similarity for 'hello' and 'hallo'",
        )
    }

    /** Every position differs, so the distance equals the length (4) and the rate is 0%. */
    @Test
    fun `test completely different strings`() {
        assertEquals(
            0,
            StringMatcher.calculateMatchRate("abcd", "wxyz"),
            "Completely different strings should have 0% match",
        )
    }

    /** Against an empty string the distance is the other string's full length (6), giving 0%. */
    @Test
    fun `test one string empty`() {
        assertEquals(
            0,
            StringMatcher.calculateMatchRate("kotlin", ""),
            "Non-empty vs. empty string should have 0% match",
        )
    }

    /** Table-style cases built from real titles, registered as dynamic tests. */
    @TestFactory
    fun `test matches`() = runDynamicTests {
        // Tiny DSL, read as: <name containing mistakes> matches <correct name> at <expected rate>
        infix fun String.matches(another: String): Pair<String, String> = Pair(this, another)
        infix fun Pair<String, String>.at(expected: Int) {
            add("$first matches $second at $expected") {
                assertEquals(expected, StringMatcher.calculateMatchRate(first, second))
            }
        }

        // A single wrong character.
        "哥特萝莉侦探事件薄" matches "哥特萝莉侦探事件簿" at 88

        // Shows that a threshold below 80% could also match the second season.
        "哥特萝莉侦探事件簿" matches "哥特萝莉侦探事件簿第二季" at 75

        // Issue #780
        "败犬女主太多了" matches "败犬女主太多啦" at 85

        // Edit distance cannot recognise this kind of difference.
        "别当欧尼酱了" matches "别当哥哥了" at 50
    }

    /** One substituted character in a nine-character title: 8/9 = 88.88..., reported as 88. */
    @Test
    fun `test GOSICK2`() {
        val rate = StringMatcher.calculateMatchRate("哥特萝莉侦探事件簿", "哥特萝莉侦探事件薄")
        assertEquals(88, rate)
    }
}