Skip to content

Commit

Permalink
增加条目名称模糊匹配, close #780
Browse files Browse the repository at this point in the history
基于 levenshteinDistance, 相似度达到 80% 判定为匹配
  • Loading branch information
Him188 committed Dec 21, 2024
1 parent 567b212 commit ad6afc2
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,13 @@ import me.him188.ani.utils.platform.trimSB
object MediaListFilters {
/**
 * Filter predicate that is true when the media's original title matches at least one
 * of the subject names, either exactly or approximately.
 *
 * A subject name matches when:
 * - the normalized title contains it (case-insensitive), or
 * - [StringMatcher.calculateMatchRate] between the normalized title and the name is at
 *   least 80. The rate is an integer percentage in `0..100`, so the threshold must be
 *   expressed on that scale (comparing against `0.8` would accept any rate >= 1).
 */
val ContainsSubjectName = BasicMediaListFilter { media ->
    // The normalized title does not depend on the subject name, so compute it once per media.
    val originalTitle = removeSpecials(media.originalTitle, removeWhitespace = true, replaceNumbers = true)

    // Minimum similarity, in percent, for a fuzzy match.
    val fuzzyMatchThresholdPercent = 80

    subjectNamesWithoutSpecial.any { subjectName ->
        fun exactlyContains() = originalTitle.contains(subjectName, ignoreCase = true)

        fun fuzzyMatches() =
            StringMatcher.calculateMatchRate(originalTitle, subjectName) >= fuzzyMatchThresholdPercent

        // Try the cheap substring check first; the edit-distance check only runs when it fails.
        exactlyContains() || fuzzyMatches()
    }
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/*
* Copyright (C) 2024 OpenAni and contributors.
*
* 此源代码的使用受 GNU AFFERO GENERAL PUBLIC LICENSE version 3 许可证的约束, 可以在以下链接找到该许可证.
* Use of this source code is governed by the GNU AGPLv3 license, which can be found at the following link.
*
* https://github.com/open-ani/ani/blob/main/LICENSE
*/

package me.him188.ani.app.domain.mediasource

object StringMatcher {
    /**
     * Calculates the match rate between two strings as an integer percentage in `0..100`.
     *
     * The rate is `floor((maxLen - distance) * 100 / maxLen)`, where `distance` is the
     * Levenshtein distance between [a] and [b] and `maxLen` is the length of the longer
     * string. Two empty strings are defined to match at 100.
     *
     * The computation uses integer arithmetic only. Going through `Double` and truncating
     * can lose a whole percent to rounding error (e.g. `1 - 4/5.0` is `0.19999999999999996`,
     * which truncates to 19 instead of 20).
     *
     * @return the similarity percentage, rounded down, in `0..100`
     */
    fun calculateMatchRate(a: String, b: String): Int {
        val maxLen = maxOf(a.length, b.length)
        if (maxLen == 0) {
            // Both strings are empty: identical by definition, and avoids dividing by zero.
            return 100
        }

        val distance = levenshteinDistance(a, b)
        // Multiply in Long so that very long inputs cannot overflow Int before the division.
        return ((maxLen - distance) * 100L / maxLen).toInt().coerceIn(0, 100)
    }

    /**
     * Computes the Levenshtein distance between two strings.
     *
     * Levenshtein distance is the minimum number of single-character
     * edits (insertions, deletions, or substitutions) required to
     * change one string into the other. Characters are compared as
     * Kotlin [Char] values (UTF-16 code units).
     *
     * Only two rows of the dynamic-programming table are kept at any time,
     * because each cell depends solely on the current and the previous row.
     */
    private fun levenshteinDistance(s1: String, s2: String): Int {
        // Turning an empty string into the other one takes one insertion per character.
        if (s1.isEmpty()) return s2.length
        if (s2.isEmpty()) return s1.length

        // previous[j] is the distance between s1[0..i-2] and s2[0..j-1];
        // it starts as row 0, i.e. the distance from the empty prefix of s1.
        var previous = IntArray(s2.length + 1) { it }
        var current = IntArray(s2.length + 1)

        for (i in 1..s1.length) {
            // Distance from s1[0..i-1] to the empty prefix of s2: i deletions.
            current[0] = i
            for (j in 1..s2.length) {
                val cost = if (s1[i - 1] == s2[j - 1]) 0 else 1

                current[j] = minOf(
                    previous[j] + 1, // deletion
                    current[j - 1] + 1, // insertion
                    previous[j - 1] + cost, // substitution
                )
            }
            // Reuse the old row as scratch space for the next iteration.
            val scratch = previous
            previous = current
            current = scratch
        }

        // After the final swap, the last completed row lives in `previous`.
        return previous[s2.length]
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
* Copyright (C) 2024 OpenAni and contributors.
*
* 此源代码的使用受 GNU AFFERO GENERAL PUBLIC LICENSE version 3 许可证的约束, 可以在以下链接找到该许可证.
* Use of this source code is governed by the GNU AGPLv3 license, which can be found at the following link.
*
* https://github.com/open-ani/ani/blob/main/LICENSE
*/

package me.him188.ani.app.domain.mediasource

import me.him188.ani.test.TestFactory
import me.him188.ani.test.runDynamicTests
import kotlin.test.Test
import kotlin.test.assertEquals


class StringMatcherTest {

    /** Two empty strings are defined as a perfect match. */
    @Test
    fun `test empty strings`() {
        assertEquals(
            expected = 100,
            actual = StringMatcher.calculateMatchRate("", ""),
            message = "Empty strings should have 100% match",
        )
    }

    /** A string compared with itself has distance 0, hence 100%. */
    @Test
    fun `test identical strings`() {
        assertEquals(
            expected = 100,
            actual = StringMatcher.calculateMatchRate("kotlin", "kotlin"),
            message = "Identical strings should have 100% match",
        )
    }

    /** Dropping the trailing 'n' is one edit over a longest length of 6: 5/6 = 83.33..., reported as 83. */
    @Test
    fun `test single character difference`() {
        assertEquals(
            expected = 83,
            actual = StringMatcher.calculateMatchRate("kotlin", "kotli"),
            message = "Strings with one character difference should have around 83% match",
        )
    }

    /** Replacing 'e' with 'a' is one edit over a length of 5: 4/5 = 80. */
    @Test
    fun `test partial match`() {
        assertEquals(
            expected = 80,
            actual = StringMatcher.calculateMatchRate("hello", "hallo"),
            message = "Expected around 80% similarity for 'hello' and 'hallo'",
        )
    }

    /** No character in common: 4 substitutions over a length of 4, so the similarity is 0. */
    @Test
    fun `test completely different strings`() {
        assertEquals(
            expected = 0,
            actual = StringMatcher.calculateMatchRate("abcd", "wxyz"),
            message = "Completely different strings should have 0% match",
        )
    }

    /** Against an empty string the distance equals the other string's length (6 of 6), so the similarity is 0. */
    @Test
    fun `test one string empty`() {
        assertEquals(
            expected = 0,
            actual = StringMatcher.calculateMatchRate("kotlin", ""),
            message = "Non-empty vs. empty string should have 0% match",
        )
    }

    /** Table-driven checks on real subject names, one dynamic test per row. */
    @TestFactory
    fun `test matches`() = runDynamicTests {
        // Tiny DSL. Syntax: <name containing a mistake> matches <correct name> at <expected match rate>
        infix fun String.matches(another: String): Pair<String, String> = Pair(this, another)
        infix fun Pair<String, String>.at(expected: Int) {
            val (candidate, reference) = this
            add("$candidate matches $reference at $expected") {
                assertEquals(expected, StringMatcher.calculateMatchRate(candidate, reference))
            }
        }

        // A single wrong character
        "哥特萝莉侦探事件薄" matches "哥特萝莉侦探事件簿" at 88

        // Shows that a threshold below 80% could also match the second season
        "哥特萝莉侦探事件簿" matches "哥特萝莉侦探事件簿第二季" at 75

        // #780
        "败犬女主太多了" matches "败犬女主太多啦" at 85

        // This approach cannot recognize this kind of difference
        "别当欧尼酱了" matches "别当哥哥了" at 50
    }

    /** Same pair as the first row of `test matches` with the arguments swapped; the rate is unchanged. */
    @Test
    fun `test GOSICK2`() {
        // One substituted character over a length of 9: 8/9 = 88.88..., reported as 88
        assertEquals(88, StringMatcher.calculateMatchRate("哥特萝莉侦探事件簿", "哥特萝莉侦探事件薄"))
    }
}

0 comments on commit ad6afc2

Please sign in to comment.