measures) {
- WordSimUtils.measures = Lists.immutable.withAll(measures);
+ public boolean addMeasure(WordSimMeasure measure) {
+ return this.measures.add(measure);
}
/**
- * Sets the default comparison strategy. The specified strategy will be used for all subsequent comparisons that
- * themselves do not specify a strategy.
+ * Sets the default comparison strategy. The specified strategy will be used for all subsequent comparisons that themselves do not specify a strategy.
*
* @param strategy the new default strategy
*/
- public static void setStrategy(ComparisonStrategy strategy) {
- WordSimUtils.strategy = strategy;
+ public void setStrategy(ComparisonStrategy strategy) {
+ this.strategy = strategy;
}
/**
- * Evaluates whether the words from the given {@link ComparisonContext} are similar using the specified comparison
- * strategy.
+ * Sets the default similarity strategy. The specified strategy will be used for all subsequent similarity calculations that themselves do not specify a strategy.
+ *
+ * @param strategy the new default strategy
+ */
+ public void setStrategy(SimilarityStrategy strategy) {
+ this.similarityStrategy = strategy;
+ }
+
+ public void setCharacterMatchFunction(UnicodeCharacterMatchFunctions characterMatch) {
+ this.characterMatch = characterMatch;
+ }
+
+ public UnicodeCharacterMatchFunctions getCharacterMatchFunction() {
+ return this.characterMatch;
+ }
+
+ public void setConsiderAbbreviations(boolean considerAbbreviations) {
+ this.considerAbbreviations = considerAbbreviations;
+ }
+
+ public boolean getConsiderAbbreviations() {
+ return this.considerAbbreviations;
+ }
+
+ /**
+ * Evaluates whether the words from the given {@link ComparisonContext} are similar using the specified comparison strategy.
*
* @param ctx the context
* @param strategy the strategy
* @return Returns {@code true} if the given strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(ComparisonContext ctx, ComparisonStrategy strategy) {
+ public boolean areWordsSimilar(ComparisonContext ctx, ComparisonStrategy strategy) {
Objects.requireNonNull(ctx);
Objects.requireNonNull(strategy);
+ var firstTerm = ctx.firstTerm();
+ var secondTerm = ctx.secondTerm();
+
+ if (getConsiderAbbreviations()) {
+ var ambiguatedFirstTerm = AbbreviationDisambiguationHelper.ambiguateAll(firstTerm, true);
+ var ambiguatedSecondTerm = AbbreviationDisambiguationHelper.ambiguateAll(secondTerm, true);
+ var different = !ambiguatedFirstTerm.equals(firstTerm) || !ambiguatedSecondTerm.equals(secondTerm);
+
+ if (different && areWordsSimilar(new ComparisonContext(ambiguatedFirstTerm, ambiguatedSecondTerm, null, null, false, ctx.characterMatch()))) {
+ return true;
+ }
+ }
+
// Currently, we need the split test as it improves results by a lot. In the future, we should try to avoid its requirement
if (!splitLengthTest(ctx)) {
return false;
@@ -69,33 +120,33 @@ public static boolean areWordsSimilar(ComparisonContext ctx, ComparisonStrategy
return strategy.areWordsSimilar(ctx, measures.toList());
}
- private static boolean splitLengthTest(ComparisonContext ctx) {
+ private boolean splitLengthTest(ComparisonContext ctx) {
var first = ctx.firstTerm().toLowerCase();
var second = ctx.secondTerm().toLowerCase();
return (first.split(" ").length == second.split(" ").length);
}
/**
- * Evaluates whether the words from the given {@link ComparisonContext} are similar using the default comparison
- * strategy. The default strategy can be changed with the {@link #setStrategy(ComparisonStrategy)} method.
+ * Evaluates whether the words from the given {@link ComparisonContext} are similar using the default comparison strategy. The default strategy can be
+ * changed with the {@link #setStrategy(ComparisonStrategy)} method.
*
* @param ctx the context
* @return Returns {@code true} if the default strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(ComparisonContext ctx) {
+ public boolean areWordsSimilar(ComparisonContext ctx) {
Objects.requireNonNull(ctx);
return areWordsSimilar(ctx, strategy);
}
/**
- * Evaluates whether the given words are similar using the default comparison strategy. The default strategy can be
- * changed with the {@link #setStrategy(ComparisonStrategy)} method.
+ * Evaluates whether the given words are similar using the default comparison strategy. The default strategy can be changed with the
+ * {@link #setStrategy(ComparisonStrategy)} method.
*
* @param firstWord the first word
* @param secondWord the second word
* @return Returns {@code true} if the default strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(String firstWord, String secondWord) {
+ public boolean areWordsSimilar(String firstWord, String secondWord) {
return areWordsSimilar(new ComparisonContext(firstWord, secondWord, false), strategy);
}
@@ -107,19 +158,19 @@ public static boolean areWordsSimilar(String firstWord, String secondWord) {
* @param strategy the strategy to use
* @return Returns {@code true} if the given strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(String firstWord, String secondWord, ComparisonStrategy strategy) {
+ public boolean areWordsSimilar(String firstWord, String secondWord, ComparisonStrategy strategy) {
return areWordsSimilar(new ComparisonContext(firstWord, secondWord, false), strategy);
}
/**
- * Evaluates whether the given words are similar using the default comparison strategy. The default strategy can be
- * changed with the {@link #setStrategy(ComparisonStrategy)} method.
+ * Evaluates whether the given words are similar using the default comparison strategy. The default strategy can be changed with the
+ * {@link #setStrategy(ComparisonStrategy)} method.
*
* @param firstWord the first word
* @param secondWord the second word
* @return Returns {@code true} if the default strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(Word firstWord, Word secondWord) {
+ public boolean areWordsSimilar(Word firstWord, Word secondWord) {
return areWordsSimilar(new ComparisonContext(firstWord, secondWord, false), strategy);
}
@@ -131,20 +182,20 @@ public static boolean areWordsSimilar(Word firstWord, Word secondWord) {
* @param strategy the strategy to use
* @return Returns {@code true} if the given strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(Word firstWord, Word secondWord, ComparisonStrategy strategy) {
+ public boolean areWordsSimilar(Word firstWord, Word secondWord, ComparisonStrategy strategy) {
return areWordsSimilar(new ComparisonContext(firstWord, secondWord, false), strategy);
}
/**
- * Evaluates whether the given words are similar using the default comparison strategy. The default strategy can be
- * changed with the {@link #setStrategy(ComparisonStrategy)} method.
+ * Evaluates whether the given words are similar using the default comparison strategy. The default strategy can be changed with the
+ * {@link #setStrategy(ComparisonStrategy)} method.
*
* @param firstWord the first word
* @param secondWord the second word
* @return Returns {@code true} if the default strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(String firstWord, Word secondWord) {
- return areWordsSimilar(new ComparisonContext(firstWord, secondWord.getText(), null, secondWord, false), strategy);
+ public boolean areWordsSimilar(String firstWord, Word secondWord) {
+ return areWordsSimilar(new ComparisonContext(firstWord, secondWord.getText(), null, secondWord, false, characterMatch), strategy);
}
/**
@@ -155,8 +206,50 @@ public static boolean areWordsSimilar(String firstWord, Word secondWord) {
* @param strategy the strategy to use
* @return Returns {@code true} if the given strategy considers the words similar enough.
*/
- public static boolean areWordsSimilar(String firstWord, Word secondWord, ComparisonStrategy strategy) {
- return areWordsSimilar(new ComparisonContext(firstWord, secondWord.getText(), null, secondWord, false), strategy);
+ public boolean areWordsSimilar(String firstWord, Word secondWord, ComparisonStrategy strategy) {
+ return areWordsSimilar(new ComparisonContext(firstWord, secondWord.getText(), null, secondWord, false, characterMatch), strategy);
+ }
+
+ /**
+ * Evaluates the similarity of the given words using the specified similarity strategy.
+ *
+ * @param firstWord the first word
+ * @param secondWord the second word
+ * @param strategy the strategy to use
+ * @param ignoreCase whether to ignore the case during comparison
+ * @return Returns similarity in range [0,1]
+ */
+ public double getSimilarity(String firstWord, String secondWord, SimilarityStrategy strategy, boolean ignoreCase) {
+ var allMeasuresExceptDefault = this.measures.stream().filter(m -> !(m instanceof EqualityMeasure)).collect(Collectors.toCollection(ArrayList::new));
+ if (allMeasuresExceptDefault.isEmpty())
+ allMeasuresExceptDefault.add(new EqualityMeasure());
+
+ return strategy.getSimilarity(new ComparisonContext(ignoreCase ? firstWord.toLowerCase() : firstWord, ignoreCase ?
+ secondWord.toLowerCase() :
+ secondWord, null, null, false, characterMatch), allMeasuresExceptDefault);
+ }
+
+ /**
+ * Evaluates the similarity of the given words.
+ *
+ * @param firstWord the first word
+ * @param secondWord the second word
+ * @return Returns similarity in range [0,1]
+ */
+ public double getSimilarity(String firstWord, String secondWord) {
+ return getSimilarity(firstWord, secondWord, false);
+ }
+
+ /**
+ * Evaluates the similarity of the given words.
+ *
+ * @param firstWord the first word
+ * @param secondWord the second word
+ * @param ignoreCase whether to ignore the case during comparison
+ * @return Returns similarity in range [0,1]
+ */
+ public double getSimilarity(String firstWord, String secondWord, boolean ignoreCase) {
+ return getSimilarity(firstWord, secondWord, similarityStrategy, ignoreCase);
}
public static SQLiteConfig getSqLiteConfig() {
diff --git a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/equality/EqualityMeasure.java b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/equality/EqualityMeasure.java
index df7179d90..33b4003e3 100644
--- a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/equality/EqualityMeasure.java
+++ b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/equality/EqualityMeasure.java
@@ -1,18 +1,28 @@
-/* Licensed under MIT 2022-2023. */
+/* Licensed under MIT 2022-2024. */
package edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.measures.equality;
+import java.util.Locale;
+
import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.ComparisonContext;
+import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.UnicodeCharacterSequence;
import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.WordSimMeasure;
/**
- * This word similarity measure just checks whether the most appropriate string representations of the passed objects
- * are equal.
+ * This word similarity measure just checks whether the most appropriate string representations of the passed objects are equal.
+ * Equality of two characters is determined using the provided {@link ComparisonContext#characterMatch() Character Match Function}.
+ * Letter-casing is not considered.
*/
public class EqualityMeasure implements WordSimMeasure {
@Override
public boolean areWordsSimilar(ComparisonContext ctx) {
- return ctx.firstTerm().equalsIgnoreCase(ctx.secondTerm());
+ var firstTerm = UnicodeCharacterSequence.valueOf(ctx.firstTerm().toLowerCase(Locale.ENGLISH));
+ var secondTerm = UnicodeCharacterSequence.valueOf(ctx.secondTerm().toLowerCase(Locale.ENGLISH));
+ return firstTerm.match(secondTerm, ctx.characterMatch());
}
+ @Override
+ public double getSimilarity(ComparisonContext ctx) {
+ return areWordsSimilar(ctx) ? 1 : 0;
+ }
}
diff --git a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/glove/GloveMeasure.java b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/glove/GloveMeasure.java
index 7713942a7..9346f5d0c 100644
--- a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/glove/GloveMeasure.java
+++ b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/glove/GloveMeasure.java
@@ -1,4 +1,4 @@
-/* Licensed under MIT 2022-2023. */
+/* Licensed under MIT 2022-2024. */
package edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.measures.glove;
import java.nio.file.Path;
@@ -12,11 +12,11 @@
import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.vector.RetrieveVectorException;
import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.vector.VectorBasedWordSimMeasure;
import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.vector.VectorSqliteDatabase;
+import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.vector.WordVectorDataSource;
/**
- * This word similarity measures utilizes GloVe trained word vector representations to calculate word similarity. It
- * retrieves vectors for each word and compares them using cosine similarity. This measure additionally manages a cache
- * to improve lookup speeds.
+ * This word similarity measure utilizes GloVe trained word vector representations to calculate word similarity. It retrieves vectors for each word and
+ * compares them using cosine similarity. This measure additionally manages a cache to improve lookup speeds.
*/
public class GloveMeasure extends VectorBasedWordSimMeasure {
@@ -26,23 +26,18 @@ public class GloveMeasure extends VectorBasedWordSimMeasure {
/**
* Constructs a new {@link GloveMeasure} using the settings provided by {@link CommonTextToolsConfig}.
- *
- * @throws SQLException if establishing the connection to the data source fails
*/
- public GloveMeasure() throws SQLException {
- this(new VectorSqliteDatabase(Path.of(CommonTextToolsConfig.GLOVE_DB_FILE_PATH)), CommonTextToolsConfig.GLOVE_SIMILARITY_THRESHOLD);
+ public GloveMeasure() {
+ this(CommonTextToolsConfig.GLOVE_SIMILARITY_THRESHOLD);
}
/**
* Constructs a new {@link GloveMeasure} instance.
*
- * @param dataSource the data source from which word vectors are loaded
* @param similarityThreshold the threshold above which words are considered similar, between 0 and 1
* @throws IllegalArgumentException if the given threshold is not between 0 and 1
*/
- public GloveMeasure(VectorSqliteDatabase dataSource, double similarityThreshold) throws IllegalArgumentException {
- super(dataSource);
-
+ public GloveMeasure(double similarityThreshold) throws IllegalArgumentException {
this.similarityThreshold = similarityThreshold;
if (similarityThreshold < 0.0 || similarityThreshold > 1.0) {
@@ -52,15 +47,25 @@ public GloveMeasure(VectorSqliteDatabase dataSource, double similarityThreshold)
@Override
public boolean areWordsSimilar(ComparisonContext ctx) {
- double similarity = Double.NaN;
+ return getSimilarity(ctx) >= this.similarityThreshold;
+ }
+ @Override
+ public double getSimilarity(ComparisonContext ctx) {
try {
- similarity = this.compareVectors(ctx.firstTerm(), ctx.secondTerm());
+ return this.compareVectors(ctx.firstTerm(), ctx.secondTerm());
} catch (RetrieveVectorException e) {
LOGGER.error("Failed to compare glove vectors: " + ctx, e);
+ return Double.NaN;
}
-
- return similarity >= this.similarityThreshold;
}
+ @Override
+ protected WordVectorDataSource getVectorDataSource() {
+ try {
+ return new VectorSqliteDatabase(Path.of(CommonTextToolsConfig.GLOVE_DB_FILE_PATH));
+ } catch (SQLException e) {
+ throw new IllegalArgumentException(e);
+ }
+ }
}
diff --git a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/jarowinkler/JaroWinklerMeasure.java b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/jarowinkler/JaroWinklerMeasure.java
index d0125d14b..47d1d59cd 100644
--- a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/jarowinkler/JaroWinklerMeasure.java
+++ b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/jarowinkler/JaroWinklerMeasure.java
@@ -1,8 +1,6 @@
-/* Licensed under MIT 2022-2023. */
+/* Licensed under MIT 2022-2024. */
package edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.measures.jarowinkler;
-import org.apache.commons.text.similarity.JaroWinklerSimilarity;
-
import edu.kit.kastel.mcse.ardoco.core.common.util.CommonTextToolsConfig;
import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.ComparisonContext;
import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.WordSimMeasure;
@@ -12,8 +10,6 @@
*/
public class JaroWinklerMeasure implements WordSimMeasure {
- private final JaroWinklerSimilarity jaroWinklerSimilarity = new JaroWinklerSimilarity();
-
private final double similarityThreshold;
/**
@@ -25,7 +21,7 @@ public JaroWinklerMeasure() {
/**
* Constructs a new {@link JaroWinklerMeasure}.
- *
+ *
* @param similarityThreshold the threshold above which words are considered similar, between 0 and 1
* @throws IllegalArgumentException if the given threshold is not between 0 and 1
*/
@@ -39,8 +35,13 @@ public JaroWinklerMeasure(double similarityThreshold) throws IllegalArgumentExce
@Override
public boolean areWordsSimilar(ComparisonContext ctx) {
- double similarity = this.jaroWinklerSimilarity.apply(ctx.firstTerm(), ctx.secondTerm());
+ double similarity = getSimilarity(ctx);
return similarity >= this.similarityThreshold;
}
+ @Override
+ public double getSimilarity(ComparisonContext ctx) {
+ return UnicodeJaroWinklerSimilarity.apply(ctx.firstTerm(), ctx.secondTerm(), ctx.characterMatch());
+ }
+
}
diff --git a/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/jarowinkler/UnicodeJaroWinklerSimilarity.java b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/jarowinkler/UnicodeJaroWinklerSimilarity.java
new file mode 100644
index 000000000..7ddcb493f
--- /dev/null
+++ b/framework/common/src/main/java/edu/kit/kastel/mcse/ardoco/core/common/util/wordsim/measures/jarowinkler/UnicodeJaroWinklerSimilarity.java
@@ -0,0 +1,166 @@
+/* Licensed under MIT 2023-2024. */
+package edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.measures.jarowinkler;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
+import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.UnicodeCharacter;
+import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.UnicodeCharacterMatchFunctions;
+import edu.kit.kastel.mcse.ardoco.core.common.util.wordsim.UnicodeCharacterSequence;
+
+/**
+ * A similarity algorithm indicating the percentage of matched characters between two character sequences.
+ *
+ *
+ * The Jaro measure is the weighted sum of percentage of matched characters from each file and transposed characters. Winkler increased this measure for
+ * matching initial characters.
+ *
+ *
+ *
+ * This implementation is based on the Jaro Winkler similarity algorithm from