fix: fixing unittests that fail because of work done to author transliterations

romanchyla committed Feb 17, 2022
1 parent f552f2d commit 22269ef
Showing 7 changed files with 97 additions and 97 deletions.
@@ -6,6 +6,9 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.solr.analysis.WriteableExplicitSynonymMap;
import org.apache.solr.analysis.WriteableSynonymMap;
import org.apache.solr.analysis.author.AuthorQueryVariations;
@@ -75,9 +78,9 @@ public static HashMap<String,HashSet<String>> transformGroup(List<String> group)
log.debug("withAutoSynonyms: " + withAutoSynonyms.toString());

// build a map of name -> variations to be used later
final HashMap<String,HashSet<String>> variationsMap = new HashMap<String,HashSet<String>>();
final HashMap<String, Set<String>> variationsMap = new HashMap<String,Set<String>>();
for (String s : withAutoSynonyms) {
HashMap<String,String> parsedAuthor = null;
Map<String,String> parsedAuthor = null;
try {
parsedAuthor = AuthorUtils.parseAuthor(s);
} catch (Exception e) {
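
The hunks above switch collection declarations from the concrete HashMap/HashSet classes to the Map and Set interfaces. A minimal, self-contained sketch of that idiom, with illustrative names that are not from the repository:

import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

// Illustration only: declare against the Map/Set interfaces, instantiate concrete classes.
public class VariationsMapSketch {
    public static void main(String[] args) {
        Map<String, Set<String>> variationsMap = new HashMap<String, Set<String>>();
        Set<String> variations = new LinkedHashSet<String>();
        variations.add("mueller, bill");
        variations.add("muller, bill");
        variationsMap.put("müller, bill", variations);
        // Callers see only the interface types, so the backing implementation
        // can change later without touching any method signatures.
        System.out.println(variationsMap);
    }
}
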
@@ -71,9 +71,9 @@ protected static Set<String> generateNameVariations(Map<String, String> parsedAu
generateSynonymVariations(parsedAuthor, variations);

// add the variations that are needed only for the query phase
String last = parsedAuthor.get("last");
String first = parsedAuthor.get("first");
String middle = parsedAuthor.get("middle");
String last = parsedAuthor.get("Last");
String first = parsedAuthor.get("First");
String middle = parsedAuthor.get("Middle");

if (first != null) {
if (middle != null) {
@@ -115,7 +115,7 @@ protected static Set<String> generateNameVariations(Map<String, String> parsedAu
* @param authorString name in the natural form
* @return map with string mappings
*/
public static HashSet<String> getQueryVariations(String authorString) {
public static Set<String> getQueryVariations(String authorString) {

Map<String, String> parsedAuthor = null;
parsedAuthor = AuthorUtils.parseAuthor(authorString);
@@ -128,12 +128,12 @@ public static HashSet<String> getQueryVariations(String authorString) {
return generateSynonymVariations(parsedAuthor, variations);
}

public static HashSet<String> generateSynonymVariations(HashMap<String, String> parsedAuthor) {
public static Set<String> generateSynonymVariations(Map<String, String> parsedAuthor) {
HashSet<String> variations = new LinkedHashSet<String>();
return generateSynonymVariations(parsedAuthor, variations);
}

protected static HashSet<String> generateSynonymVariations(Map<String, String> parsedAuthor,
protected static Set<String> generateSynonymVariations(Map<String, String> parsedAuthor,
Set<String> variations) {

String last = parsedAuthor.get("Last");
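
In the file above, the lookups move from lowercase to capitalized keys, which suggests AuthorUtils.parseAuthor now returns a map keyed by "Last", "First" and "Middle". A hypothetical stand-in showing how variation-generating code reads such a map; the parsing logic below is invented for illustration and is much simpler than the real AuthorUtils:

import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

public class ParsedAuthorSketch {
    // Hypothetical stand-in for AuthorUtils.parseAuthor; the real parser is richer.
    static Map<String, String> parseAuthor(String name) {
        Map<String, String> parsed = new HashMap<String, String>();
        String[] parts = name.split(",\\s*");
        parsed.put("Last", parts[0]);
        if (parts.length > 1) {
            String[] given = parts[1].split("\\s+");
            parsed.put("First", given[0]);
            if (given.length > 1) parsed.put("Middle", given[1]);
        }
        return parsed;
    }

    public static void main(String[] args) {
        Map<String, String> parsed = parseAuthor("Smith, John A");
        Set<String> variations = new LinkedHashSet<String>();
        String last = parsed.get("Last");     // note the capitalized keys
        String first = parsed.get("First");
        String middle = parsed.get("Middle");
        if (first != null) variations.add(last + ", " + first.charAt(0) + ".");
        if (middle != null) variations.add(last + ", " + first + " " + middle.charAt(0) + ".");
        System.out.println(variations);       // [Smith, J., Smith, John A.]
    }
}
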
@@ -114,11 +114,13 @@ public static ArrayList<String> getAsciiTransliteratedVariants(String a) {
synonyms.add(a);

// downgrade to ascii
synonyms.add(foldToAscii(a));

// work around unidecode not always doing what we want
String b = replaceUmlaut(a);
if (b != a) {
synonyms.add(foldToAscii(b));
}
synonyms.add(foldToAscii(a));

// handle russian name stuff
HashSet<String> transRus = transliterateRussianNames(synonyms);
@@ -143,6 +145,9 @@ private static String replaceUmlaut(String input) {
StringBuilder out = new StringBuilder();
for (char c: input.toCharArray()) {
switch(c) {
case '\u00fc':
out.append("ue");
break;
case '\u00f6':
out.append("oe");
break;
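
The file above reorders the ASCII folding around the umlaut workaround and adds a case for '\u00fc' ("ue"). A self-contained approximation of the idea: the project's foldToAscii is not shown in this diff, so the version below uses java.text.Normalizer, the 'ä' case is assumed, and the sketch compares string values with equals() rather than the reference comparison visible in the hunk:

import java.text.Normalizer;
import java.util.LinkedHashSet;
import java.util.Set;

public class UmlautSketch {
    // Rough stand-in for foldToAscii: decompose and strip combining marks.
    static String foldToAscii(String s) {
        return Normalizer.normalize(s, Normalizer.Form.NFD).replaceAll("\\p{M}+", "");
    }

    // Mirrors the diff's switch: map German umlauts to two-letter forms.
    static String replaceUmlaut(String s) {
        StringBuilder out = new StringBuilder();
        for (char c : s.toCharArray()) {
            switch (c) {
                case '\u00fc': out.append("ue"); break; // ü
                case '\u00f6': out.append("oe"); break; // ö
                case '\u00e4': out.append("ae"); break; // ä (assumed, not shown in the hunk)
                default: out.append(c);
            }
        }
        return out.toString();
    }

    public static void main(String[] args) {
        String a = "Müller, Bill";
        Set<String> synonyms = new LinkedHashSet<String>();
        synonyms.add(a);
        String b = replaceUmlaut(a);
        if (!b.equals(a)) {
            synonyms.add(foldToAscii(b));   // "Mueller, Bill"
        }
        synonyms.add(foldToAscii(a));       // "Muller, Bill"
        System.out.println(synonyms);
    }
}
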
@@ -12,53 +12,45 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;


public class TestAuthorTransliterationFilter extends BaseTokenStreamTestCase {

final class TestFilter extends TokenFilter {
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

public TestFilter(TokenStream input) {
super(input);
}

public boolean incrementToken() throws IOException {
if (!input.incrementToken()) return false;
typeAtt.setType(AuthorUtils.AUTHOR_INPUT);
return true;
if (!input.incrementToken())
return false;
typeAtt.setType(AuthorUtils.AUTHOR_INPUT);
return true;
}
}

public void testAuthorSynonyms() throws Exception {

checkIt("Müller, Bill", "Müller, Bill", "Mueller, Bill", "Muller, Bill");
checkIt("Peißker, L", "Peißker, L", "Peissker, L");



}

public void testAccents() throws Exception {
checkIt("Jeřábková, Tereza", "Jeřábková, Tereza", "Jerhaebkovae, Tereza", "Jerabkova, Tereza");
checkIt("Dupré", "Dupré", "Dupree", "Dupre");
checkIt("Duprè", "Duprè", "Dupre", "Duprè"); // Dupre\\xcc\\x80
checkIt("\u0141", "Ł", "L");
// System.out.println("\u0141");
// System.out.println("\u0308E");
// System.out.println("\u030aA");
// System.out.println("\u0301E");
// System.out.println("\u030cH");
// //checkIt("\u0308E", "̈E");
// checkIt("Mendigutıa", "Mendigutia");
// checkIt("\u030aA", "\u030aA", "A");
// checkIt("\u0301E", "E");
// checkIt("\u030cH", "H");

}

checkIt("Jeřábková, Tereza", "Jeřábková, Tereza", "Jerabkova, Tereza");
checkIt("Dupré", "Dupré", "Dupre,");
checkIt("Duprè", "Duprè", "Dupre,"); // Dupre\\xcc\\x80
checkIt("\u0141", "Ł", "L,");
checkIt("Mendigutıa", "Mendigutıa", "Mendigutia,");

}

private void checkIt(String input, String... expected) throws Exception {
Reader reader = new StringReader(input);
Tokenizer tokenizer = new KeywordTokenizer();
tokenizer.setReader(reader);
AuthorTransliterationFactory factory = new AuthorTransliterationFactory(new HashMap<String,String>());
TokenStream stream = factory.create(new TestFilter(tokenizer));
assertTokenStreamContents(stream, expected);
Reader reader = new StringReader(input);
Tokenizer tokenizer = new KeywordTokenizer();
tokenizer.setReader(reader);
AuthorTransliterationFactory factory = new AuthorTransliterationFactory(new HashMap<String, String>());
TokenStream stream = factory.create(new TestFilter(tokenizer));
assertTokenStreamContents(stream, expected);
}
}
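
For readers unfamiliar with BaseTokenStreamTestCase: assertTokenStreamContents consumes the stream and compares each emitted term against the expected strings (and, when asked, other attributes and proper end-of-stream behaviour). A rough manual equivalent of the term check, sketch only and not part of the commit:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCheck {
    // Collect every term text emitted by a TokenStream.
    static List<String> terms(TokenStream stream) throws IOException {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        List<String> out = new ArrayList<String>();
        stream.reset();
        while (stream.incrementToken()) {
            out.add(termAtt.toString());
        }
        stream.end();
        stream.close();
        return out;
    }
}

Inside a test such as checkIt above, something like assertEquals(Arrays.asList(expected), terms(stream)) would verify the same term sequence.
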
0 comments on commit 22269ef