From 11bf32bbb94dc4d76dca3e15c67101adbe60fb42 Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Mon, 14 Mar 2022 21:38:53 -0400 Subject: [PATCH] feat: reworked old russing transliteration rules --- .classpath | 1 + .../solr/analysis/author/AuthorUtils.java | 459 ++++++++++-------- .../author/TestAdsabsTypeAuthorParsing.java | 12 +- .../TestAuthorTransliterationFilter.java | 2 +- .../solr/analysis/author/TestAuthorUtils.java | 71 ++- 5 files changed, 321 insertions(+), 224 deletions(-) diff --git a/.classpath b/.classpath index 9ab31636b..9fc78d881 100644 --- a/.classpath +++ b/.classpath @@ -188,5 +188,6 @@ + diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java index 41b9d111e..0c88b5794 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java @@ -9,16 +9,12 @@ import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; - -import org.jython.JythonObjectFactory; -import org.jython.monty.interfaces.JythonNameParser; - -import java.text.*; +import org.apache.commons.lang3.StringUtils; import static net.gcardone.junidecode.Junidecode.unidecode; public class AuthorUtils { - + static final Trie trie = buildTrie(); static final NameParser nameParser = new NameParser(); public static final String AUTHOR_QUERY_VARIANT = "AUTHOR_QUERY_VARIANT"; @@ -128,6 +124,7 @@ public static ArrayList getAsciiTransliteratedVariants(String a) { // handle russian name stuff HashSet transRus = transliterateRussianNames(synonyms ); synonyms.addAll(transRus); + synonyms.addAll(translitRussianApostrophes(synonyms.iterator())); // apostrophes are now preserved in the index // so we need to generate translits for those @@ -203,226 +200,296 @@ public static String[] splitName(String name) { return name.split(" "); } } - - - - // XXX: this doesn't look right to me, the fifth step gets (possibly) - // 5 times more items than the first step - private static HashSet transliterateRussianNames(Set in) { - HashSet synonyms = new HashSet(); - for (String s : in) { - HashSet syn = new HashSet(); - syn.add(s); - syn.addAll(translitRussianApostrophes(syn.iterator())); - syn.addAll(translitRussianLastNames1(syn.iterator())); - syn.addAll(translitRussianLastNames2(syn.iterator())); - syn.addAll(translitRussianLastNames3(syn.iterator())); - syn.addAll(translitRussianLastNames4(syn.iterator())); - syn.addAll(translitRussianLastNames5(syn.iterator())); - syn.addAll(translitRussianFirstNames(syn.iterator())); - synonyms.addAll(syn); + + + /** + * Build efficient data structure for searching suffixes + * + */ + private static Trie buildTrie() { + ArrayList patterns = new ArrayList(); + + /* russian last names I: + * [^IJY]EV$ => IEV$ == YEV$ == JEV$ + * [^IJY]EVA$ => IEVA$ == YEVA$ == JEVA$ + */ + patterns.add(new Resolution(new String[]{"ev,", "iev,", "yev,", "jev,"})); + patterns.add(new Resolution(new String[]{"eva,", "ieva,", "yeva,", "jeva,"})); + + + /* russian last names II: + * ([NRBO])IA$ == $1IIA$ == $1IYA$ + */ + patterns.add(new Resolution(new String[]{"ia,", "iia,", "iya,"}, "nrbo")); + + /* russian last names III: + * ([DHKLMNPSZ])IAN$ == $1YAN$ == $1JAN$ + */ + patterns.add(new Resolution(new String[]{"ian,", "yan,", "jan,"}, "dhklmnpsz")); + + /* russian last names IV: + * AIA$ == AYA$ == AJA$ + */ + + patterns.add(new Resolution(new String[]{"aia,", "aya,", "aja,"})); + + /* russian last names V: + * KI$ == KII$ == KIJ$ == KIY$ = KYI$ + * VI$ == VII$ == VIJ$ == VIY$ = VYI$ + * first transform [KVH]I into [KVH]II + */ + patterns.add(new Resolution(new String[]{"ki,", "kii,", "kij,", "kiy,", "kyi,"}, "dhklmnpsz")); + + + /* russian first names + * ^IU == ^YU + * ^IA == ^YA + * + * The only detail is that the pattern must be reversed (because we normally search in + * a reversed version of a name; and comma is missing) + */ + patterns.add(new Resolution(new String[]{"ui", "uy"})); + patterns.add(new Resolution(new String[]{"ai", "ay"})); + + String reverse; + Trie trie = new Trie(); + + for (Resolution resolution: patterns) { + for (String s: resolution.suffixes) { + reverse = StringUtils.reverse(s); + trie.insert(reverse, resolution); + } } - return synonyms; + return trie; + } - /* - * take care of russian apostrophes: - * 'E => E == IE == YE - * note that we do not index 'E since the search - * engine simply strips all apostrophes - */ - private static Pattern p0 = Pattern.compile("(?<=\\w{2})'(?=[Ee])"); - private static HashSet translitRussianApostrophes(Iterator itr) { - HashSet syn = new HashSet(); - - String x; - while (itr.hasNext()) { - x = itr.next(); - Matcher m = p0.matcher(x); - if (m.find()) { - if (x.charAt(m.end()) == 'E') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("Y")); - syn.add(m.replaceAll("")); + + static Set transliterateRussianName(String name) { + // always search lowercase + name = name.toLowerCase(); + HashSet out= new HashSet(); + out.add(name); + + String[] parts = splitName(name); + String surname = parts[0]; + StringBuilder first = new StringBuilder(); + int i = 1; + while (i < parts.length) { + if (i > 1) + first.append(" "); + first.append(parts[i]); + i += 1; + } + + String rn = StringUtils.reverse(surname); + Result result = trie.search(rn); + + // first modify surnames (suffixes are unique) + if (result != null) { + Resolution v = result.result; + for (String x: v.transform(surname, result.suffix)) { + if (first.length() > 0) { + out.add(x + " " + first); } else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("y")); - syn.add(m.replaceAll("")); + out.add(x); } } } - //log.debug("apostrophes: " + syn); - return syn; - } - - /* russian last names I: - * [^IJY]EV$ => IEV$ == YEV$ == JEV$ - * [^IJY]EVA$ => IEVA$ == YEVA$ == JEVA$ - */ - private static Pattern p1 = Pattern.compile("(? translitRussianLastNames1(Iterator itr) { - HashSet syn = new HashSet(); - String x; - while (itr.hasNext()) { - x = itr.next(); - Matcher m = p1.matcher(x); - if (m.find()) { - if (x.charAt(m.start()) == 'E') { - syn.add(m.replaceAll("IEV")); - syn.add(m.replaceAll("YEV")); - syn.add(m.replaceAll("JEV")); - } - else { - syn.add(m.replaceAll("iev")); - syn.add(m.replaceAll("yev")); - syn.add(m.replaceAll("jev")); - } - + + // then modify first names (possibly multi-plying output) + String rfn = StringUtils.reverse(first.toString()); + result = trie.search(first.toString()); + if (result != null) { + for (String x: result.result.transform(rfn, result.suffix)) { + x = StringUtils.reverse(x); + for (String o: out) { + parts = splitName(o); + out.add(parts[0] + " " + x); + }; } } - //log.debug("last names I: " + syn); - return syn; + out.remove(name); // remove the original + return out; + } - /* russian last names II: - * ([NRBO])IA$ == $1IIA$ == $1IYA$ + /* + * transliterate all names using Trie search for suffixes */ - private static Pattern p2 = Pattern.compile("(?<=[NRBOnrbo])[Ii](?=[Aa],)"); - private static HashSet translitRussianLastNames2(Iterator itr) { - HashSet syn = new HashSet(); - String x; - while (itr.hasNext()) { - x = itr.next(); - Matcher m = p2.matcher(x); - if (m.find()) { - if (x.charAt(m.start()) == 'I') { - syn.add(m.replaceAll("II")); - syn.add(m.replaceAll("IY")); - } - else { - syn.add(m.replaceAll("ii")); - syn.add(m.replaceAll("iy")); - } + private static HashSet transliterateRussianNames(Set in) { + HashSet synonyms = new HashSet(); + for (String s : in) { + for (String r: transliterateRussianName(s)) { + synonyms.add(r); } } - //log.debug("last names II: " + syn); - return syn; + return synonyms; } - /* russian last names III: - * ([DHKLMNPSZ])IAN$ == $1YAN$ == $1JAN$ + /* + * take care of russian apostrophes: + * 'E => E == IE == YE + * note that we do not index 'E since the search + * engine simply strips all apostrophes */ - private static Pattern p3 = Pattern.compile("(?<=[DHKLMNPSZdhklmnpsz])[IJYijy](?=[Aa][Nn],)"); - private static HashSet translitRussianLastNames3(Iterator itr) { - HashSet syn = new HashSet(); - String x; - while (itr.hasNext()) { - x = itr.next(); - Matcher m = p3.matcher(x); - if (m.find()) { - if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("J")); - syn.add(m.replaceAll("Y")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("j")); - syn.add(m.replaceAll("y")); - } + private static Pattern p0 = Pattern.compile("(?<=\\w{2})\'(?=[Ee])"); + private static Set translitRussianApostrophes(Iterator it) { + Set out = new HashSet(); + String name; + while (it.hasNext()) { + name = it.next(); + if (name.indexOf("'e") >= 1) { + //name = name.replaceAll("'e", "__"); + out.add(name.replaceAll("'e", "ie")); + out.add(name.replaceAll("'e", "ye")); + out.add(name.replaceAll("'e", "e")); } + } - //log.debug("last names III: " + syn); - return syn; + + return out; } - /* russian last names IV: - * AIA$ == AYA$ == AJA$ - */ - private static Pattern p4 = Pattern.compile("(?<=[KNVknv][Aa])[IJYijy](?=[Aa],)"); - private static HashSet translitRussianLastNames4(Iterator itr) { - HashSet syn = new HashSet(); - String x; - while (itr.hasNext()) { - x = itr.next(); - Matcher m = p4.matcher(x); - if (m.find()) { - if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("J")); - syn.add(m.replaceAll("Y")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("j")); - syn.add(m.replaceAll("y")); - } - } - } - //log.debug("last names IV: " + syn); - return syn; + + private static class TrieNode { + private char c; + private HashMap children = new HashMap<>(); + private Resolution leaf = null; + + public TrieNode() {} + + public TrieNode(char c){ + this.c = c; + } + + public HashMap getChildren() { + return children; + } + + public void setChildren(HashMap children) { + this.children = children; + } + + public boolean isLeaf() { + return leaf != null; + } + + public void setLeaf(Resolution res) { + this.leaf = res; + } + + public Resolution getValue() { + return this.leaf; + } } + + private static class Trie { + + private TrieNode root; + + public Trie() { + root = new TrieNode(); + } + + public void insert(String word, Resolution res) { + HashMap children = root.getChildren(); + for(int i = 0; i < word.length(); i++) { + char c = word.charAt(i); + TrieNode node; + if(children.containsKey(c)) { + node = children.get(c); + } else { + node = new TrieNode(c); + children.put(c, node); + } + children = node.getChildren(); + + if(i == word.length() - 1) { + node.setLeaf(res); + } + } + } + + public Result search(String word) { + HashMap children = root.getChildren(); + Resolution lastFound = null; + int lastI = 0; + + TrieNode node = null; + for(int i = 0; i < word.length(); i++) { + char c = word.charAt(i); + if(children.containsKey(c)) { + node = children.get(c); + children = node.getChildren(); + if (node.isLeaf()) { + lastFound = node.getValue(); + lastI = i; + } + } else { + node = null; + break; + } + } + if (lastFound == null) + return null; + return new Result(word.substring(0, lastI+1), lastFound); + } - /* russian last names V: - * KI$ == KII$ == KIJ$ == KIY$ = KYI$ - * VI$ == VII$ == VIJ$ == VIY$ = VYI$ - * first transform [KVH]I into [KVH]II - */ - private static Pattern p5 = Pattern.compile("(?<=[KVkv])[Ii](?=,)"); - private static HashSet translitRussianLastNames5(Iterator itr) { - HashSet syn = new HashSet(); - String x; - while (itr.hasNext()) { - x = itr.next(); - Matcher m = p5.matcher(x); - if (m.find()) { - if (x.charAt(m.start()) == 'I') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("Y")); - syn.add(m.replaceAll("YI")); - syn.add(m.replaceAll("IY")); - syn.add(m.replaceAll("IJ")); - syn.add(m.replaceAll("II")); - } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("y")); - syn.add(m.replaceAll("yi")); - syn.add(m.replaceAll("iy")); - syn.add(m.replaceAll("ij")); - syn.add(m.replaceAll("ii")); - } - } + } + + private static class Result { + private String suffix; + private Resolution result; + + Result(String suffix, Resolution res) { + this.suffix = suffix; + this.result = res; } - //log.debug("last names V: " + syn); - return syn; } - - /* russian first names - * ^IU == ^YU - * ^IA == ^YA - */ - private static Pattern p6 = Pattern.compile("(?<=, )[YIyi](?=[AUau])"); - private static HashSet translitRussianFirstNames(Iterator itr) { - HashSet syn = new HashSet(); - String x; - while (itr.hasNext()) { - x = itr.next(); - Matcher m = p6.matcher(x); - if (m.find()) { - if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'Y') { - syn.add(m.replaceAll("I")); - syn.add(m.replaceAll("Y")); + + private static class Resolution { + private String[] suffixes; + private String mustMatch = ""; + private String mustNotMatch = ""; + Resolution(String[] suffixes) { + this.suffixes = suffixes; + + } + Resolution(String[] suffixes, String mustMatch) { + this.suffixes = suffixes; + this.mustMatch = mustMatch; + + } + Resolution(String[] suffixes, String mustMatch, String mustNotMatch) { + this.suffixes = suffixes; + this.mustMatch = mustMatch; + this.mustNotMatch = mustNotMatch; + } + + List transform(String surname, String key) { + ArrayList out = new ArrayList(); + //out.add(surname); + + String prefix = surname.substring(0, surname.length() - key.length()); + String suffix = surname.substring(surname.length()-key.length()); + int prevChar = surname.length() - key.length() - 1; + + for (String s: suffixes) { + if (suffix.equals(s)) + continue; + if (this.mustMatch.length() > 0 && (prevChar <= 0 || (mustMatch.indexOf(surname.charAt(prevChar)) == -1))) { + continue; } - else { - syn.add(m.replaceAll("i")); - syn.add(m.replaceAll("y")); + if (this.mustNotMatch.length() > 0 && (prevChar <= 0 || (mustNotMatch.indexOf(surname.charAt(prevChar)) > -1))) { + continue; } + out.add(prefix + s); } + + return out; } - //log.debug("first names: " + syn); - return syn; } } diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java index ff3cd8ac6..8af278d96 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java @@ -138,7 +138,7 @@ public static String getSchemaFile() { // automatically harvested variations of author names (collected during indexing) // it will be enriched by the indexing File generatedTransliterations = createTempFile(formatSynonyms(new String[]{ - "wyrzykowsky, l=>wyrzykowski, l;wyrzykowski, ł", + "wyrzykowskij, l=>wyrzykowski, l;wyrzykowski, ł", "ADAMCuk, m => ADAMČuk, m", "ADAMCZuk, m => ADAMČuk, m", //"ADAMCHuk, m K=> ADAMČuk, m K", => deactivated for test purposes, see , <1> <2> use case @@ -373,7 +373,11 @@ public void xtestX() throws Exception { public void testAuthorParsingUseCases() throws Exception { testAuthorQuery("\"krivodubski, v\"", - "", + "krivodubski, | krivodubski, v | krivodubski, v* | krivodubskii, | krivodubskii, v | krivodubskii, v* | krivodubskij, | krivodubskij, v | krivodubskij, v* | krivodubskiy, | krivodubskiy, v | krivodubskiy, v* | krivodubskyi, | krivodubskyi, v | krivodubskyi, v*", + "//*[@numFound='0']" + ); + testAuthorQuery("\"krivodubskij, v\"", + "krivodubski, | krivodubski, v | krivodubski, v* | krivodubskii, | krivodubskii, v | krivodubskii, v* | krivodubskij, | krivodubskij, v | krivodubskij, v* | krivodubskiy, | krivodubskiy, v | krivodubskiy, v* | krivodubskyi, | krivodubskyi, v | krivodubskyi, v*", "//*[@numFound='0']" ); @@ -392,8 +396,8 @@ public void testAuthorParsingUseCases() throws Exception { "//*[@numFound='11']"); // multiple synonyms in the file are separated with semicolon - testAuthorQuery("\"wyrzykowsky, l\"", - "wyrzykowski, | wyrzykowski, l | wyrzykowski, l* | wyrzykowski, ł | wyrzykowski, ł* | wyrzykowskii, | wyrzykowskii, l | wyrzykowskii, l* | wyrzykowskii, ł | wyrzykowskii, ł* | wyrzykowskij, | wyrzykowskij, l | wyrzykowskij, l* | wyrzykowskij, ł | wyrzykowskij, ł* | wyrzykowskiy, | wyrzykowskiy, l | wyrzykowskiy, l* | wyrzykowskiy, ł | wyrzykowskiy, ł* | wyrzykowsky, | wyrzykowsky, l | wyrzykowsky, l* | wyrzykowsky, ł | wyrzykowsky, ł* | wyrzykowskyi, | wyrzykowskyi, l | wyrzykowskyi, l* | wyrzykowskyi, ł | wyrzykowskyi, ł*", + testAuthorQuery("\"wyrzykowskij, l\"", + "wyrzykowski, | wyrzykowski, l | wyrzykowski, l* | wyrzykowski, ł | wyrzykowski, ł* | wyrzykowskii, | wyrzykowskii, l | wyrzykowskii, l* | wyrzykowskii, ł | wyrzykowskii, ł* | wyrzykowskij, | wyrzykowskij, l | wyrzykowskij, l* | wyrzykowskij, ł | wyrzykowskij, ł* | wyrzykowskiy, | wyrzykowskiy, l | wyrzykowskiy, l* | wyrzykowskiy, ł | wyrzykowskiy, ł* | wyrzykowskyi, | wyrzykowskyi, l | wyrzykowskyi, l* | wyrzykowskyi, ł | wyrzykowskyi, ł*", "//*[@numFound='1']"); // multiple names diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java index 926d349be..fba24567c 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java @@ -42,7 +42,7 @@ public void testAccents() throws Exception { checkIt("Duprè", "Duprè", "Dupre,"); // Dupre\\xcc\\x80 checkIt("\u0141", "Ł", "L,"); checkIt("Mendigutıa", "Mendigutıa", "Mendigutia,"); - checkIt("krivodubski, v", "krivodubski, v", "krivodubskyi, v", "krivodubsky, v", "krivodubskij, v", "krivodubskiy, v", "krivodubskii, v" ); + checkIt("krivodubski, v", "krivodubski, v", "krivodubskyi, v", "krivodubskiy, v", "krivodubskij, v", "krivodubskii, v" ); } diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java index 10f124e35..3b8d57be7 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java @@ -67,28 +67,53 @@ public void testTransliterate() { check("guer\u00E7o, r", "guerco, r"); // guerço, } - public void testTransliterations() { - check("FOO'EYE, BAR", "FOOEYE, BAR", "FOOYEYE, BAR", "FOOIEYE, BAR"); - check("FOOEV, BAR", "FOOYEV, BAR", "FOOJEV, BAR", "FOOIEV, BAR"); - check("Fooev, BAR", "Fooyev, BAR", "Foojev, BAR", "Fooiev, BAR"); - check("FOODJAN, BAR", "FOODYAN, BAR", "FOODIAN, BAR"); - check("Fookaya, BAR", "Fookaja, BAR", "Fookaia, BAR"); - check("FOOKI, BAR", "FOOKYI, BAR", "FOOKII, BAR", "FOOKY, BAR", "FOOKIY, BAR", "FOOKIJ, BAR"); - check("FOOVI, BAR", "FOOVYI, BAR", "FOOVII, BAR", "FOOVY, BAR", "FOOVIY, BAR", "FOOVIJ, BAR"); - check("FOO, YURI", "FOO, IURI"); - check("FOO, IAGNI", "FOO, YAGNI"); - check("krivodubski, v", "krivodubskii, v", "krivodubskij, v", "krivodubskiy, v", "krivodubsky, v", "krivodubskyi, v"); + public void testRussian() { + + // must work in any direction + check("krivodubski, v", "krivodubskii, v", "krivodubskij, v", "krivodubskiy, v", "krivodubskyi, v"); + check("krivodubskij, v", "krivodubskii, v", "krivodubski, v", "krivodubskiy, v", "krivodubskyi, v"); + check("krivodubskiy, v", "krivodubski, v", "krivodubskii, v", "krivodubskij, v", "krivodubskyi, v"); + check("krivodubskyi, v", "krivodubskii, v", "krivodubskij, v", "krivodubskiy, v", "krivodubski, v"); + + // suffix -ki wont be matched because it is not preceded by selected consonant + check("peki, v"); + + // this one is + check("pelki, v", "pelkii, v", "pelkij, v", "pelkiy, v", "pelkyi, v"); + + // similar (but different suffixes) + check("anajev, z", "anayev, z", "anaiev, z", "anaev, z"); + check("anaev, z", "anayev, z", "anaiev, z", "anajev, z"); + + check("tarzjan,", "tarzian,", "tarzyan,"); + check("tarzyan,", "tarzian,", "tarzjan,"); + + check("tarjan,"); + check("taryan,"); + + check("fookaya,", "fookaja,", "fookaia,"); + + // woman's surname + check("sarnieva,", "sarneva,", "sarnjeva,", "sarnyeva,"); + check("sarneva,", "sarnjeva,", "sarnyeva,", "sarnieva,"); + + //old pattern: looks wrong, because k is preceded by o + //check("FOOKI, BAR", "FOOKYI, BAR", "FOOKII, BAR", "FOOKY, BAR", "FOOKIY, BAR", "FOOKIJ, BAR"); + //what we do now: + check("fooki,"); + + + // first names + check("gagarin, yuri", "gagarin, iuri"); + check("gagarin, iuri", "gagarin, yuri"); + + // german modifications + check("foo, BÄR", "foo, BAER", "foo, BAR"); + + // long 'E -> E, IE, YE (seem wrong still) + check("f'edorov", "fedorov,", "fiedorov,", "fyedorov,"); } - public void testTransRussianNames() { - check("FOOVI, YURI", "FOOVI, IURI", "FOOVII, IURI", "FOOVII, YURI", "FOOVIJ, IURI", "FOOVIJ, YURI", - "FOOVIY, IURI", "FOOVIY, YURI", "FOOVY, IURI", "FOOVY, YURI", "FOOVYI, IURI", "FOOVYI, YURI"); - } - - public void testGenSynonyms() { - check("FOO'EYE, BÄR", "FOO'EYE, BAER", "FOO'EYE, BAR", "FOOEYE, BAER", "FOOEYE, BAR", "FOOEYE, BÄR", - "FOOIEYE, BAER", "FOOIEYE, BAR", "FOOIEYE, BÄR", "FOOYEYE, BAER", "FOOYEYE, BAR", "FOOYEYE, BÄR"); - } private void check(String a, String... expected) { ArrayList actual = AuthorUtils.getAsciiTransliteratedVariants(a); @@ -99,9 +124,9 @@ private void check(String a, String... expected) { actual.toArray(ac); Arrays.sort(ac); - // System.out.println(a); - // System.out.println(Arrays.asList(expected)); - // System.out.println(Arrays.asList(ac)); +// System.out.println(a); +// System.out.println(Arrays.asList(expected)); +// System.out.println(Arrays.asList(ac)); assertEquals(Arrays.asList(expected), Arrays.asList(ac));