From 11bf32bbb94dc4d76dca3e15c67101adbe60fb42 Mon Sep 17 00:00:00 2001
From: Roman Chyla <roman.chyla@gmail.com>
Date: Mon, 14 Mar 2022 21:38:53 -0400
Subject: [PATCH] feat: reworked old russian transliteration rules

---
 .classpath                                    |   1 +
 .../solr/analysis/author/AuthorUtils.java     | 459 ++++++++++--------
 .../author/TestAdsabsTypeAuthorParsing.java   |  12 +-
 .../TestAuthorTransliterationFilter.java      |   2 +-
 .../solr/analysis/author/TestAuthorUtils.java |  71 ++-
 5 files changed, 321 insertions(+), 224 deletions(-)
diff --git a/.classpath b/.classpath
index 9ab31636b..9fc78d881 100644
--- a/.classpath
+++ b/.classpath
@@ -188,5 +188,6 @@
 	<classpathentry kind="lib" path="build/solrjars-extracted/jetty/ext/log4j-api-2.15.0.jar"/>
 	<classpathentry kind="lib" path="build/solrjars-extracted/jetty/ext/log4j-core-2.15.0.jar"/>
 	<classpathentry kind="lib" path="build/solrjars-extracted/jetty/ext/log4j-slf4j-impl-2.15.0.jar"/>
+	<classpathentry kind="lib" path="build/solr-download/apache-solr-77/solr/contrib/extraction/lib/commons-collections4-4.2.jar"/>
 	<classpathentry kind="output" path="bin"/>
 </classpath>
diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java
index 41b9d111e..0c88b5794 100644
--- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java
+++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java
@@ -9,16 +9,12 @@
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
-
-import org.jython.JythonObjectFactory;
-import org.jython.monty.interfaces.JythonNameParser;
-
-import java.text.*;
+import org.apache.commons.lang3.StringUtils;
 import static net.gcardone.junidecode.Junidecode.unidecode;
 
 public class AuthorUtils {
 
-
+	static final Trie trie = buildTrie();
 	static final NameParser nameParser = new NameParser(); 
 
 	public static final String AUTHOR_QUERY_VARIANT = "AUTHOR_QUERY_VARIANT";
@@ -128,6 +124,7 @@ public static ArrayList<String> getAsciiTransliteratedVariants(String a) {
 		// handle russian name stuff
 		HashSet<String> transRus = transliterateRussianNames(synonyms );
 		synonyms.addAll(transRus);
+		synonyms.addAll(translitRussianApostrophes(synonyms.iterator()));
 
 		// apostrophes are now preserved in the index
 		// so we need to generate translits for those
@@ -203,226 +200,296 @@ public static String[] splitName(String name) {
 			return name.split(" ");
 		}
 	}
-
-
-
-	// XXX: this doesn't look right to me, the fifth step gets (possibly)
-	// 5 times more items than the first step
-	private static HashSet<String> transliterateRussianNames(Set<String> in) {
-		HashSet<String> synonyms = new HashSet<String>();
-		for (String s : in) {
-			HashSet<String> syn = new HashSet<String>();
-			syn.add(s);
-			syn.addAll(translitRussianApostrophes(syn.iterator()));
-			syn.addAll(translitRussianLastNames1(syn.iterator()));
-			syn.addAll(translitRussianLastNames2(syn.iterator()));
-			syn.addAll(translitRussianLastNames3(syn.iterator()));
-			syn.addAll(translitRussianLastNames4(syn.iterator()));
-			syn.addAll(translitRussianLastNames5(syn.iterator()));
-			syn.addAll(translitRussianFirstNames(syn.iterator()));
-			synonyms.addAll(syn);
+	
+	
+	/**
+	 * Build efficient data structure for searching suffixes
+	 * 
+	 */
+	private static Trie buildTrie() {
+		ArrayList<Resolution> patterns = new ArrayList<Resolution>();
+		
+		/* russian last names I:
+		 * [^IJY]EV$ => IEV$ == YEV$ == JEV$ 
+		 * [^IJY]EVA$ => IEVA$ == YEVA$ == JEVA$ 
+		 */
+		patterns.add(new Resolution(new String[]{"ev,", "iev,", "yev,", "jev,"}));
+		patterns.add(new Resolution(new String[]{"eva,", "ieva,", "yeva,", "jeva,"}));
+		
+		
+		/* russian last names II:
+		 * ([NRBO])IA$ == $1IIA$ == $1IYA$
+		 */
+		patterns.add(new Resolution(new String[]{"ia,", "iia,", "iya,"}, "nrbo"));
+		
+		/* russian last names III:
+		 * ([DHKLMNPSZ])IAN$ == $1YAN$ == $1JAN$ 
+		 */
+		patterns.add(new Resolution(new String[]{"ian,", "yan,", "jan,"}, "dhklmnpsz"));
+		
+		/* russian last names IV:
+		 * AIA$ == AYA$ == AJA$ 
+		 */
+		
+		patterns.add(new Resolution(new String[]{"aia,", "aya,", "aja,"}));
+		
+		/* russian last names V:
+		 * KI$ == KII$ == KIJ$ == KIY$ = KYI$
+		 * VI$ == VII$ == VIJ$ == VIY$ = VYI$
+		 * first transform [KVH]I into [KVH]II
+		 */
+		patterns.add(new Resolution(new String[]{"ki,", "kii,", "kij,", "kiy,", "kyi,"}, "dhklmnpsz"));
+		
+		
+		/* russian first names
+		 * ^IU == ^YU
+		 * ^IA == ^YA
+		 * 
+		 * Note that these patterns are written reversed and without the trailing comma: every
+		 * pattern is reversed before it is inserted into the trie, so these match the start of a first name
+		 */
+		patterns.add(new Resolution(new String[]{"ui", "uy"}));
+		patterns.add(new Resolution(new String[]{"ai", "ay"}));
+		
+		String reverse;
+		Trie trie = new Trie();
+		
+		for (Resolution resolution: patterns) {
+			for (String s: resolution.suffixes) {
+				reverse = StringUtils.reverse(s);
+				trie.insert(reverse,  resolution);
+			}			
 		}
-		return synonyms;
+		return trie;
+		
 	}
 
-	/*
-	 * take care of russian apostrophes:
-	 * 'E => E == IE == YE
-	 * note that we do not index 'E since the search
-	 * engine simply strips all apostrophes
-	 */
-	private static Pattern p0 = Pattern.compile("(?<=\\w{2})'(?=[Ee])");
-	private static HashSet<String> translitRussianApostrophes(Iterator<String> itr) {
-		HashSet<String> syn = new HashSet<String>();
-
-		String x;
-		while (itr.hasNext()) {
-			x = itr.next();
-			Matcher m = p0.matcher(x);
-			if (m.find()) {
-				if (x.charAt(m.end()) == 'E') {
-					syn.add(m.replaceAll("I"));
-					syn.add(m.replaceAll("Y"));
-					syn.add(m.replaceAll(""));			    
+	
+	static Set<String> transliterateRussianName(String name) {
+		// always search lowercase
+		name = name.toLowerCase();
+		HashSet<String> out= new HashSet<String>();
+		out.add(name);
+		
+		String[] parts = splitName(name);
+		String surname = parts[0];
+		StringBuilder first = new StringBuilder();
+		int i = 1;
+		while (i < parts.length) {
+			if (i > 1)
+				first.append(" ");
+			first.append(parts[i]);
+			i += 1;
+		}
+		
+		String rn = StringUtils.reverse(surname);
+		Result result = trie.search(rn);
+		
+		// first modify surnames (suffixes are unique)
+		if (result != null) {
+			Resolution v = result.result;
+			for (String x: v.transform(surname, result.suffix)) {
+				if (first.length() > 0) {
+					out.add(x + " " + first);					
 				}
 				else {
-					syn.add(m.replaceAll("i"));
-					syn.add(m.replaceAll("y"));
-					syn.add(m.replaceAll(""));
+					out.add(x);
 				}
 			}
 		}
-		//log.debug("apostrophes: " + syn);
-		return syn;
-	}
-
-	/* russian last names I:
-	 * [^IJY]EV$ => IEV$ == YEV$ == JEV$ 
-	 * [^IJY]EVA$ => IEVA$ == YEVA$ == JEVA$ 
-	 */
-	private static Pattern p1 = Pattern.compile("(?<![IJYijy])[Ee][Vv](?=[aA]?,)");
-	private static HashSet<String> translitRussianLastNames1(Iterator<String> itr) {
-		HashSet<String> syn = new HashSet<String>();
-		String x;
-		while (itr.hasNext()) {
-			x = itr.next();
-			Matcher m = p1.matcher(x);
-			if (m.find()) {
-				if (x.charAt(m.start()) == 'E') {
-					syn.add(m.replaceAll("IEV"));
-					syn.add(m.replaceAll("YEV"));
-					syn.add(m.replaceAll("JEV"));
-				}
-				else {
-					syn.add(m.replaceAll("iev"));
-					syn.add(m.replaceAll("yev"));
-					syn.add(m.replaceAll("jev"));
-				}
-
+		
+		// then modify first names (possibly multiplying the output)
+		String rfn = StringUtils.reverse(first.toString());
+		result = trie.search(first.toString());
+		if (result != null) {
+			for (String x: result.result.transform(rfn, result.suffix)) {
+				x = StringUtils.reverse(x);
+				for (String o: out) {
+					parts = splitName(o);
+					out.add(parts[0] + " " + x);
+				};
 			}
 		}
-		//log.debug("last names I: " + syn);
-		return syn;
+		out.remove(name); // remove the original
+		return out;
+		
 	}
 
-	/* russian last names II:
-	 * ([NRBO])IA$ == $1IIA$ == $1IYA$
+	/*
+	 * transliterate all names using Trie search for suffixes
 	 */
-	private static Pattern p2 = Pattern.compile("(?<=[NRBOnrbo])[Ii](?=[Aa],)");
-	private static HashSet<String> translitRussianLastNames2(Iterator<String> itr) {
-		HashSet<String> syn = new HashSet<String>();
-		String x;
-		while (itr.hasNext()) {
-			x = itr.next();
-			Matcher m = p2.matcher(x);
-			if (m.find()) {
-				if (x.charAt(m.start()) == 'I') {
-					syn.add(m.replaceAll("II"));
-					syn.add(m.replaceAll("IY"));			    
-				}
-				else {
-					syn.add(m.replaceAll("ii"));
-					syn.add(m.replaceAll("iy"));
-				}
+	private static HashSet<String> transliterateRussianNames(Set<String> in) {
+		HashSet<String> synonyms = new HashSet<String>();
+		for (String s : in) {
+			for (String r: transliterateRussianName(s)) {
+				synonyms.add(r);
 			}
 		}
-		//log.debug("last names II: " + syn);
-		return syn;
+		return synonyms;
 	}
 
-	/* russian last names III:
-	 * ([DHKLMNPSZ])IAN$ == $1YAN$ == $1JAN$ 
+	/*
+	 * take care of russian apostrophes:
+	 * 'E => E == IE == YE
+	 * note that we do not index 'E since the search
+	 * engine simply strips all apostrophes
 	 */
-	private static Pattern p3 = Pattern.compile("(?<=[DHKLMNPSZdhklmnpsz])[IJYijy](?=[Aa][Nn],)");
-	private static HashSet<String> translitRussianLastNames3(Iterator<String> itr) {
-		HashSet<String> syn = new HashSet<String>();
-		String x;
-		while (itr.hasNext()) {
-			x = itr.next();
-			Matcher m = p3.matcher(x);
-			if (m.find()) {
-				if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') {
-					syn.add(m.replaceAll("I"));
-					syn.add(m.replaceAll("J"));
-					syn.add(m.replaceAll("Y"));			    
-				}
-				else {
-					syn.add(m.replaceAll("i"));
-					syn.add(m.replaceAll("j"));
-					syn.add(m.replaceAll("y"));
-				}
+	private static Pattern p0 = Pattern.compile("(?<=\\w{2})\'(?=[Ee])");
+	private static Set<String> translitRussianApostrophes(Iterator<String> it) {
+		Set<String> out = new HashSet<String>();
+		String name;
+		while (it.hasNext()) {
+			name = it.next();
+			if (name.indexOf("'e") >= 1) {
+				//name = name.replaceAll("'e", "__");
+				out.add(name.replaceAll("'e", "ie"));
+				out.add(name.replaceAll("'e", "ye"));
+				out.add(name.replaceAll("'e", "e"));
 			}
+			
 		}
-		//log.debug("last names III: " + syn);
-		return syn;
+		
+		return out;
 	}
 
-	/* russian last names IV:
-	 * AIA$ == AYA$ == AJA$ 
-	 */
-	private static Pattern p4 = Pattern.compile("(?<=[KNVknv][Aa])[IJYijy](?=[Aa],)");
-	private static HashSet<String> translitRussianLastNames4(Iterator<String> itr) {
-		HashSet<String> syn = new HashSet<String>();
-		String x;
-		while (itr.hasNext()) {
-			x = itr.next();
-			Matcher m = p4.matcher(x);
-			if (m.find()) {
-				if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'J' || x.charAt(m.start()) == 'Y') {
-					syn.add(m.replaceAll("I"));
-					syn.add(m.replaceAll("J"));
-					syn.add(m.replaceAll("Y"));			    
-				}
-				else {
-					syn.add(m.replaceAll("i"));
-					syn.add(m.replaceAll("j"));
-					syn.add(m.replaceAll("y"));
-				}
-			}
-		}
-		//log.debug("last names IV: " + syn);
-		return syn;
+	
+	private static class TrieNode {
+	    private char c;
+	    private HashMap<Character, TrieNode> children = new HashMap<>();
+	    private Resolution leaf = null;
+
+	    public TrieNode() {}
+
+	    public TrieNode(char c){
+	        this.c = c;
+	    }
+
+	    public HashMap<Character, TrieNode> getChildren() {
+	        return children;
+	    }
+
+	    public void setChildren(HashMap<Character, TrieNode> children) {
+	        this.children = children;
+	    }
+
+	    public boolean isLeaf() {
+	        return leaf != null;
+	    }
+
+	    public void setLeaf(Resolution res) {
+	        this.leaf = res;
+	    }
+	    
+	    public Resolution getValue() {
+	    	return this.leaf;
+	    }
 	}
+	
+	private static class Trie {
+
+	    private TrieNode root;
+
+	    public Trie() {
+	        root = new TrieNode();
+	    }
+
+	    public void insert(String word, Resolution res) {
+	        HashMap<Character, TrieNode> children = root.getChildren();
+	        for(int i = 0; i < word.length(); i++) {
+	            char c = word.charAt(i);
+	            TrieNode node;
+	            if(children.containsKey(c)) {
+	                node = children.get(c);
+	            } else { 
+	                node = new TrieNode(c);
+	                children.put(c, node);
+	            }
+	            children = node.getChildren();
+
+	            if(i == word.length() - 1) {
+	                node.setLeaf(res);
+	            }
+	        }
+	    }
+
+	    public Result search(String word) {
+	        HashMap<Character, TrieNode> children = root.getChildren();
+	        Resolution lastFound = null;
+	        int lastI = 0;
+	        
+	        TrieNode node = null;
+	        for(int i = 0; i < word.length(); i++) {
+	            char c = word.charAt(i);
+	            if(children.containsKey(c)) {
+	                node = children.get(c);
+	                children = node.getChildren();
+	                if (node.isLeaf()) {
+	                	lastFound = node.getValue();
+	                	lastI = i;
+	                }
+	            } else { 
+	                node = null;
+	                break;
+	            }
+	        }
+	        if (lastFound == null)
+	        	return null;
+	        return new Result(word.substring(0, lastI+1), lastFound);
+	    }
 
-	/* russian last names V:
-	 * KI$ == KII$ == KIJ$ == KIY$ = KYI$
-	 * VI$ == VII$ == VIJ$ == VIY$ = VYI$
-	 * first transform [KVH]I into [KVH]II
-	 */
-	private static Pattern p5 = Pattern.compile("(?<=[KVkv])[Ii](?=,)");
-	private static HashSet<String> translitRussianLastNames5(Iterator<String> itr) {
-		HashSet<String> syn = new HashSet<String>();
-		String x;
-		while (itr.hasNext()) {
-			x = itr.next();
-			Matcher m = p5.matcher(x);
-			if (m.find()) {
-				if (x.charAt(m.start()) == 'I') {
-					syn.add(m.replaceAll("I"));
-					syn.add(m.replaceAll("Y"));
-					syn.add(m.replaceAll("YI"));
-					syn.add(m.replaceAll("IY"));
-					syn.add(m.replaceAll("IJ"));
-					syn.add(m.replaceAll("II"));			    
-				}
-				else {
-					syn.add(m.replaceAll("i"));
-					syn.add(m.replaceAll("y"));
-					syn.add(m.replaceAll("yi"));
-					syn.add(m.replaceAll("iy"));
-					syn.add(m.replaceAll("ij"));
-					syn.add(m.replaceAll("ii"));
-				}
-			}
+	}
+	
+	private static class Result { 
+	private String suffix;
+	private Resolution result;
+	
+		Result(String suffix, Resolution res) {
+			this.suffix = suffix;
+			this.result = res;
 		}
-		//log.debug("last names V: " + syn);
-		return syn;
 	}
-
-	/* russian first names
-	 * ^IU == ^YU
-	 * ^IA == ^YA
-	 */
-	private static Pattern p6 = Pattern.compile("(?<=, )[YIyi](?=[AUau])");
-	private static HashSet<String> translitRussianFirstNames(Iterator<String> itr) {
-		HashSet<String> syn = new HashSet<String>();
-		String x;
-		while (itr.hasNext()) {
-			x = itr.next();
-			Matcher m = p6.matcher(x);
-			if (m.find()) {
-				if (x.charAt(m.start()) == 'I' || x.charAt(m.start()) == 'Y') {
-					syn.add(m.replaceAll("I"));
-					syn.add(m.replaceAll("Y"));
+	
+	private static class Resolution {
+		private String[] suffixes;
+		private String mustMatch = "";
+		private String mustNotMatch = "";
+		Resolution(String[] suffixes) {
+			this.suffixes = suffixes;
+			
+		}
+		Resolution(String[] suffixes, String mustMatch) {
+			this.suffixes = suffixes;
+			this.mustMatch = mustMatch;
+			
+		}
+		Resolution(String[] suffixes, String mustMatch, String mustNotMatch) {
+			this.suffixes = suffixes;
+			this.mustMatch = mustMatch;
+			this.mustNotMatch = mustNotMatch;
+		}
+		
+		List<String> transform(String surname, String key) {
+			ArrayList<String> out = new ArrayList<String>();
+			//out.add(surname);
+			
+			String prefix = surname.substring(0, surname.length() - key.length());
+			String suffix = surname.substring(surname.length()-key.length());
+			int prevChar = surname.length() - key.length() - 1;
+			
+			for (String s: suffixes) {
+				if (suffix.equals(s))
+					continue;
+				if (this.mustMatch.length() > 0 && (prevChar <= 0 || (mustMatch.indexOf(surname.charAt(prevChar)) == -1))) {
+					continue;					
 				}
-				else {
-					syn.add(m.replaceAll("i"));
-					syn.add(m.replaceAll("y"));			    
+				if (this.mustNotMatch.length() > 0 && (prevChar <= 0 || (mustNotMatch.indexOf(surname.charAt(prevChar)) > -1))) {
+					continue;					
 				}
+				out.add(prefix + s);
 			}
+			
+			return out;
 		}
-		//log.debug("first names: " + syn);
-		return syn;
 	}
 
 }
diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java
index ff3cd8ac6..8af278d96 100644
--- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java
+++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java
@@ -138,7 +138,7 @@ public static String getSchemaFile() {
       // automatically harvested variations of author names (collected during indexing)
       // it will be enriched by the indexing
       File generatedTransliterations = createTempFile(formatSynonyms(new String[]{
-          "wyrzykowsky, l=>wyrzykowski, l;wyrzykowski, ł",
+          "wyrzykowskij, l=>wyrzykowski, l;wyrzykowski, ł",
           "ADAMCuk, m => ADAMČuk, m",
           "ADAMCZuk, m => ADAMČuk, m",
           //"ADAMCHuk, m K=> ADAMČuk, m K",  => deactivated for test purposes, see <surname>, <1> <2> use case
@@ -373,7 +373,11 @@ public void xtestX() throws Exception {
   public void testAuthorParsingUseCases() throws Exception {
   	
 	testAuthorQuery("\"krivodubski, v\"",
-			"",
+			"krivodubski, | krivodubski, v | krivodubski, v* | krivodubskii, | krivodubskii, v | krivodubskii, v* | krivodubskij, | krivodubskij, v | krivodubskij, v* | krivodubskiy, | krivodubskiy, v | krivodubskiy, v* | krivodubskyi, | krivodubskyi, v | krivodubskyi, v*",
+	        "//*[@numFound='0']"
+	);
+	testAuthorQuery("\"krivodubskij, v\"",
+			"krivodubski, | krivodubski, v | krivodubski, v* | krivodubskii, | krivodubskii, v | krivodubskii, v* | krivodubskij, | krivodubskij, v | krivodubskij, v* | krivodubskiy, | krivodubskiy, v | krivodubskiy, v* | krivodubskyi, | krivodubskyi, v | krivodubskyi, v*",
 	        "//*[@numFound='0']"
 	);
 	  
@@ -392,8 +396,8 @@ public void testAuthorParsingUseCases() throws Exception {
   			"//*[@numFound='11']");
     
     // multiple synonyms in the file are separated with semicolon
-    testAuthorQuery("\"wyrzykowsky, l\"",
-        "wyrzykowski, | wyrzykowski, l | wyrzykowski, l* | wyrzykowski, ł | wyrzykowski, ł* | wyrzykowskii, | wyrzykowskii, l | wyrzykowskii, l* | wyrzykowskii, ł | wyrzykowskii, ł* | wyrzykowskij, | wyrzykowskij, l | wyrzykowskij, l* | wyrzykowskij, ł | wyrzykowskij, ł* | wyrzykowskiy, | wyrzykowskiy, l | wyrzykowskiy, l* | wyrzykowskiy, ł | wyrzykowskiy, ł* | wyrzykowsky, | wyrzykowsky, l | wyrzykowsky, l* | wyrzykowsky, ł | wyrzykowsky, ł* | wyrzykowskyi, | wyrzykowskyi, l | wyrzykowskyi, l* | wyrzykowskyi, ł | wyrzykowskyi, ł*",
+    testAuthorQuery("\"wyrzykowskij, l\"",
+        "wyrzykowski, | wyrzykowski, l | wyrzykowski, l* | wyrzykowski, ł | wyrzykowski, ł* | wyrzykowskii, | wyrzykowskii, l | wyrzykowskii, l* | wyrzykowskii, ł | wyrzykowskii, ł* | wyrzykowskij, | wyrzykowskij, l | wyrzykowskij, l* | wyrzykowskij, ł | wyrzykowskij, ł* | wyrzykowskiy, | wyrzykowskiy, l | wyrzykowskiy, l* | wyrzykowskiy, ł | wyrzykowskiy, ł* | wyrzykowskyi, | wyrzykowskyi, l | wyrzykowskyi, l* | wyrzykowskyi, ł | wyrzykowskyi, ł*",
         "//*[@numFound='1']");
     
     // multiple names
diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java
index 926d349be..fba24567c 100644
--- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java
+++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java
@@ -42,7 +42,7 @@ public void testAccents() throws Exception {
 		checkIt("Duprè", "Duprè", "Dupre,"); // Dupre\\xcc\\x80
 		checkIt("\u0141", "Ł", "L,");
 		checkIt("Mendigutıa", "Mendigutıa", "Mendigutia,");
-		checkIt("krivodubski, v", "krivodubski, v", "krivodubskyi, v", "krivodubsky, v", "krivodubskij, v", "krivodubskiy, v", "krivodubskii, v" );
+		checkIt("krivodubski, v", "krivodubski, v", "krivodubskyi, v", "krivodubskiy, v", "krivodubskij, v", "krivodubskii, v" );
 
 	}
 
diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java
index 10f124e35..3b8d57be7 100644
--- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java
+++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorUtils.java
@@ -67,28 +67,53 @@ public void testTransliterate() {
 		check("guer\u00E7o, r", "guerco, r"); // guerço,
 	}
 
-	public void testTransliterations() {
-		check("FOO'EYE, BAR", "FOOEYE, BAR", "FOOYEYE, BAR", "FOOIEYE, BAR");
-		check("FOOEV, BAR", "FOOYEV, BAR", "FOOJEV, BAR", "FOOIEV, BAR");
-		check("Fooev, BAR", "Fooyev, BAR", "Foojev, BAR", "Fooiev, BAR");
-		check("FOODJAN, BAR", "FOODYAN, BAR", "FOODIAN, BAR");
-		check("Fookaya, BAR", "Fookaja, BAR", "Fookaia, BAR");
-		check("FOOKI, BAR", "FOOKYI, BAR", "FOOKII, BAR", "FOOKY, BAR", "FOOKIY, BAR", "FOOKIJ, BAR");
-		check("FOOVI, BAR", "FOOVYI, BAR", "FOOVII, BAR", "FOOVY, BAR", "FOOVIY, BAR", "FOOVIJ, BAR");
-		check("FOO, YURI", "FOO, IURI");
-		check("FOO, IAGNI", "FOO, YAGNI");
-		check("krivodubski, v", "krivodubskii, v", "krivodubskij, v", "krivodubskiy, v", "krivodubsky, v", "krivodubskyi, v");
+	public void testRussian() {
+
+		// must work in any direction
+		check("krivodubski, v", "krivodubskii, v", "krivodubskij, v", "krivodubskiy, v", "krivodubskyi, v");
+		check("krivodubskij, v", "krivodubskii, v", "krivodubski, v", "krivodubskiy, v", "krivodubskyi, v");
+		check("krivodubskiy, v", "krivodubski, v", "krivodubskii, v", "krivodubskij, v", "krivodubskyi, v");
+		check("krivodubskyi, v", "krivodubskii, v", "krivodubskij, v", "krivodubskiy, v", "krivodubski, v");
+		
+		// suffix -ki won't be matched because it is not preceded by one of the selected consonants
+		check("peki, v");
+		
+		// this one is
+		check("pelki, v", "pelkii, v", "pelkij, v", "pelkiy, v", "pelkyi, v");
+		
+		// similar (but different suffixes)
+		check("anajev, z", "anayev, z", "anaiev, z", "anaev, z");
+		check("anaev, z", "anayev, z", "anaiev, z", "anajev, z");
+		
+		check("tarzjan,", "tarzian,", "tarzyan,");
+		check("tarzyan,", "tarzian,", "tarzjan,");
+		
+		check("tarjan,");
+		check("taryan,");
+		
+		check("fookaya,", "fookaja,", "fookaia,");
+		
+		// woman's surname
+		check("sarnieva,", "sarneva,", "sarnjeva,", "sarnyeva,");
+		check("sarneva,", "sarnjeva,", "sarnyeva,", "sarnieva,");
+		
+		//old pattern: looks wrong, because k is preceded by o
+		//check("FOOKI, BAR", "FOOKYI, BAR", "FOOKII, BAR", "FOOKY, BAR", "FOOKIY, BAR", "FOOKIJ, BAR");
+		//what we do now:
+		check("fooki,");
+		
+		
+		// first names
+		check("gagarin, yuri", "gagarin, iuri");
+		check("gagarin, iuri", "gagarin, yuri");
+		
+		// german modifications
+		check("foo, BÄR", "foo, BAER", "foo, BAR");
+		
+		// long 'E -> E, IE, YE (still seems wrong)
+		check("f'edorov", "fedorov,", "fiedorov,", "fyedorov,");
 	}
 
-	public void testTransRussianNames() {
-		check("FOOVI, YURI", "FOOVI, IURI", "FOOVII, IURI", "FOOVII, YURI", "FOOVIJ, IURI", "FOOVIJ, YURI",
-				"FOOVIY, IURI", "FOOVIY, YURI", "FOOVY, IURI", "FOOVY, YURI", "FOOVYI, IURI", "FOOVYI, YURI");
-	}
-
-	public void testGenSynonyms() {
-		check("FOO'EYE, BÄR", "FOO'EYE, BAER", "FOO'EYE, BAR", "FOOEYE, BAER", "FOOEYE, BAR", "FOOEYE, BÄR",
-				"FOOIEYE, BAER", "FOOIEYE, BAR", "FOOIEYE, BÄR", "FOOYEYE, BAER", "FOOYEYE, BAR", "FOOYEYE, BÄR");
-	}
 
 	private void check(String a, String... expected) {
 		ArrayList<String> actual = AuthorUtils.getAsciiTransliteratedVariants(a);
@@ -99,9 +124,9 @@ private void check(String a, String... expected) {
 		actual.toArray(ac);
 		Arrays.sort(ac);
 
-		// System.out.println(a);
-		// System.out.println(Arrays.asList(expected));
-		// System.out.println(Arrays.asList(ac));
+//		 System.out.println(a);
+//		 System.out.println(Arrays.asList(expected));
+//		 System.out.println(Arrays.asList(ac));
 
 		assertEquals(Arrays.asList(expected), Arrays.asList(ac));