diff --git a/contrib/adsabs/src/java/org/adsabs/solr/analysis/ProcessCuratedAuthorSynonyms.java b/contrib/adsabs/src/java/org/adsabs/solr/analysis/ProcessCuratedAuthorSynonyms.java index cd710f400..3c57ad2b9 100644 --- a/contrib/adsabs/src/java/org/adsabs/solr/analysis/ProcessCuratedAuthorSynonyms.java +++ b/contrib/adsabs/src/java/org/adsabs/solr/analysis/ProcessCuratedAuthorSynonyms.java @@ -6,6 +6,9 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; +import java.util.Set; + import org.apache.solr.analysis.WriteableExplicitSynonymMap; import org.apache.solr.analysis.WriteableSynonymMap; import org.apache.solr.analysis.author.AuthorQueryVariations; @@ -75,9 +78,9 @@ public static HashMap> transformGroup(List group) log.debug("withAutoSynonyms: " + withAutoSynonyms.toString()); // build a map of name -> variations to be used later - final HashMap> variationsMap = new HashMap>(); + final HashMap> variationsMap = new HashMap>(); for (String s : withAutoSynonyms) { - HashMap parsedAuthor = null; + Map parsedAuthor = null; try { parsedAuthor = AuthorUtils.parseAuthor(s); } catch (Exception e) { diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java index 11cc5b33f..b0b655ad8 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorQueryVariations.java @@ -71,9 +71,9 @@ protected static Set generateNameVariations(Map parsedAu generateSynonymVariations(parsedAuthor, variations); // add the variations that are needed only for the query phase - String last = parsedAuthor.get("last"); - String first = parsedAuthor.get("first"); - String middle = parsedAuthor.get("middle"); + String last = parsedAuthor.get("Last"); + String first = parsedAuthor.get("First"); + String middle = parsedAuthor.get("Middle"); if (first != null) { if (middle != null) { @@ -115,7 +115,7 @@ protected static Set generateNameVariations(Map parsedAu * @param authorString name in the natural form * @return map with string mappings */ - public static HashSet getQueryVariations(String authorString) { + public static Set getQueryVariations(String authorString) { Map parsedAuthor = null; parsedAuthor = AuthorUtils.parseAuthor(authorString); @@ -128,12 +128,12 @@ public static HashSet getQueryVariations(String authorString) { return generateSynonymVariations(parsedAuthor, variations); } - public static HashSet generateSynonymVariations(HashMap parsedAuthor) { + public static Set generateSynonymVariations(Map parsedAuthor) { HashSet variations = new LinkedHashSet(); return generateSynonymVariations(parsedAuthor, variations); } - protected static HashSet generateSynonymVariations(Map parsedAuthor, + protected static Set generateSynonymVariations(Map parsedAuthor, Set variations) { String last = parsedAuthor.get("Last"); diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java index bc6c1df28..7ef006f1c 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorUtils.java @@ -114,11 +114,13 @@ public static ArrayList getAsciiTransliteratedVariants(String a) { synonyms.add(a); // downgrade to ascii + synonyms.add(foldToAscii(a)); + + // work around unidecode not always doing what we want String b = replaceUmlaut(a); if (b != a) { synonyms.add(foldToAscii(b)); } - synonyms.add(foldToAscii(a)); // handle russian name stuff HashSet transRus = transliterateRussianNames(synonyms); @@ -143,6 +145,9 @@ private static String replaceUmlaut(String input) { StringBuilder out = new StringBuilder(); for (char c: input.toCharArray()) { switch(c) { + case '\u00fc': + out.append("ue"); + break; case '\u00f6': out.append("oe"); break; diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java index 68ac23823..f7c223beb 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorTransliterationFilter.java @@ -12,53 +12,45 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.BaseTokenStreamTestCase; - public class TestAuthorTransliterationFilter extends BaseTokenStreamTestCase { - + final class TestFilter extends TokenFilter { private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + public TestFilter(TokenStream input) { super(input); } + public boolean incrementToken() throws IOException { - if (!input.incrementToken()) return false; - typeAtt.setType(AuthorUtils.AUTHOR_INPUT); - return true; + if (!input.incrementToken()) + return false; + typeAtt.setType(AuthorUtils.AUTHOR_INPUT); + return true; } } - + public void testAuthorSynonyms() throws Exception { - + checkIt("Müller, Bill", "Müller, Bill", "Mueller, Bill", "Muller, Bill"); checkIt("Peißker, L", "Peißker, L", "Peissker, L"); - - + } - + public void testAccents() throws Exception { - checkIt("Jeřábková, Tereza", "Jeřábková, Tereza", "Jerhaebkovae, Tereza", "Jerabkova, Tereza"); - checkIt("Dupré", "Dupré", "Dupree", "Dupre"); - checkIt("Duprè", "Duprè", "Dupre", "Duprè"); // Dupre\\xcc\\x80 - checkIt("\u0141", "Ł", "L"); -// System.out.println("\u0141"); -// System.out.println("\u0308E"); -// System.out.println("\u030aA"); -// System.out.println("\u0301E"); -// System.out.println("\u030cH"); -// //checkIt("\u0308E", "̈E"); -// checkIt("Mendigutıa", "Mendigutia"); -// checkIt("\u030aA", "\u030aA", "A"); -// checkIt("\u0301E", "E"); -// checkIt("\u030cH", "H"); - - } - + checkIt("Jeřábková, Tereza", "Jeřábková, Tereza", "Jerabkova, Tereza"); + checkIt("Dupré", "Dupré", "Dupre,"); + checkIt("Duprè", "Duprè", "Dupre,"); // Dupre\\xcc\\x80 + checkIt("\u0141", "Ł", "L,"); + checkIt("Mendigutıa", "Mendigutıa", "Mendigutia,"); + + } + private void checkIt(String input, String... expected) throws Exception { - Reader reader = new StringReader(input); - Tokenizer tokenizer = new KeywordTokenizer(); - tokenizer.setReader(reader); - AuthorTransliterationFactory factory = new AuthorTransliterationFactory(new HashMap()); - TokenStream stream = factory.create(new TestFilter(tokenizer)); - assertTokenStreamContents(stream, expected); + Reader reader = new StringReader(input); + Tokenizer tokenizer = new KeywordTokenizer(); + tokenizer.setReader(reader); + AuthorTransliterationFactory factory = new AuthorTransliterationFactory(new HashMap()); + TokenStream stream = factory.create(new TestFilter(tokenizer)); + assertTokenStreamContents(stream, expected); } } diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorVariations.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorVariations.java index cec0e34b7..7cb9e7a5a 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorVariations.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAuthorVariations.java @@ -2,6 +2,7 @@ import java.util.HashMap; import java.util.HashSet; +import java.util.Set; import org.apache.solr.analysis.author.AuthorQueryVariations; @@ -10,7 +11,7 @@ public class TestAuthorVariations extends TestCase { public void xtestgetNameVariations() { - HashSet name = AuthorQueryVariations.getQueryVariationsInclRegex("Hector, Gomez Q"); + Set name = AuthorQueryVariations.getQueryVariationsInclRegex("Hector, Gomez Q"); for (String n: name) { System.out.println(n); } @@ -37,81 +38,81 @@ public void xtestgetNameVariations() { public void testgenerateNameVariations1() { HashMap input = new HashMap(); - input.put("first", "HECTOR"); - input.put("last", "GOMEZ"); - input.put("middle", "Q"); + input.put("First", "HECTOR"); + input.put("Last", "GOMEZ"); + input.put("Middle", "Q"); HashSet expected = new HashSet(); expected.add("GOMEZ, HECTOR Q.*"); expected.add("GOMEZ, HECTOR"); expected.add("GOMEZ, H Q.*"); expected.add("GOMEZ, H"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateNameVariations(input); + Set actual = AuthorQueryVariations.generateNameVariations(input); assertEquals(expected, actual); } public void testgenerateNameVariations2() { HashMap input = new HashMap(); - input.put("first", "HECTOR"); - input.put("last", "GOMEZ"); + input.put("First", "HECTOR"); + input.put("Last", "GOMEZ"); HashSet expected = new HashSet(); expected.add("GOMEZ, HECTOR\\b.*"); expected.add("GOMEZ, H\\b.*"); expected.add("GOMEZ, H"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateNameVariations(input); + Set actual = AuthorQueryVariations.generateNameVariations(input); assertEquals(expected, actual); } public void testgenerateNameVariations3() { HashMap input = new HashMap(); - input.put("first", "H"); - input.put("last", "GOMEZ"); + input.put("First", "H"); + input.put("Last", "GOMEZ"); HashSet expected = new HashSet(); expected.add("GOMEZ, H.*"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateNameVariations(input); + Set actual = AuthorQueryVariations.generateNameVariations(input); assertEquals(expected, actual); } public void testgenerateNameVariations4() { HashMap input = new HashMap(); - input.put("first", "H"); - input.put("last", "GOMEZ"); - input.put("middle", "Q"); + input.put("First", "H"); + input.put("Last", "GOMEZ"); + input.put("Middle", "Q"); HashSet expected = new HashSet(); expected.add("GOMEZ, H\\w* Q.*"); expected.add("GOMEZ, H\\w*"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateNameVariations(input); + Set actual = AuthorQueryVariations.generateNameVariations(input); assertEquals(expected, actual); } public void testgenerateNameVariations5() { HashMap input = new HashMap(); - input.put("first", "H"); - input.put("last", "GOMEZ"); - input.put("middle", "QUINTERO"); + input.put("First", "H"); + input.put("Last", "GOMEZ"); + input.put("Middle", "QUINTERO"); HashSet expected = new HashSet(); expected.add("GOMEZ, H\\w* QUINTERO\\b.*"); expected.add("GOMEZ, H\\w* Q\\b.*"); expected.add("GOMEZ, H\\w*"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateNameVariations(input); + Set actual = AuthorQueryVariations.generateNameVariations(input); assertEquals(expected, actual); } public void testgenerateNameVariations6() { HashMap input = new HashMap(); - input.put("last", "GOMEZ"); + input.put("Last", "GOMEZ"); HashSet expected = new HashSet(); expected.add("GOMEZ,.*"); - HashSet actual = AuthorQueryVariations.generateNameVariations(input); + Set actual = AuthorQueryVariations.generateNameVariations(input); assertEquals(expected, actual); } public void testgenerateNameVariations7() { HashMap input = new HashMap() {{ - put("last", "MILLAR"); - put("first", "CAROL"); - put("middle", "EVELYN"); + put("Last", "MILLAR"); + put("First", "CAROL"); + put("Middle", "EVELYN"); }}; HashSet expected = new HashSet() {{ add("MILLAR, CAROL EVELYN\\b.*"); @@ -121,83 +122,83 @@ public void testgenerateNameVariations7() { add("MILLAR, C"); add("MILLAR,"); }}; - HashSet actual = AuthorQueryVariations.generateNameVariations(input); + Set actual = AuthorQueryVariations.generateNameVariations(input); assertEquals(expected, actual); } public void testGenerateSynonymVariations1() { HashMap input = new HashMap(); - input.put("first", "HECTOR"); - input.put("last", "GOMEZ"); - input.put("middle", "Q"); + input.put("First", "HECTOR"); + input.put("Last", "GOMEZ"); + input.put("Middle", "Q"); HashSet expected = new HashSet(); expected.add("GOMEZ, HECTOR"); expected.add("GOMEZ, H"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateSynonymVariations(input); + Set actual = AuthorQueryVariations.generateSynonymVariations(input); assertEquals(expected, actual); } public void testGenerateSynonymVariations2() { HashMap input = new HashMap(); - input.put("first", "HECTOR"); - input.put("last", "GOMEZ"); + input.put("First", "HECTOR"); + input.put("Last", "GOMEZ"); HashSet expected = new HashSet(); expected.add("GOMEZ, HECTOR\\b.*"); expected.add("GOMEZ, H"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateSynonymVariations(input); + Set actual = AuthorQueryVariations.generateSynonymVariations(input); assertEquals(expected, actual); } public void testGenerateSynonymVariations3() { HashMap input = new HashMap(); - input.put("first", "H"); - input.put("last", "GOMEZ"); + input.put("First", "H"); + input.put("Last", "GOMEZ"); HashSet expected = new HashSet(); expected.add("GOMEZ, H.*"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateSynonymVariations(input); + Set actual = AuthorQueryVariations.generateSynonymVariations(input); assertEquals(expected, actual); } public void testGenerateSynonymVariations4() { HashMap input = new HashMap(); - input.put("first", "H"); - input.put("last", "GOMEZ"); - input.put("middle", "Q"); + input.put("First", "H"); + input.put("Last", "GOMEZ"); + input.put("Middle", "Q"); HashSet expected = new HashSet(); expected.add("GOMEZ, H\\w* Q.*"); expected.add("GOMEZ, H\\w*"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateSynonymVariations(input); + Set actual = AuthorQueryVariations.generateSynonymVariations(input); assertEquals(expected, actual); } public void testGenerateSynonymVariations5() { HashMap input = new HashMap(); - input.put("first", "H"); - input.put("last", "GOMEZ"); - input.put("middle", "QUINTERO"); + input.put("First", "H"); + input.put("Last", "GOMEZ"); + input.put("Middle", "QUINTERO"); HashSet expected = new HashSet(); expected.add("GOMEZ, H\\w* QUINTERO\\b.*"); expected.add("GOMEZ, H\\w*"); expected.add("GOMEZ,"); - HashSet actual = AuthorQueryVariations.generateSynonymVariations(input); + Set actual = AuthorQueryVariations.generateSynonymVariations(input); assertEquals(expected, actual); } public void testGenerateSynonymVariations6() { HashMap input = new HashMap(); - input.put("last", "GOMEZ"); + input.put("Last", "GOMEZ"); HashSet expected = new HashSet(); expected.add("GOMEZ,.*"); - HashSet actual = AuthorQueryVariations.generateSynonymVariations(input); + Set actual = AuthorQueryVariations.generateSynonymVariations(input); assertEquals(expected, actual); } public void testgenerateSynonymVariations7() { HashMap input = new HashMap() {{ - put("last", "MILLAR"); - put("first", "CAROL"); - put("middle", "EVELYN"); + put("Last", "MILLAR"); + put("First", "CAROL"); + put("Middle", "EVELYN"); }}; HashSet expected = new HashSet() {{ add("MILLAR, CAROL EVELYN\\b.*"); @@ -206,14 +207,14 @@ public void testgenerateSynonymVariations7() { add("MILLAR, C"); add("MILLAR,"); }}; - HashSet actual = AuthorQueryVariations.generateSynonymVariations(input); + Set actual = AuthorQueryVariations.generateSynonymVariations(input); assertEquals(expected, actual); } // public void testParseAuthor() { // HashMap expected = new HashMap(); -// expected.put("last", "Miller"); -// expected.put("first", "Janice"); -// expected.put("middle", "G"); +// expected.put("Last", "Miller"); +// expected.put("First", "Janice"); +// expected.put("Middle", "G"); // HashMap actual = AuthorVariations.parseAuthor("Miller, Janice G"); // assertEquals(expected, actual); // } diff --git a/contrib/adsabs/src/test/org/apache/solr/search/TestAqpLuceneQParserPlugin.java b/contrib/adsabs/src/test/org/apache/solr/search/TestAqpLuceneQParserPlugin.java index 4828f02d6..97128fac1 100644 --- a/contrib/adsabs/src/test/org/apache/solr/search/TestAqpLuceneQParserPlugin.java +++ b/contrib/adsabs/src/test/org/apache/solr/search/TestAqpLuceneQParserPlugin.java @@ -80,7 +80,7 @@ public void test() throws IOException, Exception { SolrQueryRequest r = req(CommonParams.Q, "franklin NEAR hero", CommonParams.DF, "text"); QParser parser = a.createParser("franklin NEAR hero", r.getParams(), r.getParams(), r); Query query = parser.parse(); - assertEquals("spanNear([text:franklin, text:hero], 5, true)", query.toString()); + assertEquals("spanNear([text:franklin, text:hero], 5, false)", query.toString()); r.close(); diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml index d77125007..22ae978cd 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml @@ -61,8 +61,7 @@ class="solr.analysis.author.AuthorNormalizeFilterFactory" /> - +