From 413fc487ccafb8597be63438ca00d884c96d498d Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Wed, 27 Jan 2021 14:11:09 -0500 Subject: [PATCH] Revert "One implementation of the synonym->transliteration fix (jones>forman situation is failing)" This reverts commit f509f8362e398a5000770f9f2a8c0e1d6b99c98d. --- .../AuthorShortNameUpgradeFilterFactory.java | 141 ------------------ .../search/TestSecondOrderQueryTypesAds.java | 7 +- .../author/TestAdsabsTypeAuthorParsing.java | 71 ++------- .../server/solr/collection1/conf/schema.xml | 19 +-- 4 files changed, 24 insertions(+), 214 deletions(-) diff --git a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorShortNameUpgradeFilterFactory.java b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorShortNameUpgradeFilterFactory.java index a91750e32..71d7b7bf9 100644 --- a/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorShortNameUpgradeFilterFactory.java +++ b/contrib/adsabs/src/java/org/apache/solr/analysis/author/AuthorShortNameUpgradeFilterFactory.java @@ -448,146 +448,5 @@ public TokenStream create(TokenStream input) { // NewSynonymFilteFactory return input; } - - public static class SimpleShortNames extends NewSynonymFilterFactory.SynonymBuilderFactory { - - public SimpleShortNames(Map args) { - super(args); - } - - protected SynonymParser getParser(Analyzer analyzer) { - - char sep = ','; - if (args.containsKey("format") && args.get("format").equals("semicolon")) { - sep = ';'; - }; - - final Character charSeparator = sep; - - return new NewSolrSynonymParser(true, true, analyzer) { - - public void add(Reader in) throws IOException, ParseException { - LineNumberReader br = new LineNumberReader(in); - StringBuffer newBr = new StringBuffer(); - String line = null; - - String[] parts; - - try { - while ((line = br.readLine()) != null) { - // modify the original on-the-fly - if (line.length() == 0 || line.charAt(0) == '#') { - continue; // ignore empty lines and comments - } - String[] sides = line.split("=>"); - if (sides.length > 1) { // explicit mapping - String[] names = getNames(sides[1]); - newBr.append(escape(names[0])); - newBr.append("=>"); - boolean first = false; - for (String n: names) { - if (first) - newBr.append(','); - newBr.append(escape(n)); - first = true; - } - } - else { - String[] names = getNames(sides[0]); - newBr.append(buildLine(names)); - } - newBr.append("\n"); - } - } catch (IllegalArgumentException e) { - ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); - ex.initCause(e); - throw ex; - } finally { - br.close(); - } - - // pass the modified synonym to the builder to create a synonym map - super.add(new InputStreamReader(new ByteArrayInputStream(newBr.toString().getBytes()), - Charset.forName("UTF-8").newDecoder())); - - } - @Override - public void add(CharsRef input, CharsRef output, boolean includeOrig) { - super.add(input, output, true); - } - - private String[] getNames(String vals) { - List nn = StrUtils.splitSmart(vals, charSeparator); - String names[] = new String[nn.size()]; - int j = 0; - for (String n: nn) { - names[j] = unescape(n); - j++; - } - return names; - } - private String buildLine(String[] names) { - HashSet set = new HashSet(); - StringBuilder out = new StringBuilder(); - boolean notFirst = false; - - for (String name: names) { - - String[] p = AuthorUtils.splitName(name); - if (isLongForm(p)) { - set.add(makeShortForm(p)); - } - set.add(name); - } - for (String name: set) { - if (notFirst) out.append(','); - out.append(escape(name)); - notFirst = true; - } - return out.toString(); - } - - - private String unescape(String s) { - return s.replace("\\ ", " ").replace("\\" + charSeparator, charSeparator.toString()); - } - - - private String escape(String s) { - return s.replace(" ", "\\ ").replace(",", "\\,"); - } - - - private String makeShortForm(String[] parts) { - StringBuilder out = new StringBuilder(); - out.append(parts[0]); - for (int i=1;i 1) - return true; - } - return res; - } - private int containsLongForm(String[] names) { - int i = 0; - for (String name: names) { - if (isLongForm(AuthorUtils.splitName(name))) { - i++; - } - } - return i; - } - }; - } - } - } \ No newline at end of file diff --git a/contrib/adsabs/src/test/org/apache/lucene/search/TestSecondOrderQueryTypesAds.java b/contrib/adsabs/src/test/org/apache/lucene/search/TestSecondOrderQueryTypesAds.java index 824e997d8..213791b90 100644 --- a/contrib/adsabs/src/test/org/apache/lucene/search/TestSecondOrderQueryTypesAds.java +++ b/contrib/adsabs/src/test/org/apache/lucene/search/TestSecondOrderQueryTypesAds.java @@ -140,9 +140,10 @@ public void testADSOperators() throws Exception { LuceneCacheWrapper boostTwo = LuceneCacheWrapper.getFloatCache( "boost_2", UninvertingReader.Type.FLOAT_POINT, tempReq.getSearcher().getSlowAtomicReader()); - assertEquals("wrong data", 1.0f, boostConstant.getFloat(0), 0.0f); - assertEquals("wrong data", 0.1f, boostOne.getFloat(0), 0.0f); - assertEquals("wrong data", 0.5f, boostTwo.getFloat(0), 0.0f); + + assertEquals("Unexpected value from cache", 1.0f, boostConstant.getFloat(0), 0.0f); + assertEquals("Unexpected value from cache", 0.1f, boostOne.getFloat(0), 0.0f); + assertEquals("Unexpected value from cache", 0.5f, boostTwo.getFloat(0), 0.0f); // expecting 4 results with various order, simply based on the boost factor testQ2("id:1", new SecondOrderCollectorOperatorExpertsCiting(referencesWrapper, boostConstant), diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java index b12e7e6fa..823648ac4 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/author/TestAdsabsTypeAuthorParsing.java @@ -127,15 +127,12 @@ public static String getSchemaFile() { "ADAMŠuk, m; ADAMGuk, m;ADAMČuk, m", // hand-made additions "MÜLLER, A WILLIAM;MÜLLER, A BILL", "MÜLLER, WILLIAM;MÜLLER, BILL", - //"JONES, CHRISTINE;FORMAN, CHRISTINE", // the famous post-synonym expansion - "JONES, C=>Jones,Christine;FORMAN, CHRISTINE", // the famous post-synonym expansion - "FORMAN, C=>FORMAN, CHRISTINE;JONES, C", // the famous post-synonym expansion + "JONES, CHRISTINE;FORMAN, CHRISTINE", // the famous post-synonym expansion "DE ZEEUW, TIM=>DE ZEEUW, P TIM", "DE ZEEUW, P TIM=>DE ZEEUW, TIM;DE ZEEUW,", "grant, carolyn s; stern grant, carolyn; stern, carolyn p", "orlitova, ivana; stoklasova, ivana", - "orlitova,; stoklasova,", - "wedemeyer boehm, s; wedemeyer, s" + "orlitova,; stoklasova," }); // automatically harvested variations of author names (collected during indexing) @@ -168,13 +165,7 @@ public static String getSchemaFile() { "Gonzalez Alfonso, E=>González Alfonso, E", "Chyelkovae,=>Chýlková,", "stoklasova,=>stoklasová,", - "orlitova,=>orlitová,", - "wedemeyer boehm, s=>wedemeyer böhm, s", - "wedemeyer boehm, sven=>wedemeyer böhm, sven", - "wedemeyer boehm,=>wedemeyer böhm,", - "wedemeyer bohm, s=>wedemeyer böhm, s", - "wedemeyer bohm, sven=>wedemeyer böhm, sven", - "wedemeyer bohm,=>wedemeyer böhm," + "orlitova,=>orlitová," } )); @@ -387,32 +378,6 @@ public void xtestX() throws Exception { public void testAuthorParsingUseCases() throws Exception { - assertQueryEquals(req("q", "author:\"Wedemeyer, Sven\""), - "wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s * | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s * | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s * | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s * | wedemeyer, sven | wedemeyer, sven *", - DisjunctionMaxQuery.class); - assertQueryEquals(req("q", "author:\"wedemeyer, sven\""), - "wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s * | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s * | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s * | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s * | wedemeyer, sven | wedemeyer, sven *", - DisjunctionMaxQuery.class); - assertQueryEquals(req("q", "author:\"wedemeyer böhm, sven\""), - "wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s * | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s * | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s * | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s * | wedemeyer, sven | wedemeyer, sven *", - DisjunctionMaxQuery.class); - assertQueryEquals(req("q", "author:\"wedemeyer böhm, s\""), - "wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s* | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s* | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s* | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s* | wedemeyer, sven | wedemeyer, sven *", - DisjunctionMaxQuery.class); - assertQueryEquals(req("q", "author:\"wedemeyer, s\""), - "wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s* | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s* | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s* | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s* | wedemeyer, sven | wedemeyer, sven *", - DisjunctionMaxQuery.class); - assertQueryEquals(req("q", "author:\"wedemeyer böhm, s\""), - "wedemeyer boehm, | wedemeyer boehm, s | wedemeyer boehm, s* | wedemeyer boehm, sven | wedemeyer boehm, sven * | wedemeyer bohm, | wedemeyer bohm, s | wedemeyer bohm, s* | wedemeyer bohm, sven | wedemeyer bohm, sven * | wedemeyer böhm, | wedemeyer böhm, s | wedemeyer böhm, s* | wedemeyer böhm, sven | wedemeyer böhm, sven * | wedemeyer, | wedemeyer, s | wedemeyer, s* | wedemeyer, sven | wedemeyer, sven *", - DisjunctionMaxQuery.class); - - // wedemeyer boehm, sven; wedemeyer, sven - // (author:wedemeyer boehm, sven | author:wedemeyer boehm, sven * | author:wedemeyer boehm, s | author:wedemeyer boehm, s * | author:wedemeyer boehm, | author:wedemeyer, sven | author:wedemeyer, sven * | author:wedemeyer, s | author:wedemeyer, s * | author:wedemeyer,) - // wedemeyer boehm, s; wedemeyer, s - // (author:wedemeyer boehm, s | author:wedemeyer boehm, s * | author:wedemeyer boehm, | author:wedemeyer boehm, sven | author:wedemeyer boehm, sven * | author:wedemeyer, s | author:wedemeyer, s * | author:wedemeyer, | author:wedemeyer, sven | author:wedemeyer, sven *) - // wedemeyer boehm, s; wedemeyer boehm, sven; wedemeyer, s; wedemeyer, sven - // (author:wedemeyer, sven | author:wedemeyer, sven * | author:wedemeyer, s | author:wedemeyer, s * | author:wedemeyer, | author:wedemeyer boehm, s | author:wedemeyer boehm, s * | author:wedemeyer boehm,) - assertQueryEquals(req("q", "author:\"van dok*, h\""), "author:van dok*, h", WildcardQuery.class); assertQ(req("q", "author:\"van dok*, h\""), "//*[@numFound='1']", @@ -472,7 +437,7 @@ public void testAuthorParsingUseCases() throws Exception { // expected: // | author:orlitova, | author:stoklasová,* | author:orlitova, ivana | author:orlitova, ivana * | author:stoklasova, i | author:stoklasova, i * | author:stoklasova, ivana | author:stoklasova, ivana * | author:orlitova, i | author:orlitova, i * | author:orlitova,* | author:stoklasova, | author:stoklasova,* | author:orlitová, | author:orlitová,* | author:orlitovae, | author:orlitovae,* | author:stoklasová, | author:stoklasovae, | author:stoklasovae,* // TODO: optimize the query, remove the clauses that match the doc twice - setDebug(true); + testAuthorQuery("\"stoklasova\"", "author:orlitova, | author:stoklasová, | author:orlitova, ivana | author:stoklasova, i | author:stoklasova, ivana | author:orlitova, i | author:orlitova,* | author:stoklasova, | author:stoklasova,* | author:orlitová, | author:orlitová,* | author:orlitovae, | author:orlitovae,* | author:stoklasová,* | author:stoklasovae, | author:stoklasovae,*", "//*[@numFound='0']"); @@ -2449,64 +2414,56 @@ public void testAuthorParsingMainLogic() throws Exception { * */ - setDebug(true); testAuthorQuery( //must NOT have "jones*", must have "jones, c;jones, christine" "forman", "author:forman, | author:forman, c | author:jones, christine | author:jones, c " + "author:forman, christine | author:forman,*", - "//*[@numFound='7']" + "//*[@numFound='7']", // forman numFound=7 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 115 Jones, C 116 Forman, Christopher - // 117 Forman, C - ); - testAuthorQuery( + // 117 Forman, C //must NOT have "forman*", must have "forman, c;forman, christine" // PLUS - must have other jones's and allen's "jones", "author:jones, | author:jones, l | author:allen, l | author:allen, r l " + "author:allen, lynne | author:jones, r l | author:jones, r lynne | author:jones, lynne " + "author:allen, r lynne | author:forman, c | author:jones, christine | author:jones, c " + "author:forman, christine | author:jones,*", - "//*[@numFound='15']" + "//*[@numFound='15']", // jones numFound=15 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 114 Jones, Christopher 115 Jones, C // 117 Forman, C 120 Allen, Lynne 121 Allen, L // 122 Allen, R Lynne 123 Allen, R L 124 Jones, Lynne - // 125 Jones, L 126 Jones, R Lynne 127 Jones, R L - ); - testAuthorQuery( + // 125 Jones, L 126 Jones, R Lynne 127 Jones, R L //must NOT have "jones, c*", must have "jones, christine" "\"forman, c\"", "author:forman, c | author:forman, christine | author:forman, c* | author:forman," + "author:jones, christine | author:jones, c", - "//*[@numFound='7']" + "//*[@numFound='7']", // "forman, c" numFound=7 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 115 Jones, C 116 Forman, Christopher // 117 Forman, C - ); - testAuthorQuery( + //must NOT have "forman, c*", must have "forman, christine" "\"jones, c\"", "author:jones, c | author:jones, christine | author:jones, c* | author:jones," + "author:forman, christine | author:forman, c", - "//*[@numFound='7']" + "//*[@numFound='7']", // "jones, c" numFound=7 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 114 Jones, Christopher 115 Jones, C // 117 Forman, C - ); - testAuthorQuery( + "\"jones, christine\"", "author:jones, christine | author:jones, christine * | author:jones, c " + "author:jones, c * | author:jones, | author:forman, christine " + "author:forman, christine * | author:forman, c | author:forman, c * " + "author:forman,", - "//*[@numFound='6']" + "//*[@numFound='6']", // "jones, christine" numFound=6 // 110 Jones, Christine 111 Jones, C 112 Forman, Christine // 113 Forman, C 115 Jones, C 117 Forman, C - ); - testAuthorQuery( + "\"forman, christine\"", "author:jones, christine | author:jones, christine * | author:jones, c " + "author:jones, c * | author:jones, | author:forman, christine | author:forman, christine * " + "author:forman, c | author:forman, c * | author:forman,", diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml index 9b1582752..471d7d847 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml @@ -86,20 +86,15 @@ - - + class="org.apache.lucene.analysis.synonym.NewSynonymFilterFactory" + synonyms="author_curated.synonyms" format="semicolon" + ignoreCase="true" expand="true" + tokenizerFactory="solr.KeywordTokenizerFactory" /> - - - - + @@ -411,7 +404,7 @@ format="semicolon" synonyms="author_curated.synonyms" ignoreCase="true" expand="true" tokenizerFactory="solr.KeywordTokenizerFactory" - builderFactory="org.apache.solr.analysis.author.AuthorShortNameUpgradeFilterFactory$SimpleShortNames" + builderFactory="org.apache.solr.analysis.author.AuthorShortNameUpgradeFilterFactory$MakeAllShortNames" inclOrig="true" />