diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java index aaf5bc14f..04f27a6f8 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpChangeRewriteMethodProcessor.java @@ -247,6 +247,7 @@ private void pickSynonyms(List termList, List newList, Set int minFreq = Integer.MAX_VALUE; int maxFreq = Integer.MIN_VALUE; Integer closestLen = null; + int oldSize = newList.size(); // first decide one scenarios 1. xor 2. for (QueryNode n: termList) { @@ -347,8 +348,18 @@ else if (strategy.equals("cantDecide") && closestLenTerm != null) { } - if (newList.size() == 0) { // we didn't find any type that would satisfy the condition - newList.add(termList.get(0)); + if (newList.size() == oldSize) { // we didn't find any type that would satisfy the condition + QueryNode picked = termList.get(0); + // pick the longest if you can + int x = 0; + for (QueryNode t: termList) { + int l = ((FieldQueryNode) t).getTextAsString().length(); + if (l > x) { + x = l; + picked = t; + } + } + newList.add(picked); } } diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationText.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationText.java index 9b2121870..e2918055c 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationText.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationText.java @@ -69,6 +69,12 @@ public void test() throws Exception { //System.err.println(h.query(req("q", "aff:foo1"))); assertQ(req("q", "*:*"), "//*[@numFound>='2']"); + + assertQueryEquals(req("q", "aff:\"Institut d’Astrophysique\"", "aqp.multiphrase.keep_one", "SYNONYM", "qt", "aqp"), + "(aff:\"institut d'astrophysique\" | aff:\"institut d astrophysique\")", + DisjunctionMaxQuery.class + ); + assertQ(req("q", "aff:xfoo"), "//*[@numFound='0']"); assertQueryEquals(req("q", "aff:\"Pasadena, CA 91125\"", "qt", "aqp"), @@ -130,7 +136,7 @@ public void test() throws Exception { ); - assert h.query(req("q", "aff:foo1")) + assert h.query(req("q", "aff:foo1", "fl", "aff")) .contains("" + "foo1" + "foo2" + diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java index 2c58d214e..e81f192e1 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java @@ -166,7 +166,7 @@ public void test() throws Exception { // check the affiliation is there stored as one string - assert h.query(req("q", "institution:\"Kavli Institute/Dept of Physics\"")) + assert h.query(req("q", "institution:\"Kavli Institute/Dept of Physics\"", "fl", "institution")) .contains("Kavli Institute/Dept of Physics" ); diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java index 49b9c5152..2a204c084 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java @@ -135,7 +135,7 @@ public static String getConfigFile() { newConfig = duplicateFile(new File(configFile)); - //replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory"); + replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory"); } catch (IOException e) { e.printStackTrace(); @@ -233,7 +233,7 @@ public void setUp() throws Exception { assertU(adoc("id", "11", "bibcode", "xxxxxxxxxxx11", "title", "All-sky data survey")); assertU(adoc("id", "12", "bibcode", "xxxxxxxxxxx12", "title", "NoSky data survey")); assertU(adoc("id", "13", "bibcode", "xxxxxxxxxxx13", "title", "AllSky data survey")); - assertU(adoc("id", "14", "bibcode", "xxxxxxxxxxx14", "title", "Modified Newtonian Dynamics (MOND): Observational Phenomenology and Relativistic Extensions")); + assertU(adoc("id", "14", "bibcode", "xxxxxxxxxxx14", "title", "Modified Newtonian Dynamics: Observational Phenomenology and Relativistic Extensions")); assertU(adoc("id", "15", "bibcode", "xxxxxxxxxxx15", "title", "MOND test")); assertU(adoc("id", "16", "bibcode", "xxxxxxxxxxx16", "title", "mond test")); assertU(adoc("id", "17", "bibcode", "xxxxxxxxxxx17", "title", "bubble pace telescope multi-pace foobar")); @@ -269,6 +269,11 @@ public void setUp() throws Exception { "keyword", "one Astrophysics Data System two")); assertU(adoc("id", "502", "bibcode", "xxxxxxxxxx502", "keyword", "one ads two")); + assertU(adoc("id", "600", "bibcode", "xxxxxxxxxx600", + "title", "THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS")); + assertU(adoc("id", "601", "bibcode", "xxxxxxxxxx601", + "title", "the hubble constant: a summary of the HST program for the luminosity calibration of type Ia supernovae by means of cepheids")); + assertU(commit()); } @@ -355,6 +360,9 @@ public void testMultiTokens() throws Exception { assertQueryEquals(req("q", "title:\"observations black hole\""), "(title:\"observations black hole\" | title:\"observations (syn::black hole syn::bh acr::bh)\"~2)", DisjunctionMaxQuery.class); + assertQueryEquals(req("q", "title:\"observations BH\""), + "title:\"observations (acr::bh syn::black hole syn::bh)\"~2", + MultiPhraseQuery.class); // btw our analyzer chain outputs all multi synonyms during indexing (but only canonical synonym for single ones) // that makes sense because we don't know how the user/author are writing them; but because we output all of them @@ -535,8 +543,7 @@ public void testMultiTokens() throws Exception { "Synonym(all:mond all:syn::lunar)", SynonymQuery.class); assertQ(req("q", "title" + ":mond"), - "//*[@numFound='5']", - "//doc/str[@name='id'][.='14']", + "//*[@numFound='4']", "//doc/str[@name='id'][.='15']", "//doc/str[@name='id'][.='16']", "//doc/str[@name='id'][.='17']", @@ -546,9 +553,8 @@ public void testMultiTokens() throws Exception { "Synonym(all:mond all:syn::lunar)", SynonymQuery.class); assertQ(req("q", "title" + ":Mond"), - "//*[@numFound='5']", + "//*[@numFound='4']", "//doc/str[@name='id'][.='17']", // orig 'space' -> syn:lunar; look at the synonym file to understand - "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']", "//doc/str[@name='id'][.='16']", "//doc/str[@name='id'][.='20']"); @@ -588,9 +594,15 @@ public void testMultiTokens() throws Exception { "(title:\"modified newtonian dynamics\" " + "| Synonym(title:acr::mond title:syn::modified newtonian dynamics title:syn::mond))", DisjunctionMaxQuery.class); + assertQueryEquals(req("q", "title:\"MOND\"", "defType", "aqp"), + "Synonym(title:acr::mond title:syn::modified newtonian dynamics title:syn::mond)", + SynonymQuery.class); assertQ(req("q", "title" + ":\"modified newtonian dynamics\""), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); + assertQ(req("q", "title" + ":\"MOND\""), "//*[@numFound='2']", + "//doc/str[@name='id'][.='14']", + "//doc/str[@name='id'][.='15']"); // multi-token. this is truly crazy (several synonyms overlap) @@ -786,9 +798,12 @@ public void testSynonyms() throws Exception { DisjunctionMaxQuery.class); assertQ(req("q", "title:\"hubble space telescope\""), - "//*[@numFound='2']", + "//*[@numFound='4']", "//doc/str[@name='id'][.='4']", - "//doc/str[@name='id'][.='5']"); + "//doc/str[@name='id'][.='5']", + "//doc/str[@name='id'][.='600']", + "//doc/str[@name='id'][.='601']" + ); // preceded by something @@ -845,7 +860,7 @@ public void testSynonyms() throws Exception { "Synonym(title:acr::hst title:syn::hst title:syn::hubble space telescope)", SynonymQuery.class); assertQ(req("q", "title:HST"), - "//*[@numFound='2']", + "//*[@numFound='4']", "//doc/str[@name='id'][.='4']", "//doc/str[@name='id'][.='5']"); @@ -999,6 +1014,91 @@ public void testSynonyms() throws Exception { public void testOtherCases() throws Exception { + + /** + * + * input: + * + * THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS + * + * this is how it gets indexed internally (before moving acronym filter after stop filters): + * + * [(0, ['acr::the']), + (1, ['acr::hubble', 'hubble']), + (2, ['constant']), + (3, ['acr::of', 'acr::summary', 'acr::the', 'summary']), + (4, ['acr::hst', 'hst', 'syn::hst', 'syn::hubble space telescope']), + (5, ['acr::for', 'acr::program', 'acr::the', 'program']), + (6, ['acr::luminosity', 'luminosity']), + (7, ['acr::calibration', 'acr::of', 'calibration']), + (8, ['acr::type', 'type']), + (9, ['ia']), + (10, ['acr::supernovae', 'supernovae']), + (11, ['acr::by', 'by']), + (12, ['acr::means', 'acr::of', 'means']), + (13, ['acr::cepheids', 'cepheids']), + (14, []), + (15, []), + (16, []), + (17, []), + (18, []), + (19, [])] + + with stop filters before acronyms + + [(0, []), + (1, ['acr::hubble']), + (2, ['constant']), + (3, ['acr::summary', 'summary']), + (4, []), + (5, ['acr::program', 'program']), + (6, ['acr::luminosity', 'luminosity']), + (7, ['acr::calibration', 'calibration']), + (8, ['acr::type', 'type']), + (9, ['ia']), + (10, ['acr::supernovae', 'supernovae']), + (11, ['acr::by', 'by']), + (12, ['acr::means', 'means']), + (13, ['acr::cepheids', 'cepheids']), + + and this how it got parsed before the change: + + title:"acr::the acr::hubble constant acr::summary acr::of acr::the (acr::hst syn::hubble space telescope syn::hst) acr::program acr::for acr::the acr::luminosity acr::calibration acr::of acr::type ia acr::supernovae acr::by acr::means acr::of acr::cepheids"~3 + + [(0, 'acr::the'), + (1, 'acr::hubble'), + (2, 'constant'), + (3, 'acr::summary'), + (4, 'acr::of'), + (5, 'acr::the'), + (6, '(acr::hst syn::hubble space telescope syn::hst)'), + (7, 'acr::program'), + (8, 'acr::for'), + (9, 'acr::the'), + (10, 'acr::luminosity'), + (11, 'acr::calibration'), + (12, 'acr::of'), + (13, 'acr::type'), + (14, 'ia'), + (15, 'acr::supernovae'), + (16, 'acr::by'), + (17, 'acr::means'), + (18, 'acr::of'), + (19, 'acr::cepheids')] + + + */ + assertQueryEquals(req("q", "title:\"THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS\""), + "title:\"acr::hubble constant acr::summary (acr::hst syn::hubble space telescope syn::hst) acr::program acr::luminosity acr::calibration acr::type ia acr::supernovae acr::by acr::means acr::cepheids\"~3", + MultiPhraseQuery.class); + assertQ(req("q", "title:\"THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS\""), + "//*[@numFound='1']", + "//doc/str[@name='id'][.='600']"); + assertQ(req("q", "title:\"the hubble constant: a summary of the HST program for the luminosity calibration of type Ia supernovae by means of cepheids\""), + "//*[@numFound='2']", + "//doc/str[@name='id'][.='600']", + "//doc/str[@name='id'][.='601']"); + // change to NGC tokenizer in the schema; we want to index both // variants, but during search time only query for the concat version @@ -1225,8 +1325,71 @@ public void testOtherCases() throws Exception { //TODO: this test is intentionally left failing; it used to work until the scoring changes (i'd like to // investigate more how the multi-token affects recall) + /** + * 1. A 350-MHz GBT Survey of 50 Faint Fermi γ-ray + * + * gets indexed as: + * + * [(0, []), + (1, []), + (2, ['350mhz', 'syn::mhz']), + (3, ['gbt', 'syn::gbt', 'syn::green bank telescope']), + (4, ['syn::survey']), + (5, ['50']), + (6, ['faint', 'syn::faint']), + (7, ['fermi', 'syn::fermi']), + (8, + ['syn::gamma', + 'syn::gamma ray', + 'syn::gamma rays', + 'syn::gammaray', + 'syn::gammarays']), + (9, + ['gammaray', + 'ray', + 'syn::gamma ray', + 'syn::gamma rays', + 'syn::gammaray', + 'syn::gammarays']), + (10, ['syn::source']), + (11, ['syn::radio']), + (12, ['millisecond', 'syn::millisecond'])] + + while + + 2. A 350-MHz GBT Survey of 50 Faint Fermi γ ray + + gets indexed as: + + [(0, []), + (1, []), + (2, ['350mhz', 'syn::mhz']), + (3, ['syn::gbt', 'syn::green bank telescope']), + (4, ['syn::survey']), + (5, ['50']), + (6, ['faint', 'syn::faint']), + (7, ['fermi', 'syn::fermi']), + (8, + ['syn::gamma', + 'syn::gamma ray', + 'syn::gamma rays', + 'syn::gammaray', + 'syn::gammarays']), + (9, ['ray']), + (10, ['syn::source']), + (11, ['syn::radio']), + (12, ['millisecond', 'syn::millisecond'])] + */ dumpDoc(null, "title", "bibcode"); - assertQ(req("q", "title:\"γ-ray Sources\"", + + assertQueryEquals(req("q", "title:\"γ ray Sources\""), + "(title:\"(gamma syn::gamma) ray (sources syn::source)\"~2 | title:\"(syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? (sources syn::source)\"~2)", + DisjunctionMaxQuery.class); + assertQueryEquals(req("q", "title:\"γ-ray Sources\""), + "title:\"(gamma syn::gamma syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (ray gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source)\"~2", + MultiPhraseQuery.class); + + assertQ(req("q", "title:\"γ ray Sources\"", "indent", "true", "debugQuery", "true"), "//*[@numFound='4']", @@ -1235,7 +1398,7 @@ public void testOtherCases() throws Exception { "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']" ); - assertQ(req("q", "title:\"γ ray Sources\"", + assertQ(req("q", "title:\"$\\gamma$ ray Sources\"", "indent", "true", "debugQuery", "true"), "//*[@numFound='4']", @@ -1244,7 +1407,7 @@ public void testOtherCases() throws Exception { "//doc/str[@name='id'][.='402']", "//doc/str[@name='id'][.='403']" ); - assertQ(req("q", "title:\"$\\gamma$ ray Sources\"", + assertQ(req("q", "title:\"γ-ray Sources\"", "indent", "true", "debugQuery", "true"), "//*[@numFound='4']", diff --git a/contrib/adsabs/src/test/org/apache/solr/search/TestSolrCitationQuery.java b/contrib/adsabs/src/test/org/apache/solr/search/TestSolrCitationQuery.java index 980767982..a21f6746f 100644 --- a/contrib/adsabs/src/test/org/apache/solr/search/TestSolrCitationQuery.java +++ b/contrib/adsabs/src/test/org/apache/solr/search/TestSolrCitationQuery.java @@ -69,7 +69,7 @@ public void testSearch() throws Exception { "//*[@numFound='6']" ); - assertQ(req("q", "bibcode:A"), + assertQ(req("q", "bibcode:A", "fl", "bibcode"), "//*[@numFound='1']", "//result/doc[1]/str[@name='bibcode']='A'" ); @@ -78,25 +78,25 @@ public void testSearch() throws Exception { "//*[@numFound='0']" ); - assertQ(req("q", "citations(bibcode:b)"), + assertQ(req("q", "citations(bibcode:b)", "fl", "bibcode"), "//*[@numFound='2']", "//result/doc/str[@name='bibcode']='A'", "//result/doc/str[@name='bibcode']='D'" ); - assertQ(req("q", "joincitations(bibcode:B)"), + assertQ(req("q", "joincitations(bibcode:B)", "fl", "bibcode"), "//*[@numFound='2']", "//result/doc/str[@name='bibcode']='A'", "//result/doc/str[@name='bibcode']='D'" ); - assertQ(req("q", "references(bibcode:A)"), + assertQ(req("q", "references(bibcode:A)", "fl", "bibcode"), "//*[@numFound='3']", "//result/doc/str[@name='bibcode']='B'", "//result/doc/str[@name='bibcode']='C'", "//result/doc/str[@name='bibcode']='D'" ); - assertQ(req("q", "joinreferences(bibcode:A)"), + assertQ(req("q", "joinreferences(bibcode:A)", "fl", "bibcode"), "//*[@numFound='3']", "//result/doc/str[@name='bibcode']='B'", "//result/doc/str[@name='bibcode']='C'", diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml index 98db659dc..1f9a3d857 100644 --- a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml @@ -461,10 +461,6 @@ builderFactory="org.apache.lucene.analysis.synonym.NewSynonymFilterFactory$BestEffortIgnoreCaseSelectively" inclOrig="true" /> - - - - + + + - - - - - + + + +