Skip to content

Commit

Permalink
Hunting bug of multi-synonym position indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed Jan 16, 2020
1 parent f7c2f6c commit ffc8081
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .classpath
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
<classpathentry kind="lib" path="build/solrjars-extracted/solr/solrj-lib/woodstox-core-asl-4.4.1.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/solr/test-framework/lib/junit4-ant-2.4.0.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/solr/test-framework/lib/randomizedtesting-runner-2.4.0.jar"/>
<classpathentry kind="lib" path="lib/luke-with-deps.jar"/>
<classpathentry kind="lib" path="lib/luke-with-deps.jar" sourcepath="/home/rchyla/Downloads/luke-luke-6.6.0/src/main/java"/>
<classpathentry kind="lib" path="contrib/adsabs/extra-lib/junidecode-0.5.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/solr/test-framework/lucene-libs/lucene-test-framework-6.3.0-SNAPSHOT.jar"/>
<classpathentry kind="output" path="bin"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,13 @@ public void setDebug(boolean v) {

/*
* This is only for printing/debugging, DO NOT use this for testing!!!
* This method can go away
*
* It will only work if the field(s) are indexed with stored positions
* i.e.
*
* <field name="title" ..... termVectors="true" termPositions="true"/>
*
* Also, the codec used must NOT be SimpleTextCodec
*/
public void dumpDoc(Integer docId, String...fields) throws Exception {
//DirectoryReader reader = h.getCore().getSearcher().get().getIndexReader();
Expand All @@ -162,7 +168,7 @@ public void dumpDoc(Integer docId, String...fields) throws Exception {
docs = new int[]{docId};
}

DocReconstructor reconstructor = new DocReconstructor(reader, fields, 10);
DocReconstructor reconstructor = new DocReconstructor(reader, fields, -1);
Reconstructed d;

for (Integer dd: docs) {
Expand Down
39 changes: 19 additions & 20 deletions contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ public void test() throws Exception {
", \"aff_id\": [\"61814\", \"A1036\", \"-\"]" +
", \"aff_abbrev\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\"]" +
", \"aff_canonical\": [\"Harvard Smithsonian Center for Astrophysics\", \"Harvard University, Department of Astronomy\", \"-\"]" +
", \"aff_raw\": [\"-\", \"NASA Kavli space center, Cambridge, MA 02138, USA\", \"Einstein institute, Zurych, Switzerland\"]" +
", \"aff\": [\"-\", \"NASA Kavli space center, Cambridge, MA 02138, USA\", \"Einstein institute, Zurych, Switzerland\"]" +
", \"institution\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\", \"foo/bar baz\"]" +
", \"aff_facet\": [[\"A1234\", \"facet abbrev/parent abbrev\"]]" +
", \"aff_facet_hier\": [\"1/1812/61814\", \"1/8264/61814\", \"1/1812/A1036\", \"-\"]" +
Expand Down Expand Up @@ -496,29 +496,29 @@ public void test() throws Exception {


/*
* aff - is a virtual field;
* affil - is a virtual field;
* all aff_ fields must be the same order as authors
*/

assertQ(req("q", "aff:NASA"),
assertQ(req("q", "affil:NASA"),
"//doc/int[@name='recid'][.='100']",
"//*[@numFound='1']"
);
assertQ(req("q", "aff:NASA AND author:\"Anders\""),
assertQ(req("q", "affil:NASA AND author:\"Anders\""),
"//doc/int[@name='recid'][.='100']",
"//*[@numFound='1']"
);
assertQ(req("q", "aff:SPACE"), "//*[@numFound='0']"); // be case sensitive with uppercased query terms
assertQ(req("q", "aff:KAVLI"), "//*[@numFound='0']"); // same here
assertQ(req("q", "aff:kavli"), // otherwise case-insensitive
assertQ(req("q", "affil:SPACE"), "//*[@numFound='0']"); // be case sensitive with uppercased query terms
assertQ(req("q", "affil:KAVLI"), "//*[@numFound='0']"); // same here
assertQ(req("q", "affil:kavli"), // otherwise case-insensitive
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='100']"
);
assertQ(req("q", "aff:Kavli"),
assertQ(req("q", "affil:Kavli"),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='100']"
);
assertQ(req("q", "aff:\"kavli space\""),
assertQ(req("q", "affil:\"kavli space\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='100']"
);
Expand Down Expand Up @@ -548,15 +548,15 @@ public void test() throws Exception {
);


assertQ(req("q", "aff:\"Kavli\""),
assertQ(req("q", "affil:\"Kavli\""),
"//*[@numFound='1']");
assertQ(req("q", "aff_canonical:\"Harvard\""),
"//*[@numFound='1']");
// assertQ(req("q", "aff_facet:\"A1234\""),
// "//*[@numFound='1']");
assertQ(req("q", "aff_id:\"61814\""),
"//*[@numFound='1']");
assertQ(req("q", "aff_raw:\"02138\""),
assertQ(req("q", "aff:\"02138\""),
"//*[@numFound='1']");
assertQ(req("q", "aff_canonical:\"Smithsonian\""),
"//*[@numFound='1']");
Expand All @@ -565,15 +565,15 @@ public void test() throws Exception {


assert h.query(req("q", "recid:100"))
.contains("<arr name=\"aff_raw\">" +
.contains("<arr name=\"aff\">" +
"<str>-</str>" +
"<str>NASA Kavli space center, Cambridge, MA 02138, USA</str>" +
"<str>Einstein institute, Zurych, Switzerland</str></arr>"
);
assertQ(req("q", "=aff:\"acr::nasa\" AND recid:100"),
assertQ(req("q", "=affil:\"acr::nasa\" AND recid:100"),
"//*[@numFound='1']"
);
assertQ(req("q", "pos(aff_raw:kavli, 2) AND recid:100"),
assertQ(req("q", "pos(aff:kavli, 2) AND recid:100"),
"//*[@numFound='1']"
);

Expand Down Expand Up @@ -1622,12 +1622,11 @@ public void test() throws Exception {
* these are the cases that depend on the default parameters specified in the
* solrconfig.xml; here we just test what came up as bugs
*/
assertQueryEquals(req("q", "aff:\"ASTRO 3D\""),
"(aff_abbrev:\"acr::astro (3d 3d)\" | aff_abbrev:\"acr::astro 3 d\") "
+ "(institution:astro 3d)^2.0 "
+ "aff_id:astro 3d "
+ "(aff_canonical:\"acr::astro (3d 3d)\" | aff_canonical:\"acr::astro 3 d\") "
+ "((aff_raw:\"acr::astro (3d 3d)\" | aff_raw:\"acr::astro 3 d\"))^0.5",
assertQueryEquals(req("q", "affil:\"ASTRO 3D\""),
"((aff:\"acr::astro (3d 3d)\" | aff:\"acr::astro 3 d\"))^0.5 "
+ "(aff_abbrev:\"acr::astro (3d 3d)\" | aff_abbrev:\"acr::astro 3 d\") "
+ "(institution:astro 3d)^2.0 aff_id:astro 3d "
+ "(aff_canonical:\"acr::astro (3d 3d)\" | aff_canonical:\"acr::astro 3 d\")",
BooleanQuery.class);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,83 +55,83 @@ public static void beforeClass() throws Exception {

public void test() throws Exception {

assertU(addDocs("aff_raw", "W.K. Kellogg Radiation Laboratory, California Institute of Technology, Pasadena, CA 91125, USA"));
assertU(addDocs("aff_raw", "W.K. Kellogg <xfoo> Radiation Laboratory, California Institute of Technology, Pasadena, CA(91125), USA"));
assertU(addDocs("aff_raw", "IMCCE/Observatoire de Paris"));
assertU(addDocs("aff_raw", "INAF - Osservatorio Astronomico di Brera, via E. Bianchi 46, I-23807 Merate, Italy;",
"aff_raw", "INAF - IASF Milano, via E. Bassini 15, I-20133 Milano, Italy"));
assertU(addDocs("aff_raw", "Instituto de Astrofísica de Andalucía (IAA-CSIC) foo:doo"));
assertU(addDocs("aff_raw", "foo1", "aff_raw", "foo2", "aff_raw", "-", "aff_raw", "foo4"));
assertU(addDocs("aff", "W.K. Kellogg Radiation Laboratory, California Institute of Technology, Pasadena, CA 91125, USA"));
assertU(addDocs("aff", "W.K. Kellogg <xfoo> Radiation Laboratory, California Institute of Technology, Pasadena, CA(91125), USA"));
assertU(addDocs("aff", "IMCCE/Observatoire de Paris"));
assertU(addDocs("aff", "INAF - Osservatorio Astronomico di Brera, via E. Bianchi 46, I-23807 Merate, Italy;",
"aff", "INAF - IASF Milano, via E. Bassini 15, I-20133 Milano, Italy"));
assertU(addDocs("aff", "Instituto de Astrofísica de Andalucía (IAA-CSIC) foo:doo"));
assertU(addDocs("aff", "foo1", "aff", "foo2", "aff", "-", "aff", "foo4"));

assertU(commit());

//dumpDoc(null, "aff_raw");
//System.err.println(h.query(req("q", "aff_raw:foo1")));
//dumpDoc(null, "aff");
//System.err.println(h.query(req("q", "aff:foo1")));

assertQ(req("q", "*:*"), "//*[@numFound>='2']");
assertQ(req("q", "aff_raw:xfoo"), "//*[@numFound='0']");
assertQ(req("q", "aff:xfoo"), "//*[@numFound='0']");

assertQueryEquals(req("q", "aff_raw:\"Pasadena, CA 91125\"", "qt", "aqp"),
"aff_raw:\"pasadena acr::ca 91125\"",
assertQueryEquals(req("q", "aff:\"Pasadena, CA 91125\"", "qt", "aqp"),
"aff:\"pasadena acr::ca 91125\"",
PhraseQuery.class
);
assertQueryEquals(req("q", "aff_raw:\"Pasadena, CA(91125)\"", "qt", "aqp"),
"aff_raw:\"pasadena acr::ca 91125\"",
assertQueryEquals(req("q", "aff:\"Pasadena, CA(91125)\"", "qt", "aqp"),
"aff:\"pasadena acr::ca 91125\"",
PhraseQuery.class
);

assertQ(req("q", "aff_raw:\"Pasadena, CA 91125\""),
assertQ(req("q", "aff:\"Pasadena, CA 91125\""),
"//*[@numFound='2']",
"//doc/str[@name='id'][.='0']",
"//doc/str[@name='id'][.='1']"
);

assertQ(req("q", "aff_raw:IMCCE"),
assertQ(req("q", "aff:IMCCE"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='2']"
);

assertQ(req("q", "aff_raw:imcce"),
assertQ(req("q", "aff:imcce"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='2']"
);

assertQ(req("q", "aff_raw:IASF"),
assertQ(req("q", "aff:IASF"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='3']"
);

assertQ(req("q", "aff_raw:iasf"),
assertQ(req("q", "aff:iasf"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='3']"
);

assertQ(req("q", "aff_raw:IAA-CSIC"),
assertQ(req("q", "aff:IAA-CSIC"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='4']"
);

assertQ(req("q", "aff_raw:IAACSIC"),
assertQ(req("q", "aff:IAACSIC"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='4']"
);
assertQ(req("q", "aff_raw:iaa-csic"),
assertQ(req("q", "aff:iaa-csic"),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='4']"
);

assertQ(req("q", "aff_raw:\"INAF - IASF\""),
assertQ(req("q", "aff:\"INAF - IASF\""),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='3']"
);
assertQ(req("q", "aff_raw:\"inaf - iasf\""),
assertQ(req("q", "aff:\"inaf - iasf\""),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='3']"
);


assert h.query(req("q", "aff_raw:foo1"))
.contains("<arr name=\"aff_raw\">" +
assert h.query(req("q", "aff:foo1"))
.contains("<arr name=\"aff\">" +
"<str>foo1</str>" +
"<str>foo2</str>" +
"<str>-</str>" +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@
* TODO: the analyzer for the synonyms must use the same StopFilters as the query chain
*
*/

public class TestAdsabsTypeFulltextParsing extends MontySolrQueryTestCase {


Expand Down Expand Up @@ -134,7 +135,7 @@ public static String getConfigFile() {

newConfig = duplicateFile(new File(configFile));

replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory");
//replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory");

} catch (IOException e) {
e.printStackTrace();
Expand Down Expand Up @@ -1224,6 +1225,35 @@ public void testOtherCases() throws Exception {

//TODO: this test is intentionally left failing; it used to work until the scoring changes (I'd like to
// investigate further how the multi-token case affects recall)
dumpDoc(null, "title", "bibcode");
assertQ(req("q", "title:\"γ-ray Sources\"",
"indent", "true",
"debugQuery", "true"),
"//*[@numFound='4']",
"//doc/str[@name='id'][.='400']",
"//doc/str[@name='id'][.='401']",
"//doc/str[@name='id'][.='402']",
"//doc/str[@name='id'][.='403']"
);
assertQ(req("q", "title:\"γ ray Sources\"",
"indent", "true",
"debugQuery", "true"),
"//*[@numFound='4']",
"//doc/str[@name='id'][.='400']",
"//doc/str[@name='id'][.='401']",
"//doc/str[@name='id'][.='402']",
"//doc/str[@name='id'][.='403']"
);
assertQ(req("q", "title:\"$\\gamma$ ray Sources\"",
"indent", "true",
"debugQuery", "true"),
"//*[@numFound='4']",
"//doc/str[@name='id'][.='400']",
"//doc/str[@name='id'][.='401']",
"//doc/str[@name='id'][.='402']",
"//doc/str[@name='id'][.='403']"
);

assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ-ray Sources for Radio Millisecond Pulsars\"",
"indent", "true",
"debugQuery", "true"),
Expand All @@ -1241,6 +1271,10 @@ public void testOtherCases() throws Exception {
"//doc/str[@name='id'][.='403']");


//assertU(adoc("id", "402", "bibcode", "xxxxxxxxxx402", "title",
//"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$ ray Sources for Radio Millisecond Pulsars"));
//assertU(adoc("id", "403", "bibcode", "xxxxxxxxxx403", "title",
//"A 350-MHz GBT Survey of 50 Faint Fermi γ ray Sources for Radio Millisecond Pulsars"));


}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1200,8 +1200,9 @@


<field name="title" type="ads_text" indexed="true" stored="true"
multiValued="true"/>
<field name="title_nosyn" type="ads_text_nosyn" indexed="false" stored="false"/>
multiValued="true"
/>
<field name="title_nosyn" type="ads_text_nosyn" indexed="false" stored="false"/>

<field name="alternate_title" type="ads_text" indexed="true" stored="true"
multiValued="true" />
Expand Down Expand Up @@ -1242,7 +1243,7 @@
<field name="editor" type="author" indexed="true" stored="true"
multiValued="true" omitNorms="true" useDocValuesAsStored="false"/>

<!-- TODO: remove once index has been rebuilt -->

<field name="aff" type="affiliation_text" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<field name="aff_abbrev" type="affiliation_text" indexed="true" stored="true"
Expand All @@ -1259,6 +1260,7 @@
docValues="true"/>
<field name="aff_id" type="affiliation_tokens" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<!-- TODO: remove once index has been rebuilt -->
<field name="aff_raw" type="affiliation_text" indexed="true" stored="true"
omitNorms="true" multiValued="true" />

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -695,7 +695,7 @@
<str name="full">title^2 abstract^2 body keyword ack</str>
<str name="abs">title abstract keyword</str>
<str name="orcid">orcid_pub orcid_user^0.9 orcid_other^0.8</str>
<str name="aff">aff_abbrev aff_canonical aff_id institution^2 aff_raw^0.5</str>
<str name="affil">aff_abbrev aff_canonical aff_id institution^2 aff^0.5</str>
</lst>
<str name="aqp.unfieldedSearchField">unfielded_search</str>
<str name="aqp.defaultField">unfielded_search</str>
Expand Down Expand Up @@ -726,7 +726,7 @@
<str name="full">title^2 abstract^2 body keyword ack</str>
<str name="abs">title abstract keyword</str>
<str name="orcid">orcid_pub orcid_user^0.9 orcid_other^0.8</str>
<str name="aff">aff_abbrev aff_canonical aff_id institution^2 aff_raw^0.5</str>
<str name="affil">aff_abbrev aff_canonical aff_id institution^2 aff^0.5</str>
</lst>
<str name="aqp.unfieldedSearchField">unfielded_search</str>
<str name="aqp.defaultField">unfielded_search</str>
Expand Down

0 comments on commit ffc8081

Please sign in to comment.