Skip to content

Commit

Permalink
Made a release with all the changes
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed May 26, 2020
1 parent cffa6d3 commit 3eaef03
Show file tree
Hide file tree
Showing 6 changed files with 213 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ private void pickSynonyms(List<QueryNode> termList, List<QueryNode> newList, Set
int minFreq = Integer.MAX_VALUE;
int maxFreq = Integer.MIN_VALUE;
Integer closestLen = null;
int oldSize = newList.size();

// first decide on one of the scenarios: 1. xor 2.
for (QueryNode n: termList) {
Expand Down Expand Up @@ -347,8 +348,18 @@ else if (strategy.equals("cantDecide") && closestLenTerm != null) {
}


if (newList.size() == 0) { // we didn't find any type that would satisfy the condition
newList.add(termList.get(0));
if (newList.size() == oldSize) { // we didn't find any type that would satisfy the condition
QueryNode picked = termList.get(0);
// pick the longest if you can
int x = 0;
for (QueryNode t: termList) {
int l = ((FieldQueryNode) t).getTextAsString().length();
if (l > x) {
x = l;
picked = t;
}
}
newList.add(picked);
}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ public void test() throws Exception {
//System.err.println(h.query(req("q", "aff:foo1")));

assertQ(req("q", "*:*"), "//*[@numFound>='2']");

assertQueryEquals(req("q", "aff:\"Institut d’Astrophysique\"", "aqp.multiphrase.keep_one", "SYNONYM", "qt", "aqp"),
"(aff:\"institut d'astrophysique\" | aff:\"institut d astrophysique\")",
DisjunctionMaxQuery.class
);

assertQ(req("q", "aff:xfoo"), "//*[@numFound='0']");

assertQueryEquals(req("q", "aff:\"Pasadena, CA 91125\"", "qt", "aqp"),
Expand Down Expand Up @@ -130,7 +136,7 @@ public void test() throws Exception {
);


assert h.query(req("q", "aff:foo1"))
assert h.query(req("q", "aff:foo1", "fl", "aff"))
.contains("<arr name=\"aff\">" +
"<str>foo1</str>" +
"<str>foo2</str>" +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ public void test() throws Exception {


// check the affiliation is there stored as one string
assert h.query(req("q", "institution:\"Kavli Institute/Dept of Physics\""))
assert h.query(req("q", "institution:\"Kavli Institute/Dept of Physics\"", "fl", "institution"))
.contains("<str>Kavli Institute/Dept of Physics</str>"
);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ public static String getConfigFile() {

newConfig = duplicateFile(new File(configFile));

//replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory");
replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory");

} catch (IOException e) {
e.printStackTrace();
Expand Down Expand Up @@ -233,7 +233,7 @@ public void setUp() throws Exception {
assertU(adoc("id", "11", "bibcode", "xxxxxxxxxxx11", "title", "All-sky data survey"));
assertU(adoc("id", "12", "bibcode", "xxxxxxxxxxx12", "title", "NoSky data survey"));
assertU(adoc("id", "13", "bibcode", "xxxxxxxxxxx13", "title", "AllSky data survey"));
assertU(adoc("id", "14", "bibcode", "xxxxxxxxxxx14", "title", "Modified Newtonian Dynamics (MOND): Observational Phenomenology and Relativistic Extensions"));
assertU(adoc("id", "14", "bibcode", "xxxxxxxxxxx14", "title", "Modified Newtonian Dynamics: Observational Phenomenology and Relativistic Extensions"));
assertU(adoc("id", "15", "bibcode", "xxxxxxxxxxx15", "title", "MOND test"));
assertU(adoc("id", "16", "bibcode", "xxxxxxxxxxx16", "title", "mond test"));
assertU(adoc("id", "17", "bibcode", "xxxxxxxxxxx17", "title", "bubble pace telescope multi-pace foobar"));
Expand Down Expand Up @@ -269,6 +269,11 @@ public void setUp() throws Exception {
"keyword", "one Astrophysics Data System two"));
assertU(adoc("id", "502", "bibcode", "xxxxxxxxxx502",
"keyword", "one ads two"));
assertU(adoc("id", "600", "bibcode", "xxxxxxxxxx600",
"title", "THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS"));
assertU(adoc("id", "601", "bibcode", "xxxxxxxxxx601",
"title", "the hubble constant: a summary of the HST program for the luminosity calibration of type Ia supernovae by means of cepheids"));

assertU(commit());
}

Expand Down Expand Up @@ -355,6 +360,9 @@ public void testMultiTokens() throws Exception {
assertQueryEquals(req("q", "title:\"observations black hole\""),
"(title:\"observations black hole\" | title:\"observations (syn::black hole syn::bh acr::bh)\"~2)",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "title:\"observations BH\""),
"title:\"observations (acr::bh syn::black hole syn::bh)\"~2",
MultiPhraseQuery.class);

// btw our analyzer chain outputs all multi synonyms during indexing (but only canonical synonym for single ones)
// that makes sense because we don't know how the user/author are writing them; but because we output all of them
Expand Down Expand Up @@ -535,8 +543,7 @@ public void testMultiTokens() throws Exception {
"Synonym(all:mond all:syn::lunar)",
SynonymQuery.class);
assertQ(req("q", "title" + ":mond"),
"//*[@numFound='5']",
"//doc/str[@name='id'][.='14']",
"//*[@numFound='4']",
"//doc/str[@name='id'][.='15']",
"//doc/str[@name='id'][.='16']",
"//doc/str[@name='id'][.='17']",
Expand All @@ -546,9 +553,8 @@ public void testMultiTokens() throws Exception {
"Synonym(all:mond all:syn::lunar)",
SynonymQuery.class);
assertQ(req("q", "title" + ":Mond"),
"//*[@numFound='5']",
"//*[@numFound='4']",
"//doc/str[@name='id'][.='17']", // orig 'space' -> syn:lunar; look at the synonym file to understand
"//doc/str[@name='id'][.='14']",
"//doc/str[@name='id'][.='15']",
"//doc/str[@name='id'][.='16']",
"//doc/str[@name='id'][.='20']");
Expand Down Expand Up @@ -588,9 +594,15 @@ public void testMultiTokens() throws Exception {
"(title:\"modified newtonian dynamics\" "
+ "| Synonym(title:acr::mond title:syn::modified newtonian dynamics title:syn::mond))",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "title:\"MOND\"", "defType", "aqp"),
"Synonym(title:acr::mond title:syn::modified newtonian dynamics title:syn::mond)",
SynonymQuery.class);
assertQ(req("q", "title" + ":\"modified newtonian dynamics\""), "//*[@numFound='2']",
"//doc/str[@name='id'][.='14']",
"//doc/str[@name='id'][.='15']");
assertQ(req("q", "title" + ":\"MOND\""), "//*[@numFound='2']",
"//doc/str[@name='id'][.='14']",
"//doc/str[@name='id'][.='15']");


// multi-token. this is truly crazy (several synonyms overlap)
Expand Down Expand Up @@ -786,9 +798,12 @@ public void testSynonyms() throws Exception {
DisjunctionMaxQuery.class);

assertQ(req("q", "title:\"hubble space telescope\""),
"//*[@numFound='2']",
"//*[@numFound='4']",
"//doc/str[@name='id'][.='4']",
"//doc/str[@name='id'][.='5']");
"//doc/str[@name='id'][.='5']",
"//doc/str[@name='id'][.='600']",
"//doc/str[@name='id'][.='601']"
);


// preceded by something
Expand Down Expand Up @@ -845,7 +860,7 @@ public void testSynonyms() throws Exception {
"Synonym(title:acr::hst title:syn::hst title:syn::hubble space telescope)",
SynonymQuery.class);
assertQ(req("q", "title:HST"),
"//*[@numFound='2']",
"//*[@numFound='4']",
"//doc/str[@name='id'][.='4']",
"//doc/str[@name='id'][.='5']");

Expand Down Expand Up @@ -999,6 +1014,91 @@ public void testSynonyms() throws Exception {

public void testOtherCases() throws Exception {


/**
*
* input:
*
* THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS
*
* this is how it gets indexed internally (before moving acronym filter after stop filters):
*
* [(0, ['acr::the']),
(1, ['acr::hubble', 'hubble']),
(2, ['constant']),
(3, ['acr::of', 'acr::summary', 'acr::the', 'summary']),
(4, ['acr::hst', 'hst', 'syn::hst', 'syn::hubble space telescope']),
(5, ['acr::for', 'acr::program', 'acr::the', 'program']),
(6, ['acr::luminosity', 'luminosity']),
(7, ['acr::calibration', 'acr::of', 'calibration']),
(8, ['acr::type', 'type']),
(9, ['ia']),
(10, ['acr::supernovae', 'supernovae']),
(11, ['acr::by', 'by']),
(12, ['acr::means', 'acr::of', 'means']),
(13, ['acr::cepheids', 'cepheids']),
(14, []),
(15, []),
(16, []),
(17, []),
(18, []),
(19, [])]
with stop filters before acronyms
[(0, []),
(1, ['acr::hubble']),
(2, ['constant']),
(3, ['acr::summary', 'summary']),
(4, []),
(5, ['acr::program', 'program']),
(6, ['acr::luminosity', 'luminosity']),
(7, ['acr::calibration', 'calibration']),
(8, ['acr::type', 'type']),
(9, ['ia']),
(10, ['acr::supernovae', 'supernovae']),
(11, ['acr::by', 'by']),
(12, ['acr::means', 'means']),
(13, ['acr::cepheids', 'cepheids']),
and this is how it got parsed before the change:
title:"acr::the acr::hubble constant acr::summary acr::of acr::the (acr::hst syn::hubble space telescope syn::hst) acr::program acr::for acr::the acr::luminosity acr::calibration acr::of acr::type ia acr::supernovae acr::by acr::means acr::of acr::cepheids"~3
[(0, 'acr::the'),
(1, 'acr::hubble'),
(2, 'constant'),
(3, 'acr::summary'),
(4, 'acr::of'),
(5, 'acr::the'),
(6, '(acr::hst syn::hubble space telescope syn::hst)'),
(7, 'acr::program'),
(8, 'acr::for'),
(9, 'acr::the'),
(10, 'acr::luminosity'),
(11, 'acr::calibration'),
(12, 'acr::of'),
(13, 'acr::type'),
(14, 'ia'),
(15, 'acr::supernovae'),
(16, 'acr::by'),
(17, 'acr::means'),
(18, 'acr::of'),
(19, 'acr::cepheids')]
*/
assertQueryEquals(req("q", "title:\"THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS\""),
"title:\"acr::hubble constant acr::summary (acr::hst syn::hubble space telescope syn::hst) acr::program acr::luminosity acr::calibration acr::type ia acr::supernovae acr::by acr::means acr::cepheids\"~3",
MultiPhraseQuery.class);
assertQ(req("q", "title:\"THE HUBBLE constant: A SUMMARY OF THE HST PROGRAM FOR THE LUMINOSITY CALIBRATION OF TYPE Ia SUPERNOVAE BY MEANS OF CEPHEIDS\""),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='600']");
assertQ(req("q", "title:\"the hubble constant: a summary of the HST program for the luminosity calibration of type Ia supernovae by means of cepheids\""),
"//*[@numFound='2']",
"//doc/str[@name='id'][.='600']",
"//doc/str[@name='id'][.='601']");

// change to NGC tokenizer in the schema; we want to index both
// variants, but during search time only query for the concat version

Expand Down Expand Up @@ -1225,8 +1325,71 @@ public void testOtherCases() throws Exception {

//TODO: this test is intentionally left failing; it used to work until the scoring changes (i'd like to
// investigate more how the multi-token affects recall)
/**
* 1. A 350-MHz GBT Survey of 50 Faint Fermi γ-ray
*
* gets indexed as:
*
* [(0, []),
(1, []),
(2, ['350mhz', 'syn::mhz']),
(3, ['gbt', 'syn::gbt', 'syn::green bank telescope']),
(4, ['syn::survey']),
(5, ['50']),
(6, ['faint', 'syn::faint']),
(7, ['fermi', 'syn::fermi']),
(8,
['syn::gamma',
'syn::gamma ray',
'syn::gamma rays',
'syn::gammaray',
'syn::gammarays']),
(9,
['gammaray',
'ray',
'syn::gamma ray',
'syn::gamma rays',
'syn::gammaray',
'syn::gammarays']),
(10, ['syn::source']),
(11, ['syn::radio']),
(12, ['millisecond', 'syn::millisecond'])]
while
2. A 350-MHz GBT Survey of 50 Faint Fermi γ ray
gets indexed as:
[(0, []),
(1, []),
(2, ['350mhz', 'syn::mhz']),
(3, ['syn::gbt', 'syn::green bank telescope']),
(4, ['syn::survey']),
(5, ['50']),
(6, ['faint', 'syn::faint']),
(7, ['fermi', 'syn::fermi']),
(8,
['syn::gamma',
'syn::gamma ray',
'syn::gamma rays',
'syn::gammaray',
'syn::gammarays']),
(9, ['ray']),
(10, ['syn::source']),
(11, ['syn::radio']),
(12, ['millisecond', 'syn::millisecond'])]
*/
dumpDoc(null, "title", "bibcode");
assertQ(req("q", "title:\"γ-ray Sources\"",

assertQueryEquals(req("q", "title:\"γ ray Sources\""),
"(title:\"(gamma syn::gamma) ray (sources syn::source)\"~2 | title:\"(syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? (sources syn::source)\"~2)",
DisjunctionMaxQuery.class);
assertQueryEquals(req("q", "title:\"γ-ray Sources\""),
"title:\"(gamma syn::gamma syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (ray gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source)\"~2",
MultiPhraseQuery.class);

assertQ(req("q", "title:\"γ ray Sources\"",
"indent", "true",
"debugQuery", "true"),
"//*[@numFound='4']",
Expand All @@ -1235,7 +1398,7 @@ public void testOtherCases() throws Exception {
"//doc/str[@name='id'][.='402']",
"//doc/str[@name='id'][.='403']"
);
assertQ(req("q", "title:\"γ ray Sources\"",
assertQ(req("q", "title:\"$\\gamma$ ray Sources\"",
"indent", "true",
"debugQuery", "true"),
"//*[@numFound='4']",
Expand All @@ -1244,7 +1407,7 @@ public void testOtherCases() throws Exception {
"//doc/str[@name='id'][.='402']",
"//doc/str[@name='id'][.='403']"
);
assertQ(req("q", "title:\"$\\gamma$ ray Sources\"",
assertQ(req("q", "title:\"γ-ray Sources\"",
"indent", "true",
"debugQuery", "true"),
"//*[@numFound='4']",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public void testSearch() throws Exception {
"//*[@numFound='6']"
);

assertQ(req("q", "bibcode:A"),
assertQ(req("q", "bibcode:A", "fl", "bibcode"),
"//*[@numFound='1']",
"//result/doc[1]/str[@name='bibcode']='A'"
);
Expand All @@ -78,25 +78,25 @@ public void testSearch() throws Exception {
"//*[@numFound='0']"
);

assertQ(req("q", "citations(bibcode:b)"),
assertQ(req("q", "citations(bibcode:b)", "fl", "bibcode"),
"//*[@numFound='2']",
"//result/doc/str[@name='bibcode']='A'",
"//result/doc/str[@name='bibcode']='D'"
);
assertQ(req("q", "joincitations(bibcode:B)"),
assertQ(req("q", "joincitations(bibcode:B)", "fl", "bibcode"),
"//*[@numFound='2']",
"//result/doc/str[@name='bibcode']='A'",
"//result/doc/str[@name='bibcode']='D'"
);


assertQ(req("q", "references(bibcode:A)"),
assertQ(req("q", "references(bibcode:A)", "fl", "bibcode"),
"//*[@numFound='3']",
"//result/doc/str[@name='bibcode']='B'",
"//result/doc/str[@name='bibcode']='C'",
"//result/doc/str[@name='bibcode']='D'"
);
assertQ(req("q", "joinreferences(bibcode:A)"),
assertQ(req("q", "joinreferences(bibcode:A)", "fl", "bibcode"),
"//*[@numFound='3']",
"//result/doc/str[@name='bibcode']='B'",
"//result/doc/str[@name='bibcode']='C'",
Expand Down
Loading

0 comments on commit 3eaef03

Please sign in to comment.