Fix for the multi-token phrase search not finding papers when N-\d token is present
romanchyla committed Nov 15, 2021
1 parent 277bde9 commit a8b1e85
Showing 7 changed files with 69 additions and 43 deletions.
@@ -72,10 +72,10 @@ else if (typeAtt.type().equals(tokenType) && this.genVariants()) {
}

private boolean genVariants() {
//log.debug("generating name variants for: " + authorName);
//System.out.println("generating name variants for: " + termAtt.toString());
ArrayList<String> synonyms = AuthorUtils.getAsciiTransliteratedVariants(termAtt.toString());
if (synonyms != null && synonyms.size() > 0) {
//log.debug("variants: " + synonyms);
//System.out.println("variants: " + synonyms);
transliterationStack = synonyms;
N = synonyms.size();
return true;
48 changes: 25 additions & 23 deletions contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java
@@ -137,7 +137,7 @@ public void test() throws Exception {
", \"aff_abbrev\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\"]" +
", \"aff_canonical\": [\"Harvard Smithsonian Center for Astrophysics\", \"Harvard University, Department of Astronomy\", \"-\"]" +
", \"aff\": [\"-\", \"NASA Kavli space center, Cambridge, MA 02138, USA\", \"Einstein institute, Zurych, Switzerland\"]" +
", \"institution\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\", \"foo/bar baz\"]" +
", \"institution\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\", \"foo/bar baz\"]" +
", \"aff_facet\": [[\"A1234\", \"facet abbrev/parent abbrev\"]]" +
", \"aff_facet_hier\": [\"1/1812/61814\", \"1/8264/61814\", \"1/1812/A1036\", \"-\"]" +

@@ -193,17 +193,18 @@ public void test() throws Exception {
", \"date\": \"2013-08-05T00:30:00Z\"" +
", \"doctype\": \"article\"" +
", \"doctype_facet_hier\": [\"0/Article\", \"1/Article/Book chapter\"]" +
", \"doi\": \"doi:ŽŠČŘĎŤŇ:123456789\"" +
", \"editor\": [\"t' Hooft, van X\"]" +
", \"eid\": \"00001\"" +
", \"email\": [\"-\", \"anders@email.com\", \"-\"]" +
// entry_date --> see below
", \"esources\": [\"AUTHOR_HTML\", \"PUB_PDF\"]" +
// Field that contains both grant ids and grant agencies.
", \"grant\": [\"NASA\", \"123456-78\", \"NSF-AST\", \"0618398\"]" +
// grant_agency/grant_id
", \"grant_facet_hier\": [\"0/NASA\", \"1/NASA/123456-78\"]" +
", \"identifier\": [\"arxiv:1234.5678\", \"ARXIV:hep-ph/1234\"]" +
", \"doi\": \"doi:ŽŠČŘĎŤŇ:123456789\"" +
", \"editor\": [\"t' Hooft, van X\"]" +
", \"eid\": \"00001\"" +
", \"email\": [\"-\", \"anders@email.com\", \"-\"]" +
// entry_date --> see below
", \"esources\": [\"AUTHOR_HTML\", \"PUB_PDF\"]" +
// Field that contains both grant ids and grant agencies.
", \"grant\": [\"NASA\", \"123456-78\", \"NSF-AST\", \"0618398\"]" +
// grant_agency/grant_id
", \"grant_facet_hier\": [\"0/NASA\", \"1/NASA/123456-78\"]" +
// pipeline is now filling this field (we do no more copyfield)
", \"identifier\": [\"arxiv:1234.5678\", \"ARXIV:hep-ph/1234\", \"2014JNuM..455...1a1\", \"2014JNuM..455...1a2\", \"2014JNuM..455...10B\"]" +
", \"ids_data\": [\"{whatever: here there MAST}\"]" +
", \"issue\": \"24i\"" +

@@ -217,9 +218,9 @@ public void test() throws Exception {
", \"links_data\": [\"{whatever: here there MAST}\"," +
"\"{\\\"foo\\\": [\\\"bar\\\", \\\"baz\\\"], \\\"one\\\": {\\\"two\\\": \\\"three\\\"}}\"]" +

", \"nedid\": [\"X+1-5 =6\", \"foo bar\"]" +
", \"nedtype\": [\"Other\", \"type2\"]" +
", \"ned_object_facet_hier\": [ \"0/Other\", \"1/Other/X+1-5 =6\", \"0/type2\", \"1/type2/foo bar\"]" +
", \"nedid\": [\"X+1-5 =6\", \"foo bar\"]" +
", \"nedtype\": [\"Other\", \"type2\"]" +
", \"ned_object_facet_hier\": [ \"0/Other\", \"1/Other/X+1-5 =6\", \"0/type2\", \"1/type2/foo bar\"]" +

", \"orcid_pub\": [\"1111-2222-3333-4444\", \"-\", \"0000-0002-4110-3511\"]" +
", \"orcid_user\": [\"-\", \"-\", \"0000-0002-4110-3511\"]" +
@@ -255,14 +256,14 @@ public void test() throws Exception {
", \"metadata_ctime\": \"2017-09-19T11:01:32.809Z\"" + // missing 'Z' (not accepted)
", \"metadata_mtime\": \"2017-09-19T11:01:32.809Z\"" +
", \"fulltext_ctime\": \"2017-09-19T11:01:32.809Z\"" +
", \"fulltext_mtime\": \"2017-09-19T11:01:32.809Z\"" +
", \"nonbib_ctime\": \"2017-09-19T11:01:32.809Z\"" +
", \"nonbib_mtime\": \"2017-09-19T11:01:32.809Z\"" +
", \"metrics_ctime\": \"2017-09-19T11:01:32.809Z\"" +
", \"metrics_mtime\": \"2017-09-19T11:01:32.809Z\"" +
", \"orcid_ctime\": \"2017-09-19T11:01:32.809Z\"" +
", \"orcid_mtime\": \"2017-09-19T11:01:32.809Z\"" +

", \"fulltext_mtime\": \"2017-09-19T11:01:32.809Z\"" +
", \"nonbib_ctime\": \"2017-09-19T11:01:32.809Z\"" +
", \"nonbib_mtime\": \"2017-09-19T11:01:32.809Z\"" +
", \"metrics_ctime\": \"2017-09-19T11:01:32.809Z\"" +
", \"metrics_mtime\": \"2017-09-19T11:01:32.809Z\"" +
", \"orcid_ctime\": \"2017-09-19T11:01:32.809Z\"" +
", \"orcid_mtime\": \"2017-09-19T11:01:32.809Z\"" +

"}" +
"}}";
@@ -307,6 +308,7 @@ public void test() throws Exception {

assertU(adoc("id", "60", "bibcode", "b60", "abstract", "all no-sky survey"));


assertU(commit("waitSearcher", "true"));

assertQ(req("q", "*:*"),
@@ -619,6 +619,7 @@ public void testMultipleTokenConcatenation() throws Exception {
assertQueryEquals("foo:(A)", null, "foo:a");
assertQueryEquals("foo:(A -B)", null, "+foo:a -foo:b");
assertQueryEquals("foo:(A B D E)", null, "+foo:a +foo:b +foo:d +foo:e"); // but this is fielded
assertQueryEquals("\"A B D E\"", null, "\"a b d e\"");
assertQueryEquals("A B D E", null, "\"a b d e\"");
assertQueryEquals("+A B D E", null, "\"a b d e\"");
assertQueryEquals("A +B D E", null, "+a +\"b d e\"");
@@ -1131,8 +1131,8 @@ public void testOtherCases() throws Exception {
);

assertQueryEquals(req("q", "title:\"NGC 1\"", "defType", "aqp"),
"title:acr::ngc1",
TermQuery.class);
"(title:acr::ngc1 | title:\"acr::ngc 1\")",
DisjunctionMaxQuery.class);
assertQ(req("q", "title" + ":NGC 1", "indent", "true"),
"//*[@numFound='5']",
"//doc/str[@name='id'][.='153']",
@@ -1144,8 +1144,8 @@ public void testOtherCases() throws Exception {


assertQueryEquals(req("q", "title:\"NGC-1\"", "defType", "aqp"),
"title:acr::ngc1",
TermQuery.class);
"(title:acr::ngc1 | title:\"acr::ngc 1\")",
DisjunctionMaxQuery.class);
assertQ(req("q", "title" + ":NGC-1"),
"//*[@numFound='5']",
"//doc/str[@name='id'][.='153']",
@@ -1156,8 +1156,8 @@ public void testOtherCases() throws Exception {
);

assertQueryEquals(req("q", "title:\"N-1\"", "defType", "aqp"),
"title:n1",
TermQuery.class);
"(title:n1 | title:\"n 1\")",
DisjunctionMaxQuery.class);
assertQ(req("q", "title" + ":N-1"),
"//*[@numFound='2']",
"//doc/str[@name='id'][.!='153']",
@@ -1170,8 +1170,8 @@ public void testOtherCases() throws Exception {
// this finds 0 because during indexing, we'd turn the two
// tokens into 'n1' - and this search
assertQueryEquals(req("q", "title:\"N 1\"", "defType", "aqp"),
"title:n1",
TermQuery.class);
"(title:n1 | title:\"n 1\")",
DisjunctionMaxQuery.class);
assertQ(req("q", "title" + ":\"N 1\""),
"//*[@numFound='2']",
"//doc/str[@name='id'][.!='153']",
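The expected parses above change from a single TermQuery to a DisjunctionMaxQuery that accepts either the catenated acronym token or the acronym-plus-number phrase, which is what lets a phrase containing an N-\d style token match documents indexed either way. A minimal Lucene sketch of a query with that shape, reusing the terms from the test expectation for title:"NGC 1" (the 0.0f tie-breaker value is an assumption):

```java
import java.util.Arrays;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class AcronymNumberQueryShape {
  public static void main(String[] args) {
    // Catenated form: acronym and number indexed as a single token.
    Query catenated = new TermQuery(new Term("title", "acr::ngc1"));

    // Split form: acronym and number as a two-token phrase.
    Query split = new PhraseQuery("title", "acr::ngc", "1");

    // Either alternative may match; the 0.0f tie-breaker is an assumption.
    Query q = new DisjunctionMaxQuery(Arrays.asList(catenated, split), 0.0f);
    System.out.println(q); // (title:acr::ngc1 | title:"acr::ngc 1")
  }
}
```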
@@ -31,11 +31,26 @@ public void testAuthorSynonyms() throws Exception {

checkIt("Müller, Bill", "Müller, Bill", "Mueller, Bill", "Muller, Bill");
checkIt("Peißker, L", "Peißker, L", "Peissker, L");


}

public void testAccents() throws Exception {
checkIt("Jeřábková, Tereza", "Jeřábková, Tereza", "Jerhaebkovae, Tereza", "Jerabkova, Tereza");
checkIt("Dupré", "Dupré", "Dupree", "Dupre");
checkIt("Duprè", "Duprè", "Dupre", "Duprè"); // Dupre\\xcc\\x80
checkIt("\u0141", "Ł", "L");
// System.out.println("\u0141");
// System.out.println("\u0308E");
// System.out.println("\u030aA");
// System.out.println("\u0301E");
// System.out.println("\u030cH");
// //checkIt("\u0308E", "̈E");
// checkIt("Mendigutıa", "Mendigutia");
// checkIt("\u030aA", "\u030aA", "A");
// checkIt("\u0301E", "E");
// checkIt("\u030cH", "H");

}
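The variants checked above come from AuthorUtils.getAsciiTransliteratedVariants. The plain de-accented variant (e.g. "Dupré" → "Dupre", "Jeřábková" → "Jerabkova") can be reproduced with standard Unicode normalization; the sketch below illustrates only that one variant and is not the project's transliteration code, which also produces forms like "Mueller" and "Jerhaebkovae" and handles characters such as "Ł" that do not decompose:

```java
import java.text.Normalizer;

public class DeaccentSketch {
  // Decompose to NFD and strip combining marks; covers only the simple de-accented variant.
  static String deaccent(String name) {
    String decomposed = Normalizer.normalize(name, Normalizer.Form.NFD);
    return decomposed.replaceAll("\\p{M}+", "");
  }

  public static void main(String[] args) {
    System.out.println(deaccent("Dupré"));     // Dupre
    System.out.println(deaccent("Jeřábková")); // Jerabkova
  }
}
```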

private void checkIt(String input, String... expected) throws Exception {
@@ -110,7 +110,8 @@ public static String getSchemaFile() {
"lensing => mikrolinseneffekt",
"pink => pinkish",
"stephen, stephens => stephen",
"bremßtrahlung => brehmen"
"bremßtrahlung => brehmen",
"protostars, protostellar, protostar, protosterne, circumprotostellar, protoestrellas, protosellar, prototstars, photostellar, preprotostars, protoetoile, protostellarlike, protostarlike => protostars"
});
replaceInFile(newConfig, "synonyms=\"ads_text_simple.synonyms\"", "synonyms=\"" + synonymsFile.getAbsolutePath() + "\"");

@@ -383,11 +384,19 @@ public void testUnfieldedSearch() throws Exception {
}

public void testSpecialCases() throws Exception {

assertU(adoc("id", "61", "bibcode", "b61",
"title", "A Change of Rotation Profile in the Envelope in the HH 111 Protostellar System: A Transition to a Disk?"));

assertU(adoc("id", "2", "bibcode", "XXX", "abstract", "foo bar baz",
"title", "title bitle"));
assertU(commit("waitSearcher", "true"));

assertQ(req("q", "title:\"A Change of Rotation Profile in the Envelope in the HH 111 Protostellar System: A Transition to a Disk\""),
"//*[@numFound='1']",
"//doc/str[@name='id'][.='61']");


assertQEx("INVALID_SYNTAX", req("q", "author:\"^\"de marco year:2015"), 400);


@@ -456,10 +465,11 @@ public void testSpecialCases() throws Exception {
int md = ir.get().maxDoc();
ir.decref();

assertQueryEquals(req("defType", "aqp", "q", "similar(topn(200, *:*), title abstract)"),
setDebug(true);
assertQueryEquals(req("defType", "aqp", "q", "similar(topn(200, abstract:foo), title abstract)"),
"+like:foo bar baz title bitle -BitSetQuery(" + md + ")",
BooleanQuery.class);
assertQueryEquals(req("defType", "aqp", "q", "similar(topn(200, *:*) , title abstract) foo"),
assertQueryEquals(req("defType", "aqp", "q", "similar(topn(200, abstract:foo) , title abstract) foo"),
"+(+like:foo bar baz title bitle -BitSetQuery(" + md + ")) +all:foo",
BooleanQuery.class);

12 changes: 5 additions & 7 deletions contrib/examples/adsabs/server/solr/collection1/conf/schema.xml
@@ -528,23 +528,22 @@
<!-- final normalization -->
<filter class="solr.TrimFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />

<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="index:1"/> -->
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory" />
<!-- AA: as above, but we only have one canonical replacement for the
expression -->
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(MESSIER)(-|\s+)([0-9]+[A-Z]*))\b"
replacement="$1$3" />
replacement="$1-$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1-$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(NGC|N)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
pattern="\b(?i:(NGC|N)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1-$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:([34]CR?|ADS|H[DHR]|IC|[MW]|MKN|NGC|PKS|PSR[BJ]?|SAO|UGC|UT)(-|\s+)([0-9]+[A-Z]*))\b"
replacement="$1$3" />

replacement="$1-$3" />

<!-- tokenize on empty space (if it is not a hyphen connecting other
words) -->
@@ -562,7 +561,6 @@
catenateNumbers="0" catenateAll="1" splitOnCaseChange="0"
splitOnNumerics="0" stemEnglishPossessive="1" preserveOriginal="0" />

<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="query:split"/> -->

<!-- lowercase words, but keep ACRONYMS case ie. MOND => MOND Mond =>
mond Hubble Space Telescope => hubble space telescope -->
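The query analyzer above now uses replacement="$1-$3" instead of "$1$3", so catalog identifiers keep a hyphen between the acronym and the number rather than being collapsed into one token; the word-delimiter filter further down the chain (with catenateAll="1") can then emit both the joined and the split forms. A standalone sketch of what the regex replacement does, using plain java.util.regex outside Solr (the sample sentence is invented):

```java
import java.util.regex.Pattern;

public class CharFilterRegexSketch {
  public static void main(String[] args) {
    // Same pattern the schema applies to NGC/N identifiers.
    Pattern p = Pattern.compile("\\b(?i:(NGC|N)(-|\\s+)([0-9]+[A-Z]*))\\b");

    String text = "observations of NGC 1068 and N-1";

    // Old behaviour: acronym and number collapsed into a single token.
    System.out.println(p.matcher(text).replaceAll("$1$3"));  // ... NGC1068 and N1
    // New behaviour: a hyphen is preserved between them.
    System.out.println(p.matcher(text).replaceAll("$1-$3")); // ... NGC-1068 and N-1
  }
}
```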