Skip to content

Commit

Permalink
Changed the rules for NGC objects
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed Aug 7, 2019
1 parent e6a6d3c commit 276be34
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,11 @@ public void setUp() throws Exception {
assertU(adoc("id", "150", "bibcode", "xxxxxxxxxx150", "title", "nag5-abcd"));
assertU(adoc("id", "151", "bibcode", "xxxxxxxxxx151", "title", "nag5abcd"));
assertU(adoc("id", "152", "bibcode", "xxxxxxxxxx152", "title", "nag5 abcd"));
assertU(adoc("id", "153", "bibcode", "xxxxxxxxxx153", "title", "NGC 1"));
assertU(adoc("id", "154", "bibcode", "xxxxxxxxxx154", "title", "NGC-1"));
assertU(adoc("id", "155", "bibcode", "xxxxxxxxxx155", "title", "N-1"));
assertU(adoc("id", "156", "bibcode", "xxxxxxxxxx156", "title", "N 1"));
assertU(adoc("id", "157", "bibcode", "xxxxxxxxxx157", "title", "NGC1"));

assertU(adoc("id", "318", "bibcode", "xxxxxxxxxx318", "title", "creation of a thesaurus", "pub", "creation of a thesaurus"));
assertU(adoc("id", "382", "bibcode", "xxxxxxxxxx382", "title", "xhtml <tags> should be <SUB>fooxx</SUB> <xremoved>"));
Expand Down Expand Up @@ -970,7 +975,96 @@ public void testSynonyms() throws Exception {
}

public void testOtherCases() throws Exception {

// NGC handling changed in the schema (char filter rules): at index time we
// want to index both variants, but at search time we query only for the
// concatenated version

assertQ(req("q", "title" + ":NGC"),
"//*[@numFound='4']",
"//doc/str[@name='id'][.='153']", //NGC 1
"//doc/str[@name='id'][.='154']", //NGC-1
"//doc/str[@name='id'][.='155']", //N-1
"//doc/str[@name='id'][.='156']" //N 1
//"//doc/str[@name='id'][.='157']" //NGC1
);

assertQueryEquals(req("q", "title:\"NGC 1\"", "defType", "aqp"),
"title:acr::ngc1",
TermQuery.class);
assertQ(req("q", "title" + ":NGC 1", "indent", "true"),
"//*[@numFound='5']",
"//doc/str[@name='id'][.='153']",
"//doc/str[@name='id'][.='154']",
"//doc/str[@name='id'][.='155']",
"//doc/str[@name='id'][.='156']",
"//doc/str[@name='id'][.='157']"
);


assertQueryEquals(req("q", "title:\"NGC-1\"", "defType", "aqp"),
"title:acr::ngc1",
TermQuery.class);
assertQ(req("q", "title" + ":NGC-1"),
"//*[@numFound='5']",
"//doc/str[@name='id'][.='153']",
"//doc/str[@name='id'][.='154']",
"//doc/str[@name='id'][.='155']",
"//doc/str[@name='id'][.='156']",
"//doc/str[@name='id'][.='157']" //NGC1
);

assertQueryEquals(req("q", "title:\"N-1\"", "defType", "aqp"),
"title:n1",
TermQuery.class);
assertQ(req("q", "title" + ":N-1"),
"//*[@numFound='2']",
"//doc/str[@name='id'][.!='153']",
"//doc/str[@name='id'][.!='154']",
"//doc/str[@name='id'][.='155']",
"//doc/str[@name='id'][.='156']",
"//doc/str[@name='id'][.!='157']"
);

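// the quoted query "N 1" is parsed into the single term 'n1' (during
// indexing, we'd turn the two tokens into 'n1' as well), so this search is
// expected to find the two N-variant documents (155 and 156), not 0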
assertQueryEquals(req("q", "title:\"N 1\"", "defType", "aqp"),
"title:n1",
TermQuery.class);
assertQ(req("q", "title" + ":\"N 1\""),
"//*[@numFound='2']",
"//doc/str[@name='id'][.!='153']",
"//doc/str[@name='id'][.!='154']",
"//doc/str[@name='id'][.='155']",
"//doc/str[@name='id'][.='156']",
"//doc/str[@name='id'][.!='157']" //NGC1
);

assertQueryEquals(req("q", "title:\"NGC1\"", "defType", "aqp"),
"title:acr::ngc1",
TermQuery.class);
assertQ(req("q", "title" + ":NGC1"),
"//*[@numFound='5']",
"//doc/str[@name='id'][.='153']",
"//doc/str[@name='id'][.='154']",
"//doc/str[@name='id'][.='155']",
"//doc/str[@name='id'][.='156']",
"//doc/str[@name='id'][.='157']"
);

assertQueryEquals(req("q", "=title:\"NGC 1\"", "defType", "aqp"),
"title:\"acr::ngc 1\"",
PhraseQuery.class);
assertQ(req("q", "=title" + ":NGC 1"),
"//*[@numFound='4']",
"//doc/str[@name='id'][.='153']",
"//doc/str[@name='id'][.='154']",
"//doc/str[@name='id'][.='155']",
"//doc/str[@name='id'][.='156']",
"//doc/str[@name='id'][.!='157']"
);



// #147 - parsing of WDDF tokens
// analyzer operation, e.g. XXX-YYYY => (XXX AND YYYY) OR XXXYYYY
assertQueryEquals(req("q", "NAG5-ABCD", "defType", "aqp"),
Expand Down
26 changes: 7 additions & 19 deletions contrib/examples/adsabs/server/solr/collection1/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1-$3 A$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(NGC)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC$3" />
pattern="\b(?i:(N)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC-$3 N$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:([34]CR?|ADS|H[DHR]|IC|[MW]|MKN|NGC|PKS|PSR[BJ]?|SAO|UGC|UT)(-|\s+)([0-9]+[A-Z]*))\b"
replacement="$1-$3" />
Expand Down Expand Up @@ -490,18 +490,18 @@
<filter class="solr.TrimFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />

<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> -->
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> //-->
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<!-- AA: as above, but we only have one canonical replacement for the
expression -->
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(MESSIER)(-|\s+)([0-9]+[A-Z]*))\b" replacement="M$3" />
pattern="\b(?i:(MESSIER)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="A$3" />
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(NGC)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC$3" />
pattern="\b(?i:(NGC|N)(-|\s+)([0-9]+[A-Z]*))\b" replacement="$1$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:([34]CR?|ADS|H[DHR]|IC|[MW]|MKN|NGC|PKS|PSR[BJ]?|SAO|UGC|UT)(-|\s+)([0-9]+[A-Z]*))\b"
replacement="$1$3" />
Expand All @@ -522,7 +522,7 @@
splitOnNumerics="0" stemEnglishPossessive="1" preserveOriginal="0"
/>

<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="split"/> -->
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="query:split"/> -->

<!-- lowercase words, but keep ACRONYMS case ie. MOND => MOND Mond =>
mond Hubble Space Telescope => hubble space telescope -->
Expand Down Expand Up @@ -599,19 +599,7 @@
<fieldType name="ads_text_nosyn" class="solr.TextField">
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<!-- AA: as above, but we only have one canonical replacement for the
expression -->
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(MESSIER)(-|\s+)([0-9]+[A-Z]*))\b" replacement="M$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(ABELL)(-|\s+)([0-9]+[A-Z]*))\b" replacement="A$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:(NGC)(-|\s+)([0-9]+[A-Z]*))\b" replacement="NGC$3" />
<charFilter class="solr.PatternReplaceCharFilterFactory"
pattern="\b(?i:([34]CR?|ADS|H[DHR]|IC|[MW]|MKN|NGC|PKS|PSR[BJ]?|SAO|UGC|UT)(-|\s+)([0-9]+[A-Z]*))\b"
replacement="$1$3" />



<!-- tokenize on empty space (if it is not a hyphen connecting other
words) -->
<tokenizer class="solr.PatternTokenizerFactory" pattern="(?&lt;![-\s])\s+(?!-)"
Expand Down

0 comments on commit 276be34

Please sign in to comment.