Commit 1363a91

Fixed FU guy's feedback

romanchyla committed May 21, 2020
1 parent 8d0021b commit 1363a91

Showing 4 changed files with 113 additions and 22 deletions.
18 changes: 9 additions & 9 deletions contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java
@@ -449,7 +449,7 @@ public void test() throws Exception {
"//doc/int[@name='recid'][.='100']",
"//doc/float[@name='score'][.='26.0']");

assert h.query(req("q", "author:\"Einstein, A\""))
assert h.query(req("q", "author:\"Einstein, A\"", "fl", "author_norm"))
.contains("<arr name=\"author_norm\">" +
"<str>t' Hooft, van X</str>" +
"<str>Anders, John Michael</str>" +
@@ -570,7 +570,7 @@ public void test() throws Exception {
"//*[@numFound='1']");


assert h.query(req("q", "recid:100"))
assert h.query(req("q", "recid:100", "fl", "aff"))
.contains("<arr name=\"aff\">" +
"<str>-</str>" +
"<str>NASA Kavli space center, Cambridge, MA 02138, USA</str>" +
@@ -646,7 +646,7 @@ public void test() throws Exception {


// order/gaps are important
assert h.query(req("q", "recid:100"))
assert h.query(req("q", "recid:100", "fl", "email"))
.contains("<arr name=\"email\">" +
"<str>-</str>" +
"<str>anders@email.com</str>" +
@@ -667,7 +667,7 @@ public void test() throws Exception {
"//doc/int[@name='recid'][.='100']",
"//*[@numFound='1']"
);
assert h.query(req("q", "recid:100"))
assert h.query(req("q", "recid:100", "fl", "orcid_pub"))
.contains("<arr name=\"orcid_pub\">" +
"<str>1111-2222-3333-4444</str>" +
"<str>-</str>" +
@@ -1293,7 +1293,7 @@ public void test() throws Exception {
* links_data (generated and stored as JSON for display purposes)
* ids_data (generated and stored as JSON for display purposes)
*/
assertQ(req("q", "id:100"),
assertQ(req("q", "id:100", "fl", "links_data"),
"//doc/arr[@name='links_data']/str[contains(text(),'MAST')]",
"//doc/arr[@name='links_data']/str[contains(text(),'{\"foo\": [\"bar\", \"baz\"], \"one\": {\"two\": \"three\"}}')]"
);
@@ -1469,7 +1469,7 @@ public void test() throws Exception {
assertQ(req("q", "page_range:23"),
"//*[@numFound='0']" // not searchable
);
assertQ(req("q", "page:55"),
assertQ(req("q", "page:55", "fl", "page_range"),
"//doc[1]/str[@name='page_range'][.='23-55s']"
);

@@ -1540,7 +1540,7 @@ public void test() throws Exception {
assertQ(req("q", "data:(nEd OR foo)"),
"//doc[1]/int[@name='recid'][.='100']"
);
assertQ(req("q", "data:\"NED:999\""), // numbers should be ignored in search, but stored
assertQ(req("q", "data:\"NED:999\"", "fl", "recid,data"), // numbers should be ignored in search, but stored
"//doc[1]/int[@name='recid'][.='100']",
"//doc[1]/arr[@name='data']/str[contains(text(),'NED:15')]"
);
@@ -1549,7 +1549,7 @@ public void test() throws Exception {
* esources
*
*/
assertQ(req("q", "esources:pub_pDF"),
assertQ(req("q", "esources:pub_pDF", "fl", "recid"),
"//doc[1]/int[@name='recid'][.='100']"
);

@@ -1626,7 +1626,7 @@ public void test() throws Exception {
/*
* similar() query
*/
assertQ(req("q", "similar(recid:100)"),
assertQ(req("q", "similar(recid:100)", "fl", "recid"),
"//doc[1]/int[@name='recid'][.='60']"
);

25 changes: 13 additions & 12 deletions contrib/examples/adsabs/server/solr/collection1/conf/schema.xml
@@ -403,7 +403,7 @@
unittest: TestAdsabsTypeFulltext -->
<!-- the tokenizing part needs more work, probably using synonyms to match
patterns? -->
<fieldType name="ads_text" class="solr.TextField">
<fieldType name="ads_text" class="solr.TextField" positionIncrementGap="0">
<analyzer type="index">
<charFilter class="solr.HTMLStripCharFilterFactory"/>

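positionIncrementGap controls how many positions are inserted between successive values of a multi-valued field; a gap of 0 lets phrase queries match across value boundaries. A worked example under that assumption, using a hypothetical multi-valued field "t" of type ads_text:

    doc: t = ["hubble constant", "summary"]
    gap=0:   positions are hubble=0, constant=1, summary=2, so t:"constant summary" matches
    gap=100: positions are hubble=0, constant=1, summary=102, so the phrase does not match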
@@ -445,6 +445,7 @@
class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />



<!-- find synonyms, first multi-tokens -->
<filter class="org.apache.lucene.analysis.synonym.NewSynonymFilterFactory"
synonyms="ads_text_multi.synonyms" ignoreCase="false" expand="true"
@@ -464,11 +465,6 @@
<filter class="solr.analysis.ResetFilterFactory"
incomingType="SYNONYM" addPrefix="syn::" posIncrement="0" />

-<!-- if the original or synonym contains UPPERCASE variant, mark it as
-an acronym but keep the type information (if it is a synonym, it will remain
-SYNONYM) which is important for the query parsing -->
-<filter class="solr.AcronymTokenFilterFactory" emitBoth="true"
-prefix="acr::" setType="ACRONYM"/>

<!-- remove stop words - first the case sensitively -->
<filter class="org.apache.lucene.analysis.core.AqpStopFilterFactory" ignoreCase="false"
@@ -478,6 +474,11 @@
<filter class="org.apache.lucene.analysis.core.AqpStopFilterFactory" ignoreCase="true"
words="ads_text.kill"/>

+<!-- if the original or synonym contains UPPERCASE variant, mark it as
+an acronym but keep the type information (if it is a synonym, it will remain
+SYNONYM) which is important for the query parsing -->
+<filter class="solr.AcronymTokenFilterFactory" emitBoth="true"
+prefix="acr::" setType="ACRONYM"/>


<!-- we emit ASCIIField version of the token (at the same position):
@@ -489,7 +490,7 @@
<filter class="solr.TrimFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />

<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> //-->
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> -->
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
@@ -551,11 +552,6 @@
inclOrig="true" />


-<!-- if the original or synonym contains UPPERCASE variant, mark it as
-an acronym but do not change its type, if it was a SYNONYM, it is important
-information for query parsing -->
-<filter class="solr.AcronymTokenFilterFactory" emitBoth="false" allowTypes="SYNONYM"
-prefix="acr::" setType="ACRONYM"/>

<!-- add a prefix to all synonyms -->
<filter class="solr.analysis.ResetFilterFactory"
@@ -575,6 +571,11 @@
words="ads_text.kill"
/>

+<!-- if the original or synonym contains UPPERCASE variant, mark it as
+an acronym but do not change its type, if it was a SYNONYM, it is important
+information for query parsing -->
+<filter class="solr.AcronymTokenFilterFactory" emitBoth="false" allowTypes="SYNONYM"
+prefix="acr::" setType="ACRONYM"/>


<!-- we emit ASCIIField version of the token (at the same position) -->
contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml
@@ -307,7 +307,7 @@
<lst name="defaults">
<str name="wt">json</str>
<int name="rows">10</int>
<str name="fl">recid author title abstract page pub</str>
<str name="fl">id recid author title abstract page pub</str>
<!--
ADSLABS:12/12: this can be overridden by URL params.
But unfielded search will not work if you use the "df" parameter!
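Parameters listed under <lst name="defaults"> apply only when the incoming request does not supply them itself. A sketch of the interaction, using hypothetical request URLs:

    /select?q=recid:100                 -> fl falls back to "id recid author title abstract page pub"
    /select?q=recid:100&fl=author_norm  -> the explicit fl wins and only author_norm is returned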
90 changes: 90 additions & 0 deletions scripts/parse_index.py
@@ -0,0 +1,90 @@
def parse(lines, docid):
    """Parse a SimpleText codec postings dump (an _N.pst file) and, for the
    document whose id is `docid`, reconstruct which term(s) were indexed at
    each position of the 'title' field."""
    out = [[] for x in range(1000)]  # assumes positions stay below 1000
    seqs = ['field', 'term', 'doc', 'freq', 'pos']  # expected order of keys
    ix = 0  # index into seqs: which key we expect next
    i = 0
    term = None
    maxpos = 0

    while i < len(lines):
        l = lines[i].strip()
        if l == '':
            i += 1
            continue
        if ix > 3:
            print('processing', l, term)
        parts = l.split(' ', 1)
        if len(parts) <= 1:
            i += 1
            continue
        key, value = parts[0], parts[1]
        if key == seqs[ix]:
            if key == 'field':
                # only the 'title' field is of interest
                ix = 1 if value == 'title' else 0
            elif key == 'term':
                term = value
                ix += 1
            elif key == 'doc':
                if value != docid:
                    # not our document: skip its freq/pos lines, then expect
                    # whatever comes next (another doc of the same term, the
                    # next term, or a new field)
                    i += 1
                    while i < len(lines) and lines[i].strip().split(' ', 1)[0] not in ('doc', 'term', 'field'):
                        i += 1
                    nxt = lines[i].strip().split(' ', 1)[0] if i < len(lines) else ''
                    ix = {'doc': 2, 'field': 0}.get(nxt, 1)
                    continue
                ix += 1
            elif key == 'freq':
                ix += 1  # the frequency value itself is not needed
            elif key == 'pos':
                # record this position and any immediately following 'pos' lines
                j = i
                while True:
                    pos = int(lines[j].strip().split(' ', 1)[1])
                    maxpos = max(maxpos, pos)
                    out[pos].append(term)
                    print('adding', term, 'position', pos)
                    if j + 1 < len(lines) and lines[j + 1].strip().split(' ', 1)[0] == 'pos':
                        j += 1
                    else:
                        break
                i = j
                ix = 1  # expect the next 'term'
        i += 1

    return out[0:maxpos + 1]


# Usage:
#  make your test write the index using the SimpleText codec:
#    replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory");
#  then find the location of the _N.pst file and process it:
#    lines = open('/tmp/solr.analysis.TestAdsabsTypeFulltextParsing_3E990BE8BF267D4E-001/init-core-data-001/index/_0.pst', 'r').read().split('\n')
#    list(enumerate(parse(lines, '41')))
"""
[(0, []),
(1, ['acr::hubble']),
(2, ['constant']),
(3, ['acr::summary', 'summary']),
(4, []),
(5, ['acr::program', 'program']),
(6, ['acr::luminosity', 'luminosity']),
(7, ['acr::calibration', 'calibration']),
(8, ['acr::type', 'type']),
(9, ['ia']),
(10, ['acr::supernovae', 'supernovae']),
(11, ['acr::by', 'by']),
(12, ['acr::means', 'means']),
(13, ['acr::cepheids', 'cepheids']),
(14, []),
(15, []),
(16, []),
(17, []),
(18, []),
(19, [])]
"""
