Commit 1363a91

Fixed FU guy's feedback

romanchyla committed May 21, 2020
1 parent 8d0021b commit 1363a91

Showing 4 changed files with 113 additions and 22 deletions.
18 changes: 9 additions & 9 deletions contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java
@@ -449,7 +449,7 @@ public void test() throws Exception {
"//doc/int[@name='recid'][.='100']",
"//doc/float[@name='score'][.='26.0']");

assert h.query(req("q", "author:\"Einstein, A\""))
assert h.query(req("q", "author:\"Einstein, A\"", "fl", "author_norm"))
.contains("<arr name=\"author_norm\">" +
"<str>t' Hooft, van X</str>" +
"<str>Anders, John Michael</str>" +
@@ -570,7 +570,7 @@ public void test() throws Exception {
"//*[@numFound='1']");


assert h.query(req("q", "recid:100"))
assert h.query(req("q", "recid:100", "fl", "aff"))
.contains("<arr name=\"aff\">" +
"<str>-</str>" +
"<str>NASA Kavli space center, Cambridge, MA 02138, USA</str>" +
@@ -646,7 +646,7 @@ public void test() throws Exception {


// order/gaps are important
assert h.query(req("q", "recid:100"))
assert h.query(req("q", "recid:100", "fl", "email"))
.contains("<arr name=\"email\">" +
"<str>-</str>" +
"<str>anders@email.com</str>" +
@@ -667,7 +667,7 @@ public void test() throws Exception {
"//doc/int[@name='recid'][.='100']",
"//*[@numFound='1']"
);
assert h.query(req("q", "recid:100"))
assert h.query(req("q", "recid:100", "fl", "orcid_pub"))
.contains("<arr name=\"orcid_pub\">" +
"<str>1111-2222-3333-4444</str>" +
"<str>-</str>" +
@@ -1293,7 +1293,7 @@ public void test() throws Exception {
* links_data (generated and stored as JSON for display purposes)
* ids_data (generated and stored as JSON for display purposes)
*/
assertQ(req("q", "id:100"),
assertQ(req("q", "id:100", "fl", "links_data"),
"//doc/arr[@name='links_data']/str[contains(text(),'MAST')]",
"//doc/arr[@name='links_data']/str[contains(text(),'{\"foo\": [\"bar\", \"baz\"], \"one\": {\"two\": \"three\"}}')]"
);
@@ -1469,7 +1469,7 @@ public void test() throws Exception {
assertQ(req("q", "page_range:23"),
"//*[@numFound='0']" // not searchable
);
assertQ(req("q", "page:55"),
assertQ(req("q", "page:55", "fl", "page_range"),
"//doc[1]/str[@name='page_range'][.='23-55s']"
);

@@ -1540,7 +1540,7 @@ public void test() throws Exception {
assertQ(req("q", "data:(nEd OR foo)"),
"//doc[1]/int[@name='recid'][.='100']"
);
assertQ(req("q", "data:\"NED:999\""), // numbers should be ignored in search, but stored
assertQ(req("q", "data:\"NED:999\"", "fl", "recid,data"), // numbers should be ignored in search, but stored
"//doc[1]/int[@name='recid'][.='100']",
"//doc[1]/arr[@name='data']/str[contains(text(),'NED:15')]"
);
@@ -1549,7 +1549,7 @@ public void test() throws Exception {
* esources
*
*/
assertQ(req("q", "esources:pub_pDF"),
assertQ(req("q", "esources:pub_pDF", "fl", "recid"),
"//doc[1]/int[@name='recid'][.='100']"
);

@@ -1626,7 +1626,7 @@ public void test() throws Exception {
/*
* similar() query
*/
assertQ(req("q", "similar(recid:100)"),
assertQ(req("q", "similar(recid:100)", "fl", "recid"),
"//doc[1]/int[@name='recid'][.='60']"
);

25 changes: 13 additions & 12 deletions contrib/examples/adsabs/server/solr/collection1/conf/schema.xml
@@ -403,7 +403,7 @@
unittest: TestAdsabsTypeFulltext -->
<!-- the tokenizing part needs more work, probably using synonyms to match
patterns? -->
<fieldType name="ads_text" class="solr.TextField">
<fieldType name="ads_text" class="solr.TextField" positionIncrementGap="0">
<analyzer type="index">
<charFilter class="solr.HTMLStripCharFilterFactory"/>

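positionIncrementGap controls how many positions are inserted between successive values of a multi-valued field; a gap of 0 lets phrase queries match across value boundaries. A worked example under that assumption, using a hypothetical multi-valued field "t" of type ads_text:

    doc: t = ["hubble constant", "summary"]
    gap=0:   positions are hubble=0, constant=1, summary=2, so t:"constant summary" matches
    gap=100: positions are hubble=0, constant=1, summary=102, so the phrase does not match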
@@ -445,6 +445,7 @@
class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />



<!-- find synonyms, first multi-tokens -->
<filter class="org.apache.lucene.analysis.synonym.NewSynonymFilterFactory"
synonyms="ads_text_multi.synonyms" ignoreCase="false" expand="true"
@@ -464,11 +465,6 @@
<filter class="solr.analysis.ResetFilterFactory"
incomingType="SYNONYM" addPrefix="syn::" posIncrement="0" />

-<!-- if the original or synonym contains UPPERCASE variant, mark it as
-an acronym but keep the type information (if it is a synonym, it will remain
-SYNONYM) which is important for the query parsing -->
-<filter class="solr.AcronymTokenFilterFactory" emitBoth="true"
-prefix="acr::" setType="ACRONYM"/>

<!-- remove stop words - first the case sensitively -->
<filter class="org.apache.lucene.analysis.core.AqpStopFilterFactory" ignoreCase="false"
@@ -478,6 +474,11 @@
<filter class="org.apache.lucene.analysis.core.AqpStopFilterFactory" ignoreCase="true"
words="ads_text.kill"/>

+<!-- if the original or synonym contains UPPERCASE variant, mark it as
+an acronym but keep the type information (if it is a synonym, it will remain
+SYNONYM) which is important for the query parsing -->
+<filter class="solr.AcronymTokenFilterFactory" emitBoth="true"
+prefix="acr::" setType="ACRONYM"/>


<!-- we emit ASCIIField version of the token (at the same position):
@@ -489,7 +490,7 @@
<filter class="solr.TrimFilterFactory" />
<filter class="solr.LowerCaseFilterFactory" />

<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> //-->
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="indexer"/> -->
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
@@ -551,11 +552,6 @@
inclOrig="true" />


-<!-- if the original or synonym contains UPPERCASE variant, mark it as
-an acronym but do not change its type, if it was a SYNONYM, it is important
-information for query parsing -->
-<filter class="solr.AcronymTokenFilterFactory" emitBoth="false" allowTypes="SYNONYM"
-prefix="acr::" setType="ACRONYM"/>

<!-- add a prefix to all synonyms -->
<filter class="solr.analysis.ResetFilterFactory"
@@ -575,6 +571,11 @@
words="ads_text.kill"
/>

+<!-- if the original or synonym contains UPPERCASE variant, mark it as
+an acronym but do not change its type, if it was a SYNONYM, it is important
+information for query parsing -->
+<filter class="solr.AcronymTokenFilterFactory" emitBoth="false" allowTypes="SYNONYM"
+prefix="acr::" setType="ACRONYM"/>


<!-- we emit ASCIIField version of the token (at the same position) -->
contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml
@@ -307,7 +307,7 @@
<lst name="defaults">
<str name="wt">json</str>
<int name="rows">10</int>
<str name="fl">recid author title abstract page pub</str>
<str name="fl">id recid author title abstract page pub</str>
<!--
ADSLABS:12/12: this can be overridden by URL params.
But unfielded search will not work if you use the "df" parameter!
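Parameters listed under <lst name="defaults"> apply only when the incoming request does not supply them itself. A sketch of the interaction, using hypothetical request URLs:

    /select?q=recid:100                 -> fl falls back to "id recid author title abstract page pub"
    /select?q=recid:100&fl=author_norm  -> the explicit fl wins and only author_norm is returned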
90 changes: 90 additions & 0 deletions scripts/parse_index.py
@@ -0,0 +1,90 @@
def parse(lines, docid):
    """Parse a SimpleText codec postings dump (an _N.pst file) and, for the
    document whose id is `docid`, reconstruct which term(s) were indexed at
    each position of the 'title' field."""
    out = [[] for x in range(1000)]  # assumes positions stay below 1000
    seqs = ['field', 'term', 'doc', 'freq', 'pos']  # expected order of keys
    ix = 0  # index into seqs: which key we expect next
    i = 0
    term = None
    maxpos = 0

    while i < len(lines):
        l = lines[i].strip()
        if l == '':
            i += 1
            continue
        if ix > 3:
            print('processing', l, term)
        parts = l.split(' ', 1)
        if len(parts) <= 1:
            i += 1
            continue
        key, value = parts[0], parts[1]
        if key == seqs[ix]:
            if key == 'field':
                # only the 'title' field is of interest
                ix = 1 if value == 'title' else 0
            elif key == 'term':
                term = value
                ix += 1
            elif key == 'doc':
                if value != docid:
                    # not our document: skip its freq/pos lines, then expect
                    # whatever comes next (another doc of the same term, the
                    # next term, or a new field)
                    i += 1
                    while i < len(lines) and lines[i].strip().split(' ', 1)[0] not in ('doc', 'term', 'field'):
                        i += 1
                    nxt = lines[i].strip().split(' ', 1)[0] if i < len(lines) else ''
                    ix = {'doc': 2, 'field': 0}.get(nxt, 1)
                    continue
                ix += 1
            elif key == 'freq':
                ix += 1  # the frequency value itself is not needed
            elif key == 'pos':
                # record this position and any immediately following 'pos' lines
                j = i
                while True:
                    pos = int(lines[j].strip().split(' ', 1)[1])
                    maxpos = max(maxpos, pos)
                    out[pos].append(term)
                    print('adding', term, 'position', pos)
                    if j + 1 < len(lines) and lines[j + 1].strip().split(' ', 1)[0] == 'pos':
                        j += 1
                    else:
                        break
                i = j
                ix = 1  # expect the next 'term'
        i += 1

    return out[0:maxpos + 1]


# Usage:
#  make your test write the index using the SimpleText codec:
#    replaceInFile(newConfig, "solr.SchemaCodecFactory", "solr.SimpleTextCodecFactory");
#  then find the location of the _N.pst file and process it:
#    lines = open('/tmp/solr.analysis.TestAdsabsTypeFulltextParsing_3E990BE8BF267D4E-001/init-core-data-001/index/_0.pst', 'r').read().split('\n')
#    list(enumerate(parse(lines, '41')))
"""
[(0, []),
(1, ['acr::hubble']),
(2, ['constant']),
(3, ['acr::summary', 'summary']),
(4, []),
(5, ['acr::program', 'program']),
(6, ['acr::luminosity', 'luminosity']),
(7, ['acr::calibration', 'calibration']),
(8, ['acr::type', 'type']),
(9, ['ia']),
(10, ['acr::supernovae', 'supernovae']),
(11, ['acr::by', 'by']),
(12, ['acr::means', 'means']),
(13, ['acr::cepheids', 'cepheids']),
(14, []),
(15, []),
(16, []),
(17, []),
(18, []),
(19, [])]
"""
