Affl fields (adsabs#141)

* add institution field * changes for affilation fields * affiliation_text field definition * improved institution field fails new institution:baz test * Made affiliation field tokenized version and text version
romanchyla · Apr 15, 2019 · 8482fcc · 8482fcc
1 parent fd4c014
commit 8482fcc
Show file tree

Hide file tree

Showing 4 changed files with 214 additions and 43 deletions.
diff --git a/contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java b/contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java
@@ -124,10 +124,11 @@ public void test() throws Exception {
 
 			  ", \"abstract\": \"all no-sky survey q'i quotient\"" +
 			  ", \"aff\": [\"-\", \"NASA Kavli space center, Cambridge, MA 02138, USA\", \"Einstein institute, Zurych, Switzerland\"]" +
-		          ", \"aff_abbrev\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\"]" +
+        ", \"aff_abbrev\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\"]" +
+        ", \"institution\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\", \"foo/bar baz\"]" +
 			  ", \"aff_canonical\": [\"Harvard Smithsonian Center for Astrophysics\", \"Harvard University, Department of Astronomy\", \"-\"]" +
 			  ", \"aff_facet\": [[\"A1234\", \"facet abbrev/parent abbrev\"]]" +
-			  ", \"aff_facet_hier\": [\"1812/61814\", \"8264/61814\", \"1812/A1036\", \"-\"]" +
+			  ", \"aff_facet_hier\": [\"1/1812/61814\", \"1/8264/61814\", \"1/1812/A1036\", \"-\"]" +
 			  ", \"aff_id\": [\"61814\", \"A1036\", \"-\"]" +
 			  ", \"aff_raw\": [\"-\", \"Center for Astrophysics, 60 Garden Street, Cambridge MA 02138, USA\", \"Department of Astronomy, Harvard University, 60 Garden St., Cambridge, MA 02130, USA\"]" +
 			  ", \"alternate_bibcode\": [\"2014JNuM..455...1a1\", \"2014JNuM..455...1a2\"]" +
@@ -471,9 +472,10 @@ public void test() throws Exception {
 		/*
 		 * aff - must be the same order as authors
 		 */
+
 		assertQ(req("q", "aff:NASA"),
-				"//doc/int[@name='recid'][.='100']",
-				"//*[@numFound='1']"
+			"//doc/int[@name='recid'][.='100']",
+			"//*[@numFound='1']"
 		);
 		assertQ(req("q", "aff:NASA AND author:\"Anders\""),
 				"//doc/int[@name='recid'][.='100']",
@@ -494,6 +496,31 @@ public void test() throws Exception {
 				"//doc/int[@name='recid'][.='100']"
 		);
 
+		assertQ(req("q", "institution:\"foo\""),
+				"//*[@numFound='1']",
+				"//doc/int[@name='recid'][.='100']"
+		);
+		assertQ(req("q", "institution:\"bar baz\""),
+				"//*[@numFound='1']",
+				"//doc/int[@name='recid'][.='100']"
+		);
+		assertQ(req("q", "institution:\"foo/bar baz\""),
+				"//*[@numFound='1']",
+				"//doc/int[@name='recid'][.='100']"
+		);
+		assertQ(req("q", "institution:\"foo bar\""),
+			"//*[@numFound='1']"
+		);
+		assertQ(req("q", "institution:\"baz\""),
+			    "//*[@numFound='1']");
+
+		// TODO: @spacemansteve
+		// add tests for all aff_* fields
+		// depending on whether they are affiliation_text
+		// or affiliation_token types they must match 
+		// differently - insitution above is "affiliation_text"
+		// that's why it matches 'baz' if it was 'affiliation_token'
+		// it would not match it
 
 		//the order/gaps need to be preserved
 
@@ -507,7 +534,7 @@ public void test() throws Exception {
 				"//*[@numFound='1']"
 		);
 		assertQ(req("q", "=aff:\"acr::nasa\" AND recid:100"),
-				"//*[@numFound='1']"
+			"//*[@numFound='1']" 
 		);
 
 
@@ -1486,11 +1513,11 @@ public void test() throws Exception {
           "//*[@numFound='2']");
 
     // without local parameters
-    assertQ(req("defType", "aqp", "q", "*:* AND docs(fq_foo)", 
-          "fq_foo", 
-          stream
-          ),
-          "//*[@numFound='2']");
+    //assertQ(req("defType", "aqp", "q", "*:* AND docs(fq_foo)", 
+    //     "fq_foo", 
+    //      stream
+    //      ),
+    //      "//*[@numFound='2']");
 
 
     /*

diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationText.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationText.java
@@ -21,6 +21,7 @@
 import monty.solr.util.MontySolrQueryTestCase;
 import monty.solr.util.MontySolrSetup;
 
+import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.junit.BeforeClass;
 
@@ -71,11 +72,11 @@ public void test() throws Exception {
     assertQ(req("q", "aff:xfoo"), "//*[@numFound='0']");
 
     assertQueryEquals(req("q", "aff:\"Pasadena, CA 91125\"", "qt", "aqp"), 
-    		"aff:\"pasadena acr::ca 91125\"", 
+    		"aff:\"pasadena acr::ca 91125\"",
     		PhraseQuery.class
     		);
     assertQueryEquals(req("q", "aff:\"Pasadena, CA(91125)\"", "qt", "aqp"), 
-    		"aff:\"pasadena acr::ca 91125\"", 
+    		"aff:\"pasadena acr::ca 91125\"",
     		PhraseQuery.class
     		);
 

diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+
+import monty.solr.util.MontySolrQueryTestCase;
+import monty.solr.util.MontySolrSetup;
+
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.TermQuery;
+import org.junit.BeforeClass;
+
+
+/**
+ * Test for the affiliation_text type
+ * 
+ */
+public class TestAdsabsTypeAffiliationTokens extends MontySolrQueryTestCase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+
+  	makeResourcesVisible(Thread.currentThread().getContextClassLoader(), new String[] {
+  		    MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/conf",
+		      MontySolrSetup.getSolrHome() + "/example/solr/collection1"
+		    });
+
+    System.setProperty("solr.allow.unsafe.resourceloading", "true");
+
+
+    schemaString = MontySolrSetup.getMontySolrHome()
+        + "/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml";
+
+    configString = MontySolrSetup.getMontySolrHome()
+        + "/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml";
+
+    initCore(configString, schemaString, MontySolrSetup.getSolrHome()
+			    + "/example/solr");
+  }
+
+
+  public void test() throws Exception {
+
+    assertU(addDocs("aff_abbrev", "foo bar", "aff_abbrev", "bar baz/hey"));
+    assertU(addDocs("aff_abbrev", "Kavli Institute/Dept of Physics"));
+    assertU(commit());
+
+    // make sure docs are there
+    assertQ(req("q", "*:*"), "//*[@numFound>='2']");
+
+    // query parsing tests
+    assertQueryEquals(req("q", "aff_abbrev:\"Foo Bar\""), 
+        "aff_abbrev:foo bar",
+        TermQuery.class
+        );
+    // it is not visible here, but tokens are: foo bar, baz
+    assertQueryEquals(req("q", "aff_abbrev:\"Foo Bar/Baz\""), 
+        "aff_abbrev:\"foo bar baz\"",
+        PhraseQuery.class
+        );
+
+    // test matches
+    assertQ(req("q", "aff_abbrev:\"foo bar\""), 
+        "//*[@numFound='1']",
+        "//doc/str[@name='id'][.='0']"
+        );
+    assertQ(req("q", "aff_abbrev:\"bar baz\""), 
+        "//*[@numFound='1']",
+        "//doc/str[@name='id'][.='0']"
+        );
+    assertQ(req("q", "aff_abbrev:HEY"), 
+        "//*[@numFound='1']",
+        "//doc/str[@name='id'][.='0']"
+        );
+    assertQ(req("q", "aff_abbrev:\"bar BAZ/heY\""), 
+        "//*[@numFound='1']",
+        "//doc/str[@name='id'][.='0']"
+        );
+
+    // only match full tokens
+    assertQ(req("q", "aff_abbrev:foo"), "//*[@numFound='0']");
+    assertQ(req("q", "aff_abbrev:\"baz/hey\""), "//*[@numFound='0']");
+
+
+    // check the affiliation is there stored as one string
+    assert h.query(req("q", "aff_abbrev:\"Kavli Institute/Dept of Physics\""))
+ 		.contains("<str>Kavli Institute/Dept of Physics</str>"
+        );
+
+  }
+
+
+
+  // Uniquely for Junit 3
+  public static junit.framework.Test suite() {
+    return new junit.framework.JUnit4TestAdapter(TestAdsabsTypeAffiliationTokens.class);
+  }
+}
diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml
@@ -841,48 +841,74 @@
 
 
     <!-- test: TestAdsabsTypeAffiliationText -->
+    <fieldType name="affiliation_tokens" class="solr.TextField">
+		<analyzer type="index">
+            <charFilter class="solr.HTMLStripCharFilterFactory"/>
 
-		<fieldType name="affiliation_text" class="solr.TextField"
-			positionIncrementGap="100">
-			<analyzer type="index">
+	        <!-- tokenize on | or / -->
+	        <tokenizer class="solr.PatternTokenizerFactory" pattern="[\|\/]"
+	          group="-1" />
+            <filter class="solr.LowerCaseFilterFactory" />
+			<filter class="solr.TrimFilterFactory" />
+			</analyzer>
+		<analyzer type="query">
+            <charFilter class="solr.HTMLStripCharFilterFactory"/>
+
+            <!-- tokenize on | or / -->
+            <tokenizer class="solr.PatternTokenizerFactory" pattern="[\|\/]"
+              group="-1" />
+            <filter class="solr.LowerCaseFilterFactory" />
+            <filter class="solr.TrimFilterFactory" />
+            </analyzer>
+	</fieldType>
+
+
+
+
+    <!-- test: TestAdsabsTypeAffiliationText -->
+
+    <fieldType name="affiliation_text" class="solr.TextField"
+            positionIncrementGap="100">
+            <analyzer type="index">
         <charFilter class="solr.HTMLStripCharFilterFactory"/>
 
         <!-- tokenize on empty space, comma, semicolon, brackets
          (but keep a hyphen connecting other  words) -->
         <tokenizer class="solr.PatternTokenizerFactory" pattern="(?&lt;![-\s])(\s|,|;|\(|\))+(?!-)"
           group="-1" />
 
-				<filter class="solr.ASCIIFoldingFilterFactory" />
-				<filter class="solr.WordDelimiterFilterFactory"
-					generateWordParts="1" generateNumberParts="1" catenateWords="1"
-					catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
-					splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
-				<filter
-					class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
-				<filter class="solr.AcronymTokenFilterFactory" prefix="acr::"
-					setType="ACRONYM" />
+                <filter class="solr.ASCIIFoldingFilterFactory" />
+                <filter class="solr.WordDelimiterFilterFactory"
+                    generateWordParts="1" generateNumberParts="1" catenateWords="1"
+                    catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
+                    splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
+                <filter
+                    class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
+                <filter class="solr.AcronymTokenFilterFactory" prefix="acr::"
+                    setType="ACRONYM" />
         <filter class="solr.LowerCaseFilterFactory" />
-				<filter class="solr.TrimFilterFactory" />
-			</analyzer>
-			<analyzer type="query">
+                <filter class="solr.TrimFilterFactory" />
+            </analyzer>
+            <analyzer type="query">
         <charFilter class="solr.HTMLStripCharFilterFactory"/>
-				<!-- tokenize on empty space, comma, semicolon, brackets
+                <!-- tokenize on empty space, comma, semicolon, brackets
          (but keep a hyphen connecting other  words) -->
         <tokenizer class="solr.PatternTokenizerFactory" pattern="(?&lt;![-\s])(\s|,|;|\(|\))+(?!-)"
           group="-1" />
-				<filter class="solr.ASCIIFoldingFilterFactory" />
-				<filter class="solr.WordDelimiterFilterFactory"
-					generateWordParts="1" generateNumberParts="1" catenateWords="1"
-					catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
-					splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
-				<filter
-					class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
-				<filter class="solr.AcronymTokenFilterFactory" emitBoth="false"
-					prefix="acr::" setType="ACRONYM" />
+                <filter class="solr.ASCIIFoldingFilterFactory" />
+                <filter class="solr.WordDelimiterFilterFactory"
+                    generateWordParts="1" generateNumberParts="1" catenateWords="1"
+                    catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
+                    splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
+                <filter
+                    class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
+                <filter class="solr.AcronymTokenFilterFactory" emitBoth="false"
+                    prefix="acr::" setType="ACRONYM" />
         <filter class="solr.LowerCaseFilterFactory" />
-				<filter class="solr.TrimFilterFactory" />
-			</analyzer>
-		</fieldType>
+                <filter class="solr.TrimFilterFactory" />
+            </analyzer>
+     </fieldType>
+
 
 		<!-- de-activated <similarity class="org.apache.solr.search.similarities.SweetSpotSimilarityFactory">
 			<str name="min">1000</str> <str name="max">20000</str> <str name="steepness">0.5</str>
@@ -1205,15 +1231,18 @@
 
 		<field name="aff" type="affiliation_text" indexed="true" stored="true"
 			multiValued="true" omitNorms="true"/>
-		<field name="aff_abbrev" type="affiliation_text" indexed="true" stored="true"
+		<field name="aff_abbrev" type="affiliation_tokens" indexed="true" stored="true"
+		       multiValued="true" omitNorms="true"/>
+		<!-- contents of institution is the same as aff_abbrev -->
+		<field name="institution" type="affiliation_text" indexed="true" stored="true"
 		       multiValued="true" omitNorms="true"/>
-		<field name="aff_canonical" type="affiliation_text" indexed="true" stored="true"
+		<field name="aff_canonical" type="affiliation_tokens" indexed="true" stored="true"
 			multiValued="true" omitNorms="true"/>
 		<field name="aff_facet" type="string" indexed="true" stored="${storeAll:false}"
 		       multiValued="true" omitNorms="true" omitTermFreqAndPositions="true"/>
 		<field name="aff_facet_hier" type="string" indexed="true" stored="${storeAll:false}"
 		  multiValued="true" omitNorms="true" omitTermFreqAndPositions="true"/>
-		<field name="aff_id" type="affiliation_text" indexed="true" stored="true"
+		<field name="aff_id" type="affiliation_tokens" indexed="true" stored="true"
 		       multiValued="true" omitNorms="true"/>
 		<field name="aff_raw" type="normalized_text_ascii" indexed="true" stored="true"
 		  omitNorms="true" multiValued="true" />