Fix for affiliations and synonym searches

romanchyla · Mar 8, 2021 · eeff1d7 · eeff1d7
1 parent 5a80435
commit eeff1d7
Show file tree

Hide file tree

Showing 7 changed files with 141 additions and 17 deletions.
diff --git a/...c/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java b/...c/java/org/apache/lucene/queryparser/flexible/aqp/builders/AqpAdsabsSubQueryProvider.java
@@ -46,6 +46,7 @@
 import org.apache.lucene.search.SecondOrderQuery;
 import org.apache.lucene.search.SimpleCollector;
 import org.apache.lucene.search.SolrCacheWrapper;
+import org.apache.lucene.search.SynonymQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.TopFieldCollector;
@@ -383,7 +384,8 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
 				int positionIncrementGap = 1;
 				if (fp.getReq() != null) {
 					IndexSchema schema = fp.getReq().getSchema();
-					SchemaField field = schema.getFieldOrNull(query.toString().split(":")[0]);
+					String f = getField(query);
+					SchemaField field = schema.getFieldOrNull(f);
 					if (field != null) {
 						FieldType fType = field.getType();
 						//if (!fType.isMultiValued()) {
@@ -423,6 +425,34 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
 				  query = new BoostQuery(query, boostFactor);
 				return query;
 			}
+
+      private String getField(Query query) throws SyntaxError {
+
+        if (query instanceof TermQuery) {
+          return ((TermQuery) query).getTerm().field();
+        }
+        else if (query instanceof SynonymQuery) {
+          for (Term t: ((SynonymQuery) query).getTerms()) {
+            return t.field();
+          }
+        }
+        else if (query instanceof BooleanQuery) {
+          HashSet<String> s = new HashSet<String>();
+          for (BooleanClause c: ((BooleanQuery) query).clauses()) {
+            s.add(getField(c.getQuery()));
+          }
+          if (s.size() > 1) {
+            throw new SyntaxError("Illegal state: combining multiple fields is not allowed");
+          }
+          return (String) s.toArray()[0];
+        }
+        else {
+          // last resort
+          return query.toString().split(":")[0];          
+        }
+
+        return null;
+      }
 		});
 
 		/* @api.doc

diff --git a/...ava/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsAnalyzerProcessor.java b/...ava/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsAnalyzerProcessor.java
@@ -8,7 +8,6 @@
 
 import org.apache.lucene.queryparser.flexible.aqp.builders.AqpFunctionQueryBuilder;
 import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
-import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpANTLRNode;
 import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpFunctionQueryNode;
 import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode;
 import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler;
@@ -19,6 +18,7 @@
 import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
 import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
 import org.apache.lucene.queryparser.flexible.messages.MessageImpl;
+import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAnalyzerQueryNodeProcessor;
 
 /**
  * This processor prevents analysis to happen for nodes that are 
@@ -118,29 +118,30 @@ protected QueryNode postProcessNode(QueryNode node)
     return rn;
 
   }
-
-  private Map<String, String> getConfigMap() {
+  protected static Map<String, String> empty = new HashMap<String, String>();
+  public Map<String, String> getConfigMap() {
     Map<String, String> args = getQueryConfigHandler().get(
         AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER);
     if (args == null)
-      return new HashMap<String, String>();
+      return empty;
     return args;
   }
 
-  private String getConfigVal(String key) {
+  public String getConfigVal(String key) {
     Map<String, String> args = getConfigMap();
     if (args.containsKey(key)) {
       return args.get(key);
     }
     return null;
   }
 
-  private String getConfigVal(String key, String defaultVal) {
+  public String getConfigVal(String key, String defaultVal) {
     Map<String, String> args = getConfigMap();
     if (args.containsKey(key)) {
       return args.get(key);
     }
     return defaultVal;
   }
 
+
 }
diff --git a/...va/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsRegexNodeProcessor.java b/...va/org/apache/lucene/queryparser/flexible/aqp/processors/AqpAdsabsRegexNodeProcessor.java
@@ -16,7 +16,7 @@
 import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsRegexQueryNode;
 import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode;
 
-public class AqpAdsabsRegexNodeProcessor extends QueryNodeProcessorImpl implements
+public class AqpAdsabsRegexNodeProcessor extends AqpQueryNodeProcessorImpl implements
 	QueryNodeProcessor  {
 
 	@Override
@@ -33,6 +33,10 @@ protected QueryNode preProcessNode(QueryNode node)
 			FieldQueryNode n = (FieldQueryNode) node;
 			String input = n.getTextAsString();
 
+			String field = n.getFieldAsString();
+			if (!isRegexField(field))
+			  return node;
+
 			if (input == null) {
 				return node;
 			}
@@ -62,7 +66,19 @@ protected QueryNode preProcessNode(QueryNode node)
 		return node;
 	}
 
-	@Override
+	// can be overriden to plug in some configuration
+	protected boolean isRegexField(String field) {
+    String val = getConfigVal("aqp.regex.disallowed.fields", null);
+    if (val == null)
+      return true;
+    for (String f: val.split(";")) {
+      if (f.equals(field))
+        return false;
+    }
+    return true;
+  }
+
+  @Override
 	protected List<QueryNode> setChildrenOrder(List<QueryNode> children)
 			throws QueryNodeException {
 		return children;

diff --git a/...processors/AqpFuzzyModifierProcessor.java → ...processors/AqpFuzzyModifierProcessor.java b/...processors/AqpFuzzyModifierProcessor.java → ...processors/AqpFuzzyModifierProcessor.java
diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeAffiliationTokens.java
@@ -78,6 +78,9 @@ public static String getSchemaFile() {
 			        + "ror.1;foo;bar\n"
 			    		+ "A00001;Aalborg U;Aalborg University;RID1004;04m5j1k67;000000010742471X;Q601956;grid.5117.2;\n\n"
 			    		+ "A00002;Aarhus U;Aarhus University;RID1006;01aj84f44;0000000119562722;Q924265;grid.7048.b;\n"
+			    		+ "A01400;SI/CfA;Center for Astrophysics | Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n"
+			    		+ "AX;SI\n"
+			    		+ "AB=>CfA\n"
 			    		});
 
 			replaceInFile(newConfig, "synonyms=\"aff_id.synonyms\"",
@@ -99,6 +102,16 @@ public void test() throws Exception {
         "institution", "U Catania/Dep Phy Ast; -; -; INFN/Catania",
         "institution", "U Catania/Dep Phy Ast; -"
         ));
+    assertU(addDocs(
+        "institution", "SI/CfA; Harvard U/CfA", 
+        "institution", "Harvard U/Phys; Brown U/Ast",
+        "aff", "SI/CfA")
+        );
+    assertU(addDocs(
+        "institution", "Harvard U/Law; -",
+        "institution", "SI/CfA; Harvard U/CfA", 
+        "aff", "SI/CfA")
+        );
     assertU(commit());
 
     assertQueryEquals(req("q", "aff_id:https\\://ror.org/ror.1"), 
@@ -193,14 +206,82 @@ public void test() throws Exception {
     assertQ(req("q", "pos(institution:\"U Catania/Dep Phy Ast\", 1)"), "//*[@numFound='1']");
     assertQ(req("q", "pos(institution:\"U Catania\", 1)"), "//*[@numFound='1']");
     assertQ(req("q", "pos(institution:\"Dep Phy Ast\", 1)"), "//*[@numFound='1']");
-    assertQ(req("q", "pos(institution:\"-\", 1)"), "//*[@numFound='1']");
+    assertQ(req("q", "pos(institution:\"-\", 1)"), "//*[@numFound='2']");
     assertQ(req("q", "pos(institution:\"INFN/Catania\", 1)"), "//*[@numFound='0']");
     assertQ(req("q", "pos(institution:\"INFN\", 1)"), "//*[@numFound='0']");
     assertQ(req("q", "pos(institution:\"Catania\", 1)"), "//*[@numFound='0']");
 
     assertQ(req("q", "pos(institution:\"INFN/Catania\", 2)"), "//*[@numFound='1']");
     assertQ(req("q", "pos(institution:\"INFN\", 2)"), "//*[@numFound='1']");
     assertQ(req("q", "pos(institution:\"Catania\", 2)"), "//*[@numFound='1']");
+
+    // search parts of the affiliation
+    assertQ(req("q", "institution:\"SI\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"CfA\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"Harvard U\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"Law\""), "//*[@numFound='1']");
+    assertQ(req("q", "institution:\"Phys\""), "//*[@numFound='1']");
+
+    // search parts (but honour position)
+    assertQ(req("q", "pos(institution:\"SI\", 1)"), "//*[@numFound='1']");
+    assertQ(req("q", "pos(institution:\"CfA\", 1)"), "//*[@numFound='1']");
+    assertQ(req("q", "pos(institution:\"Harvard U\", 2)"), "//*[@numFound='2']");
+    assertQ(req("q", "pos(institution:\"CfA\", 2)"), "//*[@numFound='1']");
+
+    // search parent/child
+    assertQ(req("q", "institution:\"SI/CfA\""), "//*[@numFound='2']");
+
+    // do the same but as phrase; it should fail because the parser WILL NOT
+    // treat empty space as a delimiter; it considers it part of the token
+    // like 'Harvard U'
+    assertQ(req("q", "institution:\"SI CfA\""), "//*[@numFound='0']");
+
+    // proximity operator however should yield the record
+    assertQ(req("q", "institution:\"SI\" NEAR1 institution:\"CfA\""), "//*[@numFound='2']");
+
+    // but not mix up insitutions that were separated by ';' (those affiliations
+    // belong to different authors/persons)
+    assertQ(req("q", "institution:\"Law\" NEAR5 institution:\"CfA\""), "//*[@numFound='0']");
+
+    // one person however can have multiple affiliations; and they can be searched via proximity
+    assertQ(req("q", "institution:\"Phys\" NEAR5 institution:\"Ast\""), "//*[@numFound='1']");
+
+    // SI/CfA is also known through identifiers/canonical names - NOTE: the input synonyms
+    // MUST CONTAIN correct entry, i.e. "SI/CfA" - for some reason we used to have "SI CfA"
+    assertQ(req("q", "institution:\"A01400\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"RID61814\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"Harvard Smithsonian Center for Astrophysics\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"03c3r2d17\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"Q1133697\""), "//*[@numFound='2']");
+    assertQ(req("q", "institution:\"grid.455754.2\""), "//*[@numFound='2']");
+
+
+    // what is the meaning of the pipe? (|) -- it forces our parser to treat the query
+    // as a regex; to not do that we have to set aqp.regex.disallowed.fields
+    assertQ(req("q", "institution:\"Center for Astrophysics | Harvard and Smithsonian\"",
+        "aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']");
+
+    // and we also want to find the records via parent/child relationship BUT using
+    // synonyms; so assume that parent (SI) is also known under synonym 'AX' and 
+    // CfA is known under synonym 'AB'; the search "AX/AB" should then find the same
+    // thing as "SI/CfA" -- HOWEVER, note, this feature requires synonym mapping
+    // either of the explicit form:
+    // AX => SI
+    // or more forgiving (and more wasteful):
+    // AX;SI
+    // IMHO this feature is confusing; will bloat the synonym file and users
+    // are going to be confused by it. I'd just say: use canonical forms of 
+    // the synonym. And, and... be aware that if the synonym file contains a cycle
+    // i.e. 'parent/child' entry sharing synonyms with 'child' entry; then the
+    // two will be merged and considered as one:
+    // 
+    // SI/CfA;A01400
+    // CfA;A014000
+    // 
+    // becomes:
+    // SI/CfA;A01400;CfA
+    assertQ(req("q", "institution:\"AX/AB\""), "//*[@numFound='2']");
+
   }
 
 

diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml
@@ -892,6 +892,7 @@
 					types="wdafftypes.txt" />
 				<filter class="solr.LowerCaseFilterFactory" />
 				<filter class="solr.TrimFilterFactory" />
+				<filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="aff_token"/>
 			</analyzer>
 			<analyzer type="query">
 				<charFilter class="solr.PatternReplaceCharFilterFactory" 
@@ -974,6 +975,7 @@
 					setType="ACRONYM" />
 				<filter class="solr.LowerCaseFilterFactory" />
 				<filter class="solr.TrimFilterFactory" />
+				<filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="aff_text"/>
 			</analyzer>
 			<analyzer type="query">
 				<charFilter class="solr.HTMLStripCharFilterFactory" />
@@ -1292,9 +1294,6 @@
                docValues="true"/>
         <field name="aff_id" type="affiliation_tokens" indexed="true" stored="true"
                multiValued="true" omitNorms="true"/>
-        <!--  TODO: remove once index has been rebuilt -->
-        <field name="aff_raw" type="affiliation_text" indexed="true" stored="true"
-               omitNorms="true" multiValued="true" />
 
         <!-- for Unified Astronomy Thesaurus -->
         <field name="uat" type="uat_tokens" indexed="true" stored="true"

diff --git a/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml b/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml
@@ -39,10 +39,7 @@
     <ramBufferSizeMB>${solr.ramBufferSize:1000}</ramBufferSizeMB>
     <maxBufferedDocs>${solr.maxBufferedDocs:50000}</maxBufferedDocs>
 
-    <mergePolicyFactory class="solr.TieredMergePolicyFactory">
-        <int name="maxMergeAtOnce">${solr.mergePolicy.maxMergeAtOnce:10}</int>
-        <int name="segmentsPerTier">${solr.mergePolicy.segmentsPerTier:10}</int>
-    </mergePolicyFactory>
+
 
     <!-- When we run several instances sharing the same index, we must
          make sure that only one writer is modifying it; and other