Skip to content

Commit

Permalink
Added configuration to remove certain fields from the logic we apply …
Browse files Browse the repository at this point in the history
…to synonyms

The logic is beneficial only for fields with index-time synonym expansion; for affiliations we only have query-time synonyms
  • Loading branch information
romanchyla committed Apr 6, 2021
1 parent 4024418 commit 1bf928f
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 10 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.apache.lucene.queryparser.flexible.aqp.builders;

import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAnalyzerQueryNodeProcessor;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpPostAnalysisProcessor;

Expand All @@ -23,13 +24,13 @@

import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.builders.QueryTreeBuilder;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.SlopQueryNode;
import org.apache.lucene.queryparser.flexible.standard.builders.StandardQueryBuilder;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PhraseQuery.Builder;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.lucene.search.Query;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

Expand All @@ -29,11 +28,9 @@
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpRequestParams;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsScoringQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpFunctionQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpOrQueryNode;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.core.builders.QueryBuilder;
import org.apache.lucene.queryparser.flexible.core.builders.QueryTreeBuilder;
import org.apache.lucene.queryparser.flexible.core.config.FieldConfig;
import org.apache.lucene.queryparser.flexible.core.messages.QueryParserMessages;
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
Expand All @@ -44,10 +41,11 @@
import org.apache.lucene.queryparser.flexible.standard.nodes.WildcardQueryNode;
import org.apache.lucene.queryparser.flexible.standard.processors.MultiTermRewriteMethodProcessor;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.AdsConfigHandler;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.RefCounted;

Expand All @@ -56,6 +54,7 @@ public class AqpChangeRewriteMethodProcessor extends
boolean first = true;
private Set<String> types = null;
private Set<String> fields = null;
private Set<String> ignoredFields = null;

protected QueryNode preProcessNode(QueryNode node) throws QueryNodeException {

Expand Down Expand Up @@ -111,6 +110,37 @@ else if (node instanceof MultiPhraseQueryNode) {
}
}

if (getConfigVal("aqp.multiphrase.keep_one.ignore.fields", null) != null) {

if (ignoredFields == null) {
ignoredFields = new HashSet<String>();
for (String s: getConfigVal("aqp.multiphrase.keep_one.ignore.fields").split(",")) {
ignoredFields.add(s);
}
}

if (ignoredFields.contains((String)((MultiPhraseQueryNode) node).getField())) {

// for ignored fields, we don't want to do proximity search
for (QueryNode child: node.getChildren()) {
child.setTag(AqpAnalyzerQueryNodeProcessor.MAX_MULTI_TOKEN_SIZE, 0);
}

return node;
}
}


AqpRequestParams reqAttr = this.getQueryConfigHandler().get(AqpAdsabsQueryConfigHandler.ConfigurationKeys.SOLR_REQUEST);
if (reqAttr != null) {
IndexSchema schema = reqAttr.getRequest().getSchema();
FieldType fType = schema.getFieldType((String)((MultiPhraseQueryNode) node).getField());
if (fType != null) {
node.setTag("field.is.tokenized", fType.isTokenized());
}
}


try {
node = simplifyMultiphrase(node, types);
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.io.IOException;

import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
Expand Down Expand Up @@ -78,9 +79,11 @@ public static String getSchemaFile() {
+ "ror.1;foo;bar\n"
+ "A00001;Aalborg U;Aalborg University;RID1004;04m5j1k67;000000010742471X;Q601956;grid.5117.2;\n\n"
+ "A00002;Aarhus U;Aarhus University;RID1006;01aj84f44;0000000119562722;Q924265;grid.7048.b;\n"
+ "A01400;SI/CfA;Center for Astrophysics | Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n"
//+ "A01400;SI/CfA;Center for Astrophysics | Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n"
+ "AX;SI\n"
+ "AB=>CfA\n"
+ "A01400;CfA;SI/CfA;Harvard U/CfA;Center for Astrophysics Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n"
+ "A01397;SI;Smithsonian Institution;RID8264;01pp8nd67;0000000087163312;Q131626;grid.1214.6"
});

replaceInFile(newConfig, "synonyms=\"aff_id.synonyms\"",
Expand Down Expand Up @@ -258,9 +261,12 @@ public void test() throws Exception {

// what is the meaning of the pipe? (|) -- it forces our parser to treat the query
// as a regex; to not do that we have to set aqp.regex.disallowed.fields
assertQ(req("q", "institution:\"Center for Astrophysics | Harvard and Smithsonian\"",
"aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']");
//assertQ(req("q", "institution:\"Center for Astrophysics | Harvard and Smithsonian\"",
// "aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']");

assertQ(req("q", "institution:\"Center for Astrophysics Harvard and Smithsonian\"",
"aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']");

// and we also want to find the records via parent/child relationship BUT using
// synonyms; so assume that parent (SI) is also known under synonym 'AX' and
// CfA is known under synonym 'AB'; the search "AX/AB" should then find the same
Expand All @@ -282,6 +288,28 @@ public void test() throws Exception {
// SI/CfA;A01400;CfA
assertQ(req("q", "institution:\"AX/AB\""), "//*[@numFound='2']");

// this tests behaviour with ADS's extended configuration for multi-token synonym handling:
// first what happens by default, then what happens with the configuration that disables
// such treatment for specific fields
assertQueryEquals(req("q", "institution:\"SI/CfA\"",
"aqp.multiphrase.keep_one", "SYNONYM"
),
"institution:\"si cfa\"~6",
MultiPhraseQuery.class
);

assertQueryEquals(req("q", "institution:\"SI/CfA\"",
"aqp.multiphrase.keep_one", "SYNONYM",
"aqp.multiphrase.keep_one.ignore.fields", "aff_id,aff_raw,institution"),
"institution:\"(ax si a01397 smithsonian institution rid8264 01pp8nd67 0000000087163312 q131626 grid.1214.6) (a01400 cfa si/cfa harvard u/cfa center for astrophysics harvard and smithsonian harvard smithsonian center for astrophysics rid61814 03c3r2d17 q1133697 grid.455754.2)\"",
MultiPhraseQuery.class
);
// and check we still retrieve the same docs
assertQ(req("q", "institution:\"SI/CfA\"",
"aqp.multiphrase.keep_one", "SYNONYM",
"aqp.multiphrase.keep_one.ignore.fields", "aff_id,aff_raw,institution"),
"//*[@numFound='2']");

}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,7 @@
types="wdafftypes.txt" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.TrimFilterFactory" />
<!-- <filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="aff_tokens"/> -->
</analyzer>
<analyzer type="query">
<charFilter class="solr.PatternReplaceCharFilterFactory"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@
<str name="aqp.classic_scoring.modifier">0.5</str>
<str name="aqp.constant_scoring">first_author^14 author^13 year^10 bibstem^10</str>
<str name="aqp.multiphrase.keep_one">SYNONYM</str>
<str name="aqp.multiphrase.keep_one.ignore.fields">aff_raw,aff_id,institution</str>

<str name="q.op">AND</str>

Expand Down Expand Up @@ -370,6 +371,7 @@
<str name="aqp.classic_scoring.modifier">0.5</str>
<str name="aqp.constant_scoring">first_author^14 author^13 year^10 bibstem^10</str>
<str name="aqp.multiphrase.keep_one">SYNONYM</str>
<str name="aqp.multiphrase.keep_one.ignore.fields">aff_raw,aff_id,institution</str>

<str name="q.op">AND</str>
<str name="df">unfielded_search</str>
Expand Down

0 comments on commit 1bf928f

Please sign in to comment.