Skip to content

Commit

Permalink
Fix for affiliations and synonym searches
Browse files Browse the repository at this point in the history
  • Loading branch information
romanchyla committed Mar 8, 2021
1 parent 5a80435 commit eeff1d7
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.apache.lucene.search.SecondOrderQuery;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.SolrCacheWrapper;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
Expand Down Expand Up @@ -383,7 +384,8 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
int positionIncrementGap = 1;
if (fp.getReq() != null) {
IndexSchema schema = fp.getReq().getSchema();
SchemaField field = schema.getFieldOrNull(query.toString().split(":")[0]);
String f = getField(query);
SchemaField field = schema.getFieldOrNull(f);
if (field != null) {
FieldType fType = field.getType();
//if (!fType.isMultiValued()) {
Expand Down Expand Up @@ -423,6 +425,34 @@ public Query parse(FunctionQParser fp) throws SyntaxError {
query = new BoostQuery(query, boostFactor);
return query;
}

private String getField(Query query) throws SyntaxError {

if (query instanceof TermQuery) {
return ((TermQuery) query).getTerm().field();
}
else if (query instanceof SynonymQuery) {
for (Term t: ((SynonymQuery) query).getTerms()) {
return t.field();
}
}
else if (query instanceof BooleanQuery) {
HashSet<String> s = new HashSet<String>();
for (BooleanClause c: ((BooleanQuery) query).clauses()) {
s.add(getField(c.getQuery()));
}
if (s.size() > 1) {
throw new SyntaxError("Illegal state: combining multiple fields is not allowed");
}
return (String) s.toArray()[0];
}
else {
// last resort
return query.toString().split(":")[0];
}

return null;
}
});

/* @api.doc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import org.apache.lucene.queryparser.flexible.aqp.builders.AqpFunctionQueryBuilder;
import org.apache.lucene.queryparser.flexible.aqp.config.AqpAdsabsQueryConfigHandler;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpANTLRNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpFunctionQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.parser.AqpStandardQueryConfigHandler;
Expand All @@ -19,6 +18,7 @@
import org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
import org.apache.lucene.queryparser.flexible.messages.MessageImpl;
import org.apache.lucene.queryparser.flexible.aqp.processors.AqpAnalyzerQueryNodeProcessor;

/**
* This processor prevents analysis to happen for nodes that are
Expand Down Expand Up @@ -118,29 +118,30 @@ protected QueryNode postProcessNode(QueryNode node)
return rn;

}

private Map<String, String> getConfigMap() {
protected static Map<String, String> empty = new HashMap<String, String>();
public Map<String, String> getConfigMap() {
Map<String, String> args = getQueryConfigHandler().get(
AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER);
if (args == null)
return new HashMap<String, String>();
return empty;
return args;
}

private String getConfigVal(String key) {
public String getConfigVal(String key) {
Map<String, String> args = getConfigMap();
if (args.containsKey(key)) {
return args.get(key);
}
return null;
}

private String getConfigVal(String key, String defaultVal) {
public String getConfigVal(String key, String defaultVal) {
Map<String, String> args = getConfigMap();
if (args.containsKey(key)) {
return args.get(key);
}
return defaultVal;
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpAdsabsRegexQueryNode;
import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpNonAnalyzedQueryNode;

public class AqpAdsabsRegexNodeProcessor extends QueryNodeProcessorImpl implements
public class AqpAdsabsRegexNodeProcessor extends AqpQueryNodeProcessorImpl implements
QueryNodeProcessor {

@Override
Expand All @@ -33,6 +33,10 @@ protected QueryNode preProcessNode(QueryNode node)
FieldQueryNode n = (FieldQueryNode) node;
String input = n.getTextAsString();

String field = n.getFieldAsString();
if (!isRegexField(field))
return node;

if (input == null) {
return node;
}
Expand Down Expand Up @@ -62,7 +66,19 @@ protected QueryNode preProcessNode(QueryNode node)
return node;
}

@Override
// can be overriden to plug in some configuration
protected boolean isRegexField(String field) {
String val = getConfigVal("aqp.regex.disallowed.fields", null);
if (val == null)
return true;
for (String f: val.split(";")) {
if (f.equals(field))
return false;
}
return true;
}

@Override
protected List<QueryNode> setChildrenOrder(List<QueryNode> children)
throws QueryNodeException {
return children;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ public static String getSchemaFile() {
+ "ror.1;foo;bar\n"
+ "A00001;Aalborg U;Aalborg University;RID1004;04m5j1k67;000000010742471X;Q601956;grid.5117.2;\n\n"
+ "A00002;Aarhus U;Aarhus University;RID1006;01aj84f44;0000000119562722;Q924265;grid.7048.b;\n"
+ "A01400;SI/CfA;Center for Astrophysics | Harvard and Smithsonian;Harvard Smithsonian Center for Astrophysics;RID61814;03c3r2d17;Q1133697;grid.455754.2\n"
+ "AX;SI\n"
+ "AB=>CfA\n"
});

replaceInFile(newConfig, "synonyms=\"aff_id.synonyms\"",
Expand All @@ -99,6 +102,16 @@ public void test() throws Exception {
"institution", "U Catania/Dep Phy Ast; -; -; INFN/Catania",
"institution", "U Catania/Dep Phy Ast; -"
));
assertU(addDocs(
"institution", "SI/CfA; Harvard U/CfA",
"institution", "Harvard U/Phys; Brown U/Ast",
"aff", "SI/CfA")
);
assertU(addDocs(
"institution", "Harvard U/Law; -",
"institution", "SI/CfA; Harvard U/CfA",
"aff", "SI/CfA")
);
assertU(commit());

assertQueryEquals(req("q", "aff_id:https\\://ror.org/ror.1"),
Expand Down Expand Up @@ -193,14 +206,82 @@ public void test() throws Exception {
assertQ(req("q", "pos(institution:\"U Catania/Dep Phy Ast\", 1)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"U Catania\", 1)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"Dep Phy Ast\", 1)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"-\", 1)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"-\", 1)"), "//*[@numFound='2']");
assertQ(req("q", "pos(institution:\"INFN/Catania\", 1)"), "//*[@numFound='0']");
assertQ(req("q", "pos(institution:\"INFN\", 1)"), "//*[@numFound='0']");
assertQ(req("q", "pos(institution:\"Catania\", 1)"), "//*[@numFound='0']");

assertQ(req("q", "pos(institution:\"INFN/Catania\", 2)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"INFN\", 2)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"Catania\", 2)"), "//*[@numFound='1']");

// search parts of the affiliation
assertQ(req("q", "institution:\"SI\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"CfA\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"Harvard U\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"Law\""), "//*[@numFound='1']");
assertQ(req("q", "institution:\"Phys\""), "//*[@numFound='1']");

// search parts (but honour position)
assertQ(req("q", "pos(institution:\"SI\", 1)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"CfA\", 1)"), "//*[@numFound='1']");
assertQ(req("q", "pos(institution:\"Harvard U\", 2)"), "//*[@numFound='2']");
assertQ(req("q", "pos(institution:\"CfA\", 2)"), "//*[@numFound='1']");

// search parent/child
assertQ(req("q", "institution:\"SI/CfA\""), "//*[@numFound='2']");

// do the same but as phrase; it should fail because the parser WILL NOT
// treat empty space as a delimiter; it considers it part of the token
// like 'Harvard U'
assertQ(req("q", "institution:\"SI CfA\""), "//*[@numFound='0']");

// proximity operator however should yield the record
assertQ(req("q", "institution:\"SI\" NEAR1 institution:\"CfA\""), "//*[@numFound='2']");

// but not mix up insitutions that were separated by ';' (those affiliations
// belong to different authors/persons)
assertQ(req("q", "institution:\"Law\" NEAR5 institution:\"CfA\""), "//*[@numFound='0']");

// one person however can have multiple affiliations; and they can be searched via proximity
assertQ(req("q", "institution:\"Phys\" NEAR5 institution:\"Ast\""), "//*[@numFound='1']");

// SI/CfA is also known through identifiers/canonical names - NOTE: the input synonyms
// MUST CONTAIN correct entry, i.e. "SI/CfA" - for some reason we used to have "SI CfA"
assertQ(req("q", "institution:\"A01400\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"RID61814\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"Harvard Smithsonian Center for Astrophysics\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"03c3r2d17\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"Q1133697\""), "//*[@numFound='2']");
assertQ(req("q", "institution:\"grid.455754.2\""), "//*[@numFound='2']");


// what is the meaning of the pipe? (|) -- it forces our parser to treat the query
// as a regex; to not do that we have to set aqp.regex.disallowed.fields
assertQ(req("q", "institution:\"Center for Astrophysics | Harvard and Smithsonian\"",
"aqp.regex.disallowed.fields", "institution"), "//*[@numFound='2']");

// and we also want to find the records via parent/child relationship BUT using
// synonyms; so assume that parent (SI) is also known under synonym 'AX' and
// CfA is known under synonym 'AB'; the search "AX/AB" should then find the same
// thing as "SI/CfA" -- HOWEVER, note, this feature requires synonym mapping
// either of the explicit form:
// AX => SI
// or more forgiving (and more wasteful):
// AX;SI
// IMHO this feature is confusing; will bloat the synonym file and users
// are going to be confused by it. I'd just say: use canonical forms of
// the synonym. And, and... be aware that if the synonym file contains a cycle
// i.e. 'parent/child' entry sharing synonyms with 'child' entry; then the
// two will be merged and considered as one:
//
// SI/CfA;A01400
// CfA;A014000
//
// becomes:
// SI/CfA;A01400;CfA
assertQ(req("q", "institution:\"AX/AB\""), "//*[@numFound='2']");

}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,7 @@
types="wdafftypes.txt" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.TrimFilterFactory" />
<filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="aff_token"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.PatternReplaceCharFilterFactory"
Expand Down Expand Up @@ -974,6 +975,7 @@
setType="ACRONYM" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.TrimFilterFactory" />
<filter class="org.apache.solr.analysis.DiagnoseFilterFactory" msg="aff_text"/>
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory" />
Expand Down Expand Up @@ -1292,9 +1294,6 @@
docValues="true"/>
<field name="aff_id" type="affiliation_tokens" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<!-- TODO: remove once index has been rebuilt -->
<field name="aff_raw" type="affiliation_text" indexed="true" stored="true"
omitNorms="true" multiValued="true" />

<!-- for Unified Astronomy Thesaurus -->
<field name="uat" type="uat_tokens" indexed="true" stored="true"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,7 @@
<ramBufferSizeMB>${solr.ramBufferSize:1000}</ramBufferSizeMB>
<maxBufferedDocs>${solr.maxBufferedDocs:50000}</maxBufferedDocs>

<mergePolicyFactory class="solr.TieredMergePolicyFactory">
<int name="maxMergeAtOnce">${solr.mergePolicy.maxMergeAtOnce:10}</int>
<int name="segmentsPerTier">${solr.mergePolicy.segmentsPerTier:10}</int>
</mergePolicyFactory>


<!-- When we run several instances sharing the same index, we must
make sure that only one writer is modifying it; and other
Expand Down

0 comments on commit eeff1d7

Please sign in to comment.