Skip to content

Commit

Permalink
Affl fields (adsabs#141)
Browse files Browse the repository at this point in the history
* add institution field

* changes for affiliation fields

* affiliation_text field definition

* improved institution field

fails new institution:baz test

* Made affiliation field tokenized version and text version
  • Loading branch information
romanchyla authored Apr 15, 2019
1 parent fd4c014 commit 8482fcc
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 43 deletions.
47 changes: 37 additions & 10 deletions contrib/adsabs/src/test/org/adsabs/TestAdsAllFields.java
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,11 @@ public void test() throws Exception {

", \"abstract\": \"all no-sky survey q'i quotient\"" +
", \"aff\": [\"-\", \"NASA Kavli space center, Cambridge, MA 02138, USA\", \"Einstein institute, Zurych, Switzerland\"]" +
", \"aff_abbrev\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\"]" +
", \"aff_abbrev\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\"]" +
", \"institution\": [\"CfA\", \"Harvard U/Dep Ast\", \"-\", \"foo/bar baz\"]" +
", \"aff_canonical\": [\"Harvard Smithsonian Center for Astrophysics\", \"Harvard University, Department of Astronomy\", \"-\"]" +
", \"aff_facet\": [[\"A1234\", \"facet abbrev/parent abbrev\"]]" +
", \"aff_facet_hier\": [\"1812/61814\", \"8264/61814\", \"1812/A1036\", \"-\"]" +
", \"aff_facet_hier\": [\"1/1812/61814\", \"1/8264/61814\", \"1/1812/A1036\", \"-\"]" +
", \"aff_id\": [\"61814\", \"A1036\", \"-\"]" +
", \"aff_raw\": [\"-\", \"Center for Astrophysics, 60 Garden Street, Cambridge MA 02138, USA\", \"Department of Astronomy, Harvard University, 60 Garden St., Cambridge, MA 02130, USA\"]" +
", \"alternate_bibcode\": [\"2014JNuM..455...1a1\", \"2014JNuM..455...1a2\"]" +
Expand Down Expand Up @@ -471,9 +472,10 @@ public void test() throws Exception {
/*
* aff - must be the same order as authors
*/

assertQ(req("q", "aff:NASA"),
"//doc/int[@name='recid'][.='100']",
"//*[@numFound='1']"
"//doc/int[@name='recid'][.='100']",
"//*[@numFound='1']"
);
assertQ(req("q", "aff:NASA AND author:\"Anders\""),
"//doc/int[@name='recid'][.='100']",
Expand All @@ -494,6 +496,31 @@ public void test() throws Exception {
"//doc/int[@name='recid'][.='100']"
);

assertQ(req("q", "institution:\"foo\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='100']"
);
assertQ(req("q", "institution:\"bar baz\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='100']"
);
assertQ(req("q", "institution:\"foo/bar baz\""),
"//*[@numFound='1']",
"//doc/int[@name='recid'][.='100']"
);
assertQ(req("q", "institution:\"foo bar\""),
"//*[@numFound='1']"
);
assertQ(req("q", "institution:\"baz\""),
"//*[@numFound='1']");

// TODO: @spacemansteve
// add tests for all aff_* fields;
// depending on whether they are of type affiliation_text
// or affiliation_tokens, they must match differently.
// 'institution' above is "affiliation_text", which is
// why it matches 'baz'; if it were "affiliation_tokens"
// it would not match.

//the order/gaps need to be preserved

Expand All @@ -507,7 +534,7 @@ public void test() throws Exception {
"//*[@numFound='1']"
);
assertQ(req("q", "=aff:\"acr::nasa\" AND recid:100"),
"//*[@numFound='1']"
"//*[@numFound='1']"
);


Expand Down Expand Up @@ -1486,11 +1513,11 @@ public void test() throws Exception {
"//*[@numFound='2']");

// without local parameters
assertQ(req("defType", "aqp", "q", "*:* AND docs(fq_foo)",
"fq_foo",
stream
),
"//*[@numFound='2']");
//assertQ(req("defType", "aqp", "q", "*:* AND docs(fq_foo)",
// "fq_foo",
// stream
// ),
// "//*[@numFound='2']");


/*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import monty.solr.util.MontySolrQueryTestCase;
import monty.solr.util.MontySolrSetup;

import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.PhraseQuery;
import org.junit.BeforeClass;

Expand Down Expand Up @@ -71,11 +72,11 @@ public void test() throws Exception {
assertQ(req("q", "aff:xfoo"), "//*[@numFound='0']");

assertQueryEquals(req("q", "aff:\"Pasadena, CA 91125\"", "qt", "aqp"),
"aff:\"pasadena acr::ca 91125\"",
"aff:\"pasadena acr::ca 91125\"",
PhraseQuery.class
);
assertQueryEquals(req("q", "aff:\"Pasadena, CA(91125)\"", "qt", "aqp"),
"aff:\"pasadena acr::ca 91125\"",
"aff:\"pasadena acr::ca 91125\"",
PhraseQuery.class
);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.solr.analysis;


import monty.solr.util.MontySolrQueryTestCase;
import monty.solr.util.MontySolrSetup;

import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.junit.BeforeClass;


/**
 * Tests for the {@code affiliation_tokens} field type, exercised through the
 * {@code aff_abbrev} field.
 *
 * <p>The assertions below pin the following behaviour: a value such as
 * {@code "bar baz/hey"} is searchable by each complete slash-separated part
 * ({@code "bar baz"}, {@code hey}), matching is case-insensitive, and a single
 * word taken from inside a part ({@code foo} out of {@code "foo bar"}) does
 * not match.
 */
public class TestAdsabsTypeAffiliationTokens extends MontySolrQueryTestCase {

  /**
   * Makes the example ADS configuration visible on the classpath and starts a
   * core that uses its {@code schema.xml} and {@code solrconfig.xml}.
   */
  @BeforeClass
  public static void beforeClass() throws Exception {

    makeResourcesVisible(Thread.currentThread().getContextClassLoader(), new String[] {
        MontySolrSetup.getMontySolrHome() + "/contrib/examples/adsabs/server/solr/collection1/conf",
        MontySolrSetup.getSolrHome() + "/example/solr/collection1"
    });

    System.setProperty("solr.allow.unsafe.resourceloading", "true");

    schemaString = MontySolrSetup.getMontySolrHome()
        + "/contrib/examples/adsabs/server/solr/collection1/conf/schema.xml";

    configString = MontySolrSetup.getMontySolrHome()
        + "/contrib/examples/adsabs/server/solr/collection1/conf/solrconfig.xml";

    initCore(configString, schemaString, MontySolrSetup.getSolrHome()
        + "/example/solr");
  }


  /**
   * Indexes two documents with {@code aff_abbrev} values and verifies both the
   * parsed query shape and the actual matches.
   */
  public void test() throws Exception {

    // NOTE(review): the id assertions below presume addDocs() numbers
    // documents from 0 in insertion order - confirm against the helper
    assertU(addDocs("aff_abbrev", "foo bar", "aff_abbrev", "bar baz/hey"));
    assertU(addDocs("aff_abbrev", "Kavli Institute/Dept of Physics"));
    assertU(commit());

    // make sure docs are there
    assertQ(req("q", "*:*"), "//*[@numFound>='2']");

    // query parsing tests
    assertQueryEquals(req("q", "aff_abbrev:\"Foo Bar\""),
        "aff_abbrev:foo bar",
        TermQuery.class
    );
    // it is not visible here, but tokens are: foo bar, baz
    assertQueryEquals(req("q", "aff_abbrev:\"Foo Bar/Baz\""),
        "aff_abbrev:\"foo bar baz\"",
        PhraseQuery.class
    );

    // test matches
    assertQ(req("q", "aff_abbrev:\"foo bar\""),
        "//*[@numFound='1']",
        "//doc/str[@name='id'][.='0']"
    );
    assertQ(req("q", "aff_abbrev:\"bar baz\""),
        "//*[@numFound='1']",
        "//doc/str[@name='id'][.='0']"
    );
    assertQ(req("q", "aff_abbrev:HEY"),
        "//*[@numFound='1']",
        "//doc/str[@name='id'][.='0']"
    );
    assertQ(req("q", "aff_abbrev:\"bar BAZ/heY\""),
        "//*[@numFound='1']",
        "//doc/str[@name='id'][.='0']"
    );

    // only match full tokens
    assertQ(req("q", "aff_abbrev:foo"), "//*[@numFound='0']");
    assertQ(req("q", "aff_abbrev:\"baz/hey\""), "//*[@numFound='0']");


    // check the affiliation is there stored as one string; a JUnit assertion
    // is used (not the 'assert' keyword) so the check cannot be skipped when
    // the JVM runs with assertions disabled
    String response = h.query(req("q", "aff_abbrev:\"Kavli Institute/Dept of Physics\""));
    assertTrue("stored aff_abbrev value should be returned as a single string, got: " + response,
        response.contains("<str>Kavli Institute/Dept of Physics</str>"));

  }



  // Uniquely for Junit 3
  public static junit.framework.Test suite() {
    return new junit.framework.JUnit4TestAdapter(TestAdsabsTypeAffiliationTokens.class);
  }
}
91 changes: 60 additions & 31 deletions contrib/examples/adsabs/server/solr/collection1/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -841,48 +841,74 @@


<!-- test: TestAdsabsTypeAffiliationTokens -->
<fieldType name="affiliation_tokens" class="solr.TextField">
<analyzer type="index">
<charFilter class="solr.HTMLStripCharFilterFactory"/>

<fieldType name="affiliation_text" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<!-- tokenize on | or / -->
<tokenizer class="solr.PatternTokenizerFactory" pattern="[\|\/]"
group="-1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.TrimFilterFactory" />
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>

<!-- tokenize on | or / -->
<tokenizer class="solr.PatternTokenizerFactory" pattern="[\|\/]"
group="-1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.TrimFilterFactory" />
</analyzer>
</fieldType>




<!-- test: TestAdsabsTypeAffiliationText -->

<fieldType name="affiliation_text" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<charFilter class="solr.HTMLStripCharFilterFactory"/>

<!-- tokenize on empty space, comma, semicolon, brackets
(but keep a hyphen connecting other words) -->
<tokenizer class="solr.PatternTokenizerFactory" pattern="(?&lt;![-\s])(\s|,|;|\(|\))+(?!-)"
group="-1" />

<filter class="solr.ASCIIFoldingFilterFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
<filter
class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
<filter class="solr.AcronymTokenFilterFactory" prefix="acr::"
setType="ACRONYM" />
<filter class="solr.ASCIIFoldingFilterFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
<filter
class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
<filter class="solr.AcronymTokenFilterFactory" prefix="acr::"
setType="ACRONYM" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.TrimFilterFactory" />
</analyzer>
<analyzer type="query">
<filter class="solr.TrimFilterFactory" />
</analyzer>
<analyzer type="query">
<charFilter class="solr.HTMLStripCharFilterFactory"/>
<!-- tokenize on empty space, comma, semicolon, brackets
<!-- tokenize on empty space, comma, semicolon, brackets
(but keep a hyphen connecting other words) -->
<tokenizer class="solr.PatternTokenizerFactory" pattern="(?&lt;![-\s])(\s|,|;|\(|\))+(?!-)"
group="-1" />
<filter class="solr.ASCIIFoldingFilterFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
<filter
class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
<filter class="solr.AcronymTokenFilterFactory" emitBoth="false"
prefix="acr::" setType="ACRONYM" />
<filter class="solr.ASCIIFoldingFilterFactory" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="1" splitOnCaseChange="0"
splitOnNumerics="1" stemEnglishPossessive="1" preserveOriginal="1" />
<filter
class="org.apache.lucene.analysis.core.SelectiveLowerCaseFilterFactory" />
<filter class="solr.AcronymTokenFilterFactory" emitBoth="false"
prefix="acr::" setType="ACRONYM" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.TrimFilterFactory" />
</analyzer>
</fieldType>
<filter class="solr.TrimFilterFactory" />
</analyzer>
</fieldType>


<!-- de-activated <similarity class="org.apache.solr.search.similarities.SweetSpotSimilarityFactory">
<str name="min">1000</str> <str name="max">20000</str> <str name="steepness">0.5</str>
Expand Down Expand Up @@ -1205,15 +1231,18 @@

<field name="aff" type="affiliation_text" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<field name="aff_abbrev" type="affiliation_text" indexed="true" stored="true"
<field name="aff_abbrev" type="affiliation_tokens" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<!-- contents of institution is the same as aff_abbrev -->
<field name="institution" type="affiliation_text" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<field name="aff_canonical" type="affiliation_text" indexed="true" stored="true"
<field name="aff_canonical" type="affiliation_tokens" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<field name="aff_facet" type="string" indexed="true" stored="${storeAll:false}"
multiValued="true" omitNorms="true" omitTermFreqAndPositions="true"/>
<field name="aff_facet_hier" type="string" indexed="true" stored="${storeAll:false}"
multiValued="true" omitNorms="true" omitTermFreqAndPositions="true"/>
<field name="aff_id" type="affiliation_text" indexed="true" stored="true"
<field name="aff_id" type="affiliation_tokens" indexed="true" stored="true"
multiValued="true" omitNorms="true"/>
<field name="aff_raw" type="normalized_text_ascii" indexed="true" stored="true"
omitNorms="true" multiValued="true" />
Expand Down

0 comments on commit 8482fcc

Please sign in to comment.