From 2229efeda270006b21e3510033dbfddfa8cdd382 Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Fri, 6 Jun 2014 20:51:28 -0400 Subject: [PATCH 1/9] Fix for #604 --- .classpath | 84 ++--- .../AdsSpecialCharactersFilter.java | 288 ++++++++++++++++++ .../AdsSpecialCharactersFilterFactory.java | 47 +++ .../processors/AqpPostAnalysisProcessor.java | 40 ++- .../solr/analysis/DiagnoseFilterFactory.java | 47 +-- .../TestAdsabsTypeFulltextParsing.java | 233 ++++++++++---- .../adsabs/solr/collection1/conf/schema.xml | 16 +- 7 files changed, 632 insertions(+), 123 deletions(-) create mode 100644 contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilter.java create mode 100644 contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilterFactory.java diff --git a/.classpath b/.classpath index 8199ba902..b43df69fe 100644 --- a/.classpath +++ b/.classpath @@ -1,19 +1,34 @@ - - + + + + - + + + - - - - - + + + + + + + + + + + + + + + + @@ -25,54 +40,43 @@ - + + + - + + + + + + + + + + - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - + + + diff --git a/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilter.java b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilter.java new file mode 100644 index 000000000..e008f73e7 --- /dev/null +++ b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilter.java @@ -0,0 +1,288 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * + * This is a modified ASCIIFoldingFilter + * + * It translates greek math symbols + * + * For example, 'γ' will be replaced by 'gamma'. + */ +public final class AdsSpecialCharactersFilter extends TokenFilter { + public AdsSpecialCharactersFilter(TokenStream input) + { + super(input); + } + + private char[] output = new char[512]; + private int outputPos; + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + @Override + public boolean incrementToken() throws IOException { + if (input.incrementToken()) { + final char[] buffer = termAtt.buffer(); + final int length = termAtt.length(); + + // If no characters actually require rewriting then we + // just return token as-is: + for(int i = 0 ; i < length ; ++i) { + final char c = buffer[i]; + if (c >= '\u0080') + { + foldToASCII(buffer, length); + termAtt.copyBuffer(output, 0, outputPos); + break; + } + } + return true; + } else { + return false; + } + } + + /** + * Converts characters above ASCII to their ASCII equivalents. For example, + * accents are removed from accented characters. 
+ * @param input The string to fold + * @param length The number of characters in the input string + */ + public void foldToASCII(char[] input, int length) + { + // Worst-case length required: + final int maxSizeNeeded = 4 * length; + if (output.length < maxSizeNeeded) { + output = new char[ArrayUtil.oversize(maxSizeNeeded, RamUsageEstimator.NUM_BYTES_CHAR)]; + } + + outputPos = foldToASCII(input, 0, output, 0, length); + } + + /** + * Converts characters above ASCII to their ASCII equivalents. For example, + * accents are removed from accented characters. + * @param input The characters to fold + * @param inputPos Index of the first character to fold + * @param output The result of the folding. Should be of size >= {@code length * 4}. + * @param outputPos Index of output where to put the result of the folding + * @param length The number of characters to fold + * @return length of output + * @lucene.internal + */ + public static final int foldToASCII(char input[], int inputPos, char output[], int outputPos, int length) + { + final int end = inputPos + length; + for (int pos = inputPos; pos < end ; ++pos) { + final char c = input[pos]; + + // Quick test: if it's not in range then just keep current character + if (c < '\u0080') { + output[outputPos++] = c; + } else { + switch (c) { + case '\u0391': + case '\u03B1': + output[outputPos++] = 'a'; + output[outputPos++] = 'l'; + output[outputPos++] = 'p'; + output[outputPos++] = 'h'; + output[outputPos++] = 'a'; + break; + case '\u0392': + case '\u03B2': + output[outputPos++] = 'b'; + output[outputPos++] = 'e'; + output[outputPos++] = 't'; + output[outputPos++] = 'a'; + break; + case '\u0393': + case '\u03B3': + output[outputPos++] = 'g'; + output[outputPos++] = 'a'; + output[outputPos++] = 'm'; + output[outputPos++] = 'm'; + output[outputPos++] = 'a'; + break; + case '\u0394': + case '\u03B4': + output[outputPos++] = 'd'; + output[outputPos++] = 'e'; + output[outputPos++] = 'l'; + output[outputPos++] = 't'; + 
output[outputPos++] = 'a'; + break; + case '\u0395': + case '\u03B5': + output[outputPos++] = 'e'; + output[outputPos++] = 'p'; + output[outputPos++] = 's'; + output[outputPos++] = 'i'; + output[outputPos++] = 'l'; + output[outputPos++] = 'o'; + output[outputPos++] = 'n'; + break; + case '\u0396': + case '\u03B6': + output[outputPos++] = 'z'; + output[outputPos++] = 'e'; + output[outputPos++] = 't'; + output[outputPos++] = 'a'; + break; + case '\u0397': + case '\u03B7': + output[outputPos++] = 'e'; + output[outputPos++] = 't'; + output[outputPos++] = 'a'; + break; + case '\u0398': + case '\u03B8': + output[outputPos++] = 't'; + output[outputPos++] = 'h'; + output[outputPos++] = 'e'; + output[outputPos++] = 't'; + output[outputPos++] = 'a'; + break; + case '\u0399': + case '\u03B9': + output[outputPos++] = 'i'; + output[outputPos++] = 'o'; + output[outputPos++] = 't'; + output[outputPos++] = 'a'; + break; + case '\u039A': + case '\u03BA': + output[outputPos++] = 'k'; + output[outputPos++] = 'a'; + output[outputPos++] = 'p'; + output[outputPos++] = 'p'; + output[outputPos++] = 'a'; + break; + case '\u039B': + case '\u03BB': + output[outputPos++] = 'l'; + output[outputPos++] = 'a'; + output[outputPos++] = 'm'; + output[outputPos++] = 'b'; + output[outputPos++] = 'd'; + output[outputPos++] = 'a'; + break; + case '\u039C': + case '\u03BC': + output[outputPos++] = 'm'; + output[outputPos++] = 'u'; + break; + case '\u039D': + case '\u03BD': + output[outputPos++] = 'n'; + output[outputPos++] = 'u'; + break; + case '\u039E': + case '\u03BE': + output[outputPos++] = 'x'; + output[outputPos++] = 'i'; + break; + case '\u039F': + case '\u03BF': + output[outputPos++] = 'o'; + output[outputPos++] = 'm'; + output[outputPos++] = 'i'; + output[outputPos++] = 'c'; + output[outputPos++] = 'r'; + output[outputPos++] = 'o'; + output[outputPos++] = 'n'; + break; + case '\u03A0': + case '\u03C0': + output[outputPos++] = 'p'; + output[outputPos++] = 'i'; + break; + case '\u03A1': + case 
'\u03C1': + output[outputPos++] = 'r'; + output[outputPos++] = 'h'; + output[outputPos++] = 'o'; + break; + case '\u03A3': + case '\u03C3': + output[outputPos++] = 's'; + output[outputPos++] = 'i'; + output[outputPos++] = 'g'; + output[outputPos++] = 'm'; + output[outputPos++] = 'a'; + break; + case '\u03A4': + case '\u03C4': + output[outputPos++] = 't'; + output[outputPos++] = 'a'; + output[outputPos++] = 'u'; + break; + case '\u03A5': + case '\u03C5': + output[outputPos++] = 'u'; + output[outputPos++] = 'p'; + output[outputPos++] = 's'; + output[outputPos++] = 'i'; + output[outputPos++] = 'l'; + output[outputPos++] = 'o'; + output[outputPos++] = 'n'; + break; + case '\u03A6': + case '\u03C6': + output[outputPos++] = 'p'; + output[outputPos++] = 'h'; + output[outputPos++] = 'i'; + break; + case '\u03A7': + case '\u03C7': + output[outputPos++] = 'c'; + output[outputPos++] = 'h'; + output[outputPos++] = 'i'; + break; + case '\u03A8': + case '\u03C8': + output[outputPos++] = 'p'; + output[outputPos++] = 's'; + output[outputPos++] = 'i'; + break; + case '\u03A9': + case '\u03C9': + output[outputPos++] = 'o'; + output[outputPos++] = 'm'; + output[outputPos++] = 'e'; + output[outputPos++] = 'g'; + output[outputPos++] = 'a'; + break; + default: + output[outputPos++] = c; + break; + } + } + } + return outputPos; + } +} diff --git a/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilterFactory.java b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilterFactory.java new file mode 100644 index 000000000..b7d7bb202 --- /dev/null +++ b/contrib/adsabs/src/java/org/apache/lucene/analysis/miscellaneous/AdsSpecialCharactersFilterFactory.java @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.miscellaneous; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.analysis.util.AbstractAnalysisFactory; +import org.apache.lucene.analysis.util.MultiTermAwareComponent; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; +import org.apache.lucene.analysis.TokenStream; + +/** + * Factory for {@link ASCIIFoldingFilter}. + *
+ * <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.AdsSpecialCharactersFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * + */ +public class AdsSpecialCharactersFilterFactory extends TokenFilterFactory implements MultiTermAwareComponent { + public AdsSpecialCharactersFilter create(TokenStream input) { + return new AdsSpecialCharactersFilter(input); + } + + @Override + public AbstractAnalysisFactory getMultiTermComponent() { + return this; + } +} + diff --git a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpPostAnalysisProcessor.java b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpPostAnalysisProcessor.java index cffee84a7..073ee762a 100644 --- a/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpPostAnalysisProcessor.java +++ b/contrib/adsabs/src/java/org/apache/lucene/queryparser/flexible/aqp/processors/AqpPostAnalysisProcessor.java @@ -339,7 +339,7 @@ public List>> traverseGraphFindAllQueries() // measure how long a string the query covers List> paths = path.getAllPaths(); - int[] measured = measurePaths(paths); + int[] measured = measurePathsInclGaps(paths); // we'll consider only the queries that cover the max distance int max = 0; @@ -352,8 +352,11 @@ public List>> traverseGraphFindAllQueries() // retrieve only the queries made of query elements that cover the longest distance for (int i=0;i> oneQuery = new ArrayList>(); retrieveQueryElements(oneQuery, paths.get(i), 0); assert oneQuery.size() == paths.get(i).size() / 2; @@ -411,6 +414,37 @@ private int[] measurePaths(List> paths) { } return measuredPaths; } + + /** + * Measure the length that the path covers; but penalize gaps; + * eg. 
if there is a gap between tokens bigger than 2; the total + * length will be decreased + * + * @param paths + * @return + */ + private int[] measurePathsInclGaps(List> paths) { + int[] measuredPaths = new int[paths.size()]; + int pathLength = 0; + for (int j=0; j path = paths.get(j); + assert path.size() % 2 == 0; + pathLength = path.get(path.size()-1) - path.get(0); + int gaps = 0; + // measure the gaps between tokens + for (int i=1;i args) { - super.init(args); - } - - /* (non-Javadoc) - * @see org.apache.solr.analysis.TokenFilterFactory#create(org.apache.lucene.analysis.TokenStream) - */ - public DiagnoseFilter create(TokenStream input) { - return new DiagnoseFilter(input); - } - + public void init(Map args) { + super.init(args); + + if (args.containsKey("idString")) { + idString = args.remove("idString"); + } + } + + /* (non-Javadoc) + * @see org.apache.solr.analysis.TokenFilterFactory#create(org.apache.lucene.analysis.TokenStream) + */ + public DiagnoseFilter create(TokenStream input) { + return new DiagnoseFilter(input, this.idString); + } + } final class DiagnoseFilter extends TokenFilter { - + private int numTokens = 0; private final PositionIncrementAttribute posIncrAtt; @@ -35,32 +41,35 @@ final class DiagnoseFilter extends TokenFilter { private final CharTermAttribute termAtt; private final OffsetAttribute offsetAtt; + private String idString; - public DiagnoseFilter(TokenStream input) { + + public DiagnoseFilter(TokenStream input, String idString) { super(input); posIncrAtt = addAttribute(PositionIncrementAttribute.class); typeAtt = addAttribute(TypeAttribute.class); termAtt = addAttribute(CharTermAttribute.class); offsetAtt = addAttribute(OffsetAttribute.class); + this.idString = idString; } - - + + /* (non-Javadoc) * @see org.apache.lucene.analysis.TokenStream#incrementToken() */ @Override public boolean incrementToken() throws IOException { - + if (!input.incrementToken()) return false; - System.out.println("term=" + termAtt.toString() + " pos=" + 
posIncrAtt.getPositionIncrement() + " type=" + typeAtt.type() + " offsetStart=" + offsetAtt.startOffset() + " offsetEnd=" + offsetAtt.endOffset()); + System.out.println(idString + " term=" + termAtt.toString() + " pos=" + posIncrAtt.getPositionIncrement() + " type=" + typeAtt.type() + " offsetStart=" + offsetAtt.startOffset() + " offsetEnd=" + offsetAtt.endOffset()); return true; } - - + + @Override public void reset() throws IOException { super.reset(); diff --git a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java index 59afedfa5..de3bee7d4 100644 --- a/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java +++ b/contrib/adsabs/src/test/org/apache/solr/analysis/TestAdsabsTypeFulltextParsing.java @@ -131,7 +131,16 @@ public String getSchemaFile() { File simpleTokenSynonymsFile = createTempFile(new String[]{ "moon,moons,luna,lune,mond=>lunar\n" + "stetoscope=>glass\n" + - "pace=> lunar\n" + "pace=> lunar\n" + + "mhz, khz, terahertz, hertz, gigahertz, kilohertz, megahertz, hertzian, millihertz, microhz, microhertz, submegahertz, millihz, gigahertzs, microherz => mhz\n" + + "survey, surveys, surveyed, surveyor, surveying, durchmusterung, surveyors, resurveyed, resurvey, minisurvey, survery, durchmusterungen, nonsurvey, surveyable, relevamientos, surveyof, serveying, unsurveyable, surfey, servey => survey\n" + + "source, sources, multisource, sourcing, sourceless, quellen, souce, subsources, radioquellen, souces, soruce, circumsource, soruces, sourse, sourses, subsource, pseudosource, surces, cources, intersource, sourcers, intrasource, sourcefile, scource, souarce, sourceat => source\n" + + "faint, fainter, faintest, faintness, faintly, faintward, faintwards, faintening, fiant => faint\n" + + "gamma, gammas, amma, gam, gama, gamm, gammar, gammma, gramma, gammaisation => gamma\n" + + "radio, radios, nonradio, radioed, 
radiobereich, adio, miniradio, radido => radio\n" + + "pulsars, pulsar, psr, pulser, psrs, pulsare, pulsares, pulars, pulsary, puslsar, interpulsars, pusar, nonpulsar, psro, rontgenpulsare, pulsarlike, pulsarpsr => pulsars\n" + + "millisecond, milliseconds, submillisecond, millisec, milliseconde, millesecond, millisekunden, milliseond, millisecnd => millisecond\n" + + "fermi, fermilab => fermi\n" }); File multiTokenSynonymsFile = createTempFile(new String[]{ @@ -142,7 +151,10 @@ public String getSchemaFile() { "ABC,Astrophysics\0Business\0Center\n" + "Astrophysics\0Business\0Commons, ABC\n" + "MOND,modified\0newtonian\0dynamics\n" + - "bubble\0pace\0telescope,BPT\n" + "bubble\0pace\0telescope,BPT\n" + + "GBT,Green\0bank\0telescope\n" + + "gamma\0ray,gammaray,gamma\0rays,gammarays\n" + }); replaceInFile(newConfig, "synonyms=\"ads_text_multi.synonyms\"", "synonyms=\"" + multiTokenSynonymsFile.getAbsolutePath() + "\""); @@ -167,44 +179,51 @@ public String getSolrConfigFile() { public void setUp() throws Exception { super.setUp(); - assertU(adoc(F.ID, "1", F.BIBCODE, "xxxxxxxxxxxx1", F.TYPE_ADS_TEXT, "Bílá kobyla skočila přes čtyřista")); - assertU(adoc(F.ID, "2", F.BIBCODE, "xxxxxxxxxxxx2", F.TYPE_ADS_TEXT, "třicet-tři stříbrných střech")); - assertU(adoc(F.ID, "3", F.BIBCODE, "xxxxxxxxxxxx3", F.TYPE_ADS_TEXT, "A ještě TřistaTřicetTři stříbrných křepeliček")); - assertU(adoc(F.ID, "4", F.BIBCODE, "xxxxxxxxxxxx4", F.TYPE_ADS_TEXT, "Mirrors of the hubble space telescope goes home")); - assertU(adoc(F.ID, "5", F.BIBCODE, "xxxxxxxxxxxx5", F.TYPE_ADS_TEXT, "Mirrors of the HST second")); - assertU(adoc(F.ID, "6", F.BIBCODE, "xxxxxxxxxxxx6", F.TYPE_ADS_TEXT, "Mirrors of the Hst third")); - assertU(adoc(F.ID, "7", F.BIBCODE, "xxxxxxxxxxxx7", F.TYPE_ADS_TEXT, "Mirrors of the HubbleSpaceTelescope fourth")); - assertU(adoc(F.ID, "8", F.BIBCODE, "xxxxxxxxxxxx8", F.TYPE_ADS_TEXT, "Take Massachusets Institute of Technology (MIT)")); - assertU(adoc(F.ID, "9", F.BIBCODE, 
"xxxxxxxxxxxx9", F.TYPE_ADS_TEXT, "MIT developed new network protocols")); - assertU(adoc(F.ID, "10", F.BIBCODE, "xxxxxxxxxxx10", F.TYPE_ADS_TEXT, "No-sky data survey")); - assertU(adoc(F.ID, "11", F.BIBCODE, "xxxxxxxxxxx11", F.TYPE_ADS_TEXT, "All-sky data survey")); - assertU(adoc(F.ID, "12", F.BIBCODE, "xxxxxxxxxxx12", F.TYPE_ADS_TEXT, "NoSky data survey")); - assertU(adoc(F.ID, "13", F.BIBCODE, "xxxxxxxxxxx13", F.TYPE_ADS_TEXT, "AllSky data survey")); - assertU(adoc(F.ID, "14", F.BIBCODE, "xxxxxxxxxxx14", F.TYPE_ADS_TEXT, "Modified Newtonian Dynamics (MOND): Observational Phenomenology and Relativistic Extensions")); - assertU(adoc(F.ID, "15", F.BIBCODE, "xxxxxxxxxxx15", F.TYPE_ADS_TEXT, "MOND test")); - assertU(adoc(F.ID, "16", F.BIBCODE, "xxxxxxxxxxx16", F.TYPE_ADS_TEXT, "mond test")); - assertU(adoc(F.ID, "17", F.BIBCODE, "xxxxxxxxxxx17", F.TYPE_ADS_TEXT, "bubble pace telescope multi-pace foobar")); - assertU(adoc(F.ID, "18", F.BIBCODE, "xxxxxxxxxxx18", F.TYPE_ADS_TEXT, "Mirrors of the Hubble fooox Space Telescope")); - assertU(adoc(F.ID, "19", F.BIBCODE, "xxxxxxxxxxx19", F.TYPE_ADS_TEXT, "BPT MIT")); - assertU(adoc(F.ID, "20", F.BIBCODE, "xxxxxxxxxxx20", F.TYPE_ADS_TEXT, "bubble pace telescope multi-foo")); - assertU(adoc(F.ID, "21", F.BIBCODE, "xxxxxxxxxxx21", F.TYPE_ADS_TEXT, "BPT multi-foo")); - - assertU(adoc(F.ID, "147", F.BIBCODE, "xxxxxxxxxx147", F.TYPE_ADS_TEXT, "NAG5-5269")); - assertU(adoc(F.ID, "148", F.BIBCODE, "xxxxxxxxxx148", F.TYPE_ADS_TEXT, "NAG55269")); - assertU(adoc(F.ID, "149", F.BIBCODE, "xxxxxxxxxx149", F.TYPE_ADS_TEXT, "NAG5 5269")); - assertU(adoc(F.ID, "150", F.BIBCODE, "xxxxxxxxxx150", F.TYPE_ADS_TEXT, "nag5-5269")); - assertU(adoc(F.ID, "151", F.BIBCODE, "xxxxxxxxxx151", F.TYPE_ADS_TEXT, "nag55269")); - assertU(adoc(F.ID, "152", F.BIBCODE, "xxxxxxxxxx152", F.TYPE_ADS_TEXT, "nag5 5269")); - - assertU(adoc(F.ID, "318", F.BIBCODE, "xxxxxxxxxx318", F.TYPE_ADS_TEXT, "creation of a thesaurus")); - assertU(adoc(F.ID, "382", F.BIBCODE, 
"xxxxxxxxxx382", F.TYPE_ADS_TEXT, "xhtml should be fooxx ")); + assertU(adoc("id", "1", "bibcode", "xxxxxxxxxxxx1", "title", "Bílá kobyla skočila přes čtyřista")); + assertU(adoc("id", "2", "bibcode", "xxxxxxxxxxxx2", "title", "třicet-tři stříbrných střech")); + assertU(adoc("id", "3", "bibcode", "xxxxxxxxxxxx3", "title", "A ještě TřistaTřicetTři stříbrných křepeliček")); + assertU(adoc("id", "4", "bibcode", "xxxxxxxxxxxx4", "title", "Mirrors of the hubble space telescope goes home")); + assertU(adoc("id", "5", "bibcode", "xxxxxxxxxxxx5", "title", "Mirrors of the HST second")); + assertU(adoc("id", "6", "bibcode", "xxxxxxxxxxxx6", "title", "Mirrors of the Hst third")); + assertU(adoc("id", "7", "bibcode", "xxxxxxxxxxxx7", "title", "Mirrors of the HubbleSpaceTelescope fourth")); + assertU(adoc("id", "8", "bibcode", "xxxxxxxxxxxx8", "title", "Take Massachusets Institute of Technology (MIT)")); + assertU(adoc("id", "9", "bibcode", "xxxxxxxxxxxx9", "title", "MIT developed new network protocols")); + assertU(adoc("id", "10", "bibcode", "xxxxxxxxxxx10", "title", "No-sky data survey")); + assertU(adoc("id", "11", "bibcode", "xxxxxxxxxxx11", "title", "All-sky data survey")); + assertU(adoc("id", "12", "bibcode", "xxxxxxxxxxx12", "title", "NoSky data survey")); + assertU(adoc("id", "13", "bibcode", "xxxxxxxxxxx13", "title", "AllSky data survey")); + assertU(adoc("id", "14", "bibcode", "xxxxxxxxxxx14", "title", "Modified Newtonian Dynamics (MOND): Observational Phenomenology and Relativistic Extensions")); + assertU(adoc("id", "15", "bibcode", "xxxxxxxxxxx15", "title", "MOND test")); + assertU(adoc("id", "16", "bibcode", "xxxxxxxxxxx16", "title", "mond test")); + assertU(adoc("id", "17", "bibcode", "xxxxxxxxxxx17", "title", "bubble pace telescope multi-pace foobar")); + assertU(adoc("id", "18", "bibcode", "xxxxxxxxxxx18", "title", "Mirrors of the Hubble fooox Space Telescope")); + assertU(adoc("id", "19", "bibcode", "xxxxxxxxxxx19", "title", "BPT MIT")); + assertU(adoc("id", 
"20", "bibcode", "xxxxxxxxxxx20", "title", "bubble pace telescope multi-foo")); + assertU(adoc("id", "21", "bibcode", "xxxxxxxxxxx21", "title", "BPT multi-foo")); + + assertU(adoc("id", "147", "bibcode", "xxxxxxxxxx147", "title", "NAG5-5269")); + assertU(adoc("id", "148", "bibcode", "xxxxxxxxxx148", "title", "NAG55269")); + assertU(adoc("id", "149", "bibcode", "xxxxxxxxxx149", "title", "NAG5 5269")); + assertU(adoc("id", "150", "bibcode", "xxxxxxxxxx150", "title", "nag5-5269")); + assertU(adoc("id", "151", "bibcode", "xxxxxxxxxx151", "title", "nag55269")); + assertU(adoc("id", "152", "bibcode", "xxxxxxxxxx152", "title", "nag5 5269")); + + assertU(adoc("id", "318", "bibcode", "xxxxxxxxxx318", "title", "creation of a thesaurus")); + assertU(adoc("id", "382", "bibcode", "xxxxxxxxxx382", "title", "xhtml should be fooxx ")); + + // greek letter should not be a problem, #604 + assertU(adoc("id", "400", "bibcode", "xxxxxxxxxx400", "title", "A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$-ray Sources for Radio Millisecond Pulsars")); + assertU(adoc("id", "401", "bibcode", "xxxxxxxxxx401", "title", "A 350-MHz GBT Survey of 50 Faint Fermi γ-ray Sources for Radio Millisecond Pulsars")); + assertU(adoc("id", "402", "bibcode", "xxxxxxxxxx402", "title", "A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$ ray Sources for Radio Millisecond Pulsars")); + assertU(adoc("id", "403", "bibcode", "xxxxxxxxxx403", "title", "A 350-MHz GBT Survey of 50 Faint Fermi γ ray Sources for Radio Millisecond Pulsars")); + assertU(commit()); } public void testMultiTokens() throws Exception { - //dumpDoc(null, F.ID, F.TYPE_ADS_TEXT); + //dumpDoc(null, "id", "title"); // assertQueryEquals(req("q", "\"NASA grant\"~3 NEAR N*", "defType", "aqp", "qf", "author^1.5 title^1.4 abstract^1.3 all"), @@ -267,13 +286,13 @@ public void testMultiTokens() throws Exception { // unfielded simple token assertQueryEquals(req("q", "MOND", "defType", "aqp"), "(all:acr::mond all:syn::acr::mond all:syn::modified newtonian 
dynamics)", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":MOND"), "//*[@numFound='2']", + assertQ(req("q", "title" + ":MOND"), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); assertQueryEquals(req("q", "mond", "defType", "aqp"), "(all:mond all:syn::lunar)", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":mond"), + assertQ(req("q", "title" + ":mond"), "//*[@numFound='5']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']", @@ -283,7 +302,7 @@ public void testMultiTokens() throws Exception { assertQueryEquals(req("q", "Mond", "defType", "aqp"), "(all:mond all:syn::lunar)", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":Mond"), + assertQ(req("q", "title" + ":Mond"), "//*[@numFound='5']", "//doc/str[@name='id'][.='17']", // orig 'space' -> syn:lunar; look at the synonym file to understand "//doc/str[@name='id'][.='14']", @@ -295,7 +314,7 @@ public void testMultiTokens() throws Exception { // in our synonym files - look above) assertQueryEquals(req("q", "title:pace", "defType", "aqp"), "title:pace title:syn::lunar", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":pace"), + assertQ(req("q", "title" + ":pace"), "//*[@numFound='3']", "//doc/str[@name='id'][.='17']", "//doc/str[@name='id'][.='16']", @@ -305,12 +324,12 @@ public void testMultiTokens() throws Exception { // and 'lunar' is not on the left hand side assertQueryEquals(req("q", "title:lunar", "defType", "aqp"), "title:lunar", TermQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":lunar"), "//*[@numFound='0']"); + assertQ(req("q", "title" + ":lunar"), "//*[@numFound='0']"); // but 'luna' is a synonym (syn::lunar) assertQueryEquals(req("q", "title:luna", "defType", "aqp"), "title:luna title:syn::lunar", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":luna"), + assertQ(req("q", "title" + ":luna"), "//*[@numFound='3']", "//doc/str[@name='id'][.='17']", "//doc/str[@name='id'][.='16']", @@ -322,7 
+341,7 @@ public void testMultiTokens() throws Exception { "title:\"modified newtonian dynamics\"" + " (title:syn::acr::mond title:syn::modified newtonian dynamics)", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":\"modified newtonian dynamics\""), "//*[@numFound='2']", + assertQ(req("q", "title" + ":\"modified newtonian dynamics\""), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); @@ -337,7 +356,7 @@ public void testMultiTokens() throws Exception { " title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? multi (pace syn::lunar) foobar\"~2" + " title:\"(syn::bubble pace telescope syn::acr::bpt) ? ? ? multipace foobar\"~3", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":\"bubble pace telescope multi-pace foobar\""), "//*[@numFound='1']", + assertQ(req("q", "title" + ":\"bubble pace telescope multi-pace foobar\""), "//*[@numFound='1']", "//doc/str[@name='id'][.='17']"); @@ -345,7 +364,7 @@ public void testMultiTokens() throws Exception { assertQueryEquals(req("q", "title:modified\\ newtonian\\ dynamics", "defType", "aqp"), "(+title:modified +title:newtonian +title:dynamics) (title:syn::acr::mond title:syn::modified newtonian dynamics)", BooleanQuery.class); - assertQ(req("q", F.TYPE_ADS_TEXT + ":modified\\ newtonian\\ dynamics"), + assertQ(req("q", "title" + ":modified\\ newtonian\\ dynamics"), "//*[@numFound='2']", "//doc/str[@name='id'][.='14']", "//doc/str[@name='id'][.='15']"); @@ -399,7 +418,7 @@ public void testMultiTokens() throws Exception { public void unfieldedSearch() throws Exception { // non-phrase: by default do span search - setDebug(true); + //setDebug(true); assertQueryEquals(req("q", "hubble space telescope", "defType", "aqp", "aqp.unfielded.tokens.strategy", "join", "df", "all"), @@ -690,7 +709,7 @@ public void testSynonyms() throws Exception { //TODO: add the corresponding searches, but this shows we are indexing properly - //dumpDoc(null, F.ID, F.ADS_TEXT_TYPE); + //dumpDoc(null, 
"id", F.ADS_TEXT_TYPE); } public void testOtherCases() throws Exception { @@ -707,9 +726,9 @@ public void testOtherCases() throws Exception { // the ascii folding filter emits both unicode and the ascii version - assertQ(req("q", F.TYPE_ADS_TEXT + ":Bílá"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); - assertQ(req("q", F.TYPE_ADS_TEXT + ":Bila"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); - assertQ(req("q", F.TYPE_ADS_TEXT + ":bila"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); + assertQ(req("q", "title" + ":Bílá"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); + assertQ(req("q", "title" + ":Bila"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); + assertQ(req("q", "title" + ":bila"), "//*[@numFound='1']", "//doc[1]/str[@name='id'][.='1']"); // test that the two lines in the synonym file get merged and produce correct synonym expansion assertQueryEquals(req("q", "ABC", "defType", "aqp"), @@ -720,23 +739,23 @@ public void testOtherCases() throws Exception { // "all-sky" is indexed as "all", "sky", "all-sky" // we could achieve higher precision if WDDF generateWordParts=0 // but that would cause "some-other-hyphenated" tokens to be missed - assertQ(req("q", F.TYPE_ADS_TEXT + ":no-sky"), "//*[@numFound='2']", + assertQ(req("q", "title" + ":no-sky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='12']"); - assertQ(req("q", F.TYPE_ADS_TEXT + ":nosky"), "//*[@numFound='2']", + assertQ(req("q", "title" + ":nosky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='12']"); - assertQ(req("q", F.TYPE_ADS_TEXT + ":all-sky"), "//*[@numFound='2']", + assertQ(req("q", "title" + ":all-sky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='11']", "//doc/str[@name='id'][.='13']"); - assertQ(req("q", F.TYPE_ADS_TEXT + ":allsky"), "//*[@numFound='2']", + assertQ(req("q", "title" + ":allsky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='11']", 
"//doc/str[@name='id'][.='13']"); - assertQ(req("q", F.TYPE_ADS_TEXT + ":sky"), "//*[@numFound='2']", + assertQ(req("q", "title" + ":sky"), "//*[@numFound='2']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='11']" ); - assertQ(req("q", F.TYPE_ADS_TEXT + ":*sky"), "//*[@numFound='4']", + assertQ(req("q", "title" + ":*sky"), "//*[@numFound='4']", "//doc/str[@name='id'][.='10']", "//doc/str[@name='id'][.='11']", "//doc/str[@name='id'][.='12']", @@ -745,10 +764,110 @@ public void testOtherCases() throws Exception { /* * Html tags should be removed */ - //dumpDoc(null, F.TYPE_ADS_TEXT); - assertQ(req("q", F.TYPE_ADS_TEXT + ":xremoved"), "//*[@numFound='0']"); - assertQ(req("q", F.TYPE_ADS_TEXT + ":xhtml"), "//*[@numFound='1']", + + assertQ(req("q", "title" + ":xremoved"), "//*[@numFound='0']"); + assertQ(req("q", "title" + ":xhtml"), "//*[@numFound='1']", "//doc/str[@name='id'][.='382']"); + + /** + * Latex symbols should simply be converted to ascii + */ + + assertQ(req("q", "title:\"$\\gamma$-ray\""), + "//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']" + ); + assertQ(req("q", "title:\"$\\gamma$ ray\""), + "//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']" + ); + assertQ(req("q", "title:\"γ-ray\""), + "//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']" + ); + assertQ(req("q", "title:\"γ ray\""), + "//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']" + ); + + assertQueryEquals(req( + "q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$ ray Sources for Radio Millisecond Pulsars\"", + "defType", "aqp"), 
+ "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? 
(sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\"~2", + BooleanQuery.class); + + + assertQueryEquals(req( + "q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$-ray Sources for Radio Millisecond Pulsars\"", + "defType", "aqp"), + "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\"", + BooleanQuery.class); + + + assertQueryEquals(req( + "q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ ray Sources for Radio Millisecond Pulsars\"", + "defType", "aqp"), + "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + 
"title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma) ray (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) ? (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\"~2", + BooleanQuery.class); + + assertQueryEquals(req( + "q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ-ray Sources for Radio Millisecond Pulsars\"", + "defType", "aqp"), + "title:\"350 (mhz syn::mhz) (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (ray gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\" " + + "title:\"350mhz (acr::gbt syn::acr::gbt syn::green bank telescope) (survey syn::survey) 50 (faint syn::faint) (fermi syn::fermi) (gamma syn::gamma syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (ray gammaray syn::gamma ray syn::gammaray syn::gamma rays syn::gammarays) (sources syn::source) (radio syn::radio) (millisecond syn::millisecond) (pulsars syn::pulsars)\"", + BooleanQuery.class); + + //dumpDoc(null, "title"); + assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$ ray Sources for Radio Millisecond Pulsars\""), + 
"//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']"); + assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi $\\gamma$-ray Sources for Radio Millisecond Pulsars\""), + "//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']"); + + assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ-ray Sources for Radio Millisecond Pulsars\""), + "//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']"); + assertQ(req("q", "title:\"A 350-MHz GBT Survey of 50 Faint Fermi γ ray Sources for Radio Millisecond Pulsars\""), + "//*[@numFound='4']", + "//doc/str[@name='id'][.='400']", + "//doc/str[@name='id'][.='401']", + "//doc/str[@name='id'][.='402']", + "//doc/str[@name='id'][.='403']"); + + + + } diff --git a/contrib/examples/adsabs/solr/collection1/conf/schema.xml b/contrib/examples/adsabs/solr/collection1/conf/schema.xml index 4107d59b8..3e5621de8 100644 --- a/contrib/examples/adsabs/solr/collection1/conf/schema.xml +++ b/contrib/examples/adsabs/solr/collection1/conf/schema.xml @@ -394,6 +394,9 @@ replacement="-" replace="all" /> + + + - - + + + + + + - - + + From 5773e82c43d8c4143099db58b7e180ced3fa71b7 Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Mon, 9 Jun 2014 15:48:52 -0400 Subject: [PATCH 2/9] Updating the installation script --- contrib/examples/adsabs/reader.profile | 2 +- contrib/examples/adsabs/silent.profile | 2 +- contrib/examples/adsabs/writer.profile | 2 +- contrib/examples/build.xml | 5 +- .../examples/src/python/montysolrupdate.py | 272 +----------------- 5 files changed, 17 insertions(+), 266 deletions(-) diff --git a/contrib/examples/adsabs/reader.profile b/contrib/examples/adsabs/reader.profile index 
aa8b9d21b..dc04aa156 100644 --- a/contrib/examples/adsabs/reader.profile +++ b/contrib/examples/adsabs/reader.profile @@ -1,3 +1,3 @@ PYTHONPATH=${build.dir}/${example.name}/python${path.separator}$PYTHONPATH -MONTYSOLR_JVMARGS=-d64 -Xmx20480m -DstoreAll=true -Dmontysolr.enable.write=false -Dmontysolr.enable.warming=true -Djava.util.logging.config.file=${build.dir}/${example.name}/etc/logging.properties -Dmontysolr.locktype=native -XX:+AggressiveOpts -XX:+UseG1GC -XX:+UseStringCache -XX:+OptimizeStringConcat -XX:-UseSplitVerifier -XX:+UseNUMA -XX:MaxGCPauseMillis=50 -XX:GCPauseIntervalMillis=1000 +MONTYSOLR_JVMARGS=-d64 -Xmx20480m -DstoreAll=true -Dmontysolr.enable.write=false -Dmontysolr.enable.warming=true -Djetty.port=8984 -Djava.util.logging.config.file=${build.dir}/${example.name}/etc/logging.properties -Dmontysolr.locktype=native -XX:+AggressiveOpts -XX:+UseG1GC -XX:+UseStringCache -XX:+OptimizeStringConcat -XX:-UseSplitVerifier -XX:+UseNUMA -XX:MaxGCPauseMillis=50 -XX:GCPauseIntervalMillis=1000 MONTYSOLR_ARGS= diff --git a/contrib/examples/adsabs/silent.profile b/contrib/examples/adsabs/silent.profile index e969a6fac..ffbf03d68 100644 --- a/contrib/examples/adsabs/silent.profile +++ b/contrib/examples/adsabs/silent.profile @@ -1,3 +1,3 @@ PYTHONPATH=${build.dir}/${example.name}/python${path.separator}$PYTHONPATH -MONTYSOLR_JVMARGS=-d64 -Xmx2048m -Djava.util.logging.config.file=${build.dir}/${example.name}/etc/logging.properties -DstoreAll=true +MONTYSOLR_JVMARGS=-d64 -Xmx2048m -Djava.util.logging.config.file=${build.dir}/${example.name}/etc/logging.properties -DstoreAll=true -Djetty.port=8984 MONTYSOLR_ARGS= diff --git a/contrib/examples/adsabs/writer.profile b/contrib/examples/adsabs/writer.profile index 027767449..f02686c16 100644 --- a/contrib/examples/adsabs/writer.profile +++ b/contrib/examples/adsabs/writer.profile @@ -1,3 +1,3 @@ PYTHONPATH=${build.dir}/${example.name}/python${path.separator}$PYTHONPATH -MONTYSOLR_JVMARGS=-d64 -Xmx16084m 
-Dsolr.cache.size=12 -Dsolr.cache.initial=0 -DstoreAll=true -Dmontysolr.enable.write=true -Dmontysolr.enable.warming=false -Djava.util.logging.config.file=${build.dir}/${example.name}/etc/logging.properties +MONTYSOLR_JVMARGS=-d64 -Xmx16084m -Dsolr.cache.size=12 -Dsolr.cache.initial=0 -DstoreAll=true -Dmontysolr.enable.write=true -Dmontysolr.enable.warming=false -Djetty.port=8984 -Djava.util.logging.config.file=${build.dir}/${example.name}/etc/logging.properties MONTYSOLR_ARGS= diff --git a/contrib/examples/build.xml b/contrib/examples/build.xml index 0b5c06f14..18a9c1391 100644 --- a/contrib/examples/build.xml +++ b/contrib/examples/build.xml @@ -294,7 +294,7 @@ - %s' % (source, tgz)]) - print("Making .tar.bz2") - run_cmd(['tar cf - %s | bzip2 -9 > %s' % (source, bz)]) - print("Making .tar.xz") - run_cmd(['tar cf - %s | xz > %s' % (source, xz)]) - print('Calculating md5 sums') - checksum_tgz = hashlib.md5() - with open(tgz, 'rb') as data: - checksum_tgz.update(data.read()) - checksum_bz2 = hashlib.md5() - with open(bz, 'rb') as data: - checksum_bz2.update(data.read()) - checksum_xz = hashlib.md5() - with open(xz, 'rb') as data: - checksum_xz.update(data.read()) - print(' %s %8s %s' % ( - checksum_tgz.hexdigest(), int(os.path.getsize(tgz)), tgz)) - print(' %s %8s %s' % ( - checksum_bz2.hexdigest(), int(os.path.getsize(bz)), bz)) - print(' %s %8s %s' % ( - checksum_xz.hexdigest(), int(os.path.getsize(xz)), xz)) - with open(tgz + '.md5', 'w', encoding="ascii") as fp: - fp.write(checksum_tgz.hexdigest()) - with open(bz + '.md5', 'w', encoding="ascii") as fp: - fp.write(checksum_bz2.hexdigest()) - with open(xz + '.md5', 'w', encoding="ascii") as fp: - fp.write(checksum_xz.hexdigest()) - - - - class Tag(object): @@ -674,8 +616,9 @@ def start_indexing(instance_dir, port): url = 'http://localhost:%s/solr/invenio-doctor' % port rsp = req(url, command='status') - if rsp['status'] != 'idle': + if rsp['status'] == 'busy': print ('WARNING: live instance is reporting to be 
already busy: %s' % instance_dir) + return rsp = req(url, command='discover') rsp = req(url, command='start') @@ -786,12 +729,6 @@ def check_prerequisites(options): setup_ant(options) if options.setup_prerequisites or options.setup_python: setup_python(options) - if options.setup_prerequisites or options.setup_jcc: - setup_jcc(options) - if options.setup_prerequisites or options.setup_pylucene: - setup_pylucene(options) - if options.setup_prerequisites or options.setup_invenio: - setup_invenio(options) check_ant(options) @@ -824,7 +761,7 @@ def setup_ant(options): if options.force_recompilation and os.path.exists('ant'): run_cmd(['rm', '-fr', 'ant']) - elif os.path.exists('ant/RELEASE') and str(get_pid('ant/RELEASE')) == str(JCC_SVN_TAG): + elif os.path.exists('ant/RELEASE') and str(get_pid('ant/RELEASE')) == str(UPDATER_RELEASE): return # already installed with open("install_ant.sh", "w") as build_ant: @@ -845,7 +782,7 @@ def setup_ant(options): """ % {'java_home': JAVA_HOME, 'ant_home': os.path.join(INSTDIR, "perpetuum/ant"), - 'release': JCC_SVN_TAG}) + 'release': UPDATER_RELEASE}) run_cmd(['chmod', 'u+x', 'install_ant.sh']) run_cmd(['./install_ant.sh']) @@ -878,8 +815,8 @@ def setup_python(options): 'setuptools', 'sqlalchemy', 'mysql-python', - 'numpy', - 'lxml', + #'numpy', + #'lxml', 'simplejson', 'configobj', 'pyparsing==1.5.7', @@ -911,192 +848,9 @@ def setup_python(options): run_cmd(['chmod', 'u+x', 'install_python.sh']) run_cmd(['./install_python.sh']) -def setup_jcc(options): - - if options.force_recompilation and os.path.exists('jcc'): - run_cmd(['rm', '-fr', 'jcc']) - - if os.path.exists('jcc') and str(get_pid('jcc/RELEASE')) == str(JCC_SVN_TAG): - return # already there - - with open("install_jcc.sh", "w") as inpython: - header = '#!/bin/bash -e\n' - venv_activate = 'source python/bin/activate\n' - core_commands = '\n'.join([ - 'svn co http://svn.apache.org/repos/asf/lucene/pylucene/trunk/jcc@%(JCC_SVN_TAG)s', - 'cd jcc', - 'echo "0" > RELEASE', - 
'export USE_DISTUTILS', - 'export JCC_JDK=%(JAVA_HOME)s', - 'python setup.py build', - 'python setup.py bdist_egg', - 'python setup.py install', - 'echo "%(JCC_SVN_TAG)s" > RELEASE', - '', - '# verify installation', - 'cd ..', - 'python -c "import jcc;jcc.initVM();print jcc.__file__"', - '', - ]) - venv_deactivate = 'deactivate\n' - if options.no_venv: - venv_activate,venv_deactivate = '','' - - inpython.write( - (header+venv_activate+core_commands+venv_deactivate) - % {'JAVA_HOME': JAVA_HOME, 'JCC_SVN_TAG' : JCC_SVN_TAG} - ) - - run_cmd(['chmod', 'u+x', 'install_jcc.sh']) - run_cmd(['bash', '-e', './install_jcc.sh']) - - - -def setup_pylucene(options): - - if options.force_recompilation and os.path.exists('pylucene'): - run_cmd(['rm', '-fr', 'pylucene']) - - if os.path.exists('pylucene') and str(get_pid('pylucene/RELEASE')) == str(PYLUCENE_SVN_TAG): - return # already there - - - run_cmd(['rm', 'pylucene/Makefile*'], strict=False) - - with open("install_pylucene.sh", "w") as infile: - header = '#!/bin/bash -xe\n' - venv_activate = 'source python/bin/activate\n' - venv_deactivate = 'deactivate\n' - core_commands = '\n'.join([ - 'export ANT_HOME=%(ant_home)s', - 'export JAVA_HOME=%(java_home)s', - '', - 'svn co http://svn.apache.org/repos/asf/lucene/pylucene/trunk@%(PYLUCENE_SVN_TAG)s pylucene', - 'cd pylucene', - '', - 'echo "0" > RELEASE', - '', - 'if [ ! 
-f Makefile.copy ]; then', - ' cp Makefile Makefile.copy', - '' , - ' echo "VERSION=4.0-0', - 'LUCENE_SVN_VER=HEAD', - 'LUCENE_VER=4.0', - 'LUCENE_SVN=http://svn.apache.org/repos/asf/lucene/dev/branches/lucene_solr_4_0', - 'PYLUCENE:=\$(shell pwd)', - 'LUCENE_SRC=lucene-java-\$(LUCENE_VER)', - 'LUCENE=\$(LUCENE_SRC)/lucene', - 'PREFIX_PYTHON=/usr', - 'ANT=%(ant_home)s/bin/ant', - 'PYTHON=python', - 'JCC=\$(PYTHON) -m jcc.__main__ --shared --use_full_names', - 'NUM_FILES=3', - '" > Makefile', - '', - ' tail -n +25 Makefile.copy >> Makefile', - '', - ' # on stupid old centos, icupkg is outdated and since it took 2 hours of my life', - ' # i deactivate it from the build (we dont use it anyways....)', - '' , - " sed 's/shell which icupkg/shell which icupkgooooo/' Makefile > Makefile.tmp", - ' mv Makefile.tmp Makefile', - '', - 'fi', - '', - 'make', - 'make install', - '', - 'echo "%(PYLUCENE_SVN_TAG)s" > RELEASE', - '', - 'cd ..', - 'python -c "import lucene;lucene.initVM();print lucene.__file__"', - '', - ]) - if options.no_venv: - venv_activate,venv_deactivate = '','' - - infile.write( - (header+venv_activate+core_commands+venv_deactivate) - % {'ant_home': ANT_HOME, 'PYLUCENE_SVN_TAG': PYLUCENE_SVN_TAG,'java_home': JAVA_HOME}) - - run_cmd(['chmod', 'u+x', 'install_pylucene.sh']) - run_cmd(['./install_pylucene.sh']) -def setup_invenio(options): - - if options.force_recompilation and os.path.exists('invenio'): - run_cmd(['rm', '-fr', 'invenio']) - - if os.path.exists('invenio/RELEASE') and str(get_pid('invenio/RELEASE', raw=True)) == str(INVENIO_COMMIT): - return # already there - - #if os.path.exists('invenio.git') and os.path.exists('invenio'): - # with changed_dir('invenio.git'): - # commit = get_output(["git", "log", "--pretty=oneline", "--abbrev-commit", "-n 1"]) - # run_cmd(['git', 'fetch']) - # run_cmd(['git', 'reset', '--hard', 'origin/master']) - # run_cmd(['git', 'checkout', 'master']) - # commit2 = get_output(["git", "log", "--pretty=oneline", 
"--abbrev-commit", "-n 1"]) - # if commit == commit2 and len(commit2) > 0: - # return # already installed, no changes there - - - with open("install_invenio.sh", "w") as inpython: - inpython.write("""#!/bin/bash -e - -source python/bin/activate - -target=%(INSTDIR)s/perpetuum/invenio - -if [ ! -d invenio.git ]; then - git clone git://github.com/tiborsimko/invenio.git invenio.git -else - cd invenio.git - git fetch - git reset --hard origin/master - cd .. -fi - -site_packages=`python -c "import os,sys;print '%%s/lib/python%%s.%%s/site-packages' %% (os.path.realpath('python'), sys.version_info[0], sys.version_info[1])"` - -cd invenio.git -git checkout %(INVENIO_COMMIT)s - -aclocal && automake -a && autoconf - -export PYTHONPATH=%(INSTDIR)s/perpetuum/invenio/lib/python:$PYTHONPATH -CONFIGURE_OPTS="--with-python=`which python` --prefix=$target" -./configure $CONFIGURE_OPTS 0 %(INSTDIR)s/perpetuum/invenio/etc/invenio-local.conf - -echo "%(INVENIO_COMMIT)s" > %(INSTDIR)s/perpetuum/invenio/RELEASE - -# this actually generates invnenio module (inside invenio lib) -python %(INSTDIR)s/perpetuum/invenio/bin/inveniocfg --update-all - -deactivate -exit 0 -""" % {'INSTDIR':INSTDIR, - 'invenio_config': INVENIO_CONFIG.replace("$", "\\$"), - 'INVENIO_COMMIT': INVENIO_COMMIT}) - - run_cmd(['chmod', 'u+x', 'install_invenio.sh']) - run_cmd(['bash', '-e', './install_invenio.sh']) - def setup_build_properties(options): lines = [] @@ -1144,14 +898,12 @@ def upgrade_montysolr(curr_tag, git_tag,options): ' fi', ' ant clean', ' ant get-solr build-all', - ' ant test-python', ' ;;', '"minor" | "3")', ' if [ -f RELEASE ]; then', ' rm RELEASE', ' fi', ' ant get-solr build-all', - ' ant test-python', ' ;;', 'esac', '', @@ -1346,7 +1098,7 @@ def start_live_instance(options, instance_dir, port, start = '\n'.join(lines) start = re.sub(r'HOMEDIR=.*\n', 'HOMEDIR=%s\n' % os.path.realpath('.'), start) - start = re.sub(r'--port\s+\d+', '--port %s' % port, start) + start = re.sub(r'-Djetty.port\=\d+', 
'-Djetty.port=%s' % port, start) start = re.sub('\n([\t\s]+)(java -cp )', '\\1export PATH=%s/bin:$PATH\n\\1\\2' % JAVA_HOME, start) # this is necessary only when in test run (and there we can be sure that the files were @@ -1392,7 +1144,7 @@ def start_live_instance(options, instance_dir, port, fo.close() run_cmd(['chmod', 'u+x', 'automatic-run.sh']) - run_cmd(['bash', '-e', './automatic-run.sh', '"%s"' % START_JVMARGS, '"%s"' % START_ARGS, '&']) + run_cmd(['bash', '-e', './automatic-run.sh', '"%s"' % START_JVMARGS, '"%s"' % START_ARGS, '&'], False) fo = open('manual-run.sh', 'w') fo.write('bash -e ./automatic-run.sh "%s" "%s" &' % (START_JVMARGS, START_ARGS)) From b4ad72860f5816e22e2d18bd5f8484f640028416 Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Tue, 10 Jun 2014 17:52:23 -0400 Subject: [PATCH 3/9] Added a new function to generate JSON representation of the parse tree --- .classpath | 4 +- contrib/antlrqueryparser/build.properties | 4 +- contrib/antlrqueryparser/build.xml | 2 +- .../flexible/aqp/nodes/AqpANTLRNode.java | 75 ++++++++++++++++++- .../aqp/processors/AqpQProcessor.java | 5 -- 5 files changed, 79 insertions(+), 11 deletions(-) diff --git a/.classpath b/.classpath index b43df69fe..c8889b713 100644 --- a/.classpath +++ b/.classpath @@ -21,7 +21,7 @@ - + @@ -60,7 +60,7 @@ - + diff --git a/contrib/antlrqueryparser/build.properties b/contrib/antlrqueryparser/build.properties index ac55ea34c..82019ca2c 100644 --- a/contrib/antlrqueryparser/build.properties +++ b/contrib/antlrqueryparser/build.properties @@ -7,4 +7,6 @@ dot_viewer=xdot svg_generator=dot -java_executable=java \ No newline at end of file +java_executable=java + +python=python \ No newline at end of file diff --git a/contrib/antlrqueryparser/build.xml b/contrib/antlrqueryparser/build.xml index 13953927c..db9981d72 100644 --- a/contrib/antlrqueryparser/build.xml +++ b/contrib/antlrqueryparser/build.xml @@ -221,7 +221,7 @@ Note: the ${dotprop.svg_generator} must exist and be executable You can 
fix the path in ${common.dir}/contrib/antlrqueryparser/build.properties - ", ">"); + } + + public String escapeJsonVal(String v) { + return v.replace("\"", "\\\"").replace("'", "\\'").replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n"); + } + + /** + * Method to transform the tree into JSON - so that we can send it to the + * javascript clients + * + * @return + */ + public String toJson() { + return toJson(0); + } + + public String toJson(int level) { + StringBuffer buf = new StringBuffer(); + buf.append("\n"); + for (int i = 0; i < level; i++) { + buf.append(" "); + } + + buf.append("{name:\""); + buf.append(getTokenName()); + buf.append("\""); + + if (getTokenInput() != null) { + buf.append(", input:\""); + buf.append(escapeJsonVal(getTokenInput())); + buf.append("\", start:" + getTokenStart()); + buf.append(", end:" + getTokenEnd()); + } else { + buf.append(", label:\""); + buf.append(getTokenLabel()); + buf.append("\""); + } + + List children = this.getChildren(); + + if (children != null) { + buf.append(", children: ["); + for (QueryNode child : children) { + if (child instanceof AqpANTLRNode) { + buf.append(((AqpANTLRNode) child).toJson(level + 4)); + buf.append(","); + } else { + buf.append("{xvalue:\""); + buf.append(escapeJsonVal(child.toString())); + buf.append("\"}"); + } + } + buf.append("]"); + } + + if (isLeaf()) { + buf.append("}"); + } else { + buf.append("\n"); + for (int i = 0; i < level; i++) { + buf.append(" "); + } + buf.append("}"); + } + + return buf.toString(); + } + public String toStringNodeOnly() { if (getTokenInput() != null) { - return ""; } else { @@ -93,7 +164,7 @@ public String toString(int level) { buf.append(" Date: Tue, 10 Jun 2014 19:57:41 -0400 Subject: [PATCH 4/9] Added component to return parsed syntax tree --- .classpath | 4 +- .../solr/handler/component/AqpQueryTree.java | 111 ++++++++++++++++++ .../apache/solr/search/AqpAdsabsQParser.java | 76 ++++++------ .../collection1/conf/solrconfig-qtree.xml | 49 ++++++++ 
.../handler/component/TestAqpQueryTree.java | 86 ++++++++++++++ .../solr/collection1/conf/solrconfig.xml | 20 ++++ 6 files changed, 305 insertions(+), 41 deletions(-) create mode 100644 contrib/adsabs/src/java/org/apache/solr/handler/component/AqpQueryTree.java create mode 100644 contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-qtree.xml create mode 100644 contrib/adsabs/src/test/org/apache/solr/handler/component/TestAqpQueryTree.java diff --git a/.classpath b/.classpath index c8889b713..87ad6b3e6 100644 --- a/.classpath +++ b/.classpath @@ -25,8 +25,8 @@ - - + + diff --git a/contrib/adsabs/src/java/org/apache/solr/handler/component/AqpQueryTree.java b/contrib/adsabs/src/java/org/apache/solr/handler/component/AqpQueryTree.java new file mode 100644 index 000000000..db9e85969 --- /dev/null +++ b/contrib/adsabs/src/java/org/apache/solr/handler/component/AqpQueryTree.java @@ -0,0 +1,111 @@ +package org.apache.solr.handler.component; + +import java.io.IOException; + +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.flexible.aqp.AqpQueryParser; +import org.apache.lucene.queryparser.flexible.aqp.nodes.AqpANTLRNode; +import org.apache.lucene.queryparser.flexible.core.QueryNodeParseException; +import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; +import org.apache.lucene.queryparser.flexible.core.parser.SyntaxParser; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.search.AqpAdsabsQParser; +import org.apache.solr.search.QParser; +import org.apache.solr.search.QParserPlugin; +import org.apache.solr.search.QueryParsing; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Returns the JSON representation of the query syntax tree + * useful for parsing query (providing feedback to the user; + * used by UI Bumblebee) + * + */ +public class AqpQueryTree extends SearchComponent { + + public static final String COMPONENT_NAME = "qtree"; + + @Override + public void prepare(ResponseBuilder rb) throws IOException { + SolrQueryRequest req = rb.req; + SolrParams params = req.getParams(); + if (!params.getBool(COMPONENT_NAME, true)) { + return; + } + SolrQueryResponse rsp = rb.rsp; + + + String defType = params.get(QueryParsing.DEFTYPE,QParserPlugin.DEFAULT_QTYPE); + + // get it from the response builder to give a different component a chance + // to set it. + String queryString = rb.getQueryString(); + if (queryString == null) { + // this is the normal way it's set. 
+ queryString = params.get( CommonParams.Q ); + } + + QParser parser; + try { + parser = QParser.getParser(queryString, defType, req); + + if (parser instanceof AqpAdsabsQParser) { + AqpQueryParser aqpParser = ((AqpAdsabsQParser) parser).getParser(); + SyntaxParser syntaxParser = aqpParser.getSyntaxParser(); + QueryNode queryTree = syntaxParser.parse(queryString, null); + if (queryTree instanceof AqpANTLRNode) { + if (params.get(CommonParams.WT, "json") == "json") { + //System.err.println(((AqpANTLRNode) queryTree).toJson()); + rsp.add("qtree", ((AqpANTLRNode) queryTree).toJson()); + } + else { + //System.err.println(((AqpANTLRNode) queryTree).toString()); + rsp.add("qtree", ((AqpANTLRNode) queryTree).toString()); + } + } + } + + } catch (ParseException e) { + rsp.add("qtreeError", e.getMessage()); + } catch (QueryNodeParseException e) { + rsp.add("qtreeError", e.getMessage()); + } + + + } + + @Override + public void process(ResponseBuilder rb) throws IOException { + // do nothing + } + + @Override + public String getDescription() { + return null; + } + + @Override + public String getSource() { + return null; + } + +} diff --git a/contrib/adsabs/src/java/org/apache/solr/search/AqpAdsabsQParser.java b/contrib/adsabs/src/java/org/apache/solr/search/AqpAdsabsQParser.java index 0ed0437fa..bc4b8618b 100644 --- a/contrib/adsabs/src/java/org/apache/solr/search/AqpAdsabsQParser.java +++ b/contrib/adsabs/src/java/org/apache/solr/search/AqpAdsabsQParser.java @@ -53,7 +53,6 @@ public class AqpAdsabsQParser extends QParser { .getLogger(AqpAdsabsQParser.class); private AqpQueryParser qParser; - private static String adsConfigName = "/ads-config"; public AqpAdsabsQParser(AqpQueryParser parser, String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req, SolrParserConfigParams defaultConfig) @@ -76,34 +75,50 @@ public AqpAdsabsQParser(AqpQueryParser parser, String qstr, SolrParams localPara QueryConfigHandler config = qParser.getQueryConfigHandler(); + // get the 
named parameters from solr request object (they will be passed further on) + Map namedParams = config.get(AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER); + if (params != null) { + for (Entry par: params.toNamedList()) { + String k = par.getKey(); + if (k.startsWith("aqp.")) { + namedParams.put(k, (String) par.getValue()); + } + } + } + if (localParams != null) { + for (Entry par: localParams.toNamedList()) { + String k = par.getKey(); + if (k.startsWith("aqp.")) { + namedParams.put(k, (String) par.getValue()); + } + } + } - AdsConfigHandler extra = (AdsConfigHandler) req.getCore().getRequestHandler(adsConfigName); - - - if (extra == null) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, - "Configuration error, ads-config resource missing"); - } - SolrParams parserConfig = extra.getParams("queryParser"); - qParser.setAnalyzer(schema.getAnalyzer()); String defaultField = getParam(CommonParams.DF); - if (defaultField == null) { - defaultField = parserConfig.get("defaultField", getReq().getSchema().getDefaultSearchFieldName()); + if (defaultField == null && namedParams.containsKey("aqp.defaultField")) { + defaultField = namedParams.get("aqp.defaultField"); } - + //else { + // defaultField = getReq().getSchema().getDefaultSearchFieldName(); + //} if (defaultField != null) { config.set(AqpStandardQueryConfigHandler.ConfigurationKeys.DEFAULT_FIELD, defaultField); } // if defaultField was set, this will be useless - config.set(AqpAdsabsQueryConfigHandler.ConfigurationKeys.UNFIELDED_SEARCH_FIELD, "unfielded_search"); + if (namedParams.containsKey("aqp.unfieldedSearchField")) + config.set(AqpAdsabsQueryConfigHandler.ConfigurationKeys.UNFIELDED_SEARCH_FIELD, namedParams.get("aqp.unfieldedSearchField")); + // default operator String opParam = getParam(QueryParsing.OP); - if (opParam == null) { - opParam = parserConfig.get("defaultOperator", getReq().getSchema().getQueryParserDefaultOperator()); + if (opParam == null && 
namedParams.containsKey("aqp.defaultOperator")) { + opParam = namedParams.get("aqp.defaultOperator"); + } + else { + opParam = getReq().getSchema().getQueryParserDefaultOperator(); } if (opParam != null) { @@ -114,23 +129,24 @@ public AqpAdsabsQParser(AqpQueryParser parser, String qstr, SolrParams localPara "The defaultOperator is set to null"); } + Map fieldMap; - for (String fName: new String[]{"fieldMap", "fieldMapPostAnalysis"}) { - if (fName.equals("fieldMap")) { // booo + for (String fName: new String[]{"aqp.fieldMap", "aqp.fieldMapPostAnalysis"}) { + if (fName.equals("aqp.fieldMap")) { // booo fieldMap = config.get(AqpStandardQueryConfigHandler.ConfigurationKeys.FIELD_MAPPER); } else { fieldMap = config.get(AqpStandardQueryConfigHandler.ConfigurationKeys.FIELD_MAPPER_POST_ANALYSIS); } - if (parserConfig.get(fName, null) != null) { - String[] fields = parserConfig.get(fName).split(";"); + if (namedParams.containsKey(fName)) { + String[] fields = namedParams.get(fName).split(";"); String ffs[]; for (String f: fields) { ffs = f.split("\\s+"); if (ffs.length < 2) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, - "Configuration error inside " + adsConfigName + ", in the section: " + fName); + "Configuration error in the section: " + fName); } String target = ffs[ffs.length-1]; for (int i=0;i namedParams = config.get(AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER); - if (params != null) { - for (Entry par: params.toNamedList()) { - String k = par.getKey(); - if (k.startsWith("aqp.")) { - namedParams.put(k, (String) par.getValue()); - } - } - } - if (localParams != null) { - for (Entry par: localParams.toNamedList()) { - String k = par.getKey(); - if (k.startsWith("aqp.")) { - namedParams.put(k, (String) par.getValue()); - } - } - } if (namedParams.containsKey("aqp.df.fields")) { qParser.setMultiFields(namedParams.get("aqp.df.fields").split(",")); diff --git a/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-qtree.xml 
b/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-qtree.xml new file mode 100644 index 000000000..49574119b --- /dev/null +++ b/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-qtree.xml @@ -0,0 +1,49 @@ + + + + + + + ${tests.luceneMatchVersion:LUCENE_CURRENT} + ${solr.data.dir:} + + + + + explicit + 10 + aqp + + + qtree + + + + + + + + + title^2 abstract^2 body keyword ack + title abstract keyword + + + + + + + diff --git a/contrib/adsabs/src/test/org/apache/solr/handler/component/TestAqpQueryTree.java b/contrib/adsabs/src/test/org/apache/solr/handler/component/TestAqpQueryTree.java new file mode 100644 index 000000000..fde576fd2 --- /dev/null +++ b/contrib/adsabs/src/test/org/apache/solr/handler/component/TestAqpQueryTree.java @@ -0,0 +1,86 @@ +package org.apache.solr.handler.component; + +import java.io.File; +import java.io.IOException; + +import org.apache.solr.request.SolrRequestHandler; + +import monty.solr.util.MontySolrQueryTestCase; +import monty.solr.util.MontySolrSetup; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +public class TestAqpQueryTree extends MontySolrQueryTestCase { + + public String getSchemaFile() { + return MontySolrSetup.getMontySolrHome() + + "/contrib/adsabs/src/test-files/solr/collection1/conf/schema-minimal.xml"; + } + + public String getSolrConfigFile() { + return MontySolrSetup.getMontySolrHome() + + "/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-qtree.xml"; + } + + @Override + public String getSolrHome() { + return MontySolrSetup.getMontySolrHome(); + } + + public void test() throws Exception { + //SolrRequestHandler qtreeHandler = h.getCore().getRequestHandler("/qtree"); + + String s = "{name:\"OPERATOR\", label:\"DEFOP\", children: [" + + " {name:\"MODIFIER\", label:\"MODIFIER\", children: [" + + " {name:\"TMODIFIER\", label:\"TMODIFIER\", children: [" + + " {name:\"FIELD\", label:\"FIELD\", children: [" + + " {name:\"TERM_NORMAL\", input:\"title\", start:0, end:4}," + + " {name:\"QNORMAL\", label:\"QNORMAL\", children: [" + + " {name:\"TERM_NORMAL\", input:\"joe\", start:6, end:8},]" + + " },]" + + " },]" + + " },]" + + " }," + + " {name:\"MODIFIER\", label:\"MODIFIER\", children: [" + + " {name:\"TMODIFIER\", label:\"TMODIFIER\", children: [" + + " {name:\"FIELD\", label:\"FIELD\", children: [" + + " {name:\"QNORMAL\", label:\"QNORMAL\", children: [" + + " {name:\"TERM_NORMAL\", input:\"doe\", start:10, end:12},]" + + " },]" + + " },]" + + " },]" + + " },]" + + "}"; + + s = "\\n{name:\\\"OPERATOR\\\", label:\\\"DEFOP\\\", children: [\\n {name:\\\"MODIFIER\\\", label:\\\"MODIFIER\\\", children: [\\n {name:\\\"TMODIFIER\\\", label:\\\"TMODIFIER\\\", children: [\\n {name:\\\"FIELD\\\", label:\\\"FIELD\\\", children: [\\n {name:\\\"TERM_NORMAL\\\", input:\\\"title\\\", start:0, end:4},\\n {name:\\\"QNORMAL\\\", label:\\\"QNORMAL\\\", children: [\\n {name:\\\"TERM_NORMAL\\\", input:\\\"joe\\\", start:6, end:8},]\\n },]\\n },]\\n },]\\n },\\n {name:\\\"MODIFIER\\\", label:\\\"MODIFIER\\\", children: [\\n {name:\\\"TMODIFIER\\\", 
label:\\\"TMODIFIER\\\", children: [\\n {name:\\\"FIELD\\\", label:\\\"FIELD\\\", children: [\\n {name:\\\"QNORMAL\\\", label:\\\"QNORMAL\\\", children: [\\n {name:\\\"TERM_NORMAL\\\", input:\\\"doe\\\", start:10, end:12},]\\n },]\\n },]\\n },]\\n },]\\n}"; + String response = h.query(req("qt", "/qtree", "q", "title:joe doe", "wt", "json")); + + assert response.contains(s); + + s = "<astOPERATOR label=\"DEFOP\" name=\"OPERATOR\" type=\"35\" >\n" + + " <astMODIFIER label=\"MODIFIER\" name=\"MODIFIER\" type=\"30\" >\n" + + " <astTMODIFIER label=\"TMODIFIER\" name=\"TMODIFIER\" type=\"66\" >\n" + + " <astFIELD label=\"FIELD\" name=\"FIELD\" type=\"19\" >"; + + response = h.query(req("qt", "/qtree", "q", "title:joe doe", "wt", "xml")); + + assert response.contains(s); + + + } +} diff --git a/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml b/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml index 2ad30eec0..2f4263c52 100644 --- a/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml +++ b/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml @@ -862,6 +862,13 @@ simple edismax_combined_aqp + unfielded_search + unfielded_search + AND + arxiv identifier; + pubdate date;author_nosyn author_notrans author_nosyn_notrans author + author^1.5 title^1.4 abstract^1.3 all + + + + + explicit + 10 + aqp + + + qtree + + + From d2fb6195f8989bf9a46da75f91860ea25188751e Mon Sep 17 00:00:00 2001 From: Roman Chyla Date: Wed, 11 Jun 2014 16:20:54 -0400 Subject: [PATCH 5/9] Fixed the configuration of the parser --- .../apache/solr/search/AdsQParserPlugin.java | 7 ++++ .../apache/solr/search/AqpAdsabsQParser.java | 41 +++++++++++++------ .../solr/search/SolrParserConfigParams.java | 2 + .../solr/collection1/conf/solrconfig.xml | 14 +++---- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/contrib/adsabs/src/java/org/apache/solr/search/AdsQParserPlugin.java b/contrib/adsabs/src/java/org/apache/solr/search/AdsQParserPlugin.java index 
370f21500..f64f55253 100644 --- a/contrib/adsabs/src/java/org/apache/solr/search/AdsQParserPlugin.java +++ b/contrib/adsabs/src/java/org/apache/solr/search/AdsQParserPlugin.java @@ -39,6 +39,13 @@ public void init(NamedList args) { } } + for (int i=0; i namedParams = config.get(AqpStandardQueryConfigHandler.ConfigurationKeys.NAMED_PARAMETER); + + // get the parameters from the parser configuration (and pass them on) + for (Entry par: defaultConfig.params.entrySet()) { + String k = par.getKey(); + if (k.startsWith("aqp.")) { + namedParams.put(k, (String) par.getValue()); + } + } + + // get the named parameters from solr request object (they will be passed further on) if (params != null) { for (Entry par: params.toNamedList()) { String k = par.getKey(); @@ -98,29 +108,34 @@ public AqpAdsabsQParser(AqpQueryParser parser, String qstr, SolrParams localPara qParser.setAnalyzer(schema.getAnalyzer()); String defaultField = getParam(CommonParams.DF); - if (defaultField == null && namedParams.containsKey("aqp.defaultField")) { - defaultField = namedParams.get("aqp.defaultField"); + if (defaultField == null) { + if (namedParams.containsKey("aqp.defaultField")) { + defaultField = namedParams.get("aqp.defaultField"); + } + else { + defaultField = getReq().getSchema().getDefaultSearchFieldName(); + } } - //else { - // defaultField = getReq().getSchema().getDefaultSearchFieldName(); - //} + if (defaultField != null) { config.set(AqpStandardQueryConfigHandler.ConfigurationKeys.DEFAULT_FIELD, defaultField); } - + // if defaultField was set, this will be useless if (namedParams.containsKey("aqp.unfieldedSearchField")) config.set(AqpAdsabsQueryConfigHandler.ConfigurationKeys.UNFIELDED_SEARCH_FIELD, namedParams.get("aqp.unfieldedSearchField")); // default operator String opParam = getParam(QueryParsing.OP); - if (opParam == null && namedParams.containsKey("aqp.defaultOperator")) { - opParam = namedParams.get("aqp.defaultOperator"); - } - else { - opParam = 
getReq().getSchema().getQueryParserDefaultOperator(); + if (opParam == null) { + if (namedParams.containsKey("aqp.defaultOperator")) { + opParam = namedParams.get("aqp.defaultOperator"); + } + else { + opParam = getReq().getSchema().getQueryParserDefaultOperator(); + } } - + if (opParam != null) { qParser.setDefaultOperator("AND".equals(opParam.toUpperCase()) ? Operator.AND : Operator.OR); diff --git a/contrib/adsabs/src/java/org/apache/solr/search/SolrParserConfigParams.java b/contrib/adsabs/src/java/org/apache/solr/search/SolrParserConfigParams.java index b0a3048f1..70136686d 100644 --- a/contrib/adsabs/src/java/org/apache/solr/search/SolrParserConfigParams.java +++ b/contrib/adsabs/src/java/org/apache/solr/search/SolrParserConfigParams.java @@ -5,7 +5,9 @@ public class SolrParserConfigParams { public Map> virtualFields; + public HashMap params; public SolrParserConfigParams() { virtualFields = new HashMap>(); + params = new HashMap(); } } diff --git a/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml b/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml index 2f4263c52..d274de679 100644 --- a/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml +++ b/contrib/examples/adsabs/solr/collection1/conf/solrconfig.xml @@ -862,13 +862,6 @@ simple edismax_combined_aqp - unfielded_search - unfielded_search - AND - arxiv identifier; - pubdate date;author_nosyn author_notrans author_nosyn_notrans author - author^1.5 title^1.4 abstract^1.3 all -