From 2e82d234831a42281faad02f2b08d144eb4cfc92 Mon Sep 17 00:00:00 2001
From: Roman Chyla
Date: Wed, 8 Apr 2015 19:12:25 -0400
Subject: [PATCH] Added code to produce tf/idf for wordcloud

- this code is here to test performance against term vector component
---
 .classpath                                         |   3 +-
 .project                                           |  65 +++++-
 .../handler/component/WordCloudComponent.java      | 186 ++++++++++++++++++
 .../collection1/conf/solrconfig-wordcloud.xml      |  17 ++
 .../component/TestWordCloudComponent.java          |  89 +++++++++
 5 files changed, 351 insertions(+), 9 deletions(-)
 create mode 100644 contrib/adsabs/src/java/org/apache/solr/handler/component/WordCloudComponent.java
 create mode 100644 contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-wordcloud.xml
 create mode 100644 contrib/adsabs/src/test/org/apache/solr/handler/component/TestWordCloudComponent.java

diff --git a/.classpath b/.classpath
index 1fef21005..d5312e399 100644
--- a/.classpath
+++ b/.classpath
@@ -48,7 +48,7 @@
-
+
@@ -113,5 +113,6 @@
+
diff --git a/.project b/.project
index 005ef8e1e..950db9624 100644
--- a/.project
+++ b/.project
@@ -1,16 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <projectDescription>
-	<name>montysolr-next</name>
+	<name>apache-solr-48</name>
 	<comment></comment>
 	<projects>
-		<project>solr-next</project>
 	</projects>
 	<buildSpec>
-		<buildCommand>
-			<name>org.python.pydev.PyDevBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
 		<buildCommand>
 			<name>org.eclipse.jdt.core.javabuilder</name>
 			<arguments>
@@ -19,6 +13,61 @@
 	</buildSpec>
 	<natures>
 		<nature>org.eclipse.jdt.core.javanature</nature>
-		<nature>org.python.pydev.pythonNature</nature>
 	</natures>
+	<filteredResources>
+		<filter>
+			<id>1353353379237</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-lucene/build</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379238</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-solr/build</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379240</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-lucene/dist</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379242</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-solr/package</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379244</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-solr/dist</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379246</id>
+			<name></name>
+			<type>10</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-name-matches-false-false-.svn</arguments>
+			</matcher>
+		</filter>
+	</filteredResources>
 </projectDescription>
diff --git a/contrib/adsabs/src/java/org/apache/solr/handler/component/WordCloudComponent.java b/contrib/adsabs/src/java/org/apache/solr/handler/component/WordCloudComponent.java
new file mode 100644
index 000000000..36d1e0af2
--- /dev/null
+++ b/contrib/adsabs/src/java/org/apache/solr/handler/component/WordCloudComponent.java
@@ -0,0 +1,186 @@
+package org.apache.solr.handler.component;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.transform.DocTransformer;
+import org.apache.solr.response.transform.TransformContext;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.SolrIndexSearcher;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class WordCloudComponent extends SearchComponent {
+
+  public static final String COMPONENT_NAME = "wordcloud";
+
+  @Override
+  public void prepare(ResponseBuilder rb) throws IOException {
+    SolrQueryRequest req = rb.req;
+    SolrParams params = req.getParams();
+    if (!params.getBool(COMPONENT_NAME, true)) {
+      return;
+    }
+
+    Query query = rb.getQuery();
+    if (query == null) return;
+
+  }
+
+  @Override
+  public void process(ResponseBuilder rb) throws IOException {
+    SolrQueryRequest req = rb.req;
+    SolrParams params = req.getParams();
+    if (!params.getBool(COMPONENT_NAME, true)) {
+      return;
+    }
+
+    String wcFields = null;
+    if ((wcFields = params.get("wordcloud.fl", null)) == null) {
+      return;
+    }
+
+    Set<String> flds = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
+    DocList ids = rb.getResults().docList;
+
+    SolrIndexSearcher searcher = rb.req.getSearcher();
+    IndexSchema schema = rb.req.getCore().getLatestSchema();
+
+    final Analyzer analyzer = rb.req.getCore().getLatestSchema().getAnalyzer();
+    final HashMap<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();
+
+    CharTermAttribute termAtt;
+    Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();
+
+    for (String f : flds) {
+      SchemaField field = schema.getFieldOrNull(f);
+      if (field == null || !field.stored()) {
+        continue; // ignore this field
+      }
+      fieldsToLoad.put(f, field.getType());
+      tokens.put(f, new HashMap<String, Integer>());
+    }
+
+    DocIterator iterator = ids.iterator();
+    String w; Integer v;
+    int sz = ids.size();
+    for (int i = 0; i < sz; i++) {
+      int id = iterator.nextDoc();
+      Document doc = searcher.doc(id, fieldsToLoad.keySet());
+
+      for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
+        Map<String, Integer> toks = tokens.get(en.getKey());
+        String[] vals = doc.getValues(en.getKey());
+        FieldType fType = en.getValue();
+
+        if (vals != null) {
+          for (String s : vals) {
+            TokenStream buffer = analyzer.tokenStream(en.getKey(), new StringReader(fType.indexedToReadable(s)));
+
+            if (!buffer.hasAttribute(CharTermAttribute.class)) {
+              continue; // empty stream
+            }
+
+            termAtt = buffer.getAttribute(CharTermAttribute.class);
+            buffer.reset();
+
+            while (buffer.incrementToken()) {
+              w = termAtt.toString();
+              v = toks.get(w);
+              if (v == null) v = 0;
+              toks.put(w, ++v);
+            }
+
+            buffer.close();
+          }
+        }
+      }
+    }
+
+    // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)
+
+    AtomicReader reader = searcher.getAtomicReader();
+    BytesRef term;
+    int df;
+    String f;
+
+    Map<String, Map<String, Double>> docFreqs = new HashMap<String, Map<String, Double>>();
+    for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
+      HashMap<String, Double> idfs = new HashMap<String, Double>();
+      f = field.getKey();
+      docFreqs.put(f, idfs);
+      int N = reader.getDocCount(f);
+
+      for (Entry<String, Integer> token : field.getValue().entrySet()) {
+        w = token.getKey();
+        df = reader.docFreq(new Term(f, new BytesRef(w)));
+        if (df != 0) {
+          idfs.put(w, Math.log10(N / df));
+        }
+      }
+    }
+
+    HashMap<String, Object> ret = new HashMap<String, Object>();
+    for (String fi : fieldsToLoad.keySet()) {
+      HashMap<String, Object> va = new HashMap<String, Object>();
+      va.put("tf", tokens.get(fi));
+      va.put("idf", docFreqs.get(fi));
+      ret.put(fi, va);
+    }
+    rb.rsp.add("wordcloud", ret);
+
+  }
+
+  @Override
+  public String getDescription() {
+    return "return tokens with TF and IDF for wordcloud";
+  }
+
+  @Override
+  public String getSource() {
+    return null;
+  }
+
+}
diff --git a/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-wordcloud.xml b/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-wordcloud.xml
new file mode 100644
index 000000000..d7a3cd34f
--- /dev/null
+++ b/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-wordcloud.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<config>
+
+  <luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
+  <dataDir>${solr.data.dir:}</dataDir>
+
+  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
+
+  <searchComponent name="wordcloud" class="org.apache.solr.handler.component.WordCloudComponent"/>
+
+  <requestHandler name="standard" class="solr.StandardRequestHandler" default="true">
+    <arr name="last-components">
+      <str>wordcloud</str>
+    </arr>
+  </requestHandler>
+
+</config>
diff --git a/contrib/adsabs/src/test/org/apache/solr/handler/component/TestWordCloudComponent.java b/contrib/adsabs/src/test/org/apache/solr/handler/component/TestWordCloudComponent.java
new file mode 100644
index 000000000..5979326db
--- /dev/null
+++ b/contrib/adsabs/src/test/org/apache/solr/handler/component/TestWordCloudComponent.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.component;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+
+import monty.solr.util.MontySolrAbstractTestCase;
+import monty.solr.util.MontySolrSetup;
+
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequestBase;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TestWordCloudComponent extends MontySolrAbstractTestCase {
+
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+
+    makeResourcesVisible(Thread.currentThread().getContextClassLoader(), new String[] {
+        MontySolrSetup.getMontySolrHome() + "/contrib/adsabs/src/test-files/solr/collection1/conf",
+        MontySolrSetup.getSolrHome() + "/example/solr/collection1/conf"
+    });
+
+    System.setProperty("solr.allow.unsafe.resourceloading", "true");
+
+    schemaString = MontySolrSetup.getMontySolrHome() + "/contrib/adsabs/src/test-files/solr/collection1/conf/"
+        + "schema-minimal.xml";
+
+    configString = MontySolrSetup.getMontySolrHome() + "/contrib/adsabs/src/test-files/solr/collection1/conf/"
+        + "solrconfig-wordcloud.xml";
+
+    initCore(configString, schemaString, MontySolrSetup.getSolrHome()
+        + "/example/solr");
+  }
+
+  public void createIndex() {
+    assertU(adoc("id","1","recid","1", "text", "who"));
+    assertU(adoc("id","2","recid","2", "text", "is stopword"));
+    assertU(adoc("id","3","recid","3", "text", "able"));
+    assertU(adoc("id","4","recid","4", "text", "to stopword"));
+    assertU(adoc("id","5","recid","5", "text", "exchange"));
+    assertU(commit("waitSearcher", "true"));
+
+    assertU(adoc("id","16","recid","16", "text", "liberty"));
+    assertU(adoc("id","17","recid","17", "text", "for stopword"));
+    assertU(adoc("id","18","recid","18", "text", "safety"));
+    assertU(adoc("id","19","recid","19", "text", "deserves"));
+    assertU(adoc("id","20","recid","20", "text", "neither who"));
+    assertU(commit("waitSearcher", "true"));
+  }
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    createIndex();
+  }
+
+  @Test
+  public void test() throws IOException, Exception {
+
+    assertQ(req("q", "*:*", "wordcloud", "true", "wordcloud.fl", "id,text", "indent", "true"),
+        "//lst[@name='wordcloud']/lst[@name='text']/lst[@name='tf']/int[@name='liberty']='1'",
+        "//lst[@name='wordcloud']/lst[@name='text']/lst[@name='idf']/double[@name='liberty']='1.0'"
+    );
+
+  }
+}
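
Usage note (not part of the patch): once the component is registered as a last-component on a request handler, as in the test solrconfig above, it is driven entirely by the "wordcloud" and "wordcloud.fl" request parameters. The SolrJ sketch below shows one way a client might call it against a Solr 4.x core; the core URL, collection name, and class name are illustrative assumptions, and the response object is printed raw because its exact SolrJ shape depends on the response writer.

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.impl.HttpSolrServer;
    import org.apache.solr.client.solrj.response.QueryResponse;

    public class WordCloudQueryExample {
      public static void main(String[] args) throws Exception {
        // Hypothetical core URL; adjust to the local setup.
        HttpSolrServer solr = new HttpSolrServer("http://localhost:8983/solr/collection1");

        SolrQuery q = new SolrQuery("*:*");
        q.set("wordcloud", true);       // run the component for this request
        q.set("wordcloud.fl", "text");  // comma-separated stored fields to analyze

        QueryResponse rsp = solr.query(q);
        // The component adds a "wordcloud" section: field -> { "tf" -> term counts, "idf" -> log10(N/df) }
        System.out.println(rsp.getResponse().get("wordcloud"));

        solr.shutdown();
      }
    }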