Added code to produce tf/idf for wordcloud - this code is here to test performance against term vector component
romanchyla · Apr 8, 2015 · 2e82d23 · 2e82d23
1 parent a45a8d1
commit 2e82d23
Show file tree

Hide file tree

Showing 5 changed files with 351 additions and 9 deletions.
diff --git a/.classpath b/.classpath
@@ -48,7 +48,7 @@
 	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-analyzers-uima-4.8-SNAPSHOT.jar"/>
 	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-benchmark-4.8-SNAPSHOT.jar"/>
 	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-classification-4.8-SNAPSHOT.jar"/>
-	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-codecs-4.8-SNAPSHOT.jar"/>
+	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-codecs-4.8-SNAPSHOT.jar" sourcepath="/solr-next/lucene/memory/src/java"/>
 	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-core-4.8-SNAPSHOT.jar" sourcepath="build/solr-download/apache-solr-4.0.0-SVN/lucene/core/src/java"/>
 	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-demo-4.8-SNAPSHOT.jar"/>
 	<classpathentry kind="lib" path="build/solrjars-extracted/lucene-expressions-4.8-SNAPSHOT.jar"/>
@@ -113,5 +113,6 @@
 	<classpathentry kind="lib" path="contrib/antlrqueryparser/lib/antlr-3.4.jar"/>
 	<classpathentry kind="lib" path="contrib/antlrqueryparser/lib/antlr-runtime-3.4.jar"/>
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/solr-next"/>
 	<classpathentry kind="output" path="bin"/>
 </classpath>
diff --git a/.project b/.project
@@ -1,16 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <projectDescription>
-	<name>montysolr-next</name>
+	<name>apache-solr-48</name>
 	<comment></comment>
 	<projects>
-		<project>solr-next</project>
 	</projects>
 	<buildSpec>
-		<buildCommand>
-			<name>org.python.pydev.PyDevBuilder</name>
-			<arguments>
-			</arguments>
-		</buildCommand>
 		<buildCommand>
 			<name>org.eclipse.jdt.core.javabuilder</name>
 			<arguments>
@@ -19,6 +13,61 @@
 	</buildSpec>
 	<natures>
 		<nature>org.eclipse.jdt.core.javanature</nature>
-		<nature>org.python.pydev.pythonNature</nature>
 	</natures>
+	<filteredResources>
+		<filter>
+			<id>1353353379237</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-lucene/build</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379238</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-solr/build</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379240</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-lucene/dist</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379242</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-solr/package</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379244</id>
+			<name></name>
+			<type>30</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-projectRelativePath-matches-false-false-solr/dist</arguments>
+			</matcher>
+		</filter>
+		<filter>
+			<id>1353353379246</id>
+			<name></name>
+			<type>10</type>
+			<matcher>
+				<id>org.eclipse.ui.ide.multiFilter</id>
+				<arguments>1.0-name-matches-false-false-.svn</arguments>
+			</matcher>
+		</filter>
+	</filteredResources>
 </projectDescription>
diff --git a/contrib/adsabs/src/java/org/apache/solr/handler/component/WordCloudComponent.java b/contrib/adsabs/src/java/org/apache/solr/handler/component/WordCloudComponent.java
@@ -0,0 +1,186 @@
+package org.apache.solr.handler.component;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.common.params.SolrParams;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.transform.DocTransformer;
+import org.apache.solr.response.transform.TransformContext;
+import org.apache.solr.schema.FieldType;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
+import org.apache.solr.search.DocIterator;
+import org.apache.solr.search.DocList;
+import org.apache.solr.search.SolrIndexSearcher;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Search component that collects, for the documents of the current result
+ * list, the tokens of selected stored fields together with their term
+ * frequency (TF) and inverse document frequency (IDF).
+ *
+ * <p>The component is active unless the request sets {@code wordcloud=false};
+ * the fields to inspect are given as a comma separated list in
+ * {@code wordcloud.fl}. Fields that are unknown to the schema or that are not
+ * stored are silently ignored. The output is added to the response under the
+ * key {@code wordcloud} as {@code field -> {tf: {token: count}, idf: {token: value}}}.
+ */
+public class WordCloudComponent extends SearchComponent {
+
+  public static final String COMPONENT_NAME = "wordcloud";
+
+  /**
+   * Nothing has to be prepared; all the work is done in
+   * {@link #process(ResponseBuilder)} once the result list is available.
+   */
+  @Override
+  public void prepare(ResponseBuilder rb) throws IOException {
+    // intentionally empty
+  }
+
+  /**
+   * Tokenizes the stored values of the requested fields for every document in
+   * the result list, counts the tokens and looks up their document frequency
+   * in the index.
+   *
+   * @param rb the response builder of the current request
+   * @throws IOException if reading the index or analyzing a value fails
+   */
+  @Override
+  public void process(ResponseBuilder rb) throws IOException {
+    SolrQueryRequest req = rb.req;
+    SolrParams params = req.getParams();
+    if (!params.getBool(COMPONENT_NAME, true)) {
+      return;
+    }
+
+    String wcFields = params.get("wordcloud.fl", null);
+    if (wcFields == null) {
+      return;
+    }
+
+    // No result list (e.g. no preceding component produced one): nothing to analyze.
+    if (rb.getResults() == null || rb.getResults().docList == null) {
+      return;
+    }
+    DocList ids = rb.getResults().docList;
+
+    SolrIndexSearcher searcher = req.getSearcher();
+    IndexSchema schema = req.getCore().getLatestSchema();
+    Analyzer analyzer = schema.getAnalyzer();
+
+    Map<String, FieldType> fieldsToLoad = collectStoredFields(schema, wcFields);
+    Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();
+    for (String name : fieldsToLoad.keySet()) {
+      tokens.put(name, new HashMap<String, Integer>());
+    }
+
+    DocIterator iterator = ids.iterator();
+    while (iterator.hasNext()) {
+      int id = iterator.nextDoc();
+      Document doc = searcher.doc(id, fieldsToLoad.keySet());
+      for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
+        String name = en.getKey();
+        String[] vals = doc.getValues(name);
+        if (vals == null) {
+          continue;
+        }
+        for (String s : vals) {
+          countTokens(analyzer, name, en.getValue().indexedToReadable(s), tokens.get(name));
+        }
+      }
+    }
+
+    // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)
+
+    AtomicReader reader = searcher.getAtomicReader();
+    HashMap<String, Object> ret = new HashMap<String, Object>();
+    for (String name : fieldsToLoad.keySet()) {
+      Map<String, Integer> counts = tokens.get(name);
+      HashMap<String, Object> va = new HashMap<String, Object>();
+      va.put("tf", counts);
+      va.put("idf", computeIdf(reader, name, counts.keySet()));
+      ret.put(name, va);
+    }
+    rb.rsp.add(COMPONENT_NAME, ret);
+  }
+
+  /**
+   * Resolves the comma separated field list against the schema, keeping only
+   * fields that exist and are stored.
+   */
+  private static Map<String, FieldType> collectStoredFields(IndexSchema schema, String fieldList) {
+    Map<String, FieldType> stored = new HashMap<String, FieldType>();
+    Set<String> requested = new HashSet<String>(StrUtils.splitSmart(fieldList, ','));
+    for (String name : requested) {
+      SchemaField field = schema.getFieldOrNull(name);
+      if (field == null || !field.stored()) {
+        continue; // ignore this field
+      }
+      stored.put(name, field.getType());
+    }
+    return stored;
+  }
+
+  /**
+   * Runs {@code text} through the analyzer of {@code fieldName} and increments
+   * the counter of every produced token in {@code counts}.
+   */
+  private static void countTokens(Analyzer analyzer, String fieldName, String text,
+      Map<String, Integer> counts) throws IOException {
+    TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
+    try {
+      if (!stream.hasAttribute(CharTermAttribute.class)) {
+        return; // empty stream
+      }
+      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
+      stream.reset();
+      while (stream.incrementToken()) {
+        String word = termAtt.toString();
+        Integer seen = counts.get(word);
+        counts.put(word, seen == null ? 1 : seen + 1);
+      }
+      stream.end();
+    } finally {
+      // Always close: analyzers reuse their streams, and an unclosed stream
+      // makes the next tokenStream() call for the same field fail.
+      stream.close();
+    }
+  }
+
+  /**
+   * Computes {@code log10(N / df)} for every term that occurs in the index,
+   * where {@code N} is the number of documents having the field and
+   * {@code df} the document frequency of the term. Terms with {@code df == 0}
+   * are left out.
+   */
+  private static Map<String, Double> computeIdf(AtomicReader reader, String fieldName,
+      Set<String> terms) throws IOException {
+    Map<String, Double> idfs = new HashMap<String, Double>();
+    int docCount = reader.getDocCount(fieldName);
+    if (docCount < 0) {
+      // The codec does not record the per-field document count.
+      docCount = reader.maxDoc();
+    }
+    for (String word : terms) {
+      int df = reader.docFreq(new Term(fieldName, new BytesRef(word)));
+      if (df > 0) {
+        // Floating point division: N/df on ints would truncate the ratio.
+        idfs.put(word, Math.log10((double) docCount / df));
+      }
+    }
+    return idfs;
+  }
+
+  @Override
+  public String getDescription() {
+    return "return tokens with TF and IDF for wordcloud";
+  }
+
+  @Override
+  public String getSource() {
+    return null;
+  }
+
+}
diff --git a/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-wordcloud.xml b/contrib/adsabs/src/test-files/solr/collection1/conf/solrconfig-wordcloud.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" ?>
+
+
+<config>
+  <luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
+  <dataDir>${solr.data.dir:}</dataDir>
+  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
+
+  <searchComponent name="wordcloud" class="solr.WordCloudComponent"/>
+
+  <requestHandler name="standard" class="solr.StandardRequestHandler">
+    <arr name="last-components">
+      <str>wordcloud</str>
+    </arr>
+  </requestHandler>
+
+</config>
diff --git a/contrib/adsabs/src/test/org/apache/solr/handler/component/TestWordCloudComponent.java b/contrib/adsabs/src/test/org/apache/solr/handler/component/TestWordCloudComponent.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.component;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.List;
+
+import monty.solr.util.MontySolrAbstractTestCase;
+import monty.solr.util.MontySolrSetup;
+
+import org.apache.solr.common.util.ContentStream;
+import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.common.util.NamedList;
+import org.apache.solr.request.SolrQueryRequestBase;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Checks that the wordcloud component returns token counts (tf) and inverse
+ * document frequencies (idf) for the requested fields.
+ */
+public class TestWordCloudComponent extends MontySolrAbstractTestCase {
+
+  /** Makes the test configuration visible and starts a core with the wordcloud config. */
+  @BeforeClass
+  public static void beforeClass() throws Exception {
+    final String testConfDir = MontySolrSetup.getMontySolrHome()
+        + "/contrib/adsabs/src/test-files/solr/collection1/conf";
+    final String exampleSolrDir = MontySolrSetup.getSolrHome() + "/example/solr";
+
+    makeResourcesVisible(
+        Thread.currentThread().getContextClassLoader(),
+        new String[] { testConfDir, exampleSolrDir + "/collection1/conf" });
+
+    System.setProperty("solr.allow.unsafe.resourceloading", "true");
+    schemaString = testConfDir + "/" + "schema-minimal.xml";
+    configString = testConfDir + "/" + "solrconfig-wordcloud.xml";
+
+    initCore(configString, schemaString, exampleSolrDir);
+  }
+
+  /** Adds one document whose id and recid are {@code id} and whose text is {@code text}. */
+  private void indexText(String id, String text) {
+    assertU(adoc("id", id, "recid", id, "text", text));
+  }
+
+  /** Indexes ten small documents in two separately committed batches. */
+  public void createIndex() {
+    indexText("1", "who");
+    indexText("2", "is stopword");
+    indexText("3", "able");
+    indexText("4", "to stopword");
+    indexText("5", "exchange");
+    assertU(commit("waitSearcher", "true"));
+
+    indexText("16", "liberty");
+    indexText("17", "for stopword");
+    indexText("18", "safety");
+    indexText("19", "deserves");
+    indexText("20", "neither who");
+    assertU(commit("waitSearcher", "true"));
+  }
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    createIndex();
+  }
+
+  /** The token "liberty" must be reported once, with an idf of 1.0. */
+  @Test
+  public void test() throws IOException, Exception {
+    final String textNode = "//lst[@name='wordcloud']/lst[@name='text']";
+
+    assertQ(
+        req("q", "*:*", "wordcloud", "true", "wordcloud.fl", "id,text", "indent", "true"),
+        textNode + "/lst[@name='tf']/int[@name='liberty']='1'",
+        textNode + "/lst[@name='idf']/double[@name='liberty']='1.0'");
+  }
+}
+
+