Skip to content

Commit

Permalink
Added code to produce tf/idf for wordcloud - this code is here to tes…
Browse files Browse the repository at this point in the history
…t performance against term vector component
  • Loading branch information
romanchyla committed Apr 8, 2015
1 parent a45a8d1 commit 2e82d23
Show file tree
Hide file tree
Showing 5 changed files with 351 additions and 9 deletions.
3 changes: 2 additions & 1 deletion .classpath
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-analyzers-uima-4.8-SNAPSHOT.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-benchmark-4.8-SNAPSHOT.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-classification-4.8-SNAPSHOT.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-codecs-4.8-SNAPSHOT.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-codecs-4.8-SNAPSHOT.jar" sourcepath="/solr-next/lucene/memory/src/java"/>
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-core-4.8-SNAPSHOT.jar" sourcepath="build/solr-download/apache-solr-4.0.0-SVN/lucene/core/src/java"/>
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-demo-4.8-SNAPSHOT.jar"/>
<classpathentry kind="lib" path="build/solrjars-extracted/lucene-expressions-4.8-SNAPSHOT.jar"/>
Expand Down Expand Up @@ -113,5 +113,6 @@
<classpathentry kind="lib" path="contrib/antlrqueryparser/lib/antlr-3.4.jar"/>
<classpathentry kind="lib" path="contrib/antlrqueryparser/lib/antlr-runtime-3.4.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry combineaccessrules="false" kind="src" path="/solr-next"/>
<classpathentry kind="output" path="bin"/>
</classpath>
65 changes: 57 additions & 8 deletions .project
Original file line number Diff line number Diff line change
@@ -1,16 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>montysolr-next</name>
<name>apache-solr-48</name>
<comment></comment>
<projects>
<project>solr-next</project>
</projects>
<buildSpec>
<buildCommand>
<name>org.python.pydev.PyDevBuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
Expand All @@ -19,6 +13,61 @@
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.python.pydev.pythonNature</nature>
</natures>
<filteredResources>
<filter>
<id>1353353379237</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-projectRelativePath-matches-false-false-lucene/build</arguments>
</matcher>
</filter>
<filter>
<id>1353353379238</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-projectRelativePath-matches-false-false-solr/build</arguments>
</matcher>
</filter>
<filter>
<id>1353353379240</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-projectRelativePath-matches-false-false-lucene/dist</arguments>
</matcher>
</filter>
<filter>
<id>1353353379242</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-projectRelativePath-matches-false-false-solr/package</arguments>
</matcher>
</filter>
<filter>
<id>1353353379244</id>
<name></name>
<type>30</type>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-projectRelativePath-matches-false-false-solr/dist</arguments>
</matcher>
</filter>
<filter>
<id>1353353379246</id>
<name></name>
<type>10</type>
<matcher>
<id>org.eclipse.ui.ide.multiFilter</id>
<arguments>1.0-name-matches-false-false-.svn</arguments>
</matcher>
</filter>
</filteredResources>
</projectDescription>
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package org.apache.solr.handler.component;

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.NumericTokenStream.NumericTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.transform.DocTransformer;
import org.apache.solr.response.transform.TransformContext;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
 * Search component that produces raw material for a word cloud.
 *
 * <p>For every document in the current result list
 * ({@code rb.getResults().docList}) the stored values of the fields named in
 * the {@code wordcloud.fl} request parameter are re-analyzed with the schema's
 * index analyzer. The response receives a {@code "wordcloud"} entry that maps
 * each field to two maps:
 * <ul>
 *   <li>{@code "tf"}  - token -> number of occurrences in the analyzed values</li>
 *   <li>{@code "idf"} - token -> {@code log10(docCount / docFreq)}; tokens whose
 *       document frequency in the index is 0 are omitted</li>
 * </ul>
 *
 * <p>The component is enabled unless the request sets {@code wordcloud=false},
 * and does nothing when {@code wordcloud.fl} is absent. Fields that are unknown
 * to the schema or not stored are silently ignored.
 */
public class WordCloudComponent extends SearchComponent {

  public static final String COMPONENT_NAME = "wordcloud";

  /**
   * No preparation is required; all work happens in
   * {@link #process(ResponseBuilder)}.
   */
  @Override
  public void prepare(ResponseBuilder rb) throws IOException {
    // intentionally empty
  }

  /**
   * Collects term frequencies from the stored field values of the result
   * documents, looks up the inverse document frequency of every collected
   * token and adds both to the response under {@code "wordcloud"}.
   *
   * @param rb the response builder of the current request
   * @throws IOException if reading documents, analyzing text or reading index
   *         statistics fails
   */
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrQueryRequest req = rb.req;
    SolrParams params = req.getParams();
    if (!params.getBool(COMPONENT_NAME, true)) {
      return;
    }

    String wcFields = params.get("wordcloud.fl", null);
    if (wcFields == null) {
      return;
    }

    // No result list (e.g. the query component did not run): nothing to analyze.
    if (rb.getResults() == null || rb.getResults().docList == null) {
      return;
    }
    DocList ids = rb.getResults().docList;

    SolrIndexSearcher searcher = req.getSearcher();
    // Use one schema snapshot for both the field lookup and the analyzer.
    IndexSchema schema = req.getCore().getLatestSchema();
    Analyzer analyzer = schema.getAnalyzer();

    Map<String, FieldType> fieldsToLoad = new HashMap<String, FieldType>();
    Map<String, Map<String, Integer>> tokens = new HashMap<String, Map<String, Integer>>();

    Set<String> requested = new HashSet<String>(StrUtils.splitSmart(wcFields, ','));
    for (String name : requested) {
      SchemaField field = schema.getFieldOrNull(name);
      if (field == null || !field.stored()) {
        continue; // ignore this field
      }
      fieldsToLoad.put(name, field.getType());
      tokens.put(name, new HashMap<String, Integer>());
    }

    if (!fieldsToLoad.isEmpty()) {
      DocIterator iterator = ids.iterator();
      while (iterator.hasNext()) {
        Document doc = searcher.doc(iterator.nextDoc(), fieldsToLoad.keySet());
        for (Entry<String, FieldType> en : fieldsToLoad.entrySet()) {
          String fieldName = en.getKey();
          String[] vals = doc.getValues(fieldName);
          if (vals == null) {
            continue;
          }
          // NOTE(review): values come from stored fields; indexedToReadable is
          // kept from the original code - confirm it is the intended conversion.
          for (String s : vals) {
            countTokens(analyzer, fieldName, en.getValue().indexedToReadable(s),
                tokens.get(fieldName));
          }
        }
      }
    }

    // TODO: filter out the tokens (use some sort of a range 0.1-0.9 by frequency)

    AtomicReader reader = searcher.getAtomicReader();
    Map<String, Object> ret = new HashMap<String, Object>();
    for (Entry<String, Map<String, Integer>> field : tokens.entrySet()) {
      Map<String, Object> va = new HashMap<String, Object>();
      va.put("tf", field.getValue());
      va.put("idf", computeIdf(reader, field.getKey(), field.getValue().keySet()));
      ret.put(field.getKey(), va);
    }
    rb.rsp.add(COMPONENT_NAME, ret);
  }

  /**
   * Analyzes {@code text} as a value of {@code fieldName} and increments the
   * counter of every produced token in {@code counts}.
   *
   * <p>The stream is always closed, even when it yields no character terms or
   * when analysis fails, so that the analyzer's reusable components stay usable
   * for the next value.
   */
  private static void countTokens(Analyzer analyzer, String fieldName, String text,
      Map<String, Integer> counts) throws IOException {
    TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
      if (!stream.hasAttribute(CharTermAttribute.class)) {
        return; // stream does not expose character terms
      }
      CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        String token = termAtt.toString();
        Integer seen = counts.get(token);
        counts.put(token, seen == null ? 1 : seen + 1);
      }
      stream.end();
    } finally {
      stream.close();
    }
  }

  /**
   * Computes {@code log10(docCount / docFreq)} for every term that occurs in
   * the index for the given field. Terms with a document frequency of 0 are
   * left out of the returned map.
   */
  private static Map<String, Double> computeIdf(AtomicReader reader, String field,
      Set<String> terms) throws IOException {
    Map<String, Double> idfs = new HashMap<String, Double>();

    int docCount = reader.getDocCount(field);
    if (docCount < 0) {
      // The codec does not record per-field doc counts; fall back to the
      // index size so the logarithm stays defined.
      docCount = reader.maxDoc();
    }

    for (String term : terms) {
      int df = reader.docFreq(new Term(field, new BytesRef(term)));
      if (df != 0) {
        // Floating-point division: integer division would truncate the ratio
        // before the logarithm is taken.
        idfs.put(term, Math.log10((double) docCount / df));
      }
    }
    return idfs;
  }

  @Override
  public String getDescription() {
    return "return tokens with TF and IDF for wordcloud";
  }

  @Override
  public String getSource() {
    return null;
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" ?>


<!--
Minimal Solr configuration that wires the word cloud search component into the
"standard" request handler. Presumably this is the solrconfig-wordcloud.xml
loaded by TestWordCloudComponent; confirm against the test setup.
-->
<config>
<!-- Lucene compatibility version; falls back to LUCENE_CURRENT when the
tests.luceneMatchVersion property is not set. -->
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
<!-- Index data directory; empty default when solr.data.dir is not set. -->
<dataDir>${solr.data.dir:}</dataDir>
<!-- Directory implementation; defaults to solr.RAMDirectoryFactory when
solr.directoryFactory is not set. -->
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>

<!-- Registers the component under the name referenced by the handler below. -->
<searchComponent name="wordcloud" class="solr.WordCloudComponent"/>

<!-- Adds "wordcloud" to the handler's last-components list. -->
<requestHandler name="standard" class="solr.StandardRequestHandler">
<arr name="last-components">
<str>wordcloud</str>
</arr>
</requestHandler>

</config>
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

import monty.solr.util.MontySolrAbstractTestCase;
import monty.solr.util.MontySolrSetup;

import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequestBase;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Checks that the word cloud component reports term frequency and inverse
 * document frequency for tokens of the {@code text} field.
 */
public class TestWordCloudComponent extends MontySolrAbstractTestCase {

  /** Location of the test configuration files, relative to the MontySolr home. */
  private static final String TEST_CONF_DIR =
      "/contrib/adsabs/src/test-files/solr/collection1/conf";

  /** Points the test core at the minimal schema and the word cloud config. */
  @BeforeClass
  public static void beforeClass() throws Exception {

    String[] resourceDirs = new String[] {
        MontySolrSetup.getMontySolrHome() + TEST_CONF_DIR,
        MontySolrSetup.getSolrHome() + "/example/solr/collection1/conf"
    };
    makeResourcesVisible(Thread.currentThread().getContextClassLoader(), resourceDirs);

    System.setProperty("solr.allow.unsafe.resourceloading", "true");

    schemaString = MontySolrSetup.getMontySolrHome() + TEST_CONF_DIR + "/" + "schema-minimal.xml";
    configString = MontySolrSetup.getMontySolrHome() + TEST_CONF_DIR + "/" + "solrconfig-wordcloud.xml";

    String solrHome = MontySolrSetup.getSolrHome() + "/example/solr";
    initCore(configString, schemaString, solrHome);
  }

  /** Indexes ten small documents in two batches, committing after each batch. */
  public void createIndex() {
    indexBatch(new String[][] {
        {"1", "who"},
        {"2", "is stopword"},
        {"3", "able"},
        {"4", "to stopword"},
        {"5", "exchange"},
    });

    indexBatch(new String[][] {
        {"16", "liberty"},
        {"17", "for stopword"},
        {"18", "safety"},
        {"19", "deserves"},
        {"20", "neither who"},
    });
  }

  /**
   * Adds one document per {id, text} pair (the id doubles as recid) and then
   * commits, waiting for the new searcher.
   */
  private void indexBatch(String[][] idAndText) {
    for (String[] pair : idAndText) {
      assertU(adoc("id", pair[0], "recid", pair[0], "text", pair[1]));
    }
    assertU(commit("waitSearcher", "true"));
  }

  @Override
  public void setUp() throws Exception {
    super.setUp();
    createIndex();
  }

  /** "liberty" occurs once in one of ten documents: tf = 1, idf = log10(10) = 1.0. */
  @Test
  public void test() throws IOException, Exception {

    final String textCloud = "//lst[@name='wordcloud']/lst[@name='text']";

    assertQ(req("q", "*:*", "wordcloud", "true", "wordcloud.fl", "id,text", "indent", "true"),
        textCloud + "/lst[@name='tf']/int[@name='liberty']='1'",
        textCloud + "/lst[@name='idf']/double[@name='liberty']='1.0'"
    );

  }
}


0 comments on commit 2e82d23

Please sign in to comment.