Solr WordDelimiterFilter + Lucene Highlighter
- by Lucas
I am trying to get the Highlighter class from Lucene to work properly with tokens coming from Solr's WordDelimiterFilter. It works 90% of the time, but if the matching text contains a ',' such as "1,500" the output is incorrect:
Expected: 'test 1,500 this'
Observed: 'test 11,500 this'
I am not currently sure whether it is Highlighter messing up the recombination or WordDelimiterFilter messing up the tokenization but something is unhappy. Here are the relevant dependencies from my pom:
org.apache.lucene
lucene-core
2.9.3
jar
compile
org.apache.lucene
lucene-highlighter
2.9.3
jar
compile
org.apache.solr
solr-core
1.4.0
jar
compile
And here is a simple JUnit test class demonstrating the problem:
package test.lucene;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.StandardTokenizerFactory;
import org.apache.solr.analysis.WordDelimiterFilterFactory;
import org.junit.Test;
public class HighlighterTester {
private static final String PRE_TAG = "<b>";
private static final String POST_TAG = "</b>";
private static String[] highlightField( Query query, String fieldName, String text )
throws IOException, InvalidTokenOffsetsException {
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter( PRE_TAG, POST_TAG );
Highlighter highlighter = new Highlighter( formatter, new QueryScorer( query, fieldName ) );
highlighter.setTextFragmenter( new SimpleFragmenter( Integer.MAX_VALUE ) );
return highlighter.getBestFragments( getAnalyzer(), fieldName, text, 10 );
}
private static Analyzer getAnalyzer() {
return new Analyzer() {
@Override
public TokenStream tokenStream( String fieldName, Reader reader ) {
// Start with a StandardTokenizer
TokenStream stream = new StandardTokenizerFactory().create( reader );
// Chain on a WordDelimiterFilter
WordDelimiterFilterFactory wordDelimiterFilterFactory = new WordDelimiterFilterFactory();
HashMap<String, String> arguments = new HashMap<String, String>();
arguments.put( "generateWordParts", "1" );
arguments.put( "generateNumberParts", "1" );
arguments.put( "catenateWords", "1" );
arguments.put( "catenateNumbers", "1" );
arguments.put( "catenateAll", "0" );
wordDelimiterFilterFactory.init( arguments );
return wordDelimiterFilterFactory.create( stream );
}
};
}
@Test
public void TestHighlighter() throws ParseException, IOException, InvalidTokenOffsetsException {
String fieldName = "text";
String text = "test 1,500 this";
String queryString = "1500";
String expected = "test " + PRE_TAG + "1,500" + POST_TAG + " this";
QueryParser parser = new QueryParser( Version.LUCENE_29, fieldName, getAnalyzer() );
Query q = parser.parse( queryString );
String[] observed = highlightField( q, fieldName, text );
for ( int i = 0; i < observed.length; i++ ) {
System.out.println( "\t" + i + ": '" + observed[i] + "'" );
}
if ( observed.length > 0 ) {
System.out.println( "Expected: '" + expected + "'\n" + "Observed: '" + observed[0] + "'" );
assertEquals( expected, observed[0] );
}
else {
assertTrue( "No matches found", false );
}
}
}
Anyone have any ideas or suggestions?