Lucene Highlighter HowTo


In practice, we often want to highlight the matched words in the query response, so that users can quickly spot the matching section and jump to it.

package org.lifelongprogrammer.learningLucene;
/**
 * Small end-to-end demo of the Lucene highlighter.
 *
 * <p>It indexes two short documents into an in-memory directory, runs a
 * {@link TermQuery} for "love" against them and prints every hit twice:
 * once highlighted with a custom {@link SimpleHTMLFormatter} and a
 * {@link SimpleFragmenter}, and once with the default formatter and a
 * {@link SimpleSpanFragmenter}.
 */
public class LuceneHighlighterInAction {

 /** Maximum number of hits requested from the searcher. */
 private static final int MAX_HITS = 10;

 /**
  * Runs the demo and prints the results to standard output.
  *
  * @param args ignored
  * @throws Exception if indexing, searching or highlighting fails
  */
 public static void main(String[] args) throws Exception {
  String fieldName = "content";

  // Directory and Analyzer are both Closeable; release them when done.
  try (Directory directory = new RAMDirectory();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9)) {
   writeDocs(directory, analyzer, fieldName);

   // use Highlighter
   try (DirectoryReader indexReader = DirectoryReader.open(directory)) {
    IndexSearcher searcher = new IndexSearcher(indexReader);
    TermQuery query = new TermQuery(new Term(fieldName, "love"));

    TopDocs topDocs = searcher.search(query, MAX_HITS);
    System.out.println("Total hits: " + topDocs.totalHits);
    // search() was asked for at most MAX_HITS, so no further capping
    // of the array is needed when iterating.
    ScoreDoc[] scoreDocs = topDocs.scoreDocs;

    // use SimpleHTMLFormatter
    System.out.println("use SimpleHTMLFormatter");
    QueryScorer scorer = new QueryScorer(query);
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(
      "<font color='red'>", "</font>"), scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(200));

    for (ScoreDoc scoreDoc : scoreDocs) {
     Document doc = searcher.doc(scoreDoc.doc);
     String fieldContent = doc.get(fieldName);
     System.out.println(fieldContent + " , " + scoreDoc.score);
     System.out.println(highlighter.getBestFragment(analyzer,
       fieldName, fieldContent));
    }

    // use SimpleSpanFragmenter
    System.out.println("use SimpleSpanFragmenter");
    highlighter = new Highlighter(scorer);
    //default is Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE 50*1024
    highlighter.setMaxDocCharsToAnalyze(10240);
    // The span fragmenter has to be given the same scorer the
    // highlighter uses, and it has to be installed on the highlighter;
    // otherwise the highlighter keeps its default fragmenter and this
    // section would not demonstrate SimpleSpanFragmenter at all.
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 10));

    for (ScoreDoc scoreDoc : scoreDocs) {
     Document doc = searcher.doc(scoreDoc.doc);
     String fieldContent = doc.get(fieldName);
     System.out.println(fieldContent + " , " + scoreDoc.score);
     TokenStream tokenStream = analyzer.tokenStream(fieldName,
       fieldContent);
     String result = highlighter.getBestFragments(tokenStream,
       fieldContent, 2, "...");
     System.out.println(result);
    }
   }
  }
 }

 /**
  * Indexes the two sample documents used by the demo.
  *
  * <p>The field is indexed, stored and tokenized, with term vectors
  * (including offsets and positions) enabled and norms kept.
  *
  * @param directory index location to write to
  * @param analyzer analyzer used to tokenize the field text
  * @param fieldName name of the single text field of each document
  * @throws IOException if the index cannot be written
  */
 private static void writeDocs(Directory directory,
   StandardAnalyzer analyzer, String fieldName) throws IOException {
  IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9,
    analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
  try (IndexWriter writer = new IndexWriter(directory, config)) {

   FieldType fieldType = new FieldType();
   fieldType.setIndexed(true);
   fieldType.setStored(true);
   fieldType.setTokenized(true);
   fieldType.setStoreTermVectors(true);
   fieldType.setStoreTermVectorOffsets(true);
   fieldType.setStoreTermVectorPositions(true);
   fieldType.setOmitNorms(false);
   // No further changes are made to the type after this point.
   fieldType.freeze();

   addDoc(writer, fieldName, fieldType,
     "I am a lifelong programmer, I love coding; I am a lifelong programmer, I love programming.");
   addDoc(writer, fieldName, fieldType,
     "I am a lifelong programmer, I love the world; I am a lifelong programmer, I love the life.");
  }
 }

 /** Adds one document holding {@code text} in the field {@code fieldName}. */
 private static void addDoc(IndexWriter writer, String fieldName,
   FieldType fieldType, String text) throws IOException {
  Document doc = new Document();
  doc.add(new Field(fieldName, text, fieldType));
  writer.addDocument(doc);
 }
}
The core highlighting logic lives in org.apache.lucene.search.highlight.Highlighter.getBestTextFragments(TokenStream, String, boolean, int).
Highlighter in Solr
https://cwiki.apache.org/confluence/display/solr/Highlighting
http://wiki.apache.org/solr/HighlightingParameters

Labels

adsense (5) Algorithm (69) Algorithm Series (35) Android (7) ANT (6) bat (8) Big Data (7) Blogger (14) Bugs (6) Cache (5) Chrome (19) Code Example (29) Code Quality (7) Coding Skills (5) Database (7) Debug (16) Design (5) Dev Tips (63) Eclipse (32) Git (5) Google (33) Guava (7) How to (9) Http Client (8) IDE (7) Interview (88) J2EE (13) J2SE (49) Java (186) JavaScript (27) JSON (7) Learning code (9) Lesson Learned (6) Linux (26) Lucene-Solr (112) Mac (10) Maven (8) Network (9) Nutch2 (18) Performance (9) PowerShell (11) Problem Solving (11) Programmer Skills (6) regex (5) Scala (6) Security (9) Soft Skills (38) Spring (22) System Design (11) Testing (7) Text Mining (14) Tips (17) Tools (24) Troubleshooting (29) UIMA (9) Web Development (19) Windows (21) xml (5)