Lucene Built-in Collectors

TotalHitCountCollector
The collector's collect method is called once for each matching document.
The main methods in the process:
org.apache.lucene.search.IndexSearcher.search(List, Weight, Collector)
org.apache.lucene.search.Weight.DefaultBulkScorer.scoreAll(Collector, Scorer)

TopScoreDocCollector
Create collector:
org.apache.lucene.search.TopScoreDocCollector.create(int, ScoreDoc, boolean)

/**
 * Picks the concrete score-ordered collector for a search.
 *
 * <p>The choice depends on two inputs: whether the caller promises that
 * documents arrive in order ({@code docsScoredInOrder}) and whether a paging
 * anchor ({@code after}) was supplied.
 *
 * @param numHits           passed through to the chosen collector's constructor
 * @param after             paging anchor; {@code null} selects a non-paging collector
 * @param docsScoredInOrder {@code true} selects an in-order variant,
 *                          {@code false} an out-of-order variant
 * @return one of the four collector implementations
 */
public static TopScoreDocCollector create(int numHits, ScoreDoc after, boolean docsScoredInOrder) {
  final boolean paging = (after != null);

  if (docsScoredInOrder) {
    if (paging) {
      return new InOrderPagingScoreDocCollector(after, numHits);
    }
    return new InOrderTopScoreDocCollector(numHits);
  }

  if (paging) {
    return new OutOfOrderPagingScoreDocCollector(after, numHits);
  }
  return new OutOfOrderTopScoreDocCollector(numHits);
}
The collector puts docs into a HitQueue (a PriorityQueue):
org.apache.lucene.search.TopScoreDocCollector.OutOfOrderTopScoreDocCollector.collect(int)
org.apache.lucene.search.HitQueue.lessThan(ScoreDoc, ScoreDoc)
TopFieldCollector
/**
 * Picks the concrete field-sorting collector for a search.
 *
 * <p>A {@link FieldValueHitQueue} is always built first from {@code sort.fields}
 * and {@code numHits}. When {@code after} is non-null a
 * {@code PagingFieldCollector} is returned. Otherwise the implementation is
 * chosen from three independent inputs:
 * <ul>
 *   <li>scoring mode: {@code trackMaxScore} takes precedence over
 *       {@code trackDocScores}; if neither is set a non-scoring collector is used;</li>
 *   <li>whether the queue has exactly one comparator or several;</li>
 *   <li>whether docs are scored in order ({@code docsScoredInOrder}).</li>
 * </ul>
 *
 * @throws IOException declared by this factory; propagated from the calls it makes
 */
public static TopFieldCollector create(Sort sort, int numHits, FieldDoc after,
    boolean fillFields, boolean trackDocScores, boolean trackMaxScore,
    boolean docsScoredInOrder)
    throws IOException {
  final FieldValueHitQueue<Entry> queue = FieldValueHitQueue.create(sort.fields, numHits);

  // Paging is handled by a single collector regardless of the other flags.
  if (after != null) {
    return new PagingFieldCollector(queue, after, numHits, fillFields, trackDocScores, trackMaxScore);
  }

  final boolean singleComparator = queue.getComparators().length == 1;

  // Track per-doc scores and the max score.
  if (trackMaxScore) {
    if (singleComparator) {
      if (docsScoredInOrder) {
        return new OneComparatorScoringMaxScoreCollector(queue, numHits, fillFields);
      }
      return new OutOfOrderOneComparatorScoringMaxScoreCollector(queue, numHits, fillFields);
    }
    if (docsScoredInOrder) {
      return new MultiComparatorScoringMaxScoreCollector(queue, numHits, fillFields);
    }
    return new OutOfOrderMultiComparatorScoringMaxScoreCollector(queue, numHits, fillFields);
  }

  // Track per-doc scores only.
  if (trackDocScores) {
    if (singleComparator) {
      if (docsScoredInOrder) {
        return new OneComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields);
      }
      return new OutOfOrderOneComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields);
    }
    if (docsScoredInOrder) {
      return new MultiComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields);
    }
    return new OutOfOrderMultiComparatorScoringNoMaxScoreCollector(queue, numHits, fillFields);
  }

  // No score tracking at all.
  if (singleComparator) {
    if (docsScoredInOrder) {
      return new OneComparatorNonScoringCollector(queue, numHits, fillFields);
    }
    return new OutOfOrderOneComparatorNonScoringCollector(queue, numHits, fillFields);
  }
  if (docsScoredInOrder) {
    return new MultiComparatorNonScoringCollector(queue, numHits, fillFields);
  }
  return new OutOfOrderMultiComparatorNonScoringCollector(queue, numHits, fillFields);
}
Queue implementations: org.apache.lucene.search.FieldValueHitQueue, org.apache.lucene.search.FieldValueHitQueue.OneComparatorFieldValueHitQueue, org.apache.lucene.search.FieldValueHitQueue.MultiComparatorsFieldValueHitQueue

Test Lucene Built-in Collectors
/**
 * JUnit walkthrough of several Lucene collectors: hit counting, top-N by
 * score, top-N by sort field, and two-pass result grouping.
 *
 * <p>Every example runs the same term query ({@code title:java}) against the
 * index opened from {@code Utils.INDEX_FOLDER_PATH} and prints what it finds
 * to standard output.
 */
public class LearningCollector {

  /** Calls {@code Utils.writeIndex()} before each test. */
  @Before
  public void setup() throws IOException {
    Utils.writeIndex();
  }

  /** Opens the index and runs each collector example in turn. */
  @Test
  public void testBuiltCollector() throws IOException {
    try (Directory dir = FSDirectory.open(new File(Utils.INDEX_FOLDER_PATH));
        DirectoryReader reader = DirectoryReader.open(dir)) {
      IndexSearcher searcher = new IndexSearcher(reader);

      usingTotalHitCollector(searcher);
      usingTopScoreDocCollector(searcher);
      usingTopFieldCollector(searcher);
      usingLuceneGroup(searcher);
    }
  }

  /** Counts the matches with a {@link TotalHitCountCollector} and prints the total. */
  private void usingTotalHitCollector(IndexSearcher searcher)
      throws IOException {
    TotalHitCountCollector hitCounter = new TotalHitCountCollector();
    searcher.search(new TermQuery(new Term("title", "java")), hitCounter);
    System.out.println("total hits:" + hitCounter.getTotalHits());
  }

  /**
   * Groups the matches by {@code title}: groups are sorted by title, documents
   * inside a group by {@code price} with the reverse flag set.
   */
  private void usingLuceneGroup(IndexSearcher searcher) throws IOException {
    Sort byTitle = new Sort(new SortField("title", Type.STRING));
    Sort byPriceReversed = new Sort(new SortField("price", Type.INT, true));
    groupBy(searcher, new TermQuery(new Term("title", "java")), "title",
        byTitle, byPriceReversed);
  }

  /**
   * Runs a two-pass grouping search and prints the groups and their documents.
   *
   * <p>Collectors involved: TermFirstPassGroupingCollector,
   * TermSecondPassGroupingCollector, CachingCollector, TermAllGroupsCollector
   * and MultiCollector.
   *
   * @param groupField field whose values define the groups
   * @param groupSort  sort handed to both grouping passes
   * @param docSort    sort handed to the second pass for documents within a group
   */
  private void groupBy(IndexSearcher searcher, Query query,
      String groupField, Sort groupSort, Sort docSort) throws IOException {
    // Paging settings: number of groups per page and the offsets to start from.
    int topNGroups = 10;
    int groupOffset = 0;
    int docOffset = 0;
    int docsPerGroup = 10;
    boolean fillFields = true;
    boolean requiredTotalGroupCount = true;

    // First pass, wrapped in a cache so the second pass can replay the hits.
    TermFirstPassGroupingCollector firstPass =
        new TermFirstPassGroupingCollector(groupField, groupSort, topNGroups);
    boolean cacheScores = true;
    double maxCacheRAMMB = 16.0;
    CachingCollector cache =
        CachingCollector.create(firstPass, cacheScores, maxCacheRAMMB);
    searcher.search(query, cache);

    Collection<SearchGroup<BytesRef>> topGroups =
        firstPass.getTopGroups(groupOffset, fillFields);
    if (topGroups == null) {
      // No groups matched
      return;
    }

    // Second pass over the groups selected by the first pass.
    boolean getScores = true;
    boolean getMaxScores = true;
    boolean fillSortFields = true;
    TermSecondPassGroupingCollector secondPass =
        new TermSecondPassGroupingCollector(groupField, topGroups, groupSort,
            docSort, docsPerGroup, getScores, getMaxScores, fillSortFields);

    // Optionally count all groups alongside the second pass.
    TermAllGroupsCollector allGroups =
        requiredTotalGroupCount ? new TermAllGroupsCollector(groupField) : null;
    Collector secondPassCollector =
        (allGroups == null) ? secondPass : MultiCollector.wrap(secondPass, allGroups);

    if (!cache.isCached()) {
      // Cache was too large; must re-execute query:
      searcher.search(query, secondPassCollector);
    } else {
      // Cache fit within maxCacheRAMMB, so we can replay it:
      cache.replay(secondPassCollector);
    }

    // -1 is printed when the total group count was not requested.
    int totalGroupCount = (allGroups == null) ? -1 : allGroups.getGroupCount();
    System.out.println("groupCount: " + totalGroupCount);

    TopGroups<BytesRef> groupsResult = secondPass.getTopGroups(docOffset);
    System.out.println("groupsResult.totalHitCount:" + groupsResult.totalHitCount);
    System.out.println("groupsResult.totalGroupedHitCount:"
        + groupsResult.totalGroupedHitCount);

    printGroups(searcher, groupsResult);
  }

  /**
   * Prints each group's value and hit count, followed by each of its
   * documents (doc id, score, and the document loaded from the searcher).
   * Group and document positions in the output are 1-based.
   */
  private void printGroups(IndexSearcher searcher, TopGroups<BytesRef> groupsResult)
      throws IOException {
    int groupNo = 0;
    for (GroupDocs<BytesRef> group : groupsResult.groups) {
      groupNo++;
      String groupLabel = "group[" + groupNo + "]";
      System.out.println(groupLabel + ":" + group.groupValue);
      System.out.println(groupLabel + ":" + group.totalHits);

      int docNo = 0;
      for (ScoreDoc hit : group.scoreDocs) {
        docNo++;
        String docLabel = groupLabel + "[" + docNo + "]:";
        System.out.println(docLabel + hit.doc + "/" + hit.score);
        Document stored = searcher.doc(hit.doc);
        System.out.println(docLabel + stored);
      }
    }
  }

  /**
   * Runs three sorted searches with {@link TopFieldCollector}: price only with
   * all tracking flags off, price only with them on, and price then title.
   */
  private void usingTopFieldCollector(IndexSearcher searcher)
      throws IOException {
    TermQuery query = new TermQuery(new Term("title", "java"));

    // reverse is true: sort=price desc
    Sort sort = new Sort(new SortField("price", Type.INT, true));

    // fillFields, trackDocScores, trackMaxScore and docsScoredInOrder all false
    TopFieldCollector collector =
        TopFieldCollector.create(sort, 10, false, false, false, false);
    searcher.search(query, collector);
    printAndExplainSearchResult(searcher, collector, true, query, "price");

    // set these to true: fillFields, trackDocScores, trackMaxScore
    collector = TopFieldCollector.create(sort, 10, true, true, true, false);
    searcher.search(query, collector);
    printAndExplainSearchResult(searcher, collector, true, query, "price");

    // sort by multiple fields: price (reverse) first, then title
    SortField priceReversed = new SortField("price", Type.INT, true);
    SortField titleForward = new SortField("title", Type.STRING, false);
    sort = new Sort(priceReversed, titleForward);
    collector = TopFieldCollector.create(sort, 10, true, true, true, false);
    searcher.search(query, collector);
    printAndExplainSearchResult(searcher, collector, true, query, "price",
        "title");
  }

  /** Collects the top 10 hits by score with {@link TopScoreDocCollector} and prints them. */
  private void usingTopScoreDocCollector(IndexSearcher searcher)
      throws IOException {
    TermQuery query = new TermQuery(new Term("title", "java"));
    // second argument is docsScoredInOrder
    TopScoreDocCollector topByScore = TopScoreDocCollector.create(10, false);
    searcher.search(query, topByScore);
    printAndExplainSearchResult(searcher, topByScore, true, query, "title",
        "author");
    // TODO: searchAfter example
  }
}

Post a Comment

Labels

Java (159) Lucene-Solr (110) Interview (61) All (58) J2SE (53) Algorithm (45) Soft Skills (37) Eclipse (33) Code Example (31) Linux (24) JavaScript (23) Spring (22) Windows (22) Web Development (20) Nutch2 (18) Tools (18) Bugs (17) Debug (16) Defects (14) Text Mining (14) J2EE (13) Network (13) Troubleshooting (13) PowerShell (11) Chrome (9) Design (9) How to (9) Learning code (9) Performance (9) Problem Solving (9) UIMA (9) html (9) Http Client (8) Maven (8) Security (8) bat (8) blogger (8) Big Data (7) Continuous Integration (7) Google (7) Guava (7) JSON (7) ANT (6) Coding Skills (6) Database (6) Scala (6) Shell (6) css (6) Algorithm Series (5) Cache (5) Dynamic Languages (5) IDE (5) Lesson Learned (5) Programmer Skills (5) System Design (5) Tips (5) adsense (5) xml (5) AIX (4) Code Quality (4) GAE (4) Git (4) Good Programming Practices (4) Jackson (4) Memory Usage (4) Miscs (4) OpenNLP (4) Project Managment (4) Spark (4) Testing (4) ads (4) regular-expression (4) Android (3) Apache Spark (3) Become a Better You (3) Concurrency (3) Eclipse RCP (3) English (3) Happy Hacking (3) IBM (3) J2SE Knowledge Series (3) JAX-RS (3) Jetty (3) Restful Web Service (3) Script (3) regex (3) seo (3) .Net (2) Android Studio (2) Apache (2) Apache Procrun (2) Architecture (2) Batch (2) Bit Operation (2) Build (2) Building Scalable Web Sites (2) C# (2) C/C++ (2) CSV (2) Career (2) Cassandra (2) Distributed (2) Fiddler (2) Firefox (2) Google Drive (2) Gson (2) How to Interview (2) Html Parser (2) Http (2) Image Tools (2) JQuery (2) Jersey (2) LDAP (2) Life (2) Logging (2) Python (2) Software Issues (2) Storage (2) Text Search (2) xml parser (2) AOP (1) Application Design (1) AspectJ (1) Chrome DevTools (1) Cloud (1) Codility (1) Data Mining (1) Data Structure (1) ExceptionUtils (1) Exif (1) Feature Request (1) FindBugs (1) Greasemonkey (1) HTML5 (1) Httpd (1) I18N (1) IBM Java Thread Dump Analyzer (1) JDK Source Code (1) JDK8 (1) JMX (1) Lazy Developer (1) Mac (1) Machine Learning (1) 
Mobile (1) My Plan for 2010 (1) Netbeans (1) Notes (1) Operating System (1) Perl (1) Problems (1) Product Architecture (1) Programming Life (1) Quality (1) Redhat (1) Redis (1) Review (1) RxJava (1) Solutions logs (1) Team Management (1) Thread Dump Analyzer (1) Visualization (1) boilerpipe (1) htm (1) ongoing (1) procrun (1) rss (1)

Popular Posts