Solr: Using docid within the same Searcher to boost performance


We all know that the docid in Lucene/Solr is volatile: it may change when we delete documents and Solr merges segments.

For example:
We add 3 docs: doc0, doc1, doc2
http://localhost:12345/solr/update?stream.body=<add><doc><field name="id">doc0</field></doc><doc><field name="id">doc1</field></doc><doc><field name="id">doc2</field></doc></add>&commit=true
Their docids would be: doc0:0, doc1:1, doc2:2
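
As a side note, one way to observe the internal docid from outside is Solr's [docid] document transformer in the field list (this assumes the default /select handler on the same example core):
http://localhost:12345/solr/select?q=*:*&fl=id,[docid]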

Then we delete doc0 and commit with expungeDeletes=true (deleted docs are also purged when segments are merged):
http://localhost:12345/solr/update?stream.body=<delete><query>id:doc0</query></delete>&commit=true&expungeDeletes=true

Now their docids have changed: doc1:0, doc2:1

But in the following request handler, will the docid change between the two queries?
public class TestDocIdHandler extends RequestHandlerBase {
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
      throws Exception {
    int docid = getLookupDocId(req.getSearcher(), "doc1");
    // stop here, and delete doc0:
    // http://localhost:12345/solr/update?stream.body=<delete><query>id:doc0</query></delete>&commit=true&expungeDeletes=true
    // check whether docid is changed
    int newdocid = getLookupDocId(req.getSearcher(), "doc12");
    
    System.out.println(docid == newdocid);
  }
  
  private int getLookupDocId(SolrIndexSearcher searcher, String lookup)
      throws IOException {
    TermQuery tq = new TermQuery(new Term("contentid", lookup));
    TopDocs hits = searcher.search(tq, 1);
    ScoreDoc[] docs = hits.scoreDocs;
    if (docs.length == 1) {
      return docs[0].doc;
    }
    return -1; // not found
  }
}

The answer is no:
The docid stays the same, because both lookups use the same SolrIndexSearcher. A SolrIndexSearcher holds a snapshot of the index at a specific point in time; it does not reflect changes (adds, deletes, etc.) until it is reopened.
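
The same behavior can be reproduced with plain Lucene. Below is a minimal sketch against the Lucene 4.x API used elsewhere in this post (the class name and the Version constant are only illustrative): a searcher opened before the delete keeps returning the old docid, while a reader opened afterwards sees the renumbered index.

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class SearcherSnapshotDemo {
  public static void main(String[] args) throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(
        Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47)));
    for (String id : new String[] {"doc0", "doc1", "doc2"}) {
      Document d = new Document();
      d.add(new StringField("id", id, Field.Store.YES));
      writer.addDocument(d);
    }
    writer.commit();

    // open a searcher on the current snapshot of the index
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
    int before = searcher.search(new TermQuery(new Term("id", "doc1")), 1).scoreDocs[0].doc;

    // delete doc0 and purge the deletion, like a commit with expungeDeletes=true
    writer.deleteDocuments(new Term("id", "doc0"));
    writer.forceMergeDeletes();
    writer.commit();

    // the old searcher still sees the old snapshot: same docid as before
    int after = searcher.search(new TermQuery(new Term("id", "doc1")), 1).scoreDocs[0].doc;
    System.out.println(before == after); // true

    // a reader opened after the merge sees the renumbered docids
    IndexSearcher reopened = new IndexSearcher(DirectoryReader.open(dir));
    int renumbered = reopened.search(new TermQuery(new Term("id", "doc1")), 1).scoreDocs[0].doc;
    System.out.println(before == renumbered); // false: doc1 shifted from 1 to 0

    writer.close();
  }
}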

Next, we will demonstrate how to use this behavior in our code.

Practical Example: Using docid to boost performance
The Use Case:
Given some query (q, fq, possibly with join or grouping), we want to know the position of a particular document, identified by its id, in the results.

We can first look up the docid of this document, then run the query via SolrIndexSearcher.search() and iterate over the returned scoreDocs until we find that docid. Because both steps run against the same SolrIndexSearcher, the docid obtained in the first step is still valid in the second.
public class GetDocPositionReqHandler extends RequestHandlerBase {
  public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
      throws Exception {
    SolrParams params = req.getParams();
    String lookup = Preconditions.checkNotNull(params.get("lookup"));
    
    SolrIndexSearcher searcher = req.getSearcher();
    int lookupId = getLookupDocId(searcher, lookup);
    
    if (lookupId != -1) {
      boolean isGroup = params.getBool(GroupParams.GROUP, false);
      if (!isGroup) {
        nonGroupImpl(req, rsp, lookupId);
      } else {
        // grouped request: compute the position within the flattened group ordering
        groupImpl(req, rsp, params, lookupId);
      }
    }
  }
    
  private void nonGroupImpl(SolrQueryRequest req, SolrQueryResponse rsp,
      int lookupId) throws SyntaxError, IOException {
    int lookupPos = -1;
    ScoreDoc[] docs = runReqQuery(req);
    int newPos = 0;
    for (ScoreDoc doc : docs) {
      newPos++;
      if (doc.doc == lookupId) {
        lookupPos = newPos;
        break;
      }
    }
    
    rsp.add("newPos", lookupPos);
  }
  private void groupImpl(SolrQueryRequest req, SolrQueryResponse rsp,
      SolrParams params, int lookupId) throws SyntaxError, IOException {
    ScoreDoc[] docs = runReqQuery(req);
    // split the results into groups
    // in our case, group.field is a string field and group.sort sorts on a long field
    Map<String,Set<Integer>> groupMap = new LinkedHashMap<String,Set<Integer>>();
    String lookupGroup = null;
    
    String groupField = Objects.requireNonNull(
        params.get(GroupParams.GROUP_FIELD),
        "No group field in the request string.");
    BinaryDocValues groupCache = FieldCache.DEFAULT.getTerms(req.getSearcher()
        .getAtomicReader(), groupField);
    for (ScoreDoc doc : docs) {
      int docid = doc.doc;
      BytesRef result = new BytesRef();
      groupCache.get(docid, result);
      String groupValue = result.utf8ToString();
      Set<Integer> groupItems = groupMap.get(groupValue);
      if (groupItems == null) {
        groupItems = new LinkedHashSet<Integer>();
        groupMap.put(groupValue, groupItems);
      }
      groupItems.add(docid);
      if (doc.doc == lookupId) {
        lookupGroup = groupValue;
      }
    }
    int lookupPos = -1;
    if (lookupGroup != null) {
      // then iterate the map to get the position
      int newPos = 0;
      Iterator<Entry<String,Set<Integer>>> it = groupMap.entrySet().iterator();
      
      outer: while (it.hasNext()) {
        Entry<String,Set<Integer>> entry = it.next();
        String groupName = entry.getKey();
        if (lookupGroup.equals(groupName)) {
          Set<Integer> items = entry.getValue();
          for (Integer item : items) {
            newPos++;
            if (item == lookupId) {
              lookupPos = newPos;
              break outer;
            }
          }
        } else {
          newPos += entry.getValue().size();
        }
      }
    }
    rsp.add("newPos", lookupPos);
  }
  
  private ScoreDoc[] runReqQuery(SolrQueryRequest req) throws SyntaxError,
      IOException {
    SolrParams params = req.getParams();
    SolrIndexSearcher searcher = req.getSearcher();
    String qstr = params.get(CommonParams.Q);
    
    QParser parser = QParser.getParser(qstr, ExtendedDismaxQParserPlugin.NAME,
        req);
    Query newQuery = parser.parse();
    Sort sort = SolrPluginUtils.getSort(req);
    
    String[] fqs = params.getParams(CommonParams.FQ);
    ChainedFilter chainedFilter = null;
    if (fqs != null) {
      Filter[] filters = new Filter[fqs.length];
      int i = 0;
      for (String fq : fqs) {
        filters[i++] = new QueryWrapperFilter(QParser.getParser(fq,
            ExtendedDismaxQParserPlugin.NAME, req).parse());
      }
      chainedFilter = new ChainedFilter(filters);
    }
    TopDocs topDocs;
    if (sort != null) {
      topDocs = searcher.search(newQuery, chainedFilter, searcher.maxDoc(),
          sort);
    } else {
      topDocs = searcher.search(newQuery, chainedFilter, searcher.maxDoc());
    }
    ScoreDoc[] docs = topDocs.scoreDocs;
    return docs;
  }
  
  private int getLookupDocId(SolrIndexSearcher searcher, String lookup)
      throws IOException {
    TermQuery tq = new TermQuery(new Term("id", lookup));
    TopDocs hits = searcher.search(tq, 1);
    ScoreDoc[] docs = hits.scoreDocs;
    if (docs.length == 1) {
      return docs[0].doc;
    }
    return -1; // not found
  }
  
}
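
To try the handler, register it in solrconfig.xml and call it with the lookup id plus the usual query parameters. The handler path, package and field names below are only placeholders; adjust them to your project:

<requestHandler name="/getDocPosition" class="org.codeexample.GetDocPositionReqHandler" />

http://localhost:12345/solr/getDocPosition?q=*:*&fq=inStock:true&lookup=doc1
http://localhost:12345/solr/getDocPosition?q=*:*&group=true&group.field=category&lookup=doc1

The response contains newPos, the 1-based position of the looked-up document in the (possibly grouped) result list, or -1 if the document does not appear in the results.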
