Solr Wildcard Query with Stemming

The Problem
Today, I was asked to take a look at one query issue:
When a user searches for file, files, or file*, Solr returns matches correctly, but if the user searches for files*, Solr doesn't return any match.

The Solution
A Google search turned up the solution on this page:
Stemming not working with wildcard search

Wildcards and stemming are incompatible at query time - you need to manually stem the term before applying your wildcard.

Wildcards are not supported in quoted phrases. They will be treated as punctuation, and ignored by the standard tokenizer or the word delimiter filter.

In this case, it is actually a PrefixQuery, which works similarly to a wildcard query.

The solution is to add KeywordRepeatFilterFactory and RemoveDuplicatesTokenFilterFactory around the Stem Factory:
<!--
  text_rev: text field that supports both stemming and leading/trailing
  wildcards. The fix for "files*" not matching is the
  KeywordRepeat/RemoveDuplicates pair wrapped around the stemmer at INDEX
  time: KeywordRepeatFilterFactory emits each token twice (one copy flagged
  as keyword, which the stemmer leaves untouched), PorterStemFilterFactory
  stems the unflagged copy, and RemoveDuplicatesTokenFilterFactory drops the
  duplicate when stemming was a no-op. So both "files" and "file" end up in
  the index, and the prefix query files* can match the unstemmed form.
-->
<fieldType name="text_rev" class="solr.TextField"
  positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory" />
    <filter class="solr.StopFilterFactory" ignoreCase="true"
      words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.WordDelimiterFilterFactory"
      generateWordParts="1" generateNumberParts="1" catenateWords="1"
      catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"
      preserveOriginal="1" />
    <filter class="solr.LowerCaseFilterFactory" />
    <!-- emit each token twice: a keyword-flagged copy plus a stemmable copy -->
    <filter class="solr.KeywordRepeatFilterFactory"/> 
    <filter class="solr.PorterStemFilterFactory"/> 
    <!-- collapse the pair again when the stemmer didn't change the token -->
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> 
    <!-- also index reversed forms so leading-wildcard queries are fast -->
    <filter class="solr.ReversedWildcardFilterFactory"
      withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2"
      maxFractionAsterisk="0.33" />
  </analyzer>
  <!-- query side deliberately has no KeywordRepeat/RemoveDuplicates:
       wildcard terms bypass analysis anyway, and plain terms should be
       stemmed to a single form -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory" />
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
      ignoreCase="true" expand="true" />
    <filter class="solr.StopFilterFactory" ignoreCase="true"
      words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.WordDelimiterFilterFactory"
      generateWordParts="1" generateNumberParts="1" catenateWords="0"
      catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"
      preserveOriginal="1" />        
    <filter class="solr.LowerCaseFilterFactory" />
    <filter class="solr.PorterStemFilterFactory"/> 
  </analyzer>
</fieldType>
Test
Next, let's write a unit test to verify the change.
/**
 * Loads the real solrconfig.xml/schema.xml from CONF_FOLDER, then prints the
 * index-time and query-time analysis of "files" / "files*" for the text_rev
 * field type, and runs an index-then-search round trip via
 * {@link #testIndexerSearcher}.
 *
 * NOTE(review): exceptions are only printed, so the test cannot fail on
 * error — consider rethrowing or asserting instead.
 */
public void testWildcardStemFromSchema() {
  try {
    URLClassLoader urlClassLoader = (URLClassLoader) ClassLoader
        .getSystemClassLoader();
    // DynamicURLClassLoader lets us add CONF_FOLDER to the classpath at
    // runtime so Solr can resolve stopwords.txt etc. as resources. From:
    // http://www.hangar.org/docs/activitats/SummerLAB/Pure%20Data/OSC%20-%20OpenSoundControl/SwingOSC/src/de/sciss/util/DynamicURLClassLoader.java
    DynamicURLClassLoader dynaLoader = new DynamicURLClassLoader(
        urlClassLoader);
    dynaLoader.addURL(new File(CONF_FOLDER).toURI().toURL());
    Thread.currentThread().setContextClassLoader(dynaLoader);
    // Parse the real solrconfig.xml and schema.xml so the analyzers under
    // test are exactly the ones Solr would build.
    InputSource solrCfgIs = new InputSource(new FileReader(new File(
        CONF_FOLDER, "solrconfig.xml")));
    SolrConfig solrConfig = new SolrConfig(null, solrCfgIs);
    InputSource solrSchemaIs = new InputSource(new FileReader(new File(
        CONF_FOLDER, "schema.xml")));
    IndexSchema solrSchema = new IndexSchema(solrConfig, "mySchema",
        solrSchemaIs);
    Map<String, FieldType> fieldTypes = solrSchema.getFieldTypes();
    listAllFieldTypes(fieldTypes);

    // now test text_rev
    String inputText = "files";
    FieldType fieldTypeText = fieldTypes.get("text_rev");
    Analyzer indexAnalyzer = fieldTypeText.getIndexAnalyzer();
    Analyzer queryAnalyzer = fieldTypeText.getQueryAnalyzer();
    System.out.println("Indexing analysis:");
    testIndexerSearcher(solrSchema, indexAnalyzer, queryAnalyzer);
    // Dump every token the index analyzer produces for "files" — with the
    // KeywordRepeat fix we expect both the stemmed and unstemmed (and
    // reversed) forms.
    TokenStream tokenStream = indexAnalyzer.tokenStream("content",
        new StringReader(inputText));
    CharTermAttribute termAttr = tokenStream
        .getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = tokenStream
        .getAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = tokenStream
        .getAttribute(TypeAttribute.class);
    // TokenStream contract: reset() before incrementToken(), then end() and
    // close() when done.
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      System.out.println(termAttr.toString() + " offset: "
          + offsetAttr.startOffset() + ":" + offsetAttr.endOffset()
          + ", type:" + typeAttr.type());
    }
    tokenStream.end();
    tokenStream.close();
    // Now show how the query analyzer treats the wildcard query text.
    String searchText = "files*";
    System.out.println("\r\nQuerying analysis:");
    tokenStream = queryAnalyzer.tokenStream("content", new StringReader(
        searchText));
    tokenStream.reset();
    CharTermAttribute termAttr2 = (CharTermAttribute) tokenStream
        .getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      System.out.println(termAttr2.toString());
    }
    tokenStream.end();
    tokenStream.close();
  } catch (Exception e) {
    // NOTE(review): swallowed — failures only show in the console output.
    e.printStackTrace();
  }
}

/**
 * Indexes a single document ("files" in the "content" field) with the
 * schema's index analyzer, then searches it with the query "files*" parsed
 * through the query analyzer, printing the parsed query and the explain
 * output for each hit.
 *
 * @param solrSchema    schema used to create the "content" field value
 * @param indexAnalyzer analyzer applied at index time
 * @param queryAnalyzer analyzer applied by the QueryParser
 * @throws IOException    on index read/write failure
 * @throws ParseException if "files*" cannot be parsed
 */
private void testIndexerSearcher(IndexSchema solrSchema,
    Analyzer indexAnalyzer, Analyzer queryAnalyzer) throws IOException,
    ParseException {
  IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9,
      indexAnalyzer);
  // recreate the index on each execution
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setUseCompoundFile(false);
  // if we setInfoStream, add the below annotation to the TestClass
  // @SuppressSysoutChecks(bugUrl = "Solr logs to JUL")
  // config.setInfoStream(System.out);
  // be sure to close Directory and IndexWriter
  try (Directory directory = FSDirectory.open(new File(FILE_PATH));
      IndexWriter writer = new IndexWriter(directory, config)) {
    Document doc = new Document();
    IndexableField field = solrSchema.getField("content").createField(
        "files", 1.0f);
    doc.add(field);
    writer.addDocument(doc);
    writer.commit();
  }
  try (Directory directory = FSDirectory.open(new File(FILE_PATH));
      DirectoryReader indexReader = DirectoryReader.open(directory);) {
    IndexSearcher searcher = new IndexSearcher(indexReader);
    QueryParser queryParser = new QueryParser(Version.LUCENE_4_9,
        "content", queryAnalyzer);
    // "files*" becomes a PrefixQuery; it matches because the unstemmed
    // token "files" was kept in the index by KeywordRepeatFilterFactory.
    Query query = queryParser.parse("files*");
    System.out.println("queryParser query:" + query.toString());
    TopDocs docs = searcher.search(query, 10);
    LuceneUtil.printAndExplaunSearchResult(searcher, docs, query,
        "content");
  }
}

/**
 * Rebuilds the same index-time analysis chain programmatically — factory by
 * factory, instead of reading it from schema.xml — and prints the tokens
 * produced for "pictures files".
 */
public void testUsingAnalyzer() {
  try {
    URLClassLoader urlClassLoader = (URLClassLoader) ClassLoader
        .getSystemClassLoader();
    // from
    // http://www.hangar.org/docs/activitats/SummerLAB/Pure%20Data/OSC%20-%20OpenSoundControl/SwingOSC/src/de/sciss/util/DynamicURLClassLoader.java
    DynamicURLClassLoader dynaLoader = new DynamicURLClassLoader(
        urlClassLoader);
    dynaLoader.addURL(new File(CONF_FOLDER).toURI().toURL());
    Thread.currentThread().setContextClassLoader(dynaLoader);
    StringReader inputText = new StringReader("pictures files");
    // Every analysis factory requires luceneMatchVersion in its args map.
    Map<String, String> commonArgs = ImmutableMap
        .<String, String> builder()
        .put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM,
            Version.LUCENE_4_9 + "").build();
    // These factories remove consumed elements from the args map — see
    // org.apache.lucene.analysis.util.AbstractAnalysisFactory.AbstractAnalysisFactory(Map<String, String>):
    //   args.remove(CLASS_NAME); // consume the class arg
    // Why do they remove values from the map? So the factory can detect
    // unwanted parameters and catch typos, e.g. in
    // org.apache.lucene.analysis.core.WhitespaceTokenizerFactory.WhitespaceTokenizerFactory(Map<String, String>):
    //   if (!args.isEmpty()) {
    //     throw new IllegalArgumentException("Unknown parameters: " + args);
    //   }
    // This is also why each factory below gets its own fresh HashMap copy.
    TokenizerFactory tkf = new WhitespaceTokenizerFactory(
        new HashMap<String, String>(commonArgs));
    Tokenizer tkz = tkf.create(inputText);
    HashMap<String, String> stopFilterParmas = new HashMap<String, String>(
        commonArgs);
    stopFilterParmas.put("words", "stopwords.txt");
    // CONF_FOLDER is added to classpath
    ResourceLoader loader = new ClasspathResourceLoader();
    // ResourceLoader loader = new FilesystemResourceLoader(new File(
    //     CONF_FOLDER));
    StopFilterFactory stf = new StopFilterFactory(stopFilterParmas);
    stf.inform(loader);
    TokenStream st = stf.create(tkz);
    WordDelimiterFilterFactory wdff = new WordDelimiterFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenFilter wdf = wdff.create(st);
    LowerCaseFilterFactory lcf = new LowerCaseFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream lcts = lcf.create(wdf);
    // Same KeywordRepeat -> PorterStem -> RemoveDuplicates sandwich as the
    // schema's index analyzer.
    KeywordRepeatFilterFactory krff = new KeywordRepeatFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream kdrf = krff.create(lcts);
    TokenFilterFactory psff = new PorterStemFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream psf = psff.create(kdrf);
    RemoveDuplicatesTokenFilterFactory rdtff = new RemoveDuplicatesTokenFilterFactory(
        new HashMap<String, String>(commonArgs));
    RemoveDuplicatesTokenFilter rdtf = rdtff.create(psf);
    ReversedWildcardFilterFactory rwff = new ReversedWildcardFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream rwf = rwff.create(rdtf);
    CharTermAttribute termAttrib = (CharTermAttribute) rwf
        .getAttribute(CharTermAttribute.class);
    rwf.reset();
    while (rwf.incrementToken()) {
      String term = termAttrib.toString();
      System.out.println(term);
    }
    rwf.end();
    rwf.close();
  } catch (Exception e) {
    // NOTE(review): swallowed — failures only show in the console output.
    e.printStackTrace();
  }
}
Output of the test case:
text_rev:TokenizerChain(org.apache.lucene.analysis.core.WhitespaceTokenizerFactory@48ae9b55, org.apache.lucene.analysis.core.StopFilterFactory@1700915, org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory@21de60b4, org.apache.lucene.analysis.core.LowerCaseFilterFactory@c267ef4, org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory@30ee2816, org.apache.lucene.analysis.en.PorterStemFilterFactory@31d7b7bf, org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory@635eaaf1, org.apache.solr.analysis.ReversedWildcardFilterFactory@5c30a9b0)
queryParser query:content:files*
Found : 1 hits.
1. files
1.0 = (MATCH) ConstantScore(content:files*), product of:
  1.0 = boost
  1.0 = queryNorm

Indexing analysis:
selif offset: 0:5, type:word
files offset: 0:5, type:word
elif offset: 0:5, type:word
file offset: 0:5, type:word

Querying analysis:
files*

file
References
Stemming not working with wildcard search
Testing Solr schema, analyzers and tokenization
Post a Comment

Labels

Java (159) Lucene-Solr (112) Interview (61) All (58) J2SE (53) Algorithm (45) Soft Skills (38) Eclipse (33) Code Example (31) Linux (25) JavaScript (23) Spring (22) Windows (22) Web Development (20) Tools (19) Nutch2 (18) Bugs (17) Debug (16) Defects (14) Text Mining (14) J2EE (13) Network (13) Troubleshooting (13) PowerShell (11) Chrome (9) Design (9) How to (9) Learning code (9) Performance (9) Problem Solving (9) UIMA (9) html (9) Http Client (8) Maven (8) Security (8) bat (8) blogger (8) Big Data (7) Continuous Integration (7) Google (7) Guava (7) JSON (7) Shell (7) ANT (6) Coding Skills (6) Database (6) Lesson Learned (6) Programmer Skills (6) Scala (6) Tips (6) css (6) Algorithm Series (5) Cache (5) Dynamic Languages (5) IDE (5) System Design (5) adsense (5) xml (5) AIX (4) Code Quality (4) GAE (4) Git (4) Good Programming Practices (4) Jackson (4) Memory Usage (4) Miscs (4) OpenNLP (4) Project Managment (4) Spark (4) Testing (4) ads (4) regular-expression (4) Android (3) Apache Spark (3) Become a Better You (3) Concurrency (3) Eclipse RCP (3) English (3) Happy Hacking (3) IBM (3) J2SE Knowledge Series (3) JAX-RS (3) Jetty (3) Restful Web Service (3) Script (3) regex (3) seo (3) .Net (2) Android Studio (2) Apache (2) Apache Procrun (2) Architecture (2) Batch (2) Bit Operation (2) Build (2) Building Scalable Web Sites (2) C# (2) C/C++ (2) CSV (2) Career (2) Cassandra (2) Distributed (2) Fiddler (2) Firefox (2) Google Drive (2) Gson (2) How to Interview (2) Html Parser (2) Http (2) Image Tools (2) JQuery (2) Jersey (2) LDAP (2) Life (2) Logging (2) Python (2) Software Issues (2) Storage (2) Text Search (2) xml parser (2) AOP (1) Application Design (1) AspectJ (1) Chrome DevTools (1) Cloud (1) Codility (1) Data Mining (1) Data Structure (1) ExceptionUtils (1) Exif (1) Feature Request (1) FindBugs (1) Greasemonkey (1) HTML5 (1) Httpd (1) I18N (1) IBM Java Thread Dump Analyzer (1) JDK Source Code (1) JDK8 (1) JMX (1) Lazy Developer (1) Mac (1) Machine Learning (1) 
Mobile (1) My Plan for 2010 (1) Netbeans (1) Notes (1) Operating System (1) Perl (1) Problems (1) Product Architecture (1) Programming Life (1) Quality (1) Redhat (1) Redis (1) Review (1) RxJava (1) Solutions logs (1) Team Management (1) Thread Dump Analyzer (1) Visualization (1) boilerpipe (1) htm (1) ongoing (1) procrun (1) rss (1)

Popular Posts