Solr Wildcard Query with Stemming

The Problem
Today, I was asked to take a look at one query issue:
When a user searches for file, files, or file*, Solr returns matches correctly, but if the user searches for files*, Solr doesn't return any match.

The Solution
A Google search turned up the solution on this page:
Stemming not working with wildcard search

Wildcards and stemming are incompatible at query time - you need to manually stem the term before applying your wildcard.

Wildcards are not supported in quoted phrases. They will be treated as punctuation, and ignored by the standard tokenizer or the word delimiter filter.

In this case, it is actually a PrefixQuery, which works similarly to a wildcard query.

The solution is to add KeywordRepeatFilterFactory and RemoveDuplicatesTokenFilterFactory around the Stem Factory:
<!--
  text_rev: text field that supports both stemming and leading/trailing
  wildcards. The fix for "files*" not matching is the
  KeywordRepeat/RemoveDuplicates pair wrapped around the stemmer at INDEX
  time: KeywordRepeatFilterFactory emits each token twice (one copy flagged
  as keyword, which the stemmer leaves untouched), PorterStemFilterFactory
  stems the unflagged copy, and RemoveDuplicatesTokenFilterFactory drops the
  duplicate when stemming was a no-op. So both "files" and "file" end up in
  the index, and the prefix query files* can match the unstemmed form.
-->
<fieldType name="text_rev" class="solr.TextField"
  positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory" />
    <filter class="solr.StopFilterFactory" ignoreCase="true"
      words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.WordDelimiterFilterFactory"
      generateWordParts="1" generateNumberParts="1" catenateWords="1"
      catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"
      preserveOriginal="1" />
    <filter class="solr.LowerCaseFilterFactory" />
    <!-- emit each token twice: a keyword-flagged copy plus a stemmable copy -->
    <filter class="solr.KeywordRepeatFilterFactory"/> 
    <filter class="solr.PorterStemFilterFactory"/> 
    <!-- collapse the pair again when the stemmer didn't change the token -->
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> 
    <!-- also index reversed forms so leading-wildcard queries are fast -->
    <filter class="solr.ReversedWildcardFilterFactory"
      withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2"
      maxFractionAsterisk="0.33" />
  </analyzer>
  <!-- query side deliberately has no KeywordRepeat/RemoveDuplicates:
       wildcard terms bypass analysis anyway, and plain terms should be
       stemmed to a single form -->
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory" />
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
      ignoreCase="true" expand="true" />
    <filter class="solr.StopFilterFactory" ignoreCase="true"
      words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.WordDelimiterFilterFactory"
      generateWordParts="1" generateNumberParts="1" catenateWords="0"
      catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"
      preserveOriginal="1" />        
    <filter class="solr.LowerCaseFilterFactory" />
    <filter class="solr.PorterStemFilterFactory"/> 
  </analyzer>
</fieldType>
Test
Next, let's write a unit test to verify the change.
/**
 * Loads the real solrconfig.xml/schema.xml from CONF_FOLDER, then prints the
 * index-time and query-time analysis of "files" / "files*" for the text_rev
 * field type, and runs an index-then-search round trip via
 * {@link #testIndexerSearcher}.
 *
 * NOTE(review): exceptions are only printed, so the test cannot fail on
 * error — consider rethrowing or asserting instead.
 */
public void testWildcardStemFromSchema() {
  try {
    URLClassLoader urlClassLoader = (URLClassLoader) ClassLoader
        .getSystemClassLoader();
    // DynamicURLClassLoader lets us add CONF_FOLDER to the classpath at
    // runtime so Solr can resolve stopwords.txt etc. as resources. From:
    // http://www.hangar.org/docs/activitats/SummerLAB/Pure%20Data/OSC%20-%20OpenSoundControl/SwingOSC/src/de/sciss/util/DynamicURLClassLoader.java
    DynamicURLClassLoader dynaLoader = new DynamicURLClassLoader(
        urlClassLoader);
    dynaLoader.addURL(new File(CONF_FOLDER).toURI().toURL());
    Thread.currentThread().setContextClassLoader(dynaLoader);
    // Parse the real solrconfig.xml and schema.xml so the analyzers under
    // test are exactly the ones Solr would build.
    InputSource solrCfgIs = new InputSource(new FileReader(new File(
        CONF_FOLDER, "solrconfig.xml")));
    SolrConfig solrConfig = new SolrConfig(null, solrCfgIs);
    InputSource solrSchemaIs = new InputSource(new FileReader(new File(
        CONF_FOLDER, "schema.xml")));
    IndexSchema solrSchema = new IndexSchema(solrConfig, "mySchema",
        solrSchemaIs);
    Map<String, FieldType> fieldTypes = solrSchema.getFieldTypes();
    listAllFieldTypes(fieldTypes);

    // now test text_rev
    String inputText = "files";
    FieldType fieldTypeText = fieldTypes.get("text_rev");
    Analyzer indexAnalyzer = fieldTypeText.getIndexAnalyzer();
    Analyzer queryAnalyzer = fieldTypeText.getQueryAnalyzer();
    System.out.println("Indexing analysis:");
    testIndexerSearcher(solrSchema, indexAnalyzer, queryAnalyzer);
    // Dump every token the index analyzer produces for "files" — with the
    // KeywordRepeat fix we expect both the stemmed and unstemmed (and
    // reversed) forms.
    TokenStream tokenStream = indexAnalyzer.tokenStream("content",
        new StringReader(inputText));
    CharTermAttribute termAttr = tokenStream
        .getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = tokenStream
        .getAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = tokenStream
        .getAttribute(TypeAttribute.class);
    // TokenStream contract: reset() before incrementToken(), then end() and
    // close() when done.
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      System.out.println(termAttr.toString() + " offset: "
          + offsetAttr.startOffset() + ":" + offsetAttr.endOffset()
          + ", type:" + typeAttr.type());
    }
    tokenStream.end();
    tokenStream.close();
    // Now show how the query analyzer treats the wildcard query text.
    String searchText = "files*";
    System.out.println("\r\nQuerying analysis:");
    tokenStream = queryAnalyzer.tokenStream("content", new StringReader(
        searchText));
    tokenStream.reset();
    CharTermAttribute termAttr2 = (CharTermAttribute) tokenStream
        .getAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
      System.out.println(termAttr2.toString());
    }
    tokenStream.end();
    tokenStream.close();
  } catch (Exception e) {
    // NOTE(review): swallowed — failures only show in the console output.
    e.printStackTrace();
  }
}

/**
 * Indexes a single document ("files" in the "content" field) with the
 * schema's index analyzer, then searches it with the query "files*" parsed
 * through the query analyzer, printing the parsed query and the explain
 * output for each hit.
 *
 * @param solrSchema    schema used to create the "content" field value
 * @param indexAnalyzer analyzer applied at index time
 * @param queryAnalyzer analyzer applied by the QueryParser
 * @throws IOException    on index read/write failure
 * @throws ParseException if "files*" cannot be parsed
 */
private void testIndexerSearcher(IndexSchema solrSchema,
    Analyzer indexAnalyzer, Analyzer queryAnalyzer) throws IOException,
    ParseException {
  IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9,
      indexAnalyzer);
  // recreate the index on each execution
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setUseCompoundFile(false);
  // if we setInfoStream, add the below annotation to the TestClass
  // @SuppressSysoutChecks(bugUrl = "Solr logs to JUL")
  // config.setInfoStream(System.out);
  // be sure to close Directory and IndexWriter
  try (Directory directory = FSDirectory.open(new File(FILE_PATH));
      IndexWriter writer = new IndexWriter(directory, config)) {
    Document doc = new Document();
    IndexableField field = solrSchema.getField("content").createField(
        "files", 1.0f);
    doc.add(field);
    writer.addDocument(doc);
    writer.commit();
  }
  try (Directory directory = FSDirectory.open(new File(FILE_PATH));
      DirectoryReader indexReader = DirectoryReader.open(directory);) {
    IndexSearcher searcher = new IndexSearcher(indexReader);
    QueryParser queryParser = new QueryParser(Version.LUCENE_4_9,
        "content", queryAnalyzer);
    // "files*" becomes a PrefixQuery; it matches because the unstemmed
    // token "files" was kept in the index by KeywordRepeatFilterFactory.
    Query query = queryParser.parse("files*");
    System.out.println("queryParser query:" + query.toString());
    TopDocs docs = searcher.search(query, 10);
    LuceneUtil.printAndExplaunSearchResult(searcher, docs, query,
        "content");
  }
}

/**
 * Rebuilds the same index-time analysis chain programmatically — factory by
 * factory, instead of reading it from schema.xml — and prints the tokens
 * produced for "pictures files".
 */
public void testUsingAnalyzer() {
  try {
    URLClassLoader urlClassLoader = (URLClassLoader) ClassLoader
        .getSystemClassLoader();
    // from
    // http://www.hangar.org/docs/activitats/SummerLAB/Pure%20Data/OSC%20-%20OpenSoundControl/SwingOSC/src/de/sciss/util/DynamicURLClassLoader.java
    DynamicURLClassLoader dynaLoader = new DynamicURLClassLoader(
        urlClassLoader);
    dynaLoader.addURL(new File(CONF_FOLDER).toURI().toURL());
    Thread.currentThread().setContextClassLoader(dynaLoader);
    StringReader inputText = new StringReader("pictures files");
    // Every analysis factory requires luceneMatchVersion in its args map.
    Map<String, String> commonArgs = ImmutableMap
        .<String, String> builder()
        .put(AbstractAnalysisFactory.LUCENE_MATCH_VERSION_PARAM,
            Version.LUCENE_4_9 + "").build();
    // These factories remove consumed elements from the args map — see
    // org.apache.lucene.analysis.util.AbstractAnalysisFactory.AbstractAnalysisFactory(Map<String, String>):
    //   args.remove(CLASS_NAME); // consume the class arg
    // Why do they remove values from the map? So the factory can detect
    // unwanted parameters and catch typos, e.g. in
    // org.apache.lucene.analysis.core.WhitespaceTokenizerFactory.WhitespaceTokenizerFactory(Map<String, String>):
    //   if (!args.isEmpty()) {
    //     throw new IllegalArgumentException("Unknown parameters: " + args);
    //   }
    // This is also why each factory below gets its own fresh HashMap copy.
    TokenizerFactory tkf = new WhitespaceTokenizerFactory(
        new HashMap<String, String>(commonArgs));
    Tokenizer tkz = tkf.create(inputText);
    HashMap<String, String> stopFilterParmas = new HashMap<String, String>(
        commonArgs);
    stopFilterParmas.put("words", "stopwords.txt");
    // CONF_FOLDER is added to classpath
    ResourceLoader loader = new ClasspathResourceLoader();
    // ResourceLoader loader = new FilesystemResourceLoader(new File(
    //     CONF_FOLDER));
    StopFilterFactory stf = new StopFilterFactory(stopFilterParmas);
    stf.inform(loader);
    TokenStream st = stf.create(tkz);
    WordDelimiterFilterFactory wdff = new WordDelimiterFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenFilter wdf = wdff.create(st);
    LowerCaseFilterFactory lcf = new LowerCaseFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream lcts = lcf.create(wdf);
    // Same KeywordRepeat -> PorterStem -> RemoveDuplicates sandwich as the
    // schema's index analyzer.
    KeywordRepeatFilterFactory krff = new KeywordRepeatFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream kdrf = krff.create(lcts);
    TokenFilterFactory psff = new PorterStemFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream psf = psff.create(kdrf);
    RemoveDuplicatesTokenFilterFactory rdtff = new RemoveDuplicatesTokenFilterFactory(
        new HashMap<String, String>(commonArgs));
    RemoveDuplicatesTokenFilter rdtf = rdtff.create(psf);
    ReversedWildcardFilterFactory rwff = new ReversedWildcardFilterFactory(
        new HashMap<String, String>(commonArgs));
    TokenStream rwf = rwff.create(rdtf);
    CharTermAttribute termAttrib = (CharTermAttribute) rwf
        .getAttribute(CharTermAttribute.class);
    rwf.reset();
    while (rwf.incrementToken()) {
      String term = termAttrib.toString();
      System.out.println(term);
    }
    rwf.end();
    rwf.close();
  } catch (Exception e) {
    // NOTE(review): swallowed — failures only show in the console output.
    e.printStackTrace();
  }
}
Output of the test case:
text_rev:TokenizerChain(org.apache.lucene.analysis.core.WhitespaceTokenizerFactory@48ae9b55, org.apache.lucene.analysis.core.StopFilterFactory@1700915, org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory@21de60b4, org.apache.lucene.analysis.core.LowerCaseFilterFactory@c267ef4, org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilterFactory@30ee2816, org.apache.lucene.analysis.en.PorterStemFilterFactory@31d7b7bf, org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilterFactory@635eaaf1, org.apache.solr.analysis.ReversedWildcardFilterFactory@5c30a9b0)
queryParser query:content:files*
Found : 1 hits.
1. files
1.0 = (MATCH) ConstantScore(content:files*), product of:
  1.0 = boost
  1.0 = queryNorm

Indexing analysis:
selif offset: 0:5, type:word
files offset: 0:5, type:word
elif offset: 0:5, type:word
file offset: 0:5, type:word

Querying analysis:
files*

file
References
Stemming not working with wildcard search
Testing Solr schema, analyzers and tokenization
Post a Comment

Labels

Java (159) Lucene-Solr (112) Interview (61) All (58) J2SE (53) Algorithm (45) Soft Skills (38) Eclipse (33) Code Example (31) Linux (25) JavaScript (23) Spring (22) Windows (22) Web Development (20) Tools (19) Nutch2 (18) Bugs (17) Debug (16) Defects (14) Text Mining (14) J2EE (13) Network (13) Troubleshooting (13) PowerShell (11) Chrome (9) Design (9) How to (9) Learning code (9) Performance (9) Problem Solving (9) UIMA (9) html (9) Http Client (8) Maven (8) Security (8) bat (8) blogger (8) Big Data (7) Continuous Integration (7) Google (7) Guava (7) JSON (7) Shell (7) ANT (6) Coding Skills (6) Database (6) Lesson Learned (6) Programmer Skills (6) Scala (6) Tips (6) css (6) Algorithm Series (5) Cache (5) Dynamic Languages (5) IDE (5) System Design (5) adsense (5) xml (5) AIX (4) Code Quality (4) GAE (4) Git (4) Good Programming Practices (4) Jackson (4) Memory Usage (4) Miscs (4) OpenNLP (4) Project Managment (4) Spark (4) Testing (4) ads (4) regular-expression (4) Android (3) Apache Spark (3) Become a Better You (3) Concurrency (3) Eclipse RCP (3) English (3) Happy Hacking (3) IBM (3) J2SE Knowledge Series (3) JAX-RS (3) Jetty (3) Restful Web Service (3) Script (3) regex (3) seo (3) .Net (2) Android Studio (2) Apache (2) Apache Procrun (2) Architecture (2) Batch (2) Bit Operation (2) Build (2) Building Scalable Web Sites (2) C# (2) C/C++ (2) CSV (2) Career (2) Cassandra (2) Distributed (2) Fiddler (2) Firefox (2) Google Drive (2) Gson (2) How to Interview (2) Html Parser (2) Http (2) Image Tools (2) JQuery (2) Jersey (2) LDAP (2) Life (2) Logging (2) Python (2) Software Issues (2) Storage (2) Text Search (2) xml parser (2) AOP (1) Application Design (1) AspectJ (1) Chrome DevTools (1) Cloud (1) Codility (1) Data Mining (1) Data Structure (1) ExceptionUtils (1) Exif (1) Feature Request (1) FindBugs (1) Greasemonkey (1) HTML5 (1) Httpd (1) I18N (1) IBM Java Thread Dump Analyzer (1) JDK Source Code (1) JDK8 (1) JMX (1) Lazy Developer (1) Mac (1) Machine Learning (1) 
Mobile (1) My Plan for 2010 (1) Netbeans (1) Notes (1) Operating System (1) Perl (1) Problems (1) Product Architecture (1) Programming Life (1) Quality (1) Redhat (1) Redis (1) Review (1) RxJava (1) Solutions logs (1) Team Management (1) Thread Dump Analyzer (1) Visualization (1) boilerpipe (1) htm (1) ongoing (1) procrun (1) rss (1)

Popular Posts