Solr: Use UpdateRequestProcessor to Round Data


We can extend UpdateRequestProcessor to make Solr do many things during indexing: clean data, transform data, etc.

Sometimes we need to round the incoming data. For example, given a date value such as 2012-12-21T12:12:12.234Z, the customer may only care about the date part and not about the hour, minute, or second parts.

So, to reduce index size and improve query performance, we can use an UpdateRequestProcessor to round the date to 2012-12-21T00:00:00Z.
In solrconfig.xml, we can configure a processor that specifies which fields to round and to what precision. In the following configuration, we round them so that only the date part is kept.
<!-- Update chain that rounds the configured date fields before documents are indexed. -->
<updateRequestProcessorChain name="dateRoundChain">
  <processor class="solr.LogUpdateProcessorFactory" />
  <processor class="org.codeexample.jeffery.solr.DateRoundProcessorFactory">
    <!-- When true, a value that cannot be rounded is left unchanged instead of failing the update. -->
    <bool name="ignoreError">true</bool>
    <!-- date.fields and date.round.fields are matched by position and must have the same length. -->
    <str name="date.fields">access_time,modify_time,mtm</str>
    <str name="date.round.fields">day,day,day</str>
  </processor>
  <processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>

  <!-- CSV import endpoint whose default update.chain is dateRoundChain. -->
  <requestHandler name="/import/csv" class="solr.CSVRequestHandler">
    <lst name="defaults">
      <str name="stream.contentType">application/csv</str>
      <str name="update.chain">dateRoundChain</str>
    </lst>
  </requestHandler>

The code is like below:
It currently only supports rounding a date to day or second precision, but you can easily add code to round to year, month, hour, or minute precision.
package org.codeexample.jeffery.solr;
/**
 * Factory for an update processor that truncates configured date fields to a
 * coarser precision before a document is indexed.
 *
 * <p>Configuration (init args):
 * <ul>
 * <li>{@code date.fields} - comma separated names of the fields to round;</li>
 * <li>{@code date.round.fields} - comma separated rounding units, matched to
 * {@code date.fields} by position. {@code day} and {@code second} are
 * supported (case-insensitive); any other unit leaves the field untouched;</li>
 * <li>{@code ignoreError} - when {@code true}, a value that cannot be rounded
 * is left unchanged instead of failing the add. Defaults to {@code false}.</li>
 * </ul>
 *
 * <p>Only single String values are rounded. All parsing and formatting is done
 * in UTC, matching the trailing {@code Z} of the date strings.
 */
public class DateRoundProcessorFactory extends UpdateRequestProcessorFactory {

	private static final String ROUND_DAY = "DAY";
	// The time-of-day part is emitted as literal zeros, which truncates to the day.
	private static final String FORMAT_DAY = "yyyy-MM-dd'T'00:00:00.0'Z'";

	private static final String ROUND_SECOND = "SECOND";
	private static final String FORMAT_SECOND = "yyyy-MM-dd'T'HH:mm:ss'Z'";

	// Accepted input layouts: with and without fractional seconds.
	private static final String PARSE_WITH_MILLIS = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
	private static final String PARSE_WITHOUT_MILLIS = "yyyy-MM-dd'T'HH:mm:ss'Z'";

	private List<String> dateFields;
	private List<String> dateRoundFields;
	// When true, per-field failures in processAdd are skipped instead of rethrown.
	private boolean ignoreError;

	/**
	 * Reads {@code date.fields}, {@code date.round.fields} and
	 * {@code ignoreError} from the init args.
	 *
	 * @param args the processor's init args; {@code null} leaves the factory
	 *            unconfigured, in which case documents pass through unchanged
	 * @throws IllegalArgumentException if only one of the two lists is given or
	 *             their sizes differ
	 */
	@SuppressWarnings("rawtypes")
	@Override
	public void init(final NamedList args) {
		if (args == null) {
			return;
		}
		SolrParams params = SolrParams.toSolrParams(args);

		Object fields = args.get("date.fields");
		dateFields = fields == null ? null : StrUtils.splitSmart(
				(String) fields, ",", true);

		Object roundUnits = args.get("date.round.fields");
		dateRoundFields = roundUnits == null ? null : StrUtils.splitSmart(
				(String) roundUnits, ",", true);

		// Both lists must be absent together, or present with equal sizes.
		boolean consistent;
		if (dateFields == null || dateRoundFields == null) {
			consistent = dateFields == null && dateRoundFields == null;
		} else {
			consistent = dateFields.size() == dateRoundFields.size();
		}
		if (!consistent) {
			throw new IllegalArgumentException(
					"Size of date.fields and date.round.fields must be same.");
		}
		ignoreError = params.getBool("ignoreError", false);
	}

	@Override
	public UpdateRequestProcessor getInstance(SolrQueryRequest req,
			SolrQueryResponse rsp, UpdateRequestProcessor next) {
		return new DateRoundProcessor(req, next);
	}

	/** Rounds the configured date fields of each added document. */
	class DateRoundProcessor extends UpdateRequestProcessor {
		public DateRoundProcessor(SolrQueryRequest req,
				UpdateRequestProcessor next) {
			super(next);
		}

		/**
		 * Rounds every configured field present in the document, then passes
		 * the command on to the next processor.
		 *
		 * @throws IOException if a field cannot be rounded and
		 *             {@code ignoreError} is {@code false}
		 */
		@Override
		public void processAdd(AddUpdateCommand cmd) throws IOException {
			// Nothing is configured when init() saw no date.fields (or no args).
			if (dateFields != null) {
				SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
				for (int i = 0; i < dateFields.size(); i++) {
					try {
						roundField(solrInputDocument, dateFields.get(i),
								dateRoundFields.get(i));
					} catch (Exception ex) {
						// Best effort when ignoreError is set: keep the original value.
						if (!ignoreError) {
							throw new IOException(ex);
						}
					}
				}
			}
			super.processAdd(cmd);
		}

		/** Replaces one field's single String value with its rounded form. */
		private void roundField(SolrInputDocument doc, String dateField,
				String roundTo) throws ParseException {
			SolrInputField inputField = doc.getField(dateField);
			if (inputField == null) {
				return;
			}
			Object obj = inputField.getValue();
			if (!(obj instanceof String)) {
				return;
			}
			Date solrDate = parseSolrDate((String) obj);

			String pattern = null;
			if (ROUND_DAY.equalsIgnoreCase(roundTo)) {
				pattern = FORMAT_DAY;
			} else if (ROUND_SECOND.equalsIgnoreCase(roundTo)) {
				pattern = FORMAT_SECOND;
			}
			if (pattern == null) {
				// Unsupported rounding unit: leave the field as it is.
				return;
			}

			// Format in UTC so the output agrees with its trailing 'Z'.
			String result = newUtcFormat(pattern).format(solrDate);
			// Only replace the field once the new value was produced without error.
			doc.removeField(dateField);
			doc.addField(dateField, result);
		}
	}

	/**
	 * Parses a UTC date string of the form {@code yyyy-MM-dd'T'HH:mm:ss'Z'},
	 * with or without a fractional-seconds part.
	 *
	 * @param dateString the date text to parse
	 * @return the parsed instant
	 * @throws ParseException if the text matches neither layout
	 */
	public Date parseSolrDate(String dateString) throws ParseException {
		// A '.' can only appear in front of the fractional seconds.
		String pattern = dateString.indexOf('.') >= 0 ? PARSE_WITH_MILLIS
				: PARSE_WITHOUT_MILLIS;
		return newUtcFormat(pattern).parse(dateString);
	}

	/**
	 * Builds a fresh UTC formatter for the given pattern. A new instance is
	 * created per use because SimpleDateFormat is not thread-safe.
	 */
	private static SimpleDateFormat newUtcFormat(String pattern) {
		SimpleDateFormat sdf = new SimpleDateFormat(pattern, Locale.US);
		sdf.setTimeZone(TimeZone.getTimeZone("UTC"));
		return sdf;
	}
}

You can view the complete source code here:
https://github.com/jefferyyuan/solr.misc

Labels

adsense (5) Algorithm (69) Algorithm Series (35) Android (7) ANT (6) bat (8) Big Data (7) Blogger (14) Bugs (6) Cache (5) Chrome (19) Code Example (29) Code Quality (7) Coding Skills (5) Database (7) Debug (16) Design (5) Dev Tips (63) Eclipse (32) Git (5) Google (33) Guava (7) How to (9) Http Client (8) IDE (7) Interview (88) J2EE (13) J2SE (49) Java (186) JavaScript (27) JSON (7) Learning code (9) Lesson Learned (6) Linux (26) Lucene-Solr (112) Mac (10) Maven (8) Network (9) Nutch2 (18) Performance (9) PowerShell (11) Problem Solving (11) Programmer Skills (6) regex (5) Scala (6) Security (9) Soft Skills (38) Spring (22) System Design (11) Testing (7) Text Mining (14) Tips (17) Tools (24) Troubleshooting (29) UIMA (9) Web Development (19) Windows (21) xml (5)