My colleague ran into a problem when trying to import into Solr a CSV file that contains double quotes in a column value.
I looked at the CSV standard:
If double-quotes are used to enclose fields, then a double-quote appearing inside a field must be escaped by preceding it with another double quote.
For example:
"aaa","b""bb","ccc"
After adding another preceding double quote, the import works.
Implementation in Solr
When Solr imports a CSV file, it honors the CSV standard.
In Solr, the default encapsulator is also ". Please refer to: Updating a Solr Index with CSV
From org.apache.solr.internal.csv.CSVParser.encapsulatedTokenLexer(Token, int), we can see how Solr parses a value.
I looked at the CSV standard:
If double-quotes are used to enclose fields, then a double-quote appearing inside a field must be escaped by preceding it with another double quote.
For example:
"aaa","b""bb","ccc"
After adding another preceding double quote, the import works.
Implementation in Solr
When Solr imports a CSV file, it honors the CSV standard.
In Solr, the default encapsulator is also ". Please refer to: Updating a Solr Index with CSV
From org.apache.solr.internal.csv.CSVParser.encapsulatedTokenLexer(Token, int), we can see how Solr parses a value.
/**
 * Lexes the remainder of an encapsulated (quoted) field into {@code tkn}.
 *
 * <p>Characters are read one at a time from {@code in} and appended to
 * {@code tkn.content} until the closing encapsulator is found:
 * <ul>
 *   <li>a backslash followed by {@code 'u'} is handed to
 *       {@code unicodeEscapeLexer} when the strategy enables unicode escape
 *       interpretation;</li>
 *   <li>the strategy's escape character is handed to {@code readEscape};</li>
 *   <li>two consecutive encapsulators contribute one literal encapsulator
 *       character to the token;</li>
 *   <li>a lone encapsulator closes the field; after it, whitespace is skipped
 *       until a delimiter, an end of line, or the end of the file is seen.</li>
 * </ul>
 *
 * <p>NOTE(review): {@code startLineNumber} is used in the EOF error message but
 * is not declared anywhere in this excerpt - presumably it is defined elsewhere
 * in the enclosing class or was dropped from the snippet; confirm.
 *
 * @param tkn the token being filled in; its {@code content}, {@code type} and
 *            {@code isReady} members are written
 * @param c   overwritten by the first read before it is ever inspected, so the
 *            value passed in is not used
 * @return the same {@code tkn} instance, with {@code isReady} set to
 *         {@code true} and {@code type} set to {@code TT_TOKEN},
 *         {@code TT_EORECORD} or {@code TT_EOF}
 * @throws IOException if the input ends before the closing encapsulator, or if
 *         a non-whitespace character appears between the closing encapsulator
 *         and the next delimiter / end of line / end of file
 */
private Token encapsulatedTokenLexer(Token tkn, int c) throws IOException {
  while (true) {
    c = in.read();

    // Unicode escape sequence: only honoured when the strategy asks for it.
    if (c == '\\' && strategy.getUnicodeEscapeInterpretation() && in.lookAhead() == 'u') {
      tkn.content.append((char) unicodeEscapeLexer(c));
      continue;
    }

    // Strategy-defined escape character.
    if (c == strategy.getEscape()) {
      tkn.content.append((char) readEscape(c));
      continue;
    }

    // Anything that is not an encapsulator is either premature EOF or payload.
    if (c != strategy.getEncapsulator()) {
      if (isEndOfFile(c)) {
        // The field was opened but never closed.
        throw new IOException(
            "(startline " + startLineNumber + ")"
                + "eof reached before encapsulated token finished");
      }
      tkn.content.append((char) c);
      continue;
    }

    // An encapsulator was read. If the next character is another encapsulator,
    // the pair stands for one literal encapsulator inside the field.
    if (in.lookAhead() == strategy.getEncapsulator()) {
      c = in.read();
      tkn.content.append((char) c);
      continue;
    }

    // Closing encapsulator: skip whitespace until the field terminator.
    while (true) {
      c = in.read();
      if (c == strategy.getDelimiter()) {
        tkn.type = TT_TOKEN;
      } else if (isEndOfFile(c)) {
        tkn.type = TT_EOF;
      } else if (isEndOfLine(c)) {
        tkn.type = TT_EORECORD;
      } else if (isWhitespace(c)) {
        continue;
      } else {
        // Only whitespace may sit between the closing quote and the terminator.
        throw new IOException(
            "(line " + getLineNumber()
                + ") invalid char between encapsulated token end delimiter");
      }
      tkn.isReady = true;
      return tkn;
    }
  }
}