/*
* Copyright 2013 Cloudera Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.cdk.morphline.solrcell;
import java.util.Collection;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.handler.extraction.SolrContentHandlerFactory;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.tika.metadata.Metadata;
/**
* SolrContentHandler and associated factory that strips non-characters and trims on output.
* This prevents exceptions on parsing integer fields inside Solr server.
*/
public class StripNonCharSolrContentHandlerFactory extends SolrContentHandlerFactory {

  /**
   * Creates a factory whose handlers are constructed with the given date formats.
   *
   * @param dateFormats date format patterns, passed through unchanged to the superclass
   */
  public StripNonCharSolrContentHandlerFactory(Collection<String> dateFormats) {
    super(dateFormats);
  }

  /**
   * Returns a content handler that cleans every transformed field value; see
   * {@link StripNonCharSolrContentHandler#transformValue(String, SchemaField)}.
   */
  @Override
  public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
    return new StripNonCharSolrContentHandler(metadata, params, schema, dateFormats);
  }

  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////

  /**
   * Content handler that removes Unicode noncharacters and most C0 control characters from
   * each transformed value and then trims it.
   */
  private static final class StripNonCharSolrContentHandler extends SolrContentHandler {

    public StripNonCharSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema, Collection<String> dateFormats) {
      super(metadata, params, schema, dateFormats);
    }

    /**
     * Strip all non-characters, which can cause SolrReducer problems if present.
     * This is borrowed from Apache Nutch.
     *
     * <p>The input is walked code point by code point (not char by char) so that
     * noncharacters outside the Basic Multilingual Plane, which are encoded as surrogate
     * pairs, are recognized and removed as a unit.
     *
     * @param input the text to clean; must not be null
     * @return a copy of {@code input} without the code points rejected by
     *         {@link #isStrippable(int)}
     */
    private static String stripNonCharCodepoints(String input) {
      final int length = input.length();
      StringBuilder stripped = new StringBuilder(length);
      int index = 0;
      while (index < length) {
        int codePoint = input.codePointAt(index);
        // Advance by one or two chars depending on whether this was a surrogate pair.
        index += Character.charCount(codePoint);
        if (!isStrippable(codePoint)) {
          stripped.appendCodePoint(codePoint);
        }
      }
      return stripped.toString();
    }

    /**
     * Tells whether a code point must be removed: Unicode noncharacters
     * http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
     * and control characters below U+0020 except tabulator, new line and carriage return.
     */
    private static boolean isStrippable(int codePoint) {
      // Last two code points of every plane: U+FFFE, U+FFFF, U+1FFFE, ... U+10FFFF.
      if ((codePoint & 0xFFFE) == 0xFFFE) {
        return true;
      }
      // Contiguous noncharacter range U+FDD0..U+FDEF, both ends inclusive.
      if (codePoint >= 0xFDD0 && codePoint <= 0xFDEF) {
        return true;
      }
      // C0 controls, keeping tab (0x9), line feed (0xA) and carriage return (0xD).
      return codePoint <= 0x1F && codePoint != 0x9 && codePoint != 0xA && codePoint != 0xD;
    }

    /**
     * Applies the superclass transformation, then strips unwanted code points and trims.
     *
     * <p>Stripping happens before trimming so that whitespace which was adjacent to a
     * removed code point (for example {@code "12 \uFFFF"}) does not survive at either end
     * of the returned value.
     */
    @Override
    protected String transformValue(String val, SchemaField schemaField) {
      String transformed = super.transformValue(val, schemaField);
      return stripNonCharCodepoints(transformed).trim();
    }
  }
}