/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.solr.morphlines.cell; import java.util.Collection; import org.apache.solr.common.params.SolrParams; import org.apache.solr.handler.extraction.SolrContentHandler; import org.apache.solr.handler.extraction.SolrContentHandlerFactory; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; import org.apache.tika.metadata.Metadata; /** * {@link SolrContentHandler} and associated factory that strips non-characters and trims on output. * This prevents exceptions on parsing integer fields inside Solr server. */ public class StripNonCharSolrContentHandlerFactory extends SolrContentHandlerFactory { public StripNonCharSolrContentHandlerFactory(Collection<String> dateFormats) { super(dateFormats); } @Override public SolrContentHandler createSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { return new StripNonCharSolrContentHandler(metadata, params, schema, dateFormats); } /////////////////////////////////////////////////////////////////////////////// // Nested classes: /////////////////////////////////////////////////////////////////////////////// private static final class StripNonCharSolrContentHandler extends SolrContentHandler { public StripNonCharSolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema, Collection<String> dateFormats) { super(metadata, params, schema, dateFormats); } /** * Strip all non-characters, which can cause SolrReducer problems if present. * This is borrowed from Apache Nutch. */ private static String stripNonCharCodepoints(String input) { StringBuilder stripped = new StringBuilder(input.length()); char ch; for (int i = 0; i < input.length(); i++) { ch = input.charAt(i); // Strip all non-characters http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:] // and non-printable control characters except tabulator, new line and carriage return if (ch % 0x10000 != 0xffff && // 0xffff - 0x10ffff range step 0x10000 ch % 0x10000 != 0xfffe && // 0xfffe - 0x10fffe range (ch <= 0xfdd0 || ch >= 0xfdef) && // 0xfdd0 - 0xfdef (ch > 0x1F || ch == 0x9 || ch == 0xa || ch == 0xd)) { stripped.append(ch); } } return stripped.toString(); } @Override protected String transformValue(String val, SchemaField schemaField) { String ret = super.transformValue(val, schemaField).trim(); ret = stripNonCharCodepoints(ret); return ret; } } }