/*
 * Copyright 2016 Christoph Böhme
 *
 * Licensed under the Apache License, Version 2.0 the "License";
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.culturegraph.mf.strings;

import java.text.Normalizer;
import java.text.Normalizer.Form;

import org.culturegraph.mf.framework.FluxCommand;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.culturegraph.mf.framework.helpers.DefaultStreamPipe;

/**
 * Applies Unicode normalisation to record identifiers, entity and literal
 * names, and literal values while forwarding stream events to the downstream
 * receiver. Unicode normalisation converts between precomposed and decomposed
 * character sequences. The form of conversion is selected with
 * {@link #setNormalizationForm(Form)}.
 * <p>
 * In the default configuration only literal values are converted. Use
 * {@link #setNormalizeIds(boolean)}, {@link #setNormalizeKeys(boolean)} and
 * {@link #setNormalizeValues(boolean)} to change which parts of the stream
 * are normalised.
 * <p>
 * {@code null} strings are passed through unchanged.
 *
 * @author Christoph Böhme
 */
@Description("Normalises composed and decomposed Unicode characters.")
@In(StreamReceiver.class)
@Out(StreamReceiver.class)
@FluxCommand("normalize-unicode-stream")
public final class StreamUnicodeNormalizer
        extends DefaultStreamPipe<StreamReceiver> {

    /**
     * The normalisation form that is used unless another one is chosen via
     * {@link #setNormalizationForm(Form)}.
     */
    public static final Normalizer.Form DEFAULT_NORMALIZATION_FORM =
            Normalizer.Form.NFC;

    private boolean normalizeIds;
    private boolean normalizeKeys;
    private boolean normalizeValues = true;
    private Normalizer.Form normalizationForm = DEFAULT_NORMALIZATION_FORM;

    /**
     * Switches normalisation of record identifiers on or off. Record
     * identifiers are not normalised by default.
     * <p>
     * The setting may be changed at any time and applies to every
     * <i>start-record</i> event received afterwards.
     *
     * @param normalizeIds {@code true} to normalise record identifiers,
     *                     {@code false} to pass them on unchanged.
     */
    public void setNormalizeIds(final boolean normalizeIds) {
        this.normalizeIds = normalizeIds;
    }

    /**
     * Tells whether record identifiers are normalised.
     *
     * @return {@code true} if record identifiers are normalised.
     */
    public boolean getNormalizeIds() {
        return normalizeIds;
    }

    /**
     * Switches normalisation of entity and literal names on or off. Names
     * are not normalised by default.
     * <p>
     * The setting may be changed at any time and applies to every
     * <i>start-entity</i> and <i>literal</i> event received afterwards.
     *
     * @param normalizeKeys {@code true} to normalise entity and literal
     *                      names, {@code false} to pass them on unchanged.
     */
    public void setNormalizeKeys(final boolean normalizeKeys) {
        this.normalizeKeys = normalizeKeys;
    }

    /**
     * Tells whether entity and literal names are normalised.
     *
     * @return {@code true} if entity and literal names are normalised.
     */
    public boolean getNormalizeKeys() {
        return normalizeKeys;
    }

    /**
     * Switches normalisation of literal values on or off. Literal values
     * are normalised by default.
     * <p>
     * The setting may be changed at any time and applies to every
     * <i>literal</i> event received afterwards.
     *
     * @param normalizeValues {@code true} to normalise literal values,
     *                        {@code false} to pass them on unchanged.
     */
    public void setNormalizeValues(final boolean normalizeValues) {
        this.normalizeValues = normalizeValues;
    }

    /**
     * Tells whether literal values are normalised.
     *
     * @return {@code true} if literal values are normalised.
     */
    public boolean getNormalizeValues() {
        return normalizeValues;
    }

    /**
     * Chooses the normalisation form that is applied to identifiers, names
     * and values.
     * <p>
     * The default value is {@link #DEFAULT_NORMALIZATION_FORM}.
     * <p>
     * The form may be changed at any time during processing and applies to
     * every event received afterwards.
     *
     * @param normalizationForm the normalisation form to use.
     */
    public void setNormalizationForm(
            final Normalizer.Form normalizationForm) {
        this.normalizationForm = normalizationForm;
    }

    /**
     * Returns the normalisation form currently in use.
     *
     * @return the normalisation form applied to identifiers, names and
     *         values.
     */
    public Normalizer.Form getNormalizationForm() {
        return normalizationForm;
    }

    @Override
    public void startRecord(final String identifier) {
        final String recordId = normalizeIf(normalizeIds, identifier);
        getReceiver().startRecord(recordId);
    }

    @Override
    public void endRecord() {
        getReceiver().endRecord();
    }

    @Override
    public void startEntity(final String name) {
        final String entityName = normalizeIf(normalizeKeys, name);
        getReceiver().startEntity(entityName);
    }

    @Override
    public void endEntity() {
        getReceiver().endEntity();
    }

    @Override
    public void literal(final String name, final String value) {
        // The name is processed before the value, and both before the
        // receiver is invoked.
        final String literalName = normalizeIf(normalizeKeys, name);
        final String literalValue = normalizeIf(normalizeValues, value);
        getReceiver().literal(literalName, literalValue);
    }

    /**
     * Normalises {@code text} with the configured normalisation form if
     * normalisation is enabled for the kind of string being processed.
     *
     * @param enabled whether the string should be normalised at all.
     * @param text    the string to normalise; may be {@code null}.
     * @return {@code text} itself if {@code enabled} is {@code false} or
     *         {@code text} is {@code null}, otherwise the normalised string.
     */
    private String normalizeIf(final boolean enabled, final String text) {
        if (!enabled) {
            return text;
        }
        if (text == null) {
            return null;
        }
        return Normalizer.normalize(text, normalizationForm);
    }

}