/*
 * Copyright 2007 T-Rank AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package no.trank.openpipe.step;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import no.trank.openpipe.api.MultiInputOutputFieldPipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.document.AnnotatedField;
import no.trank.openpipe.api.document.BaseAnnotatedField;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.config.annotation.NotNull;

/**
 * This step offers java regex functionality.
 * <p>
 * Every input field value is searched with {@link Matcher#find()} against the <em>from</em> pattern. A value that
 * contains a match is rewritten with {@link Matcher#replaceAll(String)} using the <em>to</em> pattern. A value
 * without a match is copied unchanged when <code>copyOnMiss</code> is set and dropped otherwise. When no output
 * value is produced at all, the output field is removed if <code>deleteOnEmpty</code> is set and left untouched
 * otherwise.
 *
 * @version $Revision$
 */
public class RegexField extends MultiInputOutputFieldPipelineStep {
    private static final Logger log = LoggerFactory.getLogger(RegexField.class);

    /** Compiled pattern that the input field values are searched with. */
    @NotNull
    private Pattern fromPattern;
    /** Replacement passed to {@link Matcher#replaceAll(String)} for values that match. */
    @NotNull
    private String toPattern;
    /** Whether a value without a match is copied unchanged to the output. */
    private boolean copyOnMiss;
    /** Whether the output field is removed when no output value was produced; enabled by default. */
    private boolean deleteOnEmpty = true;
    /** Whether a missing (or null) input value is processed as the empty string. */
    private boolean nullIsBlank;

    /**
     * Creates the step, forwarding <code>false</code> to the {@link MultiInputOutputFieldPipelineStep} constructor.
     */
    public RegexField() {
        super(false);
    }

    /**
     * Applies the regex to the values of one input field and stores the result in the output field.
     *
     * @param doc the document whose output field is set or removed
     * @param inputFieldName the name of the input field, used for logging only
     * @param inputFields the input field values to process
     * @param outputFieldName the name of the field receiving the produced values
     * @throws PipelineException declared by the overridden method; not thrown directly by this implementation
     */
    @Override
    protected void process(Document doc, String inputFieldName, List<AnnotatedField> inputFields,
                           String outputFieldName) throws PipelineException {
        // Work on a local reference instead of reassigning the method parameter.
        List<AnnotatedField> fields = inputFields;
        if (fields.isEmpty() && nullIsBlank) {
            // No input at all: substitute a single blank value so that a pattern such as "^$" can match.
            fields = new ArrayList<AnnotatedField>();
            fields.add(new BaseAnnotatedField(""));
        }

        final List<String> outValues = new ArrayList<String>();
        for (AnnotatedField field : fields) {
            String value = field.getValue();
            if (value == null) {
                // NOTE(review): whether the document model can hand out a null value is not visible from this
                // file; this guard only prevents Pattern.matcher(null) from throwing a NullPointerException.
                if (!nullIsBlank) {
                    // Without nullIsBlank a null value is handled like an absent field: nothing is produced.
                    log.debug("Field '{}' has a null value, skipping", inputFieldName);
                    continue;
                }
                value = "";
            }

            final Matcher m = fromPattern.matcher(value);
            if (m.find()) {
                log.debug("Field '{}' matches", inputFieldName);
                // replaceAll() resets the matcher itself, so the preceding find() does not consume any match.
                outValues.add(m.replaceAll(toPattern));
            } else {
                log.debug("Field '{}' does not match", inputFieldName);
                if (copyOnMiss) {
                    outValues.add(value);
                }
            }
        }

        if (!outValues.isEmpty()) {
            doc.setFieldValues(outputFieldName, outValues);
        } else if (deleteOnEmpty) {
            doc.removeField(outputFieldName);
        }
    }

    @Override
    public String getRevision() {
        return "$Revision$";
    }

    /**
     * Gets the regex pattern used for matching against the input field values.
     *
     * @return the regex pattern, or <code>null</code> if no pattern has been set
     */
    public String getFromPattern() {
        return fromPattern != null ? fromPattern.pattern() : null;
    }

    /**
     * Compiles the regex pattern used for matching against the input field values.
     * Note that {@link Matcher#find()} is called during the matching process, to allow for replace all effects.
     *
     * @param fromPattern the pattern to be compiled
     * @throws java.util.regex.PatternSyntaxException if the pattern's syntax is invalid
     */
    public void setFromPattern(String fromPattern) {
        this.fromPattern = Pattern.compile(fromPattern);
    }

    /**
     * Gets the pattern that is applied when producing the output field values.
     *
     * @return the pattern
     */
    public String getToPattern() {
        return toPattern;
    }

    /**
     * Sets the pattern that is applied when producing the output field values through calls to
     * {@link Matcher#replaceAll(String)}.
     *
     * @param toPattern the output pattern
     */
    public void setToPattern(String toPattern) {
        this.toPattern = toPattern;
    }

    /**
     * Gets whether the input field value should be copied to the output field if the input field value does not
     * match the from pattern.
     *
     * @return <code>true</code> if the input field value should be copied to the output field if the input field
     *         value does not match the from pattern, <code>false</code> otherwise
     */
    public boolean isCopyOnMiss() {
        return copyOnMiss;
    }

    /**
     * Specifies whether the input field value should be copied to the output field if the input field value
     * does not match the from pattern.
     *
     * @param copyOnMiss <code>true</code> if the input field value should be copied to the output field if the
     *                   pattern doesn't match.
     */
    public void setCopyOnMiss(boolean copyOnMiss) {
        this.copyOnMiss = copyOnMiss;
    }

    /**
     * Gets whether the output field is removed when processing produces no output values.
     *
     * @return <code>true</code> if the output field is removed when no output value is produced,
     *         <code>false</code> if it is left untouched
     */
    public boolean isDeleteOnEmpty() {
        return deleteOnEmpty;
    }

    /**
     * Specifies whether the output field should be removed when no output value is produced, i.e. when no input
     * field value matches the from pattern and <code>copyOnMiss</code> is <code>false</code>.
     *
     * @param deleteOnEmpty <code>true</code> if the output field should be deleted.
     */
    public void setDeleteOnEmpty(boolean deleteOnEmpty) {
        this.deleteOnEmpty = deleteOnEmpty;
    }

    /**
     * Gets whether this step treats a null field as a blank field.
     *
     * @return <code>true</code> if this step treats a null field as a blank field
     */
    public boolean isNullIsBlank() {
        return nullIsBlank;
    }

    /**
     * Specifies that this step should treat a null field as a blank field. This makes it possible to match the
     * empty field by using the match string "^$".
     *
     * @param nullIsBlank <code>true</code> if this step should treat a null field as a blank field
     */
    public void setNullIsBlank(boolean nullIsBlank) {
        this.nullIsBlank = nullIsBlank;
    }
}