/* * Copyright 2007 T-Rank AS * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package no.trank.openpipe.step; import java.util.ArrayList; import java.util.List; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import no.trank.openpipe.api.MultiInputOutputFieldPipelineStep; import no.trank.openpipe.api.PipelineException; import no.trank.openpipe.api.document.AnnotatedField; import no.trank.openpipe.api.document.Document; import no.trank.openpipe.config.annotation.NotNull; /** * This step splits input text according to a few simple rules and outputs the combinations to a multi valued field: * * <p><ul> * <li>The property <code>levelSplit</code> is used to split the text into separate levels.</li> * <li>The property <code>alternativeSplit</code> is used to split the text on a level into alternatives.</li> * <li>The property <code>numLevels</code> is used to stop the output after a set number of levels.</li> * </ul> * * * <p><h4>Examples</h4> * * <table> * <tr> * <td><code>levelSplit</code>:</td> * <td>/</td> * </tr> * <tr> * <td><code>alternativeSplit</code>:</td> * <td>¦</td> * </tr> * <tr> * <td><code>numLevels</code>:</td> * <td>0 (no restriction)</td> * </tr> * <tr> * <td>Input:</td> * <td>a/b¦c/d</td> * </tr> * <tr> * <td valign="top">Output:</td> * <td> * <ul> * <li>a</li> * <li>a/b</li> * <li>a/c</li> * <li>a/b/d</li> * <li>a/c/d</li> * </ul> * </td> * </tr> * </table> * * * <p><table> * <tr> * <td><code>levelSplit</code>:</td> * <td>/</td> * </tr> * <tr> * <td><code>alternativeSplit</code>:</td> * <td>¦</td> * </tr> * <tr> * <td><code>numLevels</code>:</td> * <td>2</td> * </tr> * <tr> * <td>Input:</td> * <td>a/b¦c/d</td> * </tr> * <tr> * <td valign="top">Output:</td> * <td> * <ul> * <li>a</li> * <li>a/b</li> * <li>a/c</li> * </ul> * </td> * </tr> * </table> * * @version $Revision$ */ public class HierarchicalSplitter extends MultiInputOutputFieldPipelineStep { private static Logger log = LoggerFactory.getLogger(HierarchicalSplitter.class); private int numLevels; @NotNull private String levelSplit; @NotNull private String alternativeSplit; public HierarchicalSplitter() { super(false); } @Override protected void process(Document doc, String inputFieldName, List<AnnotatedField> inputFields, String outputFieldName) throws PipelineException { if(inputFields.isEmpty()) { log.debug("Missing field '{}'", inputFieldName); doc.removeField(outputFieldName); } else { List<String> outValues = new ArrayList<String>(); for(AnnotatedField field : inputFields) { String text = field.getValue(); if(text != null && text.length() > 0) { List<String> values = resolveSplits(text); if(!values.isEmpty()) { outValues.addAll(values); } } } if (outValues.isEmpty()) { doc.removeField(outputFieldName); } else { doc.setFieldValues(outputFieldName, outValues); } } } private List<String> resolveSplits(String text) { String[] levels = text.split(levelSplit); List<String> ret = new ArrayList<String>(); int lastLevel = Math.min(numLevels > 0 ? numLevels : Integer.MAX_VALUE, levels.length); for(int size = 1; size <= lastLevel; ++size) { int[] ind = new int[size]; boolean incr = false; while(!incr) { String cat = ""; incr = true; for(int i = size-1; i >= 0; --i) { String[] tmp = levels[i].split(alternativeSplit); cat = (i > 0 ? levelSplit : "") + tmp[ind[i]] + cat; if(incr) { ind[i] = (ind[i] + 1) % tmp.length; incr = ind[i] == 0; } } ret.add(cat); } } if(log.isDebugEnabled()) { log.debug("Resolved " + ret.size() + " split" + (ret.size() == 1 ? "" : "s") + " over " + lastLevel + " level" + (lastLevel == 1 ? "" : "s")); } return ret; } @Override public String getRevision() { return "$Revision$"; } /** * Gets the split used to separate the alternatives within a level of the input. * * @return the split */ public String getAlternativeSplit() { return alternativeSplit; } /** * Sets the split used to separate the alternatives within a level of the input. * * @param alternativeSplit the split */ public void setAlternativeSplit(String alternativeSplit) { this.alternativeSplit = alternativeSplit; } /** * Gets the split used to separate the levels in the input. * * @return the split */ public String getLevelSplit() { return levelSplit; } /** * Sets the split used to separate the levels in the input. * * @param levelSplit */ public void setLevelSplit(String levelSplit) { this.levelSplit = levelSplit; } /** * Gets the max number of levels in the output. * * @return the number of levels */ public int getNumLevels() { return numLevels; } /** * Sets the max number of levels in the output. There is no restriction if not set. * * @param numLevels the number of levels */ public void setNumLevels(int numLevels) { this.numLevels = numLevels; } }