/* * Sifarish: Recommendation Engine * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.sifarish.etl; import java.io.IOException; import java.io.Serializable; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.sifarish.feature.DynamicAttrSimilarityStrategy; /** * Normalizer for structured text field type * @author pranab * */ public class TextFieldTokenNormalizer implements Serializable { private String fieldType; private String[][] normalizers; /** * @return */ public String getFieldType() { return fieldType; } /** * @param fieldType */ public void setFieldType(String fieldType) { this.fieldType = fieldType; } /** * @return */ public String[][] getNormalizers() { return normalizers; } /** * @param normalizers */ public void setNormalizers(String[][] normalizers) { this.normalizers = normalizers; } /** * @param item * @return */ public String normalize(String item) { String newItem = item; for (String[] normalizer : normalizers) { newItem = newItem.replace(normalizer[0], normalizer[1]); } return newItem; } /** * @param normalized * @return */ public boolean containsNormalize(String normalized) { boolean contains = false; for (String[] normalizer : normalizers) { contains = normalizer[1].equals(normalized); if (contains) break; } return contains; } /** * @param item * @param textSimStrategy * @return * @throws IOException */ public Pair<String, Double> fuzzymatchWithUnnormalized(String item, DynamicAttrSimilarityStrategy textSimStrategy) throws IOException { return fuzzymatch(item, textSimStrategy, 0); } /** * @param item * @param textSimStrategy * @return * @throws IOException */ public Pair<String, Double> fuzzymatchWithNormalized(String item, DynamicAttrSimilarityStrategy textSimStrategy) throws IOException { return fuzzymatch(item, textSimStrategy, 1); } /** * @param item * @param textSimStrategy * @param index * @return * @throws IOException */ private Pair<String, Double> fuzzymatch(String item, DynamicAttrSimilarityStrategy textSimStrategy, int index ) throws IOException { double dist = 1.0; String token = null; for (String[] normalizer : normalizers) { double thisDist = textSimStrategy.findDistance(item, normalizer[index]); if (thisDist < dist) { dist = thisDist; token = normalizer[0]; } } ImmutablePair<String, Double> match = new ImmutablePair<String, Double>(token, dist); return match; } }