/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.feature;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Finds the distance between two strings by comparing their adjacent
 * character pairs. The similarity is twice the number of matched pairs
 * divided by the total number of pairs in both strings (a Dice-style
 * coefficient); the distance is one minus that similarity.
 *
 * @author pranab
 */
public class CharacterPairSimilarity extends DynamicAttrSimilarityStrategy {

    /**
     * Computes the character pair distance between two strings.
     *
     * Both strings are split into terms with {@code fieldDelimRegex} (not
     * declared in this class; expected to be inherited from
     * {@link DynamicAttrSimilarityStrategy}), and every term is broken into
     * overlapping two character pairs.
     *
     * @param src source string
     * @param target target string
     * @return distance in the range [0, 1]; 0 when all pairs match
     * @throws IOException declared for compatibility with the strategy
     *         contract; this implementation does not throw it itself
     * @see org.sifarish.feature.DynamicAttrSimilarityStrategy#findDistance(java.lang.String, java.lang.String)
     */
    public double findDistance(String src, String target) throws IOException {
        List<String> srcPairs = getCharacterPairs(src.split(fieldDelimRegex));
        List<String> trgPairs = getCharacterPairs(target.split(fieldDelimRegex));

        // total number of pairs on both sides (the denominator of the coefficient)
        int totalPairs = srcPairs.size() + trgPairs.size();
        if (totalPairs == 0) {
            // Neither side produced a pair (e.g. only terms shorter than 2 chars).
            // Dividing by zero would yield NaN, so fall back to exact equality.
            return src.equals(target) ? 0.0 : 1.0;
        }

        int intersection = findIntersection(srcPairs, trgPairs);
        double sim = (2.0 * intersection) / totalPairs;
        return 1.0 - sim;
    }

    /**
     * Returns all overlapping two character pairs of every term, in order.
     * Terms shorter than two characters contribute no pairs. Duplicates are
     * kept.
     *
     * @param terms terms to break into pairs
     * @return list of two character substrings
     */
    private List<String> getCharacterPairs(String[] terms) {
        List<String> pairs = new ArrayList<String>();
        for (String term : terms) {
            for (int i = 0; i < term.length() - 1; ++i) {
                pairs.add(term.substring(i, i + 2));
            }
        }
        return pairs;
    }

    /**
     * Counts the pairs common to both lists. Each target pair can be matched
     * by at most one source pair, so duplicates are counted only as many
     * times as they occur on both sides.
     *
     * @param srcPairs pairs from the source string
     * @param trgPairs pairs from the target string
     * @return number of matched pairs
     */
    private int findIntersection(List<String> srcPairs, List<String> trgPairs) {
        int intersection = 0;
        // indexes of target pairs that have already been consumed by a match
        Set<Integer> matchedTrg = new HashSet<Integer>();
        for (int i = 0; i < srcPairs.size(); ++i) {
            String srcPair = srcPairs.get(i);
            // bound the scan by j (the target index), not i
            for (int j = 0; j < trgPairs.size(); ++j) {
                if (srcPair.equals(trgPairs.get(j)) && !matchedTrg.contains(j)) {
                    ++intersection;
                    matchedTrg.add(j);
                    break;
                }
            }
        }
        return intersection;
    }
}