/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.feature;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * Jaccard similarity between two delimited term lists, with separate
 * weights for the terms that appear on only one side.
 *
 * <p>The distance is
 * {@code 1 - m / (m + srcWeight * srcOnly + trgWeight * trgOnly)}, where
 * {@code m} is the number of distinct terms common to both sides and
 * {@code srcOnly} / {@code trgOnly} are the numbers of distinct terms found
 * on one side only. With both weights equal to 1.0 this is the classic
 * Jaccard distance.
 *
 * @author pranab
 */
public class JaccardSimilarity extends DynamicAttrSimilarityStrategy {
    private final double srcNonMatchingTermWeight;
    private final double trgNonMatchingTermWeight;

    /**
     * @param srcNonMatchingTermWeight weight applied to the count of terms
     *        present only in the source
     * @param trgNonMatchingTermWeight weight applied to the count of terms
     *        present only in the target
     */
    public JaccardSimilarity(double srcNonMatchingTermWeight, double trgNonMatchingTermWeight) {
        super();
        this.srcNonMatchingTermWeight = srcNonMatchingTermWeight;
        this.trgNonMatchingTermWeight = trgNonMatchingTermWeight;
    }

    /**
     * Computes the weighted Jaccard distance between two delimited strings.
     *
     * <p>Both strings are split with {@code fieldDelimRegex} and reduced to
     * their distinct terms, so a term repeated within one string counts once.
     * The number of common terms is also stored in {@code intersectionLength}.
     * (Neither field is declared in this class; presumably both are inherited
     * from {@code DynamicAttrSimilarityStrategy}.)
     *
     * @param src delimited source terms
     * @param target delimited target terms
     * @return the distance; 1.0 when the two sides share no term
     * @see org.sifarish.feature.DynamicAttrSimilarityStrategy#findDistance(java.lang.String, java.lang.String)
     */
    @Override
    public double findDistance(String src, String target) {
        // Work on distinct terms: comparing the raw arrays pairwise would count a
        // term repeated on both sides several times and drive the non-matching
        // counts below zero.
        Set<String> srcTerms = new HashSet<String>(Arrays.asList(src.split(fieldDelimRegex)));
        Set<String> trgTerms = new HashSet<String>(Arrays.asList(target.split(fieldDelimRegex)));

        int matchCount = 0;
        for (String srcTerm : srcTerms) {
            if (trgTerms.contains(srcTerm)) {
                ++matchCount;
            }
        }
        intersectionLength = matchCount;

        // Nothing in common means maximum distance. Returning here also avoids
        // 0/0 (NaN) when the denominator below would be zero, e.g. when both
        // inputs split to no terms or both weights are zero.
        if (matchCount == 0) {
            return 1.0;
        }

        int srcNonMatchCount = srcTerms.size() - matchCount;
        int trgNonMatchCount = trgTerms.size() - matchCount;
        double denominator = matchCount
                + srcNonMatchingTermWeight * srcNonMatchCount
                + trgNonMatchingTermWeight * trgNonMatchCount;
        return 1.0 - matchCount / denominator;
    }
}