/* * chombo: Hadoop Map Reduce utility * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.chombo.distance; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang3.ArrayUtils; import org.chombo.util.Attribute; import org.chombo.util.BasicUtils; import org.chombo.util.GenericAttributeSchema; import org.chombo.util.Pair; import org.chombo.util.RichAttribute; import org.chombo.util.RichAttributeSchema; /** * @author pranab * */ public class InterRecordDistance implements Serializable { private GenericAttributeSchema attrSchema; private AttributeDistanceSchema attrDistSchema; private String fieldDelim; private String subFieldDelim = BasicUtils.DEF_SUB_FIELD_DELIM; private String[] firstItems; private String[] secondItems; private String firstItem; private String secondItem; private int ordinal; private Map<Integer, Double> attrDistances = new HashMap<Integer, Double>(); private Map<Integer, DynamicVectorSimilarity> textSimilarityStrategies = new HashMap<Integer, DynamicVectorSimilarity>(); private boolean doubleRange; private boolean categoricalSet; private int fieldOrd; protected Map<Integer, Map<Pair<String, String>, Double>> valueDiffMetricDist = new HashMap<Integer, Map<Pair<String, String>, Double>>(); private int scale = 1; private int[] facetedFields; /** * @param attrSchema * @param attrDistSchema * @param fieldDelim */ public InterRecordDistance(GenericAttributeSchema attrSchema, AttributeDistanceSchema attrDistSchema, String fieldDelim) { this.attrSchema = attrSchema; this.attrDistSchema = attrDistSchema; this.fieldDelim = fieldDelim; } /** * @param scale * @return */ public InterRecordDistance withScale(int scale) { this.scale = scale; return this; } /** * @param facetedFields * @return */ public InterRecordDistance withFacetedFields(int[] facetedFields) { this.facetedFields = facetedFields; return this; } /** * @param subFieldDelim * @return */ public InterRecordDistance withSubFieldDelim(String subFieldDelim) { this.subFieldDelim = subFieldDelim; return this; } /** * @param doubleRange * @return */ public InterRecordDistance withDoubleRange(boolean doubleRange) { this.doubleRange = doubleRange; return this; } /** * @param categoricalSet * @return */ public InterRecordDistance withCategoricalSet(boolean categoricalSet) { this.categoricalSet = categoricalSet; return this; } /** * @param records * @param idFieldLen * @return */ public InterRecordDistance withValueDiffMetricDist(List<String[]> records, int idFieldLen) { for (String[] rec : records) { int offset = idFieldLen; int attrOrd = Integer.parseInt(rec[offset++]); Map<Pair<String, String>, Double> valuePairDist = new HashMap<Pair<String, String>, Double>(); valueDiffMetricDist.put(attrOrd, valuePairDist); while (offset < rec.length) { String firstAttrVal = rec[offset++]; String secAttrVal = rec[offset++]; Pair<String,String> valPair = distWithAttrVAluesSorted(firstAttrVal, secAttrVal); Double dist = Double.parseDouble(rec[offset++]); valuePairDist.put(valPair, dist); } } return this; } /** * @param firstAttrVal * @param secAttrVal * @return */ private Pair<String,String> distWithAttrVAluesSorted(String firstAttrVal, String secAttrVal) { Pair<String,String> valPair = null; if (firstAttrVal.compareTo(secAttrVal) > 0) { valPair = new Pair<String,String>(firstAttrVal, secAttrVal); } else { valPair = new Pair<String,String>(secAttrVal, firstAttrVal); } return valPair; } /** * @param first * @param second * @return * @throws IOException */ public int findScaledDistance(String first, String second) throws IOException { int dist = (int)(scale * findDistance(first, second)); return dist; } /** * @param first * @param second * @return * @throws IOException */ public double findDistance(String[] firstRec, String[] secondRec, int fieldOrd ) throws IOException { this.fieldOrd = fieldOrd; return findDistance(secondRec[fieldOrd], secondRec[fieldOrd]); } /** * @param first * @param second * @return * @throws IOException */ public double findDistance(String first, String second ) throws IOException { double recDist = 0; double dist = 0; attrDistances.clear(); firstItems = first.split(fieldDelim); secondItems = second.split(fieldDelim); //attribute pair distances for (Attribute attr : attrSchema.getAttributes()) { ordinal = attr.getOrdinal(); //skip if ID if (attr.isId()) continue; //skip if not faceted field if (null != facetedFields) { //if faceted set but field not included, then skip it if (!ArrayUtils.contains(facetedFields, ordinal)) { continue; } } AttributeDistance attrDist = attrDistSchema.findAttributeDistanceByOrdinal(ordinal); firstItem = firstItems[ordinal]; secondItem = secondItems[ordinal]; if (attr.isCategorical()) { dist = categoricalDistance(attr, attrDist); } else if (attr.isInteger()) { dist = numericDistance(Integer.parseInt(firstItem), Integer.parseInt(secondItem), attrDist); } else if (attr.isDouble()) { if (doubleRange) { DoubleRange firstItemRange = DoubleRange.create(firstItem, subFieldDelim); if (null != firstItemRange) { dist = numericDistance(firstItemRange, Double.parseDouble(secondItem), attrDist); } else { DoubleRange secondItemRange = DoubleRange.create(secondItem, subFieldDelim); if (null != secondItemRange) { dist = numericDistance(secondItemRange, Double.parseDouble(firstItem), attrDist); } else { throw new IllegalStateException("no range data found in field"); } } } else { dist = numericDistance(Double.parseDouble(firstItem), Double.parseDouble(secondItem), attrDist); } } else if (attr.isText()) { dist = textDistance(attrDist); } else if (attr.isGeoLocation()) { dist = geoLocationDistance(attrDist); } attrDistances.put(ordinal, dist); } //aggregate double sumDist = 0; double sumWeight = 0; for (AttributeDistanceAggregator aggregator : attrDistSchema.getAttrAggregators()) { if (aggregator.getAlgorithm().equals("euclidean")) { dist = aggregateEuclidean(aggregator.getOrdinals()); } else if (aggregator.getAlgorithm().equals("manhattan")) { dist = aggregateManhattan(aggregator.getOrdinals()); } else if (aggregator.getAlgorithm().equals("minkwoski")) { dist = aggregateMinkwoski(aggregator.getOrdinals(), aggregator.getParam()); } else if (aggregator.getAlgorithm().equals("categorical")) { dist = aggregateCategorical(aggregator.getOrdinals()); } sumDist += dist * aggregator.getWeight(); sumWeight += aggregator.getWeight(); } recDist = sumDist / sumWeight; return recDist; } /** * @param attr * @param attrDist * @return */ private double categoricalDistance(Attribute attr, AttributeDistance attrDist) { double dist = 0; if (attrDist.getAlgorithm().equals("cardinality")) { //cardinality dist = firstItem.equals(secondItem) ? 0 : Math.sqrt(2) / attr.getCardinality().size(); } else if (attrDist.getAlgorithm().equals("valueDiffMetric")) { //value difference metric Pair<String,String> valPair = distWithAttrVAluesSorted(firstItem, secondItem); dist = valueDiffMetricDist.get(fieldOrd).get(valPair); }else { //default equality or inclusion based if (categoricalSet) { List<String> firstList = BasicUtils.toList(firstItem.split(subFieldDelim)); List<String> secondList = BasicUtils.toList(secondItem.split(subFieldDelim)); dist = BasicUtils.listIncluded(firstList, secondList) ? 0 : 1; }else { dist = firstItem.equals(secondItem) ? 0 : 1; } } return dist; } /** * @param firstItemDbl * @param secondItemDbl * @param attrDist * @return */ private double numericDistance(double firstItemDbl, double secondItemDbl, AttributeDistance attrDist) { double dist = 0; dist = Math.abs(firstItemDbl - secondItemDbl); //apply weight if (attrDist.isWeightSet()) { dist /= attrDist.getWeight(); } //apply threshold if (attrDist.isUpperThresholdSet() && dist > attrDist.getUpperThreshold()) { dist = 1; } else if (attrDist.isLowerThresholdSet() && dist > attrDist.getLowerThreshold()) { dist = 0; } return dist; } /** * @param firstItemDblRange * @param secondItemDbl * @param attrDist * @return */ private double numericDistance(DoubleRange firstItemDblRange, double secondItemDbl, AttributeDistance attrDist) { double dist = 0; if (secondItemDbl > firstItemDblRange.getUpper()) { dist = numericDistance(firstItemDblRange.getUpper(), secondItemDbl, attrDist); } else if (secondItemDbl < firstItemDblRange.getLower()) { dist = numericDistance(secondItemDbl, firstItemDblRange.getLower(), attrDist); } return dist; } /** * @param attrDist * @return * @throws IOException */ private double textDistance(AttributeDistance attrDist) throws IOException { DynamicVectorSimilarity simStrategy = textSimilarityStrategies.get(ordinal); if (null == simStrategy) { simStrategy = DynamicVectorSimilarity.createSimilarityStrategy(attrDist); textSimilarityStrategies.put(ordinal, simStrategy); } return simStrategy.findDistance(firstItem, secondItem); } /** * @param attrDist * @return */ private double geoLocationDistance(AttributeDistance attrDist) { String[] items = firstItem.split(subFieldDelim); double lat1 = Double.parseDouble(items[0]); double long1 = Double.parseDouble(items[1]); items = secondItem.split(":"); double lat2 = Double.parseDouble(items[0]); double long2 = Double.parseDouble(items[1]); double dist = BasicUtils.getGeoDistance(lat1, long1, lat2, long2); if (attrDist.isMaxGeoDistanceSet()) { dist /= attrDist.getMaxGeoDistance(); } return dist; } /** * @param ordinals * @return */ private double aggregateEuclidean(int[] ordinals) { double dist = 0; double sum = 0; for (int ordinal : ordinals) { sum += attrDistances.get(ordinal) * attrDistances.get(ordinal); } dist = Math.sqrt(sum) / ordinals.length; return dist; } /** * @param ordinals * @return */ private double aggregateManhattan(int[] ordinals) { double dist = 0; double sum = 0; for (int ordinal : ordinals) { sum += attrDistances.get(ordinal); } dist = sum / ordinals.length; return dist; } /** * @param ordinals * @return */ private double aggregateMinkwoski(int[] ordinals, double param) { double dist = 0; double sum = 0; for (int ordinal : ordinals) { Math.pow(attrDistances.get(ordinal), param); sum += Math.pow(attrDistances.get(ordinal), param); } dist = Math.pow(sum, 1.0/param) / ordinals.length; return dist; } /** * @param ordinals * @return */ private double aggregateCategorical(int[] ordinals) { double dist = 0; double sum = 0; for (int ordinal : ordinals) { sum += attrDistances.get(ordinal); } dist = sum / ordinals.length; return dist; } /** * @author pranab * */ private static class DoubleRange extends Pair<Double, Double> { /** * @param first * @param second */ public DoubleRange(Double first, Double second) { super(first, second); } /** * @return */ public double getLower() { return getLeft(); } /** * @return */ public double getUpper() { return getRight(); } /** * @param field * @param subFieldDelim * @return */ public static DoubleRange create(String field, String subFieldDelim) { DoubleRange doubleRange = null; String[] items = field.split(subFieldDelim); if (items.length == 2) { doubleRange = new DoubleRange(Double.parseDouble(items[0]), Double.parseDouble(items[1])); } else if (items.length != 1){ throw new IllegalStateException("too many sub fields"); } return doubleRange; } } }