/*
 * Sifarish: Recommendation Engine
 * Author: Pranab Ghosh
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.sifarish.feature;

import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang3.ArrayUtils;
import org.sifarish.etl.StructuredTextNormalizer;
import org.sifarish.util.Event;
import org.sifarish.util.Field;
import org.sifarish.util.HourWindow;
import org.sifarish.util.Location;
import org.sifarish.util.TimeWindow;

/**
 * Finds distance between two records. Supports various kinds of attributes.
 * Per attribute distances are computed according to the data type of each schema
 * field and handed to a {@link DistanceStrategy} for aggregation.
 * @author pranab
 *
 */
public class RecordDistanceFinder {
    /** Street type key words used to cut a street address down to its base part. */
    private static final String[] STREET_TYPES = {"Street", "Avenue", "Road", "Boulevard"};

    /** Regex used to split a value on runs of white space. */
    private static final String WHITE_SPACE_REGEX = "\\s+";

    private boolean mixedInSets;
    private String fieldDelimRegex;
    private int idOrdinal;
    private int setIdSize;
    private int distThreshold;
    private DistanceStrategy distStrategy;
    private SingleTypeSchema schema;
    private int[] facetedFields;
    private boolean includePassiveFields;
    private int[] passiveFields;
    private DynamicAttrSimilarityStrategy textSimStrategy;
    private String subFieldDelim;
    //stored but not read anywhere in this class
    private StructuredTextNormalizer textNormalizer;

    /**
     * @param fieldDelimRegex regex used to split a record line into fields
     * @param idOrdinal index of the ID field within a record
     * @param scale passed to the schema when creating the distance strategy
     * @param distThreshold threshold; threshold + 1 is returned when a record pair is rejected
     * @param schema schema providing field definitions and the distance and text similarity strategies
     * @param subFieldDelim regex used to split composite attribute values into sub fields
     * @param textNormalizer text normalizer, may be null
     */
    public RecordDistanceFinder(String fieldDelimRegex, int idOrdinal, int scale,
            int distThreshold, SingleTypeSchema schema, String subFieldDelim,
            StructuredTextNormalizer textNormalizer) {
        this.fieldDelimRegex = fieldDelimRegex;
        this.idOrdinal = idOrdinal;
        this.distThreshold = distThreshold;
        this.distStrategy = schema.createDistanceStrategy(scale);
        this.schema = schema;
        this.textSimStrategy = schema.createTextSimilarityStrategy();
        this.subFieldDelim = subFieldDelim;
        this.textNormalizer = textNormalizer;
    }

    /**
     * Same as the full constructor, with no text normalizer.
     * @param fieldDelimRegex regex used to split a record line into fields
     * @param idOrdinal index of the ID field within a record
     * @param scale passed to the schema when creating the distance strategy
     * @param distThreshold threshold; threshold + 1 is returned when a record pair is rejected
     * @param schema schema providing field definitions and the distance and text similarity strategies
     * @param subFieldDelim regex used to split composite attribute values into sub fields
     */
    public RecordDistanceFinder(String fieldDelimRegex, int idOrdinal, int scale,
            int distThreshold, SingleTypeSchema schema, String subFieldDelim) {
        this(fieldDelimRegex, idOrdinal, scale, distThreshold, schema, subFieldDelim, null);
    }

    /**
     * @param mixedInSets true if records from different sets are mixed in and only
     * records with the same entity ID should be matched
     * @return this object, for call chaining
     */
    public RecordDistanceFinder withMixedInSets(boolean mixedInSets) {
        this.mixedInSets = mixedInSets;
        return this;
    }

    /**
     * @param setIdSize number of leading characters of the ID that hold the set ID
     * @return this object, for call chaining
     */
    public RecordDistanceFinder withSetIdSize(int setIdSize) {
        this.setIdSize = setIdSize;
        return this;
    }

    /**
     * @param facetedFields ordinals of the only fields to be used; null to use all fields
     * @return this object, for call chaining
     */
    public RecordDistanceFinder withFacetedFields(int[] facetedFields) {
        this.facetedFields = facetedFields;
        return this;
    }

    /**
     * @param includePassiveFields true if passive field ordinals should be tracked
     * @return this object, for call chaining
     */
    public RecordDistanceFinder withIncludePassiveFields(boolean includePassiveFields) {
        this.includePassiveFields = includePassiveFields;
        return this;
    }

    /**
     * @param passiveFields passive field ordinals; when set they are not derived from the records
     * @return this object, for call chaining
     */
    public RecordDistanceFinder withPassiveFields(int[] passiveFields) {
        this.passiveFields = passiveFields;
        return this;
    }

    /**
     * Splits both records with the field delimiter and finds the distance between them.
     * @param first first record
     * @param second second record
     * @return distance between the records
     * @throws IOException if a record has fewer fields than the schema requires
     */
    public int findDistance(String first, String second) throws IOException {
        String[] firstItems = first.split(fieldDelimRegex);
        String[] secondItems = second.split(fieldDelimRegex);
        return findDistance(firstItems, secondItems);
    }

    /**
     * Finds distance between two records that are already split into fields.
     * @param firstItems fields of the first record
     * @param secondItems fields of the second record
     * @return aggregated value reported by the distance strategy, or distThreshold + 1
     * if the pair is rejected
     * @throws IOException if a record has fewer fields than the schema requires
     */
    public int findDistance(String[] firstItems, String[] secondItems) throws IOException {
        String firstId = firstItems[idOrdinal];
        String secondId = secondItems[idOrdinal];

        //if inter set matching with mixed in sets, match only same ID from different sets
        if (mixedInSets) {
            //entityID is concatenation of setID and real entityID
            String firstEntityId = firstId.substring(setIdSize);
            String secondEntityId = secondId.substring(setIdSize);
            if (!firstEntityId.equals(secondEntityId)) {
                return distThreshold + 1;
            }
        }

        double dist = 0;
        boolean thresholdCrossed = false;
        List<Integer> activeFields = null;
        distStrategy.initialize();

        for (Field field : schema.getEntity().getFields()) {
            //if faceted set but field not included, then skip it
            if (null != facetedFields && !ArrayUtils.contains(facetedFields, field.getOrdinal())) {
                continue;
            }

            //if ID or class attribute field, skip it
            if (field.isId() || field.isClassAttribute()) {
                continue;
            }

            //track fields participating in dist calculation
            if (includePassiveFields && null == passiveFields) {
                if (null == activeFields) {
                    activeFields = new ArrayList<Integer>();
                }
                activeFields.add(field.getOrdinal());
            }

            //extract fields
            String firstAttr = "";
            if (field.getOrdinal() < firstItems.length) {
                firstAttr = firstItems[field.getOrdinal()];
            } else {
                throw new IOException("Invalid field ordinal. Looking for field " + field.getOrdinal()
                        + " found " + firstItems.length + " fields in the record starting with :" + firstItems[0]);
            }
            String secondAttr = "";
            if (field.getOrdinal() < secondItems.length) {
                secondAttr = secondItems[field.getOrdinal()];
            } else {
                throw new IOException("Invalid field ordinal. Looking for field " + field.getOrdinal()
                        + " found " + secondItems.length + " fields in the record starting with:" + secondItems[0]);
            }

            if (firstAttr.isEmpty() || secondAttr.isEmpty()) {
                //handle missing value
                String missingValueHandler = schema.getMissingValueHandler();
                if (missingValueHandler.equals("default")) {
                    dist = 1.0;
                } else if (missingValueHandler.equals("skip")) {
                    continue;
                } else {
                    //custom handler
                    //NOTE(review): nothing is done here, so dist keeps the value of the
                    //previously processed field - confirm the intended behavior
                }
            } else {
                dist = attributeDistance(field, firstAttr, secondAttr);
            }

            //if threshold crossed for this attribute, skip the remaining attributes of the entity pair
            thresholdCrossed = field.isDistanceThresholdCrossed(dist);
            if (thresholdCrossed) {
                break;
            }

            //aggregate attribute distance for all entity attributes
            distStrategy.accumulate(dist, field);
        }

        //initialize passive fields, but only from a complete pass over the fields; after an
        //early break the active field list is partial and would be cached as wrong passive fields
        if (includePassiveFields && null == passiveFields && !thresholdCrossed) {
            intializePassiveFieldOrdinal(activeFields, firstItems.length);
        }

        return thresholdCrossed ? distThreshold + 1 : distStrategy.getSimilarity();
    }

    /**
     * Finds distance for one attribute, based on the data type of the field.
     * @param field field definition
     * @param firstAttr non empty attribute value from first record
     * @param secondAttr non empty attribute value from second record
     * @return attribute distance, 0 for a data type that is not handled here
     * @throws IOException
     */
    private double attributeDistance(Field field, String firstAttr, String secondAttr) throws IOException {
        double dist = 0;
        String dataType = field.getDataType();
        if (dataType.equals(Field.DATA_TYPE_CATEGORICAL)) {
            dist = field.findDistance(firstAttr, secondAttr);
        } else if (dataType.equals(Field.DATA_TYPE_INT)) {
            dist = numericDistance(field, firstAttr, secondAttr, true);
        } else if (dataType.equals(Field.DATA_TYPE_DOUBLE)) {
            dist = numericDistance(field, firstAttr, secondAttr, false);
        } else if (dataType.equals(Field.DATA_TYPE_TEXT)) {
            dist = textDistance(field, firstAttr, secondAttr);
        } else if (dataType.equals(Field.DATA_TYPE_TIME_WINDOW)) {
            dist = timeWindowDistance(field, firstAttr, secondAttr);
        } else if (dataType.equals(Field.DATA_TYPE_HOUR_WINDOW)) {
            dist = hourWindowDistance(field, firstAttr, secondAttr);
        } else if (dataType.equals(Field.DATA_TYPE_LOCATION)) {
            dist = locationDistance(field, firstAttr, secondAttr);
        } else if (dataType.equals(Field.DATA_TYPE_GEO_LOCATION)) {
            dist = geoLocationDistance(field, firstAttr, secondAttr);
        } else if (dataType.equals(Field.DATA_TYPE_EVENT)) {
            dist = eventDistance(field, firstAttr, secondAttr);
        }
        return dist;
    }

    /**
     * Text distance. Person name and street address sub types get their own handling,
     * everything else goes to the text similarity strategy.
     * @param field field definition
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return text distance
     * @throws IOException
     */
    private double textDistance(Field field, String firstAttr, String secondAttr) throws IOException {
        double dist = 0;
        //one exclusive chain, so that person name distance is not overwritten by the generic one
        if (isSubType(field.getDataSubType(), Field.TEXT_TYPE_PERSON_NAME)) {
            dist = personNameDistance(field, firstAttr, secondAttr);
        } else if (isSubType(field.getDataSubType(), Field.TEXT_TYPE_STREET_ADDRESS)) {
            dist = streetAddressDistance(field, firstAttr, secondAttr);
        } else {
            dist = textSimStrategy.findDistance(firstAttr, secondAttr);
        }
        return dist;
    }

    /**
     * Null safe value comparison of sub types. Compares with equals instead of ==,
     * so sub types held as strings are compared by content.
     * @param actual sub type of the field, may be null
     * @param expected sub type to compare with
     * @return true if both are equal
     */
    private static boolean isSubType(Object actual, Object expected) {
        return actual == null ? expected == null : actual.equals(expected);
    }

    /**
     * Numeric distance. A value is either a number, or a number followed by the unit of the field.
     * @param field field definition
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @param isInt true to parse the values as int, false to parse as double
     * @return distance, 0 if the values are not in one of the accepted formats or can not be parsed
     */
    private double numericDistance(Field field, String firstAttr, String secondAttr, boolean isInt) {
        double dist = 0;
        String[] firstValItems = firstAttr.split(WHITE_SPACE_REGEX);
        String[] secondValItems = secondAttr.split(WHITE_SPACE_REGEX);
        String unit = field.getUnit();

        boolean valid = false;
        if (firstValItems.length == 1 && secondValItems.length == 1) {
            valid = true;
        } else if (firstValItems.length == 2 && secondValItems.length == 2
                && firstValItems[1].equals(unit) && secondValItems[1].equals(unit)) {
            valid = true;
        }

        if (valid) {
            try {
                if (isInt) {
                    dist = field.findDistance(Integer.parseInt(firstValItems[0]),
                            Integer.parseInt(secondValItems[0]), schema.getNumericDiffThreshold());
                } else {
                    dist = field.findDistance(Double.parseDouble(firstValItems[0]),
                            Double.parseDouble(secondValItems[0]), schema.getNumericDiffThreshold());
                }
            } catch (NumberFormatException ignored) {
                //best effort: value that can not be parsed leaves the distance at 0
            }
        }
        return dist;
    }

    /**
     * Distance between two time windows, each given as two sub fields.
     * @param field field definition
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return distance, 0 if a value can not be parsed
     */
    private double timeWindowDistance(Field field, String firstAttr, String secondAttr) {
        double dist = 0;
        try {
            String[] subFields = firstAttr.split(subFieldDelim);
            TimeWindow firstTimeWindow = new TimeWindow(subFields[0], subFields[1]);
            subFields = secondAttr.split(subFieldDelim);
            TimeWindow secondTimeWindow = new TimeWindow(subFields[0], subFields[1]);
            dist = field.findDistance(firstTimeWindow, secondTimeWindow);
        } catch (ParseException ignored) {
            //best effort: invalid data format leaves the distance at 0
        }
        return dist;
    }

    /**
     * Distance between two hour windows, each given as two sub fields.
     * @param field field definition
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return distance, 0 if a value can not be parsed
     */
    private double hourWindowDistance(Field field, String firstAttr, String secondAttr) {
        double dist = 0;
        try {
            String[] subFields = firstAttr.split(subFieldDelim);
            HourWindow firstTimeWindow = new HourWindow(subFields[0], subFields[1]);
            subFields = secondAttr.split(subFieldDelim);
            HourWindow secondTimeWindow = new HourWindow(subFields[0], subFields[1]);
            dist = field.findDistance(firstTimeWindow, secondTimeWindow);
        } catch (ParseException ignored) {
            //best effort: invalid data format leaves the distance at 0
        }
        return dist;
    }

    /**
     * Distance between two locations, each given as three sub fields.
     * @param field field definition
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return distance
     */
    private double locationDistance(Field field, String firstAttr, String secondAttr) {
        String[] subFields = firstAttr.split(subFieldDelim);
        Location firstLocation = new Location(subFields[0], subFields[1], subFields[2]);
        subFields = secondAttr.split(subFieldDelim);
        Location secondLocation = new Location(subFields[0], subFields[1], subFields[2]);
        return field.findDistance(firstLocation, secondLocation);
    }

    /**
     * Geo distance divided by the max distance of the field and capped at 1.0.
     * @param field field definition
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return distance, at most 1.0
     */
    private double geoLocationDistance(Field field, String firstAttr, String secondAttr) {
        double dist = org.sifarish.util.Utility.getGeoDistance(firstAttr, secondAttr);
        dist /= field.getMaxDistance();
        dist = dist <= 1.0 ? dist : 1.0;
        return dist;
    }

    /**
     * Sets passive fields to all field ordinals that are not active.
     * Passive fields are left untouched when every field is active.
     * @param activeFields ordinals of fields used in distance calculation, null if there are none
     * @param numFields number of fields in a record
     */
    private void intializePassiveFieldOrdinal(List<Integer> activeFields, int numFields) {
        //mask of active ordinals; tolerates missing list and repeated ordinals
        boolean[] active = new boolean[numFields];
        int activeCount = 0;
        if (null != activeFields) {
            for (int ordinal : activeFields) {
                if (ordinal >= 0 && ordinal < numFields && !active[ordinal]) {
                    active[ordinal] = true;
                    ++activeCount;
                }
            }
        }

        int len = numFields - activeCount;
        if (len > 0) {
            //all fields that are not active i.e not defined in schema
            passiveFields = new int[len];
            for (int i = 0, j = 0; i < numFields; ++i) {
                if (!active[i]) {
                    passiveFields[j++] = i;
                }
            }
        }
    }

    /**
     * Distance between two events. An event value has six sub fields: description,
     * three location components and two time window components.
     * @param field field definition
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return distance, 0 if a value can not be parsed
     */
    private double eventDistance(Field field, String firstAttr, String secondAttr) {
        double dist = 0;
        try {
            double[] locationWeights = schema.getLocationComponentWeights();
            Event firstEvent = parseEvent(firstAttr, locationWeights);
            Event secondEvent = parseEvent(secondAttr, locationWeights);
            dist = field.findDistance(firstEvent, secondEvent);
        } catch (ParseException ignored) {
            //best effort: invalid data format leaves the distance at 0
        }
        return dist;
    }

    /**
     * Builds an event from a composite attribute value.
     * @param attr attribute value holding the six event sub fields
     * @param locationWeights location component weights handed to the event
     * @return event
     * @throws ParseException if the time window can not be parsed
     */
    private Event parseEvent(String attr, double[] locationWeights) throws ParseException {
        String[] subFields = attr.split(subFieldDelim);
        String description = subFields[0];
        Location location = new Location(subFields[1], subFields[2], subFields[3]);
        TimeWindow timeWindow = new TimeWindow(subFields[4], subFields[5]);
        return new Event(description, location, timeWindow, locationWeights);
    }

    /**
     * Person name distance as weighted sum of the distances of the first and the last
     * white space separated token of the names.
     * @param field field definition, provides the part weights
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return distance
     * @throws IOException
     */
    private double personNameDistance(Field field, String firstAttr, String secondAttr) throws IOException {
        //trim, so that leading white space does not produce an empty first token
        String[] firstItems = firstAttr.trim().split(WHITE_SPACE_REGEX);
        String[] secondItems = secondAttr.trim().split(WHITE_SPACE_REGEX);
        double firstNameDist = textSimStrategy.findDistance(firstItems[0], secondItems[0]);
        double lastNameDist = textSimStrategy.findDistance(firstItems[firstItems.length - 1],
                secondItems[secondItems.length - 1]);
        return firstNameDist * field.getPartWeights()[0] + lastNameDist * field.getPartWeights()[1];
    }

    /**
     * Street address distance as weighted sum of the distances of the two street components.
     * @param field field definition, provides the part weights
     * @param firstAttr attribute value from first record
     * @param secondAttr attribute value from second record
     * @return distance
     * @throws IOException
     */
    private double streetAddressDistance(Field field, String firstAttr, String secondAttr) throws IOException {
        String[] firstStreetComponents = getStreetComponents(firstAttr);
        String[] secondStreetComponents = getStreetComponents(secondAttr);
        return textSimStrategy.findDistance(firstStreetComponents[0], secondStreetComponents[0]) * field.getPartWeights()[0]
                + textSimStrategy.findDistance(firstStreetComponents[1], secondStreetComponents[1]) * field.getPartWeights()[1];
    }

    /**
     * Splits a street address into two components. The address is cut after the first known
     * street type key word, if there is one. The first component is the leading white space
     * separated token, the second component is the remaining part.
     * @param address street address
     * @return array of size 2; second component is empty if the address has only one token
     */
    private String[] getStreetComponents(String address) {
        //fall back to the whole address when no street type key word is found
        String baseAddress = address.trim();
        for (String streetType : STREET_TYPES) {
            int pos = baseAddress.indexOf(streetType);
            if (pos > 0) {
                baseAddress = baseAddress.substring(0, pos) + streetType;
                break;
            }
        }

        //regex split; String.indexOf would look for the regex text literally
        String[] parts = baseAddress.split(WHITE_SPACE_REGEX, 2);
        String[] streetComponents = new String[2];
        streetComponents[0] = parts[0];
        streetComponents[1] = parts.length > 1 ? parts[1].trim() : "";
        return streetComponents;
    }
}