/*#####################################################################
*
* CLAVIN (Cartographic Location And Vicinity INdexer)
* ---------------------------------------------------
*
* Copyright (C) 2012-2013 Berico Technologies
* http://clavin.bericotechnologies.com
*
* ====================================================================
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* ====================================================================
*
* DefaultScorer.java
*
*###################################################################*/
package com.bericotech.clavin.resolver.multipart;
import static com.bericotech.clavin.util.DamerauLevenshtein.damerauLevenshteinDistanceCaseInsensitive;
import com.bericotech.clavin.resolver.multipart.MatchedLocation.Match;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
/**
 * The default scorer combines several weighted metrics to return a score
 * that is nominally in the range [0.0, 1.0]. The score components are:
 *
 * - Match Ratio (M)
 *     the number of matched terms / the number of input terms
 *
 * - Component Score (C)
 *     A metric boosting the scores of matches containing particular location
 *     types. Two weight tables are used:
 *       candidates with more than one match (highest to lowest):
 *         CITY (5), COUNTRY (4), ADMIN1 (3), then ADMIN2, ADMIN3, ADMIN4
 *         and ADMINX (1 each)
 *       candidates with a single match (highest to lowest):
 *         COUNTRY (5), ADMIN1 (4), CITY (3), ADMIN2 (2), then ADMIN3,
 *         ADMIN4 and ADMINX (1 each)
 *     The summed weight of the matched levels is divided by the sum of
 *     every weight in the table that was used. Levels that are absent from
 *     the table contribute 0.
 *
 * - Average Inverse Damerau-Levenshtein (DL) Distance (D)
 *     For every match, the case-insensitive DL distance (number of edits)
 *     between the original search text and the matched name is inverted
 *     (1/DL) because lower DL numbers are better; identical strings are
 *     given a value of 1. D is the mean of these per-match values.
 *
 * - Average Inverse Search Depth (S)
 *     For every match, 1 / (depth + 1), where depth is the 0-indexed value
 *     reported by the match. S is the mean of these per-match values, so a
 *     candidate whose matches all have depth 0 receives 1.
 *
 * The components are weighted according to the following formula:
 * <code>(0.40 * M) + (0.25 * C) + (0.20 * D) + (0.15 * S)</code>
 *
 * Candidates without any matches, or scored against an empty list of
 * terms, receive the minimum score instead of a NaN/Infinity result.
 */
public class DefaultScorer implements Scorer {
    /** Lower bound of the score range; also returned for degenerate input. */
    private static final double MIN_SCORE = 0.0d;
    /** Upper bound of the score range. */
    private static final double MAX_SCORE = 1.0d;

    private static final double MATCH_RATIO_WEIGHT = 0.40d;
    private static final double COMPONENT_SCORE_WEIGHT = 0.25d;
    private static final double DL_DISTANCE_WEIGHT = 0.20d;
    private static final double SEARCH_DEPTH_WEIGHT = 0.15d;

    /** Per-level weights used when the candidate has more than one match. */
    private static final Map<SearchLevel, Integer> COMPONENT_WEIGHTS;
    /** Per-level weights used when the candidate has exactly one match. */
    private static final Map<SearchLevel, Integer> SINGLE_COMPONENT_WEIGHTS;
    /** Sum of all values in {@link #COMPONENT_WEIGHTS}; normalizes the component score. */
    private static final int MAX_COMPONENT_WEIGHT;
    /**
     * Sum of all values in {@link #SINGLE_COMPONENT_WEIGHTS}; normalizes the
     * component score of single-match candidates.
     * NOTE(review): this is the sum of the table rather than its largest
     * entry, so a single match cannot reach a component score of 1.0.
     * Confirm that this is intended before changing it, since it affects
     * how single-match and multi-match candidates rank against each other.
     */
    private static final int MAX_SINGLE_COMPONENT_WEIGHT;

    static {
        Map<SearchLevel, Integer> weightMap = new EnumMap<SearchLevel, Integer>(SearchLevel.class);
        weightMap.put(SearchLevel.CITY, 5);
        weightMap.put(SearchLevel.COUNTRY, 4);
        weightMap.put(SearchLevel.ADMIN1, 3);
        weightMap.put(SearchLevel.ADMIN2, 1);
        weightMap.put(SearchLevel.ADMIN3, 1);
        weightMap.put(SearchLevel.ADMIN4, 1);
        weightMap.put(SearchLevel.ADMINX, 1);
        COMPONENT_WEIGHTS = Collections.unmodifiableMap(weightMap);
        MAX_COMPONENT_WEIGHT = sumWeights(weightMap);

        Map<SearchLevel, Integer> singleWeightMap = new EnumMap<SearchLevel, Integer>(SearchLevel.class);
        singleWeightMap.put(SearchLevel.COUNTRY, 5);
        singleWeightMap.put(SearchLevel.ADMIN1, 4);
        singleWeightMap.put(SearchLevel.CITY, 3);
        singleWeightMap.put(SearchLevel.ADMIN2, 2);
        singleWeightMap.put(SearchLevel.ADMIN3, 1);
        singleWeightMap.put(SearchLevel.ADMIN4, 1);
        singleWeightMap.put(SearchLevel.ADMINX, 1);
        SINGLE_COMPONENT_WEIGHTS = Collections.unmodifiableMap(singleWeightMap);
        MAX_SINGLE_COMPONENT_WEIGHT = sumWeights(singleWeightMap);
    }

    /**
     * Adds up every weight in the given table.
     *
     * @param weights the weight table to total
     * @return the sum of all values in the table
     */
    private static int sumWeights(final Map<SearchLevel, Integer> weights) {
        int total = 0;
        for (Integer weight : weights.values()) {
            total += weight;
        }
        return total;
    }

    /**
     * Scores a candidate against the input terms using the weighted formula
     * described in the class documentation.
     *
     * @param terms the input search terms; must not be null
     * @param candidate the matched location to score; must not be null
     * @return the weighted score, or the minimum score when the candidate
     *         reports no matches or the term list is empty
     */
    @Override
    public double score(final List<String> terms, final MatchedLocation candidate) {
        final int matchCount = candidate.getMatchCount();
        // Every average below divides by matchCount and the match ratio divides
        // by terms.size(); without this guard the result would be NaN or
        // Infinity, which is outside the advertised score range.
        if (matchCount <= 0 || terms.isEmpty()) {
            return MIN_SCORE;
        }

        // the weight table and its normalizer depend only on the match count,
        // so select them once instead of once per match
        final boolean multipleMatches = matchCount > 1;
        final Map<SearchLevel, Integer> weights =
                multipleMatches ? COMPONENT_WEIGHTS : SINGLE_COMPONENT_WEIGHTS;
        final int maxWeight =
                multipleMatches ? MAX_COMPONENT_WEIGHT : MAX_SINGLE_COMPONENT_WEIGHT;

        double totalDL = 0.0d;
        int compWeight = 0;
        double totalDepth = 0.0d;
        for (Match match : candidate.getMatches()) {
            // calculate inverse DL distance; identical strings (distance 0) count as 1
            int dl = damerauLevenshteinDistanceCaseInsensitive(match.getLocation().getLocation().getText(),
                    match.getLocation().getMatchedName());
            totalDL += dl > 0 ? 1.0d / dl : 1.0d;
            // calculate component weight; levels without a configured weight add nothing
            Integer weight = weights.get(match.getLevel());
            compWeight += weight != null ? weight : 0;
            // calculate inverse search depth; since depths are 0-indexed, increase all values by 1
            totalDepth += 1.0d / (match.getDepth() + 1);
        }

        double matchRatio = (double) matchCount / terms.size();
        double avgDL = totalDL / matchCount;
        double compScore = (double) compWeight / maxWeight;
        double avgDepth = totalDepth / matchCount;
        return (MATCH_RATIO_WEIGHT * matchRatio)
                + (DL_DISTANCE_WEIGHT * avgDL)
                + (COMPONENT_SCORE_WEIGHT * compScore)
                + (SEARCH_DEPTH_WEIGHT * avgDepth);
    }

    /**
     * @return the lowest score this scorer reports, 0.0
     */
    @Override
    public double getMinimumScore() {
        return MIN_SCORE;
    }

    /**
     * @return the highest score this scorer reports, 1.0
     */
    @Override
    public double getMaximumScore() {
        return MAX_SCORE;
    }
}