/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is OpenEMRConnect.
*
* The Initial Developer of the Original Code is International Training &
* Education Center for Health (I-TECH) <http://www.go2itech.org/>
*
* Portions created by the Initial Developer are Copyright (C) 2011
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* ***** END LICENSE BLOCK ***** */
package ke.go.moh.oec.mpi.match;
import java.util.Locale;
import java.util.logging.Level;
import ke.go.moh.oec.lib.Mediator;
import ke.go.moh.oec.mpi.Scorecard;
/**
 * Represents a string value for matching.
 * <p>
 * The value is normalized when the object is constructed: it is converted to
 * lower case and trimmed, and a value that is empty after trimming is stored
 * as null so that "no value" has a single representation.
 *
 * @author Jim Grace
 */
public class StringMatch {

    /**
     * Defines different ways of matching.
     * Normal matching just considers the edit distance between two strings.
     * Substring matching considers that one string may be a substring of the other.
     */
    public enum MatchType {
        NORMAL,
        SUBSTRING;
    }

    /** Amount subtracted from a perfect score of 1.0 for each unit of edit distance. */
    private static final double EDIT_DISTANCE_PENALTY = 0.2;

    /**
     * For SUBSTRING matching, the difference in string lengths that is still
     * counted against the match. Any length difference beyond this is forgiven.
     */
    private static final int SUBSTRING_LENGTH_TOLERANCE = 2;

    /** Original string (all lower case and trimmed), or null if there is no value. */
    private final String original;

    /**
     * Constructs a StringMatch from a string value.
     *
     * @param original the string to match; may be null. It is lower-cased
     * (independently of the default locale) and trimmed before being stored.
     */
    public StringMatch(String original) {
        String normalized = null;
        if (original != null) {
            // Locale.ROOT keeps the result the same on every machine; the
            // no-argument toLowerCase() follows the JVM default locale.
            String candidate = original.toLowerCase(Locale.ROOT).trim();
            if (!candidate.isEmpty()) {
                normalized = candidate; // (For matching, always store empty string as null.)
            }
        }
        this.original = normalized;
    }

    /**
     * Gets the normalized string value.
     *
     * @return the lower-cased, trimmed string, or null if there is no value.
     */
    public String getOriginal() {
        return original;
    }

    /**
     * Scores this StringMatch for possible matching against another, using a scorecard.
     * <p>
     * If this object has no value, a score of 0 is added with
     * Scorecard.SEARCH_TERM_MISSING_WEIGHT and Scorecard.SearchTerm.MISSING.
     * Otherwise, if the other object is null or has no value, a score of 0 is
     * added with Scorecard.MPI_VALUE_MISSING_WEIGHT.
     * Otherwise the two values are compared with NORMAL matching and the
     * result is added with Scorecard.OTHER_MATCH_WEIGHT, or with
     * Scorecard.OTHER_MISS_WEIGHT when the computed score is 0.
     *
     * @param s scorecard
     * @param other other string to compare with
     */
    public void score(Scorecard s, StringMatch other) {
        if (original == null) {
            s.addScore(Scorecard.SEARCH_TERM_MISSING_WEIGHT, 0, Scorecard.SearchTerm.MISSING);
        } else if (other == null || other.original == null) {
            // A null object is treated the same as an object with no value.
            s.addScore(Scorecard.MPI_VALUE_MISSING_WEIGHT, 0);
        } else {
            Double score = computeScore(this, other);
            if (score != null) {
                double weight = Scorecard.OTHER_MATCH_WEIGHT;
                if (score == 0) {
                    weight = Scorecard.OTHER_MISS_WEIGHT;
                }
                s.addScore(weight, score);
                if (Mediator.testLoggerLevel(Level.FINEST)) {
                    Mediator.getLogger(StringMatch.class.getName()).log(Level.FINEST,
                            "Score {0},{1} total {2},{3},{4} comparing {5} with {6}",
                            new Object[]{score, weight, s.getTotalScore(), s.getTotalWeight(), s.getSearchTermScore(), original, other.original});
                }
            }
        }
    }

    /**
     * Computes the score as a result of matching two StringMatch objects,
     * using approximate matching.
     * The score is returned as a double precision floating point number on a
     * scale of 0 to 1, where 0 means no match at all, and 1 means a perfect match.
     * <p>
     * Returns a score of 1 if the two strings are identical.
     * Returns between 0 and 1 if the strings are not identical, but
     * are close according to the edit distance between the two strings
     * (as reported by Levenshtein.damerauLevenshteinDistance). Each unit of
     * edit distance lowers the score by 0.2, and the score never goes below 0.
     * <p>
     * Returns null if either StringMatch object is null, or if one or both
     * of the strings is null.
     * <p>
     * For a string match of type SUBSTRING, it is assumed that one string may
     * be a substring of the other. This is used, for example, for site matching.
     * The site name may be "Siaya District Hospital", while the search term
     * entered for this may be "Siaya". For a SUBSTRING match, we expect that
     * a substring may have a high score, but there may be many unmatched characters
     * in one of the strings. To allow for this, any difference in length
     * beyond 2 characters is subtracted from the edit distance before scoring.
     *
     * @param sm1 the first string to match
     * @param sm2 the second string to match
     * @param type type of string match to use
     * @return the score from matching the two strings.
     * Returns null if one or the other string is null.
     */
    public static Double computeScore(StringMatch sm1, StringMatch sm2, MatchType type) {
        if (sm1 == null || sm2 == null) {
            return null;
        }
        String s1 = sm1.original;
        String s2 = sm2.original;
        if (s1 == null || s2 == null) {
            return null;
        }
        Double score;
        if (s1.equals(s2)) {
            score = 1.0;
        } else {
            int distance = Levenshtein.damerauLevenshteinDistance(s1, s2);
            switch (type) {
                case SUBSTRING: {
                    // Forgive the part of the length difference that exceeds the tolerance.
                    int lengthDiff = Math.abs(s1.length() - s2.length());
                    if (lengthDiff > SUBSTRING_LENGTH_TOLERANCE) {
                        distance -= (lengthDiff - SUBSTRING_LENGTH_TOLERANCE);
                    }
                    break;
                }
                case NORMAL:
                default:
                    break;
            }
            score = Math.max(0.0, 1.0 - (EDIT_DISTANCE_PENALTY * distance));
        }
        // Same guard that score() uses, so the argument array is only built
        // when the FINEST level test passes.
        if (Mediator.testLoggerLevel(Level.FINEST)) {
            Mediator.getLogger(StringMatch.class.getName()).log(Level.FINEST,
                    "StringMatch.computeScore({0},{1}) = {2}", new Object[]{s1, s2, score});
        }
        return score;
    }

    /**
     * Computes the score as a result of matching two StringMatch objects,
     * using approximate matching, and using NORMAL string matching.
     * See the documentation for the other computeScore() overload.
     *
     * @param sm1 the first string to match
     * @param sm2 the second string to match
     * @return the score from matching the two strings.
     * Returns null if one or the other string is null.
     */
    public static Double computeScore(StringMatch sm1, StringMatch sm2) {
        return computeScore(sm1, sm2, StringMatch.MatchType.NORMAL);
    }
}