package eu.dnetlib.iis.wf.affmatching.match.voter;
import java.util.List;
import java.util.function.Function;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import eu.dnetlib.iis.wf.affmatching.model.AffMatchAffiliation;
import eu.dnetlib.iis.wf.affmatching.model.AffMatchOrganization;
/**
* Match voter that splits {@link AffMatchAffiliation#getOrganizationName()}
* and {@link AffMatchOrganization#getName()} into sections.<br/>
* This voter votes for match if all of the organization sections
* have similar sections in affiliation organization name.<br/>
* Similarity is measured based on Levenshtein distance.
*
* @author madryk
*/
public class SectionedNameLevenshteinMatchVoter extends AbstractSectionedMatchVoter {

    private static final long serialVersionUID = 1L;

    /** Lowest similarity, in the range (0,1], at which two sections are considered similar. */
    private final double minSimilarity;

    /** Supplies the organization names to be examined; defaults to {@link GetOrgNameFunction}. */
    private Function<AffMatchOrganization, List<String>> getOrgNamesFunction = new GetOrgNameFunction();


    //------------------------ CONSTRUCTORS --------------------------

    /**
     * Default constructor
     *
     * @param minSimilarity - minimum similarity for two sections to be found similar.
     *      Value must be between (0,1] (minimum similarity equal to one means
     *      that two sections must be equal).
     * @throws IllegalArgumentException if <code>minSimilarity</code> is outside of (0,1]
     * @see #containsOrgSection(List, String)
     */
    public SectionedNameLevenshteinMatchVoter(double minSimilarity) {
        super();
        Preconditions.checkArgument(minSimilarity > 0 && minSimilarity <= 1,
                "minSimilarity must be in range (0,1], but was: %s", minSimilarity);
        this.minSimilarity = minSimilarity;
    }


    //------------------------ LOGIC --------------------------

    /**
     * Returns true if any of the affiliation name sections is similar to
     * the organization name section.<br/>
     * Similarity is measured based on Levenshtein distance according to
     * the following formula:<br/>
     * <code>similarity = 1 - (levenshteinDistance(a, b) / max(a.length(), b.length()))</code><br/>
     * where <code>a</code> is affiliation name section and <code>b</code>
     * is organization name section.<br/>
     * Two sections are similar when the computed similarity is greater than
     * or equal to the minimum similarity passed to the constructor.<br/>
     * When both compared sections are empty the formula is undefined
     * (division by zero); such a pair is never treated as similar.
     *
     * @param affOrgNameSections - sections of the affiliation organization name
     * @param orgNameSection - single section of the organization name
     * @return true if at least one affiliation section is similar to the organization section
     */
    @Override
    protected boolean containsOrgSection(List<String> affOrgNameSections, String orgNameSection) {

        for (String affSection : affOrgNameSections) {
            if (isSimilar(affSection, orgNameSection)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns the names of the organization using the configured function
     * (see {@link #setGetOrgNamesFunction(Function)}).
     */
    @Override
    protected List<String> getOrganizationNames(AffMatchOrganization organization) {
        return getOrgNamesFunction.apply(organization);
    }


    //------------------------ SETTERS --------------------------

    /**
     * Sets the function that will be used to get the organization names
     *
     * @param getOrgNamesFunction - function returning names of an organization, must not be null
     * @throws NullPointerException if <code>getOrgNamesFunction</code> is null
     */
    public void setGetOrgNamesFunction(Function<AffMatchOrganization, List<String>> getOrgNamesFunction) {
        // Fail fast here instead of with a NullPointerException later,
        // in getOrganizationNames() or toString().
        this.getOrgNamesFunction = Preconditions.checkNotNull(getOrgNamesFunction,
                "getOrgNamesFunction must not be null");
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Checks whether the similarity of the two sections reaches the minimum similarity.
     */
    private boolean isSimilar(String affSection, String orgSection) {

        int distance = StringUtils.getLevenshteinDistance(affSection, orgSection);
        int maxCharacters = Math.max(affSection.length(), orgSection.length());

        if (maxCharacters == 0) {
            // Both sections are empty: 0/0 would yield NaN. Previously the comparison
            // with NaN silently evaluated to false; keep that outcome, but explicitly.
            return false;
        }

        double similarity = 1 - (double) distance / maxCharacters;

        return similarity >= minSimilarity;
    }


    //------------------------ toString --------------------------

    /**
     * Returns a description containing the minimum similarity and the simple
     * class name of the function used to get the organization names.
     */
    @Override
    public String toString() {
        return Objects.toStringHelper(this)
                .add("minSimilarity", minSimilarity)
                .add("getOrgNamesFunction", getOrgNamesFunction.getClass().getSimpleName())
                .toString();
    }
}