package eu.dnetlib.iis.wf.affmatching.match.voter;

import java.util.List;
import java.util.function.Function;

import org.apache.commons.lang3.StringUtils;

import com.google.common.base.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import eu.dnetlib.iis.wf.affmatching.model.AffMatchAffiliation;
import eu.dnetlib.iis.wf.affmatching.model.AffMatchOrganization;

/**
 * Match voter that checks if <br/>
 * the ratio of the common (same/similar) words in organization names specified in {@link AffMatchAffiliation} and
 * {@link AffMatchOrganization} WITH REGARD TO all the words in the organization name in the given organization
 * or affiliation<br/>
 * IS GREATER THAN OR EQUAL TO some expected value.
 * <p>
 * The {@link CommonSimilarWordCalculator} is not created by this class; it has to be supplied through
 * {@link #setCommonSimilarWordCalculator(CommonSimilarWordCalculator)} before
 * {@link #voteMatch(AffMatchAffiliation, AffMatchOrganization)} compares any names.
 *
 * @author madryk, lukdumi
 */
public class CommonWordsVoter extends AbstractAffOrgMatchVoter {

    private static final long serialVersionUID = 1L;

    /** How the ratio of common words will be calculated */
    public enum RatioRelation {

        /**
         * the ratio of common words will be calculated with regard to the number of words
         * in {@link AffMatchAffiliation#getOrganizationName()}
         */
        WITH_REGARD_TO_AFF_WORDS,

        /**
         * the ratio of common words will be calculated with regard to the number of words
         * in organization name in an organization object
         */
        WITH_REGARD_TO_ORG_WORDS
    }


    private StringFilter stringFilter = new StringFilter();

    /** Has no default value - must be set before voting, see the class description. */
    private CommonSimilarWordCalculator commonSimilarWordCalculator;

    private RatioRelation ratioRelation;

    private final List<Character> charsToFilter;

    private final double minCommonWordsRatio;

    private final int wordToRemoveMaxLength;

    private Function<AffMatchOrganization, List<String>> getOrgNamesFunction = new GetOrgNameFunction();


    //------------------------ CONSTRUCTORS --------------------------

    /**
     * Default constructor
     *
     * @param charsToFilter list of characters that will be filtered out before comparing words; must not be null
     * @param wordToRemoveMaxLength words with length equal or less than this value will be filtered out
     *      before comparing words. Setting it to zero disables this feature. Must not be negative.
     * @param minCommonWordsRatio minimum ratio of common words of the organization name in an affiliation and
     *      organization with regard to all the words in the affiliation or organization.
     *      The value must be between (0,1].
     * @param ratioRelation decides how the ratio of the common words will be calculated (with respect to the words
     *      in an affiliation or organization); must not be null
     *
     * @throws NullPointerException if charsToFilter or ratioRelation is null
     * @throws IllegalArgumentException if wordToRemoveMaxLength is negative or minCommonWordsRatio is outside (0,1]
     *
     * @see StringSimilarityChecker#containsSimilarString(java.util.Collection, String, double)
     */
    public CommonWordsVoter(List<Character> charsToFilter, int wordToRemoveMaxLength, double minCommonWordsRatio,
            RatioRelation ratioRelation) {

        super();

        Preconditions.checkNotNull(charsToFilter);
        Preconditions.checkArgument(wordToRemoveMaxLength >= 0);
        Preconditions.checkArgument(minCommonWordsRatio > 0 && minCommonWordsRatio <= 1);
        Preconditions.checkNotNull(ratioRelation);

        this.charsToFilter = charsToFilter;
        this.wordToRemoveMaxLength = wordToRemoveMaxLength;
        this.minCommonWordsRatio = minCommonWordsRatio;
        this.ratioRelation = ratioRelation;
    }


    //------------------------ LOGIC --------------------------

    /**
     * Returns true if the ratio of the common (same/similar) words in organization names specified in
     * {@link AffMatchAffiliation} and {@link AffMatchOrganization} WITH REGARD TO all the words in the organization
     * name in the given organization or affiliation<br/>
     * IS GREATER THAN OR EQUAL TO minCommonWordsRatio.
     * <p>
     * Both names are filtered (characters and short words) before being split into words. The method returns
     * false if the filtered affiliation name is empty. Organization names that are empty after filtering are
     * skipped; the first organization name that reaches the required ratio makes the method return true.
     *
     * @see #CommonWordsVoter(List, int, double, RatioRelation)
     * @see #setGetOrgNamesFunction(Function)
     */
    @Override
    public boolean voteMatch(AffMatchAffiliation affiliation, AffMatchOrganization organization) {

        String filteredAffName = stringFilter.filterCharsAndShortWords(
                affiliation.getOrganizationName(), charsToFilter, wordToRemoveMaxLength);

        if (StringUtils.isEmpty(filteredAffName)) {
            return false;
        }

        List<String> affWords = ImmutableList.copyOf(StringUtils.split(filteredAffName));

        for (String orgName : getOrgNamesFunction.apply(organization)) {

            String filteredOrgName = stringFilter.filterCharsAndShortWords(
                    orgName, charsToFilter, wordToRemoveMaxLength);

            if (StringUtils.isEmpty(filteredOrgName)) {
                continue;
            }

            List<String> orgWords = ImmutableList.copyOf(StringUtils.split(filteredOrgName));

            if (isProperNumberOfSimilarWords(affWords, orgWords)) {
                return true;
            }
        }

        return false;
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Calculates the similar word ratio in the direction chosen by {@link #ratioRelation} and tells whether
     * it is at least {@link #minCommonWordsRatio}.
     */
    private boolean isProperNumberOfSimilarWords(List<String> affWords, List<String> orgWords) {

        final double similarWordRatio;

        // the argument order passed to the calculator depends on the configured relation
        if (ratioRelation == RatioRelation.WITH_REGARD_TO_AFF_WORDS) {
            similarWordRatio = commonSimilarWordCalculator.calcSimilarWordRatio(affWords, orgWords);
        } else {
            similarWordRatio = commonSimilarWordCalculator.calcSimilarWordRatio(orgWords, affWords);
        }

        return similarWordRatio >= minCommonWordsRatio;
    }


    //------------------------ SETTERS --------------------------

    /**
     * Sets the function that will be used to get the organization names
     *
     * @throws NullPointerException if getOrgNamesFunction is null
     */
    public void setGetOrgNamesFunction(Function<AffMatchOrganization, List<String>> getOrgNamesFunction) {
        Preconditions.checkNotNull(getOrgNamesFunction);
        this.getOrgNamesFunction = getOrgNamesFunction;
    }

    /**
     * Sets the calculator used to compute the ratio of similar words between two word lists
     *
     * @throws NullPointerException if commonSimilarWordCalculator is null
     */
    public void setCommonSimilarWordCalculator(CommonSimilarWordCalculator commonSimilarWordCalculator) {
        Preconditions.checkNotNull(commonSimilarWordCalculator);
        this.commonSimilarWordCalculator = commonSimilarWordCalculator;
    }

    /**
     * Sets how the ratio of the common words will be calculated
     *
     * @throws NullPointerException if ratioRelation is null
     */
    public void setRatioRelation(RatioRelation ratioRelation) {
        Preconditions.checkNotNull(ratioRelation);
        this.ratioRelation = ratioRelation;
    }


    //------------------------ toString --------------------------

    @Override
    public String toString() {
        // the labels are kept exactly as they were so that the textual output does not change
        return Objects.toStringHelper(this)
                .add("matchStength", getMatchStrength())
                .add("ratioRelation", ratioRelation)
                .add("charsToFilter", charsToFilter)
                .add("minFittingOrgWordsRatio", minCommonWordsRatio)
                .add("wordToRemoveMaxLength", wordToRemoveMaxLength)
                .add("getOrgNamesFunction", getOrgNamesFunction.getClass().getSimpleName())
                .add("commonSimilarWordCalculator", commonSimilarWordCalculator)
                .toString();
    }

}