package eu.dnetlib.iis.wf.affmatching.normalize;

import java.io.Serializable;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;

import eu.dnetlib.iis.common.string.LenientComparisonStringNormalizer;
import eu.dnetlib.iis.common.string.StringNormalizer;

/**
 * {@link StringNormalizer} that normalizes organization name string
 *
 * @author madryk
 */
public class OrganizationNameNormalizer implements StringNormalizer, Serializable {

    private static final long serialVersionUID = 1L;

    /**
     * Matches the shortest run of text enclosed in round brackets, brackets included.
     * DOTALL lets a bracketed fragment span line breaks.
     */
    private static final Pattern BRACKETED_TEXT = Pattern.compile("\\(.*?\\)", Pattern.DOTALL);

    /** Matches one or more consecutive space characters. */
    private static final Pattern SPACE_RUN = Pattern.compile(" +");


    private StringNormalizer innerNormalizer = new LenientComparisonStringNormalizer(ImmutableList.of(',', ';'));

    private Set<String> stopwords = Sets.newHashSet("of");


    //------------------------ LOGIC --------------------------

    /**
     * Normalizes the given organization name. The steps are applied in this order:
     * <ul>
     * <li>removes any text between round brackets (together with the brackets)</li>
     * <li>performs inner normalization (using {@link LenientComparisonStringNormalizer} by default)</li>
     * <li>removes stopwords (defined in {@link #setStopwords(Set)}) that occur as whole words</li>
     * <li>trims the result and collapses every run of spaces into a single space</li>
     * </ul>
     *
     * @param organizationName organization name to normalize, may be null or blank
     * @return normalized organization name, or an empty string if the given name is blank
     */
    @Override
    public String normalize(String organizationName) {

        if (StringUtils.isBlank(organizationName)) {
            return "";
        }

        // remove brackets
        String filteredOrganizationName = BRACKETED_TEXT.matcher(organizationName).replaceAll("");

        // internal normalization
        filteredOrganizationName = innerNormalizer.normalize(filteredOrganizationName);

        // remove stopwords
        for (String stopword : stopwords) {
            if (StringUtils.isEmpty(stopword)) {
                // a null entry would otherwise be concatenated as the literal text "null";
                // an empty entry cannot remove anything
                continue;
            }
            // quote the stopword so that regex metacharacters in it are matched literally
            Pattern stopwordPattern = Pattern.compile("\\b" + Pattern.quote(stopword) + "\\b");
            filteredOrganizationName = stopwordPattern.matcher(filteredOrganizationName).replaceAll("");
        }

        // removing brackets and stopwords may leave leading, trailing or doubled spaces
        return SPACE_RUN.matcher(filteredOrganizationName.trim()).replaceAll(" ");
    }


    //------------------------ SETTERS --------------------------

    /**
     * Sets the normalizer that is applied to the organization name after
     * the bracketed text has been removed and before the stopwords are removed.
     */
    public void setInnerNormalizer(StringNormalizer innerNormalizer) {
        this.innerNormalizer = innerNormalizer;
    }

    /**
     * Sets the words that are removed from the organization name after inner normalization.
     * Each stopword is matched literally and only as a whole word.
     */
    public void setStopwords(Set<String> stopwords) {
        this.stopwords = stopwords;
    }

}