package eu.dnetlib.iis.wf.affmatching.bucket; import java.io.Serializable; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.iis.wf.affmatching.orgsection.OrganizationSection; /** * Class that generates hash of the given {@link OrganizationSection}. * * @author madryk */ public class OrganizationSectionHasher implements Serializable { private static final long serialVersionUID = 1L; private int numberOfLettersPerWord = 3; //------------------------ LOGIC -------------------------- /** * Returns a hash of the passed section.<br/> * * Hash is generated from first letters of {@link OrganizationSection#getType()}, * first {@link OrganizationSectionHasher#setNumberOfLettersPerWord(int)} letters * of one word before and one word after the type significant word * {@link OrganizationSection#getTypeSignificantWordPos()}.<br/> * * If passed section has no type significant word then * the first and the second word of the section will be used.<br/> * * Hash length is always the same. Any missing characters will be replaced * by underscores.<br/><br/> * */ public String hash(OrganizationSection section) { if (section.getTypeSignificantWordPos() == -1) { return hashUsingFirstWords(section); } return hashUsingAdjacentWords(section); } //------------------------ PRIVATE -------------------------- private String hashUsingAdjacentWords(OrganizationSection section) { String[] words = section.getSectionWords(); int typeSignificantWordPos = section.getTypeSignificantWordPos(); String wordBefore = (typeSignificantWordPos <= 0) ? "" : words[typeSignificantWordPos - 1]; String wordAfter = (typeSignificantWordPos >= words.length-1) ? "" : words[typeSignificantWordPos + 1]; String wordBeforeAfterHash = generateSortedHash(wordBefore, wordAfter); return generateWordHash(section.getType().name(), OrganizationSection.SECTION_NUMBER_OF_LETTERS) + wordBeforeAfterHash; } private String hashUsingFirstWords(OrganizationSection section) { String[] words = section.getSectionWords(); String firstWord = words.length == 0 ? "" : words[0]; String secondWord = words.length <= 1 ? "" : words[1]; String wordFirstSecondHash = generateSortedHash(firstWord, secondWord); return generateWordHash(section.getType().name(), OrganizationSection.SECTION_NUMBER_OF_LETTERS) + wordFirstSecondHash; } private String generateSortedHash(String firstWord, String secondWord) { String wordBeforeHash = generateWordHash(firstWord); String wordAfterHash = generateWordHash(secondWord); String wordBeforeAfterHash = wordBeforeHash + wordAfterHash; if (wordBeforeHash.compareTo(wordAfterHash) > 0) { wordBeforeAfterHash = wordAfterHash + wordBeforeHash; } return wordBeforeAfterHash; } private String generateWordHash(String word) { return generateWordHash(word, numberOfLettersPerWord); } private String generateWordHash(String word, int numberOfLettersPerWord) { return StringUtils.rightPad(word, numberOfLettersPerWord, '_').substring(0, numberOfLettersPerWord); } //------------------------ SETTERS -------------------------- public void setNumberOfLettersPerWord(int numberOfLettersPerWord) { this.numberOfLettersPerWord = numberOfLettersPerWord; } }