package pl.edu.icm.saos.importer.common.converter;

import java.util.Objects;
import java.util.regex.Pattern;

import javax.annotation.PostConstruct;

import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;

import pl.edu.icm.saos.common.util.PersonNameNormalizer;
import pl.edu.icm.saos.common.util.StringTools;

/**
 * Service that normalizes (corrects) imported judge names.
 * <br/><br/>
 * Lifecycle: the removal patterns are compiled in {@link #postConstruct()}. When the service is
 * created by hand, call the setters (if needed) first and then {@link #postConstruct()};
 * setters invoked afterwards do not take effect until {@link #postConstruct()} is called again.
 *
 * @author Łukasz Dumiszewski
 */
@Service("judgeNameNormalizer")
public class JudgeNameNormalizer {

    /** Default name parts to remove; they are applied one after another, in the listed order. */
    private static final String[] DEFAULT_PARTS_TO_REMOVE = new String[] {
            "sędzia", "sędziowie", "sędziego", "sądu", "rejonowego", "okręgowego", "apelacyjnego",
            "st. sekr.", "sekr. sąd.", "sekr", "sek.", "sądowy", "prof", "prof.", "dr",
            "sso", "s.s.o", "s.s.o.", "s s o", "s. s.o", "so", "so.", "s.o.", "s o", "soo",
            "sa", "sa.", "s.a.", "s a", "do", "spr", "spr.", "wso", "w",
            "gdańsku", "słupsku", "toruniu", "sw", "sprawozdawca", "protokolant", "stażysta",
            "przewodniczący", "p.o.", "ssa", "ss", "sss", "ssw", "s.s.r", "s.s.r.", "ssr",
            "spraw", "spraw.", "spr.aw", "sprw", "spr.awozdanie", "delegowana", "delegowany",
            "sn", "swsg", "gen", "gen.", "dyw.", "płk", "plk", "ppłk", "pplk",
            "sr", "s.r", "s r", "srr", "del", "del.", "dei.", "deleg.", "deleg", "dal.", "wdel",
            "s. ref.", "s. ref", "ref", "ref.", "z izby pracy", "z izby cywilnej", "pr."
    };

    /**
     * Default strict regexes to remove; they are applied before the regexes generated from the parts.
     * The dots in the "spraw." and "ref." entries are escaped so that they match a literal dot only
     * and do not swallow an arbitrary character of a hyphenated surname.
     */
    private static final String[] DEFAULT_REGEXES_TO_REMOVE = new String[] {
            "\\-(spr)\\s*$",
            "\\-(spraw\\.)\\s*$",
            "\\-(sprawozdawca)\\s*$",
            "\\-(ref\\.)\\s*$",
            "^\\s*w\\s+w\\s+",
            "^\\s*(sa)\\s*\\-",
            "^\\s*(sędzia)\\s*\\-",
            "^\\s*(sędzia sa)\\s*\\-",
            "^\\s*(del)\\s*\\-",
    };

    /** Dashes (with adjacent whitespace) at the very beginning or the very end of a name. */
    private static final Pattern EDGE_DASHES = Pattern.compile("^(\\s*\\-+)|(\\-+\\s*)$");

    /** Dots (with adjacent whitespace) at the very beginning or the very end of a name. */
    private static final Pattern EDGE_DOTS = Pattern.compile("^(\\s*\\.+)|(\\.+\\s*)$");

    // The defaults are shared between instances; this is safe because the arrays are only read.
    private String[] partsToRemove = DEFAULT_PARTS_TO_REMOVE;

    private String[] regexesToRemove = DEFAULT_REGEXES_TO_REMOVE;

    /** Compiled removal patterns: strict regexes first, then one pattern per part. Built in postConstruct(). */
    private Pattern[] removalPatterns = null;


    //------------------------ LOGIC --------------------------

    /**
     * Normalizes the given judgeName: runs it through {@link PersonNameNormalizer#normalize(String)}
     * and {@link StringTools#toRootLowerCase(String)}, removes the parts of the name that seem to be
     * a mistake (see: {@link #setRegexesToRemove(String[])} and {@link #setPartsToRemove(String[])}),
     * strips leading/trailing dashes and dots, and finally runs the result through
     * {@link PersonNameNormalizer#normalize(String)} once more.<br/>
     * <br/>
     * <pre>
     * normalize("Sędzia Jan Kowalski") -> "Jan Kowalski"
     * normalize("gen. Jan Kowalski ()") -> "Jan Kowalski"
     * normalize("SA- Jan Kowalski-Bzyk ()") -> "Jan Kowalski-Bzyk"
     * Assumption: sędzia, gen and sa are defined in {@link #setPartsToRemove(String[])}
     * </pre>
     *
     * @param judgeName raw judge name, may be null or blank
     * @return the normalized name, or an empty string if judgeName is blank
     * @throws IllegalStateException if no removal patterns are available (no parts and no regexes
     *         defined, or {@link #postConstruct()} has not been called)
     */
    public String normalize(String judgeName) {

        if (StringUtils.isBlank(judgeName)) {
            return "";
        }

        judgeName = PersonNameNormalizer.normalize(judgeName);

        judgeName = StringTools.toRootLowerCase(judgeName);

        judgeName = removeDefinedParts(judgeName);

        judgeName = removeTrailingDashes(judgeName);

        judgeName = removeTrailingDots(judgeName);

        return PersonNameNormalizer.normalize(judgeName);
    }


    /**
     * Compiles the removal patterns from the currently set regexes and parts.
     * Must be called after the setters and before the first {@link #normalize(String)} call.
     */
    @PostConstruct
    public void postConstruct() {
        generatePartToRemoveRegexes();
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Builds {@link #removalPatterns}: the strict regexes come first (unchanged), followed by one
     * generated pattern per part. Patterns are compiled once here instead of on every replacement.
     */
    private void generatePartToRemoveRegexes() {

        Pattern[] compiled = new Pattern[regexesToRemove.length + partsToRemove.length];
        int next = 0;

        for (String regex : regexesToRemove) {
            compiled[next++] = Pattern.compile(regex);
        }

        for (String part : partsToRemove) {
            compiled[next++] = Pattern.compile(createPartRegex(part));
        }

        removalPatterns = compiled;
    }

    /**
     * Creates a regex matching the given part (taken literally) as a separate word at the beginning,
     * in the middle or at the end of a name, or as the whole name.
     */
    private String createPartRegex(String part) {

        String quotedPart = Pattern.quote(part);

        String beginningPart = "^\\s*(" + quotedPart + ")" + "\\s+";
        String middlePart = "\\s+(" + quotedPart + ")\\s+";
        String endPart = "\\s+(" + quotedPart + ")\\s*$";
        String partAsWhole = "^\\s*(" + quotedPart + ")\\s*$";

        return beginningPart + "|" + middlePart + "|" + endPart + "|" + partAsWhole;
    }

    /** Replaces every match of every removal pattern with a single space, pattern by pattern. */
    private String removeDefinedParts(String judgeName) {

        if (removalPatterns == null || removalPatterns.length == 0) {
            throw new IllegalStateException("no parts or regexes to remove defined");
        }

        for (Pattern removalPattern : removalPatterns) {
            judgeName = removalPattern.matcher(judgeName).replaceAll(" ");
        }

        return judgeName;
    }

    /** Removes dashes from both the beginning and the end of the name (despite the method name). */
    private String removeTrailingDashes(String judgeName) {
        return EDGE_DASHES.matcher(judgeName).replaceAll("");
    }

    /** Removes dots from both the beginning and the end of the name (despite the method name). */
    private String removeTrailingDots(String judgeName) {
        return EDGE_DOTS.matcher(judgeName).replaceAll("");
    }


    //------------------------ SETTERS --------------------------

    /**
     * Parts of a name that should be removed from judge names.<br/>
     * Each part is quoted (treated literally) and turned into a regex that matches the part as
     * a separate word at the beginning, in the middle or at the end of the name, or as the whole name:<br/>
     * {@code "^\\s*(part)\\s+|\\s+(part)\\s+|\\s+(part)\\s*$|^\\s*(part)\\s*$"}
     * <br/><br/>
     *
     * The case of characters doesn't matter. The non-alphabetic characters mustn't be included (they are removed
     * from the name of a judge before removing the given parts from it). You should however include dots [.] and/or
     * dashes [-] if appropriate. <br/><br/>
     *
     * Don't set this field if you want to use default values (see: {@link #DEFAULT_PARTS_TO_REMOVE}). <br/>
     * The array is copied; the new value takes effect on the next {@link #postConstruct()} call.
     *
     * @param partsToRemove parts to remove, must not be null
     * @throws NullPointerException if partsToRemove is null
     */
    public void setPartsToRemove(String[] partsToRemove) {
        this.partsToRemove = Objects.requireNonNull(partsToRemove, "partsToRemove must not be null").clone();
    }

    /**
     * Sets the strict regexes that should be removed from judge names.<br/>
     * You don't have to set this field if you want to use default values (see: {@link #DEFAULT_REGEXES_TO_REMOVE}). <br/>
     * The array is copied; the new value takes effect on the next {@link #postConstruct()} call.
     *
     * @param regexesToRemove regexes to remove, must not be null
     * @throws NullPointerException if regexesToRemove is null
     */
    public void setRegexesToRemove(String[] regexesToRemove) {
        this.regexesToRemove = Objects.requireNonNull(regexesToRemove, "regexesToRemove must not be null").clone();
    }

}