package eu.dnetlib.iis.wf.affmatching.orgsection; import java.io.Serializable; import java.util.List; import org.apache.commons.lang3.StringUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import eu.dnetlib.iis.wf.affmatching.orgsection.OrganizationSection.OrgSectionType; /** * Splitter of organization name into sections. * * @author madryk */ public class OrganizationSectionsSplitter implements Serializable { private static final long serialVersionUID = 1L; private static final List<String> RESTRICTED_SECTIONS = ImmutableList.<String>builder() .add("ltd") .add("inc") .build(); //------------------------ LOGIC -------------------------- /** * Splits provided organization name into sections.<br/> * Method assumes that sections are separated by a comma or by a semicolon in * organization name string. */ public List<String> splitToSections(String organizationName) { String[] sectionsArray = StringUtils.split(organizationName, ",;"); List<String> sections = Lists.newArrayList(); for (int i=0; i<sectionsArray.length; ++i) { String section = sectionsArray[i].trim(); if (StringUtils.isNotBlank(section) && !RESTRICTED_SECTIONS.contains(section)) { sections.add(section); } } return sections; } /** * Splits provided organization name into sections.<br/> * Internally uses {@link #splitToSections(String)}. * * @return detailed information about sections */ public List<OrganizationSection> splitToSectionsDetailed(String organizationName) { List<String> sections = splitToSections(organizationName); List<OrganizationSection> sectionsDetailed = Lists.newArrayList(); for (String section : sections) { OrganizationSection sectionDetailed = buildSectionDetailed(section); sectionsDetailed.add(sectionDetailed); } return sectionsDetailed; } //------------------------ PRIVATE -------------------------- private OrganizationSection buildSectionDetailed(String section) { String[] sectionWords = section.split(" "); int universityWordPos = findWordStartingWithAny(sectionWords, "univ", "uniw"); if (universityWordPos != -1) { return new OrganizationSection(OrgSectionType.UNIVERSITY, sectionWords, universityWordPos); } return new OrganizationSection(OrgSectionType.UNKNOWN, sectionWords, -1); } private int findWordStartingWithAny(String[] words, String ... wordStart) { for (int i=0; i<words.length; ++i) { if (StringUtils.startsWithAny(words[i], wordStart)) { return i; } } return -1; } }