/**
 * AnalyzerBeans
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA 02110-1301 USA
 */
package org.eobjects.analyzer.beans.standardize;

import java.util.ArrayList;
import java.util.List;

import javax.inject.Inject;

import org.eobjects.analyzer.beans.api.Categorized;
import org.eobjects.analyzer.beans.api.Configured;
import org.eobjects.analyzer.beans.api.Description;
import org.eobjects.analyzer.beans.api.Initialize;
import org.eobjects.analyzer.beans.api.OutputColumns;
import org.eobjects.analyzer.beans.api.Transformer;
import org.eobjects.analyzer.beans.api.TransformerBean;
import org.eobjects.analyzer.beans.categories.MatchingAndStandardizationCategory;
import org.eobjects.analyzer.data.InputColumn;
import org.eobjects.analyzer.data.InputRow;
import org.eobjects.analyzer.util.HasGroupLiteral;
import org.eobjects.analyzer.util.NamedPattern;
import org.eobjects.analyzer.util.NamedPatternMatch;

/**
 * Tokenizes/standardizes four components of a full name: Firstname, Lastname,
 * Middlename and Titulation.
 *
 * The configured string patterns are tried in order and the first pattern that
 * yields a match decides the result. When no pattern matches (or the input is
 * null) all four output tokens are null.
 */
@TransformerBean("Name standardizer")
@Description("Identify the various parts of a full name column and turn it into separate, standardized tokens.")
@Categorized({ MatchingAndStandardizationCategory.class })
public class NameStandardizerTransformer implements Transformer<String> {

    /**
     * Patterns used when no explicit "Name patterns" configuration is given.
     * Order matters: {@link #transform(String)} stops at the first match.
     */
    public static final String[] DEFAULT_PATTERNS = { "FIRSTNAME LASTNAME", "TITULATION. FIRSTNAME LASTNAME",
            "TITULATION FIRSTNAME LASTNAME", "FIRSTNAME MIDDLENAME LASTNAME",
            "TITULATION. FIRSTNAME MIDDLENAME LASTNAME", "LASTNAME, FIRSTNAME", "LASTNAME, FIRSTNAME MIDDLENAME" };

    /**
     * The name tokens that may appear in a pattern string.
     */
    public enum NamePart implements HasGroupLiteral {
        FIRSTNAME, LASTNAME, MIDDLENAME, TITULATION;

        /**
         * @return a fixed alternation of known titles for {@link #TITULATION},
         *         and null for every other part.
         */
        @Override
        public String getGroupLiteral() {
            if (this == TITULATION) {
                return "(Mr|Ms|Mrs|Hr|Fru|Frk|Miss|Mister)";
            }
            // NOTE(review): null presumably makes NamedPattern fall back to
            // its own default group expression - confirm against NamedPattern.
            return null;
        }
    }

    @Inject
    @Configured
    InputColumn<String> inputColumn;

    @Inject
    @Configured("Name patterns")
    String[] stringPatterns = DEFAULT_PATTERNS;

    /**
     * Compiled form of {@link #stringPatterns}; null until compiled, and reset
     * to null whenever the string patterns are replaced through the setter.
     * Volatile so that a list built lazily inside transform() is only ever
     * seen fully populated by other threads.
     */
    private volatile List<NamedPattern<NamePart>> namedPatterns;

    /**
     * Compiles the configured string patterns. A null pattern array is
     * normalized to an empty array, in which case nothing will ever match.
     */
    @Initialize
    public void init() {
        if (stringPatterns == null) {
            stringPatterns = new String[0];
        }
        // Build the complete list first and publish it in a single assignment,
        // so the field never refers to a partially filled list.
        namedPatterns = compilePatterns(stringPatterns);
    }

    /**
     * @return the four output columns, in the same order as the array
     *         returned by the transform methods.
     */
    @Override
    public OutputColumns getOutputColumns() {
        return new OutputColumns("Firstname", "Lastname", "Middlename", "Titulation");
    }

    /**
     * Reads the configured input column from the row and standardizes it.
     *
     * @param inputRow
     *            the row to read the full name from
     * @return see {@link #transform(String)}
     */
    @Override
    public String[] transform(InputRow inputRow) {
        String value = inputRow.getValue(inputColumn);
        return transform(value);
    }

    /**
     * Splits a full name into its parts using the first matching pattern.
     *
     * If {@link #init()} has not been invoked yet (or the patterns were
     * replaced since), the patterns are compiled on demand instead of failing
     * with a NullPointerException.
     *
     * @param value
     *            the full name, may be null
     * @return a four element array: first name, last name, middle name and
     *         titulation. Elements are null for parts that were not found;
     *         all elements are null if value is null or nothing matched.
     */
    public String[] transform(String value) {
        String firstName = null;
        String lastName = null;
        String middleName = null;
        String titulation = null;

        if (value != null) {
            for (NamedPattern<NamePart> namedPattern : getNamedPatterns()) {
                NamedPatternMatch<NamePart> match = namedPattern.match(value);
                if (match != null) {
                    firstName = match.get(NamePart.FIRSTNAME);
                    lastName = match.get(NamePart.LASTNAME);
                    middleName = match.get(NamePart.MIDDLENAME);
                    titulation = match.get(NamePart.TITULATION);
                    // first matching pattern wins
                    break;
                }
            }
        }
        return new String[] { firstName, lastName, middleName, titulation };
    }

    /**
     * Sets the input column. The cast is unchecked: the caller is responsible
     * for passing a column that actually holds strings.
     *
     * @param inputColumn
     *            the column containing full names
     */
    @SuppressWarnings("unchecked")
    public void setInputColumn(InputColumn<?> inputColumn) {
        this.inputColumn = (InputColumn<String>) inputColumn;
    }

    /**
     * Replaces the name patterns. Any previously compiled patterns are
     * discarded, so the new patterns take effect on the next call to
     * {@link #init()} or {@link #transform(String)}, whichever comes first.
     *
     * @param stringPatterns
     *            the new patterns, in priority order
     */
    public void setStringPatterns(String... stringPatterns) {
        this.stringPatterns = stringPatterns;
        // Invalidate the compiled list; otherwise a setter call made after
        // init() would be silently ignored.
        this.namedPatterns = null;
    }

    /**
     * Returns the compiled patterns, compiling them first when necessary.
     */
    private List<NamedPattern<NamePart>> getNamedPatterns() {
        // Work on a local copy so that a concurrent invalidation through the
        // setter cannot turn the returned reference into null.
        List<NamedPattern<NamePart>> patterns = namedPatterns;
        if (patterns == null) {
            final String[] source = stringPatterns;
            patterns = compilePatterns(source == null ? new String[0] : source);
            namedPatterns = patterns;
        }
        return patterns;
    }

    /**
     * Turns each pattern string into a NamedPattern, preserving order.
     */
    private static List<NamedPattern<NamePart>> compilePatterns(String[] patterns) {
        final List<NamedPattern<NamePart>> compiled = new ArrayList<NamedPattern<NamePart>>(patterns.length);
        for (String pattern : patterns) {
            compiled.add(new NamedPattern<NamePart>(pattern, NamePart.class));
        }
        return compiled;
    }
}