/* * Sifarish: Recommendation Engine * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.sifarish.etl; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.sifarish.feature.DynamicAttrSimilarityStrategy; /** * Country specific formats for different kinds of structured text data * @author pranab * */ public abstract class CountryStandardFormat { protected Map<String, String> stateCodes = new HashMap<String, String>(); private static String fullName = "(\\w{2,})\\s+(\\w{2,})\\s+(\\w{2,})"; private static String firstFullName = "(\\w{2,})\\s+(\\w{2,})"; private static String firstFullMidIntialName = "(\\w{2,})\\s+(\\w{1})\\s+(\\w{2,})"; private static String firstMidIntialName = "(\\w{1})\\s+(\\w{1})\\s+(\\w{2,})"; private static String lastNameFirstFirstIntialName = "(\\w{2,}),\\s+(\\w{1})"; private static String lastNameFirstFirstName = "(\\w{2,}),\\s+(\\w{2,})"; private static String lastNameFirstFirstMidIntialName = "(\\w{2,}),\\s+(\\w{1})\\s+(\\w{1})"; private static String lastNameFirstFirstMidName = "(\\w{2,}),\\s+(\\w{2,})\\s+(\\w{2,})"; private static Pattern fullNamePattern = Pattern.compile(fullName); private static Pattern firstFullNamePattern = Pattern.compile(firstFullName); private static Pattern firstFullMidIntialNamePattern = Pattern.compile(firstFullMidIntialName); private static Pattern firstMidIntialNamePattern = Pattern.compile(firstMidIntialName); private static Pattern lastNameFirstFirstIntialNamePattern = Pattern.compile(lastNameFirstFirstIntialName); private static Pattern lastNameFirstFirstNamePattern = Pattern.compile(lastNameFirstFirstName); private static Pattern lastNameFirstFirstMidIntialNamePattern = Pattern.compile(lastNameFirstFirstMidIntialName); private static Pattern lastNameFirstFirstMidNamePattern = Pattern.compile(lastNameFirstFirstMidName); /** * @param country * @return */ public static CountryStandardFormat createCountryStandardFormat(String country, StructuredTextNormalizer textNormalizer) { CountryStandardFormat countryFormat = null; if (country.equals("USA")) { countryFormat = new UnitedStatesStandardFormat(textNormalizer); } else { throw new IllegalArgumentException("invalid country name"); } return countryFormat; } /** * */ public CountryStandardFormat() { super(); intializeStateCodes(); } /** * initializes state codes */ public abstract void intializeStateCodes(); /** * case based formatting * @param item * @param format * @return */ public abstract String caseFormat(String item, String format); /** * phone number formatting * @param item * @param format * @return */ public abstract String phoneNumFormat(String item, String format); /** * state name formatting * @param item * @return */ public abstract String stateFormat(String item) throws IOException; /** * @param item * @param fuzzyMatch * @param textSimStrategy * @param minDist * @return * @throws IOException */ public abstract String stateFormat(String item, boolean fuzzyMatch, DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException; /** * @param item * @param format * @return */ public String personNameFormat(String item) { String firstFull = null; String firstInitial = null; String middleFull = null; String middleInitial = null; String last = null; boolean matchFound = false; //first full, mid full, last Matcher matcher = fullNamePattern.matcher(item); if (matcher.matches()) { firstFull = matcher.group(1); middleFull = matcher.group(2); last = matcher.group(3); matchFound = true; } //first full, last if (!matchFound) { matcher = firstFullNamePattern.matcher(item); if (matcher.matches()) { firstFull = matcher.group(1); last = matcher.group(2); matchFound = true; } } //first full, middle initial,last if (!matchFound) { matcher = firstFullMidIntialNamePattern.matcher(item); if (matcher.matches()) { firstFull = matcher.group(1); middleInitial = matcher.group(2); last = matcher.group(3); matchFound = true; } } //first , middle initial,last if (!matchFound) { matcher = firstMidIntialNamePattern.matcher(item); if (matcher.matches()) { firstInitial = matcher.group(1); middleInitial = matcher.group(2); last = matcher.group(3); matchFound = true; } } //last , first initial if (!matchFound) { matcher = lastNameFirstFirstIntialNamePattern.matcher(item); if (matcher.matches()) { last = matcher.group(1); firstInitial = matcher.group(2); matchFound = true; } } //last , first if (!matchFound) { matcher = lastNameFirstFirstNamePattern.matcher(item); if (matcher.matches()) { last = matcher.group(1); firstFull = matcher.group(2); matchFound = true; } } //last , first initial, mid intial if (!matchFound) { matcher = lastNameFirstFirstMidIntialNamePattern.matcher(item); if (matcher.matches()) { last = matcher.group(1); firstInitial = matcher.group(2); middleInitial = matcher.group(3); matchFound = true; } } //last , first , mid if (!matchFound) { matcher = lastNameFirstFirstMidNamePattern.matcher(item); if (matcher.matches()) { last = matcher.group(1); firstFull = matcher.group(2); middleFull = matcher.group(3); matchFound = true; } } if (null != firstFull) { firstFull = StringUtils.capitalize(firstFull.toLowerCase()); } if (null != firstInitial) { firstInitial = StringUtils.upperCase(firstInitial); } if (null != middleFull) { middleFull = StringUtils.capitalize(middleFull.toLowerCase()); } if (null != middleInitial) { middleInitial = StringUtils.upperCase(middleInitial); } if (null != last) { last = StringUtils.capitalize(last.toLowerCase()); } StringBuilder stBld = new StringBuilder(); if (null != firstFull) { stBld.append(firstFull).append(" "); } else if (null != firstInitial ) { stBld.append(firstInitial).append(" "); } if (null != middleFull) { stBld.append(middleFull).append(" "); } else if (null != middleInitial ) { stBld.append(middleInitial).append(" "); } if (null != last) { stBld.append(last); } return stBld.toString(); } /** * @param item * @return */ public abstract String streetAddressFormat(String item) throws IOException; /** * @param item * @return */ public abstract String addressFormat(String item) throws IOException; /** * @param item * @return */ public abstract String streetAddressOneFormat(String item) throws IOException; /** * @param item * @param fuzzyMatch * @param textSimStrategy * @param minDist * @return * @throws IOException */ public abstract String streetAddressOneFormat(String item, boolean fuzzyMatch, DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException; /** * @param item * @return */ public abstract String streetAddressTwoFormat(String item) throws IOException; /** * @param item * @param fuzzyMatch * @param textSimStrategy * @param minDist * @return * @throws IOException */ public abstract String streetAddressTwoFormat(String item, boolean fuzzyMatch, DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException; /** * @param item * @param format * @return */ public String emailFormat(String item, String format) { String[] elements = item.split("@"); String name = elements[0]; if (format.equals("lowerCase")) { name = name.toLowerCase(); } else if (format.equals("upperCase")) { name = name.toUpperCase(); } else if (format.equals("capitalize")) { name = StringUtils.capitalize(name.toLowerCase()); } else { throw new IllegalArgumentException("invalid case format"); } return name + "@" + elements[1]; } /** * @param item * @return */ public String removePunctuations(String item) { String newItem = item.replaceAll("\\.",""); newItem = item.replaceAll(",",""); return newItem; } /** * @param component * @param tokenNormalizer * @param textSimStrategy * @param minDist * @return * @throws IOException */ protected Pair<Boolean, String> fuzyyMatchComponent(String component, TextFieldTokenNormalizer tokenNormalizer, DynamicAttrSimilarityStrategy textSimStrategy, double minDist) throws IOException { String newComponent = component; boolean fuzzyMatched = false; if (!tokenNormalizer.containsNormalize(component)) { //try fuzzy matching Pair<String, Double> match = tokenNormalizer.fuzzymatchWithUnnormalized(component, textSimStrategy); if (match.getRight() <= minDist) { newComponent = match.getLeft(); newComponent = tokenNormalizer.normalize(newComponent); fuzzyMatched = true; } else { match = tokenNormalizer.fuzzymatchWithNormalized(component, textSimStrategy); if (match.getRight() <= minDist) { newComponent = match.getLeft(); fuzzyMatched = true; } } } return new ImmutablePair<Boolean, String>(fuzzyMatched, newComponent); } }