package com.gisgraphy.addressparser.format; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.gisgraphy.addressparser.Address; import com.gisgraphy.addressparser.StreetTypeOrder; /** * @author <a href="mailto:david.masclet@gisgraphy.com">David Masclet</a> * */ public class BasicAddressFormater { private static final Pattern CAPITALIZE = Pattern.compile("(?:\\b([a-z0-9])([a-z0-9]*)\\b(\\W*))",Pattern.CASE_INSENSITIVE); public static final String FILENAME = "format.tsv"; private static int NUMBER_OF_FIELDS_BY_LINE = 9; Map<String, AddressFormatInfo> formatMap = new HashMap<String, AddressFormatInfo>(); static BasicAddressFormater instance = new BasicAddressFormater(); public BasicAddressFormater() { super(); init(); } public static BasicAddressFormater getInstance(){ return instance; } public AddressFormatInfo getCountryInfo(String countryCode){ if (countryCode!=null){ return formatMap.get(countryCode.toUpperCase()); } else { return null; } } void init() { BufferedReader br = null; InputStream bis = null; int count = 0; try { bis = Thread.currentThread().getContextClassLoader().getResourceAsStream(FILENAME); if (bis == null) { throw new RuntimeException("file " + FILENAME + " does not exists or is not present in classpath"); } br = new BufferedReader(new InputStreamReader(bis, "UTF-8")); String line; while ((line = br.readLine()) != null) { count++; String[] fields = line.split("\\t"); if (fields.length != NUMBER_OF_FIELDS_BY_LINE) { throw new RuntimeException("Line " + count + " has not the correct number Of fields : expected " + NUMBER_OF_FIELDS_BY_LINE + " but was " + fields.length); } AddressFormatInfo formatInfo = new AddressFormatInfo(); //country code if (fields[0] == null || fields[0].trim().length() != 2) { throw new RuntimeException("Incorect countrycode '" + fields[0].trim().length() + "' on line " + count); } //pattern if (fields[1] == null) { throw new RuntimeException("Format pattern is null for " + fields[0] + " on line " + count); } if (fields[1] != null && !"".equals(fields[1].trim())) { formatInfo.setFormatString(fields[1]); } // if (fields[2] != null) { if (fields[2].trim().equals("1")){ formatInfo.setOptionalState(true); } else if ( fields[2].trim().equals("0")||"".equals(fields[2].trim())){ formatInfo.setOptionalState(false); } else { throw new RuntimeException("address formater : unknow optional state value '"+fields[2]); } } if (fields[3] != null && !"".equals(fields[3].trim())) { int value; try { value = Integer.valueOf(fields[3].trim()); } catch (Exception e) { throw new RuntimeException("address formater : unknow street type before street name value '"+fields[4]); } formatInfo.setStateLevel(value);; } if (fields[4] != null && !"".equals(fields[4].trim())) { int value; try { value = Integer.valueOf(fields[4].trim()); } catch (Exception e) { throw new RuntimeException("address formater : unknow street type before street name value '"+fields[4]); } formatInfo.setStreetTypeBeforeStreetName(value); } if (fields[5] != null) { if (fields[5].trim().equals("1")){ formatInfo.setStateCode(true); } else if (fields[5].trim().equals("0") || "".equals(fields[5].trim())){ formatInfo.setStateCode(false); } else { throw new RuntimeException("address formater : unknow state code value '"+fields[4]); } } if (fields[6] != null && !"".equals(fields[6].trim())) { formatInfo.setFormatRTLString(fields[6]); } if (fields[7] != null) { if (fields[7].trim().equals("1")){ formatInfo.setPoBoxOnly(true); } else if ( fields[7].trim().equals("0")||"".equals(fields[7].trim())){ formatInfo.setPoBoxOnly(false); } else { throw new RuntimeException("address formater : unknow pobox only value '"+fields[6]); } } if (fields[8] != null && !"".equals(fields[8].trim())) { Matcher m = CAPITALIZE.matcher(fields[8]); StringBuilder sb = new StringBuilder(); int last = 0; while (m.find()) { if (m.group(2).length()==0){ sb.append(m.group(1).toLowerCase()); } else { sb.append(m.group(1).toUpperCase()); sb.append(m.group(2).toLowerCase()); } sb.append(m.group(3)); last = m.end(); } sb.append(fields[8].substring(last).toLowerCase()); String stringCamel = sb.toString(); formatInfo.setCountryName(stringCamel); } formatMap.put(fields[0].toUpperCase(), formatInfo); } } catch (Exception e) { throw new RuntimeException(e.getMessage(), e); } finally { if (bis != null) { try { bis.close(); } catch (IOException e) { } } if (br != null) { try { br.close(); } catch (IOException e) { } } } } /** * same as {@link #getEnvelopeAddress(Address, ScriptType, DisplayMode)}, assuming scriptType is {@linkplain ScriptType#LTR} */ public String getEnvelopeAddress(Address address,ScriptType scriptType, DisplayMode displayMode) { String newLine; if (DisplayMode.ENVELOPE == displayMode) { newLine = "\r\n"; } else if (DisplayMode.HTML == displayMode) { newLine = "<br/>"; } else if (DisplayMode.COMMA == displayMode) { newLine = ", "; } else { newLine = " "; } List<String> lines = getLines(address,scriptType); return join(lines, newLine); } public String getEnvelopeAddress(Address address, DisplayMode displayMode) { return getEnvelopeAddress(address,ScriptType.LTR, displayMode); } private static boolean isNotNullOrEmpty(String toTest){ if (toTest != null && !"".trim().equals(toTest)){ return true; } return false; } /** * same as {@link #getLines(Address, ScriptType)}, assuming scriptType is {@linkplain ScriptType#LTR} */ protected List<String> getLines(Address address) { return getLines(address,ScriptType.LTR); } /** * Gets formatted address. For example, * * <p> * John Doe<br> * Dnar Corp<br> * 5th St<br> * Santa Monica CA 90123 * </p> * * This method does not validate addresses. Also, it will "normalize" the * result strings by removing redundant spaces and empty lines. */ protected List<String> getLines(Address address,ScriptType scriptTypeparam) { if (address == null) { throw new RuntimeException("null input address not allowed"); } String regionCode = address.getCountryCode(); if (regionCode != null){ regionCode = regionCode.toUpperCase(); } ScriptType scriptType = scriptTypeparam==null? ScriptType.LTR:scriptTypeparam; List<String> lines = new ArrayList<String>(); if (scriptType == ScriptType.LTR && (isNotNullOrEmpty(address.getName()) || isNotNullOrEmpty(address.getRecipientName()))){ lines.add(joinAndSkipNulls(" ",address.getName(),address.getRecipientName()).trim()); } String formatString = getFormatString(scriptType, regionCode, address); if (address.getStreetName()==null && formatString.indexOf("2")>=0){ formatString = formatString.replace("*75*", "*5*7*"); } String[] substrings = formatString.split("\\*"); for (String substr : substrings) { StringBuilder currentLine = new StringBuilder(); int donTputNextComma = -1; int current_item = 0; for (char c : substr.toCharArray()) { boolean lineIsNotEmpty = currentLine.toString().trim().length()!=0; String sep= ","; current_item++; String part = ""; if (c == '0'){ /*AddressFormatInfo formatInfo = formatMap.get(regionCode); part = formatInfo.getCountryName(); currentLine.append(part).append(" ");*/ //will be managed in a separate part } else if (c == '1'){ part = joinAndSkipNulls(" ",address.getHouseNumber(), address.getHouseNumberInfo()); if (!"".equals(part) && current_item !=1 && lineIsNotEmpty){ part=sep+" "+part; } donTputNextComma =2; } else if (c == '2'){ StreetTypeOrder order = detectStreetTypeOrderFromAddress(address); String nameAndType; if (order==StreetTypeOrder.nameThenType){ if (scriptType == ScriptType.LTR){ nameAndType=joinAndSkipNulls(" ",address.getStreetName(),address.getStreetType()); } else { nameAndType = joinAndSkipNulls(" ",address.getStreetType(),address.getStreetName()); } }else if (order==StreetTypeOrder.typeThenName){ if (scriptType == ScriptType.LTR){ nameAndType = joinAndSkipNulls(" ",address.getStreetType(),address.getStreetName()); } else { nameAndType=joinAndSkipNulls(" ",address.getStreetName(),address.getStreetType()); } } else { //default to the most popular one nameAndType = joinAndSkipNulls(" ",address.getStreetName(),address.getStreetType()); } part = joinAndSkipNulls(" ",address.getPreDirection(), nameAndType, address.getPostDirection()); if (!"".equals(part) && current_item !=1 && lineIsNotEmpty && donTputNextComma<1){ part=", "+part; } } else if (c == '3'){ part = joinAndSkipNulls(", ",address.getCitySubdivision(), address.getDependentLocality(), address.getQuarter(),address.getDistrict()); if (!"".equals(part) && current_item !=1 && lineIsNotEmpty && donTputNextComma<1){ part=sep+" "+part; } } else if (c == '4'){ //not managed yet } else if (c == '5'){ if (address.getPostTown()!=null && !address.getPostTown().equals(address.getCity())){ part = joinAndSkipNulls(" ",address.getCity(), address.getPostTown()); } else if (isNotNullOrEmpty(address.getCity())){ part = address.getCity(); } if (!"".equals(part) && current_item !=1 && lineIsNotEmpty && donTputNextComma<1){ part=sep+" "+part; } } else if (c == '6'){ AddressFormatInfo info = formatMap.get(regionCode); if (info!=null && !info.getOptionalState()){ String state = getState(address); part = joinAndSkipNulls(" ",state);; } if (!"".equals(part) && current_item !=1 && lineIsNotEmpty){ part=sep+" "+part; } } else if (c == '7'){ part = joinAndSkipNulls(" ",address.getZipCode()); } else if (c == '8'){ //for future use } else if (c == '9'){ part = joinAndSkipNulls(", ",address.getPOBox(),address.getPOBoxInfo(),address.getPOBoxAgency(),address.getPostOfficeBox()); if (!"".equals(part) && current_item !=1 && lineIsNotEmpty && donTputNextComma<1){ part=sep+" "+part; } } else { part=c+""; } donTputNextComma--; currentLine.append(part); currentLine.append(" "); } String normalizedStr = removeAllRedundantSpaces(currentLine.toString()); if (normalizedStr.length() > 0) { lines.add(normalizedStr.trim()); } } if (scriptType == ScriptType.RTL && (isNotNullOrEmpty(address.getName()) || isNotNullOrEmpty(address.getRecipientName()))){ lines.add(joinAndSkipNulls(" ",address.getName(),address.getRecipientName()).trim()); } return lines; } protected String getState(Address address) { String state =null; if (address.getCountryCode()==null){ if ( address.getState()!=null){ state = address.getState(); } else if (address.getDistrict()!=null){ state = address.getDistrict(); } else { state = address.getAdm1Name(); } }else { AddressFormatInfo info = formatMap.get(address.getCountryCode().toUpperCase()); if ( address.getState()!=null){ state = address.getState(); } else if (address.getDistrict()!=null){ state = address.getDistrict(); } if (info.getStateLevel()==1 && address.getAdm1Name()!=null){ state = address.getAdm1Name(); } if (info.getStateLevel()==2){ if (address.getAdm2Name()!=null){ state = address.getAdm2Name(); } else if (address.getAdm1Name()!=null){ state = address.getAdm1Name(); } } if (info.getStateLevel()==3){ if (address.getAdm3Name()!=null){ state = address.getAdm3Name(); } else if (address.getAdm1Name()!=null){ state = address.getAdm1Name(); } } if (info.getStateLevel()==4){ if (address.getAdm4Name()!=null){ state = address.getAdm4Name(); } else if (address.getAdm1Name()!=null){ state = address.getAdm1Name(); } }if (info.getStateLevel()==5){ if (address.getAdm5Name()!=null){ state = address.getAdm5Name(); } else if (address.getAdm1Name()!=null){ state = address.getAdm1Name(); } } } return state; } /** * Joins input string with the given separator. If an input string is null, * it will be skipped. */ private static String joinAndSkipNulls(String separator, String... strings) { StringBuilder sb = null; for (String s : strings) { if (s != null) { s = s.trim(); if (s.length() > 0) { if (sb == null) { sb = new StringBuilder(s); } else { sb.append(separator).append(s); } } } } return sb == null ? "" : sb.toString(); } public StreetTypeOrder detectStreetTypeOrderFromAddress(Address address) { if (address == null) { return StreetTypeOrder.unknow; } String countryCode = address.getCountryCode(); if (countryCode != null) { AddressFormatInfo formatInfo = formatMap.get(countryCode); if (formatInfo!=null){ int order = formatInfo.getStreetTypeBeforeStreetName(); switch (order) { case 0: return StreetTypeOrder.nameThenType; case 1: return StreetTypeOrder.typeThenName; case 2: return StreetTypeOrder.unknow; case 3: return StreetTypeOrder.unknow; case 4: return StreetTypeOrder.unknow; default: return StreetTypeOrder.unknow; } } } return StreetTypeOrder.unknow; } private String removeAllRedundantSpaces(String str) { str = str.trim(); str = str.replaceAll(" +", " "); str = str.replaceAll(" ,", ","); return str; } protected String getFormatString(ScriptType scriptType, String countrycode, Address address) { AddressFormatInfo formatInfo = formatMap.get(countrycode); if (address !=null && formatInfo!=null && formatInfo.isPoBoxOnly() && address.getPOBox()==null && address.getPOBoxInfo()==null && address.getPostOfficeBox()==null){ formatInfo = formatMap.get("ZZ"); if (formatInfo==null){ throw new RuntimeException("no default pattern found"); } return formatInfo.getFormatString(); } if (formatInfo == null) { formatInfo = formatMap.get("ZZ"); if (formatInfo==null){ throw new RuntimeException("no default pattern found"); } } String format; if (scriptType == ScriptType.LTR) { format = formatInfo.getFormatString(); } else { format = formatInfo.getFormatRTLString(); if (format == null) { format = formatInfo.getFormatString(); } } return format; } protected static String join(Collection<String> s, String delimiter) { StringBuffer buffer = new StringBuffer(); Iterator<String> iter = s.iterator(); while (iter.hasNext()) { String next = iter.next(); if (next != null && !"".equals(next.trim())){ buffer.append(next); if (iter.hasNext()) { buffer.append(delimiter); } } } return buffer.toString(); } public int getAdmLevelByContryCode(String countryCode){ if (countryCode != null){ AddressFormatInfo info = formatMap.get(countryCode); if (info!=null){ Boolean optionalState = info.getOptionalState(); if (optionalState!=null && optionalState==false){ return info.getStateLevel(); } } } return 0; } }