/*******************************************************************************
* Gisgraphy Project
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
*
* Copyright 2008 Gisgraphy project
* David Masclet <davidmasclet@gisgraphy.com>
*
*
*******************************************************************************/
package com.gisgraphy.helper;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.gisgraphy.compound.Decompounder;
import com.gisgraphy.compound.Decompounder.state;
import com.gisgraphy.domain.geoloc.entity.AlternateOsmName;
import com.gisgraphy.domain.geoloc.entity.OpenStreetMap;
/**
* Provides some useful methods to compute strings for autocompletion and full-text search
*
* @author <a href="mailto:david.masclet@gisgraphy.com">David Masclet</a>
*/
public class StringHelper {
/** Longest input (in characters) accepted by {@link #transformStringForPartialWordIndexation(String, char)}; longer strings make it return null. */
public static final int MAX_STRING_INDEXABLE_LENGTH = 40;
/** Character used by Gisgraphy in place of spaces in the partial word indexation / search transformations. */
public static final char WHITESPACE_CHAR_DELIMITER = '-';
protected static final Logger logger = LoggerFactory.getLogger(StringHelper.class);
/** Tolerance passed to {@link #isSameName(String, String, int)} by {@link #isSameName(String, String)}. */
protected static final int MISSING_WORD_TOLERANCE = 1;
// digits (captured in group 1), optionally followed by a whitespace and an ordinal suffix (rd, st, nd, th)
private static final Pattern ORDINAL_PATTERN = Pattern.compile("(\\d+)\\s?(?:rd|st|nd|th)?\\b");
// "saint" or "santa" in any case; used to resolve the abbreviation "st" when comparing names
protected static final Pattern SYNONYMS_PATTERN= Pattern.compile("(saint|santa)", Pattern.CASE_INSENSITIVE);
// "rn" optionally followed by a whitespace, then 1 to 4 digits (group 2); expanded to "route nationale" by prepareQuery
private final static Pattern RN_PATTERN = Pattern.compile("\\b(rn)\\s?(\\d{1,4}\\b)", Pattern.CASE_INSENSITIVE);
// some text (group 1), a whitespace, 2 digits (group 2), a whitespace or hyphen, 3 digits (group 3)
private final static Pattern ZIPCONCATENATE_2_3_PATTERN = Pattern.compile("(.*)\\s\\b(\\d{2})[\\s-](\\d{3}\\b)");
// same as above with the digit groups swapped: 3 digits (group 2) then 2 digits (group 3)
private final static Pattern ZIPCONCATENATE_3_2__PATTERN = Pattern.compile("(.*)\\s\\b(\\d{3})[\\s-](\\d{2}\\b)");
// "str" at the end of a word, optionally followed by a dot, in any case (the constant name is misspelt: PATTEN)
private static final Pattern GERMAN_SYNONYM_PATTEN = Pattern.compile("(str\\b)[\\.]?",Pattern.CASE_INSENSITIVE);
// a single letter s, e, w or n (any case), either at the very end preceded by a whitespace
// or at the very beginning followed by a whitespace
private static final Pattern DIRECTION_PATTERN = Pattern.compile("((?:\\b\\s[sewn]$)|(?:^[sewn]\\b\\s))",Pattern.CASE_INSENSITIVE);
// shared helper used to detect compound names and to get their other (compound / decompound) form
private static Decompounder decompounder = new Decompounder();
// shared helper used by isSameStreetName to compute the distance between two normalized names
private static LevenshteinAlgorithm levenstein = new LevenshteinAlgorithm();
/**
 * Normalizes a string roughly the way the Lucene / Solr filters do:
 * <ul>
 * <li>leading and trailing whitespace is trimmed</li>
 * <li>accents are removed (delegated to {@link EncodingHelper#removeAccents(String)})</li>
 * <li>the text is lowercased</li>
 * <li>the characters '-', '.', '"', ''' and ';' are each replaced by a space</li>
 * </ul>
 *
 * @param originalString the string to process
 * @return the transformed String or null if the original String is null
 */
public static final String normalize(String originalString) {
    if (originalString == null) {
        return null;
    }
    String withoutAccents = EncodingHelper.removeAccents(originalString.trim());
    String lowered = withoutAccents.toLowerCase();
    // every word delimiter becomes a plain space
    return lowered
            .replace("-", " ")
            .replace(".", " ")
            .replace("\"", " ")
            .replace("'", " ")
            .replace(';', ' ');
}
/**
 * Builds the list of every fragment of a string so that it can be stored in a
 * dedicated postgres field and let an index serve ilike(%String%) searches.
 * e.g : 'it s ok'=> s ok, s o, it s, t s o, t s, it s ok, ok, it s o, it, t s ok
 * Duplicates are removed and fragments of a single character are skipped.
 *
 * @param originalString the string to process
 * @param delimiter the character that replaces the spaces inside each fragment
 * (it should be the same as the one used in {@link StringHelper#transformStringForPartialWordSearch(String, char)}).
 * For gisgraphy the char is {@link StringHelper#WHITESPACE_CHAR_DELIMITER}
 * IMPORTANT NOTE : if the string is longer than {@link #MAX_STRING_INDEXABLE_LENGTH}, the method returns null
 * @return the fragments, each one followed by a space, in no particular order
 * (or null if the original String is null or too long), to be used by the postgres function to_ts_vector
 * @see #transformStringForPartialWordSearch(String, char)
 */
public static final String transformStringForPartialWordIndexation(String originalString, char delimiter) {
    if (originalString == null || originalString.length() > MAX_STRING_INDEXABLE_LENGTH) {
        return null;
    }
    final String normalized = normalize(originalString);
    final String delimiterAsString = String.valueOf(delimiter);
    final int length = normalized.length();
    // a set gets rid of the duplicated fragments
    Set<String> fragments = new HashSet<String>();
    for (int start = 0; start < length; start++) {
        for (int end = start + 1; end <= length; end++) {
            String fragment = normalized.substring(start, end);
            if (fragment.endsWith(" ")) {
                // same text as the fragment of the previous iteration, plus a trailing space
                continue;
            }
            if (fragment.startsWith(" ")) {
                // drop the leading space
                fragment = fragment.substring(1);
            }
            if (fragment.length() > 1) {
                // only fragments of two characters or more are indexed
                fragments.add(fragment.replace(" ", delimiterAsString));
            }
        }
    }
    StringBuilder indexable = new StringBuilder();
    for (String fragment : fragments) {
        indexable.append(fragment).append(" ");
    }
    return indexable.toString();
}
/**
 * Prepares a search term so that it matches the fragments produced by
 * {@link #transformStringForPartialWordIndexation(String, char)}: the string is
 * trimmed, normalized, and its spaces are replaced by the delimiter.
 *
 * @param originalString the string to transform
 * @param delimiter the delimiter
 * (it should be the same as the one used in {@link #transformStringForPartialWordIndexation(String, char)}).
 * For gisgraphy the char is {@link StringHelper#WHITESPACE_CHAR_DELIMITER}
 * @return the transformed string (or null if the original String is null) to be used by the postgres function plainto_tsquery
 * @see #transformStringForPartialWordIndexation(String, char)
 */
public static final String transformStringForPartialWordSearch(String originalString, char delimiter) {
    if (originalString == null) {
        return null;
    }
    String normalized = normalize(originalString.trim());
    String delimiterAsString = String.valueOf(delimiter);
    return normalized.replace(" ", delimiterAsString);
}
/**
 * Sets the text search name of the entity to the normalized form of its name.
 * Nothing is changed when the entity has no name.
 *
 * @param openStreetMap the openStreetMap Entity to update
 * @return the same openstreetmap entity with the {@link OpenStreetMap#FULLTEXTSEARCH_COLUMN_NAME}
 */
public static OpenStreetMap updateOpenStreetMapEntityForIndexation(OpenStreetMap openStreetMap) {
    if (openStreetMap.getName() == null) {
        // no name, nothing to index
        return openStreetMap;
    }
    String searchableName = StringHelper.normalize(openStreetMap.getName());
    openStreetMap.setTextSearchName(searchableName);
    return openStreetMap;
}
/**
 * Inserts a space at every camel case boundary of a string,
 * e.g. "PDFLoader" gives "PDF Loader" and "GL11Version" gives "GL 11 Version".
 * The case of the characters is left untouched.
 *
 * @param s a camel case string, must not be null
 * @return the string with a space inserted at each word boundary
 */
public static String splitCamelCase(String s) {
    // between a run of upper case letters and the capitalized word that follows it
    final String upperRunThenWord = "(?<=[A-Z])(?=[A-Z][a-z])";
    // between anything that is not an upper case letter and an upper case letter
    final String nonUpperThenUpper = "(?<=[^A-Z])(?=[A-Z])";
    // between a letter and anything that is not a letter
    final String letterThenNonLetter = "(?<=[A-Za-z])(?=[^A-Za-z])";
    String boundary = upperRunThenWord + "|" + nonUpperThenUpper + "|" + letterThenNonLetter;
    return s.replaceAll(boundary, " ");
}
/**
 * Tells whether a string holds at least one character other than the leading / trailing
 * whitespace removed by {@link String#trim()}.
 * Useful method to be compatible with jdk1.5 (jdk 1.6 already has this method).
 *
 * @param string the string to test
 * @return true if the string is not null and not empty once trimmed
 */
public static boolean isNotEmptyString(String string){
    return string != null && string.trim().length() > 0;
}
/**
 * Tells whether a string is null or contains nothing once trimmed.
 * Useful method to be compatible with jdk1.5 (jdk 1.6 already has this method).
 *
 * @param string the string to test
 * @return true if the string is null or empty (trimmed)
 */
public static boolean isEmptyString(String string){
    return string == null || string.trim().length() == 0;
}
/**
 * Renders the stack trace of a throwable, as printed by
 * {@link Throwable#printStackTrace(PrintWriter)}, into a String.
 *
 * @param aThrowable the throwable to print, must not be null
 * @return the stacktrace as string
 */
public static String getStackTraceAsString(Throwable aThrowable) {
    final StringWriter trace = new StringWriter();
    aThrowable.printStackTrace(new PrintWriter(trace));
    return trace.toString();
}
/**
 * Compares two names. When the decompounder reports {@code actual} as a decompound name,
 * the comparison is done with {@link #isSameName(String, String, int)} (tolerance
 * {@link #MISSING_WORD_TOLERANCE}) against {@code actual} and then against its other format;
 * otherwise the strict street name comparison is used.
 *
 * @param expected the reference name
 * @param actual the name to check
 * @return true if both names are considered the same, false if one of them is null or they differ
 */
public static boolean isSameName(String expected, String actual){
    if (actual == null || expected == null) {
        return false;
    }
    if (!decompounder.isDecompoudName(actual)) {
        return isSameStreetName_intern(expected, actual);
    }
    if (isSameName(expected, actual, MISSING_WORD_TOLERANCE)) {
        return true;
    }
    // retry with the other (compound / decompound) spelling
    return isSameName(expected, decompounder.getOtherFormat(actual), MISSING_WORD_TOLERANCE);
}
/**
 * Compares two names word by word, ignoring the short words.
 * <p>
 * Both names are split on commas, whitespace and dashes (and ';' for {@code actual}).
 * Only the significant words are kept: words of more than 3 characters (normalized),
 * the abbreviation "st" when the other name contains "saint" or "santa" (it is then replaced
 * by that word), and, for {@code actual} only, the numeric words. Both names must have the
 * same number of significant words: 'saint jean' is not 'saint jean de luz', but
 * 'la petite maison' is ok for 'petite maison'.
 * <p>
 * Empty tokens produced by consecutive delimiters (e.g. ", " or a double space) are ignored.
 * They used to be counted as numeric words of {@code actual} because commons-lang 2.x
 * {@code StringUtils.isNumeric("")} returns true, which made otherwise equal names differ.
 *
 * @param expected the reference name
 * @param actual the name to check
 * @param tolerance the number of significant words of {@code actual} that may be missing from
 *        {@code expected} when there are more than two significant words; with one or two
 *        words every word must be present
 * @return true if the names are considered the same, false if one of them is null or they differ
 */
public static boolean isSameName(String expected, String actual,int tolerance){
    if (actual == null || expected == null) {
        return false;
    }
    if (actual.equalsIgnoreCase(expected)) { // shortcut
        return true;
    }
    // split the strings
    String[] actualSplited = actual.split("[,\\s\\-\\–\\一;]");
    String[] expectedSplited = expected.split("[,\\s\\-\\–\\一]");
    if (Math.abs(actualSplited.length - expectedSplited.length) >= 2) {
        return false;
    }
    // significant words of 'actual'
    List<String> actualSplitedLong = new ArrayList<String>();
    for (String word : actualSplited) {
        if (word.length() > 3) {
            actualSplitedLong.add(normalize(word));
        } else if (word.equals("st")) {
            // 'st' counts as the saint / santa found in the other name, if any
            Matcher m = SYNONYMS_PATTERN.matcher(expected);
            if (m.find()) {
                actualSplitedLong.add(m.group(1).toLowerCase());
            }
        } else if (word.length() > 0 && StringUtils.isNumeric(word)) {
            // length check: isNumeric("") is true with commons-lang 2.x
            actualSplitedLong.add(normalize(word));
        }
    }
    // significant words of 'expected' (numeric words are not kept on this side)
    List<String> expectedSplitedLong = new ArrayList<String>();
    for (String word : expectedSplited) {
        if (word.length() > 3) {
            expectedSplitedLong.add(normalize(word));
        } else if (word.equals("st")) {
            Matcher m = SYNONYMS_PATTERN.matcher(actual);
            if (m.find()) {
                expectedSplitedLong.add(m.group(1).toLowerCase());
            }
        }
    }
    if (actualSplitedLong.size() != expectedSplitedLong.size()) {
        return false;
    }
    // same number of words but are they the same ?
    // if one or two words, every word should be present
    final boolean everyWordRequired = expectedSplitedLong.size() == 1 || expectedSplitedLong.size() == 2;
    int countMissing = 0;
    for (String word : actualSplitedLong) {
        if (!expectedSplitedLong.contains(word)) {
            countMissing++;
        }
        if (everyWordRequired && countMissing > 0) {
            return false;
        }
        if (countMissing > tolerance) {
            return false;
        }
    }
    return true;
}
// Upper case two-letter country codes whose street type abbreviation is looked up on the FIRST
// word of the street with FR_STREET_TYPE_MAP. Note that "CA" is also listed in EN_COUNTRIES.
private static List<String> FR_COUNTRIES = new ArrayList<String>(){{
add("CA");add("FR");add("BE");add("CH");add("RE");add("GP");add("MF");add("MP");add("DZ");add("MA");add("SD");add("CD");add("CM");add("SN");add("PM");}};
// Country codes whose street type abbreviation is looked up on the LAST word of the street
// (US_STREET_TYPE_MAP), then on the first word for numbered roads (NUMBERED_STREET_TYPE_MAP).
private static List<String> EN_COUNTRIES = new ArrayList<String>(){{add("US");add("CA");add("CN");add("ID");add("IN");add("AU");add("SG");add("HK");add("IR");add("FI");add("SA");add("VI");add("FK");add("GI");add("GL");add("FO");add("AS");add("IM");add("UM");add("GB");add("UK");add("PR");add("JE");add("SH");add("GS");add("GG");}};
// Country codes whose street type abbreviation is looked up on the first word with SP_STREET_TYPE_MAP.
private static List<String> SP_COUNTRIES = new ArrayList<String>(){{add("AR");add("ES");add("MX");add("CO");add("PA");}};
// Country codes whose street type abbreviation is looked up on the first word with IT_STREET_TYPE_MAP.
private static List<String> IT_COUNTRIES = new ArrayList<String>(){{add("IT");add("SM");add("VA");}};
/**
 * Compares two street names after expanding their abbreviations: the street type
 * (according to the country), the "st" synonym and, for "CA" and "US" only, the
 * cardinal directions.
 * <p>
 * The names are the same when the word by word comparison succeeds, or when they hold
 * exactly the same digits and the distance computed by the Levenshtein helper between
 * their normalized forms is lower than 2.
 *
 * @param expected the reference street name
 * @param actual the street name to check
 * @param countrycode the country code used to pick the abbreviations, may be null
 * @return true if the names are considered the same, false if one of them is null or they differ
 */
public static boolean isSameStreetName(String expected, String actual, String countrycode){
    if (actual == null || expected == null) {
        return false;
    }
    String actualName = expandStreetType(actual, countrycode);
    String expectedName = expandStreetType(expected, countrycode);
    actualName = expandStreetSynonyms(actualName);
    expectedName = expandStreetSynonyms(expectedName);
    boolean northAmerica = countrycode != null
            && (countrycode.equalsIgnoreCase("CA") || countrycode.equalsIgnoreCase("US"));
    if (northAmerica) {
        actualName = expandStreetDirections(actualName);
        expectedName = expandStreetDirections(expectedName);
    }
    if (isSameStreetName_intern(expectedName, actualName)) {
        return true;
    }
    // fallback : same digits and nearly the same letters
    String actualDigits = actualName.replaceAll("[^0-9]", "");
    String expectedDigits = expectedName.replaceAll("[^0-9]", "");
    if (!actualDigits.equals(expectedDigits)) {
        return false;
    }
    // NOTE(review): "\\s-" only matches a whitespace followed by a hyphen, and normalize() has
    // already turned hyphens into spaces; "[\\s-]" may have been intended - confirm
    return levenstein.execute(normalize(actualName).replaceAll("\\s-", ""), normalize(expectedName).replaceAll("\\s-", "")) < 2;
}
/**
 * Strict word by word comparison of two street names (no missing word is tolerated).
 * <p>
 * Both names are split on commas, whitespace and dashes (and ';' for {@code actual}).
 * Only the significant words are kept: words of more than 3 characters (normalized),
 * the abbreviation "st" when the other name contains "saint" or "santa" (it is then replaced
 * by that word), and, for {@code actual} only, the numeric words. Both names must have the
 * same number of significant words and every significant word of {@code actual} must be found
 * in {@code expected}. A word that contains a number (e.g. "12th") is found when a word of
 * {@code expected} holds the same number, whatever its ordinal suffix.
 * <p>
 * Two defects of the previous implementation are fixed here:
 * <ul>
 * <li>a mismatching number was only detected when another, non numbered, word followed it,
 * so "street 12th" was the same as "street 13th" while "12th street" was not the same as
 * "13th street"; every word is now checked;</li>
 * <li>empty tokens produced by consecutive delimiters were counted as numeric words of
 * {@code actual} (commons-lang 2.x {@code StringUtils.isNumeric("")} returns true);
 * they are now ignored.</li>
 * </ul>
 *
 * @param expected the reference name
 * @param actual the name to check
 * @return true if the names are considered the same, false if one of them is null or they differ
 */
private static boolean isSameStreetName_intern(String expected, String actual){
    if (actual == null || expected == null) {
        return false;
    }
    if (actual.equalsIgnoreCase(expected)) { // shortcut
        return true;
    }
    // split the strings
    String[] actualSplited = actual.split("[,\\s\\-\\–\\一;]");
    String[] expectedSplited = expected.split("[,\\s\\-\\–\\一]");
    // first we check if actual has more long words than expected
    // saint jean is not saint jean de luz, but 'la petite maison' is ok for 'petite maison'
    List<String> actualSplitedLong = new ArrayList<String>();
    for (String word : actualSplited) {
        if (word.length() > 3) {
            actualSplitedLong.add(normalize(word));
        } else if (word.equals("st")) {
            // 'st' counts as the saint / santa found in the other name, if any
            Matcher m = SYNONYMS_PATTERN.matcher(expected);
            if (m.find()) {
                actualSplitedLong.add(m.group(1).toLowerCase());
            }
        } else if (word.length() > 0 && StringUtils.isNumeric(word)) {
            // length check: isNumeric("") is true with commons-lang 2.x
            actualSplitedLong.add(normalize(word));
        }
    }
    // numeric words are not kept on the 'expected' side
    List<String> expectedSplitedLong = new ArrayList<String>();
    for (String word : expectedSplited) {
        if (word.length() > 3) {
            expectedSplitedLong.add(normalize(word));
        } else if (word.equals("st")) {
            Matcher m = SYNONYMS_PATTERN.matcher(actual);
            if (m.find()) {
                expectedSplitedLong.add(m.group(1).toLowerCase());
            }
        }
    }
    if (actualSplitedLong.size() != expectedSplitedLong.size()) {
        return false;
    }
    // same number of words but are they the same ? (tolerance is zero : every word is required)
    for (String word : actualSplitedLong) {
        boolean found;
        Matcher matcher1 = ORDINAL_PATTERN.matcher(word);
        if (matcher1.find()) {
            // numbered word : look for the same number in 'expected', ignoring the ordinal suffix
            found = false;
            for (String expectedLong : expectedSplitedLong) {
                Matcher matcher2 = ORDINAL_PATTERN.matcher(expectedLong);
                if (matcher2.find() && matcher1.group(1).equals(matcher2.group(1))) {
                    found = true;
                    break;
                }
            }
        } else {
            found = expectedSplitedLong.contains(word);
        }
        if (!found) {
            return false;
        }
    }
    return true;
}
/**
 * Tells whether a name is the same, in the sense of {@link #isSameName(String, String)},
 * as at least one of the given alternate names. Null alternate names are skipped.
 *
 * @param name the reference name
 * @param name_alternates the alternate names to check against
 * @return true if one alternate name matches, false otherwise or if one of the parameters is null
 */
public static boolean isSameAlternateNames(String name, List<String> name_alternates) {
    if (name == null || name_alternates == null) {
        return false;
    }
    for (String candidate : name_alternates) {
        if (candidate != null && isSameName(name, candidate)) {
            return true;
        }
    }
    return false;
}
/**
 * Expands a single letter cardinal direction (n, e, s, w in any case) found as the first
 * or the last word of a street into north, east, south or west.
 * Only the first match is expanded.
 *
 * @param street the street name, may be null
 * @return the trimmed street with the direction expanded, the trimmed street itself when
 *         there is no direction, or null if the street is null
 */
public static String expandStreetDirections(String street) {
    if (street == null) {
        return null;
    }
    final String trimmed = street.trim();
    Matcher m = DIRECTION_PATTERN.matcher(trimmed);
    if (!m.find()) {
        return trimmed;
    }
    final String letter = m.group(1).trim();
    String replacement;
    if ("N".equalsIgnoreCase(letter)) {
        replacement = " north ";
    } else if ("E".equalsIgnoreCase(letter)) {
        replacement = " east ";
    } else if ("S".equalsIgnoreCase(letter)) {
        replacement = " south ";
    } else if ("W".equalsIgnoreCase(letter)) {
        replacement = " west ";
    } else {
        replacement = letter;
    }
    StringBuffer expanded = new StringBuffer();
    m.appendReplacement(expanded, replacement);
    m.appendTail(expanded);
    // the replacement is surrounded by spaces, remove the one left at the start or the end
    return expanded.toString().trim();
}
// the word "st" (any case) followed by a whitespace; compiled once instead of at each call
private static final Pattern SAINT_ABBREVIATION_PATTERN = Pattern.compile("\\b(st)\\b\\s", Pattern.CASE_INSENSITIVE);
/**
 * Expands the first abbreviation "st" (any case) that is followed by a whitespace into "saint",
 * e.g. "st james" gives "saint james". A trailing "st" (as in "main st") is left untouched.
 * <p>
 * The abbreviation and the whitespace that follows it are replaced by " saint " and the result
 * is trimmed; the whitespace that precedes the abbreviation is kept, so a match in the middle
 * of the street ("rue st jean") yields two consecutive spaces ("rue  saint jean").
 *
 * @param street the street name, may be null
 * @return the trimmed street with the abbreviation expanded, the trimmed street itself when
 *         there is nothing to expand, or null if the street is null
 */
public static String expandStreetSynonyms(String street) {
    if (street == null) {
        return null;
    }
    street = street.trim();
    Matcher m = SAINT_ABBREVIATION_PATTERN.matcher(street);
    if (!m.find()) {
        return street;
    }
    StringBuffer sb = new StringBuffer();
    m.appendReplacement(sb, " saint ");
    m.appendTail(sb);
    return sb.toString().trim();
}
/**
 * Expands the abbreviated street type according to the country code. e.g :
 * av=>avenue, r=>rue
 * <ul>
 * <li>French, Spanish and Italian speaking countries : the first word of the street is looked
 * up in the matching street type map ("av. de la gare" gives "avenue de la gare");</li>
 * <li>English speaking countries : the last word is looked up first ("Andrew dr" gives
 * "Andrew drive"), then the first word for the numbered roads ("sr 12" gives "state route 12");</li>
 * <li>when nothing has been expanded, every "str" / "str." ending a word is replaced by the
 * street word of the country (straße for DE, AT and when the country code is null, straat for NL,
 * strasse for CH, stræde for DK, strada for MD); other countries are left untouched.</li>
 * </ul>
 * A street without any space is only subject to the last rule. An abbreviation may be followed
 * by a dot, which is removed with it.
 * <p>
 * The abbreviated word is now replaced where it stands. It used to be handed to
 * {@link String#replaceFirst(String, String)}, which treats it as a regular expression and
 * replaces its FIRST occurrence, so a last word also present earlier in the street was expanded
 * at the wrong place ("Andrew dr" gave "Andriveew dr"). The "has a dot" state is also no longer
 * shared between the first word and the last word lookups (it prevented the expansion of
 * "St. John rd" for "CA").
 *
 * @param street the street name, may be null
 * @param countryCode the country code (any case), may be null
 * @return the trimmed street with its street type expanded, or null if the street is null
 */
public static String expandStreetType(String street, String countryCode) {
    if (street == null) {
        return null;
    }
    street = street.trim();
    final String country = countryCode == null ? null : countryCode.toUpperCase();
    String expanded;
    if (country != null && FR_COUNTRIES.contains(country)) {
        expanded = expandLeadingStreetType(street, FR_STREET_TYPE_MAP);
        if (expanded != null) {
            return expanded;
        }
    }
    if (country != null && SP_COUNTRIES.contains(country)) {
        expanded = expandLeadingStreetType(street, SP_STREET_TYPE_MAP);
        if (expanded != null) {
            return expanded;
        }
    }
    if (country != null && IT_COUNTRIES.contains(country)) {
        expanded = expandLeadingStreetType(street, IT_STREET_TYPE_MAP);
        if (expanded != null) {
            return expanded;
        }
    } else if (country != null && EN_COUNTRIES.contains(country)) {
        // last word
        expanded = expandTrailingStreetType(street, US_STREET_TYPE_MAP);
        if (expanded != null) {
            return expanded;
        }
        // numbered road
        expanded = expandLeadingStreetType(street, NUMBERED_STREET_TYPE_MAP);
        if (expanded != null) {
            return expanded;
        }
    }
    // "str" synonyms
    final String streetWord;
    if (country == null) {
        // default to straße
        streetWord = "straße";
    } else if (country.equals("DE") || country.equals("AT")) {
        streetWord = "straße";
    } else if (country.equals("NL")) {
        streetWord = "straat";
    } else if (country.equals("CH")) {
        streetWord = "strasse";
    } else if (country.equals("DK")) {
        streetWord = "stræde";
    } else if (country.equals("MD")) {
        streetWord = "strada";
    } else {
        // no rule for this country
        return street;
    }
    return GERMAN_SYNONYM_PATTEN.matcher(street).replaceAll(streetWord);
}

/**
 * Expands the first word of a street when it is a known abbreviation.
 *
 * @return the street with its first word (and the dot that follows it, if any) replaced by its
 *         expansion, or null when the street has a single word or its first word is unknown
 */
private static String expandLeadingStreetType(String street, Map<String, String> abbreviations) {
    int firstSpace = street.indexOf(' ');
    if (firstSpace <= 0) {
        return null;
    }
    String firstWord = street.substring(0, firstSpace);
    int consumed = firstWord.length();
    int dot = firstWord.indexOf('.');
    if (dot > 0) {
        // the abbreviation stops at the dot, and the dot goes away with it
        firstWord = firstWord.substring(0, dot);
        consumed = dot + 1;
    }
    String expansion = abbreviations.get(firstWord.toLowerCase());
    if (expansion == null) {
        return null;
    }
    return expansion + street.substring(consumed);
}

/**
 * Expands the last word of a street when it is a known abbreviation.
 *
 * @return the street with its last word (and the dot that follows it, if any) replaced by its
 *         expansion, or null when the street has a single word or its last word is unknown
 */
private static String expandTrailingStreetType(String street, Map<String, String> abbreviations) {
    int lastSpace = street.lastIndexOf(' ');
    if (lastSpace <= 0) {
        return null;
    }
    String lastWord = street.substring(lastSpace + 1);
    String afterDot = "";
    int dot = lastWord.indexOf('.');
    if (dot > 0) {
        // the abbreviation stops at the dot, and the dot goes away with it
        afterDot = lastWord.substring(dot + 1);
        lastWord = lastWord.substring(0, dot);
    }
    String expansion = abbreviations.get(lastWord.toLowerCase());
    if (expansion == null) {
        return null;
    }
    return street.substring(0, lastSpace + 1) + expansion + afterDot;
}
// Street type abbreviation => full street type, for the countries of FR_COUNTRIES.
// Keys must always be in lowercase : they are looked up with the lowercased first word of the street.
// NOTE(review): the key "rte departementale" contains a space, so it can not be equal to a
// single first word - it looks unreachable from the first word lookup; confirm.
private static final Map<String, String> FR_STREET_TYPE_MAP = new HashMap<String, String>(){
{
put("r","rue");
put("rte","route");
put("av","avenue");
put("bd","boulevard");
put("blvd","boulevard");
put("chem","chemin");
put("departementale","route departementale");
put("rte departementale","route departementale");
put("gr","grande randonnee");
}
};
// Abbreviation => full type for the numbered roads; looked up with the lowercased FIRST word
// of the street for the countries of EN_COUNTRIES.
// Keys must always be in lowercase.
// NOTE(review): "hwy" is expanded to "route" here but to "highway" in US_STREET_TYPE_MAP - confirm this is intended.
private static final Map<String, String> NUMBERED_STREET_TYPE_MAP = new HashMap<String, String>(){
{
put("sr","state route");
put("hwy","route");
put("pth","perimeter highway");
put("ss","strada statale");
}
};
// Street type abbreviation => full street type, looked up with the lowercased LAST word of the
// street for the countries of EN_COUNTRIES.
// Only the frequent, commonly used, abbreviations are listed.
// Keys must always be in lowercase.
// NOTE(review): "dr" is put twice (same value, harmless) and "anx" maps to "anex",
// which is probably a typo for "annex" - confirm before changing the value.
private static final Map<String, String> US_STREET_TYPE_MAP = new HashMap<String, String>(){
{
put("r","rue");
put("rte","route");
put("av","avenue");
put("ave","avenue");
put("bd","boulevard");
put("blvd","boulevard");
put("aly","alley");
put("anx","anex");
put("arc","arcade");
put("bch","beach");
put("boul","boulevard");
put("boulv","boulevard");
put("brdg","bridge");
put("dr","drive");
put("drs","drives");
put("highwy","highway");
put("hiway","highway");
put("hiwy","highway");
put("hway","highway");
put("hwy","highway");
put("pl","place");
put("rd","road");
put("st","street");
put("str","street");
put("tunl","tunnel");
put("tunnl","tunnel");
put("trail","trail");
put("ct","court");
put("cir","circle");
put("dr","drive");
put("ln","lane");
}
};
// Street type abbreviation => full street type, for the countries of SP_COUNTRIES.
// Keys must always be in lowercase : they are looked up with the lowercased first word of the street.
// NOTE(review):
// - "cro" is put twice ("carrero" then "carrer") : the second put wins;
// - the key "carretera de circunvalacion" contains spaces, so it can not be equal to a single first word;
// - the values "Carril" and "Carreterín" are not in lowercase, so they never match the
//   lowercased word compared with SP_STREETTYPE_LIST_AFTER_NORMALIZATION in removeStreetType.
public static final Map<String,String> SP_STREET_TYPE_MAP = new HashMap<String,String>(){{
put("alam","alameda");
put("angta","angosta");
put("auto","autopista");
put("autov","autovia");
put("av","avenida");
put("ave","avenida");
put("avd","avenida");
put("avda","avenida");
put("avinguda","avenida");
put("bulev","bulevar");
put("c","calle");
put("ch","camino hondo");
put("cn","camino nuevo");
put("cv","camino viejo");
put("callecillas","callecilla");
put("callecitas","callecita");
put("callezonas","callezona");
put("callezotas","callezota");
put("ccvcn","circunvalacion");
put("cint","carretera interestatal");
put("carretera de circunvalacion","circunvalacion");// deviation
put("cjla","calleja");
put("cjon","callejon");
put("cl","calle");
put("cllja","calleja");
put("cllon","callejon");
put("cllzo","callizo");
put("cllza","calliza");
put("cmno","camino");
put("cro","carrero");
put("cra","carrera");//
put("cr","carrera");//
put("crr","carrera");//
put("cro","carrer");
put("crril","Carril");
put("ctra","carretera");
put("ctrin","Carreterín");
put("czada","calzada");
put("diag","diagonal");
put("err","errepidea");
put("etorb","etorbidea");
put("gv","gran vía");
put("gta","glorieta");
put("pasaje","passatge");
put("psaje","passatge");
put("ptge","passatge");
put("passeig","passatge");
put("pg","passatge");
put("pl","plaça");
put("plza","plaza");
put("pza","plaza");
put("pnte","puente");
put("pto","puerto");
put("rbla","rambla");
put("sedra","sendaera");
put("send","sendaera");
put("sendera","sendaera");
put("trans","tránsito");
put("trval","transversal");
put("trva","tranvia");
put("v","via");
}
};
// Street type abbreviation => full street type, for the countries of IT_COUNTRIES.
// Keys are looked up with the lowercased first word of the street, so they must be in lowercase.
public static final Map<String,String> IT_STREET_TYPE_MAP = new HashMap<String,String>(){{
put("v","via");
put("c","calle");
}};
// For each language, the street types that removeStreetType strips : every full street type
// (the values of the map) followed by every abbreviation (the keys of the map).
// These fields must stay declared after the maps they are built from (static initialization order).
public static final Collection<String> IT_STREETTYPE_LIST_AFTER_NORMALIZATION=getlistOfNormalizedStreetType(IT_STREET_TYPE_MAP);
public static final Collection<String> EN_STREETTYPE_LIST_AFTER_NORMALIZATION=getlistOfNormalizedStreetType(US_STREET_TYPE_MAP);
public static final Collection<String> FR_STREETTYPE_LIST_AFTER_NORMALIZATION=getlistOfNormalizedStreetType(FR_STREET_TYPE_MAP);
public static final Collection<String> SP_STREETTYPE_LIST_AFTER_NORMALIZATION=getlistOfNormalizedStreetType(SP_STREET_TYPE_MAP);
// Street types (full words and abbreviations) that removeStreetType strips from the end of a
// street in its German / decompound branch. The last word of the street is lowercased before
// being compared with this list.
// NOTE(review): "Platze" is capitalized, so it can never match a lowercased word ("platze" is
// also listed further down); "platz" and "pfad" are listed twice (harmless).
public static final Collection<String> DE_STREETTYPE_LIST_AFTER_NORMALIZATION=new ArrayList<String>(){
{
add("strassen");
add("strasse");
add("straße");
add("straßen");
add("str");
add("allee");
add("alleen");
add("all");
add("platz");
add("fleck");//square
add("Platze");
add("pl");
add("gewerbegebiet");//industrial zone
add("gg");
add("damm");
add("damme");
add("res");
add("chausee");
add("chee");
add("brucke");//bridge
add("br");
add("gasse");//alley
add("gassen");//alley
add("pfad");//path, way
add("weg");
add("landstraße");
add("landstraßen");
add("pfad");
add("pfade");
add("ring");
add("steig");
add("steige");
add("ufer");
add("landstr");
add("park");
add("autobahn");
add("platz");
add("platze");
add("stræde");
add("staede");
add("strada");
add("straat");
}
};
/**
 * Removes the street type (full word or abbreviation) from a street, according to the country code.
 * <ul>
 * <li>French, Spanish and Italian speaking countries : the first word is removed when it is a
 * street type of the language ("rue de la gare" gives "de la gare");</li>
 * <li>English speaking countries : the last word is removed when it is a street type
 * ("st james st" gives "st james");</li>
 * <li>otherwise, when the country is a decompound country (or "BE"), or when the decompounder
 * finds the street applicable : the last word is removed when it is a German street type; if it
 * is not, the same is tried on the other (compound / decompound) format of the street, and that
 * other format is returned when nothing could be removed.</li>
 * </ul>
 * A street type may be followed by a dot, which is removed with it. A street made of a single
 * word is never shortened.
 * <p>
 * The street type is now removed where it stands. It used to be handed to
 * {@link String#replaceFirst(String, String)}, which treats it as a regular expression and
 * removes its FIRST occurrence, so a last word also present earlier in the street was removed at
 * the wrong place ("st james st" gave "james st"). The "has a dot" state is also no longer
 * shared between the successive lookups.
 *
 * @param street the street name, may be null
 * @param countryCode the country code (any case), may be null
 * @return the trimmed street without its street type, or null if the street is null
 */
public static String removeStreetType(String street,String countryCode){
    if (street == null) {
        return null;
    }
    street = street.trim();
    final String country = countryCode == null ? null : countryCode.toUpperCase();
    String stripped;
    if (country != null && FR_COUNTRIES.contains(country)) {
        stripped = stripLeadingStreetType(street, FR_STREETTYPE_LIST_AFTER_NORMALIZATION);
        if (stripped != null) {
            return stripped;
        }
    }
    if (country != null && SP_COUNTRIES.contains(country)) {
        stripped = stripLeadingStreetType(street, SP_STREETTYPE_LIST_AFTER_NORMALIZATION);
        if (stripped != null) {
            return stripped;
        }
    }
    if (country != null && IT_COUNTRIES.contains(country)) {
        stripped = stripLeadingStreetType(street, IT_STREETTYPE_LIST_AFTER_NORMALIZATION);
        if (stripped != null) {
            return stripped;
        }
    } else if (country != null && EN_COUNTRIES.contains(country)) {
        // last word
        stripped = stripTrailingStreetType(street, EN_STREETTYPE_LIST_AFTER_NORMALIZATION);
        if (stripped != null) {
            return stripped;
        }
    } else if ((countryCode != null && (Decompounder.isDecompoudCountryCode(countryCode) || "BE".equalsIgnoreCase(countryCode))) || decompounder.getSate(street) != state.NOT_APPLICABLE) {
        stripped = stripTrailingStreetType(street, DE_STREETTYPE_LIST_AFTER_NORMALIZATION);
        if (stripped != null) {
            return stripped;
        }
        // retry with the other (compound / decompound) spelling, which is the one returned
        // when nothing can be removed
        street = decompounder.getOtherFormat(street);
        stripped = stripTrailingStreetType(street, DE_STREETTYPE_LIST_AFTER_NORMALIZATION);
        if (stripped != null) {
            return stripped;
        }
    }
    return street;
}

/**
 * Removes the first word of a street when it is one of the given street types.
 *
 * @return the trimmed street without its first word (and the dot that follows it, if any),
 *         or null when the street has a single word or its first word is not a street type
 */
private static String stripLeadingStreetType(String street, Collection<String> streetTypes) {
    int firstSpace = street.indexOf(' ');
    if (firstSpace <= 0) {
        return null;
    }
    String firstWord = street.substring(0, firstSpace);
    int consumed = firstWord.length();
    int dot = firstWord.indexOf('.');
    if (dot > 0) {
        // the street type stops at the dot, and the dot goes away with it
        firstWord = firstWord.substring(0, dot);
        consumed = dot + 1;
    }
    if (!streetTypes.contains(firstWord.toLowerCase())) {
        return null;
    }
    return street.substring(consumed).trim();
}

/**
 * Removes the last word of a street when it is one of the given street types.
 *
 * @return the trimmed street without its last word (and the dot that follows it, if any),
 *         or null when the street has a single word or its last word is not a street type
 */
private static String stripTrailingStreetType(String street, Collection<String> streetTypes) {
    int lastSpace = street.lastIndexOf(' ');
    if (lastSpace <= 0) {
        return null;
    }
    String lastWord = street.substring(lastSpace + 1);
    String afterDot = "";
    int dot = lastWord.indexOf('.');
    if (dot > 0) {
        // the street type stops at the dot, and the dot goes away with it
        afterDot = lastWord.substring(dot + 1);
        lastWord = lastWord.substring(0, dot);
    }
    if (!streetTypes.contains(lastWord.toLowerCase())) {
        return null;
    }
    return (street.substring(0, lastSpace + 1) + afterDot).trim();
}
/**
 * Tells whether a name designates the given street : the name is compared, with
 * {@link #isSameStreetName(String, String, String)} and the country code of the street,
 * to the name of the street first, then to each of its alternate names.
 *
 * @param name the name to look for
 * @param openstreetmap the street to compare with
 * @return true if the name or one alternate name of the street matches,
 *         false otherwise or if one of the parameters is null
 */
public static boolean isSameStreetName(String name,OpenStreetMap openstreetmap){
    if (name == null || openstreetmap == null) {
        return false;
    }
    if (StringHelper.isSameStreetName(name, openstreetmap.getName(), openstreetmap.getCountryCode())) {
        return true;
    }
    // search deeper
    if (openstreetmap.getAlternateNames() == null) {
        return false;
    }
    for (AlternateOsmName alternate : openstreetmap.getAlternateNames()) {
        if (alternate != null
                && StringHelper.isSameStreetName(name, alternate.getName(), openstreetmap.getCountryCode())) {
            return true;
        }
    }
    return false;
}
/**
 * Builds the list of all the street types known by a map of abbreviations.
 *
 * @param map the abbreviation => full street type map, must not be null
 * @return a new modifiable list holding every value of the map followed by every key of the map
 */
private static Collection<String> getlistOfNormalizedStreetType(
        Map<String, String> map) {
    Collection<String> results = new ArrayList<String>(map.values());
    results.addAll(map.keySet());
    return results;
}
/**
 * Cleans a raw address before it is searched :
 * <ul>
 * <li>the first "rn" followed by 1 to 4 digits is expanded ("rn 12" gives "route nationale 12");</li>
 * <li>the two parts of a zip code written in two groups of digits (3+2 digits, else 2+3 digits),
 * separated by a whitespace or a hyphen, are concatenated ("lille 59 000" gives "lille 59000").</li>
 * </ul>
 * The text of the address is re-inserted literally : '$' and '\' used to be interpreted as group
 * references by {@link Matcher#appendReplacement(StringBuffer, String)}, which threw an exception
 * or corrupted the address. The prepared address is traced at debug level (it used to be logged
 * as an error although nothing went wrong).
 *
 * @param rawAddress the address as typed, may be null
 * @return the prepared address, or null if the raw address is null
 */
public static String prepareQuery(String rawAddress) {
    if (rawAddress == null) {
        return null;
    }
    Matcher roadMatcher = RN_PATTERN.matcher(rawAddress);
    if (roadMatcher.find()) {
        StringBuffer sb = new StringBuffer();
        // group 2 only holds digits, no quoting needed
        roadMatcher.appendReplacement(sb, "route nationale " + roadMatcher.group(2));
        roadMatcher.appendTail(sb);
        rawAddress = sb.toString();
    }
    // 3+2 digits first, 2+3 digits only when the first form is not found
    Matcher zipMatcher = ZIPCONCATENATE_3_2__PATTERN.matcher(rawAddress);
    boolean zipFound = zipMatcher.find();
    if (!zipFound) {
        zipMatcher = ZIPCONCATENATE_2_3_PATTERN.matcher(rawAddress);
        zipFound = zipMatcher.find();
    }
    if (zipFound) {
        StringBuffer sb = new StringBuffer();
        String concatenated = zipMatcher.group(1) + " " + zipMatcher.group(2) + zipMatcher.group(3);
        // group 1 is free text typed by the user : it must not be read as a replacement pattern
        zipMatcher.appendReplacement(sb, Matcher.quoteReplacement(concatenated));
        zipMatcher.appendTail(sb);
        rawAddress = sb.toString();
    }
    logger.debug("prepared address : {}", rawAddress);
    return rawAddress;
}
}