package com.gisgraphy.fulltext;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.gisgraphy.helper.StringHelper;
/**
* A class to detect if a text contains a street type
*
* @author <a href="mailto:david.masclet@gisgraphy.com">David Masclet</a>
*
*/
public class SmartStreetDetection {

    /** Finds the "straße" spelling (with a literal ß) in the raw, non-normalized text. */
    private static final Pattern STRABE_PATTERN = Pattern.compile("straße", Pattern.CASE_INSENSITIVE);

    /**
     * Street types that are only detected as a standalone word: each entry is
     * wrapped in word boundaries on both sides by {@link #getRegexp()}.
     */
    private static final List<String> STREET_TYPES = Collections.unmodifiableList(Arrays.asList(
            "rue", "boulevard", "autoroute", "bd", "blvd", "avenue", "chemin", "rte", "route",
            "impasse", "passage", "place", "sentier", "voie", "via", "allee", "alley",
            "highway", "hiway", "motorway", "plaza", "road", "street", "rua", "carrera",
            "camino", "passatge", "autovia", "autopista", "autobahn", "fleck"));

    /**
     * Street types that are also detected at the end of a longer word: each
     * entry only gets a trailing word boundary in {@link #getRegexp()}, so a
     * word that merely ends with one of them matches too.
     */
    private static final List<String> STREET_TYPES_DECOMPOUND = Collections.unmodifiableList(Arrays.asList(
            "str", "straße", "strasse", "plätze", "platze", "landstraße", "landstrasse"));

    /** Regular expression built once from the two lists above. */
    private static final String STREET_REGEXP = getRegexp();

    private static final Pattern STREET_PATTERN =
            Pattern.compile(STREET_REGEXP, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /**
     * Extracts the street types found in a text.
     * <p>
     * The search runs on the text returned by {@code StringHelper.normalize},
     * but the returned strings are cut out of the original (trimmed) text at
     * the corresponding offsets, so they keep the original characters.
     * <p>
     * NOTE(review): the offset correction assumes that the only length
     * difference between the original and the normalized text comes from
     * "straße" being rewritten as "strasse" (one extra character each), and
     * that the first "strasse" matches are the ones spelled with ß in the
     * original. Confirm against {@code StringHelper.normalize}.
     *
     * @param textToTest the text to analyze, may be {@code null}
     * @return the street types found, in order of appearance; an empty list
     *         when the text is {@code null} or nothing matches
     */
    public List<String> getStreetTypes(String textToTest) {
        List<String> streetTypes = new ArrayList<String>();
        if (textToTest == null) {
            return streetTypes;
        }
        String normalizedText = StringHelper.normalize(textToTest);
        if (normalizedText == null) {
            return streetTypes;
        }
        String originalText = textToTest.trim();
        int originalLength = originalText.length();

        // Number of "straße" occurrences of the original text not yet attributed to a match.
        int pendingExpansions = countNumberOfstrasse(originalText);
        // Number of one-character expansions located before the current position.
        int expansionsSoFar = 0;

        Matcher matcher = STREET_PATTERN.matcher(normalizedText);
        while (matcher.find()) {
            // The pattern has a single capturing group spanning the whole match,
            // so the match itself is used directly.
            // Every earlier expansion moved this match one character to the right
            // in the normalized text; this applies to all matches, not only to
            // the "strasse" ones.
            int start = matcher.start() - expansionsSoFar;
            if (pendingExpansions > 0 && matcher.group().indexOf("strasse") >= 0) {
                // This match contains an expansion itself: its end is shifted once more.
                pendingExpansions--;
                expansionsSoFar++;
            }
            int end = matcher.end() - expansionsSoFar;

            if (end > originalLength) {
                // Still out of range: slide the window back so it ends at the end of the text.
                start -= end - originalLength;
                end = originalLength;
            }
            start = Math.max(0, start);

            String originalMatch = originalText.substring(start, end);
            if (originalMatch.trim().length() > 0) {
                streetTypes.add(originalMatch);
            }
        }
        return streetTypes;
    }

    /**
     * Counts the occurrences of "straße" (spelled with ß) in a text.
     *
     * @param text the text to scan, must not be {@code null}
     * @return the number of occurrences
     */
    private int countNumberOfstrasse(String text) {
        int occurrences = 0;
        Matcher matcher = STRABE_PATTERN.matcher(text);
        while (matcher.find()) {
            occurrences++;
        }
        return occurrences;
    }

    /**
     * Builds the regular expression used to detect street types: a single
     * capturing group holding two alternations, the standalone-word types
     * first and the decompound types second.
     *
     * @return the regular expression source
     */
    static String getRegexp() {
        StringBuilder regexp = new StringBuilder("((?:");
        appendAlternatives(regexp, STREET_TYPES, "\\b");
        regexp.append(")|(?:");
        appendAlternatives(regexp, STREET_TYPES_DECOMPOUND, "");
        regexp.append("))");
        return regexp.toString();
    }

    /**
     * Appends the words as a '|'-separated alternation, each one preceded by
     * the given prefix and followed by a word boundary.
     */
    private static void appendAlternatives(StringBuilder target, List<String> words, String prefix) {
        for (int i = 0; i < words.size(); i++) {
            if (i > 0) {
                target.append('|');
            }
            target.append(prefix).append(words.get(i)).append("\\b");
        }
    }
}