/*******************************************************************************
 *   Gisgraphy Project 
 * 
 *   This library is free software; you can redistribute it and/or
 *   modify it under the terms of the GNU Lesser General Public
 *   License as published by the Free Software Foundation; either
 *   version 2.1 of the License, or (at your option) any later version.
 * 
 *   This library is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *   Lesser General Public License for more details.
 * 
 *   You should have received a copy of the GNU Lesser General Public
 *   License along with this library; if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA
 * 
 *  Copyright 2008  Gisgraphy project 
 *  David Masclet <davidmasclet@gisgraphy.com>
 *  
 *  
 *******************************************************************************/
package com.gisgraphy.compound;

import static com.gisgraphy.compound.Trie.CONDENSE;
import static com.gisgraphy.compound.Trie.trie;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Word-list based decompounder that keeps the unknown parts of a word.
 * <p>
 * Decompounders are often based on a word list and do not return the unknown
 * words (as the Lucene one does). That is not very useful for street names
 * (e.g. fooStrasse). This decompounder splits a word around the words of the
 * list but keeps the unknown parts: with the words {weg, wald},
 * {@code decompound("foowegwald")} is expected to give [foo, weg, wald], where
 * a decompounder that only returns known words would give [weg, wald]
 * (this assumes {@code Trie.trie} builds a regular expression matching the
 * given words; that class is not part of this file).
 * <p>
 * A word of the list that ends with a dot (e.g. {@code "str."}) is an
 * <em>ending word</em>: the dot is stripped and the word is only matched when it
 * is followed by a word boundary, optionally followed by a literal dot.
 * Matching is case-insensitive.
 *
 * @author david Masclet
 */
public class Decompounder {

    /** Flags shared by both patterns so that they fold case the same way. */
    private static final int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Matches any configured word; used by {@link #decompound(String)}. */
    private final Pattern p;

    /**
     * Matches one character (group 2) followed by an ending word (group 3).
     * Held per instance: it is built from the word list given to the constructor.
     */
    private final Pattern concatenatePattern;

    /** How an ending word is attached to what precedes it. */
    public enum state {CONCATENATE, SEPARATE, NOT_APPLICABLE}

    /** Matches a trailing dot. Not referenced by the code of this class. */
    Pattern ENDING_POINT = Pattern.compile("\\.$");

    /** Default word list; every entry ends with a dot and is therefore an ending word. */
    public static List<String> DEFAULT_WORD = new ArrayList<String>(Arrays.asList(
            "weg.",
            "str.",
            "straße.",
            "strasse.",
            "plätze.",
            "plätz.",
            "platze.",
            "platz.",
            "wald."));

    /** Upper-case country codes accepted by {@link #isDecompoudCountryCode(String)}. */
    private static final List<String> DECOMPOUND_COUNTRIES =
            Arrays.asList("DE", "CH", "LI", "AT", "DK");

    /**
     * Creates a decompounder based on {@link #DEFAULT_WORD}.
     */
    public Decompounder() {
        this(DEFAULT_WORD);
    }

    /**
     * Creates a decompounder for the given words.
     *
     * @param words the words to split on; an entry ending with a dot is an
     *              ending word (see the class description)
     * @throws IllegalArgumentException if {@code words} is null
     */
    public Decompounder(List<String> words) {
        if (words == null) {
            throw new IllegalArgumentException("words list is mandatory for a decompounder");
        }
        List<String> inWords = new ArrayList<String>();
        List<String> endWords = new ArrayList<String>();
        for (String word : words) {
            if (word.endsWith(".")) {
                // the trailing dot is only a marker, it is not part of the word
                endWords.add(word.substring(0, word.length() - 1));
            } else {
                inWords.add(word);
            }
        }
        // NOTE(review): what trie() returns for an empty list is not visible here;
        // confirm it cannot produce an alternative that matches the empty string.
        String inWordsRegex = trie(inWords, CONDENSE);
        String endWordsRegex = trie(endWords, CONDENSE);

        concatenatePattern = Pattern.compile(
                "((\\S|\\s)(" + endWordsRegex + "\\b[\\.]?))", PATTERN_FLAGS);
        p = Pattern.compile(
                "((?:(" + endWordsRegex + "\\b[\\.]?))|(?:" + inWordsRegex + "))", PATTERN_FLAGS);
    }

    /**
     * Splits a string around every configured word it contains.
     *
     * @param str the string to split, must not be null
     * @return the parts of {@code str}, known words and remaining text alike,
     *         in their original order; a one-element array holding {@code str}
     *         when no configured word is found
     * @throws NullPointerException if {@code str} is null
     */
    public String[] decompound(String str) {
        Matcher m = p.matcher(str);
        StringBuffer spaced = new StringBuffer();
        boolean found = false;
        while (m.find()) {
            found = true;
            // quoteReplacement: the matched text must be copied literally
            m.appendReplacement(spaced, Matcher.quoteReplacement(" " + m.group(0) + " "));
        }
        if (!found) {
            return new String[]{str};
        }
        m.appendTail(spaced);
        return normalizeSpaces(spaced.toString()).split(" ");
    }

    /**
     * Returns the other spelling of a name: the separated form when the ending
     * word is attached, the attached form when it is separated by a space.
     * Only the first ending word found is changed.
     *
     * @param str the name, may be null
     * @return the other spelling, or {@code str} itself when it is null or
     *         contains no ending word
     */
    public String getOtherFormat(String str) {
        state current = getSate(str);
        if (current == state.CONCATENATE) {
            return separate(str);
        } else if (current == state.SEPARATE) {
            return concatenate(str);
        }
        return str;
    }

    /**
     * Switches the spelling of every ending word found in a text. An ending
     * word preceded by a space or a hyphen is attached to what precedes it,
     * any other one is detached with a space. Whitespace runs are then
     * collapsed to one space and the result is trimmed.
     *
     * @param text the text, may be null
     * @return the converted text, or null when {@code text} is null
     */
    public String getOtherFormatForText(String text) {
        if (text == null) {
            return text;
        }
        Matcher m = concatenatePattern.matcher(text);
        StringBuffer converted = new StringBuffer();
        while (m.find()) {
            m.appendReplacement(converted, otherFormReplacement(m, true));
        }
        m.appendTail(converted);
        return normalizeSpaces(converted.toString());
    }

    /**
     * Same as {@link #separate(String)}: that method switches the spelling in
     * both directions.
     *
     * @param text the name, may be null
     * @return see {@link #separate(String)}
     */
    public String concatenate(String text) {
        return separate(text);
    }

    /**
     * Switches the spelling of the first ending word found: it is attached
     * when preceded by a space, detached with a space otherwise. Later
     * occurrences are left untouched. When a word is found, whitespace runs
     * are collapsed to one space and the result is trimmed.
     *
     * @param text the name, may be null
     * @return the converted name, or {@code text} unchanged when it is null or
     *         contains no ending word
     */
    public String separate(String text) {
        if (text == null) {
            return text;
        }
        Matcher m = concatenatePattern.matcher(text);
        if (!m.find()) {
            return text;
        }
        StringBuffer converted = new StringBuffer();
        m.appendReplacement(converted, otherFormReplacement(m, false));
        m.appendTail(converted);
        return normalizeSpaces(converted.toString());
    }

    /**
     * Tells how the first ending word of a text is attached.
     *
     * @param text the text, may be null
     * @return {@code SEPARATE} when the character before the first ending word
     *         is a space, {@code CONCATENATE} when it is any other character,
     *         {@code NOT_APPLICABLE} when {@code text} is null or no ending
     *         word preceded by a character is found
     */
    public state getSate(String text) {
        if (text == null) {
            return state.NOT_APPLICABLE;
        }
        Matcher m = concatenatePattern.matcher(text);
        if (!m.find()) {
            return state.NOT_APPLICABLE;
        }
        return " ".equals(m.group(2)) ? state.SEPARATE : state.CONCATENATE;
    }

    /**
     * Tells whether a country code is one of DE, CH, LI, AT, DK.
     *
     * @param countryCode the country code in any case, may be null
     * @return true when the code is in the list, false otherwise or when null
     */
    public static boolean isDecompoudCountryCode(String countryCode) {
        // Locale.ROOT: with the default locale "li" is not upper-cased to "LI" under a Turkish locale
        return countryCode != null
                && DECOMPOUND_COUNTRIES.contains(countryCode.toUpperCase(Locale.ROOT));
    }

    /**
     * Tells whether a name contains an ending word preceded by a character.
     *
     * @param name the name, may be null
     * @return true when {@link #getSate(String)} is not {@code NOT_APPLICABLE}
     */
    public boolean isDecompoudName(String name) {
        return name != null && getSate(name) != state.NOT_APPLICABLE;
    }

    /**
     * Builds the replacement for the current match of the concatenate pattern.
     *
     * @param m a matcher positioned on a match
     * @param hyphenIsSeparator whether a hyphen before the word is handled like a space
     * @return the replacement, already quoted for {@code appendReplacement}
     */
    private static String otherFormReplacement(Matcher m, boolean hyphenIsSeparator) {
        String previousChar = m.group(2);
        String endingWord = m.group(3);
        boolean separated = " ".equals(previousChar)
                || (hyphenIsSeparator && "-".equals(previousChar));
        String replacement = separated ? endingWord : previousChar + " " + endingWord + " ";
        // the previous character can be '$' or '\', which appendReplacement would interpret
        return Matcher.quoteReplacement(replacement);
    }

    /** Collapses every whitespace run to a single space and trims the result. */
    private static String normalizeSpaces(String value) {
        return value.replaceAll("\\s+", " ").trim();
    }
}