/* * Copyright 2013 Corpuslinguistic working group Humboldt University Berlin. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package annis.gui.flatquerybuilder; import com.vaadin.server.ClassResource; import com.vaadin.ui.Notification; import java.io.IOException; import java.util.HashMap; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** * @author klotzmaz * @author tom */ public class ReducingStringComparator { private HashMap<String, HashMap> ALLOGRAPHS; private static final String READING_ERROR_MESSAGE = "ERROR: Unable to load mapping file(s)!"; private static String MAPPING_FILE = "mapfile.fqb"; public ReducingStringComparator() { initAlphabet(); readMappings(); } public HashMap<String, HashMap> getMappings(){ return ALLOGRAPHS; } private HashMap initAlphabet() { HashMap<Character, Character> h = new HashMap<>(); //standard-alphabet: for(int i=97; i<122; i++) { char c = (char)i; h.put(c, c); h.put(Character.toUpperCase(c), c); } return h; } private void readMappings() { ALLOGRAPHS = new HashMap<>(); ClassResource cr = new ClassResource(ReducingStringComparator.class, MAPPING_FILE); try{ DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document mappingD = db.parse(cr.getStream().getStream()); NodeList mappings = mappingD.getElementsByTagName("mapping"); for (int i = 0; i < mappings.getLength(); i++) { Element mapping = (Element) mappings.item(i); String mappingName = mapping.getAttribute("name"); HashMap mappingMap = initAlphabet(); NodeList variants = mapping.getElementsByTagName("variant"); for (int j = 0; j < variants.getLength(); j++) { Element var = (Element) variants.item(j); char varvalue = var.getAttribute("value").charAt(0); Element character = (Element) var.getParentNode(); char charactervalue = character.getAttribute("value").charAt(0); mappingMap.put(varvalue, charactervalue); } ALLOGRAPHS.put(mappingName, mappingMap); } } catch(SAXException e) { e = null; Notification.show(READING_ERROR_MESSAGE); } catch(IOException e) { e = null; Notification.show(READING_ERROR_MESSAGE); } catch(ParserConfigurationException e) { e = null; Notification.show(READING_ERROR_MESSAGE); } } private String removeCombiningCharacters(String s) { String t=""; for (int i=0; i<s.length(); i++) { char c = s.charAt(i); int cp = (int)c; if(!( ((cp>767) & (cp<880)) | ((cp>1154) & (cp<1162)) | (cp==1619) | ((cp>2026) & (cp<2036)) | (cp==4352) | ((cp>4956) & (cp<4960)) | (cp==6783) | ((cp>7018) & (cp<7028)) | ((cp>7615) & (cp<7655)) | ((cp>7675) & (cp<7680)) | ((cp>8399) & (cp<8433)) | ((cp>11502) & (cp<11506)) | ((cp>11743) & (cp<11776)) | ((cp>12440) & (cp<12443)) | ((cp>42606) & (cp<42611)) | ((cp>42611) & (cp<42622)) | ((cp>42654) & (cp<42738)) | ((cp>43231) & (cp<43250)) | ((cp>65055) & (cp<65063)) | (cp==66045) | ((cp>119140) & (cp<119146)) | ((cp>119148) & (cp<119155)) | ((cp>119162) & (cp<119171)) | ((cp>119172) & (cp<119180)) | ((cp>119209) & (cp<119214)) | ((cp>119361) & (cp<119365)) )) { t = t + c; } } return t; } public int compare(Object a, Object b, String mapname) /* * use with Strings only * * <0: a<b * =0: a=b * >0: a>b * * compare() is split in 2 methods to make contains() * more comfortable (contains() could use compare2(), * so that a multiple application of removeCombiningCharacters() * on the same string is avoided) * */ { String s1 = removeCombiningCharacters((String)a); String s2 = removeCombiningCharacters((String)b); //compare without spaces return compare2(s1.replace(" ", ""), s2.replace(" ", ""), mapname); } private int compare2(String s1, String s2, String mapname) { int l = s1.length(); if (l<s2.length()) { return -1; } else if (l>s2.length()) { return 1; } for(int i=0; i<l; i++) { char c1 = s1.charAt(i); char c2 = s2.charAt(i); HashMap<Character, Character> curMap = ALLOGRAPHS.get(mapname); char rc1 = curMap.containsKey(c1) ? curMap.get(c1) : c1; char rc2 = (curMap.containsKey(c2)) ? curMap.get(c2) : c2; if(rc1<rc2) { return -1; } else if(rc1>rc2) { return 1; } } return 0; } public boolean startsWith(String fullSequence, String subSequence, String mapname) { //kill diacritics: String subS = removeCombiningCharacters(subSequence); String fullS = removeCombiningCharacters(fullSequence); //remove spaces: subS = subS.replace(" ", ""); fullS = fullS.replace(" ", ""); int l = subS.length(); if (fullS.length()<l) {return false;} return (compare2(fullS.substring(0, l), subS, mapname)==0); } public boolean contains(String fullSequence, String subSequence, String mapname) { //kill diacritics: String subS = removeCombiningCharacters(subSequence); String fullS = removeCombiningCharacters(fullSequence); //remove spaces: subS = subS.replace(" ", ""); fullS = fullS.replace(" ", ""); int l = subS.length(); for (int i=0; i<fullS.length()-l+1; i++) { if (compare2(fullS.substring(i, i+l), subS, mapname)==0) { return true; } } return false; } }