/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.dictionary.universal; import java.io.InputStream; import java.util.Set; import java.util.regex.Matcher; import edu.emory.clir.clearnlp.collection.tree.CharAffixTree; import edu.emory.clir.clearnlp.dictionary.PathTokenizer; import edu.emory.clir.clearnlp.util.DSUtils; import edu.emory.clir.clearnlp.util.IOUtils; import edu.emory.clir.clearnlp.util.MetaUtils; import edu.emory.clir.clearnlp.util.StringUtils; /** * @since 3.0.0 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class DTEmoticon { private Set<String> s_emoticon; private CharAffixTree t_prefix; private CharAffixTree t_suffix; public DTEmoticon() { init(IOUtils.getInputStreamsFromClasspath(PathTokenizer.EMOTICONS)); } public DTEmoticon(InputStream in) { init(in); } public void init(InputStream in) { s_emoticon = DSUtils.createStringHashSet(in, true, false); t_prefix = new CharAffixTree(true); t_prefix.addAll(s_emoticon); t_suffix = new CharAffixTree(false); t_suffix.addAll(s_emoticon); } public int[] getEmoticonRange(String s) { s = StringUtils.toLowerCase(s); if (s_emoticon.contains(s)) return new int[]{0, s.length()}; Matcher m = MetaUtils.EMOTICON.matcher(s); if (m.find()) return new int[]{m.start(), m.end()}; int idx; if ((idx = t_prefix.getAffixIndex(s, false)) >= 0) return new int[]{0, idx+1}; if ((idx = t_suffix.getAffixIndex(s, false)) >= 0) return new int[]{idx, s.length()}; return null; } }