/**
 * Copyright 2014, Emory University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.emory.clir.clearnlp.dictionary.universal;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;

import edu.emory.clir.clearnlp.dictionary.PathTokenizer;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.Splitter;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.PatternConst;
import edu.emory.clir.clearnlp.util.constant.StringConst;

/**
 * Replaces every match of {@link PatternConst#HTML_TAG} in a string with the character it stands for,
 * using a name-to-character dictionary and, failing that, a decimal numeric reference.
 * Matches that cannot be resolved either way are left in the text unchanged.
 *
 * @since 3.0.0
 * @author Jinho D. Choi ({@code jinho.choi@emory.edu})
 */
public class DTHtml {
    /** Smallest decimal reference value that {@link #getASCII(String)} accepts. */
    private static final int MIN_CODE_POINT = 32;
    /** Largest decimal reference value that {@link #getASCII(String)} accepts (0xE007F). */
    private static final int MAX_CODE_POINT = 917631;

    /** Dictionary key (first tab-separated field) to replacement text; rebuilt by {@link #init(InputStream)}. */
    private Map<String,String> m_tags;

    /** Loads the dictionary from the classpath resource {@link PathTokenizer#HTML_TAGS}. */
    public DTHtml() {
        init(IOUtils.getInputStreamsFromClasspath(PathTokenizer.HTML_TAGS));
    }

    /**
     * @param in dictionary stream; internally wrapped by
     *           {@code new BufferedReader(new InputStreamReader(in, UTF_8))}, read to the end and closed.
     */
    public DTHtml(InputStream in) {
        init(in);
    }

    /**
     * Rebuilds the dictionary from a stream of tab-separated lines: the first field is the key and the
     * second field is a decimal character code.
     * An {@link IOException} is reported through {@code printStackTrace()}, leaving whatever entries
     * were read before the failure.
     *
     * @param in dictionary stream; internally wrapped by
     *           {@code new BufferedReader(new InputStreamReader(in, UTF_8))}, read to the end and closed.
     * @throws NumberFormatException if the second field of a line is not a decimal integer.
     * @throws ArrayIndexOutOfBoundsException if a line yields fewer than two fields.
     */
    public void init(InputStream in) {
        // try-with-resources closes the reader (and with it the stream) even when a line is malformed;
        // the charset is pinned so that loading does not depend on the platform default.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            m_tags = new HashMap<>();
            String line;

            while ((line = reader.readLine()) != null) {
                String[] t = Splitter.splitTabs(line);
                m_tags.put(t[0], codePointToString(Integer.parseInt(t[1])));
            }
        }
        catch (IOException e) {e.printStackTrace();}
    }

    /**
     * Replaces all resolvable matches of {@link PatternConst#HTML_TAG} in the given string.
     *
     * @param s the text to process.
     * @return the text with resolvable matches replaced; the very same {@code s} instance if the
     *         pattern does not match at all.
     */
    public String replace(String s) {
        Matcher m = PatternConst.HTML_TAG.matcher(s);
        if (!m.find()) return s;

        // Matcher.appendReplacement only accepts a StringBuffer on older Java versions.
        StringBuffer sb = new StringBuffer(s.length());

        do {
            appendReplacement(sb, m);
        }
        while (m.find());

        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Appends the replacement for the current match: the dictionary value for group 1 (lower-cased),
     * else the character of a decimal numeric reference, else the matched text itself.
     */
    private void appendReplacement(StringBuffer sb, Matcher m) {
        String key = m.group(1);
        // Locale.ROOT keeps the lookup key independent of the default locale (e.g. Turkish dotless i).
        String replacement = m_tags.get(key.toLowerCase(Locale.ROOT));

        if (replacement == null) {
            int codePoint = getASCII(key);
            replacement = (codePoint != -1) ? codePointToString(codePoint) : m.group();
        }

        // The replacement is literal text: without quoting, a decoded '$' or '\' would be parsed as a
        // group reference / escape by appendReplacement and raise IllegalArgumentException.
        m.appendReplacement(sb, Matcher.quoteReplacement(replacement));
    }

    /**
     * Parses a decimal numeric reference of the form {@link StringConst#POUND} followed by digits.
     *
     * @return the parsed value if it lies in [{@value #MIN_CODE_POINT}, {@value #MAX_CODE_POINT}];
     *         otherwise -1 (also when the digits do not fit in an {@code int}).
     */
    private int getASCII(String s) {
        if (!s.startsWith(StringConst.POUND)) return -1;
        String digits = s.substring(1);
        if (!StringUtils.containsDigitOnly(digits)) return -1;

        try {
            int i = Integer.parseInt(digits);
            if (MIN_CODE_POINT <= i && i <= MAX_CODE_POINT) return i;
        }
        catch (NumberFormatException ignored) {
            // Digit string that Integer.parseInt rejects (e.g. too many digits): not a usable reference.
        }

        return -1;
    }

    /**
     * Converts a character code to a string. Supplementary code points (above U+FFFF) become a
     * surrogate pair instead of being truncated to 16 bits; every other value keeps the plain
     * {@code (char)} conversion.
     */
    private static String codePointToString(int codePoint) {
        return Character.isSupplementaryCodePoint(codePoint)
                ? new String(Character.toChars(codePoint))
                : Character.toString((char)codePoint);
    }
}