/**
 * Copyright 2014, Emory University
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package edu.emory.clir.clearnlp.dictionary.universal;

import java.io.InputStream;
import java.util.Set;

import edu.emory.clir.clearnlp.dictionary.AbstractDTTokenizer;
import edu.emory.clir.clearnlp.dictionary.PathTokenizer;
import edu.emory.clir.clearnlp.util.CharUtils;
import edu.emory.clir.clearnlp.util.DSUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.Splitter;
import edu.emory.clir.clearnlp.util.constant.StringConst;

/**
 * Dictionary-backed tokenizer that separates a currency marker from the number
 * attached to it (e.g., "US$1" -> ["US$", "1"]).
 * Two word lists are loaded: general currency strings and "dollar" prefixes;
 * every dollar prefix is additionally registered as a currency with
 * {@link StringConst#DOLLAR} appended.
 *
 * @since 3.0.0
 * @author Jinho D. Choi ({@code jinho.choi@emory.edu})
 */
public class DTCurrency extends AbstractDTTokenizer
{
	/** Currency strings, including every dollar prefix followed by {@link StringConst#DOLLAR}. */
	private Set<String> s_currency;
	/** Dollar prefixes exactly as read from the dollar dictionary. */
	private Set<String> s_dollar;

	/**
	 * Loads both dictionaries from the classpath locations
	 * {@link PathTokenizer#CURRENCY} and {@link PathTokenizer#CURRENCY_DOLLAR}.
	 */
	public DTCurrency()
	{
		// NOTE(review): the streams are not closed here; confirm whether
		// DSUtils.createStringHashSet closes the stream it is given.
		InputStream currency = IOUtils.getInputStreamsFromClasspath(PathTokenizer.CURRENCY);
		InputStream dollar   = IOUtils.getInputStreamsFromClasspath(PathTokenizer.CURRENCY_DOLLAR);

		init(currency, dollar);
	}

	/**
	 * Loads both dictionaries from caller-supplied streams.
	 * @param currency stream providing the currency strings.
	 * @param dollar stream providing the dollar prefixes.
	 */
	public DTCurrency(InputStream currency, InputStream dollar)
	{
		init(currency, dollar);
	}

	/**
	 * (Re)builds both sets from the given streams, replacing any previous content.
	 * @param currency stream providing the currency strings.
	 * @param dollar stream providing the dollar prefixes.
	 */
	public void init(InputStream currency, InputStream dollar)
	{
		s_currency = DSUtils.createStringHashSet(currency, true, true);
		s_dollar   = DSUtils.createStringHashSet(dollar  , true, true);

		// Every dollar prefix is also a currency once the dollar sign is appended
		// (presumably StringConst.DOLLAR is "$", e.g. "us" -> "us$" -- confirm).
		for (String s : s_dollar)
			s_currency.add(s+StringConst.DOLLAR);
	}

	/**
	 * @param lower the string to look up; by its name it is expected to be lowercased already.
	 * @return {@code true} if the string is one of the dollar prefixes.
	 */
	public boolean isCurrencyDollar(String lower)
	{
		return s_dollar.contains(lower);
	}

	/**
	 * @param lower the string to look up; by its name it is expected to be lowercased already.
	 * @return {@code true} if the string is one of the currency strings.
	 */
	public boolean isCurrency(String lower)
	{
		return s_currency.contains(lower);
	}

	/**
	 * Splits a token where a known currency string touches a digit, either as a
	 * prefix ("US$1") or as a suffix ("1US$").
	 * @param original the token as it appeared in the text; this is the string that gets split.
	 * @param lower the form of the token matched against the dictionary (expected to be lowercased).
	 * @param lcs characters inspected for digits; assumed to be the characters of
	 *        {@code lower} and index-aligned with {@code original} -- TODO confirm against callers.
	 * @return "US$1" -> ["US$", "1"]; {@code null} if no currency string is adjacent to a digit.
	 */
	public String[] tokenize(String original, String lower, char[] lcs)
	{
		int i, len = original.length();

		for (String currency : s_currency)
		{
			// An empty entry is a prefix and a suffix of every token and would only
			// ever yield a zero-width split (at index 0 or at len), so it is ignored.
			if (currency.isEmpty()) continue;

			if (lower.startsWith(currency))
			{
				i = currency.length();

				if (i < len && CharUtils.isDigit(lcs[i]))
					return Splitter.split(original, i);
			}

			// Deliberately not an 'else': a token may start with the currency string
			// without a digit following it and still end with "<digit><currency>".
			if (lower.endsWith(currency))
			{
				i = len - currency.length();

				if (0 < i && CharUtils.isDigit(lcs[i-1]))
					return Splitter.split(original, i);
			}
		}

		return null;
	}
}