/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.dictionary.impl;

import static org.cogroo.util.ByteArrayUtil.toByteArray;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;

import org.apache.log4j.Logger;
import org.cogroo.dictionary.LemmaDictionary;
import org.cogroo.util.CacheWrapper;
import org.cogroo.util.PairWordPOSTag;

import morfologik.stemming.Dictionary;
import morfologik.stemming.DictionaryLookup;
import morfologik.stemming.WordData;
import opennlp.tools.postag.TagDictionary;

/**
 * Tag and lemma dictionary backed by a morfologik {@link DictionaryLookup}.
 *
 * <p>Lookups go through two {@link CacheWrapper} instances (one for tags, one
 * for lemma/tag pairs). When a word has no entry in the underlying dictionary
 * and it differs from its lower-cased form, the lower-cased form is looked up
 * instead.
 *
 * <p>Iterating over this object yields the word form of every entry returned
 * by the underlying lookup's iterator.
 */
public class FSADictionary implements TagDictionary, LemmaDictionary,
    Iterable<String> {

  protected static final Logger LOGGER = Logger.getLogger(FSADictionary.class);

  private volatile DictionaryLookup dictLookup;

  private FSADictionary(DictionaryLookup dictLookup) {
    this.dictLookup = dictLookup;
  }

  /**
   * Computes the tag array for a word: all tags of the valid entries, or
   * {@code null} when there is no usable entry.
   */
  private CacheWrapper<String[]> tagCache = new CacheWrapper<String[]>() {

    @Override
    public String[] compute(String key) {
      // Locale.ROOT keeps the case folding independent of the JVM default
      // locale (e.g. a Turkish default would map "I" to a dotless "ı").
      String lowerKey = key.toLowerCase(Locale.ROOT);

      List<WordData> data;
      try {
        data = dictLookup.lookup(key);
      } catch (Throwable e) {
        // The lookup is not expected to fail; log the offending word so the
        // problem can be reported, then let the failure propagate unchanged.
        LOGGER.error("Exception in dictionary lookup. Please report the bug. "
            + "Word: [" + key + "]", e);
        throw e;
      }

      if (data.size() > 0) {
        List<String> tags = new ArrayList<String>(data.size());
        for (int i = 0; i < data.size(); i++) {
          WordData wd = data.get(i);
          if (isValid(wd)) {
            tags.add(wd.getTag().toString());
          }
        }
        // Entries were found: do not fall back to the lower-cased form even
        // if every one of them turned out to be invalid.
        return tags.isEmpty() ? null : tags.toArray(new String[tags.size()]);
      }

      if (!key.equals(lowerKey)) {
        return compute(lowerKey);
      }
      return null;
    }
  };

  private String[] tagLookup(String word) {
    return tagCache.get(word);
  }

  /**
   * Computes the (lemma, tag) pairs for a word as an unmodifiable list; the
   * list is empty when the word (and its lower-cased form) has no entry.
   */
  private CacheWrapper<List<PairWordPOSTag>> lemmaTagCache =
      new CacheWrapper<List<PairWordPOSTag>>() {

    @Override
    public List<PairWordPOSTag> compute(String key) {
      // See tagCache: locale-independent case folding.
      String lowerKey = key.toLowerCase(Locale.ROOT);

      List<WordData> data = dictLookup.lookup(key);
      if (data.size() > 0) {
        List<PairWordPOSTag> list = new ArrayList<PairWordPOSTag>(data.size());
        for (int i = 0; i < data.size(); i++) {
          WordData wd = data.get(i);
          if (isValid(wd)) {
            list.add(new PairWordPOSTag(wd.getStem().toString(),
                wd.getTag().toString()));
          }
        }
        return Collections.unmodifiableList(list);
      }

      if (!key.equals(lowerKey)) {
        return compute(lowerKey);
      }
      return Collections.emptyList();
    }
  };

  private List<PairWordPOSTag> lemmaTagLookup(String word) {
    return lemmaTagCache.get(word);
  }

  /**
   * Tells whether a dictionary entry can be used, i.e. whether it has a stem.
   * Entries without a stem are logged as errors.
   */
  private boolean isValid(WordData wd) {
    if (wd.getStem() == null) {
      LOGGER.error("Got invalid entry from FSA dictionary: " + wd);
      return false;
    }
    return true;
  }

  /**
   * Returns the tags known for a word.
   *
   * @param word the word to look up; may be {@code null}
   * @return the tags produced by the tag cache for {@code word}, or
   *         {@code null} when {@code word} is {@code null}
   */
  @Override
  public String[] getTags(String word) {
    if (word == null) {
      return null;
    }
    return tagLookup(word);
  }

  /**
   * Returns the lemmas of a word whose tag matches the requested tag.
   *
   * @param word the word to look up; {@code null} yields an empty array
   * @param tag the tag the entries must match (see
   *        {@link #tagsMatch(String, String)})
   * @return the matching lemmas, in dictionary order; never {@code null}
   */
  public String[] getLemmas(String word, String tag) {
    if (word == null) {
      // Same leniency as getTags(null), but with an empty result instead of
      // failing inside the lookup.
      return new String[0];
    }

    List<String> output = new ArrayList<String>();
    for (PairWordPOSTag pair : lemmaTagLookup(word)) {
      if (tagsMatch(pair.getPosTag(), tag)) {
        output.add(pair.getWord());
      }
    }
    return output.toArray(new String[output.size()]);
  }

  /**
   * Decides whether a dictionary tag satisfies a requested tag. Tags match
   * when they are equal, or when one side is {@code "n-adj"} and the other is
   * {@code "n"} or {@code "adj"}.
   */
  private static boolean tagsMatch(String entryTag, String requestedTag) {
    if (entryTag.equals(requestedTag)) {
      return true;
    }
    // TODO: this is language specific !!
    if ("n-adj".equals(entryTag)) {
      return "n".equals(requestedTag) || "adj".equals(requestedTag);
    }
    if ("n-adj".equals(requestedTag)) {
      return "n".equals(entryTag) || "adj".equals(entryTag);
    }
    return false;
  }

  /**
   * This is used by rule system.
   *
   * @param aWord the word to look up; {@code null} yields an empty list
   * @return the (lemma, tag) pairs for the word
   */
  public List<PairWordPOSTag> getTagsAndLemms(String aWord) {
    // TODO (translated from Portuguese; the original note is truncated):
    // finish this using Cache.
    if (aWord == null) {
      return Collections.emptyList();
    }
    return lemmaTagLookup(aWord);
  }

  /**
   * Creates a dictionary from an FSA file on disk. The features file is
   * located with {@link Dictionary#getExpectedFeaturesName(String)}.
   *
   * @param path path of the FSA dictionary file
   * @return the loaded dictionary
   * @throws IOException if either file cannot be opened or read
   */
  public static TagDictionary create(String path)
      throws IllegalArgumentException, IOException {
    // try-with-resources guarantees the first stream is released even when
    // opening the second one fails.
    try (FileInputStream fsaData = new FileInputStream(path);
        FileInputStream featuresData = new FileInputStream(
            Dictionary.getExpectedFeaturesName(path))) {
      return create(fsaData, featuresData);
    }
  }

  /**
   * Reads the features file that accompanies the FSA dictionary at
   * {@code path} into a byte array.
   *
   * @param path path of the FSA dictionary file
   * @return the bytes of the expected features file
   * @throws IOException if the file cannot be opened or read
   */
  public static byte[] getFSADictionaryInfo(String path) throws IOException {
    try (FileInputStream featuresData = new FileInputStream(
        Dictionary.getExpectedFeaturesName(path))) {
      return toByteArray(featuresData);
    }
  }

  /**
   * Reads the FSA dictionary file at {@code path} into a byte array.
   *
   * @param path path of the FSA dictionary file
   * @return the bytes of the file
   * @throws IOException if the file cannot be opened or read
   */
  public static byte[] getFSADictionaryData(String path) throws IOException {
    try (FileInputStream fsaData = new FileInputStream(path)) {
      return toByteArray(fsaData);
    }
  }

  /**
   * Creates a dictionary from already opened streams.
   *
   * @param fsaData stream with the FSA dictionary data
   * @param featuresData stream with the dictionary features
   * @return the loaded dictionary
   * @throws IOException if reading fails
   */
  public static FSADictionary create(InputStream fsaData,
      InputStream featuresData) throws IllegalArgumentException, IOException {
    // NOTE(review): readAndClose presumably closes both streams, as its name
    // suggests - confirm against the morfologik version in use.
    DictionaryLookup dictLookup = new DictionaryLookup(
        Dictionary.readAndClose(fsaData, featuresData));
    return new FSADictionary(dictLookup);
  }

  /**
   * Creates a dictionary from in-memory copies of the data and features
   * files.
   *
   * @param dictData bytes of the FSA dictionary
   * @param dictInfo bytes of the features file
   * @return the loaded dictionary
   * @throws IOException if reading fails
   */
  public static TagDictionary create(byte[] dictData, byte[] dictInfo)
      throws IllegalArgumentException, IOException {
    return create(new ByteArrayInputStream(dictData),
        new ByteArrayInputStream(dictInfo));
  }

  /**
   * Creates a dictionary from class path resources. The features resource is
   * located with {@link Dictionary#getExpectedFeaturesName(String)}.
   *
   * @param path resource path of the FSA dictionary
   * @return the loaded dictionary
   * @throws IOException if a resource is missing ({@link
   *         FileNotFoundException}) or cannot be read
   */
  public static FSADictionary createFromResources(String path)
      throws IllegalArgumentException, IOException {
    try (InputStream dic = openResource(path);
        InputStream info = openResource(
            Dictionary.getExpectedFeaturesName(path))) {
      return create(dic, info);
    }
  }

  /**
   * Opens a class path resource relative to this class, failing with a
   * descriptive exception instead of handing out {@code null}.
   */
  private static InputStream openResource(String resource)
      throws FileNotFoundException {
    InputStream in = FSADictionary.class.getResourceAsStream(resource);
    if (in == null) {
      throw new FileNotFoundException("Resource not found: " + resource);
    }
    return in;
  }

  /** Adapts an iterator over dictionary entries to an iterator over words. */
  private static class IteratorWrapper implements Iterator<String> {

    private final Iterator<WordData> innerIterator;

    public IteratorWrapper(Iterator<WordData> iterator) {
      this.innerIterator = iterator;
    }

    @Override
    public boolean hasNext() {
      return innerIterator.hasNext();
    }

    /** Returns the word of the next entry, or {@code null} for a null entry. */
    @Override
    public String next() {
      WordData wd = innerIterator.next();
      if (wd != null) {
        return wd.getWord().toString();
      }
      return null;
    }

    @Override
    public void remove() {
      innerIterator.remove();
    }
  }

  @Override
  public Iterator<String> iterator() {
    return new IteratorWrapper(this.dictLookup.iterator());
  }

  /**
   * Small interactive console: loads the bundled dictionary, then prints the
   * tag and lemma of every entry for each line typed, until {@code q} is
   * entered or the input ends. Entering {@code 0} queries a sample word.
   */
  public static void main(String[] args) throws IllegalArgumentException,
      IOException {

    long start = System.nanoTime();
    String path = "/fsa_dictionaries/pos/pt_br_jspell_corpus";

    FSADictionary td;
    try (InputStream dict = openResource(path + ".dict");
        InputStream info = openResource(path + ".info")) {
      td = create(dict, info);
    }

    System.out.println("Loading time ["
        + ((System.nanoTime() - start) / 1000000) + "ms]");

    Scanner kb = new Scanner(System.in);
    System.out.print("Enter a query: ");
    // hasNextLine() ends the session cleanly on end of input instead of
    // letting nextLine() throw.
    while (kb.hasNextLine()) {
      String input = kb.nextLine();
      if (input.equals("q")) {
        break;
      }
      if (input.equals("0")) {
        input = "cão";
      }

      List<PairWordPOSTag> pair = td.getTagsAndLemms(input);
      for (PairWordPOSTag pairWordPOSTag : pair) {
        System.out.println(pairWordPOSTag.getPosTag() + " : "
            + pairWordPOSTag.getWord());
      }

      System.out.print("Enter a query: ");
    }
  }
}