/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.postag; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import org.cogroo.tools.featurizer.WordTag; import opennlp.tools.dictionary.serializer.Attributes; import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor; import opennlp.tools.dictionary.serializer.Entry; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.StringList; import opennlp.tools.util.StringUtil; /** * Provides a means of determining which tags are valid for a particular word * based on a tag dictionary read from a file. */ public class ExtendedPOSDictionary implements Iterable<WordTag>, ExtendedTagDictionary { // word => [ tag => [lemma, feature]] // private Map<String, String[]> dictionary; private Map<String, List<Triple>> dictionary; private static final String ATTR_TAGS = "tags"; private static final String ATTR_LEMMAS = "lemmas"; private static final String ATTR_FEATS = "feats"; private boolean caseSensitive = true; public ExtendedPOSDictionary() { this(true); } public ExtendedPOSDictionary(boolean caseSensitive) { dictionary = new HashMap<String, List<Triple>>(); this.caseSensitive = caseSensitive; } /** * Returns a list of valid tags for the specified word. * * @param word * The word. * * @return A list of valid tags for the specified word or null if no * information is available for that word. */ public String[] getTags(String word) { if (caseSensitive) { return getTags(dictionary.get(word)); } else { return getTags(dictionary.get(word.toLowerCase())); } } public String[] getFeatureTag(String word) { if (caseSensitive) { return getFeats(dictionary.get(word)); } else { return getFeats(dictionary.get(word.toLowerCase())); } } public String[] getCompleteTag(String word) { if (caseSensitive) { return getCompleteTag(dictionary.get(word)); } else { return getCompleteTag(dictionary.get(word.toLowerCase())); } } void addTriple(String word, Triple triple) { addTriple(dictionary, word, triple); } static void addTriple(Map<String, List<Triple>> dic, String word, Triple triple) { if (!dic.containsKey(word)) { dic.put(word, new ArrayList<Triple>()); } dic.get(word).add(triple); } /** * Retrieves an iterator over all words in the dictionary. */ public Iterator<WordTag> iterator() { return new IteratorWrapper(); } private static String tagsToString(String tags[]) { StringBuilder tagString = new StringBuilder(); for (String tag : tags) { tagString.append(tag); tagString.append(' '); } // remove last space if (tagString.length() > 0) { tagString.setLength(tagString.length() - 1); } return tagString.toString(); } /** * Writes the {@link ExtendedPOSDictionary} to the given {@link OutputStream}; * * After the serialization is finished the provided {@link OutputStream} * remains open. * * @param out * the {@link OutputStream} to write the dictionary into. * * @throws IOException * if writing to the {@link OutputStream} fails */ public void serialize(OutputStream out) throws IOException { Iterator<Entry> entries = new Iterator<Entry>() { Iterator<String> iterator = dictionary.keySet().iterator(); public boolean hasNext() { return iterator.hasNext(); } public Entry next() { String word = iterator.next(); List<Triple> triples = dictionary.get(word); Attributes tagAttribute = new Attributes(); String[] tags = new String[triples.size()]; String[] lemmas = new String[triples.size()]; String[] feats = new String[triples.size()]; int i = 0; for (Triple t : triples) { tags[i] = t.getClazz(); lemmas[i] = t.getLemma(); feats[i++] = t.getFeats(); } tagAttribute.setValue(ATTR_TAGS, tagsToString(tags)); tagAttribute.setValue(ATTR_LEMMAS, tagsToString(lemmas)); tagAttribute.setValue(ATTR_FEATS, tagsToString(feats)); return new Entry(new StringList(word), tagAttribute); } public void remove() { throw new UnsupportedOperationException(); } }; DictionaryEntryPersistor.serialize(out, entries, caseSensitive); } // @Override // public boolean equals(Object o) { // // if (o == this) { // return true; // } else if (o instanceof ExtendedPOSDictionary) { // ExtendedPOSDictionary dictionary = (ExtendedPOSDictionary) o; // // if (this.dictionary.size() == dictionary.dictionary.size()) { // // for (String word : this) { // // List<Triple> aTriples = this.dictionary.get(word); // List<Triple> bTriples = dictionary.dictionary.get(word); // if (!aTriples.equals(bTriples)) { // return false; // } // } // // return true; // } // } // // return false; // } @Override public String toString() { StringBuilder dictionaryString = new StringBuilder(); int i = 0; for (String word : dictionary.keySet()) { dictionaryString.append(word).append(" -> ") .append(tagsToString(getTags(word))); dictionaryString.append("\n"); if (i > 3) break; i++; } // remove last new line if (dictionaryString.length() > 0) { dictionaryString.setLength(dictionaryString.length() - 1); } return dictionaryString.toString(); } /** * Creates a new {@link ExtendedPOSDictionary} from a provided * {@link InputStream}. * * After creation is finished the provided {@link InputStream} is closed. * * @param in * * @return the pos dictionary * * @throws IOException * @throws InvalidFormatException */ public static ExtendedPOSDictionary create(InputStream in) throws IOException, InvalidFormatException { final ExtendedPOSDictionary newPosDict = new ExtendedPOSDictionary(); boolean isCaseSensitive = DictionaryEntryPersistor.create(in, entry -> { String tagString = entry.getAttributes().getValue(ATTR_TAGS); String lemmaString = entry.getAttributes().getValue(ATTR_LEMMAS); String featString = entry.getAttributes().getValue(ATTR_FEATS); String[] tags = tagString.split(" "); String[] lemmas = lemmaString.split(" "); String[] feats = featString.split(" "); StringList word = entry.getTokens(); if (word.size() != 1) throw new InvalidFormatException( "Each entry must have exactly one token! " + word); if (tags.length != lemmas.length || tags.length != feats.length) { throw new InvalidFormatException( "Each entry must have exactly number of tags, lemmas and feats! " + word); } addTriple(newPosDict.dictionary, word.getToken(0), createTriple(tags, lemmas, feats)); }); newPosDict.caseSensitive = isCaseSensitive; // TODO: The dictionary API needs to be improved to do this better! if (!isCaseSensitive) { Map<String, List<Triple>> lowerCasedDictionary = new HashMap<String, List<Triple>>(); for (java.util.Map.Entry<String, List<Triple>> entry : newPosDict.dictionary .entrySet()) { lowerCasedDictionary.put(StringUtil.toLowerCase(entry.getKey()), entry.getValue()); } newPosDict.dictionary = lowerCasedDictionary; } return newPosDict; } protected static void addTriple(Map<String, List<Triple>> dict, String token, Triple[] triple) { for (Triple t : triple) { addTriple(dict, token, t); } } public String[] getFeatures(String word, String tag) { List<String> feats = new ArrayList<String>(); if (caseSensitive) { List<Triple> triples = dictionary.get(word); if (triples == null) return null; for (Triple t : triples) { if (tag.equals(t.getClazz())) { feats.add(t.getFeats()); } } } else { List<Triple> triples = dictionary.get(word.toLowerCase()); if (triples == null) return null; for (Triple t : triples) { if (tag.equals(t.getClazz())) { feats.add(t.getFeats()); } } } if (feats.size() > 0) { return feats.toArray(new String[feats.size()]); } return null; } public String getLemma(String word, String tag) { if (caseSensitive) { List<Triple> triples = dictionary.get(word); if (triples == null) return null; for (Triple t : triples) { if (tag.equals(t.getClazz())) { return t.getLemma(); } } } else { List<Triple> triples = dictionary.get(word.toLowerCase()); if (triples == null) return null; for (Triple t : triples) { if (tag.equals(t.getClazz())) { return t.getLemma(); } } } return null; } private static String[] getFeats(List<Triple> triples) { if (triples == null) return null; String[] feats = new String[triples.size()]; int i = 0; for (Triple t : triples) { feats[i++] = t.getFeats(); } return feats; } private String[] getCompleteTag(List<Triple> triples) { if (triples == null) return null; String[] feats = new String[triples.size()]; int i = 0; for (Triple t : triples) { feats[i++] = t.getClazz() + "_" + t.getFeats(); } return feats; } private static String[] getTags(List<Triple> triples) { if (triples == null) { return null; } String[] tags = new String[triples.size()]; int i = 0; for (Triple t : triples) { tags[i++] = t.getClazz(); } return tags; } protected static Triple[] createTriple(String[] tags, String[] lemmas, String[] feats) { Triple[] triples = new Triple[tags.length]; for (int i = 0; i < tags.length; i++) { triples[i] = new Triple(tags[i], lemmas[i], feats[i]); } return triples; } private class IteratorWrapper implements Iterator<WordTag> { Iterator<String> inner = dictionary.keySet().iterator(); String word = null; Iterator<Triple> innerTriple = null; public boolean hasNext() { if(innerTriple != null && innerTriple.hasNext()) { return true; } return inner.hasNext(); } public WordTag next() { if(innerTriple == null || !innerTriple.hasNext()) { word = inner.next(); innerTriple = dictionary.get(word).iterator(); } Triple val = innerTriple.next(); return new WordTag(word, val.getClazz()); } public void remove() { // TODO Auto-generated method stub } } }