/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package opennlp.tools.postag; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Objects; import opennlp.tools.dictionary.serializer.Attributes; import opennlp.tools.dictionary.serializer.DictionaryEntryPersistor; import opennlp.tools.dictionary.serializer.Entry; import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.StringList; import opennlp.tools.util.StringUtil; import opennlp.tools.util.model.SerializableArtifact; /** * Provides a means of determining which tags are valid for a particular word * based on a tag dictionary read from a file. */ public class POSDictionary implements Iterable<String>, MutableTagDictionary, SerializableArtifact { private Map<String, String[]> dictionary; private boolean caseSensitive = true; /** * Initializes an empty case sensitive {@link POSDictionary}. */ public POSDictionary() { this(true); } /** * Initializes an empty {@link POSDictionary}. * @param caseSensitive the {@link POSDictionary} case sensitivity */ public POSDictionary(boolean caseSensitive) { dictionary = new HashMap<>(); this.caseSensitive = caseSensitive; } /** * Returns a list of valid tags for the specified word. * * @param word The word. * * @return A list of valid tags for the specified word or * null if no information is available for that word. */ public String[] getTags(String word) { if (caseSensitive) { return dictionary.get(word); } else { return dictionary.get(StringUtil.toLowerCase(word)); } } /** * Associates the specified tags with the specified word. If the dictionary * previously contained the word, the old tags are replaced by the specified * ones. * * @param word * The word to be added to the dictionary. * @param tags * The set of tags associated with the specified word. * * @deprecated Use {@link #put(String, String[])} instead */ void addTags(String word, String... tags) { put(word, tags); } /** * Retrieves an iterator over all words in the dictionary. */ public Iterator<String> iterator() { return dictionary.keySet().iterator(); } private static String tagsToString(String[] tags) { StringBuilder tagString = new StringBuilder(); for (String tag : tags) { tagString.append(tag); tagString.append(' '); } // remove last space if (tagString.length() > 0) { tagString.setLength(tagString.length() - 1); } return tagString.toString(); } /** * Writes the {@link POSDictionary} to the given {@link OutputStream}; * * After the serialization is finished the provided * {@link OutputStream} remains open. * * @param out * the {@link OutputStream} to write the dictionary into. * * @throws IOException * if writing to the {@link OutputStream} fails */ public void serialize(OutputStream out) throws IOException { Iterator<Entry> entries = new Iterator<Entry>() { Iterator<String> iterator = dictionary.keySet().iterator(); public boolean hasNext() { return iterator.hasNext(); } public Entry next() { String word = iterator.next(); Attributes tagAttribute = new Attributes(); tagAttribute.setValue("tags", tagsToString(getTags(word))); return new Entry(new StringList(word), tagAttribute); } public void remove() { throw new UnsupportedOperationException(); } }; DictionaryEntryPersistor.serialize(out, entries, caseSensitive); } @Override public int hashCode() { int[] keyHashes = new int[dictionary.size()]; int[] valueHashes = new int[dictionary.size()]; int i = 0; for (String word : this) { keyHashes[i] = word.hashCode(); valueHashes[i] = Arrays.hashCode(getTags(word)); i++; } Arrays.sort(keyHashes); Arrays.sort(valueHashes); return Objects.hash(Arrays.hashCode(keyHashes), Arrays.hashCode(valueHashes)); } @Override public boolean equals(Object obj) { if (obj == this) { return true; } if (obj instanceof POSDictionary) { POSDictionary posDictionary = (POSDictionary) obj; if (this.dictionary.size() == posDictionary.dictionary.size()) { for (String word : this) { if (!Arrays.equals(getTags(word), posDictionary.getTags(word))) { return false; } } return true; } } return false; } @Override public String toString() { // it is time consuming to output the dictionary entries. // will output something meaningful for debugging, like // POSDictionary{size=100, caseSensitive=true} return "POSDictionary{size=" + dictionary.size() + ", caseSensitive=" + this.caseSensitive + "}"; } /** * Creates a new {@link POSDictionary} from a provided {@link InputStream}. * * After creation is finished the provided {@link InputStream} is closed. * * @param in * * @return the pos dictionary * * @throws IOException * @throws InvalidFormatException */ public static POSDictionary create(InputStream in) throws IOException { final POSDictionary newPosDict = new POSDictionary(); boolean isCaseSensitive = DictionaryEntryPersistor.create(in, entry -> { String tagString = entry.getAttributes().getValue("tags"); String[] tags = tagString.split(" "); StringList word = entry.getTokens(); if (word.size() != 1) throw new InvalidFormatException("Each entry must have exactly one token! " + word); newPosDict.dictionary.put(word.getToken(0), tags); }); newPosDict.caseSensitive = isCaseSensitive; // TODO: The dictionary API needs to be improved to do this better! if (!isCaseSensitive) { Map<String, String[]> lowerCasedDictionary = new HashMap<>(); for (Map.Entry<String, String[]> entry : newPosDict.dictionary.entrySet()) { lowerCasedDictionary.put(StringUtil.toLowerCase(entry.getKey()), entry.getValue()); } newPosDict.dictionary = lowerCasedDictionary; } return newPosDict; } public String[] put(String word, String... tags) { if (this.caseSensitive) { return dictionary.put(word, tags); } else { return dictionary.put(StringUtil.toLowerCase(word), tags); } } public boolean isCaseSensitive() { return this.caseSensitive; } @Override public Class<?> getArtifactSerializerClass() { return POSTaggerFactory.POSDictionarySerializer.class; } }