/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.commons.opennlp; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.nio.charset.Charset; import java.security.AccessController; import java.security.PrivilegedActionException; import java.security.PrivilegedExceptionAction; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import opennlp.tools.chunker.Chunker; import opennlp.tools.chunker.ChunkerME; import opennlp.tools.chunker.ChunkerModel; import opennlp.tools.namefind.NameFinderME; import opennlp.tools.namefind.TokenNameFinder; import opennlp.tools.namefind.TokenNameFinderModel; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTagger; import opennlp.tools.postag.POSTaggerME; import opennlp.tools.sentdetect.SentenceDetector; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.tokenize.SimpleTokenizer; import opennlp.tools.tokenize.Tokenizer; import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; import opennlp.tools.util.InvalidFormatException; import org.apache.commons.io.IOUtils; import org.apache.felix.scr.annotations.Component; import org.apache.felix.scr.annotations.Reference; import org.apache.felix.scr.annotations.Service; import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * OSGI service that let you load OpenNLP Models via the Stanbol * {@link DataFileProvider} infrastructure. This allows users to copy models * to the 'datafiles' directory or developer to provide models via via OSGI * bundles.<p> * This service also provides methods that directly return the OpenNLP component * wrapping the model. */ @Component(immediate=true) @Service(value=OpenNLP.class) public class OpenNLP { /** * added as link to the download location for requested model files * Will show up in the DataFilePorivder tab in the Apache Felix Web Console */ private static final String DOWNLOAD_ROOT = "http://opennlp.sourceforge.net/models-1.5/"; /** * The logger */ private final Logger log = LoggerFactory.getLogger(getClass()); @Reference private DataFileProvider dataFileProvider; /** * Map holding the already built models * TODO: change to use a WeakReferenceMap */ protected Map<String,Object> models = new HashMap<String,Object>(); /** * used to sync access to the {@link #models} and {@link #modelCreationLock} */ protected ReadWriteLock modelLock = new ReentrantReadWriteLock(); /** * used to avoid loading the same model multiple times in parallel. * The value is a int array with an single element. The int at index zero is * used as reference count. When it reaches zero the mapping can be deleted * from the map. */ protected Map<String,int[]> modelCreationLock = new HashMap<String,int[]>(); /** * Default constructor */ public OpenNLP(){ super(); } /** * Constructor intended to be used when running outside an OSGI environment * (e.g. when used for UnitTests) * @param dataFileProvider the dataFileProvider used to load Model data. */ public OpenNLP(DataFileProvider dataFileProvider){ this(); this.dataFileProvider = dataFileProvider; } /** * Getter for the sentence detection model of the parsed language. * If the model is not yet available a new one is built. The required data * are loaded by using the {@link DataFileProvider} service. * @param language the language * @return the model or <code>null</code> if no model data are found * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public SentenceModel getSentenceModel(String language) throws InvalidFormatException, IOException { return initModel(String.format("%s-sent.bin", language), SentenceModel.class); } /** * Getter for the sentence detector of the parsed language. * @param language the language * @return the model or <code>null</code> if no model data are found * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public SentenceDetector getSentenceDetector(String language) throws IOException { SentenceModel sentModel = getSentenceModel(language); if(sentModel != null){ return new SentenceDetectorME(sentModel); } else { log.debug("No Sentence Detection Model for language '{}'",language); return null; } } /** * Getter for the named entity finder model for the parsed entity type and language. * If the model is not yet available a new one is built. The required data * are loaded by using the {@link DataFileProvider} service. * @param type the type of the named entities to find (person, organization) * @param language the language * @return the model or <code>null</code> if no model data are found * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public TokenNameFinderModel getNameModel(String type, String language) throws InvalidFormatException, IOException { return initModel(String.format("%s-ner-%s.bin", language, type), TokenNameFinderModel.class); } /** * Getter for the {@link TokenNameFinder} for the parsed entity type and language. * @param type the type of the named entities to find (person, organization) * @param language the language * @return the model or <code>null</code> if no model data are found * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public TokenNameFinder getNameFinder(String type, String language) throws IOException { TokenNameFinderModel model = getNameModel(type, language); if(model != null){ return new NameFinderME(model); } else { log.debug("TokenNameFinder model for type {} and langauge {} not present",type,language); return null; } } /** * Getter for the tokenizer model for the parsed language. * If the model is not yet available a new one is built. The required data * are loaded by using the {@link DataFileProvider} service. * @param language the language * @return the model or <code>null</code> if no model data are found * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public TokenizerModel getTokenizerModel(String language) throws InvalidFormatException, IOException { return initModel(String.format("%s-token.bin", language),TokenizerModel.class); } /** * Getter for the Tokenizer of a given language. This first tries to * create an {@link TokenizerME} instance if the required * {@link TokenizerModel} for the parsed language is available. if such a * model is not available it returns the {@link SimpleTokenizer} instance. * @param language the language or <code>null</code> to build a * {@link SimpleTokenizer} * @return the {@link Tokenizer} for the parsed language. */ public Tokenizer getTokenizer(String language) { Tokenizer tokenizer = null; if(language != null){ try { TokenizerModel model = getTokenizerModel(language); if(model != null){ tokenizer = new TokenizerME(model); } } catch (InvalidFormatException e) { log.warn("Unable to load Tokenizer Model for "+language+": " + "Will use Simple Tokenizer instead",e); } catch (IOException e) { log.warn("Unable to load Tokenizer Model for "+language+": " + "Will use Simple Tokenizer instead",e); } } if(tokenizer == null){ log.debug("Use Simple Tokenizer for language {}",language); tokenizer = SimpleTokenizer.INSTANCE; } else { log.debug("Use ME Tokenizer for language {}",language); } return tokenizer; } /** * Getter for the "part-of-speech" model for the parsed language. * If the model is not yet available a new one is built. The required data * are loaded by using the {@link DataFileProvider} service. * @param language the language * @return the model or <code>null</code> if no model data are found * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public POSModel getPartOfSpeechModel(String language) throws IOException, InvalidFormatException { //typically there are two versions //we prefer the perceptron variant but if not available try to build the other IOException first = null; POSModel model; try { model = initModel(String.format("%s-pos-perceptron.bin",language), POSModel.class); } catch (IOException e) { first = e; log.warn("Unable to laod preceptron based POS model for "+language,e); model = null; } if(model == null){ log.debug("No perceptron based POS model for language "+language+ "available. Will try to load maxent model"); try { model = initModel(String.format("%s-pos-maxent.bin",language), POSModel.class); } catch (IOException e) { if(first != null){ throw first; } else { throw e; } } } return model; } /** * Getter for the "part-of-speech" tagger for the parsed language. * @param language the language * @return the model or <code>null</code> if no model data are found * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public POSTagger getPartOfSpeechTagger(String language) throws IOException { POSModel posModel = getPartOfSpeechModel(language); if(posModel != null){ return new POSTaggerME(posModel); } else { log.debug("No POS Model for language '{}'",language); return null; } } /** * Getter for the Model with the parsed type, name and properties. * @param modelType the type of the Model (e.g. {@link ChunkerModel}) * @param modelName the name of the model file. MUST BE available via the * {@link DataFileProvider}. * @param properties additional properties about the model (parsed to the * {@link DataFileProvider}. NOTE that "Description", "Model Type" and * "Download Location" are set to default values if not defined in the * parsed value. * @return the loaded (or cached) model * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public <T> T getModel(Class<T> modelType,String modelName, Map<String,String> properties) throws InvalidFormatException, IOException { return initModel(modelName, modelType, properties); } /** * Getter for the chunker model for the parsed language. * If the model is not yet available a new one is built. The required data * are loaded by using the {@link DataFileProvider} service. * @param language the language * @return the model or <code>null</code> if no model data are present * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public ChunkerModel getChunkerModel(String language) throws InvalidFormatException, IOException { return initModel(String.format("%s-chunker.bin", language), ChunkerModel.class); } /** * Getter for the {@link Chunker} for a given language * @param language the language * @return the {@link Chunker} or <code>null</code> if no model is present * @throws InvalidFormatException in case the found model data are in the wrong format * @throws IOException on any error while reading the model data */ public Chunker getChunker(String language) throws IOException { ChunkerModel chunkerModel = getChunkerModel(language); if(chunkerModel != null){ return new ChunkerME(chunkerModel); } else { log.debug("No Chunker Model for language {}",language); return null; } } // /** // * Activates the component and re-enables all {@link DataFileProvider}s // * previously {@link #registerModelLocation(BundleContext, String...) registered}. // * @param context the context // */ // @Activate // protected void activate(ComponentContext context){ // synchronized (modelLocations) { // for(ModelLocation modelLocation : modelLocations.values()){ // if(modelLocation.provider == null){ // modelLocation.provider = new BundleResourceProvider( // modelLocation.bundleContext, // modelLocation.paths == null ? null : Arrays.asList(modelLocation.paths)); // } // still registered -> should never happen unless activate is called twice // } // } // } // /** // * Deactivates this component. Deactivates all {@link DataFileProvider}s for // * {@link #registerModelLocation(BundleContext, String...) registered} // * locations to search for OpenNLP models and also // * {@link Map#clear() clears} the {@link #models model cache}. // * @param context the context // */ // @Deactivate // protected void deactivate(ComponentContext context){ // synchronized (modelLocations) { // for(ModelLocation modelLocation : modelLocations.values()){ // if(modelLocation.provider != null){ // modelLocation.provider.close(); // modelLocation.provider = null; // } // } // } // //clear the model cache // models.clear(); // } // /** // * Registers the parsed paths as locations to lookup openNLP models.<p> // * This Method is a convenience for manually registering a // * {@link DataFileProvider} that provides the openNLP model classes such as: // * <pre><code> // * protected void activate(ComponentContext context){ // * this.modelProvider = new BundleResourceProvider( // * context.getBundleContext, Arrays.asList("openNLP/models")); // * ... // * } // * // * protected void deactivate(ComponentContext context){ // * if(this.modelProvider != null){ // * modelProvider.close(); // * modelProvider = null; // * } // * ... // * } // * </code></pre><p> // * Note that multiple calls with the same bundleContext will cause previous // * registration for the same {@link BundleContext} to be removed.<p> // * {@link DataFileProvider}s created by this will be removed/added as this // * Component is activated/deactivated. However registrations are not // * persisted and will be gone after an restart of the OSGI environment // * @param bundleContext The context of the bundle used to load openNLP models // * @param searchPaths The paths used to search openNLP models (via the // * bundles classpath). // */ // public void registerModelLocation(BundleContext bundleContext, String...searchPaths){ // if(bundleContext == null){ // throw new IllegalArgumentException("The parsed BundleContext MUST NOT be NULL!"); // } // String bundleSymbolicName = bundleContext.getBundle().getSymbolicName(); // synchronized (modelLocations) { // ModelLocation current = modelLocations.get(bundleSymbolicName); // if(current != null){ // if(Arrays.equals(searchPaths, current.paths)) { // log.debug("ModelLocations for Bundle {} and Paths {} already registered"); // return; // } else { //remove current registration // log.debug("remove existing ModelLocations for Bundle {} and Paths {}", // bundleSymbolicName,current.paths); // if(current.provider != null){ // current.provider.close(); // } // } // } else { // current = new ModelLocation(); // current.bundleContext = bundleContext; // } // current.paths = searchPaths; // current.provider = new BundleResourceProvider(bundleContext, // searchPaths == null ? null : Arrays.asList(searchPaths)); // modelLocations.put(bundleSymbolicName, current); // } // // } // /** // * Removes previously registerd openNLP model locations for the parsed bundle // * context. // * @param bundleContext // */ // public void unregisterModelLocation(BundleContext bundleContext){ // if(bundleContext == null){ // throw new IllegalArgumentException("The parsed BundleContext MUST NOT be NULL!"); // } // String bundleSymbolicName = bundleContext.getBundle().getSymbolicName(); // synchronized (modelLocations) { // ModelLocation current = modelLocations.remove(bundleSymbolicName); // if(current != null){ // log.debug("remove modelLocation for Bundle {} and paths {}", // bundleSymbolicName,current.paths); // if(current.provider != null){ // current.provider.close(); // } // } // } // } /** * Uses generics to build models of the parsed type. The {@link #models} * map is used to lookup already created models. * @param <T> the type of the model to create * @param name the name of the file with the model data * @param modelType the class object representing the model to create * @return the model or <code>null</code> if the model data where not found * @throws InvalidFormatException if the model data are in an invalid format * @throws IOException on any error while loading the model data * @throws IllegalStateException on any Exception while creating the model */ private <T> T initModel(String name,Class<T> modelType) throws InvalidFormatException, IOException { return initModel(name, modelType,null); } /** * Uses generics to build models of the parsed type. The {@link #models} * map is used to lookup already created models. * @param <T> the type of the model to create * @param name the name of the file with the model data * @param modelType the class object representing the model to create * @param modelProperties additional metadata about the requested model * @return the model or <code>null</code> if the model data where not found * @throws InvalidFormatException if the model data are in an invalid format * @throws IOException on any error while loading the model data * @throws IllegalStateException on any Exception while creating the model */ private <T> T initModel(String name,Class<T> modelType, Map<String,String> modelProperties) throws InvalidFormatException, IOException { T model = getCachedModel(name, modelType); if(model != null){ return model; } //else create the model //We need to avoid creating a model twice in parallel modelLock.writeLock().lock(); int[] lock; try { lock = modelCreationLock.get(name); if(lock == null){ lock = new int[]{0}; modelCreationLock.put(name, lock); } lock[0]++; } finally { modelLock.writeLock().unlock(); } try { //create only one model with the same name in parallel synchronized (lock) { //now we have the lock ... // first check if it was created while we where waiting for the lock model = getCachedModel(name, modelType); if(model != null){ return model; } //not created in the meantime ... we need to create it! T built = loadModel(name, modelType, modelProperties); //register the model modelLock.writeLock().lock(); try { models.put(name, built); } finally { modelLock.writeLock().unlock(); } return built; } } finally { //we do no longer need the lock lock[0]--; //check if we need to clean up the modelCreationLock map if(lock[0] == 0){ modelLock.writeLock().lock(); try { if(lock[0] == 0){ modelCreationLock.remove(name); } } finally { modelLock.writeLock().unlock(); } } } } private <T> T loadModel(String name, Class<T> modelType, Map<String, String> modelProperties) throws InvalidFormatException, IOException { if(modelProperties != null){ //copy the data to avoid external modifications modelProperties = new HashMap<String,String>(modelProperties); }else { modelProperties = new HashMap<String,String>(); } if(!modelProperties.containsKey("Description")){ modelProperties.put("Description", "Statistical model for OpenNLP"); } if(!modelProperties.containsKey("Model Type")){ modelProperties.put("Model Type", modelType.getSimpleName()); } if(!modelProperties.containsKey("Download Location")){ modelProperties.put("Download Location", DOWNLOAD_ROOT+name); } InputStream modelDataStream; try { modelDataStream = lookupModelStream(name,modelProperties); } catch (IOException e) { log.debug("Unable to load Resource {} via the DataFileProvider",name); return null; } if(modelDataStream == null){ log.debug("Unable to load Resource {} via the DataFileProvider",name); return null; } T built; try { Constructor<T> constructor; constructor = modelType.getConstructor(InputStream.class); built = constructor.newInstance(modelDataStream); } catch (SecurityException e) { throw new IllegalStateException(String.format( "Unable to create %s for %s!",modelType.getSimpleName(), name),e); } catch (NoSuchMethodException e) { throw new IllegalStateException(String.format( "Unable to create %s for %s!",modelType.getSimpleName(), name),e); } catch (IllegalArgumentException e) { throw new IllegalStateException(String.format( "Unable to create %s for %s!",modelType.getSimpleName(), name),e); } catch (InstantiationException e) { throw new IllegalStateException(String.format( "Unable to create %s for %s!",modelType.getSimpleName(), name),e); } catch (IllegalAccessException e) { throw new IllegalStateException(String.format( "Unable to create %s for %s!",modelType.getSimpleName(), name),e); } catch (InvocationTargetException e) { //this indicates an exception while creating the instance //for InvalidFormatException and IO Exceptions we shall //directly throw the cause. for all others wrap the thrown one //in an IllegalStateException Throwable checked = e.getCause(); if (checked instanceof InvalidFormatException){ throw (InvalidFormatException)checked; } else if(checked instanceof IOException){ throw (IOException)checked; } else { throw new IllegalStateException(String.format( "Unable to create %s for %s!",modelType.getSimpleName(), name),e); } } finally { IOUtils.closeQuietly(modelDataStream); } return built; } /** * Used to retrieve a model of the parsed model type from the internal cache * @param name the name of the model * @param modelType the type of the model * @return the model or <code>null</code> if not cached * @throws IllegalStateException if the cached model does not have the * expected type */ private <T> T getCachedModel(String name, Class<T> modelType) { modelLock.readLock().lock(); try { Object model = models.get(name); if(model != null) { if(modelType.isAssignableFrom(model.getClass())){ return modelType.cast(model); } else { throw new IllegalStateException(String.format( "Incompatible Model Types for name '%s': present=%s | requested=%s", name,model.getClass(),modelType)); } } else { return null; } } finally { modelLock.readLock().unlock(); } } /** * Lookup an openNLP data file via the {@link #dataFileProvider} * @param modelName the name of the model * @return the stream or <code>null</code> if not found * @throws IOException an any error while opening the model file */ protected InputStream lookupModelStream(final String modelName, final Map<String,String> properties) throws IOException { try { return AccessController.doPrivileged(new PrivilegedExceptionAction<InputStream>() { public InputStream run() throws IOException { return dataFileProvider.getInputStream(null, modelName,properties); } }); } catch (PrivilegedActionException pae) { Exception e = pae.getException(); if(e instanceof IOException){ throw (IOException)e; } else { throw RuntimeException.class.cast(e); } } } /** * Remove non UTF-8 compliant characters (typically control characters) so has to avoid polluting the * annotation graph with snippets that are not serializable as XML. */ protected static String removeNonUtf8CompliantCharacters(final String text) { if (null == text) { return null; } Charset UTF8 = Charset.forName("UTF-8"); byte[] bytes = text.getBytes(UTF8); for (int i = 0; i < bytes.length; i++) { byte ch = bytes[i]; // remove any characters outside the valid UTF-8 range as well as all control characters // except tabs and new lines if (!((ch > 31 && ch < 253) || ch == '\t' || ch == '\n' || ch == '\r')) { bytes[i] = ' '; } } return new String(bytes, UTF8); } }