/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.tika.parser.ner.opennlp; import org.apache.tika.parser.ner.NERecogniser; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * * This implementation of {@link NERecogniser} chains an array of * {@link OpenNLPNameFinder}s for which NER models are * available in classpath. * * The following models are scanned during initialization via class loader.: * * <table> * <tr> * <th>Entity Type</th><th>Path</th> * </tr> * <tr> * <td>{@value PERSON}</td><td> {@value PERSON_FILE}</td> * </tr> * <tr> * <td>{@value LOCATION}</td><td>{@value LOCATION_FILE}</td> * </tr> * <tr> * <td>{@value ORGANIZATION}</td><td>{@value ORGANIZATION_FILE}</td> * </tr> * <tr> * <td>{@value TIME}</td><td>{@value TIME_FILE}</td> * </tr> * <tr> * <td>{@value DATE}</td><td>{@value DATE_FILE}</td> * </tr> * <tr> * <td>{@value PERCENT}</td><td>{@value PERCENT_FILE}</td> * </tr> * <tr> * <td>{@value MONEY}</td><td>{@value MONEY_FILE}</td> * </tr> * </table> * * @see org.apache.tika.parser.ner.NamedEntityParser#DEFAULT_NER_IMPL */ public class OpenNLPNERecogniser implements NERecogniser { public static final String MODELS_DIR = OpenNLPNERecogniser.class .getPackage().getName().replace(".", "/"); public static final String PERSON_FILE = "ner-person.bin"; public static final String LOCATION_FILE = "ner-location.bin"; public static final String ORGANIZATION_FILE = "ner-organization.bin"; public static final String TIME_FILE = "ner-time.bin"; public static final String DATE_FILE = "ner-date.bin"; public static final String PERCENT_FILE = "ner-percentage.bin"; public static final String MONEY_FILE = "ner-money.bin"; //Default (English) Models for the common 7 classes of named types public static final String NER_PERSON_MODEL = MODELS_DIR + "/" + PERSON_FILE; public static final String NER_LOCATION_MODEL = MODELS_DIR + "/" + LOCATION_FILE; public static final String NER_ORGANIZATION_MODEL = MODELS_DIR + "/" + ORGANIZATION_FILE; public static final String NER_TIME_MODEL = MODELS_DIR + "/" + TIME_FILE; public static final String NER_DATE_MODEL = MODELS_DIR + "/" + DATE_FILE; public static final String NER_PERCENT_MODEL = MODELS_DIR + "/" + PERCENT_FILE; public static final String NER_MONEY_MODEL = MODELS_DIR + "/" + MONEY_FILE; public static final Map<String, String> DEFAULT_MODELS = new HashMap<String, String>(){{ put(PERSON, NER_PERSON_MODEL); put(LOCATION, NER_LOCATION_MODEL); put(ORGANIZATION, NER_ORGANIZATION_MODEL); put(TIME, NER_TIME_MODEL); put(DATE, NER_DATE_MODEL); put(PERCENT, NER_PERCENT_MODEL); put(MONEY, NER_MONEY_MODEL); }}; private Set<String> entityTypes; private List<OpenNLPNameFinder> nameFinders; private boolean available; /** * Creates a default chain of Name finders using default OpenNLP recognizers */ public OpenNLPNERecogniser(){ this(DEFAULT_MODELS); } /** * Creates a chain of Named Entity recognisers * @param models map of entityType -> model path * NOTE: the model path should be known to class loader. */ public OpenNLPNERecogniser(Map<String, String> models){ this.nameFinders = new ArrayList<>(); this.entityTypes = new HashSet<>(); for (Map.Entry<String, String> entry : models.entrySet()) { OpenNLPNameFinder finder = new OpenNLPNameFinder(entry.getKey(), entry.getValue()); if (finder.isAvailable()) { this.nameFinders.add(finder); this.entityTypes.add(entry.getKey()); } } this.entityTypes = Collections.unmodifiableSet(this.entityTypes); this.available = nameFinders.size() > 0; //at least one finder is present } @Override public boolean isAvailable() { return available; } @Override public Set<String> getEntityTypes() { return entityTypes; } @Override public Map<String, Set<String>> recognise(String text) { String[] tokens = OpenNLPNameFinder.tokenize(text); Map<String, Set<String>> names = new HashMap<>(); for (OpenNLPNameFinder finder : nameFinders) { names.putAll(finder.findNames(tokens)); } return names; } }