/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.keywordextraction.impl;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import opennlp.tools.chunker.Chunker;
import opennlp.tools.postag.POSTagger;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Chunk;
import org.apache.stanbol.commons.opennlp.TextAnalyzer.AnalysedText.Token;
import org.apache.stanbol.enhancer.engines.keywordextraction.linking.EntitySearcher;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
/**
* The configuration for the {@link EntityLinker}. Typically this
* configuration does not change often. Therefore it will be used for
* several {@link EntityLinker} instances processing different
* contents.
* @author Rupert Westenthaler
*
*/
public class EntityLinkerConfig {
/**
* The minimum length of Token to be used for searches in case no
* POS (Part of Speech) tags are available.
*/
public static final int DEFAULT_MIN_SEARCH_TOKEN_LENGTH = 3;
/**
* The default number for the maximum number of terms suggested for a word
*/
public static final int DEFAULT_SUGGESTIONS = 3;
/**
* Default value for the number of tokens that must be contained in
* suggested terms.
*/
public static final int DEFAULT_MIN_FOUND_TOKENS = 2;
/**
* Multiple Tokens can be sent to the {@link EntitySearcher} service. The
* service uses this as optional parameters for the search. Therefore
* returned Concepts MUST contain at least a single of the parsed
* tokens. <p>
* The default value of <code>2</code> should be enough for nearly all
* Taxonomies to sufficiently reduce the number of results.<p>
* NOTE that the labels (nameField) of the results are compared as a
* whole. So even if only 2 Tokens are used for the search there may be
* more mapped to the actual label of an result.
*/
public static final int DEFAULT_MAX_SEARCH_TOKENS = 2;
/**
* Default value for {@link #getNameField()} (rdfs:label)
*/
public static final String DEFAULT_NAME_FIELD = "http://www.w3.org/2000/01/rdf-schema#label";
/**
* Default value for {@link #getTypeField()} (rdf:type)
*/
public static final String DEFAULT_TYPE_FIELD = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
/**
* Default value for {@link #getRedirectField()} (rdf:seeAlso)
*/
public static final String DEFAULT_REDIRECT_FIELD = "http://www.w3.org/2000/01/rdf-schema#seeAlso";
/**
* The default language used to search for labels regardless of the language
* of the text. The default value is <code>null</code> causing to include
* labels that do not have a language assigned.
*/
public static final String DEFAULT_LANGUAGE = null;
/**
* The default for case sensitive matching is set to <code>false</code>
*/
public static final boolean DEFAULT_CASE_SENSITIVE_MATCHING_STATE = false;
/**
* Default mapping for Concept types to dc:type values added for
* TextAnnotations.
*/
public static final Map<String,IRI> DEFAULT_ENTITY_TYPE_MAPPINGS;
static { //the default mappings for the three types used by the Stanbol Enhancement Structure
Map<String,IRI> mappings = new HashMap<String,IRI>();
mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION.getUnicodeString(),
OntologicalClasses.DBPEDIA_ORGANISATION);
mappings.put("http://dbpedia.org/ontology/Newspaper", OntologicalClasses.DBPEDIA_ORGANISATION);
mappings.put("http://schema.org/Organization", OntologicalClasses.DBPEDIA_ORGANISATION);
// mappings.put(NamespaceEnum.dailymed+"organization",OntologicalClasses.DBPEDIA_ORGANISATION);
mappings.put(OntologicalClasses.DBPEDIA_PERSON.getUnicodeString(),
OntologicalClasses.DBPEDIA_PERSON);
mappings.put("http://xmlns.com/foaf/0.1/Person", OntologicalClasses.DBPEDIA_PERSON);
mappings.put("http://schema.org/Person", OntologicalClasses.DBPEDIA_PERSON);
mappings.put(OntologicalClasses.DBPEDIA_PLACE.getUnicodeString(),
OntologicalClasses.DBPEDIA_PLACE);
mappings.put("http://schema.org/Place", OntologicalClasses.DBPEDIA_PLACE);
mappings.put("http://www.opengis.net/gml/_Feature", OntologicalClasses.DBPEDIA_PLACE);
mappings.put(OntologicalClasses.SKOS_CONCEPT.getUnicodeString(),
OntologicalClasses.SKOS_CONCEPT);
mappings.put(OntologicalClasses.DBPEDIA_ORGANISATION.getUnicodeString(),
OntologicalClasses.DBPEDIA_ORGANISATION);
// IRI DRUG = new IRI(NamespaceEnum.drugbank+"drugs");
// mappings.put(DRUG.getUnicodeString(), DRUG);
// mappings.put(NamespaceEnum.dbpediaOnt+"Drug", DRUG);
// mappings.put(NamespaceEnum.dailymed+"drugs", DRUG);
// mappings.put(NamespaceEnum.sider+"drugs", DRUG);
// mappings.put(NamespaceEnum.tcm+"Medicine", DRUG);
//
// IRI DISEASE = new IRI(NamespaceEnum.diseasome+"diseases");
// mappings.put(DISEASE.getUnicodeString(), DISEASE);
// mappings.put(NamespaceEnum.linkedct+"condition", DISEASE);
// mappings.put(NamespaceEnum.tcm+"Disease", DISEASE);
//
// IRI SIDE_EFFECT = new IRI(NamespaceEnum.sider+"side_effects");
// mappings.put(SIDE_EFFECT.getUnicodeString(), SIDE_EFFECT);
//
// IRI INGREDIENT = new IRI(NamespaceEnum.dailymed+"ingredients");
// mappings.put(INGREDIENT.getUnicodeString(), INGREDIENT);
DEFAULT_ENTITY_TYPE_MAPPINGS = Collections.unmodifiableMap(mappings);
}
/**
* Enumeration over the different possibilities on how to deal with
* redirects (similar to Browsers following HTTP status 303 and RDF defining
* the "rdf:seeAlso" relation.
* @author Rupert Westenthaler
*/
public static enum RedirectProcessingMode {
/**
* Ignore redirects
*/
IGNORE,
/**
* Follow redirects, but only add the values (e.g. labels, types) such
* entities to the original one.
*/
ADD_VALUES,
/**
* Follow the redirect.
*/
FOLLOW
}
/**
* The default value for how to process redirect is set to
* {@link RedirectProcessingMode#IGNORE}
*/
public static RedirectProcessingMode DEFAULT_REDIRECT_PROCESSING_MODE =
RedirectProcessingMode.IGNORE;
/**
* The minimum length of labels that are looked-up in the directory
*/
private int minSearchTokenLength = DEFAULT_MIN_SEARCH_TOKEN_LENGTH;
/**
* The the maximum number of terms suggested for a word
*/
private int maxSuggestions = DEFAULT_SUGGESTIONS;
/**
* If several words are selected from the text to search for an Entity in the
* Dictionary (e.g. if a {@link Chunker} is used or if the {@link POSTagger}
* detects several connected nouns) that entities found for the such chunks
* MUST define a label (with no or the correct lanugage) that contains at
* least this number of tokens to be accepted.<p>
* TODO: make configurable
*/
private int minFoundTokens = DEFAULT_MIN_FOUND_TOKENS;
/**
* The maximum numbers of Tokens sent to the {@link EntitySearcher} to search
* for concepts. <p>
* NOTE that the labels (nameField) of the results are compared as a
* whole. So even if only e.g. 2 tokens are used for the search there may be
* more mapped to the actual label of an result.
*/
private int maxSearchTokens = DEFAULT_MAX_SEARCH_TOKENS;
private boolean caseSensitiveMatchingState = DEFAULT_CASE_SENSITIVE_MATCHING_STATE;
/**
* Holds the mappings of rdf:type used by concepts to dc:type values used
* by TextAnnotations.
*/
private Map<String,IRI> typeMappings;
private Map<String, IRI> unmodTypeMappings;
/**
* The mode on how to process redirect for Entities.
*/
private RedirectProcessingMode redirectProcessingMode;
/**
* the default DC Type
*/
private IRI defaultDcType;
private String nameField;
private String redirectField;
private String typeField;
private Set<String> selectedFields = new HashSet<String>();
/**
* The language always included in searches (regardless of the language
* detected for the text.
*/
private String defaultLanguage = DEFAULT_LANGUAGE;
/**
* Default for the maximum number of non-processable tokens that are
* allowed to not match before no further tokens are matched against a label
* of an Entity. <p>
* This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
* as '.' is a non-processable token in the text that is missing in the
* label.<p>
* The default is set to <code>1</code>
*/
public final static int DEFAULT_MAX_NOT_FOUND = 1;
/**
* Value of the maximum number of non-processable tokens that are
* allowed to not match before no further tokens are matched against a label
* of an Entity. <p>
* This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
* as '.' is a non-processable token in the text that is missing in the
* label.
*/
private int maxNotFound;
/**
* Default value for the minimum token match factor.
* If Tokens match is determined by comparing them using some algorithm.
* Results need to be in the range [0..1]. This factor defines the minimum
* similarity value so that a match is assumed. Not that this factor only
* is used for filtering out non-matching tokens. The similarity value will
* still used for calculating the confidence.<p>
* The default is set to <code>0.7</code>.
*/
public final static float DEFAULT_MIN_TOKEN_MATCH_FACTOR = 0.7f;
/**
* If Tokens match is determined by comparing them using some algorithm.
* Results need to be in the range [0..1]. This factor defines the minimum
* similarity value so that a match is assumed. Not that this factor only
* is used for filtering out non-matching tokens. The similarity value will
* still used for calculating the confidence
*/
private float minTokenMatchFactor;
/**
* Default constructor the initialises the configuration with the
* default values
*/
public EntityLinkerConfig(){
setMinSearchTokenLength(DEFAULT_MIN_SEARCH_TOKEN_LENGTH);
setMaxSuggestions(DEFAULT_SUGGESTIONS);
setMaxSearchTokens(DEFAULT_MAX_SEARCH_TOKENS);
setRedirectProcessingMode(DEFAULT_REDIRECT_PROCESSING_MODE);
typeMappings = new HashMap<String,IRI>(DEFAULT_ENTITY_TYPE_MAPPINGS);
unmodTypeMappings = Collections.unmodifiableMap(typeMappings);
setDefaultDcType(typeMappings.remove(null));
setNameField(DEFAULT_NAME_FIELD);
setRedirectField(DEFAULT_REDIRECT_FIELD);
setTypeField(DEFAULT_TYPE_FIELD);
setMaxNotFound(DEFAULT_MAX_NOT_FOUND);
setMinTokenMatchFactor(DEFAULT_MIN_TOKEN_MATCH_FACTOR);
}
/**
* Getter for the uri of the field used for the names in the taxonomy
* (e.g. rdfs:label, skos:prefLabel). Needs to return the full URI
* @return the field used for the names of in the Taxonomy.
*/
public final String getNameField() {
return nameField;
}
/**
* Setter for the uri of the field used for the entities in the vocabulary<p>
* <b>NOTE</b>: does NOT support the convertion of short to full URIs
* {@link NamespaceEnum}.
* @param nameField the nameField to set
*/
public final void setNameField(String nameField) {
this.nameField = nameField;
updateSelectedFields();
}
/**
* internally used to update the selected fields on changes to
* {@link #setNameField(String)}, {@link #setRedirectField(String)} or
* {@link #setTypeField(String)}
*/
private void updateSelectedFields() {
selectedFields.clear();
selectedFields.add(nameField);
selectedFields.add(redirectField);
selectedFields.add(typeField);
}
/**
* Getter for the selected fields. A set that includes the current
* {@link #getNameField()}, {@link #getTypeField()} and {@link #getRedirectField()}.
* @return the selectedFields
*/
public final Set<String> getSelectedFields() {
return selectedFields;
}
/**
* The field used to follow redirects (typically rdf:seeAlso)
* @return the redirect field
*/
public final String getRedirectField() {
return redirectField;
}
/**
* The field used to follow redirects (typically rdf:seeAlso)<p>
* <b>NOTE</b>: does NOT support the convertion of short to full URIs
* @param redirectField the redirectField to set
*/
public final void setRedirectField(String redirectField) {
this.redirectField = redirectField;
updateSelectedFields();
}
/**
* The field used to lookup the types (typically rdf:type)
* @return the field name used to lookup types
*/
public final String getTypeField() {
return typeField;
}
/**
* The field used to lookup the types (typically rdf:type)<p>
* <b>NOTE</b>: does NOT support the convertion of short to full URIs
* @param typeField the typeField to set
*/
public final void setTypeField(String typeField) {
this.typeField = typeField;
updateSelectedFields();
}
/**
* The minimum number of character a {@link Token} (word) must have to be
* used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
* in the taxonomy. Note that this parameter is only used of no POS (Part-
* of-speech) tags are available in the {@link AnalysedText}.
* @param minSearchTokenLength the minSearchTokenLength to set
*/
public void setMinSearchTokenLength(int minSearchTokenLength) {
this.minSearchTokenLength = minSearchTokenLength;
}
/**
* The minimum number of character a {@link Token} (word) must have to be
* used {@link EntitySearcher#lookup(java.util.List, String...) lookup} concepts
* in the taxonomy. Note that this parameter is only used of no POS (Part-
* of-speech) tags are available in the {@link AnalysedText}.
* @return the minSearchTokenLength
*/
public int getMinSearchTokenLength() {
return minSearchTokenLength;
}
/**
* Setter for the maximum number of suggestion returned.
* @param maxSuggestions the maxSuggestions to set
*/
public void setMaxSuggestions(int maxSuggestions) {
this.maxSuggestions = maxSuggestions;
}
/**
* Getter for the maximum number of suggestion returned.
* @return the maxSuggestions
*/
public int getMaxSuggestions() {
return maxSuggestions;
}
/**
* Setter for the minimum number of Tokens (of the content) that MUST match
* with a {@link EntitySearcher#getNameField() label} of a
* {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy}
* so that it is {@link Suggestion suggested} even if the match is only
* {@link MATCH#PARTIAL}. Entities that match less than that are only included
* if a label is an {@link MATCH#EXACT EXACT} match with the current position
* in the text.
* @param minFoundTokens the minFoundTokens to set
*/
public void setMinFoundTokens(int minFoundTokens) {
this.minFoundTokens = minFoundTokens;
}
/**
* Getter for the minimum number of Tokens (of the content) that MUST match
* with a {@link EntitySearcher#getNameField() label} of a
* {@link EntitySearcher#lookup(java.util.List, String...) concept of the taxonomy}
* so that it is {@link Suggestion suggested} even if the match is only
* {@link MATCH#PARTIAL}. Entities that match less than that are only included
* if a label is an {@link MATCH#EXACT EXACT} match with the current position
* in the text.
* @return the minFoundTokens
*/
public int getMinFoundTokens() {
return minFoundTokens;
}
/**
* Getter for the maximum number of tokens parsed to
* {@link EntitySearcher#lookup(java.util.List, String...)}
* @return the maxSearchTokens
*/
public final int getMaxSearchTokens() {
return maxSearchTokens;
}
/**
* The maximum number of tokens parsed to
* {@link EntitySearcher#lookup(java.util.List, String...)}. This is NOT the
* maximum number of Tokens mapped for Entities returned by such queries.<p>
* In case {@link Chunk}s are available in the parsed {@link AnalysedText}
* searches can be scoped by such chunks. However if no chunks are available,
* than this value is used to collect this number of words in the text.<p>
* The {@link #DEFAULT_MAX_SEARCH_TOKENS default value} of <code>2</code>
* should be ok in most cases.
* @param maxSearchTokens the maxSearchTokens to set
*/
public final void setMaxSearchTokens(int maxSearchTokens) {
this.maxSearchTokens = maxSearchTokens;
}
/**
* Getter for the case sensitive matching state
* @return the state
*/
public boolean isCaseSensitiveMatching() {
return caseSensitiveMatchingState;
}
/**
* Setter for the case sensitive matching state
* @param caseSensitiveMatchingState the state
*/
public void setCaseSensitiveMatchingState(boolean state) {
this.caseSensitiveMatchingState = state;
}
/**
* Removes the mapping for the parsed concept type
* @param conceptType the concept type to remove the mapping
* @return the previously mapped dc:type value or <code>null</code> if
* no mapping for the parsed concept type was present
*/
public IRI removeTypeMapping(String conceptType){
return typeMappings.remove(conceptType);
}
/**
*
* @param conceptType the type of the concept or <code>null</code> to
* add the default dc:type mapping. See also {@link #setDefaultDcType(IRI)}
* @param dcType the dc:type for the parsed concept type
* @return the previously mapped dc:type value if an existing mapping
* was updated or <code>null</code> if a new mapping was added.
*/
public IRI setTypeMapping(String conceptType, IRI dcType){
if(dcType == null) {
throw new IllegalArgumentException("The parsed dc:type URI MUST NOT be NULL!");
}
if(conceptType == null){ //handle setting of the default dc:type value
IRI oldDefault = getDefaultDcType();
setDefaultDcType(dcType);
return oldDefault;
}
return typeMappings.put(conceptType, dcType);
}
/**
* Setter for the default dc:type of linked entities if for none of the
* types of the suggestions a {@link #getTypeMappings()} exists. Set this
* to <code>null</code> to specify that no dc:type should be set in such
* cases.
* @param defaultDcType the defaultDcType to set
*/
public void setDefaultDcType(IRI defaultDcType) {
this.defaultDcType = defaultDcType;
}
/**
* The default type for Entities if no {@link #getTypeMappings() type mapping}
* is present. <code>null</code> means that no type should be set if no
* explicit mapping exists
* @return the defaultDcType
*/
public IRI getDefaultDcType() {
return defaultDcType;
}
/**
* Setter for the mode on how to deal with redirects
* @param redirectProcessingMode the redirectProcessingMode to set
*/
public void setRedirectProcessingMode(RedirectProcessingMode redirectProcessingMode) {
this.redirectProcessingMode = redirectProcessingMode;
}
/**
* Getter for the mode how to deal with redirects
* @return the redirectProcessingMode
*/
public RedirectProcessingMode getRedirectProcessingMode() {
return redirectProcessingMode;
}
/**
* Getter for the read only mappings of type mappings
* @return the type mappings (read only)
*/
public Map<String,IRI> getTypeMappings() {
return unmodTypeMappings;
}
/**
* Setter for the language of labels searched in addition to the current
* language of the text. Setting this to <code>null</code> (also the default)
* will cause to search labels without any defined language.<p>
* Changing this makes only sense if a dataset (such as dbpedia.org) adds
* language tags to labels even if they are typically used in any language.
* @param defaultLanguage the default language
*/
public void setDefaultLanguage(String defaultLanguage) {
this.defaultLanguage = defaultLanguage;
}
/**
* Getter for the language of labels searched in addition to the current
* language of the text.
* @return the default language
*/
public String getDefaultLanguage() {
return defaultLanguage;
}
/**
* Getter for the maximum number of non-processable tokens that are
* allowed to not match before no further tokens are matched against a label
* of an Entity. <p>
* This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
* as '.' is a non-processable token in the text that is missing in the
* label.
* @return the maxNotFound
*/
public int getMaxNotFound() {
return maxNotFound;
}
/**
* Setter for the maximum number of non-processable tokens that are
* allowed to not match before no further tokens are matched against a label
* of an Entity. <p>
* This allows e.g. to match "Dr. Richard Dogles" with "Dr Richard Dogles"
* as '.' is a non-processable token in the text that is missing in the
* label.
* @param maxNotFound the maxNotFound to set
*/
public void setMaxNotFound(int maxNotFound) {
if(maxNotFound < 0){
this.maxNotFound = DEFAULT_MAX_NOT_FOUND;
} else {
this.maxNotFound = maxNotFound;
}
}
/**
* Getter for the minimum token match Factor.
* If Tokens match is determined by comparing them using some algorithm.
* Results need to be in the range [0..1]. This factor defines the minimum
* similarity value so that a match is assumed. Not that this factor only
* is used for filtering out non-matching tokens. The similarity value will
* still used for calculating the confidence
* @return the minTokenMatchFactor
*/
public float getMinTokenMatchFactor() {
return minTokenMatchFactor;
}
/**
* Setter for the minimum token match Factor.
* If Tokens match is determined by comparing them using some algorithm.
* Results need to be in the range [0..1]. This factor defines the minimum
* similarity value so that a match is assumed. Not that this factor only
* is used for filtering out non-matching tokens. The similarity value will
* still used for calculating the confidence
* @param minTokenMatchFactor the minTokenMatchFactor to set
*/
public void setMinTokenMatchFactor(float minTokenMatchFactor) {
if(minTokenMatchFactor < 0 ){
this.minTokenMatchFactor = DEFAULT_MIN_TOKEN_MATCH_FACTOR;
} else if(minTokenMatchFactor == 0 || minTokenMatchFactor > 1){
throw new IllegalArgumentException("minimum Token Match Facter MUST be > 0 <= 1 (parsed: "+minTokenMatchFactor+")!");
} else {
this.minTokenMatchFactor = minTokenMatchFactor;
}
}
}