/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.engines.entitylinking.config; import java.lang.reflect.InvocationTargetException; import java.util.Arrays; import java.util.Collections; import java.util.Dictionary; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.stanbol.enhancer.engines.entitylinking.engine.EntityLinkingEngine; import org.apache.stanbol.enhancer.nlp.NlpAnnotations; import org.apache.stanbol.enhancer.nlp.model.Token; import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; import org.apache.stanbol.enhancer.nlp.pos.Pos; import org.apache.stanbol.enhancer.nlp.pos.PosTag; import org.apache.stanbol.enhancer.nlp.utils.LanguageConfiguration; import org.osgi.service.cm.ConfigurationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class TextProcessingConfig { private static final Logger log = LoggerFactory.getLogger(TextProcessingConfig.class); /** * Holds a list of ISO 2 letter language codes that do use unicase scripts - * do not know upper case letters.<p> * More information is available the Wikipedia page for * <a href="http://en.wikipedia.org/wiki/Letter_case">Letter case</a>. */ public static final Set<String> UNICASE_SCRIPT_LANUAGES; static { UNICASE_SCRIPT_LANUAGES = Collections.unmodifiableSet(new HashSet<String>(Arrays.asList( "ar","he","zh","ja","ko","ka","hi","ne"))); } /** * If enabled only {@link Pos#ProperNoun}, {@link Pos#Foreign} and {@link Pos#Acronym} are Matched. If * deactivated all Tokens with the category {@link LexicalCategory#Noun} and * {@link LexicalCategory#Residual} are considered for matching.<p> * This property allows an easy configuration of the matching that is sufficient for most usage scenarios. * Users that need to have more control can configure language specific mappings by using * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and * {@link #PARAM_POS_PROBABILITY} in combination with the {@link #PROCESSED_LANGUAGES} * configuration.<p> * The {@link #DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE default} if this is <code>false</code> */ public static final String PROCESS_ONLY_PROPER_NOUNS_STATE = "enhancer.engines.linking.properNounsState"; /** * Default for the {@link #PROCESS_ONLY_PROPER_NOUNS_STATE} (false) */ public static final boolean DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE = false; /** * Switch that allows to enable a mode where only upper case tokens are marked as * 'linkable' if no POS tag is available (or existing POS tags are of low probability).<p> * This is especially usefull for processing text in languages where no POS tagger is * available.<p> * NOTE: that this configuration is ignored for lanugages where there are no * upper case letters (Arabic, Hebrew, Chinese, Japanese, Korean, Hindi) */ public static final String LINK_ONLY_UPPER_CASE_TOKENS_WITH_MISSING_POS_TAG = "enhancer.engines.linking.linkOnlyUpperCaseTokensWithMissingPosTag"; /** * Allows to configure the processed languages by using the syntax supported by {@link LanguageConfiguration}. * In addition this engine supports language specific configurations for matched {@link LexicalCategory} * {@link Pos} and String POS tags as well as Pos annotation probabilities by using the parameters * {@link #PARAM_LEXICAL_CATEGORIES}, {@link #PARAM_POS_TYPES}, {@link #PARAM_POS_TAG} and * {@link #PARAM_POS_PROBABILITY}.<p> * See the documentation of {@link LanguageConfiguration} for details of the Syntax. */ public static final String PROCESSED_LANGUAGES = "enhancer.engines.linking.processedLanguages"; /** * The minimum length of Token to be used for searches in case no * POS (Part of Speech) tags are available. */ public static final int DEFAULT_MIN_SEARCH_TOKEN_LENGTH = 3; /** * Used as fallback in case a {@link Token} does not have a {@link PosTag} or * {@link NlpAnnotations#POS_ANNOTATION POS annotations} do have a low confidence. * In such cases only words that are longer than this value will be considerd for * linking */ public static final String MIN_SEARCH_TOKEN_LENGTH = "enhancer.engines.linking.minSearchTokenLength"; /** * The minimum length of labels that are looked-up in the directory */ private int minSearchTokenLength = DEFAULT_MIN_SEARCH_TOKEN_LENGTH; /* * Parameters used for language specific text processing configurations */ // (1) PHRASE level /** * Allows to configure the processed Chunk type (the default is * <code>cc={@link LexicalCategory#Noun Noun}</code> to process only * Noun Phrases). If set to <code>cc</code> (empty value) processing * of chunks is deactivated. */ public static final String PARAM_PHRASE_CATEGORIES = "pc"; public static final String PARAM_PHRASE_TAG = "ptag"; public static final String PARAM_PHRASE_PROBABILITY = "pprob"; public static final String PARAM_LINK_MULTI_MATCHABLE_TOKEN_IN_PHRASE = "lmmtip"; //(2) TOKEN level public static final String PARAM_LEXICAL_CATEGORIES = "lc"; public static final String PARAM_POS_TYPES = "pos"; public static final String PARAM_POS_TAG = "tag"; public static final String PARAM_POS_PROBABILITY = "prob"; public static final String PARAM_CHUNKABLE_CATEGORIES = "cc"; public static final String PARAM_CHUNKABLE_POS_TYPES = "cp"; public static final String PARAM_CHUNKABLE_TAGS = "ct"; /** * Parameter used to configure how to deal with upper case tokens */ public static final String PARAM_UPPER_CASE = "uc"; /** * Enumeration defining valued for the {@link EntityLinkingEngine#PARAM_UPPER_CASE} parameter */ public static enum UPPER_CASE_MODE {NONE,MATCH,LINK}; /** * The default state to dereference entities set to <code>true</code>. */ public static final boolean DEFAULT_DEREFERENCE_ENTITIES_STATE = true; /** * Default set of languages. This is an empty set indicating that texts in any * language are processed. */ public static final Set<String> DEFAULT_LANGUAGES = Collections.emptySet(); public static final double DEFAULT_MIN_POS_TAG_PROBABILITY = 0.6667; /** * The languages this engine is configured to enhance. An empty List is * considered as active for any language */ private LanguageConfiguration languages = new LanguageConfiguration(PROCESSED_LANGUAGES, // link multiple matchable tokens in chunks; link upper case words new String[]{"*;lmmtip;uc=LINK;prob=0.75;pprob=0.75", "de;uc=MATCH", //in German all Nouns are upper case }); private LanguageProcessingConfig defaultConfig; private Map<String,LanguageProcessingConfig> languageConfigs = new HashMap<String,LanguageProcessingConfig>(); public TextProcessingConfig(){ this.defaultConfig = new LanguageProcessingConfig(); } public LanguageProcessingConfig getDefaults(){ return defaultConfig; } /** * Getter for the language specific configuration. * @param language * @return the configuration sepcific to the parsed language or <code>null</code> * if none. */ public LanguageProcessingConfig getLanguageSpecificConfig(String language){ return languageConfigs.get(language); } /** * Creates a language specific configuration by copying the currently configured * defaults. * @param language the language * @return the specific configuration * @throws IllegalStateException if a language specific configuration for the * parsed language already exists. */ public LanguageProcessingConfig createLanguageSpecificConfig(String language){ if(languageConfigs.containsKey(language)){ throw new IllegalStateException("A specific configuration for the language '" +language+ "' does already exist!"); } LanguageProcessingConfig conf = defaultConfig.clone(); languageConfigs.put(language, conf); return conf; } /** * Removes the language specific configuration for the parsed language * @param language the language * @return the removed configuration */ public LanguageProcessingConfig removeLanguageSpecificConfig(String language){ return languageConfigs.remove(language); } /** * The {@link LanguageProcessingConfig} for the parsed language * or <code>null</code> if the language is not included in the * configuration. This will return the {@link #getDefaults()} if * the parsed language does not have a specific configuration.<p> * To obtain just language specific configuration use * {@link #getLanguageSpecificConfig(String)} * @param language the language * @return the configuration or <code>null</code> if the language is * not configured to be processed. */ public LanguageProcessingConfig getConfiguration(String language) { if(languages.isLanguage(language)){ LanguageProcessingConfig lpc = languageConfigs.get(language); return lpc == null ? defaultConfig : lpc; } else { return null; } } /** * Initialise the {@link TextAnalyzer} component.<p> * Currently this includes the following configurations: <ul> * <li>{@link #PROCESSED_LANGUAGES}: If no configuration is present the * default (process all languages) is used. * <li> {@value #MIN_POS_TAG_PROBABILITY}: If no configuration is * present the #DEFAULT_MIN_POS_TAG_PROBABILITY is used * languages based on the value of the * * @param configuration the OSGI component configuration */ public final static TextProcessingConfig createInstance(Dictionary<String,Object> configuration) throws ConfigurationException { TextProcessingConfig tpc = new TextProcessingConfig(); //Parse the Proper Noun Linking state Object value = configuration.get(PROCESS_ONLY_PROPER_NOUNS_STATE); boolean properNounState; if(value instanceof Boolean){ properNounState = ((Boolean)value).booleanValue(); } else if (value != null){ properNounState = Boolean.parseBoolean(value.toString()); } else { properNounState = DEFAULT_PROCESS_ONLY_PROPER_NOUNS_STATE; } if(properNounState){ tpc.defaultConfig.setLinkedLexicalCategories(Collections.EMPTY_SET); tpc.defaultConfig.setLinkedPos(LanguageProcessingConfig.DEFAULT_LINKED_POS); log.debug("> ProperNoun matching activated (matched Pos: {})", tpc.defaultConfig.getLinkedPos()); } else { tpc.defaultConfig.setLinkedLexicalCategories(LanguageProcessingConfig.DEFAULT_LINKED_LEXICAL_CATEGORIES); tpc.defaultConfig.setLinkedPos(Collections.EMPTY_SET); log.debug("> Noun matching activated (matched LexicalCategories: {})", tpc.defaultConfig.getLinkedLexicalCategories()); } //parse upper case linking for languages without POS support state //see STANBOL-1049 value = configuration.get(LINK_ONLY_UPPER_CASE_TOKENS_WITH_MISSING_POS_TAG); final Boolean linkOnlyUpperCaseTokensWithMissingPosTag; if(value instanceof Boolean){ tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(((Boolean)value).booleanValue()); } else if(value != null){ tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(Boolean.parseBoolean(value.toString())); } else { //the default is the same as the properNounState tpc.defaultConfig.setLinkOnlyUpperCaseTokenWithUnknownPos(properNounState); } // init MIN_SEARCH_TOKEN_LENGTH value = configuration.get(MIN_SEARCH_TOKEN_LENGTH); Integer minSearchTokenLength; if(value instanceof Integer){ minSearchTokenLength = (Integer)value; } else if (value != null){ try { minSearchTokenLength = Integer.valueOf(value.toString()); } catch(NumberFormatException e){ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0",e); } } else { minSearchTokenLength = null; } if(minSearchTokenLength != null){ if(minSearchTokenLength < 1){ throw new ConfigurationException(MIN_SEARCH_TOKEN_LENGTH, "Values MUST be valid Integer values > 0"); } tpc.defaultConfig.setMinSearchTokenLength(minSearchTokenLength); } //parse the language configuration value = configuration.get(PROCESSED_LANGUAGES); if(value instanceof String){ throw new ConfigurationException(PROCESSED_LANGUAGES, "Comma separated String " + "is not supported for configurung the processed languages for the because " + "the comma is used as separator for values of the parameters '" + PARAM_LEXICAL_CATEGORIES+"', '"+ PARAM_POS_TYPES+"'and'"+PARAM_POS_TAG + "! Users need to use String[] or Collection<?> instead!"); } tpc.languages.setConfiguration(configuration); Map<String,String> defaultConfig = tpc.languages.getDefaultParameters(); //apply the default parameters (parameter set for the '*' or '' (empty) language if(!defaultConfig.isEmpty()){ applyLanguageParameter(tpc.defaultConfig,null,defaultConfig); } //apply language specific configurations for(String lang : tpc.languages.getExplicitlyIncluded()){ LanguageProcessingConfig lpc = tpc.defaultConfig.clone(); applyLanguageParameter(lpc, lang, tpc.languages.getParameters(lang)); tpc.languageConfigs.put(lang, lpc); } return tpc; } private static void applyLanguageParameter(LanguageProcessingConfig tpc, String language, Map<String,String> config) throws ConfigurationException { log.info(" > parse language Configuration for language: {}", language == null ? "default":language); //parse Phrase level configuration Set<LexicalCategory> chunkCats = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_PHRASE_CATEGORIES, LexicalCategory.class); Set<String> chunkTags = parseStringTags(config.get(PARAM_PHRASE_TAG)); if(chunkCats.isEmpty() && config.containsKey(PARAM_PHRASE_CATEGORIES) && chunkTags.isEmpty()){ log.info(" + enable ignorePhrase"); tpc.setIgnoreChunksState(true); tpc.setProcessedPhraseCategories(Collections.EMPTY_SET); } else { tpc.setIgnoreChunksState(false); if(!chunkCats.isEmpty()){ log.info(" + set processable Phrase cat {}",chunkCats); tpc.setProcessedPhraseCategories(chunkCats); } else { log.info(" - use processable Phrase cats {}",tpc.getProcessedPhraseCategories()); } if(!chunkTags.isEmpty()) { log.info(" + set processable Phrase tags {}",chunkTags); tpc.setProcessedPhraseTags(chunkTags); } else { log.info(" - use processable Phrase tags {}",tpc.getProcessedPhraseTags()); } } Double chunkProb = parseNumber(config, PROCESSED_LANGUAGES, language, PARAM_PHRASE_PROBABILITY, Double.class); if(chunkProb != null || //if explicitly set config.containsKey(PARAM_PHRASE_PROBABILITY)){ //set to empty value (set default) log.info(" + set min ChunkTag probability: {}", chunkProb == null ? "default" : chunkProb); tpc.setMinPhraseAnnotationProbability(chunkProb); tpc.setMinExcludePhraseAnnotationProbability(chunkProb == null ? null : chunkProb/2); } else { log.info(" - use min PhraseTag probability: {}",tpc.getMinPhraseAnnotationProbability()); } //link multiple matchable Tokens within Chunks Boolean lmmticState = parseState(config, PARAM_LINK_MULTI_MATCHABLE_TOKEN_IN_PHRASE); if(lmmticState != null){ log.info(" + set the link multi matchable tokens in Phrase state to : {}",lmmticState); tpc.setLinkMultiMatchableTokensInChunkState(lmmticState); } else { log.info(" - use the link multi matchable tokens in Phrase state to : {}",tpc.isLinkMultiMatchableTokensInChunk()); } //parse Token level configuration Set<LexicalCategory> lexCats = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_LEXICAL_CATEGORIES, LexicalCategory.class); Set<Pos> pos = parseEnumParam(config, PROCESSED_LANGUAGES, language,PARAM_POS_TYPES, Pos.class); Set<String> tags = parseStringTags(config.get(PARAM_POS_TAG)); if(config.containsKey(PARAM_LEXICAL_CATEGORIES) || config.containsKey(PARAM_POS_TYPES) || config.containsKey(PARAM_POS_TAG)){ log.info(" + set Linkable Tokens: cat: {}, pos: {}, tags {}", new Object[]{lexCats,pos,tags}); tpc.setLinkedLexicalCategories(lexCats); tpc.setLinkedPos(pos); tpc.setLinkedPosTags(tags); } else { log.info(" - use Linkable Tokens: cat: {}, pos: {}, tags {}", new Object[]{tpc.getLinkedLexicalCategories(), tpc.getLinkedPos(), tpc.getLinkedPos()}); } //min POS tag probability Double prob = parseNumber(config,PROCESSED_LANGUAGES,language, PARAM_POS_PROBABILITY,Double.class); if(prob != null || //explicitly set config.containsKey(PARAM_POS_PROBABILITY)){ //set to empty value (set default) log.info(" + set minimum POS tag probability: {}", prob == null ? "default" : prob); tpc.setMinPosAnnotationProbability(prob); tpc.setMinExcludePosAnnotationProbability(prob == null ? null : prob/2d); } else { log.info(" - use minimum POS tag probability: {}", tpc.getMinPosAnnotationProbability()); } //parse upper case Set<UPPER_CASE_MODE> ucMode = parseEnumParam(config, PROCESSED_LANGUAGES,language,PARAM_UPPER_CASE,UPPER_CASE_MODE.class); if(ucMode.size() > 1){ throw new ConfigurationException(PROCESSED_LANGUAGES, "Parameter 'uc' (Upper case mode) MUST NOT be multi valued (langauge: " +(language == null ? "default":language)+", parsed value='"+config.get(PARAM_UPPER_CASE)+"')!"); } if(!ucMode.isEmpty()){ UPPER_CASE_MODE mode = ucMode.iterator().next(); log.info(" + set upper case token mode to {}", mode); switch (mode) { case NONE: tpc.setMatchUpperCaseTokensState(false); tpc.setLinkUpperCaseTokensState(false); break; case MATCH: tpc.setMatchUpperCaseTokensState(true); tpc.setLinkUpperCaseTokensState(false); break; case LINK: tpc.setMatchUpperCaseTokensState(true); tpc.setLinkUpperCaseTokensState(true); break; default: log.warn("Unsupported {} entry {} -> set defaults",UPPER_CASE_MODE.class.getSimpleName(),mode); tpc.setMatchUpperCaseTokensState(null); tpc.setLinkUpperCaseTokensState(null); break; } } else { log.info(" - use upper case token mode: match={}, link={}", tpc.isMatchUpperCaseTokens(), tpc.isLinkUpperCaseTokens()); } //apply chunkable parameters (STANBOL-1117) if(config.containsKey(PARAM_CHUNKABLE_CATEGORIES)){ Set<LexicalCategory> chunkableCategories = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_CHUNKABLE_CATEGORIES, LexicalCategory.class); log.info(" ... set chunkable Categories to {}", chunkableCategories); tpc.setChunkableCategories(chunkableCategories); } if(config.containsKey(PARAM_CHUNKABLE_POS_TYPES)){ Set<Pos> chunkablePos = parseEnumParam(config, PROCESSED_LANGUAGES, language, PARAM_CHUNKABLE_POS_TYPES, Pos.class); log.info(" ... set chunkable POS tags to {}", chunkablePos); tpc.setChunkablePos(chunkablePos); } if(config.containsKey(PARAM_CHUNKABLE_TAGS)){ Set<String> chunkableTags = parseStringTags(config.get(PARAM_CHUNKABLE_TAGS)); log.info(" ... set chunkable String tags to {}", chunkableTags); tpc.setChunkableTags(chunkableTags); } } private static Boolean parseState(Map<String,String> config, String param){ String value = config.get(param); return value == null && config.containsKey(param) ? Boolean.TRUE : value != null ? Boolean.valueOf(value) : null; } private static <T extends Number> T parseNumber(Map<String,String> config, String property, String language, String param, Class<T> clazz) throws ConfigurationException { String paramVal = config.get(PARAM_POS_PROBABILITY); if(paramVal != null && !paramVal.trim().isEmpty()){ try { //all Number subclasses do have a String constructor! return clazz.getConstructor(String.class).newInstance(paramVal.trim()); } catch (NumberFormatException e) { throw new ConfigurationException(property, "Unable to parse " + clazz.getSimpleName()+" from Parameter '" + PARAM_POS_PROBABILITY+"="+paramVal.trim() + "' from the "+(language == null ? "default" : language) + " language configuration", e); } catch (IllegalArgumentException e) { throw new IllegalStateException("Unable to create new "+clazz.getSimpleName() +"("+paramVal.trim()+"::String)",e); } catch (SecurityException e) { throw new IllegalStateException("Unable to create new "+clazz.getSimpleName() +"("+paramVal.trim()+"::String)",e); } catch (InstantiationException e) { throw new IllegalStateException("Unable to create new "+clazz.getSimpleName() +"("+paramVal.trim()+"::String)",e); } catch (IllegalAccessException e) { throw new IllegalStateException("Unable to create new "+clazz.getSimpleName() +"("+paramVal.trim()+"::String)",e); } catch (InvocationTargetException e) { throw new IllegalStateException("Unable to create new "+clazz.getSimpleName() +"("+paramVal.trim()+"::String)",e); } catch (NoSuchMethodException e) { throw new IllegalStateException("Unable to create new "+clazz.getSimpleName() +"("+paramVal.trim()+"::String)",e); } } return null; } private static Set<String> parseStringTags(String value) { if(value == null || value.isEmpty()){ return Collections.emptySet(); } else { Set<String> tags = new HashSet<String>(); for(String entry : value.split(",")){ entry = entry.trim(); if(!entry.isEmpty()){ tags.add(entry); } } return tags; } } /** * Utility to parse Enum members out of a comma separated string * @param config the config * @param property the property (only used for error handling) * @param param the key of the config used to obtain the config * @param enumClass the {@link Enum} class * @return the configured members of the Enum or an empty set if none * @throws ConfigurationException if a configured value was not part of the enum */ private static <T extends Enum<T>> Set<T> parseEnumParam(Map<String,String> config, String property, String language, //params used for logging String param,Class<T> enumClass) throws ConfigurationException { Set<T> enumSet; String val = config.get(param); if(val == null){ enumSet = Collections.emptySet(); } else { enumSet = EnumSet.noneOf(enumClass); for(String entry : val.split(",")){ entry = entry.trim(); if(!entry.isEmpty()){ try { enumSet.add(Enum.valueOf(enumClass,entry.toString())); } catch (IllegalArgumentException e) { throw new ConfigurationException(property, "'"+entry +"' of param '"+param+"' for language '" + (language == null ? "default" : language) + "'is not a member of the enum "+ enumClass.getSimpleName() + "(configured : '"+val+"')!" ,e); } } } } return enumSet; } }