/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.nlp4j.internal;

import static java.util.Arrays.asList;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.w3c.dom.Document;
import org.w3c.dom.Element;

import de.tudarmstadt.ukp.dkpro.core.api.resources.ResourceUtils;
import edu.emory.mathcs.nlp.common.collection.tree.PrefixTree;
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.component.template.OnlineComponent;
import edu.emory.mathcs.nlp.component.template.feature.FeatureItem;
import edu.emory.mathcs.nlp.component.template.feature.Field;
import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexica;
import edu.emory.mathcs.nlp.component.template.lexicon.GlobalLexicon;
import edu.emory.mathcs.nlp.component.template.node.NLPNode;

/**
 * Utility methods for setting up the Emory NLP4J global lexica and for inspecting the
 * features used by an NLP4J {@link OnlineComponent}.
 */
public class EmoryNlpUtils
{
    private static GlobalLexica<NLPNode> lexica;

    /**
     * Loads the English global lexica (ambiguity classes, Brown clusters, named entity
     * gazetteers, and word embeddings) from the classpath. Safe to call repeatedly; the
     * lexica are only loaded once.
     */
    public static synchronized void initGlobalLexica()
        throws IOException, ParserConfigurationException
    {
        if (lexica != null) {
            return;
        }

        // Cf. classpath:/edu/emory/mathcs/nlp/configuration/config-decode-en.xml
        String LEXICA_PREFIX = "classpath:/edu/emory/mathcs/nlp/lexica/";

        // GlobalLexica expects an XML configuration element; since the lexica are set
        // programmatically below, a dummy element is sufficient here.
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        Document xmlDoc = builder.newDocument();
        Element root = xmlDoc.createElement("dummy");
        lexica = new GlobalLexica<>(root);

        lexica.setAmbiguityClasses(new GlobalLexicon<Map<String, List<String>>>(
                loadLexicon(LEXICA_PREFIX + "en-ambiguity-classes-simplified-lowercase.xz"),
                Field.word_form_simplified_lowercase,
                "en-ambiguity-classes-simplified-lowercase"));

        lexica.setWordClusters(new GlobalLexicon<Map<String, Set<String>>>(
                loadLexicon(LEXICA_PREFIX + "en-brown-clusters-simplified-lowercase.xz"),
                Field.word_form_simplified_lowercase,
                "en-brown-clusters-simplified-lowercase"));

        lexica.setNamedEntityGazetteers(new GlobalLexicon<PrefixTree<String, Set<String>>>(
                loadLexicon(LEXICA_PREFIX + "en-named-entity-gazetteers-simplified.xz"),
                Field.word_form_simplified,
                "en-named-entity-gazetteers-simplified"));

        lexica.setWordEmbeddings(new GlobalLexicon<Map<String, float[]>>(
                loadLexicon(LEXICA_PREFIX + "en-word-embeddings-undigitalized.xz"),
                Field.word_form_undigitalized,
                "en-word-embeddings-undigitalized"));

        // lexica.setStopWords(
        //         loadLexicon(LEXICA_PREFIX + "en-stop-words-simplified-lowercase.xz"));
    }

    /**
     * Annotates the given nodes with the information from the global lexica.
     */
    public static void assignGlobalLexica(NLPNode[] aNodes)
    {
        lexica.process(aNodes);
    }

    /**
     * Deserializes an XZ-compressed lexicon from the given location.
     */
    @SuppressWarnings("unchecked")
    private static <T> T loadLexicon(String aLocation)
        throws IOException
    {
        try (ObjectInputStream is = IOUtils.createObjectXZBufferedInputStream(
                ResourceUtils.resolveLocation(aLocation).openStream())) {
            return (T) is.readObject();
        }
        catch (ClassNotFoundException e) {
            throw new IOException(e);
        }
    }

    /**
     * Collects the names of all feature fields used by the feature template of the given
     * component.
     */
    public static Set<String> extractFeatures(OnlineComponent<?, ?> component)
        throws IllegalAccessException
    {
        Set<String> features = new HashSet<String>();

        for (FeatureItem f : component.getFeatureTemplate().getSetFeatureList()) {
            features.add(f.field.name());
        }

        for (FeatureItem f : component.getFeatureTemplate().getEmbeddingFeatureList()) {
            features.add(f.field.name());
        }

        for (FeatureItem[] fl : component.getFeatureTemplate().getFeatureList()) {
            for (FeatureItem f : fl) {
                features.add(f.field.name());
            }
        }

        return features;
    }

    /**
     * Returns the feature fields used by the given component which are not covered by the
     * global lexica, by NLPNode itself, or by upstream components such as a POS tagger or
     * lemmatizer. Additional known-supported features can be passed via {@code aExtra}.
     */
    public static Set<String> extractUnsupportedFeatures(OnlineComponent<?, ?> component,
            String... aExtra)
        throws IllegalAccessException
    {
        Set<String> features = extractFeatures(component);

        Set<String> unsupportedFeatures = new HashSet<String>(features);

        // This is generated in FeatureTemplate.getPositionFeatures
        unsupportedFeatures.remove("positional");

        // This is generated in FeatureTemplate.getOrthographicFeatures
        // FIXME There is a special handling for hyperlinks which we likely do not support!
        unsupportedFeatures.remove("orthographic");
        unsupportedFeatures.remove("orthographic_lowercase");

        // This is generated in FeatureTemplate.getPrefix / getSuffix
        unsupportedFeatures.remove("prefix");
        unsupportedFeatures.remove("suffix");

        // The following are created internally in NLPNode.setWordForm()
        unsupportedFeatures.remove("word_form");
        unsupportedFeatures.remove("word_form_simplified");
        unsupportedFeatures.remove("word_form_undigitalized");
        unsupportedFeatures.remove("word_form_simplified_lowercase");

        // These are handled internally in NLPNode
        unsupportedFeatures.remove("word_shape");

        // These are handled by GlobalLexica.assignGlobalLexica()
        unsupportedFeatures.remove("ambiguity_classes");
        unsupportedFeatures.remove("word_clusters");
        unsupportedFeatures.remove("named_entity_gazetteers");
        unsupportedFeatures.remove("word_embedding");

        // We know the POS tag if a POS tagger ran before
        unsupportedFeatures.remove("part_of_speech_tag");

        // We know the lemma if a lemmatizer ran before
        unsupportedFeatures.remove("lemma");

        unsupportedFeatures.removeAll(asList(aExtra));

        return unsupportedFeatures;
    }
}
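// A minimal usage sketch; "component" is a placeholder for an NLP4J OnlineComponent loaded
// elsewhere (e.g. by a DKPro Core NLP4J wrapper), and "nodes" for an NLPNode array built
// from a tokenized sentence:
//
//     EmoryNlpUtils.initGlobalLexica();
//
//     Set<String> unsupported = EmoryNlpUtils.extractUnsupportedFeatures(component);
//     if (!unsupported.isEmpty()) {
//         // Fail early if the model relies on features this integration cannot provide
//         throw new IllegalStateException("Unsupported features: " + unsupported);
//     }
//
//     EmoryNlpUtils.assignGlobalLexica(nodes);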