/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.component.mode.ner; import java.io.Serializable; import java.util.Set; import edu.emory.clir.clearnlp.collection.map.IntObjectHashMap; import edu.emory.clir.clearnlp.collection.ngram.Bigram; import edu.emory.clir.clearnlp.collection.pair.ObjectIntPair; import edu.emory.clir.clearnlp.collection.tree.PrefixTree; import edu.emory.clir.clearnlp.component.utils.NLPUtils; import edu.emory.clir.clearnlp.dependency.DEPNode; import edu.emory.clir.clearnlp.dependency.DEPTree; import edu.emory.clir.clearnlp.ner.NERInfoSet; import edu.emory.clir.clearnlp.util.Joiner; import edu.emory.clir.clearnlp.util.Splitter; import edu.emory.clir.clearnlp.util.constant.StringConst; /** * @since 3.0.3 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class NERLexicon implements Serializable { private static final long serialVersionUID = 3816259878124239839L; private PrefixTree<String,NERInfoSet> ner_dictionary; private Bigram<String,String> dict_counts; private Set<String> collect_labels; private int collect_cutoff; public NERLexicon(NERConfiguration configuration) { // setDictionaryCutoff(configuration.getCollectCutoff()); // collect_labels = configuration.getCollectLabelSet(); dict_counts = new Bigram<>(); if (configuration.getDictionaryPath() != null) setDictionary(NLPUtils.getNERDictionary(configuration.getDictionaryPath())); else setDictionary(new PrefixTree<>()); } public void collect(DEPTree tree) { DEPNode[] nodes = tree.toNodeArray(); IntObjectHashMap<String> map = NERState.collectNamedEntityMap(nodes, DEPNode::getNamedEntityTag); int bIdx, eIdx, size = tree.size(); for (ObjectIntPair<String> p : map) { bIdx = p.i / size; eIdx = p.i % size; if (collect_labels.contains(p.o)) dict_counts.add(p.o, Joiner.join(nodes, StringConst.SPACE, bIdx, eIdx+1, DEPNode::getWordForm)); } } public void populateDictionary() { NERInfoSet set; String[] array; for (String type : dict_counts.getBigramSet()) { for (ObjectIntPair<String> p : dict_counts.toList(type, collect_cutoff)) { array = Splitter.splitSpace(p.o); set = NERState.pick(ner_dictionary, type, array, 0, array.length, String::toString, p.i); set.addCorrectCount(p.i); } } dict_counts = null; } public PrefixTree<String,NERInfoSet> getDictionary() { return ner_dictionary; } public void setDictionary(PrefixTree<String,NERInfoSet> dictionary) { ner_dictionary = dictionary; } public int getDictionaryCutoff() { return collect_cutoff; } public void setDictionaryCutoff(int cutoff) { collect_cutoff = cutoff; } }