/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.tools.featurizer;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.cogroo.tools.chunker2.TokenTag;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.Event;
import opennlp.tools.ml.model.MaxentModel;
import opennlp.tools.ml.model.SequenceClassificationModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.TrainingParameters;
/**
* A featurizer backed by a sequence classification model. Given the tokens of
* a sentence and their tags, it decodes the best sequence of outcomes and can
* report the probabilities of the last decoded sequence.
*/
public class FeaturizerME implements Featurizer {

  /** Number of sequences requested from the model by the {@code topKSequences} methods. */
  public static final int DEFAULT_BEAM_SIZE = 10;

  private final FeaturizerContextGenerator contextGenerator;
  private final SequenceValidator<TokenTag> sequenceValidator;

  /**
   * Result of the most recent {@link #featurize(String[], String[])} call;
   * {@code null} until that method has been invoked at least once.
   */
  private Sequence bestSequence;

  /**
   * The sequence model used to assign outcomes to a sequence of tagged tokens.
   */
  protected SequenceClassificationModel<TokenTag> model;

  /**
   * Initializes the current instance with the specified model. The context
   * generator and the sequence validator are obtained from the model's
   * factory.
   *
   * @param model
   *          the featurizer model providing the sequence model and the factory
   */
  public FeaturizerME(FeaturizerModel model) {
    // Look the factory up once and reuse it for both collaborators.
    FeaturizerFactory factory = model.getFactory();
    this.model = model.getChunkerSequenceModel();
    this.contextGenerator = factory.getFeaturizerContextGenerator();
    this.sequenceValidator = factory.getSequenceValidator();
  }

  /**
   * Decodes the best outcome sequence for the given tokens and tags. The
   * decoded sequence is remembered so that {@link #probs()} and
   * {@link #probs(double[])} can report its probabilities afterwards.
   *
   * @param toks
   *          the tokens of the sentence
   * @param tags
   *          the tags of the tokens
   * @return the outcomes of the best sequence found by the model
   */
  public String[] featurize(String[] toks, String[] tags) {
    bestSequence = model.bestSequence(TokenTag.create(toks, tags), null,
        contextGenerator, sequenceValidator);
    List<String> outcomes = bestSequence.getOutcomes();
    return outcomes.toArray(new String[outcomes.size()]);
  }

  /**
   * Asks the model for its best sequences for the given tokens and tags,
   * requesting {@link #DEFAULT_BEAM_SIZE} sequences.
   *
   * @param sentence
   *          the tokens of the sentence
   * @param tags
   *          the tags of the tokens
   * @return the sequences returned by the model
   */
  public Sequence[] topKSequences(String[] sentence, String[] tags) {
    return model.bestSequences(DEFAULT_BEAM_SIZE,
        TokenTag.create(sentence, tags), new Object[] {},
        contextGenerator, sequenceValidator);
  }

  /**
   * Asks the model for its best sequences for the given tokens and tags,
   * requesting {@link #DEFAULT_BEAM_SIZE} sequences and forwarding a minimum
   * sequence score to the model.
   *
   * @param sentence
   *          the tokens of the sentence
   * @param tags
   *          the tags of the tokens
   * @param minSequenceScore
   *          the minimum sequence score passed on to the model
   * @return the sequences returned by the model
   */
  public Sequence[] topKSequences(String[] sentence, String[] tags,
      double minSequenceScore) {
    return model.bestSequences(DEFAULT_BEAM_SIZE,
        TokenTag.create(sentence, tags), new Object[] {}, minSequenceScore,
        contextGenerator, sequenceValidator);
  }

  /**
   * Populates the specified array with the probabilities of the last decoded
   * sequence. The sequence was determined based on the previous call to
   * <code>featurize</code>. The specified array should be at least as large as
   * the number of tokens in the previous call to <code>featurize</code>.
   *
   * @param probs
   *          An array used to hold the probabilities of the last decoded
   *          sequence.
   * @throws IllegalStateException
   *           if <code>featurize</code> has not been called yet
   */
  public void probs(double[] probs) {
    lastDecodedSequence().getProbs(probs);
  }

  /**
   * Returns an array with the probabilities of the last decoded sequence. The
   * sequence was determined based on the previous call to
   * <code>featurize</code>.
   *
   * @return An array with the same number of probabilities as tokens were sent
   *         to <code>featurize</code> when it was last called.
   * @throws IllegalStateException
   *           if <code>featurize</code> has not been called yet
   */
  public double[] probs() {
    return lastDecodedSequence().getProbs();
  }

  /**
   * Returns the sequence decoded by the last <code>featurize</code> call,
   * failing with a descriptive error instead of a bare null dereference when
   * nothing has been decoded yet.
   */
  private Sequence lastDecodedSequence() {
    if (bestSequence == null) {
      throw new IllegalStateException(
          "featurize must be called before requesting probabilities");
    }
    return bestSequence;
  }

  /**
   * Trains a featurizer model from the given samples. The samples are turned
   * into an event stream using the factory's context generator, and an event
   * trainer selected from the training parameters is run on that stream.
   *
   * @param lang
   *          the language passed on to the resulting model
   * @param in
   *          the stream of training samples
   * @param mlParams
   *          the training parameters whose settings select and configure the
   *          trainer
   * @param factory
   *          the factory supplying the context generator; it is also stored in
   *          the resulting model
   * @return the trained featurizer model
   * @throws IOException
   *           if training fails with an I/O error
   */
  public static FeaturizerModel train(String lang,
      ObjectStream<FeatureSample> in,
      TrainingParameters mlParams,
      FeaturizerFactory factory) throws IOException {

    // Shared with the trainer factory and then handed to the resulting model.
    Map<String, String> manifestInfoEntries = new HashMap<String, String>();

    ObjectStream<Event> es = new FeaturizerEventStream(in,
        factory.getFeaturizerContextGenerator());

    EventTrainer trainer = TrainerFactory.getEventTrainer(
        mlParams.getSettings(), manifestInfoEntries);

    MaxentModel maxentModel = trainer.train(es);

    return new FeaturizerModel(lang, maxentModel, manifestInfoEntries, factory);
  }
}