/*
* Copyright 2007 T-Rank AS
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package no.trank.openpipe.lemmatizer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import no.trank.openpipe.api.MultiInputOutputFieldPipelineStep;
import no.trank.openpipe.api.PipelineException;
import no.trank.openpipe.api.document.AnnotatedField;
import no.trank.openpipe.api.document.Annotation;
import no.trank.openpipe.api.document.Document;
import no.trank.openpipe.api.document.PreResolvedAnnotation;
import no.trank.openpipe.api.document.ResolvedAnnotation;
import no.trank.openpipe.config.annotation.NotEmpty;
import no.trank.openpipe.config.annotation.NotNull;
import no.trank.openpipe.lemmatizer.model.LemmatizeModel;
/**
* A step for lemmatization by expansion.
* <p>
* This step requires a pre-tokenized field carrying annotations of type
* {@link #getInputAnnotationType() inputAnnotationType}.
* <p>
* The output of this step is a list of {@link PreResolvedAnnotation}s whose
* {@link PreResolvedAnnotation#getStartPos() startPos} and {@link PreResolvedAnnotation#getEndPos() endPos}
* are the same as those of the original {@link Annotation}.
*
* @version $Revision$
*/
public class LemmatizerStep extends MultiInputOutputFieldPipelineStep {
    /** Type of the annotations read from each input field. Defaults to <tt>"word"</tt>. */
    @NotEmpty
    private String inputAnnotationType = "word";
    /** Type under which the produced annotations are stored. Defaults to <tt>"lemma"</tt>. */
    @NotEmpty
    private String outputAnnotationType = "lemma";
    /** Model that is asked for the lemmas of every input annotation value. */
    @NotNull
    private LemmatizeModel model;
    /**
     * Selects how the produced annotations are stored on the field: <tt>set(...)</tt> when <tt>true</tt>,
     * <tt>add(...)</tt> when <tt>false</tt> (the default).
     */
    private boolean overwriteOutputAnnotation;
    /**
     * When <tt>true</tt>, an input annotation for which the model yields no lemma is itself put into the output.
     * Defaults to <tt>false</tt>.
     */
    private boolean includeInputAnnotation;

    /**
     * Creates a lemmatizer step registered under the name <tt>"Lemmatizer"</tt>.
     */
    public LemmatizerStep() {
        // NOTE(review): the meaning of the boolean flag is defined by the superclass constructor.
        super("Lemmatizer", true);
    }

    /**
     * Lemmatizes the given input fields.
     * <p>
     * For every field, each annotation of type {@link #getInputAnnotationType() inputAnnotationType} is expanded
     * through the model, and the collected result is stored on that same field under
     * {@link #getOutputAnnotationType() outputAnnotationType}. A field without any annotation of the input type is
     * left untouched.
     *
     * @param doc the document being processed. Not used by this step.
     * @param inputFieldName the name of the input field. Not used by this step.
     * @param inputFields the fields whose annotations are lemmatized.
     * @param outputFieldName the name of the output field. Not used by this step.
     *
     * @throws PipelineException declared by the overridden method; not thrown directly by this implementation.
     */
    @Override
    protected void process(Document doc, String inputFieldName, List<AnnotatedField> inputFields,
                           String outputFieldName) throws PipelineException {
        for (AnnotatedField field : inputFields) {
            final ListIterator<ResolvedAnnotation> terms = field.iterator(inputAnnotationType);
            if (!terms.hasNext()) {
                // Nothing to lemmatize: do not touch the field's output annotations at all.
                continue;
            }
            final List<ResolvedAnnotation> lemmas = new ArrayList<ResolvedAnnotation>();
            while (terms.hasNext()) {
                addAnnotations(terms.next(), lemmas);
            }
            if (overwriteOutputAnnotation) {
                field.set(outputAnnotationType, lemmas);
            } else {
                field.add(outputAnnotationType, lemmas);
            }
        }
    }

    /**
     * Appends the annotations produced for a single input annotation.
     * <p>
     * Every value returned by the model for the term's value becomes a {@link PreResolvedAnnotation} with the
     * term's start and end position. If the model returns nothing, the term itself is appended, but only when
     * {@link #isIncludeInputAnnotation() includeInputAnnotation} is enabled.
     *
     * @param term the input annotation to expand.
     * @param anns the list the resulting annotations are appended to.
     */
    private void addAnnotations(ResolvedAnnotation term, List<ResolvedAnnotation> anns) {
        final Iterator<String> lemmas = model.get(term.getValue());
        if (!lemmas.hasNext()) {
            // No lemma known for this term: optionally fall back to the input annotation itself.
            if (includeInputAnnotation) {
                anns.add(term);
            }
            return;
        }
        while (lemmas.hasNext()) {
            anns.add(new PreResolvedAnnotation(term.getStartPos(), term.getEndPos(), lemmas.next()));
        }
    }

    /**
     * Prepares the superclass and then resets the model.
     *
     * @throws PipelineException if the superclass preparation fails.
     */
    @Override
    public void prepare() throws PipelineException {
        super.prepare();
        model.reset();
    }

    /**
     * Asks the model to log, then delegates to the superclass.
     *
     * @param success passed on unchanged to the superclass.
     *
     * @throws PipelineException if the superclass fails to finish.
     */
    @Override
    public void finish(boolean success) throws PipelineException {
        model.log();
        super.finish(success);
    }

    /**
     * Gets the model used for lemmatization.
     *
     * @return the model used for lemmatization.
     */
    public LemmatizeModel getModel() {
        return model;
    }

    /**
     * Sets the model used for lemmatization.
     *
     * @param model the model used for lemmatization. <i>Cannot</i> be <tt>null</tt>.
     */
    public void setModel(LemmatizeModel model) {
        this.model = model;
    }

    /**
     * Gets the type of annotations used for input. <br/>
     * Default value is <tt>"word"</tt>.
     *
     * @return the type of annotations used for input.
     */
    public String getInputAnnotationType() {
        return inputAnnotationType;
    }

    /**
     * Sets the type of annotations used for input.
     *
     * @param inputAnnotationType the type of annotations used for input.
     *
     * @see #getInputAnnotationType()
     */
    public void setInputAnnotationType(String inputAnnotationType) {
        this.inputAnnotationType = inputAnnotationType;
    }

    /**
     * Gets the type of annotations used for output. <br/>
     * Default value is <tt>"lemma"</tt>.
     *
     * @return the type of annotations used for output.
     */
    public String getOutputAnnotationType() {
        return outputAnnotationType;
    }

    /**
     * Sets the type of annotations used for output.
     *
     * @param outputAnnotationType the type of annotations used for output.
     *
     * @see #getOutputAnnotationType()
     */
    public void setOutputAnnotationType(String outputAnnotationType) {
        this.outputAnnotationType = outputAnnotationType;
    }

    /**
     * Tells how the produced annotations are stored on a field. <br/>
     * Default value is <tt>false</tt>.
     *
     * @return <tt>true</tt> if the result is stored with <tt>set(...)</tt>, <tt>false</tt> if it is stored with
     * <tt>add(...)</tt>.
     */
    public boolean isOverwriteOutputAnnotation() {
        return overwriteOutputAnnotation;
    }

    /**
     * Sets how the produced annotations are stored on a field.
     *
     * @param overwriteOutputAnnotation <tt>true</tt> to store the result with <tt>set(...)</tt>, <tt>false</tt> to
     * store it with <tt>add(...)</tt>.
     *
     * @see #isOverwriteOutputAnnotation()
     */
    public void setOverwriteOutputAnnotation(boolean overwriteOutputAnnotation) {
        this.overwriteOutputAnnotation = overwriteOutputAnnotation;
    }

    /**
     * Tells whether an input annotation without any lemma is copied to the output. <br/>
     * Default value is <tt>false</tt>.
     *
     * @return <tt>true</tt> if input annotations for which the model yields nothing are put into the output.
     */
    public boolean isIncludeInputAnnotation() {
        return includeInputAnnotation;
    }

    /**
     * Sets whether an input annotation without any lemma is copied to the output.
     *
     * @param includeInputAnnotation <tt>true</tt> to put input annotations for which the model yields nothing into
     * the output.
     *
     * @see #isIncludeInputAnnotation()
     */
    public void setIncludeInputAnnotation(boolean includeInputAnnotation) {
        this.includeInputAnnotation = includeInputAnnotation;
    }

    /**
     * Gets the revision keyword string of this step.
     *
     * @return the literal <tt>"$Revision$"</tt> keyword string.
     */
    @Override
    public String getRevision() {
        return "$Revision$";
    }
}