/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** @author Aron Culotta <a href="mailto:culotta@cs.umass.edu">culotta@cs.umass.edu</a> */ package cc.mallet.fst.confidence; import java.util.ArrayList; import java.util.logging.*; import cc.mallet.fst.*; import cc.mallet.types.*; import cc.mallet.util.MalletLogger; /** * Corrects a subset of the {@link Segment}s produced by a {@link * Transducer}. It's most useful to find the {@link Segment}s that the * {@link Transducer} is least confident in and correct those using * the true {@link Labeling} * (<code>correctLeastConfidenceSegments</code>). Unlike in {@link * ConstrainedViterbi}, the corrected segment does not affect the * labeling of other segments in the sequence. For comparison. */ public class IsolatedSegmentTransducerCorrector implements TransducerCorrector { private static Logger logger = MalletLogger.getLogger(IsolatedSegmentTransducerCorrector.class.getName()); TransducerConfidenceEstimator confidenceEstimator; Transducer model; public IsolatedSegmentTransducerCorrector (TransducerConfidenceEstimator confidenceEstimator, Transducer model) { this.confidenceEstimator = confidenceEstimator; this.model = model; } public IsolatedSegmentTransducerCorrector (Transducer model) { this (new ConstrainedForwardBackwardConfidenceEstimator (model), model); } /** @param ilist original Transducer InstanceList @param startTags start segment tags (B-) @param continueTags continue segment tags (I-) TransducerConfidenceEstimator} @return a list of {@link Sequence}s corresponding to the corrected tagging of each Instance in <code>ilist</code>. Note that these corrections will not affect tokens outside of the corrected segment. */ public ArrayList correctLeastConfidentSegments (InstanceList ilist, Object[] startTags, Object[] continueTags) { ArrayList correctedPredictionList = new ArrayList (); for (int i=0; i < ilist.size(); i++) { logger.fine ("correcting instance# " + i + " / " + ilist.size()); Instance instance = ilist.get (i); Segment[] orderedSegments = new Segment[1]; orderedSegments = confidenceEstimator.rankSegmentsByConfidence (instance, startTags, continueTags); Segment leastConfidentSegment = orderedSegments[0]; logger.fine ("Ordered Segments:\nTrue sequence: " + leastConfidentSegment.getTruth()); for (int j=0; j < orderedSegments.length; j++) { logger.fine (orderedSegments[j].toString()); } // _do not_ run constrained viterbi on this sequence with the // constraint that this segment is tagged correctly. // instead, simply replace the labeling of the corrected // segment. MultiSegmentationEvaluator eval = new MultiSegmentationEvaluator (new InstanceList[0], new String[0], startTags, continueTags); Sequence truth = leastConfidentSegment.getTruth(); Sequence predicted = leastConfidentSegment.getPredicted(); int numIncorrect = eval.numIncorrectSegments (truth, predicted); String[] sequence = new String[truth.size()]; for (int j=0; j < truth.size(); j++) { if (j <= leastConfidentSegment.getEnd() && j >= leastConfidentSegment.getStart()) sequence[j] = (String)truth.get (j); else sequence[j] = (String) predicted.get (j); } ArraySequence segmentCorrectedOutput = new ArraySequence (sequence); logger.fine ("Original prediction: "); for (int j=0; j < predicted.size(); j++) logger.fine ((String)predicted.get (j) + "\t"); logger.fine ("\nCorrected prediction: "); for (int j=0; j < segmentCorrectedOutput.size(); j++) logger.fine ((String)segmentCorrectedOutput.get (j) + "\t"); logger.fine (""); if (numIncorrect > -1) correctedPredictionList.add (segmentCorrectedOutput); else correctedPredictionList.add (null); } return correctedPredictionList; } }