package cc.mallet.pipe; import cc.mallet.types.*; import cc.mallet.pipe.*; import java.io.Serializable; import java.util.Arrays; public class TargetStringToFeatures extends Pipe implements Serializable { public TargetStringToFeatures () { super(null, new Alphabet()); } public Instance pipe(Instance carrier) { if (! (carrier.getTarget() instanceof String)) { throw new IllegalArgumentException("Target must be of type String"); } String featuresLine = (String) carrier.getTarget(); String[] features = featuresLine.split(",?\\s+"); double[] values = new double[ features.length ]; Arrays.fill(values, 1.0); for (int i=0; i<features.length; i++) { // Support the syntax "FEATURE=0.000342 OTHER_FEATURE=-2.32423" \ if (features[i].indexOf("=") != -1) { String[] keyValuePair = features[i].split("="); features[i] = keyValuePair[0]; values[i] = Double.parseDouble(keyValuePair[1]); } // ensure that the feature has a spot in the alphabet \ getTargetAlphabet().lookupIndex(features[i], true); } FeatureVector target = new FeatureVector(getTargetAlphabet(), features, values); carrier.setTarget(target); return carrier; } private static final long serialVersionUID = 1; private static final int CURRENT_SERIAL_VERSION = 0; }