/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. */ /** Convert a String containing space-separated feature-name floating-point-value pairs into a FeatureVector. For example: <pre>length=12 width=1.75 blue temperature=-17.2</pre> Features without a corresponding value (ie those not including the character "=", such as the feature <code>blue</code> here) will be set to 1.0. <p>If a feature occurs more than once in the input string, the values of each occurrence will be added.</p> @author David Mimno and Andrew McCallum */ package cc.mallet.pipe; import java.io.*; import cc.mallet.types.Alphabet; import cc.mallet.types.Instance; import cc.mallet.types.FeatureVector; public class FeatureValueString2FeatureVector extends Pipe implements Serializable { public FeatureValueString2FeatureVector (Alphabet dataDict) { super (dataDict, null); } public FeatureValueString2FeatureVector () { super(new Alphabet(), null); } public Instance pipe (Instance carrier) { String[] fields = carrier.getData().toString().split("\\s+"); int numFields = fields.length; Object[] featureNames = new Object[numFields]; double[] featureValues = new double[numFields]; for (int i = 0; i < numFields; i++) { if (fields[i].contains("=")) { String[] subFields = fields[i].split("="); featureNames[i] = subFields[0]; featureValues[i] = Double.parseDouble(subFields[1]); } else { featureNames[i] = fields[i]; featureValues[i] = 1.0; } } carrier.setData(new FeatureVector(getDataAlphabet(), featureNames, featureValues)); return carrier; } }