/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package cc.mallet.pipe;

import java.util.logging.*;
import java.util.*;

import cc.mallet.pipe.Pipe;
import cc.mallet.types.Alphabet;
import cc.mallet.types.FeatureVector;
import cc.mallet.types.Instance;
import cc.mallet.types.Labeling;
import cc.mallet.util.MalletLogger;

/**
 * Converts a string of the form
 * <tt>feature_1:val_1 feature_2:val_2 ... feature_k:val_k</tt>
 * into a (sparse) FeatureVector.
 * <p>
 * Despite the class name, tokens are separated by runs of whitespace, not
 * commas. Within a token the <em>last</em> ":" separates the feature name
 * from its numeric value, so feature names may themselves contain ":".
 * <p>
 * Features with no ":" character are assumed to have value 1.0. The same
 * applies to tokens that begin with their only ":" or that end with ":";
 * in those cases the whole token (colon included) is used as the feature name.
 * <p>
 * Empty or whitespace-only input produces an empty FeatureVector.
 *
 * @author Gary Huang
 */
public class Csv2FeatureVector extends Pipe {

    private static final Logger logger = MalletLogger.getLogger(Csv2FeatureVector.class.getName());

    /**
     * Creates a pipe whose data alphabet is a new <tt>Alphabet</tt>
     * constructed with the given capacity.
     *
     * @param capacity capacity passed to the <tt>Alphabet</tt> constructor
     */
    public Csv2FeatureVector(int capacity) {
        this.dataAlphabet = new Alphabet(capacity);
    }

    /**
     * Creates a pipe whose data alphabet has a capacity of 1000.
     */
    public Csv2FeatureVector() {
        this(1000);
    }

    /**
     * Convert the data in the given <tt>Instance</tt> from a <tt>CharSequence</tt>
     * of sparse feature-value pairs to a <tt>FeatureVector</tt>.
     *
     * @param carrier instance whose data is a <tt>CharSequence</tt> of
     *                whitespace-separated <tt>name:value</tt> tokens
     * @return the same <tt>carrier</tt>, with its data replaced by the
     *         resulting <tt>FeatureVector</tt>
     * @throws ClassCastException    if the instance data is not a <tt>CharSequence</tt>
     * @throws NumberFormatException if the text after the last ":" of a token
     *                               is not a parsable double
     */
    public Instance pipe(Instance carrier) {
        CharSequence c = (CharSequence) carrier.getData();
        String text = c.toString().trim();

        // "".split("\\s+") yields a single empty token, which used to be
        // registered as a feature named "" with value 1.0. Treat blank input
        // as "no features" instead.
        String[] pairs = (text.length() == 0) ? new String[0] : text.split("\\s+");

        int[] keys = new int[pairs.length];
        double[] values = new double[pairs.length];
        int count = 0;

        for (int i = 0; i < pairs.length; i++) {
            String pair = pairs[i];
            int delimIndex = pair.lastIndexOf(':');

            String name;
            double value;
            if (delimIndex <= 0 || delimIndex == pair.length() - 1) {
                // No colon, a leading colon only, or a trailing colon:
                // the whole token is the feature name with an implicit value.
                name = pair;
                value = 1.0;
            } else {
                name = pair.substring(0, delimIndex);
                // Parse before touching the alphabet so that a malformed
                // token does not leave a stray entry behind.
                value = Double.parseDouble(pair.substring(delimIndex + 1));
            }

            int index = dataAlphabet.lookupIndex(name, true);
            // A negative index is not a valid feature position. The alphabet
            // is expected to report one for an entry it will not add (e.g.
            // once its growth has been stopped); skip such features rather
            // than storing an invalid index in the vector.
            if (index < 0) {
                continue;
            }

            keys[count] = index;
            values[count] = value;
            count++;
        }

        // Drop the unused tail if any features were skipped.
        if (count < keys.length) {
            int[] keptKeys = new int[count];
            double[] keptValues = new double[count];
            System.arraycopy(keys, 0, keptKeys, 0, count);
            System.arraycopy(values, 0, keptValues, 0, count);
            keys = keptKeys;
            values = keptValues;
        }

        // [removed code that sorted indices but NOT values -DM]
        FeatureVector fv = new FeatureVector(dataAlphabet, keys, values);
        carrier.setData(fv);
        return carrier;
    }
}