/* * This file is part of ELKI: * Environment for Developing KDD-Applications Supported by Index-Structures * * Copyright (C) 2017 * ELKI Development Team * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.lmu.ifi.dbs.elki.datasource.parser; import gnu.trove.iterator.TIntDoubleIterator; import gnu.trove.map.TObjectIntMap; import gnu.trove.map.hash.TIntDoubleHashMap; import gnu.trove.map.hash.TObjectIntHashMap; import java.util.ArrayList; import de.lmu.ifi.dbs.elki.data.LabelList; import de.lmu.ifi.dbs.elki.data.SparseFloatVector; import de.lmu.ifi.dbs.elki.data.SparseNumberVector; import de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation; import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation; import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation; import de.lmu.ifi.dbs.elki.logging.Logging; import de.lmu.ifi.dbs.elki.utilities.documentation.Description; import de.lmu.ifi.dbs.elki.utilities.documentation.Title; import de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException; import de.lmu.ifi.dbs.elki.utilities.optionhandling.OptionID; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.Parameterization; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.Flag; import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameters.ObjectParameter; /** * A parser to load term frequency data, which essentially are sparse vectors * with text keys. * * If your data does not contain frequencies, you can maybe use * {@link SimpleTransactionParser} instead. * * @author Erich Schubert * @since 0.4.0 * * @apiviz.has SparseNumberVector */ @Title("Term frequency parser") @Description("Parse a file containing term frequencies. The expected format is 'label term1 <freq> term2 <freq> ...'. Terms must not contain the separator character!") public class TermFrequencyParser<V extends SparseNumberVector> extends NumberVectorLabelParser<V> { /** * Class logger. */ private static final Logging LOG = Logging.getLogger(TermFrequencyParser.class); /** * Number of different terms observed. */ int numterms; /** * Map. */ TObjectIntMap<String> keymap; /** * Normalize. */ boolean normalize; /** * Same as {@link #factory}, but subtype. */ private SparseNumberVector.Factory<V> sparsefactory; /** * (Reused) set of values for the number vector. */ TIntDoubleHashMap values = new TIntDoubleHashMap(); /** * (Reused) label buffer. */ ArrayList<String> labels = new ArrayList<>(); /** * Constructor. * * @param normalize Normalize * @param factory Vector type */ public TermFrequencyParser(boolean normalize, SparseNumberVector.Factory<V> factory) { this(normalize, CSVReaderFormat.DEFAULT_FORMAT, null, factory); } /** * Constructor. * * @param normalize Normalize * @param format Input format * @param labelIndices Indices to use as labels * @param factory Vector type */ public TermFrequencyParser(boolean normalize, CSVReaderFormat format, long[] labelIndices, SparseNumberVector.Factory<V> factory) { super(format, labelIndices, factory); this.normalize = normalize; this.keymap = new TObjectIntHashMap<>(1001, .5f, -1); this.sparsefactory = factory; } @Override protected boolean parseLineInternal() { double len = 0; String curterm = null; int c = 0; for(/* initialized by nextLineExceptComments() */; tokenizer.valid(); tokenizer.advance()) { if(isLabelColumn(c++)) { labels.add(tokenizer.getSubstring()); continue; } if(curterm == null) { curterm = tokenizer.getSubstring(); continue; } try { double attribute = tokenizer.getDouble(); int curdim = keymap.get(curterm); if(curdim < 0) { curdim = numterms; keymap.put(curterm, curdim); ++numterms; } values.put(curdim, attribute); len += attribute; curterm = null; } catch(NumberFormatException e) { if(curterm != null) { labels.add(curterm); } curterm = tokenizer.getSubstring(); } } if(curterm != null) { labels.add(curterm); } haslabels |= !labels.isEmpty(); if(normalize && Math.abs(len - 1.0) > Double.MIN_NORMAL) { for(TIntDoubleIterator iter = values.iterator(); iter.hasNext();) { iter.advance(); iter.setValue(iter.value() / len); } } curvec = sparsefactory.newNumberVector(values, numterms); curlbl = LabelList.make(labels); values.clear(); labels.clear(); return true; } @Override protected SimpleTypeInformation<V> getTypeInformation(int mindim, int maxdim) { if(mindim == maxdim) { return new VectorFieldTypeInformation<>(factory, mindim); } else if(mindim < maxdim) { return new VectorTypeInformation<>(factory, factory.getDefaultSerializer(), mindim, maxdim); } throw new AbortException("No vectors were read from the input file - cannot determine vector data type."); } @Override protected Logging getLogger() { return LOG; } /** * Parameterization class. * * @author Erich Schubert * * @apiviz.exclude */ public static class Parameterizer<V extends SparseNumberVector> extends NumberVectorLabelParser.Parameterizer<V> { /** * Option ID for normalization. */ public static final OptionID NORMALIZE_FLAG = new OptionID("tf.normalize", "Normalize vectors to manhattan length 1 (convert term counts to term frequencies)"); /** * Normalization flag. */ boolean normalize = false; @Override protected void makeOptions(Parameterization config) { super.makeOptions(config); Flag normF = new Flag(NORMALIZE_FLAG); if(config.grab(normF)) { normalize = normF.isTrue(); } } @Override protected void getFactory(Parameterization config) { ObjectParameter<SparseNumberVector.Factory<V>> factoryP = new ObjectParameter<>(VECTOR_TYPE_ID, SparseNumberVector.Factory.class, SparseFloatVector.Factory.class); if(config.grab(factoryP)) { factory = factoryP.instantiateClass(config); } } @Override protected TermFrequencyParser<V> makeInstance() { return new TermFrequencyParser<>(normalize, format, labelIndices, (SparseNumberVector.Factory<V>) factory); } } }