/*
 * This file is part of ELKI:
 * Environment for Developing KDD-Applications Supported by Index-Structures
 *
 * Copyright (C) 2017
 * ELKI Development Team
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.lmu.ifi.dbs.elki.datasource.parser;

import gnu.trove.iterator.TObjectIntIterator;
import gnu.trove.list.array.TLongArrayList;
import gnu.trove.map.TObjectIntMap;
import gnu.trove.map.hash.TObjectIntHashMap;

import java.io.IOException;
import java.io.InputStream;

import de.lmu.ifi.dbs.elki.data.BitVector;
import de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation;
import de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation;
import de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta;
import de.lmu.ifi.dbs.elki.logging.Logging;

/**
 * Simple parser for transactional data, such as market baskets.
 *
 * To keep the input format simple and readable, all tokens are assumed to be
 * text and separated by whitespace, and each transaction is on a separate line.
 *
 * An example file containing two transactions looks like this
 *
 * <pre>
 * bread butter milk
 * paste tomato basil
 * </pre>
 *
 * Every distinct token is assigned the next free bit position in order of
 * first appearance; each line is then reported as a {@link BitVector} with the
 * bits of its tokens set.
 *
 * The stream reports an initial {@link Event#META_CHANGED}, then one
 * {@link Event#NEXT_OBJECT} per line, then a final {@link Event#META_CHANGED}
 * (whose metadata carries the token names as column labels), followed by
 * {@link Event#END_OF_STREAM}.
 *
 * TODO: add a parameter to e.g. use the first or last entry as labels instead
 * of tokens.
 *
 * @author Erich Schubert
 * @since 0.7.0
 *
 * @apiviz.has BitVector
 */
public class SimpleTransactionParser extends AbstractStreamingParser {
  /**
   * Class logger.
   */
  private static final Logging LOG = Logging.getLogger(SimpleTransactionParser.class);

  /**
   * Number of different terms observed.
   *
   * NOTE(review): this field is never assigned within this class, so
   * {@link #getMeta()} sees 0 here unless other code in the package sets it.
   */
  int numterms;

  /**
   * Map from token text to its bit position. Lookups of unknown tokens yield
   * -1 (the no-entry value passed to the map constructor).
   */
  TObjectIntMap<String> keymap;

  /**
   * Metadata. Created lazily by {@link #getMeta()}, and replaced by
   * {@link #nextEvent()} once the input is exhausted.
   */
  protected BundleMeta meta;

  /**
   * Event to report next, or {@code null} if no event is pending.
   */
  Event nextevent;

  /**
   * Current vector.
   */
  BitVector curvec;

  /**
   * Buffer of 64-bit words for the current transaction, will be reused.
   */
  TLongArrayList buf = new TLongArrayList();

  /**
   * Constructor.
   *
   * @param format Input format
   */
  public SimpleTransactionParser(CSVReaderFormat format) {
    super(format);
    keymap = new TObjectIntHashMap<>(1001, .5f, -1);
  }

  /**
   * Attach the parser to an input stream, and schedule the initial metadata
   * event.
   *
   * @param in Input stream
   */
  @Override
  public void initStream(InputStream in) {
    super.initStream(in);
    nextevent = Event.META_CHANGED; // Initial event.
  }

  /**
   * Advance the stream.
   *
   * A pending event (if any) is reported first. Otherwise, the next
   * non-comment line is converted into a bit vector. When no line is left, the
   * final metadata is built, {@link Event#END_OF_STREAM} is scheduled as
   * pending event, and {@link Event#META_CHANGED} is returned.
   *
   * @return the next event
   * @throws IllegalArgumentException when reading the input fails; the
   *         underlying {@link IOException} is attached as cause
   */
  @Override
  public Event nextEvent() {
    if(nextevent != null) {
      Event ret = nextevent;
      nextevent = null;
      return ret;
    }
    try {
      // Written as a loop, but the body returns after the first line read.
      while(reader.nextLineExceptComments()) {
        // Don't reuse bitsets, will not be copied by BitVector constructor.
        // (Only the word buffer is recycled; toArray() supplies the array
        // that is handed to the vector.)
        buf.clear();
        for(/* initialized by nextLineExceptComments() */; tokenizer.valid(); tokenizer.advance()) {
          String token = tokenizer.getSubstring();
          int t = keymap.get(token);
          if(t < 0) {
            // Unknown token: assign the next free bit position.
            t = keymap.size();
            keymap.put(token, t);
          }
          final int word = t >>> 6; // Index of the 64-bit word
          final int off = t & 0x3F; // Bit offset within that word
          while(word >= buf.size()) { // Ensure size.
            buf.add(0L);
          }
          buf.set(word, buf.get(word) | (1L << off));
        }
        // NOTE(review): the word array only extends to the highest bit set in
        // this line, and may thus be shorter than keymap.size() bits; this
        // presumably relies on BitVector tolerating short arrays - confirm.
        curvec = new BitVector(buf.toArray(), keymap.size());
        return Event.NEXT_OBJECT;
      }
      nextevent = Event.END_OF_STREAM;
      // Construct final metadata:
      meta = new BundleMeta(1);
      String[] colnames = new String[keymap.size()];
      for(TObjectIntIterator<String> iter = keymap.iterator(); iter.hasNext();) {
        iter.advance();
        // Invert the map: bit position -> token text.
        colnames[iter.value()] = iter.key();
      }
      meta.add(new VectorFieldTypeInformation<>(BitVector.FACTORY, colnames.length, colnames));
      return Event.META_CHANGED; // Force a final meta update.
    }
    catch(IOException e) {
      // Keep the original I/O failure as cause, so it is not lost.
      throw new IllegalArgumentException("Error while parsing line " + reader.getLineNumber() + ".", e);
    }
  }

  /**
   * Release the reference to the current vector.
   *
   * Note that the token map and the metadata are not reset here.
   */
  @Override
  public void cleanup() {
    super.cleanup();
    curvec = null;
  }

  /**
   * Access the current object.
   *
   * @param rnum Column number, only column 0 exists
   * @return the current bit vector
   * @throws ArrayIndexOutOfBoundsException for any other column number
   */
  @Override
  public Object data(int rnum) {
    if(rnum == 0) {
      return curvec;
    }
    throw new ArrayIndexOutOfBoundsException();
  }

  /**
   * Get the current metadata.
   *
   * If none exists yet, a preliminary single-column description is created
   * from {@code 0} and {@link #numterms} (presumably the minimum and maximum
   * dimensionality - confirm against {@link VectorTypeInformation}).
   *
   * @return the metadata
   */
  @Override
  public BundleMeta getMeta() {
    if(meta == null) {
      meta = new BundleMeta(1);
      meta.add(new VectorTypeInformation<>(BitVector.FACTORY, BitVector.SHORT_SERIALIZER, 0, numterms));
    }
    return meta;
  }

  /**
   * Get the class logger.
   *
   * @return the logger
   */
  @Override
  protected Logging getLogger() {
    return LOG;
  }

  /**
   * Parameterization class.
   *
   * @author Erich Schubert
   *
   * @apiviz.exclude
   */
  public static class Parameterizer extends AbstractStreamingParser.Parameterizer {
    @Override
    protected SimpleTransactionParser makeInstance() {
      return new SimpleTransactionParser(format);
    }
  }
}