Text2Vectors.java example

Explorer

TACIT-master
- edu.usc.cssl.tacit.classify.naivebayes
  - src
    - edu
      - usc
        cssl
        tacit
        classify
        naivebayes
        Activator.java
        services
        CommandOption.java
        CrossValidationIterator.java
        CrossValidator.java
        NaiveBayesClassifier.java
        Text2Vectors.java
        Vectors2Classify.java
- edu.usc.cssl.tacit.classify.naivebayes.ui
  - src
    - edu
      - usc
        cssl
        tacit
        classify
        naivebayes
        ui
        Activator.java
        NaiveBayesClassifierView.java
        handlers
        NaiveBayesClassifierViewHandler.java
        internal
        INaiveBayesClassifierViewConstants.java
        NaiveBayesClassifierViewImageRegistry.java
- edu.usc.cssl.tacit.classify.svm
  - src
    - edu
      - usc
        cssl
        tacit
        classify
        svm
        Activator.java
        services
        CrossValidator.java
        SVMClassify.java
        SVMPredict.java
        SVMTrain.java
- edu.usc.cssl.tacit.classify.svm.ui
  - src
    - edu
      - usc
        cssl
        tacit
        classifiy
        svm
        ui
        Activator.java
        SVMView.java
        classify
        svm
        ui
        handlers
        OpenSVMViewHandler.java
        internal
        ISVMViewConstants.java
        SVMViewImageRegistry.java
- edu.usc.cssl.tacit.cluster.hierarchical
  - src
    - edu
      - usc
        cssl
        tacit
        cluster
        hierarchical
        Activator.java
        services
        HierarchicalClusterAnalysis.java
- edu.usc.cssl.tacit.cluster.hierarchical.ui
  - src
    - edu
      - usc
        cssl
        tacit
        cluster
        hierarchical
        ui
        Activator.java
        HierarchicalClusterView.java
        handlers
        OpenHierarchicalClusterViewHandler.java
        internal
        HeirarchicalClusterViewImageRegistry.java
        IHeirarchicalClusterViewConstants.java
- edu.usc.cssl.tacit.cluster.kmeans
  - src
    - edu
      - uc
        cssl
        tacit
        cluster
        kmeans
        Activator.java
        services
        KmeansClusterAnalysis.java
- edu.usc.cssl.tacit.cluster.kmeans.ui
  - src
    - edu
      - usc
        cssl
        tacit
        cluster
        kmeans
        ui
        Activator.java
        KmeansClusterView.java
        handlers
        OpenKmeansClusterViewHandler.java
        internal
        IKmeansClusterViewConstants.java
        KmeansClusterViewImageRegistry.java
- edu.usc.cssl.tacit.common
  - src
    - edu
      - usc
        cssl
        tacit
        common
        Activator.java
        JsonHandler.java
        Preprocess.java
        TacitUtility.java
        crawlers
        ICrawler.java
        snowballstemmer
        Among.java
        DanishStemmer.java
        DutchStemmer.java
        EnglishStemmer.java
        FinnishStemmer.java
        FrenchStemmer.java
        GermanStemmer.java
        HungarianStemmer.java
        ItalianStemmer.java
        NorwegianStemmer.java
        PorterStemmer.java
        PortugueseStemmer.java
        RomanianStemmer.java
        RussianStemmer.java
        SnowballProgram.java
        SnowballStemmer.java
        SpanishStemmer.java
        SwedishStemmer.java
        TurkishStemmer.java
- edu.usc.cssl.tacit.common.ui
  - src
    - edu
      - usc
        cssl
        tacit
        common
        ui
        CommonUiActivator.java
        ICommonUiConstants.java
        IPreprocessorSettingsConstant.java
        composite
        from
        TacitFormComposite.java
        handlers
        OpenConsoleViewHandler.java
        OpenCorpusManagementViewHandler.java
        OpenPreProcessViewHandler.java
        OpenProblemsViewHandler.java
        OpenProgressViewHandler.java
        internal
        CommonUiViewImageRegistry.java
        TargetLocationContentProvider.java
        TargetLocationLabelProvider.java
        TargetLocationsGroup.java
        TreeParent.java
        outputdata
        OutputLayoutData.java
        TableLayoutData.java
        preferencepage
        ELanguageType.java
        MainPreferencePage.java
        PrepocessorSettings.java
        utility
        INlpCommonUiConstants.java
        IconRegistry.java
        validation
        OutputPathValidation.java
        views
        ConsoleView.java
        CorpusManagementView.java
        PreprocessorView.java
- edu.usc.cssl.tacit.crawlers.latin
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        latin
        Activator.java
        ILatinCrawlerConstants.java
        services
        LatinCrawler.java
- edu.usc.cssl.tacit.crawlers.latin.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        latin
        ui
        Activator.java
        AuthorListDialog.java
        LatinCrawlerView.java
        handlers
        OpenLatinCrawlerViewHandler.java
        internal
        ILatinCrawlerUIConstants.java
        LatinCrawlerImageRegistry.java
- edu.usc.cssl.tacit.crawlers.reddit
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        reddit
        Activator.java
        services
        RedditCrawler.java
        RedditPlugin.java
- edu.usc.cssl.tacit.crawlers.reddit.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        reddit
        ui
        Activator.java
        RedditCrawlerView.java
        handlers
        RedditCrawlerViewHandler.java
        internal
        IRedditCrawlerViewConstants.java
        RedditCrawlerViewImageRegistry.java
- edu.usc.cssl.tacit.crawlers.senate
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        senate
        Activator.java
        services
        AvailableRecords.java
        SenateCrawler.java
        SenatorDetails.java
- edu.usc.cssl.tacit.crawlers.senate.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        senate
        ui
        Activator.java
        SenateCrawlerView.java
        SenatorListDialog.java
        handlers
        SenateCrawlerViewHandler.java
        internal
        AbstractElementListSelectionDialog.java
        ElementListSelectionDialog.java
        FilteredList.java
        ISenateCrawlerViewConstants.java
        SenateCrawlerViewImageRegistry.java
- edu.usc.cssl.tacit.crawlers.supremecourt
  - src
    - edu
      - uc
        cssl
        tacit
        crawlers
        supremecourt
        Activator.java
        services
        CrawlerData.java
        CrawlerJob.java
        MyThread.java
        SupremCrawlerFilter.java
        SupremeCourtCrawler.java
- edu.usc.cssl.tacit.crawlers.supremecourt.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        supremecourt
        ui
        Activator.java
        SupremeCrawlerView.java
        handlers
        OpenSupremeCrawlerViewHandler.java
        internal
        ISupremeCrawlerUIConstants.java
        SupremeCrawlerImageRegistry.java
- edu.usc.cssl.tacit.crawlers.twitter
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        twitter
        Activator.java
        services
        TwitterStreamApi.java
        TwitterTestProgram.java
- edu.usc.cssl.tacit.crawlers.twitter.ui
  - src
    - edu
      - usc
        cssl
        tacit
        crawlers
        twitter
        ui
        Activator.java
        TwitterCrawlerView.java
        handlers
        OpenTwitterCrawlerViewHandler.java
        internal
        ITwitterCrawlerUIConstants.java
        TwitterCrawlerImageRegistry.java
        preferencepage
        ITwitterConstant.java
        TwitterUserConfiguration.java
- edu.usc.cssl.tacit.help
  - src
    - edu
      - usc
        cssl
        tacit
        help
        Activator.java
- edu.usc.cssl.tacit.repository
  - src
    - edu
      - usc
        cssl
        tacit
        repository
        Activator.java
        Application.java
        ApplicationActionBarAdvisor.java
        ApplicationWorkbenchAdvisor.java
        ApplicationWorkbenchWindowAdvisor.java
        Perspective.java
- edu.usc.cssl.tacit.topicmodel.lda
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        lda
        Activator.java
        services
        LdaAnalysis.java
        Text2Vectors.java
        Vectors2Topics.java
- edu.usc.cssl.tacit.topicmodel.lda.ui
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        lda
        ui
        Activator.java
        LdaTopicModelView.java
        handlers
        OpenLdaTopicModelViewHandler.java
        internal
        ILdaTopicModelClusterViewConstants.java
        LdaTopicModelViewImageRegistry.java
- edu.usc.cssl.tacit.topicmodel.zlda
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        zlda
        Activator.java
        services
        DTWC.java
        ZlabelLDA.java
        ZlabelTopicModelAnalysis.java
- edu.usc.cssl.tacit.topicmodel.zlda.ui
  - src
    - edu
      - usc
        cssl
        tacit
        topicmodel
        zlda
        ui
        Activator.java
        ZlabelLdaTopicModelView.java
        handlers
        OpenZlabelLdaTopicModelViewHandler.java
        internal
        IZlabelLdaTopicModelClusterViewConstants.java
        ZlabelLdaTopicModelViewImageRegistry.java
- edu.usc.cssl.tacit.wordcount.cooccurrence
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        cooccurrence
        Activator.java
        services
        CooccurrenceAnalysis.java
- edu.usc.cssl.tacit.wordcount.cooccurrence.ui
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        cooccurrence
        ui
        Activator.java
        CooccurrenceWordCountView.java
        handlers
        OpenCooccurrenceWordCountViewHandler.java
        internal
        CooccurrenceWordCountImageRegistry.java
        ICooccurrenceWordCountViewConstants.java
- edu.usc.cssl.tacit.wordcount.standard
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        standard
        Activator.java
        services
        WordCountPlugin.java
- edu.usc.cssl.tacit.wordcount.standard.ui
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        standard
        ui
        Activator.java
        StandardWordCountView.java
        handlers
        OpenStandardWordCountViewHandler.java
        internal
        IStandardWordCountViewConstants.java
        StandardWordCountImageRegistry.java
- edu.usc.cssl.tacit.wordcount.weighted
  - src
    - edu
      - uc
        cssl
        tacit
        wordcount
        weighted
        Activator.java
        services
        Trie.java
        WordCountApi.java
- edu.usc.cssl.tacit.wordcount.weighted.ui
  - src
    - edu
      - usc
        cssl
        tacit
        wordcount
        weighted
        ui
        Activator.java
        WeightedWordCountView.java
        handlers
        OpenWeightedWordCountViewHandler.java
        internal
        AddLocationPage.java
        EditDirectoryContainerPage.java
        IWeightedWordCountViewConstants.java
        WeightedWordCountImageRegistry.java

package edu.usc.cssl.tacit.classify.naivebayes.services;

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
 This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
 http://www.cs.umass.edu/~mccallum/mallet
 This software is provided under the terms of the Common Public License,
 version 1.0, as published by http://www.opensource.org.  For further
 information, see the file `LICENSE' included with this distribution. */

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.CharSequenceRemoveHTML;
import cc.mallet.pipe.CharSubsequence;
import cc.mallet.pipe.FeatureSequence2AugmentableFeatureVector;
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintInputAndTarget;
import cc.mallet.pipe.SaveDataInSource;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequence2FeatureSequenceWithBigrams;
import cc.mallet.pipe.TokenSequenceLowercase;
import cc.mallet.pipe.TokenSequenceNGrams;
import cc.mallet.pipe.TokenSequenceRemoveNonAlpha;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.Strings;

/**
 * Convert document files into vectors (a persistent instance list).
 * 
 * @author Andrew McCallum <a
 *         href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

public class Text2Vectors {

	private static Logger logger = MalletLogger.getLogger(Text2Vectors.class
			.getName());

	static CommandOption.SpacedStrings classDirs = new CommandOption.SpacedStrings(
			Text2Vectors.class,
			"input",
			"DIR...",
			true,
			null,
			"The directories containing text files to be classified, one directory per class",
			null);

	static CommandOption.File outputFile = new CommandOption.File(
			Text2Vectors.class, "output", "FILE", true,
			new File("text.vectors"),
			"Write the instance list to this file; Using - indicates stdout.",
			null);

	static CommandOption.File usePipeFromVectorsFile = new CommandOption.File(
			Text2Vectors.class,
			"use-pipe-from",
			"FILE",
			true,
			new File("text.vectors"),
			"Use the pipe and alphabets from a previously created vectors file. "
					+ "Allows the creation, for example, of a test set of vectors that are "
					+ "compatible with a previously created set of training vectors",
			null);

	static CommandOption.Boolean preserveCase = new CommandOption.Boolean(
			Text2Vectors.class, "preserve-case", "[TRUE|FALSE]", false, false,
			"If true, do not force all strings to lowercase.", null);

	static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(
			Text2Vectors.class,
			"remove-stopwords",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, remove a default list of common English \"stop words\" from the text.",
			null);

	static CommandOption.File stoplistFile = new CommandOption.File(
			Text2Vectors.class,
			"stoplist-file",
			"FILE",
			true,
			null,
			"Instead of the default list, read stop words from a file, one per line. Implies --remove-stopwords",
			null);

	static CommandOption.File extraStopwordsFile = new CommandOption.File(
			Text2Vectors.class,
			"extra-stopwords",
			"FILE",
			true,
			null,
			"Read whitespace-separated words from this file, and add them to either\n"
					+ "   the default English stoplist or the list specified by --stoplist-file.",
			null);

	static CommandOption.Boolean skipHeader = new CommandOption.Boolean(
			Text2Vectors.class, "skip-header", "[TRUE|FALSE]", false, false,
			"If true, in each document, remove text occurring before a blank line."
					+ "  This is useful for removing email or UseNet headers",
			null);

	static CommandOption.Boolean skipHtml = new CommandOption.Boolean(
			Text2Vectors.class, "skip-html", "[TRUE|FALSE]", false, false,
			"If true, remove text occurring inside <...>, as in HTML or SGML.",
			null);

	static CommandOption.Boolean binaryFeatures = new CommandOption.Boolean(
			Text2Vectors.class, "binary-features", "[TRUE|FALSE]", false,
			false, "If true, features will be binary.", null);

	static CommandOption.IntegerArray gramSizes = new CommandOption.IntegerArray(
			Text2Vectors.class,
			"gram-sizes",
			"INTEGER,[INTEGER,...]",
			true,
			new int[] { 1 },
			"Include among the features all n-grams of sizes specified.  "
					+ "For example, to get all unigrams and bigrams, use --gram-sizes 1,2.  "
					+ "This option occurs after the removal of stop words, if removed.",
			null);

	static CommandOption.Boolean keepSequence = new CommandOption.Boolean(
			Text2Vectors.class,
			"keep-sequence",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, final data will be a FeatureSequence rather than a FeatureVector.",
			null);

	static CommandOption.Boolean keepSequenceBigrams = new CommandOption.Boolean(
			Text2Vectors.class,
			"keep-sequence-bigrams",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, final data will be a FeatureSequenceWithBigrams rather than a FeatureVector.",
			null);

	static CommandOption.Boolean saveTextInSource = new CommandOption.Boolean(
			Text2Vectors.class, "save-text-in-source", "[TRUE|FALSE]", false,
			false, "If true, save original text of document in source.", null);

	static CommandOption.ObjectFromBean stringPipe = new CommandOption.ObjectFromBean(
			Text2Vectors.class,
			"string-pipe",
			"Pipe constructor",
			true,
			null,
			"Java code for the constructor of a Pipe to be run as soon as input becomes a CharSequence",
			null);

	static CommandOption.ObjectFromBean tokenPipe = new CommandOption.ObjectFromBean(
			Text2Vectors.class,
			"token-pipe",
			"Pipe constructor",
			true,
			null,
			"Java code for the constructor of a Pipe to be run as soon as input becomes a TokenSequence",
			null);

	static CommandOption.ObjectFromBean featureVectorPipe = new CommandOption.ObjectFromBean(
			Text2Vectors.class,
			"fv-pipe",
			"Pipe constructor",
			true,
			null,
			"Java code for the constructor of a Pipe to be run as soon as input becomes a FeatureVector",
			null);

	static CommandOption.String encoding = new CommandOption.String(
			Text2Vectors.class, "encoding", "STRING", true, Charset
					.defaultCharset().displayName(),
			"Character encoding for input file", null);

	static CommandOption.String tokenRegex = new CommandOption.String(
			Text2Vectors.class,
			"token-regex",
			"REGEX",
			true,
			CharSequenceLexer.LEX_ALPHA.toString(),
			"Regular expression used for tokenization.\n"
					+ "   Example: \"[\\p{L}\\p{N}_]+|[\\p{P}]+\" (unicode letters, numbers and underscore OR all punctuation) ",
			null);

	static CommandOption.Boolean printOutput = new CommandOption.Boolean(
			Text2Vectors.class,
			"print-output",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, print a representation of the processed data\n"
					+ "   to standard output. This option is intended for debugging.",
			null);

	public static void main(String[] args) throws FileNotFoundException,
			IOException {
		// Process the command-line options
		CommandOption
				.setSummary(
						Text2Vectors.class,
						"A tool for creating instance lists of FeatureVectors or FeatureSequences from text documents.\n");
		CommandOption.process(Text2Vectors.class, args);
		// String[] classDirs = CommandOption.process (Text2Vectors.class,
		// args);

		// Print some helpful messages for error cases
		if (args.length == 0) {
			CommandOption.getList(Text2Vectors.class).printUsage(false);
			System.exit(-1);
		}
		if (classDirs.value.length == 0) {
			throw new IllegalArgumentException(
					"You must include --input DIR1 DIR2 ...' in order to specify a "
							+ "list of directories containing the documents for each class.");
		}

		// Remove common prefix from all the input class directories
		int commonPrefixIndex = Strings.commonPrefixIndex(classDirs.value);

		logger.info("Labels = ");
		File[] directories = new File[classDirs.value.length];
		for (int i = 0; i < classDirs.value.length; i++) {
			directories[i] = new File(classDirs.value[i]);
			if (commonPrefixIndex < classDirs.value.length) {
				logger.info("   "
						+ classDirs.value[i].substring(commonPrefixIndex));
			} else {
				logger.info("   " + classDirs.value[i]);
			}
		}

		Pipe instancePipe;
		InstanceList previousInstanceList = null;

		if (usePipeFromVectorsFile.wasInvoked()) {
			previousInstanceList = InstanceList
					.load(usePipeFromVectorsFile.value);
			instancePipe = previousInstanceList.getPipe();
		} else {

			// Build a new pipe

			// Create a list of pipes that will be added to a SerialPipes object
			// later
			ArrayList<Pipe> pipeList = new ArrayList<Pipe>();

			// Convert the "target" object into a numeric index
			// into a LabelAlphabet.
			pipeList.add(new Target2Label());

			// The "data" field is currently a filename. Save it as "source".
			pipeList.add(new SaveDataInSource());

			// Set "data" to the file's contents. "data" is now a String.
			pipeList.add(new Input2CharSequence(encoding.value));

			// Optionally save the text to "source" -- not recommended if memory
			// is scarce.
			if (saveTextInSource.wasInvoked()) {
				pipeList.add(new SaveDataInSource());
			}

			// Allow the user to specify an arbitrary Pipe object
			// that operates on Strings
			if (stringPipe.wasInvoked()) {
				pipeList.add((Pipe) stringPipe.value);
			}

			// Remove all content before the first empty line.
			// Useful for email and usenet news posts.
			if (skipHeader.value) {
				pipeList.add(new CharSubsequence(CharSubsequence.SKIP_HEADER));
			}

			// Remove HTML tags. Suitable for SGML and XML.
			if (skipHtml.value) {
				pipeList.add(new CharSequenceRemoveHTML());
			}

			//
			// Tokenize the input: first compile the tokenization pattern
			//

			Pattern tokenPattern = null;

			if (keepSequenceBigrams.value) {
				// We do not want to record bigrams across punctuation,
				// so we need to keep non-word tokens.
				tokenPattern = CharSequenceLexer.LEX_NONWHITESPACE_CLASSES;
			} else {
				// Otherwise, try to compile the regular expression pattern.

				try {
					tokenPattern = Pattern.compile(tokenRegex.value);
				} catch (PatternSyntaxException pse) {
					throw new IllegalArgumentException(
							"The token regular expression (" + tokenRegex.value
									+ ") was invalid: " + pse.getMessage());
				}
			}

			// Add the tokenizer
			pipeList.add(new CharSequence2TokenSequence(tokenPattern));

			// Allow user to specify an arbitrary Pipe object
			// that operates on TokenSequence objects.
			if (tokenPipe.wasInvoked()) {
				pipeList.add((Pipe) tokenPipe.value);
			}

			if (!preserveCase.value()) {
				pipeList.add(new TokenSequenceLowercase());
			}

			if (keepSequenceBigrams.value) {
				// Remove non-word tokens, but record the fact that they
				// were there.
				pipeList.add(new TokenSequenceRemoveNonAlpha(true));
			}

			// Stopword removal.

			if (stoplistFile.wasInvoked()) {

				// The user specified a new list

				TokenSequenceRemoveStopwords stopwordFilter = new TokenSequenceRemoveStopwords(
						stoplistFile.value, encoding.value, false, // don't
																	// include
																	// default
																	// list
						false, keepSequenceBigrams.value);

				if (extraStopwordsFile.wasInvoked()) {
					stopwordFilter.addStopWords(extraStopwordsFile.value);
				}

				pipeList.add(stopwordFilter);
			} else if (removeStopWords.value) {

				// The user did not specify a new list, so use the default
				// built-in English list, possibly adding extra words.

				TokenSequenceRemoveStopwords stopwordFilter = new TokenSequenceRemoveStopwords(
						false, keepSequenceBigrams.value);

				if (extraStopwordsFile.wasInvoked()) {
					stopwordFilter.addStopWords(extraStopwordsFile.value);
				}

				pipeList.add(stopwordFilter);

			}

			// gramSizes is an integer array, with default value [1].
			// Check if we have a non-default value.
			if (!(gramSizes.value.length == 1 && gramSizes.value[0] == 1)) {
				pipeList.add(new TokenSequenceNGrams(gramSizes.value));
			}

			// So far we have a sequence of Token objects that contain
			// String values. Look these up in an alphabet and store integer IDs
			// ("features") instead of Strings.
			if (keepSequenceBigrams.value) {
				pipeList.add(new TokenSequence2FeatureSequenceWithBigrams());
			} else {
				pipeList.add(new TokenSequence2FeatureSequence());
			}

			// For many applications, we do not need to preserve the sequence of
			// features,
			// only the number of times times a feature occurs.
			if (!(keepSequence.value || keepSequenceBigrams.value)) {
				pipeList.add(new FeatureSequence2AugmentableFeatureVector(
						binaryFeatures.value));
			}

			// Allow users to specify an arbitrary Pipe object that operates on
			// feature vectors.
			if (featureVectorPipe.wasInvoked()) {
				pipeList.add((Pipe) featureVectorPipe.value);
			}

			if (printOutput.value) {
				pipeList.add(new PrintInputAndTarget());
			}

			instancePipe = new SerialPipes(pipeList);

		}

		InstanceList instances = new InstanceList(instancePipe);

		boolean removeCommonPrefix = true;
		instances.addThruPipe(new FileIterator(directories,
				FileIterator.STARTING_DIRECTORIES, removeCommonPrefix));

		// write vector file
		ObjectOutputStream oos;
		if (outputFile.value.toString().equals("-")) {
			oos = new ObjectOutputStream(System.out);
		} else {
			oos = new ObjectOutputStream(new FileOutputStream(outputFile.value));
		}
		oos.writeObject(instances);
		oos.close();

		// *rewrite* vector file used as source of pipe in case we changed the
		// alphabet(!)
		if (usePipeFromVectorsFile.wasInvoked()) {
			logger.info(" rewriting previous instance list, with ID = "
					+ previousInstanceList.getPipe().getInstanceId());
			oos = new ObjectOutputStream(new FileOutputStream(
					usePipeFromVectorsFile.value));
			oos.writeObject(previousInstanceList);
			oos.close();
		}

	}

}