TokenSequenceRemoveStopwords.java example

Explorer

nina-master
- src
  - edu
    - nd
      - nina
        DirectedGraph.java
        EdgeFactory.java
        Graph.java
        GraphHelper.java
        GraphMapping.java
        GraphPath.java
        Graphs.java
        ListenableGraph.java
        Type.java
        TypedGraph.java
        UndirectedGraph.java
        VertexFactory.java
        WeightedGraph.java
        alg
        AbstractPathElement.java
        AbstractPathElementList.java
        BellmanFordIterator.java
        BellmanFordPathElement.java
        BellmanFordShortestPath.java
        BiconnectivityInspector.java
        BlockCutpointGraph.java
        BreadthFirstSearch.java
        BronKerboschCliqueFinder.java
        CalculateStatistics.java
        ChromaticNumber.java
        ConnectivityInspector.java
        ConstrainedRandomWalkWithRestart.java
        CycleDetector.java
        DijkstraShortestPath.java
        DirectedNeighborIndex.java
        EdmondsKarpMaximumFlow.java
        EulerianCircuit.java
        FloydWarshallShortestPaths.java
        HamiltonianCycle.java
        KShortestPaths.java
        KShortestPathsIterator.java
        KruskalMinimumSpanningTree.java
        MetaPath.java
        MetaPathClas.java
        MetaPathClus.java
        NeighborIndex.java
        RankingPathElement.java
        RankingPathElementList.java
        SingularValueDecomposition.java
        StatVal.java
        StoerWagnerMinimumCut.java
        StrongConnectivityInspector.java
        TransitiveClosure.java
        Triangles.java
        VertexCovers.java
        drivers
        EdgeStats.java
        util
        UnionFind.java
        VertexDegreeComparator.java
        demo
        CompleteGraphDemo.java
        HelloJGraphT.java
        JGraphAdapterDemo.java
        PerformanceDemo.java
        TypedGraphDemo.java
        event
        ConnectedComponentTraversalEvent.java
        EdgeTraversalEvent.java
        GraphChangeEvent.java
        GraphEdgeChangeEvent.java
        GraphListener.java
        GraphVertexChangeEvent.java
        TraversalListener.java
        TraversalListenerAdapter.java
        VertexSetListener.java
        VertexTraversalEvent.java
        experimental
        GraphReader.java
        GraphSquare.java
        GraphTests.java
        PartiteRandomGraphGenerator.java
        RandomGraphHelper.java
        UniformRandomGraphGenerator.java
        alg
        ApproximationAlgorithm.java
        ExactAlgorithm.java
        IntArrayGraphAlgorithm.java
        color
        BrownBacktrackColoring.java
        GreedyColoring.java
        dag
        DirectedAcyclicGraph.java
        equivalence
        EquivalenceComparator.java
        EquivalenceComparatorChain.java
        EquivalenceComparatorChainBase.java
        EquivalenceSet.java
        EquivalenceSetCreator.java
        UniformEquivalenceComparator.java
        isomorphism
        AbstractExhaustiveIsomorphismInspector.java
        AdaptiveIsomorphismInspectorFactory.java
        EquivalenceIsomorphismInspector.java
        GraphIsomorphismInspector.java
        GraphOrdering.java
        IsomorphismRelation.java
        PermutationIsomorphismInspector.java
        VertexDegreeEquivalenceComparator.java
        permutation
        ArrayPermutationsIter.java
        CollectionPermutationIter.java
        CompoundPermutationIter.java
        IntegerPermutationIter.java
        PermutationFactory.java
        touchgraph
        SimpleTouchgraphApplet.java
        TouchgraphConverter.java
        TouchgraphPanel.java
        ext
        ComponentAttributeProvider.java
        DOTExporter.java
        EdgeNameProvider.java
        GmlExporter.java
        GraphMLExporter.java
        IntegerEdgeNameProvider.java
        IntegerNameProvider.java
        JGraphModelAdapter.java
        MatrixExporter.java
        StringEdgeNameProvider.java
        StringNameProvider.java
        VertexNameProvider.java
        VisioExporter.java
        extract
        Span.java
        StringSpan.java
        StringTokenization.java
        Tokenization.java
        generate
        CompleteBipartiteGraphGenerator.java
        CompleteGraphGenerator.java
        EmptyGraphGenerator.java
        ForestFireGraphGenerator.java
        GraphGenerator.java
        GridGraphGenerator.java
        HyperCubeGraphGenerator.java
        LinearGraphGenerator.java
        RandomGraphGenerator.java
        RingGraphGenerator.java
        ScaleFreeGraphGenerator.java
        StarGraphGenerator.java
        TypedNetworkGenerator.java
        WheelGraphGenerator.java
        graph
        AbstractBaseGraph.java
        AbstractGraph.java
        AsUndirectedGraph.java
        AsUnweightedDirectedGraph.java
        AsUnweightedGraph.java
        AsWeightedGraph.java
        ClassBasedEdgeFactory.java
        ClassBasedVertexFactory.java
        DefaultDirectedGraph.java
        DefaultDirectedWeightedGraph.java
        DefaultEdge.java
        DefaultGraphMapping.java
        DefaultListenableGraph.java
        DefaultWeightedEdge.java
        DirectedFeatureGraph.java
        DirectedGraphUnion.java
        DirectedMaskSubgraph.java
        DirectedMultigraph.java
        DirectedPseudograph.java
        DirectedSubgraph.java
        DirectedWeightedMultigraph.java
        DirectedWeightedSubgraph.java
        EdgeReversedGraph.java
        EdgeSetFactory.java
        GraphDelegator.java
        GraphPathImpl.java
        GraphUnion.java
        IntrusiveEdge.java
        ListenableDirectedGraph.java
        ListenableDirectedWeightedGraph.java
        ListenableUndirectedGraph.java
        ListenableUndirectedWeightedGraph.java
        MaskEdgeSet.java
        MaskFunctor.java
        MaskSubgraph.java
        MaskVertexSet.java
        Multigraph.java
        ParanoidGraph.java
        Pseudograph.java
        SimpleDirectedGraph.java
        SimpleDirectedWeightedGraph.java
        SimpleGraph.java
        SimpleWeightedGraph.java
        Subgraph.java
        TypedEdge.java
        TypedSimpleGraph.java
        UndirectedGraphUnion.java
        UndirectedMaskSubgraph.java
        UndirectedSubgraph.java
        UndirectedWeightedSubgraph.java
        UnmodifiableDirectedGraph.java
        UnmodifiableGraph.java
        UnmodifiableUndirectedGraph.java
        WeightedMultigraph.java
        WeightedPseudograph.java
        load
        CharSequence2TokenSequence.java
        CharSequenceRemoveHTML.java
        EmptyInstanceIterator.java
        FeatureSequence2AugmentableFeatureVector.java
        IDToSource.java
        Input2CharSequence.java
        LineIterator.java
        LoadFromFeatureGraph.java
        LoadFromHBase.java
        NameToName.java
        Noop.java
        Pipe.java
        PrintInputAndTarget.java
        SaveDataInSource.java
        SerialPipes.java
        Target2Label.java
        TokenSequence2FeatureSequence.java
        TokenSequence2FeatureSequenceWithBigrams.java
        TokenSequenceLowercase.java
        TokenSequenceNGrams.java
        TokenSequenceRemoveNonAlpha.java
        TokenSequenceRemoveStopwords.java
        hdtm
        EvaluateHDTMResults.java
        HierachicalDocTopicModel.java
        io
        ArnetMiner.java
        DBLP.java
        Dot.java
        EdgeList.java
        FeatureGraph.java
        FileHandler.java
        KDDCup2013.java
        NINALogger.java
        PrintStatistics.java
        Save.java
        WikiHBaseToCatGraph.java
        WikidumpToHbase.java
        math
        Dirichlet.java
        GammaFunction.java
        LinearAlgebra.java
        LogisticRegressionFit.java
        LogisticRegressionPrediction.java
        Moment.java
        Numerical.java
        Randoms.java
        ReferenceCount.java
        snap
        agm
        AGM.java
        AGMFit.java
        AGMUtil.java
        cascades
        CascadeStatistics.java
        Cascades.java
        forestfire
        ForestFire.java
        structs
        Pair.java
        Triple.java
        traverse
        AbstractGraphIterator.java
        BreadthFirstIterator.java
        ClosestFirstIterator.java
        CrossComponentIterator.java
        DepthFirstIterator.java
        GraphIterator.java
        TopologicalOrderIterator.java
        types
        Alphabet.java
        AlphabetCarrying.java
        AugmentableFeatureVector.java
        ConstantMatrix.java
        DenseMatrix.java
        DenseVector.java
        FeatureConjunction.java
        FeatureSelection.java
        FeatureSequence.java
        FeatureSequenceWithBigrams.java
        FeatureVector.java
        IndexedSparseVector.java
        Instance.java
        InstanceList.java
        Label.java
        LabelAlphabet.java
        LabelVector.java
        Labeling.java
        Matrix.java
        Multinomial.java
        PropertyHolder.java
        RankedFeatureVector.java
        Sequence.java
        SingleInstanceIterator.java
        SparseVector.java
        Token.java
        TokenSequence.java
        dblp
        Author.java
        Paper.java
        Term.java
        Venue.java
        Year.java
        kddcup2013
        Affiliation.java
        Author.java
        AuthorAlsoKnownAs.java
        Confirmed.java
        Deleted.java
        Paper.java
        Term.java
        Venue.java
        VenueWebPage.java
        Year.java
        util
        ArrayUnenforcedSet.java
        CharSequenceLexer.java
        FibonacciHeap.java
        FibonacciHeapNode.java
        GraphUtil.java
        Lexer.java
        MathUtil.java
        MatrixOps.java
        ModifiableInteger.java
        Plot.java
        PrefetchIterator.java
        PropertyList.java
        TypeUtil.java
        ValueSorter.java
        VertexPair.java
        WeightCombiner.java
- test
  - edu
    - nd
      - nina
        AllTests.java
        EnhancedTestCase.java
        alg
        AllAlgTests.java
        BellmanFordShortestPathTest.java
        BiconnectedGraph.java
        BiconnectivityInspectorTest.java
        BlockCutpointGraphTest.java
        BronKerboschCliqueFinderTest.java
        ChromaticNumberTest.java
        ConnectivityInspectorTest.java
        CycleDetectorTest.java
        DijkstraShortestPathTest.java
        EdmondsKarpMaximumFlowTest.java
        EulerianCircuitTest.java
        FloydWarshallShortestPathsTest.java
        HamiltonianCycleTest.java
        KSPDiscardsValidPathsTest.java
        KSPExampleGraph.java
        KSPExampleTest.java
        KShortestPathCompleteGraph4.java
        KShortestPathCompleteGraph5.java
        KShortestPathCompleteGraph6.java
        KShortestPathCostTest.java
        KShortestPathKValuesTest.java
        KruskalMinimumSpanningTreeTest.java
        NeighborIndexTest.java
        NotBiconnectedGraph.java
        Picture1Graph.java
        ShortestPathTestCase.java
        StoerWagnerMinimumCutTest.java
        TransitiveClosureTest.java
        VertexCoversTest.java
        util
        AllAlgUtilTests.java
        UnionFindTest.java
        experimental
        GraphReaderTest.java
        alg
        ColoringTest.java
        dag
        DirectedAcyclicGraphTest.java
        equivalence
        EquivalenceGroupCreatorTest.java
        isomorphism
        EdgeTopologyCompare.java
        IntegerVertexFactory.java
        IsomorphismInspectorTest.java
        comparators
        DirectedEdgeWeightOddEvenComparator.java
        Mod3GroupComparator.java
        OddEvenGroupComparator.java
        permutation
        CompoundPermutationIterTest.java
        ext
        DOTExporterTest.java
        GmlExporterTest.java
        GraphMLExporterTest.java
        MatrixExporterTest.java
        generate
        AllGenerateTests.java
        GraphGeneratorTest.java
        RandomGraphGeneratorTest.java
        graph
        AllGraphTests.java
        AsUndirectedGraphTest.java
        AsUnweightedGraphTest.java
        AsWeightedGraphTest.java
        CloneTest.java
        DefaultDirectedGraphTest.java
        GenericGraphsTest.java
        ListenableGraphTest.java
        SerializationTest.java
        SimpleDirectedGraphTest.java
        SubgraphTest.java
        traverse
        AbstractGraphIteratorTest.java
        AllTraverseTests.java
        BreadthFirstIteratorTest.java
        ClosestFirstIteratorTest.java
        DepthFirstIteratorTest.java
        IgnoreDirectionTest.java
        TopologicalOrderIteratorTest.java
        util
        AllUtilTests.java
        FibonacciHeapTest.java
        PrefetchIteratorTest.java
        StopWatch.java

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package edu.nd.nina.graph.load;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Locale;

import edu.nd.nina.types.FeatureSequenceWithBigrams;
import edu.nd.nina.types.Instance;
import edu.nd.nina.types.Token;
import edu.nd.nina.types.TokenSequence;

/**
 * Remove tokens from the token sequence in the data field whose text is in the
 * stopword list.
 * <p>
 * When the pipe is case-insensitive, only the token text is lowercased before
 * the lookup; stoplist entries are stored exactly as given. Entries containing
 * upper-case characters therefore only ever match in case-sensitive mode.
 * 
 * @author Andrew McCallum <a
 *         href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

public class TokenSequenceRemoveStopwords extends Pipe {
	// xxx Use a gnu.trove collection instead
	/** Words whose tokens are dropped by {@link #pipe(Instance)}. */
	HashSet<String> stoplist = null;
	/** If false, token text is lowercased before the stoplist lookup. */
	boolean caseSensitive = true;
	/**
	 * If true, the text of a removed token is recorded as a property on the
	 * closest preceding token that was kept.
	 */
	boolean markDeletions = false;

	/** Build a fresh, mutable set holding every entry of {@link #stopwords}. */
	private HashSet<String> newDefaultStopList() {
		HashSet<String> sl = new HashSet<String>();
		for (String word : stopwords) {
			sl.add(word);
		}
		return sl;
	}

	/**
	 * Create a pipe that uses the default English stoplist.
	 * 
	 * @param caseSensitive
	 *            Whether token text is compared to the stoplist as-is (true)
	 *            or after lowercasing (false)
	 * @param markDeletions
	 *            Whether removed tokens are recorded on the preceding kept
	 *            token
	 */
	public TokenSequenceRemoveStopwords(boolean caseSensitive,
			boolean markDeletions) {
		stoplist = newDefaultStopList();
		this.caseSensitive = caseSensitive;
		this.markDeletions = markDeletions;
	}

	/**
	 * Create a pipe that uses the default English stoplist and does not mark
	 * deletions.
	 * 
	 * @param caseSensitive
	 *            Whether token text is compared to the stoplist as-is (true)
	 *            or after lowercasing (false)
	 */
	public TokenSequenceRemoveStopwords(boolean caseSensitive) {
		this(caseSensitive, false);
	}

	/**
	 * Create a case-insensitive pipe that uses the default English stoplist
	 * and does not mark deletions.
	 */
	public TokenSequenceRemoveStopwords() {
		this(false);
	}

	/**
	 * Load a stoplist from a file.
	 * 
	 * @param stoplistFile
	 *            The file to load
	 * @param encoding
	 *            The encoding of the stoplist file (eg UTF-8)
	 * @param includeDefault
	 *            Whether to include the standard mallet English stoplist
	 * @param caseSensitive
	 *            Whether token text is compared to the stoplist as-is (true)
	 *            or after lowercasing (false)
	 * @param markDeletions
	 *            Whether removed tokens are recorded on the preceding kept
	 *            token
	 * @throws IllegalArgumentException
	 *             if the file cannot be read or the encoding is not supported
	 */
	public TokenSequenceRemoveStopwords(File stoplistFile, String encoding,
			boolean includeDefault, boolean caseSensitive, boolean markDeletions) {
		if (!includeDefault) {
			stoplist = new HashSet<String>();
		} else {
			stoplist = newDefaultStopList();
		}

		addStopWords(fileToStringArray(stoplistFile, encoding));

		this.caseSensitive = caseSensitive;
		this.markDeletions = markDeletions;
	}

	/**
	 * Switch case-sensitive matching on or off.
	 * 
	 * @return this pipe, to allow call chaining
	 */
	public TokenSequenceRemoveStopwords setCaseSensitive(boolean flag) {
		this.caseSensitive = flag;
		return this;
	}

	/**
	 * Switch recording of removed tokens on or off.
	 * 
	 * @return this pipe, to allow call chaining
	 */
	public TokenSequenceRemoveStopwords setMarkDeletions(boolean flag) {
		this.markDeletions = flag;
		return this;
	}

	/**
	 * Add every element of "words" to the stoplist.
	 * 
	 * @return this pipe, to allow call chaining
	 */
	public TokenSequenceRemoveStopwords addStopWords(String[] words) {
		for (String word : words) {
			stoplist.add(word);
		}
		return this;
	}

	/**
	 * Remove every element of "words" from the stoplist.
	 * 
	 * @return this pipe, to allow call chaining
	 */
	public TokenSequenceRemoveStopwords removeStopWords(String[] words) {
		for (String word : words) {
			stoplist.remove(word);
		}
		return this;
	}

	/**
	 * Remove whitespace-separated tokens in file "wordlist" from the stoplist.
	 * The file is read with the platform default encoding. A null file is
	 * ignored, matching {@link #addStopWords(File)}.
	 * 
	 * @return this pipe, to allow call chaining
	 * @throws IllegalArgumentException
	 *             if the file cannot be read
	 */
	public TokenSequenceRemoveStopwords removeStopWords(File wordlist) {
		if (wordlist != null) {
			this.removeStopWords(fileToStringArray(wordlist, null));
		}
		return this;
	}

	/**
	 * Add whitespace-separated tokens in file "wordlist" to the stoplist. The
	 * file is read with the platform default encoding. A null file is
	 * ignored.
	 * 
	 * @return this pipe, to allow call chaining
	 * @throws IllegalArgumentException
	 *             if the file cannot be read
	 */
	public TokenSequenceRemoveStopwords addStopWords(File wordlist) {
		if (wordlist != null) {
			this.addStopWords(fileToStringArray(wordlist, null));
		}
		return this;
	}

	/**
	 * Read a file and return its whitespace-separated words in file order.
	 * 
	 * @param f
	 *            The file to read
	 * @param encoding
	 *            Charset name used to decode the file, or null to use the
	 *            platform default
	 * @return the non-empty words found in the file
	 * @throws IllegalArgumentException
	 *             if the file cannot be read or the encoding is not
	 *             supported; the underlying IOException is kept as the cause
	 */
	private String[] fileToStringArray(File f, String encoding) {
		ArrayList<String> wordarray = new ArrayList<String>();

		// Tracked separately so the stream is still closed when the
		// InputStreamReader constructor rejects the encoding.
		FileInputStream rawStream = null;
		BufferedReader input = null;
		try {
			if (encoding == null) {
				input = new BufferedReader(new FileReader(f));
			} else {
				rawStream = new FileInputStream(f);
				input = new BufferedReader(new InputStreamReader(rawStream,
						encoding));
			}

			String line;
			while ((line = input.readLine()) != null) {
				for (String word : line.split("\\s+")) {
					// split() yields "" for blank lines and for lines that
					// start with whitespace; those are not words.
					if (word.length() > 0) {
						wordarray.add(word);
					}
				}
			}
		} catch (IOException e) {
			throw new IllegalArgumentException("Trouble reading file " + f, e);
		} finally {
			try {
				if (input != null) {
					// Closing the reader also closes the wrapped stream.
					input.close();
				} else if (rawStream != null) {
					rawStream.close();
				}
			} catch (IOException ignored) {
				// Nothing useful can be done about a failed close; either
				// the words were already read or an earlier exception is
				// propagating.
			}
		}
		return wordarray.toArray(new String[wordarray.size()]);
	}

	/**
	 * Replace the carrier's TokenSequence with a new one that omits every
	 * token whose text is in the stoplist. Kept Token objects are reused, not
	 * copied.
	 * 
	 * @param carrier
	 *            Instance whose data field holds a TokenSequence
	 * @return the same carrier, with its data field replaced
	 */
	public Instance pipe(Instance carrier) {
		TokenSequence ts = (TokenSequence) carrier.getData();
		// xxx This doesn't seem so efficient. Perhaps have TokenSequence
		// use a LinkedList, and remove Tokens from it? -?
		// But a LinkedList implementation of TokenSequence would be quite
		// inefficient -AKM
		TokenSequence ret = new TokenSequence();
		Token prevToken = null;
		for (int i = 0; i < ts.size(); i++) {
			Token t = ts.get(i);
			String text = t.getText();
			// Locale.ROOT keeps the lookup independent of the JVM's default
			// locale (e.g. under a Turkish locale "I" would otherwise
			// lowercase to a dotless i and miss the stoplist).
			String key = caseSensitive ? text : text.toLowerCase(Locale.ROOT);
			if (!stoplist.contains(key)) {
				// xxx Should we instead make and add a copy of the Token?
				ret.add(t);
				prevToken = t;
			} else if (markDeletions && prevToken != null) {
				// Only the most recently removed token is remembered: a
				// later removal overwrites the property on the same
				// preceding token.
				prevToken.setProperty(FeatureSequenceWithBigrams.deletionMark,
						text);
			}
		}
		carrier.setData(ret);
		return carrier;
	}


	/** Default English stoplist; copied into each new pipe's stoplist set. */
	static final String[] stopwords = { "a", "able", "about", "above",
			"according", "accordingly", "across", "actually", "after",
			"afterwards", "again", "against", "all", "allow", "allows",
			"almost", "alone", "along", "already", "also", "although",
			"always", "am", "among", "amongst", "an", "and", "another", "any",
			"anybody", "anyhow", "anyone", "anything", "anyway", "anyways",
			"anywhere", "apart", "appear", "appreciate", "appropriate", "are",
			"around", "as", "aside", "ask", "asking", "associated", "at",
			"available", "away", "awfully", "b", "be", "became", "because",
			"become", "becomes", "becoming", "been", "before", "beforehand",
			"behind", "being", "believe", "below", "beside", "besides", "best",
			"better", "between", "beyond", "both", "brief", "but", "by", "c",
			"came", "can", "cannot", "cant", "cause", "causes", "certain",
			"certainly", "changes", "clearly", "co", "com", "come", "comes",
			"concerning", "consequently", "consider", "considering", "contain",
			"containing", "contains", "corresponding", "could", "course",
			"currently", "d", "definitely", "described", "despite", "did",
			"different", "do", "does", "doing", "done", "down", "downwards",
			"during", "e", "each", "edu", "eg", "eight", "either", "else",
			"elsewhere", "enough", "entirely", "especially", "et", "etc",
			"even", "ever", "every", "everybody", "everyone", "everything",
			"everywhere", "ex", "exactly", "example", "except", "f", "far",
			"few", "fifth", "first", "five", "followed", "following",
			"follows", "for", "former", "formerly", "forth", "four", "from",
			"further", "furthermore", "g", "get", "gets", "getting", "given",
			"gives", "go", "goes", "going", "gone", "got", "gotten",
			"greetings", "h", "had", "happens", "hardly", "has", "have",
			"having", "he", "hello", "help", "hence", "her", "here",
			"hereafter", "hereby", "herein", "hereupon", "hers", "herself",
			"hi", "him", "himself", "his", "hither", "hopefully", "how",
			"howbeit", "however", "i", "ie", "if", "ignored", "immediate",
			"in", "inasmuch", "inc", "indeed", "indicate", "indicated",
			"indicates", "inner", "insofar", "instead", "into", "inward", "is",
			"it", "its", "itself", "j", "just", "k", "keep", "keeps", "kept",
			"know", "knows", "known", "l", "last", "lately", "later", "latter",
			"latterly", "least", "less", "lest", "let", "like", "liked",
			"likely", "little", "look", "looking", "looks", "ltd", "m",
			"mainly", "many", "may", "maybe", "me", "mean", "meanwhile",
			"merely", "might", "more", "moreover", "most", "mostly", "much",
			"must", "my", "myself", "n", "name", "namely", "nd", "near",
			"nearly", "necessary", "need", "needs", "neither", "never",
			"nevertheless", "new", "next", "nine", "no", "nobody", "non",
			"none", "noone", "nor", "normally", "not", "nothing", "novel",
			"now", "nowhere", "o", "obviously", "of", "off", "often", "oh",
			"ok", "okay", "old", "on", "once", "one", "ones", "only", "onto",
			"or", "other", "others", "otherwise", "ought", "our", "ours",
			"ourselves", "out", "outside", "over", "overall", "own", "p",
			"particular", "particularly", "per", "perhaps", "placed", "please",
			"plus", "possible", "presumably", "probably", "provides", "q",
			"que", "quite", "qv", "r", "rather", "rd", "re", "really",
			"reasonably", "regarding", "regardless", "regards", "relatively",
			"respectively", "right", "s", "said", "same", "saw", "say",
			"saying", "says", "second", "secondly", "see", "seeing", "seem",
			"seemed", "seeming", "seems", "seen", "self", "selves", "sensible",
			"sent", "serious", "seriously", "seven", "several", "shall", "she",
			"should", "since", "six", "so", "some", "somebody", "somehow",
			"someone", "something", "sometime", "sometimes", "somewhat",
			"somewhere", "soon", "sorry", "specified", "specify", "specifying",
			"still", "sub", "such", "sup", "sure", "t", "take", "taken",
			"tell", "tends", "th", "than", "thank", "thanks", "thanx", "that",
			"thats", "the", "their", "theirs", "them", "themselves", "then",
			"thence", "there", "thereafter", "thereby", "therefore", "therein",
			"theres", "thereupon", "these", "they", "think", "third", "this",
			"thorough", "thoroughly", "those", "though", "three", "through",
			"throughout", "thru", "thus", "to", "together", "too", "took",
			"toward", "towards", "tried", "tries", "truly", "try", "trying",
			"twice", "two", "u", "un", "under", "unfortunately", "unless",
			"unlikely", "until", "unto", "up", "upon", "us", "use", "used",
			"useful", "uses", "using", "usually", "uucp", "v", "value",
			"various", "very", "via", "viz", "vs", "w", "want", "wants", "was",
			"way", "we", "welcome", "well", "went", "were", "what", "whatever",
			"when", "whence", "whenever", "where", "whereafter", "whereas",
			"whereby", "wherein", "whereupon", "wherever", "whether", "which",
			"while", "whither", "who", "whoever", "whole", "whom", "whose",
			"why", "will", "willing", "wish", "with", "within", "without",
			"wonder", "would", "would", "x", "y", "yes", "yet", "you", "your",
			"yours", "yourself", "yourselves", "z", "zero",
	// stop words for paper abstracts
	// "abstract",
	// "paper",
	// "presents",
	// "discuss",
	// "discusses",
	// "conclude",
	// "concludes",
	// "based",
	// "approach"
	};
	// stopwords for french, added by Limin Yao
	// Not loaded by any code in this class; the default stoplist is built
	// from "stopwords" only.
	static final String[] stopwordsFrench = { "fut", "S", "ces", "ral", "new",
			"tr", "arm", "y", "autres", "o", "tait", "dont", "ann", "apr",
			"sous", "ans", "cette", "politique", "of", "c", "contre", "leur",
			"ville", "fait", "res", "on", "deux", "cle", "v", "publique",
			"france", "te", "guerre", "sident", "unis", "mais", "entre",
			"aussi", "tat", "ais", "ses", "sa", "ont", "tre", "d", "pays",
			"en", "Il", "tats", "comme", "am", "si", "c", "fran", "pas", "g",
			"qu", "R", "aux", "ce", "f", "p", "ne", "son", "me", "avec", "l",
			"se", "ou", "sont", "il", "Les", "re", "plus", "m", "es", "pr",
			"la", "sur", "que", "pour", "modifier", "a", "qui", "Le", "t", "n",
			"au", "dans", "une", "par", "un", "r", "est", "e", "du", "s",
			"les", "en", "des", "le", "et", "l", "d", "la", "de",

	};

}