LoadFromHBase.java example

Explorer

nina-master
- src
  - edu
    - nd
      - nina
        DirectedGraph.java
        EdgeFactory.java
        Graph.java
        GraphHelper.java
        GraphMapping.java
        GraphPath.java
        Graphs.java
        ListenableGraph.java
        Type.java
        TypedGraph.java
        UndirectedGraph.java
        VertexFactory.java
        WeightedGraph.java
        alg
        AbstractPathElement.java
        AbstractPathElementList.java
        BellmanFordIterator.java
        BellmanFordPathElement.java
        BellmanFordShortestPath.java
        BiconnectivityInspector.java
        BlockCutpointGraph.java
        BreadthFirstSearch.java
        BronKerboschCliqueFinder.java
        CalculateStatistics.java
        ChromaticNumber.java
        ConnectivityInspector.java
        ConstrainedRandomWalkWithRestart.java
        CycleDetector.java
        DijkstraShortestPath.java
        DirectedNeighborIndex.java
        EdmondsKarpMaximumFlow.java
        EulerianCircuit.java
        FloydWarshallShortestPaths.java
        HamiltonianCycle.java
        KShortestPaths.java
        KShortestPathsIterator.java
        KruskalMinimumSpanningTree.java
        MetaPath.java
        MetaPathClas.java
        MetaPathClus.java
        NeighborIndex.java
        RankingPathElement.java
        RankingPathElementList.java
        SingularValueDecomposition.java
        StatVal.java
        StoerWagnerMinimumCut.java
        StrongConnectivityInspector.java
        TransitiveClosure.java
        Triangles.java
        VertexCovers.java
        drivers
        EdgeStats.java
        util
        UnionFind.java
        VertexDegreeComparator.java
        demo
        CompleteGraphDemo.java
        HelloJGraphT.java
        JGraphAdapterDemo.java
        PerformanceDemo.java
        TypedGraphDemo.java
        event
        ConnectedComponentTraversalEvent.java
        EdgeTraversalEvent.java
        GraphChangeEvent.java
        GraphEdgeChangeEvent.java
        GraphListener.java
        GraphVertexChangeEvent.java
        TraversalListener.java
        TraversalListenerAdapter.java
        VertexSetListener.java
        VertexTraversalEvent.java
        experimental
        GraphReader.java
        GraphSquare.java
        GraphTests.java
        PartiteRandomGraphGenerator.java
        RandomGraphHelper.java
        UniformRandomGraphGenerator.java
        alg
        ApproximationAlgorithm.java
        ExactAlgorithm.java
        IntArrayGraphAlgorithm.java
        color
        BrownBacktrackColoring.java
        GreedyColoring.java
        dag
        DirectedAcyclicGraph.java
        equivalence
        EquivalenceComparator.java
        EquivalenceComparatorChain.java
        EquivalenceComparatorChainBase.java
        EquivalenceSet.java
        EquivalenceSetCreator.java
        UniformEquivalenceComparator.java
        isomorphism
        AbstractExhaustiveIsomorphismInspector.java
        AdaptiveIsomorphismInspectorFactory.java
        EquivalenceIsomorphismInspector.java
        GraphIsomorphismInspector.java
        GraphOrdering.java
        IsomorphismRelation.java
        PermutationIsomorphismInspector.java
        VertexDegreeEquivalenceComparator.java
        permutation
        ArrayPermutationsIter.java
        CollectionPermutationIter.java
        CompoundPermutationIter.java
        IntegerPermutationIter.java
        PermutationFactory.java
        touchgraph
        SimpleTouchgraphApplet.java
        TouchgraphConverter.java
        TouchgraphPanel.java
        ext
        ComponentAttributeProvider.java
        DOTExporter.java
        EdgeNameProvider.java
        GmlExporter.java
        GraphMLExporter.java
        IntegerEdgeNameProvider.java
        IntegerNameProvider.java
        JGraphModelAdapter.java
        MatrixExporter.java
        StringEdgeNameProvider.java
        StringNameProvider.java
        VertexNameProvider.java
        VisioExporter.java
        extract
        Span.java
        StringSpan.java
        StringTokenization.java
        Tokenization.java
        generate
        CompleteBipartiteGraphGenerator.java
        CompleteGraphGenerator.java
        EmptyGraphGenerator.java
        ForestFireGraphGenerator.java
        GraphGenerator.java
        GridGraphGenerator.java
        HyperCubeGraphGenerator.java
        LinearGraphGenerator.java
        RandomGraphGenerator.java
        RingGraphGenerator.java
        ScaleFreeGraphGenerator.java
        StarGraphGenerator.java
        TypedNetworkGenerator.java
        WheelGraphGenerator.java
        graph
        AbstractBaseGraph.java
        AbstractGraph.java
        AsUndirectedGraph.java
        AsUnweightedDirectedGraph.java
        AsUnweightedGraph.java
        AsWeightedGraph.java
        ClassBasedEdgeFactory.java
        ClassBasedVertexFactory.java
        DefaultDirectedGraph.java
        DefaultDirectedWeightedGraph.java
        DefaultEdge.java
        DefaultGraphMapping.java
        DefaultListenableGraph.java
        DefaultWeightedEdge.java
        DirectedFeatureGraph.java
        DirectedGraphUnion.java
        DirectedMaskSubgraph.java
        DirectedMultigraph.java
        DirectedPseudograph.java
        DirectedSubgraph.java
        DirectedWeightedMultigraph.java
        DirectedWeightedSubgraph.java
        EdgeReversedGraph.java
        EdgeSetFactory.java
        GraphDelegator.java
        GraphPathImpl.java
        GraphUnion.java
        IntrusiveEdge.java
        ListenableDirectedGraph.java
        ListenableDirectedWeightedGraph.java
        ListenableUndirectedGraph.java
        ListenableUndirectedWeightedGraph.java
        MaskEdgeSet.java
        MaskFunctor.java
        MaskSubgraph.java
        MaskVertexSet.java
        Multigraph.java
        ParanoidGraph.java
        Pseudograph.java
        SimpleDirectedGraph.java
        SimpleDirectedWeightedGraph.java
        SimpleGraph.java
        SimpleWeightedGraph.java
        Subgraph.java
        TypedEdge.java
        TypedSimpleGraph.java
        UndirectedGraphUnion.java
        UndirectedMaskSubgraph.java
        UndirectedSubgraph.java
        UndirectedWeightedSubgraph.java
        UnmodifiableDirectedGraph.java
        UnmodifiableGraph.java
        UnmodifiableUndirectedGraph.java
        WeightedMultigraph.java
        WeightedPseudograph.java
        load
        CharSequence2TokenSequence.java
        CharSequenceRemoveHTML.java
        EmptyInstanceIterator.java
        FeatureSequence2AugmentableFeatureVector.java
        IDToSource.java
        Input2CharSequence.java
        LineIterator.java
        LoadFromFeatureGraph.java
        LoadFromHBase.java
        NameToName.java
        Noop.java
        Pipe.java
        PrintInputAndTarget.java
        SaveDataInSource.java
        SerialPipes.java
        Target2Label.java
        TokenSequence2FeatureSequence.java
        TokenSequence2FeatureSequenceWithBigrams.java
        TokenSequenceLowercase.java
        TokenSequenceNGrams.java
        TokenSequenceRemoveNonAlpha.java
        TokenSequenceRemoveStopwords.java
        hdtm
        EvaluateHDTMResults.java
        HierachicalDocTopicModel.java
        io
        ArnetMiner.java
        DBLP.java
        Dot.java
        EdgeList.java
        FeatureGraph.java
        FileHandler.java
        KDDCup2013.java
        NINALogger.java
        PrintStatistics.java
        Save.java
        WikiHBaseToCatGraph.java
        WikidumpToHbase.java
        math
        Dirichlet.java
        GammaFunction.java
        LinearAlgebra.java
        LogisticRegressionFit.java
        LogisticRegressionPrediction.java
        Moment.java
        Numerical.java
        Randoms.java
        ReferenceCount.java
        snap
        agm
        AGM.java
        AGMFit.java
        AGMUtil.java
        cascades
        CascadeStatistics.java
        Cascades.java
        forestfire
        ForestFire.java
        structs
        Pair.java
        Triple.java
        traverse
        AbstractGraphIterator.java
        BreadthFirstIterator.java
        ClosestFirstIterator.java
        CrossComponentIterator.java
        DepthFirstIterator.java
        GraphIterator.java
        TopologicalOrderIterator.java
        types
        Alphabet.java
        AlphabetCarrying.java
        AugmentableFeatureVector.java
        ConstantMatrix.java
        DenseMatrix.java
        DenseVector.java
        FeatureConjunction.java
        FeatureSelection.java
        FeatureSequence.java
        FeatureSequenceWithBigrams.java
        FeatureVector.java
        IndexedSparseVector.java
        Instance.java
        InstanceList.java
        Label.java
        LabelAlphabet.java
        LabelVector.java
        Labeling.java
        Matrix.java
        Multinomial.java
        PropertyHolder.java
        RankedFeatureVector.java
        Sequence.java
        SingleInstanceIterator.java
        SparseVector.java
        Token.java
        TokenSequence.java
        dblp
        Author.java
        Paper.java
        Term.java
        Venue.java
        Year.java
        kddcup2013
        Affiliation.java
        Author.java
        AuthorAlsoKnownAs.java
        Confirmed.java
        Deleted.java
        Paper.java
        Term.java
        Venue.java
        VenueWebPage.java
        Year.java
        util
        ArrayUnenforcedSet.java
        CharSequenceLexer.java
        FibonacciHeap.java
        FibonacciHeapNode.java
        GraphUtil.java
        Lexer.java
        MathUtil.java
        MatrixOps.java
        ModifiableInteger.java
        Plot.java
        PrefetchIterator.java
        PropertyList.java
        TypeUtil.java
        ValueSorter.java
        VertexPair.java
        WeightCombiner.java
- test
  - edu
    - nd
      - nina
        AllTests.java
        EnhancedTestCase.java
        alg
        AllAlgTests.java
        BellmanFordShortestPathTest.java
        BiconnectedGraph.java
        BiconnectivityInspectorTest.java
        BlockCutpointGraphTest.java
        BronKerboschCliqueFinderTest.java
        ChromaticNumberTest.java
        ConnectivityInspectorTest.java
        CycleDetectorTest.java
        DijkstraShortestPathTest.java
        EdmondsKarpMaximumFlowTest.java
        EulerianCircuitTest.java
        FloydWarshallShortestPathsTest.java
        HamiltonianCycleTest.java
        KSPDiscardsValidPathsTest.java
        KSPExampleGraph.java
        KSPExampleTest.java
        KShortestPathCompleteGraph4.java
        KShortestPathCompleteGraph5.java
        KShortestPathCompleteGraph6.java
        KShortestPathCostTest.java
        KShortestPathKValuesTest.java
        KruskalMinimumSpanningTreeTest.java
        NeighborIndexTest.java
        NotBiconnectedGraph.java
        Picture1Graph.java
        ShortestPathTestCase.java
        StoerWagnerMinimumCutTest.java
        TransitiveClosureTest.java
        VertexCoversTest.java
        util
        AllAlgUtilTests.java
        UnionFindTest.java
        experimental
        GraphReaderTest.java
        alg
        ColoringTest.java
        dag
        DirectedAcyclicGraphTest.java
        equivalence
        EquivalenceGroupCreatorTest.java
        isomorphism
        EdgeTopologyCompare.java
        IntegerVertexFactory.java
        IsomorphismInspectorTest.java
        comparators
        DirectedEdgeWeightOddEvenComparator.java
        Mod3GroupComparator.java
        OddEvenGroupComparator.java
        permutation
        CompoundPermutationIterTest.java
        ext
        DOTExporterTest.java
        GmlExporterTest.java
        GraphMLExporterTest.java
        MatrixExporterTest.java
        generate
        AllGenerateTests.java
        GraphGeneratorTest.java
        RandomGraphGeneratorTest.java
        graph
        AllGraphTests.java
        AsUndirectedGraphTest.java
        AsUnweightedGraphTest.java
        AsWeightedGraphTest.java
        CloneTest.java
        DefaultDirectedGraphTest.java
        GenericGraphsTest.java
        ListenableGraphTest.java
        SerializationTest.java
        SimpleDirectedGraphTest.java
        SubgraphTest.java
        traverse
        AbstractGraphIteratorTest.java
        AllTraverseTests.java
        BreadthFirstIteratorTest.java
        ClosestFirstIteratorTest.java
        DepthFirstIteratorTest.java
        IgnoreDirectionTest.java
        TopologicalOrderIteratorTest.java
        util
        AllUtilTests.java
        FibonacciHeapTest.java
        PrefetchIteratorTest.java
        StopWatch.java

package edu.nd.nina.graph.load;

import java.io.File;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import edu.nd.nina.util.CharSequenceLexer;

/**
 * Builds the fixed text-import pipeline used by this loader.
 *
 * <p>
 * All import options are compile-time settings held in the private constants
 * below; {@link #createPipe()} reads them and assembles the matching sequence
 * of {@code Pipe} stages.
 */
public class LoadFromHBase {
	// ---------------------------------------------
	// Import options. These are fixed in code; the wording of several
	// comments below is inherited from an option-driven variant of this
	// pipeline.
	// ---------------------------------------------

	// If true, do not force all strings to lowercase.
	private static final boolean preserveCase = false;

	// If true, remove a default list of common English "stop words" from
	// the text.
	private static final boolean removeStopWords = false;

	// Instead of the default list, read stop words from this file. A non-null
	// value enables stop-word removal even if removeStopWords is false.
	private static final File stoplistFile = null;

	// Extra stop words, added to whichever stop list is in use (the default
	// English list or the one read from stoplistFile).
	private static final File extraStopwordsFile = null;

	// If true, remove text occurring inside <...>, as in HTML or SGML.
	private static final boolean skipHtml = false;

	// If true, features will be binary.
	private static final boolean binaryFeatures = false;

	// Include among the features all n-grams of the sizes specified. For
	// example, { 1, 2 } yields all unigrams and bigrams. The n-gram stage is
	// added after stop-word removal, and only when this differs from { 1 }.
	private static final int[] gramSizes = { 1 };

	// If true, final data will be a FeatureSequence rather than a
	// FeatureVector.
	private static final boolean keepSequence = false;

	// If true, final data will be a FeatureSequenceWithBigrams rather than
	// a FeatureVector. When set, tokenRegex is ignored (see
	// buildTokenPattern()).
	private static final boolean keepSequenceBigrams = true;

	// Character encoding for the input. Uses the canonical charset name
	// (name()) rather than displayName(), which is a human-readable,
	// locale-dependent label and is not guaranteed to be a valid charset
	// identifier.
	private static final String encoding = Charset.defaultCharset().name();

	// Regular expression used for tokenization. Example:
	// "[\\p{L}\\p{N}_]+|[\\p{P}]+" (unicode letters, numbers and
	// underscore OR all punctuation)
	private static final String tokenRegex = CharSequenceLexer.LEX_ALPHA
			.toString();

	// If true, print a representation of the processed data to standard
	// output. This option is intended for debugging.
	private static final boolean printOutput = false;

	/**
	 * Assembles the import pipeline described by the option constants of this
	 * class.
	 *
	 * <p>
	 * Stages are appended in this order: target-to-label conversion, input to
	 * character sequence (using {@code encoding}), optional HTML removal,
	 * tokenization, optional lowercasing, optional non-alpha token removal
	 * (bigram mode only), optional stop-word removal, optional n-gram
	 * expansion, token-to-feature conversion, optional conversion to a feature
	 * vector, and optional debug printing.
	 *
	 * @return a {@code SerialPipes} wrapping the assembled stages
	 * @throws IllegalArgumentException
	 *             if {@code tokenRegex} has to be compiled and is not a valid
	 *             regular expression; the original
	 *             {@link PatternSyntaxException} is attached as the cause
	 */
	public static Pipe createPipe() {
		// Stages are collected here and handed to SerialPipes at the end.
		ArrayList<Pipe> stages = new ArrayList<Pipe>();

		// Convert the "target" object into a numeric index
		// into a LabelAlphabet.
		stages.add(new Target2Label());

		// Note: the SaveDataInSource stage (copy "data" to "source") is
		// intentionally not part of this pipeline.

		// Turn "data" into a character sequence using the configured encoding.
		stages.add(new Input2CharSequence(encoding));

		// Remove HTML tags. Suitable for SGML and XML.
		if (skipHtml) {
			stages.add(new CharSequenceRemoveHTML());
		}

		// Tokenize the input.
		stages.add(new CharSequence2TokenSequence(buildTokenPattern()));

		// Note: the IDToSource stage (first token is the ID, moved to
		// "source") is intentionally not part of this pipeline.

		if (!preserveCase) {
			stages.add(new TokenSequenceLowercase());
		}

		if (keepSequenceBigrams) {
			// Remove non-word tokens, but record the fact that they
			// were there.
			stages.add(new TokenSequenceRemoveNonAlpha(true));
		}

		// Stop-word removal, if configured.
		TokenSequenceRemoveStopwords stopwordFilter = buildStopwordFilter();
		if (stopwordFilter != null) {
			stages.add(stopwordFilter);
		}

		// gramSizes is an integer array, with default value [1].
		// Only add the n-gram stage for a non-default value.
		boolean unigramsOnly = gramSizes.length == 1 && gramSizes[0] == 1;
		if (!unigramsOnly) {
			stages.add(new TokenSequenceNGrams(gramSizes));
		}

		// So far we have a sequence of Token objects that contain
		// String values. Look these up in an alphabet and store integer IDs
		// ("features") instead of Strings.
		if (keepSequenceBigrams) {
			stages.add(new TokenSequence2FeatureSequenceWithBigrams());
		} else {
			stages.add(new TokenSequence2FeatureSequence());
		}

		// For many applications, we do not need to preserve the sequence of
		// features, only the number of times a feature occurs.
		if (!(keepSequence || keepSequenceBigrams)) {
			stages.add(new FeatureSequence2AugmentableFeatureVector(
					binaryFeatures));
		}

		if (printOutput) {
			stages.add(new PrintInputAndTarget());
		}

		return new SerialPipes(stages);
	}

	/**
	 * Selects the tokenization pattern: a fixed lexer pattern in bigram mode,
	 * otherwise the compiled {@code tokenRegex}.
	 */
	private static Pattern buildTokenPattern() {
		if (keepSequenceBigrams) {
			// We do not want to record bigrams across punctuation,
			// so we need to keep non-word tokens.
			return CharSequenceLexer.LEX_NONWHITESPACE_CLASSES;
		}

		try {
			return Pattern.compile(tokenRegex);
		} catch (PatternSyntaxException pse) {
			// Keep the original exception as the cause so the failing
			// position in the pattern is not lost.
			throw new IllegalArgumentException(
					"The token regular expression (" + tokenRegex
							+ ") was invalid: " + pse.getMessage(), pse);
		}
	}

	/**
	 * Creates the stop-word filter selected by the options, or returns
	 * {@code null} when no stop-word removal is configured.
	 */
	private static TokenSequenceRemoveStopwords buildStopwordFilter() {
		TokenSequenceRemoveStopwords stopwordFilter;

		if (stoplistFile != null) {
			// A custom list was configured: use it instead of the default.
			stopwordFilter = new TokenSequenceRemoveStopwords(stoplistFile,
					encoding, false, // don't include default list
					false, keepSequenceBigrams);
		} else if (removeStopWords) {
			// No custom list: use the default built-in English list.
			stopwordFilter = new TokenSequenceRemoveStopwords(false,
					keepSequenceBigrams);
		} else {
			return null;
		}

		// Either list may be extended with extra words.
		if (extraStopwordsFile != null) {
			stopwordFilter.addStopWords(extraStopwordsFile);
		}

		return stopwordFilter;
	}
}