PreprocessingContext.java example

Explorer

carrot2-master
- applications
  - carrot2-benchmarks
    - src-test
      - org
        carrot2
        core
        ControllerOverheadBenchmark.java
        benchmarks
        memtime
        BasicPreprocessing.java
        CompletePreprocessing.java
        MemTimeBenchmark.java
        OpenSourceAlgorithmsBenchmark.java
        PreprocessingBenchmark.java
  - carrot2-cli
    - src
      - org
        carrot2
        cli
        batch
        BatchApp.java
  - carrot2-dcs
    - examples
      - java
        src
        main
        java
        org
        carrot2
        dcs
        Examples.java
        HttpClientPostProvider.java
        IHttpMultipartPostProvider.java
        JaxRsPostProvider.java
        StreamUtils.java
    - src
      - org
        carrot2
        dcs
        DcsApp.java
        DcsConfig.java
        DcsRequestModel.java
        MemoryFileItemFactory.java
        RestProcessorServlet.java
    - src-test
      - org
        carrot2
        dcs
        AuthConnectionTest.java
        DcsAppTest.java
  - carrot2-examples
    - examples
      - org
        carrot2
        examples
        ConsoleFormatter.java
        CreateLuceneIndex.java
        SampleDocumentData.java
        clustering
        BingKeyAccess.java
        ClusteringDataFromDocumentSources.java
        ClusteringDataFromLucene.java
        ClusteringDataFromLuceneWithCustomFields.java
        ClusteringDataFromPubMed.java
        ClusteringDocumentList.java
        ClusteringNonEnglishContent.java
        MoreConfigurationsOfOneAlgorithmInCachingController.java
        UsingAttributes.java
        UsingCachingController.java
        UsingComponentSuites.java
        UsingCustomLanguageModel.java
        UsingCustomLexicalResources.java
        core
        LoadingAttributeValuesFromXml.java
        SavingAttributeValuesToXml.java
        SavingResultsToJson.java
        SavingResultsToXml.java
        research
        ClusteringQualityBenchmark.java
        source
        ByFirstTitleLetterClusteringAlgorithm.java
        ExampleCustomComponents.java
        ModuloDocumentSource.java
  - carrot2-webapp
    - src
      - org
        carrot2
        webapp
        LogInitContextListener.java
        QueryProcessorServlet.java
        RootRedirectFilter.java
        WebApp.java
        filter
        FarFutureExpiresHeaderFilter.java
        QueryWordHighlighter.java
        jawr
        JawrUrlGenerator.java
        model
        AssetUrlsModel.java
        AttributeMetadataModel.java
        ModelWithDefault.java
        PageModel.java
        RequestModel.java
        RequestType.java
        ResultsCacheModel.java
        ResultsSizeModel.java
        ResultsViewModel.java
        SkinModel.java
        WebappConfig.java
        util
        UserAgentUtils.java
    - src-test
      - org
        carrot2
        webapp
        ControlUnicodeCharacterTest.java
        filter
        QueryWordHighlighterTest.java
        util
        UserAgentUtilsTest.java
- core
  - carrot2-algorithm-kmeans
    - src
      - org
        carrot2
        clustering
        kmeans
        BisectingKMeansClusteringAlgorithm.java
        BisectingKMeansProcessingContext.java
    - src-test
      - org
        carrot2
        clustering
        kmeans
        BisectingKMeansClusteringAlgorithmTest.java
  - carrot2-algorithm-lingo
    - src
      - org
        carrot2
        clustering
        lingo
        ClusterBuilder.java
        IFeatureScorer.java
        ILabelAssigner.java
        LingoClusteringAlgorithm.java
        LingoProcessingContext.java
        SimpleLabelAssigner.java
        UniqueLabelAssigner.java
    - src-test
      - org
        carrot2
        clustering
        lingo
        ClusterDocumentAssignerTest.java
        ClusterLabelBuilderTest.java
        ClusterMergerTest.java
        LingoClusteringAlgorithmTest.java
        LingoProcessingComponentTestBase.java
  - carrot2-algorithm-stc
    - src
      - org
        carrot2
        clustering
        stc
        ClusterCandidate.java
        GeneralizedSuffixTree.java
        STCClusteringAlgorithm.java
        text
        suffixtree
        CharacterSequence.java
        ISequence.java
        IntegerSequence.java
        SuffixTree.java
        SuffixTreeBuilder.java
    - src-test
      - org
        carrot2
        clustering
        stc
        GeneralizedSuffixTreeTest.java
        STCClusteringAlgorithmTest.java
        text
        suffixtree
        SuffixTreeTest.java
  - carrot2-algorithm-synthetic
    - src
      - org
        carrot2
        clustering
        synthetic
        ByFieldClusteringAlgorithm.java
        ByUrlClusteringAlgorithm.java
        PassthroughClusteringAlgorithm.java
    - src-test
      - org
        carrot2
        clustering
        synthetic
        ByFieldClusteringAlgorithmTest.java
        ByUrlClusteringAlgorithmTest.java
        DocumentWithUrlsFactory.java
  - carrot2-component-suites
    - src-test
      - org
        carrot2
        core
        ComponentSuitesTest.java
  - carrot2-core
    - src
      - org
        carrot2
        core
        CachingProcessingComponentManager.java
        Cluster.java
        ComponentInitializationException.java
        Controller.java
        ControllerContextImpl.java
        ControllerContextListenerAdapter.java
        ControllerFactory.java
        ControllerStatistics.java
        ControllerUtils.java
        Document.java
        DocumentSourceDescriptor.java
        ExecutorServiceShutdownListener.java
        HttpAuthHub.java
        IClusteringAlgorithm.java
        IControllerContext.java
        IControllerContextListener.java
        IDocumentSource.java
        IProcessingComponent.java
        IProcessingComponentManager.java
        LanguageCode.java
        Platform.java
        PoolingProcessingComponentManager.java
        ProcessingComponentBase.java
        ProcessingComponentConfiguration.java
        ProcessingComponentDescriptor.java
        ProcessingComponentSuite.java
        ProcessingComponentSuiteInclude.java
        ProcessingException.java
        ProcessingResult.java
        ReferenceEquality.java
        SimpleProcessingComponentManager.java
        attribute
        AttributeNames.java
        CommonAttributes.java
        Init.java
        Internal.java
        InternalAttributePredicate.java
        Processing.java
        source
        MultipageSearchEngine.java
        MultipageSearchEngineMetadata.java
        SearchEngineBase.java
        SearchEngineResponse.java
        SearchEngineStats.java
        SimpleSearchEngine.java
        UniqueFieldPredicate.java
    - src-test
      - org
        carrot2
        core
        ClusterTest.java
        ControllerTest.java
        ControllerTestsBase.java
        ControllerTestsCaching.java
        ControllerTestsCommon.java
        ControllerTestsPooling.java
        DelegatingProcessingComponent.java
        DocumentTest.java
        DummyControllerContext.java
        ProcessingResultTest.java
        TestAlgorithm.java
        TestDocumentSource.java
        test
        Assertions.java
        ByteByteArrayAssert.java
        CharCharArrayAssert.java
        ClusteringAlgorithmTestBase.java
        DocumentSourceTestBase.java
        DoubleArrayAssert.java
        IntIntArrayAssert.java
        MultipageDocumentSourceTestBase.java
        ProcessingComponentTestBase.java
        QueryableDocumentSourceTestBase.java
        SampleDocumentData.java
        TestDocumentFactory.java
        assertions
        Carrot2CoreAssertions.java
        ClusterAssertion.java
        ClusterCheck.java
        ClusterListAssertion.java
        ClusterPairCheck.java
        DocumentAssertion.java
        DocumentListAssertion.java
        GenericListAssertion.java
        source
        SearchRangeTest.java
  - carrot2-output-metrics
    - src
      - org
        carrot2
        output
        metrics
        ClusteringMetricsCalculator.java
        ContaminationMetric.java
        IClusteringMetric.java
        IdealPartitioningBasedMetric.java
        NormalizedMutualInformationMetric.java
        PrecisionRecallMetric.java
    - src-test
      - org
        carrot2
        output
        metrics
        ContaminationMetricTest.java
        IdealPartitioningBasedMetricTest.java
        NormalizedMutualInformationMetricTest.java
        PrecisionRecallMetricTest.java
  - carrot2-source-ambient
    - src
      - org
        carrot2
        source
        ambient
        AmbientDocumentSource.java
        FubDocumentSource.java
        FubTestCollection.java
        Odp239DocumentSource.java
    - src-test
      - org
        carrot2
        source
        ambient
        AmbientDocumentSourceTest.java
        FubDocumentSourceTestBase.java
        Odp239DocumentSourceTest.java
  - carrot2-source-etools
    - src
      - org
        carrot2
        source
        etools
        EToolsDocumentSource.java
        IpBannedException.java
    - src-test
      - org
        carrot2
        source
        etools
        EToolsDocumentSourceTest.java
  - carrot2-source-idol
    - src
      - org
        carrot2
        source
        idol
        IdolDocumentSource.java
  - carrot2-source-lucene
    - src
      - org
        carrot2
        source
        lucene
        FSDirectoryWrapper.java
        IFieldMapper.java
        LuceneDocumentSource.java
        PlainTextFormatter.java
        SimpleFieldMapper.java
    - src-test
      - org
        carrot2
        source
        lucene
        FSDirectoryWrapperTest.java
        LuceneDocumentSourceTest.java
        LuceneIndexUtils.java
  - carrot2-source-microsoft
    - src
      - org
        carrot2
        source
        microsoft
        v5
        AdultOption.java
        Bing5DocumentSource.java
        Bing5NewsDocumentSource.java
        BingResponse.java
        ErrorResponse.java
        Freshness.java
        MarketOption.java
        NewsResponse.java
        SearchResponse.java
        SourceType.java
        UnstructuredResponse.java
    - src-test
      - org
        carrot2
        source
        microsoft
        v5
        Bing5DocumentSourceTest.java
        Bing5NewsDocumentSourceTest.java
        Bing5ResponseParsingTest.java
  - carrot2-source-opensearch
    - src
      - org
        carrot2
        source
        opensearch
        OpenSearchDocumentSource.java
        RomeFetcherUtils.java
    - src-test
      - org
        carrot2
        source
        opensearch
        OpenSearchDocumentSourceByResultIncrementTest.java
        OpenSearchDocumentSourceTest.java
  - carrot2-source-pubmed
    - src
      - org
        carrot2
        source
        pubmed
        EmptyEntityResolver.java
        PathTrackingHandler.java
        PubMedContentHandler.java
        PubMedDocumentSource.java
        PubMedIdSearchHandler.java
    - src-test
      - org
        carrot2
        source
        pubmed
        PubMedContentHandlerTest.java
        PubMedDocumentSourceTest.java
        PubMedIdSearchHandlerTest.java
  - carrot2-source-solr
    - src
      - org
        carrot2
        source
        solr
        SolrDocumentSource.java
  - carrot2-source-xml
    - src
      - org
        carrot2
        source
        xml
        RemoteXmlSimpleSearchEngineBase.java
        XmlDocumentSource.java
        XmlDocumentSourceHelper.java
    - src-test
      - org
        carrot2
        source
        xml
        XmlDocumentSourceTest.java
  - carrot2-util-common
    - src
      - org
        carrot2
        util
        CharArrayUtils.java
        CharSequenceUtils.java
        CloseableUtils.java
        CollectionUtils.java
        ExceptionUtils.java
        ExecutorServiceUtils.java
        GraphUtils.java
        IntArrayPredicateIterator.java
        IntMapUtils.java
        LinearApproximation.java
        ListUtils.java
        MapUtils.java
        MathUtils.java
        Pair.java
        PriorityQueue.java
        RangeUtils.java
        ReflectionUtils.java
        RollingWindowAverage.java
        SetUtils.java
        StreamUtils.java
        StringUtils.java
        SystemPropertyStack.java
        annotations
        AspectModified.java
        Immutable.java
        ThreadSafe.java
        attribute
        AttributeValueSet.java
        AttributeValueSets.java
        DefaultGroups.java
        factory
        CachedInstanceFactoryDecorator.java
        FallbackFactory.java
        IFactory.java
        NewClassInstanceFactory.java
        SingletonFactory.java
        httpclient
        HttpClientFactory.java
        HttpHeaders.java
        HttpRedirectStrategy.java
        HttpUtils.java
        pool
        FixedSizePool.java
        IActivationListener.java
        IDisposalListener.java
        IInstantiationListener.java
        IParameterizedPool.java
        IPassivationListener.java
        SoftUnboundedPool.java
        resource
        ClassLoaderLocator.java
        ClassLoaderResource.java
        ClassLocator.java
        ClassResource.java
        ContextClassLoaderLocator.java
        DirLocator.java
        FileResource.java
        IResource.java
        IResourceLocator.java
        PrefixDecoratorLocator.java
        ResourceCache.java
        ResourceLookup.java
        ServletContextLocator.java
        URLResource.java
        URLResourceWithParams.java
        simplexml
        DefaultConstructorSimpleXmlWrapper.java
        ISimpleXmlWrapper.java
        ISourceLocationAware.java
        ListSimpleXmlWrapper.java
        MapSimpleXmlWrapper.java
        PersisterHelpers.java
        SessionInitStrategy.java
        SimpleXmlWrapperValue.java
        SimpleXmlWrappers.java
        tests
        CarrotTestCase.java
        SuiteResultInfoWriter.java
        UsesExternalServices.java
        xslt
        NopURIResolver.java
        StylesheetErrorListener.java
        TemplatesPool.java
        TransformerErrorListener.java
    - src-test
      - org
        carrot2
        util
        CharArrayUtilsTest.java
        ExceptionUtilsTest.java
        GraphUtilsTest.java
        IndirectSorterTest.java
        IntArrayPredicateIteratorTest.java
        LinearApproximationTest.java
        RangeUtilsTest.java
        RollingWindowAverageTest.java
        StringUtilsTest.java
        attribute
        AttributeValueSetTest.java
        AttributeValueSetsTest.java
        ResourceFromStringTest.java
        httpclient
        HttpClientFactoryTest.java
        pool
        FixedSizePoolTest.java
        ParameterizedPoolTestBase.java
        SoftUnboundedPoolTest.java
        resource
        ResourceLookupTest.java
        ServletContextLocatorTest.java
        URLResourceWithParamsTest.java
        simplexml
        SimpleXmlWrappersTest.java
  - carrot2-util-log4j
    - src
      - org
        carrot2
        log4j
        BufferingAppender.java
  - carrot2-util-matrix
    - src
      - org
        carrot2
        mahout
        collections
        Arithmetic.java
        Constants.java
        common
        RandomUtils.java
        math
        AbstractMatrix.java
        AbstractVector.java
        Algebra.java
        Arrays.java
        CardinalityException.java
        DenseMatrix.java
        DenseVector.java
        IndexException.java
        Matrix.java
        MatrixSlice.java
        MatrixVectorView.java
        MatrixView.java
        OrderedIntDoubleMapping.java
        PersistentObject.java
        RandomAccessSparseVector.java
        SequentialAccessSparseVector.java
        SingularValueDecomposition.java
        Sorting.java
        Swapper.java
        Vector.java
        VectorIterable.java
        VectorView.java
        buffer
        DoubleBufferConsumer.java
        IntBufferConsumer.java
        function
        ByteComparator.java
        CharComparator.java
        DoubleComparator.java
        DoubleDoubleFunction.java
        DoubleFunction.java
        DoubleProcedure.java
        FloatComparator.java
        Functions.java
        IntComparator.java
        IntDoubleProcedure.java
        IntIntDoubleFunction.java
        IntProcedure.java
        LongComparator.java
        Mult.java
        PlusMult.java
        ShortComparator.java
        VectorFunction.java
        list
        AbstractDoubleList.java
        AbstractIntList.java
        AbstractList.java
        DoubleArrayList.java
        IntArrayList.java
        map
        AbstractIntDoubleMap.java
        HashFunctions.java
        OpenIntDoubleHashMap.java
        PrimeFinder.java
        matrix
        DoubleMatrix1D.java
        DoubleMatrix2D.java
        impl
        AbstractMatrix.java
        AbstractMatrix1D.java
        AbstractMatrix2D.java
        DelegateDoubleMatrix1D.java
        DenseDoubleMatrix1D.java
        DenseDoubleMatrix2D.java
        SelectedDenseDoubleMatrix1D.java
        SelectedDenseDoubleMatrix2D.java
        SelectedSparseDoubleMatrix1D.java
        SelectedSparseDoubleMatrix2D.java
        SparseDoubleMatrix1D.java
        SparseDoubleMatrix2D.java
        WrapperDoubleMatrix1D.java
        WrapperDoubleMatrix2D.java
        linalg
        EigenvalueDecomposition.java
        Property.java
        set
        AbstractSet.java
        matrix
        MatrixUtils.java
        factorization
        IIterativeMatrixFactorization.java
        IMatrixFactorization.java
        IMatrixFactorizationFactory.java
        IterationNumberGuesser.java
        IterativeMatrixFactorizationBase.java
        IterativeMatrixFactorizationFactory.java
        KMeansMatrixFactorization.java
        KMeansMatrixFactorizationFactory.java
        LocalNonnegativeMatrixFactorization.java
        LocalNonnegativeMatrixFactorizationFactory.java
        MatrixFactorizationBase.java
        NonnegativeMatrixFactorizationED.java
        NonnegativeMatrixFactorizationEDFactory.java
        NonnegativeMatrixFactorizationKL.java
        NonnegativeMatrixFactorizationKLFactory.java
        PartialSingularValueDecomposition.java
        PartialSingularValueDecompositionFactory.java
        seeding
        ISeedingStrategy.java
        ISeedingStrategyFactory.java
        KMeansSeedingStrategy.java
        KMeansSeedingStrategyFactory.java
        RandomSeedingStrategy.java
        RandomSeedingStrategyFactory.java
    - src-test
      - org
        carrot2
        matrix
        DoubleMatrix1DAssertion.java
        DoubleMatrix2DAssertion.java
        MatrixAssertions.java
        MatrixUtilsTest.java
        factorization
        MatrixFactorizationTest.java
  - carrot2-util-text
    - src
      - org
        carrot2
        text
        analysis
        ExtendedWhitespaceTokenizer.java
        ExtendedWhitespaceTokenizerImpl.java
        ITokenizer.java
        TokenTypeUtils.java
        clustering
        IMonolingualClusteringAlgorithm.java
        MultilingualClustering.java
        linguistic
        DefaultLexicalData.java
        DefaultLexicalDataFactory.java
        DefaultStemmerFactory.java
        DefaultTokenizerFactory.java
        ILexicalData.java
        ILexicalDataFactory.java
        IStemmer.java
        IStemmerFactory.java
        ITokenizerFactory.java
        IdentityStemmer.java
        IdentityStemmerFactory.java
        JapaneseUnsupportedStub.java
        LanguageModel.java
        LexicalDataLoader.java
        SnowballStemmerAdapter.java
        lucene
        ArabicStemmerAdapter.java
        ChineseTokenizerAdapter.java
        HindiNormalizer.java
        HindiStemmer.java
        HindiStemmerAdapter.java
        IndicNormalizer.java
        StemmerUtil.java
        ThaiTokenizerAdapter.java
        morfologik
        MorfologikStemmerAdapter.java
        snowball
        Among.java
        SnowballProgram.java
        stemmers
        DanishStemmer.java
        DutchStemmer.java
        EnglishStemmer.java
        FinnishStemmer.java
        FrenchStemmer.java
        GermanStemmer.java
        HungarianStemmer.java
        ItalianStemmer.java
        NorwegianStemmer.java
        PortugueseStemmer.java
        RomanianStemmer.java
        RussianStemmer.java
        SpanishStemmer.java
        SwedishStemmer.java
        TurkishStemmer.java
        preprocessing
        CaseNormalizer.java
        DocumentAssigner.java
        LabelFilterProcessor.java
        LabelFormatter.java
        LanguageModelStemmer.java
        PhraseExtractor.java
        PreprocessedDocumentScanner.java
        PreprocessingContext.java
        SparseArray.java
        StopListMarker.java
        Substring.java
        SubstringComparator.java
        SuffixSorter.java
        Tokenizer.java
        filter
        CompleteLabelFilter.java
        CompleteLabelFilterBase.java
        GenitiveLabelFilter.java
        ILabelFilter.java
        LeftCompleteLabelFilter.java
        MinLengthLabelFilter.java
        NumericLabelFilter.java
        QueryLabelFilter.java
        RightCompleteLabelFilter.java
        SingleLabelFilterBase.java
        StopLabelFilter.java
        StopWordLabelFilter.java
        pipeline
        BasicPreprocessingPipeline.java
        CompletePreprocessingPipeline.java
        IPreprocessingPipeline.java
        util
        CharArrayComparators.java
        MutableCharArray.java
        MutableCharArrayUtils.java
        TabularOutput.java
        vsm
        ITermWeighting.java
        LinearTfIdfTermWeighting.java
        LogTfIdfTermWeighting.java
        ReducedVectorSpaceModelContext.java
        TermDocumentMatrixBuilder.java
        TermDocumentMatrixReducer.java
        TfTermWeighting.java
        VectorSpaceModelContext.java
    - src-test
      - org
        carrot2
        text
        clustering
        MultilingualClusteringTest.java
        linguistic
        ArabicTest.java
        ChineseTokenizerTest.java
        DefaultLexicalDataFactoryTest.java
        DefaultStemmerFactoryTest.java
        DefaultTokenizerFactoryTest.java
        EnglishTest.java
        ExtendedWhitespaceTokenizerTest.java
        HindiStemmerFactoryTest.java
        LanguageModelTest.java
        LanguageModelTestBase.java
        PolishTest.java
        ThaiTokenizerTest.java
        TokenizerTestBase.java
        preprocessing
        CaseNormalizerTest.java
        DocumentAssignerTest.java
        LabelFilterTestBase.java
        LabelFormatterTest.java
        PhraseExtractorTest.java
        PreprocessedDocumentScannerTest.java
        PreprocessingComponentTestBase.java
        PreprocessingContextAssert.java
        PreprocessingContextBuilder.java
        PreprocessingContextTestBase.java
        StemmerEnglishTest.java
        StemmerSyntheticTest.java
        SubstringComparatorTest.java
        SuffixSorterTest.java
        TestLanguageModelFactory.java
        TestLexicalDataFactory.java
        TestStemmerFactory.java
        TestTokenizerFactory.java
        TokenizerTest.java
        WordMarkerTest.java
        filter
        CompleteLabelFilterTest.java
        GenitiveLabelFilterTest.java
        MinLengthLabelFilterTest.java
        NumericLabelFilterTest.java
        QueryLabelFilterTest.java
        StopLabelFilterEnglishTest.java
        StopWordLabelFilterEnglishTest.java
        StopWordLabelFilterMergedTest.java
        StopWordLabelFilterSyntheticTest.java
        util
        CharArrayComparatorsTest.java
        MutableCharArrayTest.java
        MutableCharArrayUtilsTest.java
        vsm
        PhraseMatrixBuilderTest.java
        ReducedTermDocumentMatrixBuilderTestBase.java
        TermDocumentMatrixBuilderTest.java
        TermDocumentMatrixBuilderTestBase.java
  - carrot2-util-xsltfilter
    - src
      - org
        carrot2
        util
        xsltfilter
        AddHeaderFilter.java
        DeferredOutputStream.java
        IContentTypeListener.java
        TransformingDocumentHandler.java
        XSLTFilter.java
        XSLTFilterConstants.java
        XSLTFilterServletResponse.java
- doc
  - src
    - org
      - carrot2
        core
        ProcessingComponentDumper.java
- lib
  - org.carrot2.antlib
    - src
      - main
        java
        org
        carrot2
        antlib
        tasks
        AbstractLicenseTask.java
        FileURL.java
        FindVersionTask.java
        LicenseListTask.java
        LicenseReplaceTask.java
        SetPropertyTask.java
        SourceFile.java
        SwitchClassLoader.java
- workbench


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.preprocessing;

import java.io.StringWriter;
import java.util.Arrays;
import java.util.List;

import org.carrot2.core.Document;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.linguistic.IStemmer;
import org.carrot2.text.linguistic.LanguageModel;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.text.util.TabularOutput;

import com.carrotsearch.hppc.*;

/**
 * Document preprocessing context provides low-level (usually integer-coded) data
 * structures useful for further processing.
 * 
 * <p><img src="doc-files/preprocessing-arrays.png"
 *      alt="Internals of PreprocessingContext"></p>
 */
public final class PreprocessingContext
{
    /**
     * Text returned by the <code>toString()</code> methods of the nested data holders
     * when their arrays have not been populated yet.
     */
    private static final String UNINITIALIZED = "[uninitialized]\n";

    /** Query used to perform processing, may be <code>null</code>. */
    public final String query;

    /** A list of documents to process. */
    public final List<Document> documents;

    /** Language model to be used. */
    public final LanguageModel language;

    /**
     * Token interning cache. Token images are interned to save memory and allow reference
     * comparisons. The cache is consulted by {@link #intern(MutableCharArray)} and set to
     * <code>null</code> by {@link #preprocessingFinished()}.
     */
    private ObjectHashSet<MutableCharArray> tokenCache = new ObjectHashSet<>();

    /**
     * Creates a preprocessing context for the given documents, language model and query.
     * The arguments are stored as passed; no copies are made.
     *
     * @param languageModel language model, exposed as {@link #language}
     * @param documents documents to process, exposed as {@link #documents}
     * @param query query used to perform processing, exposed as {@link #query}; may be
     *        <code>null</code>
     */
    public PreprocessingContext(LanguageModel languageModel, List<Document> documents,
        String query)
    {
        this.language = languageModel;
        this.documents = documents;
        this.query = query;
    }

    /**
     * Information about all tokens of the input {@link PreprocessingContext#documents}.
     * Each element of each of the arrays corresponds to one individual token from the
     * input or a synthetic separator inserted between documents, fields and sentences.
     * Last element of this array is a special terminator entry.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllTokens
    {
        /**
         * Token image as it appears in the input. On positions where {@link #type} is
         * equal to one of {@link ITokenizer#TF_TERMINATOR},
         * {@link ITokenizer#TF_SEPARATOR_DOCUMENT} or
         * {@link ITokenizer#TF_SEPARATOR_FIELD} , image is <code>null</code>.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public char [][] image;

        /**
         * Token's {@link ITokenizer} bit flags.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public short [] type;

        /**
         * Document field the token came from. The index points to arrays in
         * {@link AllFields}, equal to <code>-1</code> for document and field separators.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public byte [] fieldIndex;

        /**
         * Index of the document this token came from, points to elements of
         * {@link PreprocessingContext#documents}. Equal to <code>-1</code> for document
         * separators.
         * <p>
         * This array is produced by {@link Tokenizer}.
         * </p>
         * <p>
         * This array is accessed in {@link CaseNormalizer} and {@link PhraseExtractor}
         * to compute by-document statistics, e.g. tf-by document, which are then needed
         * to build a VSM or assign documents to labels. An alternative to this representation
         * would be creating an <code>AllDocuments</code> holder and keep there an array
         * of start token indexes for each document and then refactor the model building code
         * to do a binary search to determine the document index given token index. This is
         * likely to be a significant performance hit because model building code accesses 
         * the documentIndex array pretty much randomly (in the suffix order), so we'd be
         * doing twice-the-number-of-tokens binary searches. Unless there's some other
         * data structure that can help us here.
         * </p>
         */
        public int [] documentIndex;

        /**
         * A pointer to {@link AllWords} arrays for this token. Equal to <code>-1</code>
         * for document, field and {@link ITokenizer#TT_PUNCTUATION} tokens (including
         * sentence separators).
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public int [] wordIndex;

        /**
         * The suffix order of tokens. Suffixes starting with a separator come at the end
         * of the array.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [] suffixOrder;

        /**
         * The Longest Common Prefix for the adjacent suffix-sorted token sequences.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [] lcp;

        /**
         * Renders the token arrays as text tables, for debugging purposes.
         * <p>
         * Returns the "uninitialized" marker when {@link #image} is <code>null</code>.
         * The word-related columns are emitted only when {@link #wordIndex} is available,
         * and the suffix table is emitted only when both {@link #suffixOrder} and
         * {@link #wordIndex} are available.
         *
         * @return textual dump of the token table, optionally followed by the suffix
         *         table
         */
        @Override
        public String toString()
        {
            if (image == null)
            {
                return UNINITIALIZED;
            }

            // wordIndex is filled in by a later stage than image, so it may still be null
            // when this method is called while debugging a partially built context.
            final boolean hasWordIndex = wordIndex != null;

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);

            t.addColumn("#");
            t.addColumn("token").alignLeft();
            t.addColumn("type");
            t.addColumn("fieldIndex");
            t.addColumn("=>field").alignLeft();
            t.addColumn("docIdx");
            if (hasWordIndex)
            {
                t.addColumn("wordIdx");
                t.addColumn("=>word").alignLeft();
            }

            for (int i = 0; i < image.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    image[i] == null ? "<null>" : new String(image[i]),
                    type[i],
                    fieldIndex[i],
                    fieldIndex[i] >= 0 ? allFields.name[fieldIndex[i]] : null,
                    documentIndex[i]);

                if (hasWordIndex)
                {
                    t.rowData(
                        wordIndex[i],
                        wordIndex[i] >= 0 ? new String(allWords.image[wordIndex[i]]) : null);
                }
            }

            // Flush the token table before anything else is written to sw. Previously
            // the table was replaced by the suffix table without being flushed, so
            // (NOTE(review): assuming rows are held until flush(), as
            // flushEvery(Integer.MAX_VALUE) suggests) the token rows were lost whenever
            // suffixOrder was present.
            t.flush();

            if (suffixOrder != null && hasWordIndex)
            {
                sw.append("\n");

                final TabularOutput suffixes = new TabularOutput(sw);
                suffixes.addColumn("#");
                suffixes.addColumn("sa");
                suffixes.addColumn("lcp");
                suffixes.addColumn("=>words").alignLeft();

                // Number of tokens shown for each suffix.
                final int windowLength = 5;
                final StringBuilder suffixImage = new StringBuilder();
                for (int i = 0; i < suffixOrder.length; i++, suffixes.nextRow())
                {
                    suffixes.rowData(
                        i,
                        suffixOrder[i],
                        lcp[i]);

                    final int start = suffixOrder[i];
                    final int end = Math.min(start + windowLength, wordIndex.length);
                    for (int j = start; j < end; j++)
                    {
                        suffixImage.append(
                            wordIndex[j] >= 0 ? new String(allWords.image[wordIndex[j]]) : "|").append(" ");
                    }
                    // Mark suffixes that were cut off before the end of the token array.
                    if (end > start && end != wordIndex.length)
                    {
                        suffixImage.append(" [...]");
                    }
                    suffixes.rowData(suffixImage.toString());
                    suffixImage.setLength(0);
                }

                // Flush before appending the trailing newline so that the newline
                // follows the table instead of preceding its buffered rows.
                suffixes.flush();
                sw.append("\n");
            }

            sw.append("\n");
            return sw.toString();
        }
    }

    /**
     * Information about all tokens of the input {@link PreprocessingContext#documents}.
     */
    public final AllTokens allTokens = new AllTokens();

    /**
     * Information about all fields processed for the input
     * {@link PreprocessingContext#documents}.
     */
    public static class AllFields
    {
        /**
         * Name of the document field. Entries of {@link AllTokens#fieldIndex} point to
         * this array.
         * <p>
         * This array is produced by {@link Tokenizer}.
         */
        public String [] name;

        /**
         * Renders the field names as a two-column text table (index, name), for
         * debugging purposes. Returns the "uninitialized" marker when {@link #name} is
         * <code>null</code>.
         */
        @Override
        public String toString()
        {
            if (name == null)
            {
                return UNINITIALIZED;
            }

            final StringWriter out = new StringWriter();
            final TabularOutput table = new TabularOutput(out);
            table.flushEvery(Integer.MAX_VALUE);
            table.addColumn("#");
            table.addColumn("name").format("%-10s").alignLeft();

            for (int field = 0; field < name.length; field++)
            {
                table.rowData(field, name[field]).nextRow();
            }

            table.flush();
            out.append("\n");
            return out.toString();
        }
    }

    /**
     * Information about all fields processed for the input
     * {@link PreprocessingContext#documents}.
     */
    public final AllFields allFields = new AllFields();

    /**
     * Information about all unique words found in the input
     * {@link PreprocessingContext#documents}. An entry in each parallel array corresponds to one
     * conflated form of a word. For example, <em>data</em> and <em>DATA</em> will most likely become
     * a single entry in the words table. However, different grammatical forms of a single lemma
     * (like <em>computer</em> and <em>computers</em>) will have different entries in the
     * words table. See {@link AllStems} for inflection-conflated versions.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllWords
    {
        /**
         * The most frequently appearing variant of the word with respect to case. E.g. if
         * a token <em>MacOS</em> appeared 12 times in the input and <em>macos</em>
         * appeared 3 times, the image will be equal to <em>MacOS</em>.
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public char [][] image;

        /**
         * Token type of this word copied from {@link AllTokens#type}. Additional
         * flags are set for each word by
         * {@link CaseNormalizer} and {@link LanguageModelStemmer}.
         *
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         * This array is modified by {@link LanguageModelStemmer}.
         *
         * @see ITokenizer
         */
        public short [] type;

        /**
         * Term Frequency of the word, aggregated across all variants with respect to
         * case. Frequencies for each variant separately are not available.
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public int [] tf;

        /**
         * Term Frequency of the word for each document. The length of this array is equal
         * to the number of documents this word appeared in (Document Frequency)
         * multiplied by 2. Elements at even indices contain document indices pointing to
         * {@link PreprocessingContext#documents}, elements at odd indices contain the
         * frequency of the word in the document. For example, an array with 4 values:
         * <code>[2, 15, 138, 7]</code> means that the word appeared 15 times in document
         * at index 2 and 7 times in document at index 138.
         * <p>
         * This array is produced by {@link CaseNormalizer}. The order of documents in this
         * array is not defined.
         */
        public int [][] tfByDocument;

        /**
         * A pointer to the {@link AllStems} arrays for this word.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public int [] stemIndex;

        /**
         * Bit-packed indices of all fields in which this word appears at least once.
         * Indexes (positions) of selected bits are pointers to the
         * {@link AllFields} arrays. Fast conversion between the bit-packed representation
         * and an array of index values is done by
         * {@link PreprocessingContext#toFieldIndexes(byte)}.
         * <p>
         * This array is produced by {@link CaseNormalizer}.
         */
        public byte [] fieldIndices;

        /**
         * Renders the word arrays as a text table, for debugging purposes. Returns the
         * "uninitialized" marker when {@link #image} is <code>null</code>; the stem
         * columns are included only when {@link #stemIndex} is available.
         */
        @Override
        public String toString()
        {
            if (image == null)
            {
                return UNINITIALIZED;
            }

            final boolean withStems = (stemIndex != null);

            final StringWriter out = new StringWriter();
            final TabularOutput table = new TabularOutput(out);
            table.flushEvery(Integer.MAX_VALUE);
            table.addColumn("#");
            table.addColumn("image").alignLeft();
            table.addColumn("type");
            table.addColumn("tf");
            table.addColumn("tfByDocument").alignLeft();
            table.addColumn("fieldIndices");
            if (withStems)
            {
                table.addColumn("stemIndex");
                table.addColumn("=>stem").alignLeft();
            }

            for (int w = 0; w < image.length; w++)
            {
                final String wordImage = (image[w] != null) ? new String(image[w]) : "<null>";
                table.rowData(
                    w,
                    wordImage,
                    type[w],
                    tf[w],
                    SparseArray.sparseToString(tfByDocument[w]));

                final String fields = Arrays.toString(toFieldIndexes(fieldIndices[w]));
                table.rowData(fields.replace(" ", ""));

                if (withStems)
                {
                    final int stem = stemIndex[w];
                    table.rowData(stem);
                    table.rowData(new String(allStems.image[stem]));
                }

                table.nextRow();
            }

            table.flush();
            out.append("\n");
            return out.toString();
        }
    }

    /**
     * Information about all unique words found in the input
     * {@link PreprocessingContext#documents}.
     */
    public final AllWords allWords = new AllWords();

    /**
     * Information about all unique stems found in the input
     * {@link PreprocessingContext#documents}. Each entry in each array corresponds to one
     * base form different words can be transformed to by the {@link IStemmer} used while
     * processing. E.g. the English <em>mining</em> and <em>mine</em> will be aggregated
     * to one entry in the arrays, while they will have separate entries in
     * {@link AllWords}.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllStems
    {
        /**
         * Stem image as produced by the {@link IStemmer}, may not correspond to any
         * correct word.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public char [][] image;

        /**
         * Pointer to the {@link AllWords} arrays, to the most frequent original form of
         * the stem. Pointers to the less frequent variants are not available.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public int [] mostFrequentOriginalWordIndex;

        /**
         * Term frequency of the stem, i.e. the sum of all {@link AllWords#tf} values
         * for which the {@link AllWords#stemIndex} points to this stem.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public int [] tf;

        /**
         * Term frequency of the stem for each document. For the encoding of this array,
         * see {@link AllWords#tfByDocument}.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}. The order of documents in this
         * array is not defined.
         */
        public int [][] tfByDocument;

        /**
         * Bit-packed indices of all fields in which this stem appears at least once.
         * Indexes (positions) of selected bits are pointers to the
         * {@link AllFields} arrays. Fast conversion between the bit-packed representation
         * and an array of index values is done by
         * {@link PreprocessingContext#toFieldIndexes(byte)}.
         * <p>
         * This array is produced by {@link LanguageModelStemmer}.
         */
        public byte [] fieldIndices;

        /**
         * Renders the stem arrays as a text table, for debugging purposes. Returns the
         * "uninitialized" marker when {@link #image} is <code>null</code>.
         */
        @Override
        public String toString()
        {
            if (image == null)
            {
                return UNINITIALIZED;
            }

            final StringWriter out = new StringWriter();
            final TabularOutput table = new TabularOutput(out);
            table.flushEvery(Integer.MAX_VALUE);
            table.addColumn("#");
            table.addColumn("stem");
            table.addColumn("mostFrqWord");
            table.addColumn("=>mostFrqWord").alignLeft();
            table.addColumn("tf");
            table.addColumn("tfByDocument").alignLeft();
            table.addColumn("fieldIndices");

            for (int s = 0; s < image.length; s++)
            {
                final String stemImage = (image[s] != null) ? new String(image[s]) : "<null>";
                final int topWord = mostFrequentOriginalWordIndex[s];
                final String topWordImage = new String(allWords.image[topWord]);

                table.rowData(
                    s,
                    stemImage,
                    topWord,
                    topWordImage,
                    tf[s],
                    SparseArray.sparseToString(tfByDocument[s]),
                    Arrays.toString(toFieldIndexes(fieldIndices[s])).replace(" ", ""));
                table.nextRow();
            }

            table.flush();
            out.append("\n");
            return out.toString();
        }
    }

    /**
     * Information about all unique stems found in the input
     * {@link PreprocessingContext#documents}.
     */
    public final AllStems allStems = new AllStems();

    /**
     * Information about all frequently appearing sequences of words found in the input
     * {@link PreprocessingContext#documents}. Each entry in each array corresponds to one
     * sequence.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllPhrases
    {
        /**
         * Pointers to {@link AllWords} for each word in the phrase sequence.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [][] wordIndices;

        /**
         * Term frequency of the phrase.
         * <p>
         * This array is produced by {@link PhraseExtractor}.
         */
        public int [] tf;

        /**
         * Term frequency of the phrase for each document. The encoding of this
         * array is similar to {@link AllWords#tfByDocument}: consecutive pairs of:
         * document index, frequency.
         * <p>
         * This array is produced by {@link PhraseExtractor}. The order of documents in this
         * array is not defined.
         */
        public int [][] tfByDocument;

        /**
         * Renders the phrase arrays as a text table, for debugging purposes. Returns the
         * "uninitialized" marker when {@link #wordIndices} is <code>null</code>.
         */
        @Override
        public String toString()
        {
            if (wordIndices == null)
            {
                return UNINITIALIZED;
            }

            final StringWriter out = new StringWriter();
            final TabularOutput table = new TabularOutput(out);
            table.flushEvery(Integer.MAX_VALUE);
            table.addColumn("#");
            table.addColumn("wordIndices");
            table.addColumn("=>words").alignLeft();
            table.addColumn("tf");
            table.addColumn("tfByDocument").alignLeft();

            for (int p = 0; p < wordIndices.length; p++)
            {
                table.rowData(
                    p,
                    Arrays.toString(wordIndices[p]).replace(" ", ""),
                    getPhrase(p),
                    tf[p],
                    SparseArray.sparseToString(tfByDocument[p]));
                table.nextRow();
            }

            table.flush();
            out.append("\n");
            return out.toString();
        }

        /**
         * Returns space-separated words that constitute this phrase.
         *
         * @param index index of the phrase in the arrays of this class
         * @return images of the phrase's words, taken from {@link AllWords#image} and
         *         joined with single spaces
         */
        public CharSequence getPhrase(int index)
        {
            final int [] words = wordIndices[index];
            final StringBuilder phrase = new StringBuilder();
            for (int w = 0; w < words.length; w++)
            {
                if (w != 0)
                {
                    phrase.append(" ");
                }
                phrase.append(allWords.image[words[w]]);
            }
            return phrase;
        }

        /**
         * Returns length of all arrays in this {@link AllPhrases}.
         */
        public int size()
        {
            return wordIndices.length;
        }
    }

    /**
     * Information about all frequently appearing sequences of words found in the input
     * {@link PreprocessingContext#documents}.
     */
    public AllPhrases allPhrases = new AllPhrases();

    /**
     * Information about words and phrases that might be good cluster label candidates.
     * Each entry in each array corresponds to one label candidate.
     * <p>
     * All arrays in this class have the same length and values across different arrays
     * correspond to each other for the same index.
     */
    public class AllLabels
    {
        /**
         * Feature index of the label candidate. Features whose values are less than the
         * size of {@link AllWords} arrays are single word features and point to entries
         * in {@link AllWords}. Features whose values are larger or equal to the size of
         * {@link AllWords}, after subtracting the size of {@link AllWords}, point to
         * {@link AllPhrases}.
         * <p>
         * This array is produced by {@link LabelFilterProcessor}.
         */
        public int [] featureIndex;

        /**
         * Indices of documents assigned to the label candidate.
         * <p>
         * This array is produced by {@link DocumentAssigner}.
         */
        public BitSet [] documentIndices;

        /**
         * The first index in {@link #featureIndex} which 
         * points to {@link AllPhrases}, or -1 if there are no phrases
         * in {@link #featureIndex}.
         * <p>
         * This value is set by {@link LabelFilterProcessor}.
         * 
         * @see #featureIndex
         */
        public int firstPhraseIndex;
        
        /**
         * Renders the label candidates as a text table, for debugging purposes. Returns
         * the "uninitialized" marker when {@link #featureIndex} is <code>null</code>. The
         * document column is left empty when {@link #documentIndices} is not available.
         *
         * @return textual dump of the label candidates
         */
        @Override
        public String toString()
        {
            if (featureIndex == null)
            {
                return UNINITIALIZED;
            }

            StringWriter sw = new StringWriter();
            TabularOutput t = new TabularOutput(sw);
            t.flushEvery(Integer.MAX_VALUE);
            t.addColumn("#");
            t.addColumn("featureIdx");
            t.addColumn("=>feature").alignLeft();
            t.addColumn("documentIdx").alignLeft();

            for (int i = 0; i < featureIndex.length; i++, t.nextRow())
            {
                t.rowData(
                    i,
                    featureIndex[i],
                    getLabel(i),
                    documentIndices != null ? documentIndices[i].toString().replace(" ", "") : "");
            }

            t.flush();
            sw.append("\n");
            // The rendered text lives in the writer, not in the TabularOutput object;
            // this used to return t.toString(), unlike every other holder's toString().
            return sw.toString();
        }

        /**
         * Returns the text of the label candidate at <code>index</code>: the word image
         * when the feature points to {@link AllWords}, the space-separated phrase
         * otherwise.
         */
        private CharSequence getLabel(int index)
        {
            final int wordsSize = allWords.image.length;
            if (featureIndex[index] < wordsSize)
            {
                return new String(allWords.image[featureIndex[index]]);
            }
            else
            {
                // Phrase features are offset by the number of words.
                return allPhrases.getPhrase(featureIndex[index] - wordsSize);
            }
        }        
    }

    /**
     * Information about words and phrases that might be good cluster label candidates.
     */
    public final AllLabels allLabels = new AllLabels();

    /**
     * Returns <code>true</code> if this context contains any words.
     *
     * @return <code>true</code> when {@link AllWords#image} has been populated and is
     *         not empty; <code>false</code> when it is empty or still <code>null</code>
     */
    public boolean hasWords()
    {
        // Guard against the not-yet-populated state, consistently with hasLabels().
        return allWords.image != null && allWords.image.length > 0;
    }

    /**
     * Returns <code>true</code> if this context contains any label candidates.
     *
     * @return <code>true</code> when {@link AllLabels#featureIndex} is non-null and
     *         non-empty
     */
    public boolean hasLabels()
    {
        final int [] features = allLabels.featureIndex;
        if (features == null)
        {
            return false;
        }
        return features.length != 0;
    }

    /**
     * Concatenates the debugging dumps of all data holders of this context, each
     * preceded by a section header.
     */
    @Override
    public String toString()
    {
        final StringBuilder dump = new StringBuilder();
        dump.append("PreprocessingContext 0x");
        dump.append(Integer.toHexString(this.hashCode()));
        dump.append("\n");
        dump.append("== Fields:\n").append(this.allFields.toString());
        dump.append("== Tokens:\n").append(this.allTokens.toString());
        dump.append("== Words:\n").append(this.allWords.toString());
        dump.append("== Stems:\n").append(this.allStems.toString());
        dump.append("== Phrases:\n").append(this.allPhrases.toString());
        dump.append("== Labels:\n").append(this.allLabels.toString());
        return dump.toString();
    }
    
    /**
     * Lookup table for the conversion between selected bits and an array of indexes of
     * these bits. Entry <code>v</code> (0..255) holds, in ascending order, the positions
     * of the bits set in <code>v</code>.
     */
    private final static int [][] bitsCache;
    static
    {
        final int [][] table = new int [0x100][];
        for (int value = 0; value < table.length; value++)
        {
            final int [] positions = new int [Integer.bitCount(value)];
            int next = 0;
            for (int bit = 0; bit < Byte.SIZE; bit++)
            {
                if ((value & (1 << bit)) != 0)
                {
                    positions[next++] = bit;
                }
            }
            table[value] = positions;
        }
        bitsCache = table;
    }

    /**
     * Convert the selected bits in a byte to an array of indexes.
     *
     * @param b bit-packed value; it is treated as unsigned, so bit 7 is a regular index
     * @return positions of the bits set in <code>b</code>, in ascending order. The array
     *         comes straight from the shared lookup table and is not copied, so callers
     *         must not modify it.
     */
    public static int [] toFieldIndexes(byte b)
    {
        final int unsigned = b & 0xff;
        return bitsCache[unsigned];
    }

    /*
     * These should really be package-private, shouldn't they? We'd need to move classes
     * under pipeline here for accessibility.
     */

    /**
     * This method should be invoked after all preprocessing contributors have been
     * executed to release temporary data structures. It drops the reference to the token
     * interning cache, so {@link #intern(MutableCharArray)} must not be called afterwards.
     */
    public void preprocessingFinished()
    {
        tokenCache = null;
    }

    /**
     * Return a unique char buffer representing a given character sequence.
     * <p>
     * When an equal sequence is already present in the token cache, the buffer of the
     * cached entry is returned. Otherwise the characters of <code>chs</code> are copied
     * into a new array, which is added to the cache and returned.
     *
     * @param chs character sequence to intern; it is only read, never stored
     * @return the interned character array for <code>chs</code>
     */
    public char [] intern(MutableCharArray chs)
    {
        final int slot = tokenCache.indexOf(chs);
        if (!tokenCache.indexExists(slot))
        {
            // First occurrence: store a private, exactly-sized copy of the characters.
            final char [] copy = new char [chs.length()];
            System.arraycopy(chs.getBuffer(), chs.getStart(), copy, 0, chs.length());
            tokenCache.add(new MutableCharArray(copy));
            return copy;
        }

        return tokenCache.indexGet(slot).getBuffer();
    }
}