/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.clustering.stc;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.carrot2.clustering.stc.GeneralizedSuffixTree.SequenceBuilder;
import org.carrot2.core.Cluster;
import org.carrot2.core.Document;
import org.carrot2.core.IClusteringAlgorithm;
import org.carrot2.core.LanguageCode;
import org.carrot2.core.ProcessingComponentBase;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.AttributeNames;
import org.carrot2.core.attribute.CommonAttributes;
import org.carrot2.core.attribute.Init;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.text.clustering.IMonolingualClusteringAlgorithm;
import org.carrot2.text.clustering.MultilingualClustering;
import org.carrot2.text.linguistic.ILexicalData;
import org.carrot2.text.preprocessing.LabelFormatter;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.text.preprocessing.pipeline.BasicPreprocessingPipeline;
import org.carrot2.text.preprocessing.pipeline.IPreprocessingPipeline;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;
import org.carrot2.util.attribute.Output;
import org.carrot2.util.attribute.Required;
import org.carrot2.util.attribute.constraint.DoubleRange;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.attribute.constraint.IntRange;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.BitSetIterator;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntStack;

import org.carrot2.shaded.guava.common.base.Predicate;
import org.carrot2.shaded.guava.common.collect.Collections2;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.shaded.guava.common.collect.Maps;

/**
 * Suffix Tree Clustering (STC) algorithm. Pretty much as described in: <i>Oren Zamir,
 * Oren Etzioni, Grouper: A Dynamic Clustering Interface to Web Search Results, 1999.</i>
 * Some liberties were taken wherever STC's description was not clear enough or where we
 * thought some improvements could be made.
 */
@Bindable(prefix = "STCClusteringAlgorithm", inherit = CommonAttributes.class)
@Label("STC Clustering")
public final class STCClusteringAlgorithm extends ProcessingComponentBase implements
    IClusteringAlgorithm
{
    /** {@link Group} name. */
    private final static String BASE_CLUSTERS = "Base clusters";

    /** {@link Group} name. */
    private final static String MERGING_AND_OUTPUT = "Merging and output";
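    /*
     * Typical usage (an illustrative sketch only, assuming the standard Carrot2 3.x
     * Controller API; the controller classes are not referenced by this file):
     *
     *   final Controller controller = ControllerFactory.createSimple();
     *   final ProcessingResult result =
     *       controller.process(documents, "query", STCClusteringAlgorithm.class);
     *   final List<Cluster> clusters = result.getClusters();
     */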
    /**
     * Query that produced the documents. The query will help the algorithm to create
     * better clusters. Therefore, providing the query is optional but desirable.
     */
    @Processing
    @Input
    @Internal
    @Attribute(key = AttributeNames.QUERY, inherit = true)
    public String query = null;

    /**
     * Documents to cluster.
     */
    @Processing
    @Input
    @Required
    @Internal
    @Attribute(key = AttributeNames.DOCUMENTS, inherit = true)
    public List<Document> documents;

    /**
     * Clusters created by the algorithm.
     */
    @Processing
    @Output
    @Internal
    @Attribute(key = AttributeNames.CLUSTERS, inherit = true)
    public List<Cluster> clusters = null;

    /**
     * Minimum word-document recurrences. Words occurring in fewer documents than this
     * do not count towards the effective phrase length.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 2)
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.WORD_FILTERING)
    public int ignoreWordIfInFewerDocs = 2;

    /**
     * Maximum word-document ratio. A number between 0 and 1; if a word occurs in a
     * larger fraction of the input snippets than this ratio, it is ignored.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0, max = 1)
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.WORD_FILTERING)
    public double ignoreWordIfInHigherDocsPercent = 0.9d;

    /**
     * Minimum base cluster score.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0, max = 10)
    @Level(AttributeLevel.ADVANCED)
    @Group(BASE_CLUSTERS)
    public double minBaseClusterScore = 2.0d;

    /**
     * Maximum base clusters count. Trims the base cluster array after the N-th position
     * for the merging phase.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 2)
    @Level(AttributeLevel.ADVANCED)
    @Group(BASE_CLUSTERS)
    public int maxBaseClusters = 300;

    /**
     * Minimum documents per base cluster.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 2, max = 20)
    @Level(AttributeLevel.ADVANCED)
    @Group(BASE_CLUSTERS)
    public int minBaseClusterSize = 2;

    /**
     * Maximum final clusters.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 1)
    @Level(AttributeLevel.BASIC)
    @Group(MERGING_AND_OUTPUT)
    public int maxClusters = 15;

    /**
     * Base cluster merge threshold.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0, max = 1)
    @Level(AttributeLevel.ADVANCED)
    @Group(MERGING_AND_OUTPUT)
    public double mergeThreshold = 0.6d;

    /**
     * Maximum cluster phrase overlap.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0, max = 1)
    @Level(AttributeLevel.ADVANCED)
    @Group(DefaultGroups.LABELS)
    public double maxPhraseOverlap = 0.6d;

    /**
     * Minimum general phrase coverage. Minimum phrase coverage to appear in cluster
     * description.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0, max = 1)
    @Level(AttributeLevel.ADVANCED)
    @Group(DefaultGroups.LABELS)
    public double mostGeneralPhraseCoverage = 0.5d;

    /**
     * Maximum words per label. Base clusters formed by phrases with more words than
     * this limit are trimmed.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 1)
    @Level(AttributeLevel.BASIC)
    @Group(DefaultGroups.LABELS)
    public int maxDescPhraseLength = 4;

    /**
     * Maximum phrases per label. Maximum number of phrases from base clusters promoted
     * to the cluster's label.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 1)
    @Level(AttributeLevel.BASIC)
    @Group(DefaultGroups.LABELS)
    public int maxPhrases = 3;

    /**
     * Single term boost. A factor in calculation of the base cluster score. If greater
     * than zero, single-term base clusters are assigned this value regardless of the
     * penalty function.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0)
    @Level(AttributeLevel.MEDIUM)
    @Group(BASE_CLUSTERS)
    public double singleTermBoost = 0.5d;

    /**
     * Optimal label length. A factor in calculation of the base cluster score.
     */
    @Processing
    @Input
    @Attribute
    @IntRange(min = 1)
    @Level(AttributeLevel.BASIC)
    @Group(BASE_CLUSTERS)
    public int optimalPhraseLength = 3;
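    /*
     * The attributes above can also be set per-request through plain attribute maps
     * (an illustrative sketch, assuming the Carrot2 3.x Controller API; the key string
     * follows the @Bindable prefix declared on this class):
     *
     *   final Map<String, Object> attrs = new HashMap<String, Object>();
     *   attrs.put("STCClusteringAlgorithm.maxClusters", 20);
     *   attrs.put(AttributeNames.DOCUMENTS, documents);
     *   controller.process(attrs, STCClusteringAlgorithm.class);
     */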
    /**
     * Phrase length tolerance. A factor in calculation of the base cluster score.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0.5)
    @Level(AttributeLevel.MEDIUM)
    @Group(BASE_CLUSTERS)
    public double optimalPhraseLengthDev = 2.0d;

    /**
     * Document count boost. A factor in calculation of the base cluster score, boosting
     * the score depending on the number of documents found in the base cluster.
     */
    @Processing
    @Input
    @Attribute
    @DoubleRange(min = 0)
    @Level(AttributeLevel.MEDIUM)
    @Group(BASE_CLUSTERS)
    public double documentCountBoost = 1.0d;

    /**
     * Common preprocessing tasks handler.
     */
    @Init
    @Input
    @Attribute
    @Internal
    @ImplementingClasses(classes = { BasicPreprocessingPipeline.class }, strict = false)
    @Level(AttributeLevel.ADVANCED)
    public IPreprocessingPipeline preprocessingPipeline = new BasicPreprocessingPipeline();

    /**
     * Balance between cluster score and size during cluster sorting. Value equal to 0.0
     * will sort clusters based only on cluster size. Value equal to 1.0 will sort
     * clusters based only on cluster score.
     */
    @Input
    @Processing
    @Attribute
    @DoubleRange(min = 0.0, max = 1.0)
    @Label("Size-Score sorting ratio")
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.CLUSTERS)
    public double scoreWeight = 1.0;

    /**
     * Merge all stem-equivalent base clusters before running the merge phase.
     *
     * @see "http://issues.carrot2.org/browse/CARROT-1008"
     */
    @Input
    @Processing
    @Attribute
    @Label("Merge all stem-equivalent phrases when discovering base clusters")
    @Level(AttributeLevel.MEDIUM)
    @Group(DefaultGroups.CLUSTERS)
    public boolean mergeStemEquivalentBaseClusters = true;

    /**
     * A helper for performing multilingual clustering.
     */
    public final MultilingualClustering multilingualClustering = new MultilingualClustering();

    /**
     * Stores the preprocessing context during {@link #process()}.
     */
    PreprocessingContext context;

    /**
     * Suffix tree and suffix tree input during {@link #process()}.
     */
    GeneralizedSuffixTree.SequenceBuilder sb;

    /**
     * Helper class for computing merged cluster labels.
     *
     * @see STCClusteringAlgorithm#merge
     */
    private static final class PhraseCandidate
    {
        final ClusterCandidate cluster;
        final float coverage;

        /** If <code>false</code> the phrase should not be selected (various criteria). */
        boolean selected = true;

        /** @see STCClusteringAlgorithm#markSubSuperPhrases(ArrayList) */
        boolean mostGeneral = true;

        /** @see STCClusteringAlgorithm#markSubSuperPhrases(ArrayList) */
        boolean mostSpecific = true;

        PhraseCandidate(ClusterCandidate c, float coverage)
        {
            this.cluster = c;
            this.coverage = coverage;
        }
    }

    /**
     * A predicate matching {@link PhraseCandidate}s that have
     * {@link PhraseCandidate#selected} set to <code>false</code>; used with
     * {@link Collections2#filter} to remove such candidates.
     */
    private final static Predicate<PhraseCandidate> notSelected = new Predicate<PhraseCandidate>()
    {
        public boolean apply(PhraseCandidate p)
        {
            return !p.selected;
        }
    };

    /**
     * Performs STC clustering of {@link #documents}.
     */
    @Override
    public void process() throws ProcessingException
    {
        // There is a tiny trick here to support multilingual clustering without
        // refactoring the whole component: we remember the original list of documents
        // and invoke clustering for each language separately within the
        // IMonolingualClusteringAlgorithm implementation below. This is safe because
        // processing components are not thread-safe by definition and
        // IMonolingualClusteringAlgorithm forbids concurrent execution by contract.
        final List<Document> originalDocuments = documents;
        clusters = multilingualClustering.process(documents,
            new IMonolingualClusteringAlgorithm()
            {
                public List<Cluster> process(List<Document> documents, LanguageCode language)
                {
                    STCClusteringAlgorithm.this.documents = documents;
                    STCClusteringAlgorithm.this.cluster(language);
                    return STCClusteringAlgorithm.this.clusters;
                }
            });
        documents = originalDocuments;
    }

    /**
     * Performs the actual clustering with an assumption that all documents are written
     * in one <code>language</code>.
     */
    private void cluster(LanguageCode language)
    {
        clusters = new ArrayList<Cluster>();

        /*
         * Step 1. Preprocessing: tokenization, stop word marking and stemming
         * (if available).
         */
        context = preprocessingPipeline.preprocess(documents, query, language);

        /*
         * Step 2: Create a generalized suffix tree from phrases in the input.
         */
        sb = new GeneralizedSuffixTree.SequenceBuilder();

        final int [] tokenIndex = context.allTokens.wordIndex;
        final short [] tokenType = context.allTokens.type;
        for (int i = 0; i < tokenIndex.length; i++)
        {
            /* Advance until the first real token. */
            if (tokenIndex[i] == -1)
            {
                if ((tokenType[i] & (ITokenizer.TF_SEPARATOR_DOCUMENT | ITokenizer.TF_TERMINATOR)) != 0)
                {
                    sb.endDocument();
                }
                continue;
            }

            /* We have the first token. Advance until non-token. */
            final int s = i;

            while (tokenIndex[i + 1] != -1) i++;

            final int phraseLength = 1 + i - s;
            if (phraseLength >= 1)
            {
                /* We have a phrase. */
                sb.addPhrase(tokenIndex, s, phraseLength);
            }
        }

        sb.buildSuffixTree();
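        /*
         * Illustration (hypothetical two-document input): for "red apple. green
         * apple" and "red fruit", the loop above feeds the builder roughly as
         *
         *   addPhrase([red, apple]); addPhrase([green, apple]); endDocument();
         *   addPhrase([red, fruit]); endDocument();
         *
         * so each sentence becomes one phrase of the generalized suffix tree.
         */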
        /*
         * Step 3: Find "base" clusters by looking up frequently recurring phrases in
         * the generalized suffix tree.
         */
        List<ClusterCandidate> baseClusters = createBaseClusters(sb);

        /*
         * Step 4: Merge base clusters that overlap too much to form final clusters.
         */
        List<ClusterCandidate> mergedClusters = createMergedClusters(baseClusters);

        /*
         * Step 5: Create the junk (unassigned documents) cluster and create the final
         * set of clusters in Carrot2 format.
         */
        postProcessing(mergedClusters);
    }

    /**
     * Memory cleanups.
     */
    @Override
    public void afterProcessing()
    {
        super.afterProcessing();
        this.context = null;
        this.sb = null;
    }

    /**
     * Create <i>base clusters</i>. Base clusters are frequently occurring words and
     * phrases. We extract them by walking the generalized suffix tree constructed for
     * each phrase, and extracting paths from those internal tree states that occurred
     * in more than one document.
     */
    private List<ClusterCandidate> createBaseClusters(SequenceBuilder sb)
    {
        /*
         * Collect all phrases that will form base clusters,
         * initially filtered to fulfill the minimum acceptance criteria.
         */
        final List<ClusterCandidate> candidates = Lists.newArrayList();

        // Walk the internal nodes of the suffix tree.
        new GeneralizedSuffixTree.Visitor(sb, minBaseClusterSize)
        {
            protected void visit(int state, int cardinality, BitSet documents, IntStack path)
            {
                // Check minimum base cluster cardinality.
                assert cardinality >= minBaseClusterSize;

                /*
                 * Consider certain special cases of internal suffix tree nodes.
                 */
                if (!checkAcceptablePhrase(path))
                {
                    return;
                }

                // Calculate "effective phrase length", which is the number of
                // non-stopwords.
                final int effectivePhraseLen = effectivePhraseLength(path);
                if (effectivePhraseLen == 0)
                {
                    return;
                }

                /*
                 * Calculate base cluster's score as a function of effective phrase's
                 * length. STC originally used a linear gradient, we modified it to
                 * penalize very long phrases (which usually correspond to duplicated
                 * snippets anyway).
                 */
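                /*
                 * For illustration, with the defaults optimalPhraseLength = 3 and
                 * optimalPhraseLengthDev = 2, the Gaussian multiplier
                 * exp(-(len - 3)^2 / 8) is 1.0 for a 3-word phrase, about 0.88 for
                 * 2 or 4 words and about 0.14 for 7 words; single words receive the
                 * fixed singleTermBoost (0.5 by default) instead. See
                 * baseClusterScore() below.
                 */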
                final float score = baseClusterScore(effectivePhraseLen, cardinality);
                candidates.add(new ClusterCandidate(path.toArray(),
                    (BitSet) documents.clone(), cardinality, score));
            }
        }.visit();

        /*
         * Combine all phrases that are stem-equivalent into one candidate.
         */
        if (mergeStemEquivalentBaseClusters)
        {
            mergeStemEquivalentBaseClusters(sb, candidates);
        }

        /*
         * Remove any base clusters that fall below the minimum score.
         */
        int j = 0;
        for (int max = candidates.size(), i = 0; i < max; i++)
        {
            ClusterCandidate cc = candidates.get(i);
            if (cc.score >= minBaseClusterScore)
            {
                candidates.set(j++, cc);
            }
        }
        candidates.subList(j, candidates.size()).clear();

        /*
         * We limit the number of base clusters to the number requested by the user.
         * First we sort by the base cluster score, then pick the top-K entries,
         * filtering out any stop labels on the way.
         */
        Collections.sort(candidates, new Comparator<ClusterCandidate>()
        {
            @Override
            public int compare(ClusterCandidate c1, ClusterCandidate c2)
            {
                return -Float.compare(c1.score, c2.score);
            }
        });

        j = 0;
        ILexicalData lexicalData = context.language.getLexicalData();
        for (int max = candidates.size(), i = 0; i < max && j < maxBaseClusters; i++)
        {
            ClusterCandidate cc = candidates.get(i);
            // Build the candidate cluster's label for filtering. This may be costly so
            // we only do this for base clusters which are promoted to the merging phase.
            assert cc.phrases.size() == 1;
            if (!lexicalData.isStopLabel(buildLabel(cc.phrases.get(0))))
            {
                candidates.set(j++, cc);
            }
        }
        if (j < candidates.size())
        {
            candidates.subList(j, candidates.size()).clear();
            assert candidates.size() == j;
        }

        return candidates;
    }

    /* */
    private void mergeStemEquivalentBaseClusters(SequenceBuilder sb,
        final List<ClusterCandidate> candidates)
    {
        // Look for candidates to merge.
        Map<IntArrayList, ClusterCandidate> merged = Maps.newHashMap();
        int j = 0;
        for (int max = candidates.size(), i = 0; i < max; i++)
        {
            ClusterCandidate cc = candidates.get(i);
            candidates.set(j, cc);

            // Convert word indices to stem indices.
            assert cc.phrases.size() == 1;
            int [] stemIndices = context.allWords.stemIndex;
            int [] phraseWords = cc.phrases.get(0);
            IntArrayList stemList = new IntArrayList(phraseWords.length);
            for (int seqIndex : phraseWords)
            {
                int termIndex = sb.input.get(seqIndex);
                stemList.add(stemIndices[termIndex]);
            }

            // Check if we already have a stem-equivalent phrase like this one.
            ClusterCandidate equivalent = merged.get(stemList);
            if (equivalent == null)
            {
                merged.put(stemList, cc);
                j++;
            }
            else
            {
                // Merge the two candidates. The surface form with the highest
                // cardinality is taken as the representation of an equivalence group.
                if (equivalent.cardinality < cc.cardinality)
                {
                    equivalent.cardinality = cc.cardinality;
                    equivalent.phrases.add(0, cc.phrases.get(0));
                }
                else
                {
                    equivalent.phrases.add(cc.phrases.get(0));
                }
                // Collect actual documents to recompute cardinality later on.
                equivalent.documents.or(cc.documents);
            }
        }

        // Trim to only include shifted merged candidates.
        candidates.subList(j, candidates.size()).clear();

        // Recalculate score after merging.
        IntStack scratch = new IntStack();
        for (ClusterCandidate cc : candidates)
        {
            if (cc.phrases.size() > 1)
            {
                cc.cardinality = (int) cc.documents.cardinality();
                scratch.buffer = cc.phrases.get(0);
                scratch.elementsCount = scratch.buffer.length;
                cc.score = baseClusterScore(effectivePhraseLength(scratch), cc.cardinality);

                // Clear any other phrase variants.
                cc.phrases.subList(1, cc.phrases.size()).clear();
            }
        }
    }
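    /*
     * Illustration of the merge criterion used below (hypothetical numbers): with
     * the default mergeThreshold = 0.6, base clusters covering document sets
     * A = {0..9} (|A| = 10) and B = {3..10} (|B| = 8) share |A ∩ B| = 7 documents;
     * 7/10 = 0.7 and 7/8 = 0.875 both exceed the threshold, so A and B receive an
     * edge in the similarity graph and end up merged into one cluster.
     */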
    /**
     * Create final clusters by merging base clusters and pruning their labels. Cluster
     * merging is a greedy process of compacting clusters with document sets that
     * overlap by a certain ratio. In other words, phrases that "cover" nearly identical
     * document sets will be conflated.
     */
    private ArrayList<ClusterCandidate> createMergedClusters(List<ClusterCandidate> baseClusters)
    {
        /*
         * Calculate overlap between base clusters first, saving adjacency lists for
         * each base cluster.
         */

        // [i] - next neighbor or END, [i + 1] - neighbor cluster index.
        final int END = -1;
        final IntStack neighborList = new IntStack();
        neighborList.push(END);

        final int [] neighbors = new int [baseClusters.size()];
        final float m = (float) mergeThreshold;
        for (int i = 0; i < baseClusters.size(); i++)
        {
            for (int j = i + 1; j < baseClusters.size(); j++)
            {
                final ClusterCandidate c1 = baseClusters.get(i);
                final ClusterCandidate c2 = baseClusters.get(j);

                final float a = c1.cardinality;
                final float b = c2.cardinality;
                final float c = BitSet.intersectionCount(c1.documents, c2.documents);

                if (c / a > m && c / b > m)
                {
                    neighborList.push(neighbors[i], j);
                    neighbors[i] = neighborList.size() - 2;
                    neighborList.push(neighbors[j], i);
                    neighbors[j] = neighborList.size() - 2;
                }
            }
        }

        /*
         * Find connected components in the similarity graph using depth-first search
         * (flattened to use an explicit stack instead of recursion).
         */
        final int NO_INDEX = -1;
        final int [] merged = new int [baseClusters.size()];
        Arrays.fill(merged, NO_INDEX);

        final ArrayList<ClusterCandidate> mergedClusters =
            Lists.newArrayListWithCapacity(baseClusters.size());
        final IntStack stack = new IntStack(baseClusters.size());
        final IntStack mergeList = new IntStack(baseClusters.size());
        int mergedIndex = 0;
        for (int v = 0; v < baseClusters.size(); v++)
        {
            if (merged[v] != NO_INDEX) continue;

            // Mark all base clusters reachable from an unmerged cluster.
            stack.push(v);
            while (stack.size() > 0)
            {
                final int c = stack.pop();
                assert merged[c] == NO_INDEX || merged[c] == mergedIndex;
                if (merged[c] == mergedIndex) continue;

                merged[c] = mergedIndex;
                mergeList.push(c);

                for (int i = neighbors[c]; neighborList.get(i) != END;)
                {
                    final int neighbor = neighborList.get(i + 1);
                    if (merged[neighbor] == NO_INDEX)
                    {
                        stack.push(neighbor);
                    }
                    else
                    {
                        assert merged[neighbor] == mergedIndex;
                    }
                    i = neighborList.get(i);
                }
            }
            mergedIndex++;

            /*
             * Aggregate documents from each base cluster of the current merge, compute
             * the score and labels.
             */
            mergedClusters.add(merge(mergeList, baseClusters));
            mergeList.clear();
        }

        /*
         * Sort merged clusters by score, then by cardinality.
         */
        Collections.sort(mergedClusters, new Comparator<ClusterCandidate>()
        {
            public int compare(ClusterCandidate c1, ClusterCandidate c2)
            {
                if (c1.score < c2.score) return 1;
                if (c1.score > c2.score) return -1;
                if (c1.cardinality < c2.cardinality) return 1;
                if (c1.cardinality > c2.cardinality) return -1;
                return 0;
            }
        });

        if (mergedClusters.size() > maxClusters)
        {
            mergedClusters.subList(maxClusters, mergedClusters.size()).clear();
        }

        return mergedClusters;
    }

    /**
     * Merge a list of base clusters into one.
     */
    private ClusterCandidate merge(IntStack mergeList, List<ClusterCandidate> baseClusters)
    {
        assert mergeList.size() > 0;
        final ClusterCandidate result = new ClusterCandidate();

        /*
         * Merge documents from all base clusters and update the score.
         */
        for (int i = 0; i < mergeList.size(); i++)
        {
            final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
            result.documents.or(cc.documents);
            result.score += cc.score;
        }
        result.cardinality = (int) result.documents.cardinality();

        /*
         * Combine cluster labels and try to find the best description for the cluster.
         */
        final ArrayList<PhraseCandidate> phrases =
            new ArrayList<PhraseCandidate>(mergeList.size());
        for (int i = 0; i < mergeList.size(); i++)
        {
            final ClusterCandidate cc = baseClusters.get(mergeList.get(i));
            final float coverage = cc.cardinality / (float) result.cardinality;
            phrases.add(new PhraseCandidate(cc, coverage));
        }

        markSubSuperPhrases(phrases);
        Collections2.filter(phrases, notSelected).clear();

        markOverlappingPhrases(phrases);
        Collections2.filter(phrases, notSelected).clear();

        Collections.sort(phrases, new Comparator<PhraseCandidate>()
        {
            public int compare(PhraseCandidate p1, PhraseCandidate p2)
            {
                if (p1.coverage < p2.coverage) return 1;
                if (p1.coverage > p2.coverage) return -1;
                return 0;
            }
        });

        int max = maxPhrases;
        for (PhraseCandidate p : phrases)
        {
            if (max-- <= 0) break;
            result.phrases.add(p.cluster.phrases.get(0));
        }

        return result;
    }
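    /*
     * Illustration for the pruning passes below (hypothetical labels): given
     * candidate phrases "apple", "red apple" and "fresh red apple", "apple" is most
     * general (no other candidate is a substring of it) and "fresh red apple" is
     * most specific (no other candidate is a superstring of it); "red apple" is
     * neither and gets deselected by markSubSuperPhrases().
     */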
    /**
     * Leave only the most general (no other phrase is a substring of this one) and
     * most specific (no other phrase is a superstring of this one) phrases.
     */
    private void markSubSuperPhrases(ArrayList<PhraseCandidate> phrases)
    {
        final int max = phrases.size();

        // A list of all words for each candidate phrase.
        final IntStack words = new IntStack(maxDescPhraseLength * phrases.size());

        // Offset pairs in the words list -- a pair [start, length].
        final IntStack offsets = new IntStack(phrases.size() * 2);

        for (PhraseCandidate p : phrases)
        {
            appendWords(words, offsets, p);
        }

        /*
         * Mark phrases that cannot be most specific or most general.
         */
        for (int i = 0; i < max; i++)
        {
            for (int j = 0; j < max; j++)
            {
                if (i == j) continue;

                int index = indexOf(
                    words.buffer, offsets.get(2 * i), offsets.get(2 * i + 1),
                    words.buffer, offsets.get(2 * j), offsets.get(2 * j + 1));
                if (index >= 0)
                {
                    // j is a subphrase of i, hence i cannot be mostGeneral and j
                    // cannot be mostSpecific.
                    phrases.get(i).mostGeneral = false;
                    phrases.get(j).mostSpecific = false;
                }
            }
        }

        /*
         * For most general phrases, do not display them if a more specific phrase
         * exists with pretty much the same coverage.
         */
        for (int i = 0; i < max; i++)
        {
            final PhraseCandidate a = phrases.get(i);
            if (!a.mostGeneral) continue;

            for (int j = 0; j < max; j++)
            {
                final PhraseCandidate b = phrases.get(j);
                if (i == j || !b.mostSpecific) continue;

                int index = indexOf(
                    words.buffer, offsets.get(2 * j), offsets.get(2 * j + 1),
                    words.buffer, offsets.get(2 * i), offsets.get(2 * i + 1));
                if (index >= 0)
                {
                    if (a.coverage - b.coverage < mostGeneralPhraseCoverage)
                    {
                        a.selected = false;
                        j = max; // Break out of the inner loop.
                    }
                }
            }
        }

        /*
         * Mark phrases that should be removed from the candidate set.
         */
        for (PhraseCandidate p : phrases)
        {
            if (!p.mostGeneral && !p.mostSpecific)
            {
                p.selected = false;
            }
        }
    }

    /**
     * Mark those phrases that overlap with other phrases by more than
     * {@link #maxPhraseOverlap} and have lower coverage.
     */
    private void markOverlappingPhrases(ArrayList<PhraseCandidate> phrases)
    {
        final int max = phrases.size();

        // A list of all unique words for each candidate phrase.
        final IntStack words = new IntStack(maxDescPhraseLength * phrases.size());

        // Offset pairs in the words list -- a pair [start, length].
        final IntStack offsets = new IntStack(phrases.size() * 2);

        for (PhraseCandidate p : phrases)
        {
            appendUniqueWords(words, offsets, p);
        }

        for (int i = 0; i < max; i++)
        {
            for (int j = i + 1; j < max; j++)
            {
                final PhraseCandidate a = phrases.get(i);
                final PhraseCandidate b = phrases.get(j);

                final int a_words = offsets.get(2 * i + 1);
                final int b_words = offsets.get(2 * j + 1);

                final float intersection = computeIntersection(
                    words.buffer, offsets.get(2 * i), a_words,
                    words.buffer, offsets.get(2 * j), b_words);

                if ((intersection / b_words) > maxPhraseOverlap
                    && b.coverage < a.coverage)
                {
                    b.selected = false;
                }

                if ((intersection / a_words) > maxPhraseOverlap
                    && a.coverage < b.coverage)
                {
                    a.selected = false;
                }
            }
        }
    }
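    /*
     * Illustration (hypothetical phrases): "apple juice recipe" and "fresh apple
     * juice" share two of their three unique words each, and 2/3 > 0.6 (the default
     * maxPhraseOverlap), so whichever of the two has lower coverage is deselected
     * by the method above.
     */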
    /**
     * Compute the number of common elements in two (sorted) lists.
     */
    static int computeIntersection(int [] a, int aPos, int aLength,
                                   int [] b, int bPos, int bLength)
    {
        final int maxa = aPos + aLength;
        final int maxb = bPos + bLength;
        int ea;
        int eb;
        int common = 0;
        while (aPos < maxa && bPos < maxb)
        {
            ea = a[aPos];
            eb = b[bPos];
            if (ea >= eb) bPos++;
            if (ea <= eb) aPos++;
            if (ea == eb) common++;
        }
        return common;
    }
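    /*
     * Example (both input ranges must be sorted):
     *
     *   computeIntersection(new int [] {1, 3, 5, 7}, 0, 4,
     *                       new int [] {3, 4, 5}, 0, 3) == 2
     *
     * since the values 3 and 5 occur in both ranges.
     */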
    /**
     * Collect all unique non-stop words from a phrase.
     */
    private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p)
    {
        assert p.cluster.phrases.size() == 1;

        final int start = words.size();
        final int [] phraseIndices = p.cluster.phrases.get(0);
        final short [] tokenTypes = context.allWords.type;
        for (int i = 0; i < phraseIndices.length; i += 2)
        {
            for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++)
            {
                final int termIndex = sb.input.get(j);
                if (!TokenTypeUtils.isCommon(tokenTypes[termIndex]))
                {
                    words.push(termIndex);
                }
            }
        }

        // Sort words, we don't care about their order when counting subsets.
        Arrays.sort(words.buffer, start, words.size());

        // Reorder to keep only unique words.
        int j = start;
        for (int i = start + 1; i < words.size(); i++)
        {
            if (words.buffer[j] != words.buffer[i])
            {
                words.buffer[++j] = words.buffer[i];
            }
        }
        words.elementsCount = j + 1;

        offsets.push(start, words.size() - start);
    }

    /**
     * Collect all words from a phrase.
     */
    private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p)
    {
        final int start = words.size();

        final int [] phraseIndices = p.cluster.phrases.get(0);
        final short [] tokenTypes = context.allWords.type;
        for (int i = 0; i < phraseIndices.length; i += 2)
        {
            for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++)
            {
                final int termIndex = sb.input.get(j);
                if (!TokenTypeUtils.isCommon(tokenTypes[termIndex]))
                {
                    words.push(termIndex);
                }
            }
        }

        offsets.push(start, words.size() - start);
    }

    /**
     * Create the junk (unassigned documents) cluster and create the final
     * set of clusters in Carrot2 format.
     */
    private void postProcessing(List<ClusterCandidate> clusters)
    {
        // Adapt to Carrot2 classes, counting used documents on the way.
        final BitSet all = new BitSet(documents.size());
        final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size());
        final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3);
        for (ClusterCandidate c : clusters)
        {
            final Cluster c2 = new Cluster();
            c2.addPhrases(collectPhrases(phrases, c));
            c2.addDocuments(collectDocuments(docs, c.documents));
            c2.setScore((double) c.score);
            this.clusters.add(c2);

            all.or(c.documents);
            docs.clear();
            phrases.clear();
        }

        Collections.sort(this.clusters,
            Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));

        Cluster.appendOtherTopics(this.documents, this.clusters);
    }

    /**
     * Collect phrases from a cluster.
     */
    private List<String> collectPhrases(List<String> l, ClusterCandidate c)
    {
        assert l != null;
        for (int [] phraseIndexes : c.phrases)
        {
            l.add(buildLabel(phraseIndexes));
        }
        return l;
    }

    /**
     * Collect documents from a bitset.
     */
    private List<Document> collectDocuments(List<Document> l, BitSet bitset)
    {
        if (l == null)
        {
            l = Lists.newArrayListWithCapacity((int) bitset.cardinality());
        }

        final BitSetIterator i = bitset.iterator();
        for (int d = i.nextSetBit(); d >= 0; d = i.nextSetBit())
        {
            l.add(documents.get(d));
        }
        return l;
    }

    /**
     * Build the cluster's label from suffix tree edge indices.
     */
    private String buildLabel(int [] phraseIndices)
    {
        // Count the number of terms first.
        int termsCount = 0;
        for (int j = 0; j < phraseIndices.length; j += 2)
        {
            termsCount += phraseIndices[j + 1] - phraseIndices[j] + 1;
        }

        // Extract terms info for the phrase and construct the label.
        final boolean [] stopwords = new boolean [termsCount];
        final char [][] images = new char [termsCount][];
        final short [] tokenTypes = context.allWords.type;

        int k = 0;
        for (int i = 0; i < phraseIndices.length; i += 2)
        {
            for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++)
            {
                final int termIndex = sb.input.get(j);
                images[k] = context.allWords.image[termIndex];
                stopwords[k] = TokenTypeUtils.isCommon(tokenTypes[termIndex]);
            }
        }

        return LabelFormatter.format(images, stopwords,
            context.language.getLanguageCode().usesSpaceDelimiters());
    }

    @SuppressWarnings("unused")
    private String toString(PhraseCandidate c)
    {
        return String.format(Locale.ENGLISH, "%3.2f %s %s %s %s",
            c.coverage,
            buildLabel(c.cluster.phrases.get(0)),
            c.selected ? "S" : "",
            c.mostGeneral ? "MG" : "",
            c.mostSpecific ? "MS" : "");
    }

    /**
     * Build a cluster's label from suffix tree edge indices, including some debugging
     * and diagnostic information.
     */
    @SuppressWarnings("unused")
    private String buildDebugLabel(int [] phraseIndices)
    {
        final StringBuilder b = new StringBuilder();

        String sep = "";
        int k = 0;
        final short [] tokenTypes = context.allWords.type;
        for (int i = 0; i < phraseIndices.length; i += 2)
        {
            for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++)
            {
                b.append(sep);

                final int termIndex = sb.input.get(j);
                b.append(context.allWords.image[termIndex]);
                if (TokenTypeUtils.isCommon(tokenTypes[termIndex])) b.append("[S]");
                sep = " ";
            }
            sep = "_";
        }

        return b.toString();
    }

    /**
     * Consider certain special cases of internal suffix tree nodes. The suffix tree may
     * contain internal nodes with paths starting or ending with a stop word (common
     * word). We have the following interesting scenarios:
     *
     * <dl>
     * <dt>IF LEADING STOPWORD: IGNORE THE NODE.</dt>
     * <dd>
     * There MUST be a phrase with this stopword chopped off in the suffix tree
     * (a suffix of this phrase) and its frequency will be just as high.</dd>
     *
     * <dt>IF TRAILING STOPWORDS:</dt>
     * <dd>
     * Check if the edge leading to the current node is composed entirely of stopwords.
     * If so, there must be a parent node that contains non-stopwords and we can ignore
     * the current node. Otherwise we can chop off the trailing stopwords from the
     * current node's phrase (this phrase cannot be duplicated anywhere in the tree
     * because if it were, there would have to be a branch somewhere in the suffix tree
     * on the edge).</dd>
     * </dl>
     */
    final boolean checkAcceptablePhrase(IntStack path)
    {
        assert path.size() > 0;

        final int [] terms = sb.input.buffer;
        final short [] tokenTypes = context.allWords.type;

        // Ignore nodes that start with a stop word.
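        // (For a hypothetical phrase "the quick fox", the suffix "quick fox" is
        // guaranteed to be in the tree with the same frequency, so the variant
        // with the leading stopword can be dropped safely.)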
        if (TokenTypeUtils.isCommon(tokenTypes[terms[path.get(0)]]))
        {
            return false;
        }

        // Check the last edge of the current node.
        int i = path.get(path.size() - 2);
        int j = path.get(path.size() - 1);
        final int k = j;
        while (i <= j && TokenTypeUtils.isCommon(tokenTypes[terms[j]]))
        {
            j--;
        }

        if (j < i)
        {
            // If the edge contains only stopwords, ignore the node.
            return false;
        }
        else if (j < k)
        {
            // There have been trailing stop words on the edge. Chop them off.
            path.buffer[path.size() - 1] = j;
        }

        // Check the total phrase length (in words, including stopwords).
        int termsCount = 0;
        for (j = 0; j < path.size(); j += 2)
        {
            termsCount += path.get(j + 1) - path.get(j) + 1;
        }

        if (termsCount > maxDescPhraseLength)
        {
            return false;
        }

        return true;
    }

    /**
     * Calculate the "effective phrase length", that is the number of non-ignored words
     * in the phrase.
     */
    final int effectivePhraseLength(IntStack path)
    {
        final int [] terms = sb.input.buffer;
        final int lower = ignoreWordIfInFewerDocs;
        final int upper = (int) (ignoreWordIfInHigherDocsPercent * documents.size());

        int effectivePhraseLen = 0;
        for (int i = 0; i < path.size(); i += 2)
        {
            for (int j = path.get(i); j <= path.get(i + 1); j++)
            {
                final int termIndex = terms[j];

                // If this term is a stop word, don't count it.
                if (TokenTypeUtils.isCommon(context.allWords.type[termIndex]))
                {
                    continue;
                }

                // If this word occurs in too few documents, or in more than a given
                // fraction of the input collection, don't count it either.
                final int docCount = context.allWords.tfByDocument[termIndex].length / 2;
                if (docCount < lower || docCount > upper)
                {
                    continue;
                }

                effectivePhraseLen++;
            }
        }

        return effectivePhraseLen;
    }

    /**
     * Calculates the base cluster score.
     * <p>
     * The boost is calculated as a Gaussian function of density around the "optimum"
     * expected phrase length (average) and "tolerance" towards shorter and longer
     * phrases (standard deviation). You can draw this score multiplier's characteristic
     * with gnuplot:
     * <pre>
     * reset
     *
     * set xrange [0:10]
     * set yrange [0:]
     * set samples 11
     * set boxwidth 1 absolute
     *
     * set xlabel "Phrase length"
     * set ylabel "Score multiplier"
     *
     * set border 3
     * set key noautotitles
     *
     * set grid
     *
     * set xtics border nomirror 1
     * set ytics border nomirror
     * set ticscale 1.0
     * show tics
     *
     * set size ratio .5
     *
     * # Base cluster boost function.
     * boost(x) = exp(-(x - optimal) * (x - optimal) / (2 * tolerance * tolerance))
     *
     * plot optimal=2, tolerance=2, boost(x) with histeps title "optimal=2, tolerance=2", \
     *      optimal=2, tolerance=4, boost(x) with histeps title "optimal=2, tolerance=4", \
     *      optimal=2, tolerance=6, boost(x) with histeps title "optimal=2, tolerance=6"
     *
     * pause -1
     * </pre>
     * One-word phrases can be given a fixed boost, if {@link #singleTermBoost} is
     * greater than zero.
     *
     * @param phraseLength Effective phrase length (number of non-stopwords).
     * @param documentCount Number of documents this phrase occurred in.
     * @return Returns the base cluster score calculated as a function of the number of
     *         documents the phrase occurred in and a function of the effective length
     *         of the phrase.
     */
    final float baseClusterScore(final int phraseLength, final int documentCount)
    {
        final double boost;
        if (phraseLength == 1 && singleTermBoost > 0)
        {
            boost = singleTermBoost;
        }
        else
        {
            final int tmp = phraseLength - optimalPhraseLength;
            boost = Math.exp((-tmp * tmp)
                / (2 * optimalPhraseLengthDev * optimalPhraseLengthDev));
        }

        return (float) (boost * (documentCount * documentCountBoost));
    }

    /**
     * Subsequence search in int arrays.
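     * <p>
     * Example (illustrative): <code>indexOf(new int [] {4, 1, 2, 3}, 0, 4,
     * new int [] {2, 3}, 0, 2)</code> returns 2, the offset of the match relative
     * to <code>sourceOffset</code>; -1 is returned when the target does not occur.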
     */
    private static int indexOf(int [] source, int sourceOffset, int sourceCount,
        int [] target, int targetOffset, int targetCount)
    {
        if (targetCount == 0)
        {
            return 0;
        }

        final int first = target[targetOffset];
        final int max = sourceOffset + (sourceCount - targetCount);

        for (int i = sourceOffset; i <= max; i++)
        {
            /* Look for the first element. */
            if (source[i] != first)
            {
                while (++i <= max && source[i] != first) /* do nothing */;
            }

            /* Found the first element, now look at the rest of the pattern. */
            if (i <= max)
            {
                int j = i + 1;
                int end = j + targetCount - 1;
                for (int k = targetOffset + 1; j < end && source[j] == target[k]; j++, k++)
                    /* do nothing */;

                if (j == end)
                {
                    /* Found the whole pattern. */
                    return i - sourceOffset;
                }
            }
        }
        return -1;
    }
}