/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.text.preprocessing.filter; import java.util.ArrayList; import java.util.List; import org.carrot2.text.preprocessing.PreprocessingContext; import org.carrot2.shaded.guava.common.collect.Lists; /** * Base class for complete phrase filtering. */ abstract class CompleteLabelFilterBase { void filter(PreprocessingContext context, boolean [] acceptedStems, boolean [] acceptedPhrases, double labelOverrideThreshold) { if (acceptedStems.length + acceptedPhrases.length < 2) { return; } final int [] stemTf = context.allStems.tf; final int [] phraseTf = context.allPhrases.tf; final int [] mostFrequentOriginalWordIndex = context.allStems.mostFrequentOriginalWordIndex; final int [] wordsStemIndex = context.allWords.stemIndex; // Build labelIndex-wordIndices combos for each word and phrase. We'll use // them below to create an LCP array. final ArrayList<LabelIndexWithCodes> phraseIndexesWithCodes = Lists .newArrayListWithExpectedSize(acceptedStems.length + acceptedPhrases.length); for (int i = 0; i < acceptedStems.length + acceptedPhrases.length; i++) { phraseIndexesWithCodes.add(new LabelIndexWithCodes(i, getLabelWordIndexes( acceptedStems.length, mostFrequentOriginalWordIndex, context.allPhrases.wordIndices, i))); } // Sort and create LCP array final List<LabelIndexWithCodes> sortedPhrasesWithCodes = sortPhraseCodes(phraseIndexesWithCodes); int [] lcpArray = createLcp(sortedPhrasesWithCodes); // Remove superseded phrases int i = 0; while (i < sortedPhrasesWithCodes.size() - 1) { final LabelIndexWithCodes currentLabelWithCodes = sortedPhrasesWithCodes .get(i); final int currentLabelIndex = currentLabelWithCodes.getLabelIndex(); // Check only those phrases that are not removed and that are // themselves subphrases of some longer phrases if (getLabelLength(acceptedStems.length, context.allPhrases.wordIndices, currentLabelIndex) == lcpArray[i] && isLabelAccepted(acceptedStems.length, wordsStemIndex, currentLabelIndex, acceptedStems, acceptedPhrases)) { int j = i; while (j < sortedPhrasesWithCodes.size() - 1 && lcpArray[j] >= lcpArray[i]) { final LabelIndexWithCodes nextPhraseWithCodes = sortedPhrasesWithCodes .get(j + 1); final int nextLabelIndex = nextPhraseWithCodes.getLabelIndex(); double labelOverride = calculateLabelOverride(acceptedStems.length, stemTf, phraseTf, nextLabelIndex, currentLabelIndex); if ((isLabelAccepted(acceptedStems.length, wordsStemIndex, nextLabelIndex, acceptedStems, acceptedPhrases) && labelOverride >= labelOverrideThreshold)) { markLabelAsRemoved(acceptedStems.length, currentLabelIndex, acceptedStems, acceptedPhrases); break; } j++; } } i++; } } abstract List<LabelIndexWithCodes> sortPhraseCodes( List<LabelIndexWithCodes> phrasesWithCodes); abstract int [] createLcp(List<LabelIndexWithCodes> sortedPhrasesWithCodes); static class LabelIndexWithCodes { final private int labelIndex; final private int [] codes; public LabelIndexWithCodes(int labelIndex, int [] codes) { this.labelIndex = labelIndex; this.codes = codes; } public int [] getCodes() { return codes; } public int getLabelIndex() { return labelIndex; } } private final static int [] getLabelWordIndexes(int wordCount, int [] mostFrequentWordIndex, int [][] wordIndices, int featureIndex) { if (featureIndex < wordCount) { return new int [] { mostFrequentWordIndex[featureIndex] }; } else { return wordIndices[featureIndex - wordCount]; } } private final static int getLabelLength(int wordCount, int [][] wordIndices, int featureIndex) { return featureIndex < wordCount ? 1 : wordIndices[featureIndex - wordCount].length; } private final static boolean isLabelAccepted(int wordCount, int [] wordStemIndex, int featureIndex, boolean [] acceptedStems, boolean [] acceptedPhrases) { if (featureIndex < wordCount) { return acceptedStems[featureIndex]; } else { return acceptedPhrases[featureIndex - wordCount]; } } private final static void markLabelAsRemoved(int wordCount, int featureIndex, boolean [] acceptedWords, boolean [] acceptedPhrases) { if (featureIndex < wordCount) { acceptedWords[featureIndex] = false; } else { acceptedPhrases[featureIndex - wordCount] = false; } } private final static double calculateLabelOverride(int wordCount, int [] wordTf, int [] phraseTf, int overridingLabelIndex, int overridenLabelIndex) { final int overridingTf = overridingLabelIndex < wordCount ? wordTf[overridingLabelIndex] : phraseTf[overridingLabelIndex - wordCount]; final int overridenTf = overridenLabelIndex < wordCount ? wordTf[overridenLabelIndex] : phraseTf[overridenLabelIndex - wordCount]; return ((double) overridingTf) / overridenTf; } }