/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.text.preprocessing; import java.util.Arrays; import org.carrot2.text.preprocessing.PreprocessingContext.*; import org.carrot2.text.preprocessing.filter.*; import org.carrot2.util.attribute.Bindable; import com.carrotsearch.hppc.IntArrayList; /** * Applies basic filtering to words and phrases to produce candidates for cluster labels. * Filtering is applied to {@link AllWords} and {@link AllPhrases}, the results are saved * to {@link AllLabels}. Currently, the following filters are applied: * <ol> * <li>{@link StopWordLabelFilter}</li> * <li>{@link CompleteLabelFilter}</li> * </ol> * This class saves the following results to the {@link PreprocessingContext}: * <ul> * <li>{@link AllLabels#featureIndex}</li> * </ul> * <p> * This class requires that {@link Tokenizer}, {@link CaseNormalizer}, * {@link StopListMarker} and {@link PhraseExtractor} be invoked first. */ @Bindable(prefix = "LabelFilterProcessor") public class LabelFilterProcessor { // For the time being we include filters as instance fields here. If there is a need // to add custom label filters as parameters, we'll need to come up with something. /** * Query word label filter for this processor. */ public QueryLabelFilter queryLabelFilter = new QueryLabelFilter(); /** * Stop word label filter for this processor. */ public StopWordLabelFilter stopWordLabelFilter = new StopWordLabelFilter(); /** * Numeric label filter for this processor. */ public NumericLabelFilter numericLabelFilter = new NumericLabelFilter(); /** * Truncated phrase filter for this processor. */ public CompleteLabelFilter completeLabelFilter = new CompleteLabelFilter(); /** * Min length label filter. */ public MinLengthLabelFilter minLengthLabelFilter = new MinLengthLabelFilter(); /** * Genitive length label filter. */ public GenitiveLabelFilter genitiveLabelFilter = new GenitiveLabelFilter(); /** * Stop label filter. */ public StopLabelFilter stopLabelFilter = new StopLabelFilter(); /** * Processes all filters declared as fields of this class. */ public void process(PreprocessingContext context) { final int wordCount = context.allWords.image.length; final boolean [] acceptedStems = new boolean [context.allStems.image.length]; final boolean [] acceptedPhrases = new boolean [context.allPhrases.tf.length]; Arrays.fill(acceptedStems, true); Arrays.fill(acceptedPhrases, true); minLengthLabelFilter.filter(context, acceptedStems, acceptedPhrases); genitiveLabelFilter.filter(context, acceptedStems, acceptedPhrases); queryLabelFilter.filter(context, acceptedStems, acceptedPhrases); stopWordLabelFilter.filter(context, acceptedStems, acceptedPhrases); numericLabelFilter.filter(context, acceptedStems, acceptedPhrases); stopLabelFilter.filter(context, acceptedStems, acceptedPhrases); completeLabelFilter.filter(context, acceptedStems, acceptedPhrases); final IntArrayList acceptedFeatures = new IntArrayList(acceptedStems.length + acceptedPhrases.length); final int [] mostFrequentOriginalWordIndex = context.allStems.mostFrequentOriginalWordIndex; for (int i = 0; i < acceptedStems.length; i++) { if (acceptedStems[i]) { acceptedFeatures.add(mostFrequentOriginalWordIndex[i]); } } for (int i = 0; i < acceptedPhrases.length; i++) { if (acceptedPhrases[i]) { acceptedFeatures.add(i + wordCount); } } context.allLabels.featureIndex = acceptedFeatures.toArray(); updateFirstPhraseIndex(context); } static void updateFirstPhraseIndex(PreprocessingContext context) { final int wordCount = context.allWords.image.length; final int [] labelsFeatureIndex = context.allLabels.featureIndex; // In theory we could do a binary search here, but the effort of writing // a customized version may not be worth the gain int firstPhraseIndex = -1; for (int i = 0; i < labelsFeatureIndex.length; i++) { if (labelsFeatureIndex[i] >= wordCount) { firstPhraseIndex = i; break; } } context.allLabels.firstPhraseIndex = firstPhraseIndex; } }