/* * Carrot2 project. * * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński. * All rights reserved. * * Refer to the full license file "carrot2.LICENSE" * in the root folder of the repository checkout or at: * http://www.carrot2.org/carrot2.LICENSE */ package org.carrot2.text.preprocessing; import com.carrotsearch.hppc.sorting.IndirectComparator; import com.carrotsearch.hppc.sorting.IndirectSort; /** * A simple suffix sorting utility based on the generic sorting routines from {@link IndirectSort}. */ final class SuffixSorter { /** * An int comparator that enables suffix sorting. */ private static class SuffixComparator implements IndirectComparator { private int [] suffixData; public SuffixComparator(int [] suffixData) { this.suffixData = suffixData; } public int compare(int suffixA, int suffixB) { if (suffixA == suffixB) { return 0; } /* * Suffix data ends with a unique negative value, so we don't need to do extra * range checks and we still won't run into array index out of bounds * exceptions. */ while (suffixData[suffixA] == suffixData[suffixB]) { suffixA++; suffixB++; } return suffixData[suffixB] - suffixData[suffixA]; } } /** * Performs suffix sorting and saves the results to the <code>context</code>. */ void suffixSort(PreprocessingContext context) { /* * Create a temporary array based on word indices with -1 values replaced with * unique negative values. This will ensure that the phrases discovered based on * the sorted/lcp array will not cross sentence/field boundaries. At some point we * may want to make it an option. In this case, we'll need to review Substring and * SubstringComparator for possible array index out of bounds. */ final int [] intCodes = new int [context.allTokens.wordIndex.length]; System.arraycopy(context.allTokens.wordIndex, 0, intCodes, 0, intCodes.length); int currentSeparatorCode = -1; for (int i = 0; i < intCodes.length; i++) { if (intCodes[i] < 0) { intCodes[i] = currentSeparatorCode--; } } // Create suffix order int [] suffixOrder = IndirectSort.mergesort(0, intCodes.length, new SuffixComparator(intCodes)); context.allTokens.suffixOrder = suffixOrder; // Add LCPs context.allTokens.lcp = calculateLcp(intCodes, suffixOrder); } /** * Calculates the Longest Common Prefix values for each token. */ private int [] calculateLcp(int [] intCodes, int [] suffixOrder) { // LCP array int [] lcpArray = new int [intCodes.length]; lcpArray[0] = 0; for (int i = 1; i < lcpArray.length - 1; i++) { int lcp = 0; while (intCodes[suffixOrder[i - 1] + lcp] == intCodes[suffixOrder[i] + lcp]) { lcp++; } lcpArray[i] = lcp; } return lcpArray; } }