/*
* Copyright 2000-2013 Enonic AS
* http://www.enonic.com/license
*/
package com.enonic.cms.core.content.index;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import com.enonic.cms.framework.xml.IllegalCharacterCleaner;
/**
 * Holds a piece of text in a normalized form and offers two views of it: the distinct lower-cased words it contains, and the text
 * split into chunks of a bounded length.
 * <p>
 * On construction the text is passed through {@link IllegalCharacterCleaner#cleanXml(String)}, every character listed in
 * {@link #SPECIAL_XML_CHARS_TO_REPLACE} is replaced by a single space, and the result is trimmed.
 * <p>
 * The word set is computed lazily on the first call to {@link #getWords()} without any synchronization.
 */
public class BigText
{
    /**
     * Characters that are replaced by {@link #CHARS_TO_REPLACE_WITH}. The string is embedded in a regular expression character class,
     * so any character added here must be valid (or escaped) inside <code>[...]</code>.
     */
    private static final String SPECIAL_XML_CHARS_TO_REPLACE = ".,=<>*@/\n";

    private static final String CHARS_TO_REPLACE_WITH = " ";

    private final IllegalCharacterCleaner xmlCleaner = new IllegalCharacterCleaner();

    private final String text;

    /**
     * Lazily initialized by {@link #initWords()}; <code>null</code> until the words are first requested.
     */
    private Set<String> words;

    /**
     * Creates a new instance from the given raw text.
     *
     * @param text The raw text; must not be <code>null</code>.
     * @throws IllegalArgumentException if <code>text</code> is <code>null</code>.
     */
    public BigText( String text )
    {
        if ( text == null )
        {
            throw new IllegalArgumentException( "Given text cannot be null" );
        }

        String normalized = xmlCleaner.cleanXml( text );
        normalized = normalized.replaceAll( "[" + SPECIAL_XML_CHARS_TO_REPLACE + "]", CHARS_TO_REPLACE_WITH );
        this.text = normalized.trim();
    }

    /**
     * @return The normalized text (cleaned, special characters replaced by spaces, trimmed).
     */
    public String getText()
    {
        return text;
    }

    /**
     * Breaks the normalized text into words using a {@link BreakIterator} word instance for the default locale and stores the distinct,
     * lower-cased, non-empty tokens in encounter order. Tokens consisting solely of ".", ":" or ")" are skipped.
     */
    private void initWords()
    {
        Set<String> collected = new LinkedHashSet<String>();

        BreakIterator wb = BreakIterator.getWordInstance();
        wb.setText( text );

        int start = wb.first();
        for ( int end = wb.next(); end != BreakIterator.DONE; start = end, end = wb.next() )
        {
            // The iterator also reports the stretches between words (whitespace, punctuation); trimming reduces
            // whitespace-only tokens to the empty string so they are filtered out below.
            String word = text.substring( start, end ).toLowerCase().trim();
            if ( word.length() > 0 && !".".equals( word ) && !":".equals( word ) && !")".equals( word ) )
            {
                collected.add( word );
            }
        }

        words = collected;
    }

    /**
     * Returns the distinct lower-cased words of the text, in the order they first occur. The words are computed on the first call and
     * the same collection instance is returned on subsequent calls.
     *
     * @return The words of the text; never <code>null</code>.
     */
    public Collection<String> getWords()
    {
        if ( words == null )
        {
            initWords();
        }
        return words;
    }

    /**
     * Splits the text into chunks that are each at most <code>splitTreshold</code> characters long. A split is preferably made at the
     * last space within the limit, otherwise at the last line separator within the limit, and only if neither exists in the middle of a
     * word. The space or line separator at which a split is made is not part of any chunk; when the text has to be cut in the middle of
     * a word, no character is dropped. Every chunk is trimmed.
     *
     * @param splitTreshold The maximum length of each chunk; must be at least 1.
     * @param lineSeparator The string that separates lines in the text. May be <code>null</code> or empty, in which case only spaces
     *                      are considered as split points.
     * @return The chunks in order. The list always contains at least one element, which is the empty string if the text is empty.
     * @throws IllegalArgumentException if <code>splitTreshold</code> is less than 1.
     */
    public List<String> getTextSplitted( int splitTreshold, String lineSeparator )
    {
        if ( splitTreshold < 1 )
        {
            throw new IllegalArgumentException( "Split threshold must be at least 1, was: " + splitTreshold );
        }

        String value = getText();
        List<String> values = new ArrayList<String>();
        while ( value.length() > splitTreshold )
        {
            int index = findSplitIndex( value, splitTreshold, lineSeparator );
            values.add( value.substring( 0, index ).trim() );

            // Skip the delimiter the split was made at, but nothing else: when the split is forced in the middle of a
            // word, the character at the split position belongs to the next chunk.
            int skip = delimiterLengthAt( value, index, lineSeparator );
            value = value.substring( index + skip ).trim();
        }
        values.add( value );
        return values;
    }

    /**
     * Tries to make a smart split by looking for the last space, or failing that the last line separator, within the first
     * <code>splitTreshold</code> characters of the text. If neither is found, the text is split at the threshold itself, moved back by
     * one position if that would otherwise separate the two halves of a surrogate pair.
     *
     * @param value         The string to figure out where to split; must be longer than <code>splitTreshold</code>.
     * @param splitTreshold The maximum length of the text that may be split off; must be at least 1.
     * @param lineSeparator The string used to separate lines; may be <code>null</code> or empty.
     * @return A number between 0 and the split threshold (inclusive), which is the best place to split the given string.
     */
    private int findSplitIndex( String value, int splitTreshold, String lineSeparator )
    {
        // Only delimiters inside this window produce a chunk that respects the threshold.
        String valueMax = value.substring( 0, splitTreshold );

        int index = valueMax.lastIndexOf( ' ' );
        if ( index < 0 && lineSeparator != null && lineSeparator.length() > 0 )
        {
            index = valueMax.lastIndexOf( lineSeparator );
        }

        if ( index < 0 )
        {
            index = splitTreshold;
            // value.charAt( index ) is safe because the caller guarantees value.length() > splitTreshold.
            if ( index > 1 && Character.isHighSurrogate( value.charAt( index - 1 ) ) && Character.isLowSurrogate( value.charAt( index ) ) )
            {
                index--;
            }
        }
        return index;
    }

    /**
     * Determines how many characters of delimiter start at the given position.
     *
     * @param value         The string being split.
     * @param index         The position to inspect; must be less than the length of <code>value</code>.
     * @param lineSeparator The string used to separate lines; may be <code>null</code> or empty.
     * @return 1 if there is a space at the position, the length of the line separator if the line separator starts at the position,
     *         otherwise 0.
     */
    private int delimiterLengthAt( String value, int index, String lineSeparator )
    {
        if ( value.charAt( index ) == ' ' )
        {
            return 1;
        }
        if ( lineSeparator != null && lineSeparator.length() > 0 && value.startsWith( lineSeparator, index ) )
        {
            return lineSeparator.length();
        }
        return 0;
    }
}