/*
* Hibernate Search, full-text search for your domain model
*
* License: GNU Lesser General Public License (LGPL), version 2.1 or later
* See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
*/
package org.hibernate.search.testsupport.textbuilder;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
/**
* Test utility meant to produce sentences of a randomly generated language,
* having some properties of natural languages.
* The goal is to produce sentences which look like a western text,
* but without needing an actual resource to read from so we can create unlimited
* garbage. We also get a chance to produce some novel poetry.
* All sentences from the same SentenceInventor will share
* a limited dictionary, making the frequencies somewhat repeatable, suitable to test
* with Lucene.
* Sentences produced depend on the constructor arguments,
* making the output predictable for testing purposes.
*
* @author Sanne Grinovero
*/
public class SentenceInventor {

	// Separators placed between the sentences of a period. Entries are repeated on purpose
	// to shape the probability distribution (a "," is the most likely pick).
	private static final char[] SENTENCE_SEPARATORS = { ',', ',', ',', ';', ':', ':' };

	// Characters closing a period; same trick as above, favouring the "full stop"
	// as the most likely ending.
	private static final char[] PERIOD_SEPARATORS = { '.', '.', '.', '.', '.', '?', '?', '!' };

	private final Random r;
	private final WordDictionary dictionary;
	private final Locale randomlocale;

	/**
	 * Creates a generator whose output is driven by the given seed. The constructor
	 * picks a random Locale first and then fills the dictionary, both using the seeded
	 * random generator.
	 *
	 * @param randomSeed the seed to use for random generator
	 * @param dictionarySize the number of distinct terms to insert in the dictionary used
	 * to build sentences; must not be negative
	 * @throws IllegalArgumentException if {@code dictionarySize} is negative
	 */
	public SentenceInventor(long randomSeed, int dictionarySize) {
		// A negative size can never be reached by the dictionary-filling loop:
		// fail fast instead of spinning forever.
		if ( dictionarySize < 0 ) {
			throw new IllegalArgumentException( "dictionarySize must not be negative, but was " + dictionarySize );
		}
		r = new Random( randomSeed );
		// The locale must be chosen before the dictionary: terms are lowercased using it.
		randomlocale = randomLocale();
		dictionary = randomDictionary( dictionarySize );
	}

	/**
	 * @return a random Locale among the ones reported by {@link Locale#getAvailableLocales()}
	 */
	private Locale randomLocale() {
		Locale[] candidates = Locale.getAvailableLocales();
		return candidates[r.nextInt( candidates.length )];
	}

	/**
	 * @return a random uppercase letter in the range 'A' to 'Z' (inclusive)
	 */
	public char randomCharacter() {
		return (char) ( 'A' + r.nextInt( 26 ) );
	}

	/**
	 * @param length the desired length
	 * @return a randomly generated String made of {@code length} characters
	 * obtained from {@link #randomCharacter()}
	 */
	public String randomString(int length) {
		char[] chars = new char[length];
		for ( int i = 0; i < length; i++ ) {
			chars[i] = randomCharacter();
		}
		return new String( chars );
	}

	/**
	 * Produces a randomly generated String, using
	 * only western alphabet characters. The length is drawn from a gaussian
	 * scaled by 6.3, truncated to an integer and shifted by 6; non-positive lengths
	 * are discarded and drawn again, so the result is never empty.
	 * @return the generated String
	 */
	public String randomString() {
		int length;
		do {
			length = (int) ( r.nextGaussian() * 6.3d ) + 6;
		} while ( length <= 0 );
		return randomString( length );
	}

	/**
	 * Produces a random String, which might be lowercase,
	 * left as generated (completely uppercase), or lowercase with only
	 * the first char left uppercase (randomly selected).
	 * Lowercasing is performed using the Locale picked at construction time.
	 * @return produced String
	 */
	public String randomTerm() {
		// Draw the casing selector BEFORE the term itself, to keep the sequence
		// of random values (and so the generated output) stable for a given seed.
		int casing = r.nextInt( 200 );
		String term = randomString();
		if ( casing > 10 ) {
			//completely lowercase in 189/200 cases
			return term.toLowerCase( randomlocale );
		}
		if ( casing < 2 ) {
			//left untouched in 2/200 cases
			return term;
		}
		//first letter kept as is, remainder lowercase in 9/200 cases
		return term.substring( 0, 1 ) + term.substring( 1 ).toLowerCase( randomlocale );
	}

	/**
	 * Builds the dictionary by generating random terms until the requested
	 * number of distinct terms has been collected; duplicates are dropped by the Set.
	 *
	 * @param size the number of distinct terms to collect
	 * @return a WordDictionary built from the collected terms
	 */
	private WordDictionary randomDictionary(int size) {
		Set<String> terms = new TreeSet<String>();
		while ( terms.size() < size ) {
			terms.add( randomTerm() );
		}
		return new WordDictionary( terms );
	}

	/**
	 * Builds a sentence concatenating between 1 and 12 terms from the generated
	 * dictionary, separated by single spaces.
	 * @return a sentence
	 */
	public String nextSentence() {
		// Sum of two uniform values: 0..2 plus 0..9, plus one to never get an empty sentence.
		int sentenceLength = r.nextInt( 3 ) + r.nextInt( 10 ) + 1;
		StringBuilder sb = new StringBuilder( dictionary.randomWord() );
		for ( int i = 1; i < sentenceLength; i++ ) {
			sb.append( ' ' ).append( dictionary.randomWord() );
		}
		return sb.toString();
	}

	/**
	 * Combines a random number (between 1 and 6) of sentences in a period:
	 * the first char is uppercased using the Locale picked at construction time,
	 * sentences are joined by a random punctuation symbol followed by a space,
	 * and the period is closed by a random terminator ('.', '?' or '!') and a newline.
	 * @return the generated period, including the trailing newline
	 */
	public String nextPeriod() {
		//Combine two random values to make extreme long/short less likely,
		//But still make the "one statement" period more likely than other shapes:
		//every sum below 1 is raised to 1.
		int periodLengthSentences = Math.max( 1, r.nextInt( 6 ) + r.nextInt( 4 ) - 3 );
		String firstSentence = nextSentence();
		StringBuilder sb = new StringBuilder()
				.append( firstSentence.substring( 0, 1 ).toUpperCase( randomlocale ) )
				.append( firstSentence.substring( 1 ) );
		for ( int i = 1; i < periodLengthSentences; i++ ) {
			// The separator is drawn before the sentence: keep this order to
			// preserve the sequence of random values.
			char separator = SENTENCE_SEPARATORS[r.nextInt( SENTENCE_SEPARATORS.length )];
			sb.append( separator )
					.append( ' ' )
					.append( nextSentence() );
		}
		sb.append( PERIOD_SEPARATORS[r.nextInt( PERIOD_SEPARATORS.length )] );
		sb.append( "\n" );
		return sb.toString();
	}

	//run it to get an idea of what this class is going to produce
	public static void main(String[] args) {
		SentenceInventor wi = new SentenceInventor( 7L, 10000 );
		for ( int i = 0; i < 3000; i++ ) {
			System.out.print( wi.nextPeriod() );
		}
	}
}