/*
* Hibernate, Relational Persistence for Idiomatic Java
*
* Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
* indicated by the @author tags or express copyright attribution
* statements applied by the authors. All third-party contributions are
* distributed under license by Red Hat, Inc.
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.hibernate.search.test.util.textbuilder;
import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
/**
* Test utility meant to produce sentences of a randomly generated language,
* having some properties of natural languages.
* The goal is to produce sentences which look like a western text,
* but are not.
* All sentences from the same SentenceInventor will share
* a limited dictionary, making the frequencies suitable to test
* with Lucene.
* The sentences produced depend on the constructor arguments,
* making the output predictable for testing purposes.
*
* @author Sanne Grinovero
*/
public class SentenceInventor {

	/** Number of distinct letters used to build terms ('A' to 'Z'). */
	private static final int ALPHABET_SIZE = 26;

	// Single source of randomness; the order of the calls made on it defines the
	// generated sequence for a given seed, so it must not be changed lightly.
	private final Random r;
	private final WordDictionary dictionary;
	// The array repeats some symbols to weight the random choice:
	// ',' is picked in 3 cases out of 6, ':' in 2 out of 6 and ';' in 1 out of 6.
	private final char[] sentenceSeparators = new char[] { ',', ',', ',', ';', ':', ':' };

	/**
	 * @param randomSeed the seed to use for random generator
	 * @param dictionarySize the number of terms to insert in the dictionary used to build sentences
	 * @throws IllegalArgumentException if dictionarySize is negative
	 */
	public SentenceInventor(long randomSeed, int dictionarySize) {
		// a negative size could never be reached by the dictionary-filling loop,
		// which would then spin forever: fail fast instead
		if ( dictionarySize < 0 ) {
			throw new IllegalArgumentException( "dictionarySize must not be negative: " + dictionarySize );
		}
		r = new Random( randomSeed );
		dictionary = randomDictionary( dictionarySize );
	}

	/**
	 * @return a random uppercase letter in the range 'A' to 'Z'
	 */
	public char randomCharacter() {
		return (char) ( 'A' + r.nextInt( ALPHABET_SIZE ) );
	}

	/**
	 * @param length the desired length
	 * @return a randomly generated String made of {@code length} characters
	 * obtained from {@link #randomCharacter()}
	 */
	public String randomString(int length) {
		char[] chars = new char[length];
		for ( int i = 0; i < length; i++ ) {
			chars[i] = randomCharacter();
		}
		return new String( chars );
	}

	/**
	 * Produces a randomly generated String, using
	 * only western alphabet characters. The length is
	 * 6 plus a gaussian value scaled by 6.3 and truncated to an integer;
	 * lengths which are not positive are discarded and drawn again.
	 * @return the generated String
	 */
	public String randomString() {
		int length;
		do {
			length = (int) ( r.nextGaussian() * 6.3d ) + 6;
		} while ( length <= 0 );
		return randomString( length );
	}

	/**
	 * Produces a random String, which might be lowercase,
	 * completely uppercase, or uppercasing the first char
	 * (randomly selected).
	 * Case conversion uses a fixed locale so that the result
	 * does not depend on the default locale of the JVM.
	 * @return produced String
	 */
	public String randomTerm() {
		// the casing selector is drawn before the term itself: keep this order,
		// or the sequence generated from a given seed would change
		int i = r.nextInt( 200 );
		String term = randomString();
		if ( i > 10 ) {
			//completely lowercase in 189/200 cases
			return term.toLowerCase( Locale.ENGLISH );
		}
		else if ( i < 2 ) {
			//returned as generated (uppercase letters) in 2/200 cases
			return term;
		}
		else {
			//only the first letter left uppercase in 9/200 cases
			return term.substring( 0, 1 ) + term.substring( 1 ).toLowerCase( Locale.ENGLISH );
		}
	}

	/**
	 * Generates random terms until {@code size} distinct ones are collected.
	 * @param size the number of distinct terms wanted, not negative
	 * @return a WordDictionary built from the collected terms
	 */
	private WordDictionary randomDictionary(int size) {
		// a Set drops duplicated terms, so the loop continues until enough distinct terms exist
		Set<String> tree = new TreeSet<String>();
		while ( tree.size() < size ) {
			tree.add( randomTerm() );
		}
		return new WordDictionary( tree );
	}

	/**
	 * Builds a sentence concatenating between 1 and 12 terms
	 * from the generated dictionary, separated by single spaces.
	 * @return a sentence
	 */
	public String nextSentence() {
		int sentenceLength = r.nextInt( 3 ) + r.nextInt( 10 ) + 1;
		StringBuilder sb = new StringBuilder();
		for ( int i = 0; i < sentenceLength; i++ ) {
			if ( i > 0 ) {
				sb.append( ' ' );
			}
			String word = dictionary.randomWord();
			sb.append( word );
		}
		return sb.toString();
	}

	/**
	 * Combines a random number of sentences in a period,
	 * using some punctuation symbols and
	 * capitalizing first char, terminating with dot and newline.
	 * The number of sentences is a uniformly drawn value between -2 and 4
	 * raised to a minimum of 1, so a period has 1 to 4 sentences
	 * and a single sentence is the most frequent outcome.
	 * @return the period, ending with ".\n"
	 */
	public String nextPeriod() {
		int periodLengthSentences = Math.max( 1, r.nextInt( 7 ) - 2 );
		String firstSentence = nextSentence();
		// fixed locale: the capitalized character must not depend on the JVM default locale
		StringBuilder sb = new StringBuilder()
				.append( firstSentence.substring( 0, 1 ).toUpperCase( Locale.ENGLISH ) )
				.append( firstSentence.substring( 1 ) );
		for ( int i = 1; i < periodLengthSentences; i++ ) {
			// the separator is chosen before the following sentence is generated
			int separatorCharIndex = r.nextInt( sentenceSeparators.length );
			sb
				.append( sentenceSeparators[separatorCharIndex] )
				.append( ' ' )
				.append( nextSentence() );
		}
		sb.append( ".\n" );
		return sb.toString();
	}

	//run it to get an idea of what this class is going to produce
	public static void main(String[] args) {
		SentenceInventor wi = new SentenceInventor( 7L, 10000 );
		for ( int i = 0; i < 30; i++ ) {
			System.out.print( wi.nextPeriod() );
		}
	}

}