SentenceInventor.java example

Explorer
hibernate-search-master
/*
 * Hibernate Search, full-text search for your domain model
 *
 * License: GNU Lesser General Public License (LGPL), version 2.1 or later
 * See the lgpl.txt file in the root directory or <http://www.gnu.org/licenses/lgpl-2.1.html>.
 */
package org.hibernate.search.testsupport.textbuilder;

import java.util.Locale;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

/**
 * Test utility meant to produce sentences of a randomly generated language,
 * having some properties of natural languages.
 * The goal is to produce sentences which look like a western text,
 * but without needing an actual resource to read from so we can create unlimited
 * garbage. We also get a chance to produce some novel poetry.
 * All sentences from the same SentenceInventor will share
 * a limited dictionary, making the frequencies somehow repeatable, suitable to test
 * with Lucene.
 * Sentences produced depend on the constructor arguments,
 * making the output predictable for testing purposes.
 * Note that the casing rules are taken from a Locale picked among
 * {@link Locale#getAvailableLocales()}, so the output for a given seed is
 * only repeatable on runtimes exposing the same locales in the same order.
 *
 * @author Sanne Grinovero
 */
public class SentenceInventor {

	//array contains repeated object for probability distribution (more chance for a ",")
	private static final char[] SENTENCE_SEPARATORS = { ',', ',', ',', ';', ':', ':' };
	//same as above, but favour the "full stop" char as a more likely end for periods.
	private static final char[] PERIOD_SEPARATORS = { '.', '.', '.', '.', '.', '?', '?', '!' };

	private final Random r;
	private final WordDictionary dictionary;
	private final Locale randomlocale;

	/**
	 * @param randomSeed the seed to use for random generator
	 * @param dictionarySize the number of terms to insert in the dictionary used to build sentences
	 * @throws IllegalArgumentException if dictionarySize is negative
	 */
	public SentenceInventor(long randomSeed, int dictionarySize) {
		if ( dictionarySize < 0 ) {
			//a negative size could never be reached and would make the dictionary creation loop forever
			throw new IllegalArgumentException( "dictionarySize must not be negative: " + dictionarySize );
		}
		//the order of the following statements matters: each one consumes values from the same Random
		r = new Random( randomSeed );
		randomlocale = randomLocale();
		dictionary = randomDictionary( dictionarySize );
	}

	/**
	 * @return a random Locale among the ones available on the current system
	 */
	private Locale randomLocale() {
		Locale[] availableLocales = Locale.getAvailableLocales();
		return availableLocales[r.nextInt( availableLocales.length )];
	}

	/**
	 * @return a random uppercase letter in the range 'A' to 'Z'
	 */
	public char randomCharacter() {
		return (char) ( 'A' + r.nextInt( 26 ) );
	}

	/**
	 * @param length the desired length
	 * @return a randomly generated String made of {@code length} uppercase letters
	 */
	public String randomString(int length) {
		char[] chars = new char[length];
		for ( int i = 0; i < length; i++ ) {
			chars[i] = randomCharacter();
		}
		return new String( chars );
	}

	/**
	 * Produces a randomly generated String, using
	 * only western alphabet characters and selecting
	 * the length from a normal distribution centered on 6 characters.
	 * Non-positive lengths are discarded and drawn again, so the result
	 * is never empty.
	 * @return the generated String
	 */
	public String randomString() {
		int length;
		//retry with a loop rather than recursion: same sequence of random draws, no stack growth
		do {
			length = (int) ( r.nextGaussian() * 6.3d ) + 6;
		}
		while ( length <= 0 );
		return randomString( length );
	}

	/**
	 * Produces a random String, which might be lowercase,
	 * completely uppercase, or uppercasing the first char
	 * (randomly selected)
	 * @return produced String
	 */
	public String randomTerm() {
		//the case selector is drawn before the term itself; keep this order to preserve the generated sequence
		int i = r.nextInt( 200 );
		String term = randomString();
		if ( i > 10 ) {
			//completely lowercase 189/200 cases
			return term.toLowerCase( randomlocale );
		}
		if ( i < 2 ) {
			//completely uppercase in 2/200 cases: randomString() only produces uppercase letters
			return term;
		}
		//first letter uppercase in 9/200 cases
		return term.substring( 0, 1 ) + term.substring( 1 ).toLowerCase( randomlocale );
	}

	/**
	 * Fills a sorted set with distinct random terms until it contains the requested amount.
	 * Duplicated terms are generated again, so very large sizes can take a long time to build.
	 * @param size the number of distinct terms wanted, expected to be non-negative
	 * @return a dictionary wrapping the generated terms
	 */
	private WordDictionary randomDictionary(int size) {
		Set<String> tree = new TreeSet<String>();
		while ( tree.size() < size ) {
			tree.add( randomTerm() );
		}
		return new WordDictionary( tree );
	}

	/**
	 * Builds a sentence concatenating between 1 and 12 terms from the generated dictionary,
	 * separated by single spaces
	 * @return a sentence
	 */
	public String nextSentence() {
		int sentenceLength = r.nextInt( 3 ) + r.nextInt( 10 ) + 1;
		//there is always at least one word, so the first one is appended without a leading space
		StringBuilder sb = new StringBuilder( dictionary.randomWord() );
		for ( int i = 1; i < sentenceLength; i++ ) {
			sb.append( ' ' ).append( dictionary.randomWord() );
		}
		return sb.toString();
	}

	/**
	 * Combines a random number of sentences (between 1 and 5) in a period,
	 * joining them with some punctuation symbols,
	 * capitalizing the first char and terminating with
	 * an end-of-period symbol ('.', '?' or '!') followed by a newline.
	 * @return the generated period, including the trailing newline
	 */
	public String nextPeriod() {
		//Combine two random values to make extreme long/short less likely,
		//But still make the "one statement" period more likely than other shapes.
		int periodLengthSentences = Math.max( 1, r.nextInt( 6 ) + r.nextInt( 4 ) - 3 );
		String firstsentence = nextSentence();
		StringBuilder sb = new StringBuilder()
			.append( firstsentence.substring( 0, 1 ).toUpperCase( randomlocale ) )
			.append( firstsentence.substring( 1 ) );
		for ( int i = 1; i < periodLengthSentences; i++ ) {
			//the separator is drawn before the sentence it introduces
			char separator = SENTENCE_SEPARATORS[r.nextInt( SENTENCE_SEPARATORS.length )];
			sb.append( separator )
				.append( ' ' )
				.append( nextSentence() );
		}
		sb.append( PERIOD_SEPARATORS[r.nextInt( PERIOD_SEPARATORS.length )] );
		sb.append( "\n" );
		return sb.toString();
	}

	//run it to get an idea of what this class is going to produce
	public static void main(String[] args) {
		SentenceInventor wi = new SentenceInventor( 7L, 10000 );
		for ( int i = 0; i < 3000; i++ ) {
			System.out.print( wi.nextPeriod() );
		}
	}

}