/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.vectorizer;
import java.util.Random;
import org.apache.mahout.common.RandomUtils;
/**
 * Generates pseudo-random text built from a tiny alphabet.
 *
 * <p>Words are drawn from {@link #CHARSET} and occasionally end in a single
 * character from {@link #ERRORSET}; sentences are space-separated words closed
 * by a character from {@link #DELIM}; documents are concatenated sentences.
 * All randomness comes from the single {@link Random} obtained from
 * {@code RandomUtils.getRandom()} when the instance is created.
 */
public class RandomDocumentGenerator {

  /** Documents hold {@code AVG/2 + [0, AVG)} sentences, i.e. 10 to 29. */
  private static final int AVG_DOCUMENT_LENGTH = 20;

  /** Sentences hold {@code AVG/2 + [0, AVG)} words, i.e. 4 to 11. */
  private static final int AVG_SENTENCE_LENGTH = 8;

  /** Words hold {@code AVG/2 + [0, AVG)} letters, i.e. 3 to 8. */
  private static final int AVG_WORD_LENGTH = 6;

  /** Letters that regular word characters are drawn from. */
  private static final String CHARSET = "abcdef";

  /** Characters that may terminate a sentence. */
  private static final String DELIM = " .,?;:!\t\n\r";

  /** Non-letter characters that may be tacked onto the end of a word. */
  private static final String ERRORSET = "`1234567890" + "-=~@#$%^&*()_+[]{}'\"/<>|\\";

  private final Random random = RandomUtils.getRandom();

  /**
   * Selects one character from {@code alphabet} at a randomly drawn index.
   *
   * @param alphabet non-empty string to choose from
   * @return the character at the drawn position
   */
  private char pick(String alphabet) {
    int position = random.nextInt(alphabet.length());
    return alphabet.charAt(position);
  }

  /**
   * Builds a document by concatenating random sentences with no extra
   * separator between them.
   *
   * @return the generated document text
   */
  public String getRandomDocument() {
    // The size is drawn before any sentence is generated.
    int sentenceCount = random.nextInt(AVG_DOCUMENT_LENGTH) + AVG_DOCUMENT_LENGTH / 2;
    StringBuilder document =
        new StringBuilder(sentenceCount * AVG_SENTENCE_LENGTH * AVG_WORD_LENGTH);
    for (int remaining = sentenceCount; remaining > 0; remaining--) {
      document.append(getRandomSentence());
    }
    return document.toString();
  }

  /**
   * Builds a sentence: each random word is followed by a single space, and the
   * whole sentence is closed by one character drawn from {@link #DELIM}.
   *
   * @return the generated sentence text
   */
  public String getRandomSentence() {
    int wordCount = random.nextInt(AVG_SENTENCE_LENGTH) + AVG_SENTENCE_LENGTH / 2;
    StringBuilder sentence = new StringBuilder(wordCount * AVG_WORD_LENGTH);
    for (int remaining = wordCount; remaining > 0; remaining--) {
      String word = getRandomString();
      sentence.append(word);
      sentence.append(' ');
    }
    // The delimiter is drawn only after every word has been generated.
    sentence.append(pick(DELIM));
    return sentence.toString();
  }

  /**
   * Builds a word of letters from {@link #CHARSET}. After the letters, a value
   * in {@code [0, 10)} is drawn; when it is zero, one character from
   * {@link #ERRORSET} is appended to the word.
   *
   * @return the generated word
   */
  public String getRandomString() {
    int letterCount = random.nextInt(AVG_WORD_LENGTH) + AVG_WORD_LENGTH / 2;
    StringBuilder word = new StringBuilder(letterCount);
    for (int remaining = letterCount; remaining > 0; remaining--) {
      word.append(pick(CHARSET));
    }
    boolean addNoise = random.nextInt(10) == 0;
    if (addNoise) {
      word.append(pick(ERRORSET));
    }
    return word.toString();
  }
}