/*******************************************************************************
* Copyright 2007, 2009 Luan Nguyen, Stephen O'Rourke (stephen.orourke@sydney.edu.au)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.vectorspace.operations;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Locale;
import java.util.regex.Pattern;
import tml.corpus.ParagraphCorpus;
import tml.corpus.TextDocument;
import tml.vectorspace.operations.results.ReadabilityResult;
/**
* This operation calculates readability measurements
*
* @author Luan Nguyen, Stephen O'Rourke
*
*/
public class Readability extends AbstractOperation<ReadabilityResult> {

    /** Minimum number of syllables at which a word is reported as a hard word. */
    private static final int HARD_WORD_SYLLABLE_THRESHOLD = 4;

    /** Matches tokens made only of ASCII letters; compiled once because it is used for every token. */
    private static final Pattern WORD_PATTERN = Pattern.compile("[a-zA-Z]+");

    /** Lower-case characters treated as vowels ('y' is included on purpose). */
    private static final String VOWELS = "aeiouy";

    /**
     * Creates the operation and sets its display name.
     */
    public Readability() {
        // NOTE(review): the name is spelled "Readbility"; it is left unchanged
        // because other code may look the operation up by this exact string --
        // confirm before correcting the spelling.
        this.name = "Readbility Index";
    }

    /**
     * Checks whether a string is a word, i.e. consists of one or more ASCII
     * letters and nothing else.
     *
     * @param word
     *            the token to check, may be <code>null</code>
     * @return <code>true</code> if the token contains only the letters a-z or
     *         A-Z, <code>false</code> otherwise (including for
     *         <code>null</code> and the empty string)
     */
    public boolean isWord(String word) {
        return word != null && WORD_PATTERN.matcher(word).matches();
    }

    /**
     * Checks whether a character is a lower-case vowel: a, e, i, o, u or y (y
     * is treated as a vowel as well). Upper-case letters are not recognised.
     *
     * @param input
     *            a character
     * @return <code>true</code> if the character is a vowel
     */
    public boolean isVowel(char input) {
        return VOWELS.indexOf(input) >= 0;
    }

    /**
     * Estimates the number of syllables in a single word with a rule-based
     * heuristic: every vowel starts a syllable, common vowel pairs are
     * collapsed into one syllable and a silent final 'e' is not counted.
     * <p>
     * Only lower-case vowels are recognised, so the input should already be
     * lower case. Anything that looks like a URL or an e-mail address
     * (contains "www", "http", "@" or ".co") counts as one syllable.
     *
     * @param input
     *            the word to count syllables in, must not be <code>null</code>
     * @return the estimated number of syllables, always at least 1
     */
    public double countSyllable(String input) {
        // contains() also catches a marker at position 0 ("http://...", "www...").
        if (input.contains("www") || input.contains("http") || input.contains("@") || input.contains(".co")) {
            return 1;
        }

        final char[] chars = input.toCharArray();
        final int length = chars.length;
        double syllables = 0;

        // Inside the loop "i++" means: the following vowel belongs to the same
        // syllable, so skip it.
        for (int i = 0; i < length; i++) {
            final char current = chars[i];
            if (!isVowel(current)) {
                continue;
            }
            // A blank stands for "no such character" at either end of the word.
            final char previous = (i > 0) ? chars[i - 1] : ' ';
            final char next = (i < length - 1) ? chars[i + 1] : ' ';

            switch (current) {
            case 'a':
                if (next == 'i' || next == 'u' || next == 'y') {
                    i++;
                }
                syllables++;
                break;

            case 'o':
                if (next == 'a' || next == 'o' || next == 'u' || next == 'y') {
                    i++;
                } else if (next == 'i' && (i + 2) < length) {
                    // "oi" is one syllable unless it is followed by 'n'.
                    if (chars[i + 2] != 'n') {
                        i++;
                    }
                }
                syllables++;
                break;

            case 'u':
                if (next == 'e' || next == 'y'
                        || (next == 'o' && previous != 'd')
                        || (next == 'a' && (previous == 'q' || previous == 's'))) {
                    i++;
                }
                syllables++;
                break;

            case 'e':
                if (next == 'e') {
                    i++;
                } else if (next == 'a' && (i + 2) < length) {
                    final char afterNext = chars[i + 2];
                    if (afterNext != 't' && afterNext != 'c') {
                        i++;
                    }
                } else if (next == 'i') {
                    if (previous != 'r') {
                        i++;
                    } else if ((i + 2) < length) {
                        if (chars[i + 2] != 'n') {
                            i++;
                        }
                    }
                } else if (next == 'o' || next == 'u' || next == 'y') {
                    if (previous != 'r') {
                        i++;
                    }
                } else if (next == 'd') {
                    if (previous != 'd' && previous != 't') {
                        i++;
                    }
                } else if (i == length - 1) {
                    if (isSilentFinalE(chars, i)) {
                        // Cancels the increment below: a silent 'e' adds nothing.
                        syllables--;
                    }
                }
                syllables++;
                break;

            case 'i':
                // All 'i' pair rules need a character after the pair.
                if ((i + 2) < length) {
                    final char afterNext = chars[i + 2];
                    if (next == 'e') {
                        // NOTE(review): this used to be guarded by a list of
                        // "!=" tests (m, n, t, r, s) joined with "||", which is
                        // always true, so "ie" was always collapsed. Behaviour
                        // is kept as-is; confirm the intended rule before
                        // tightening it.
                        i++;
                    } else if (next == 'a') {
                        if (afterNext != 't' && previous != 't') {
                            i++;
                        }
                    } else if (next == 'o') {
                        if ((afterNext != 't' && previous != 't')
                                || previous == 'l' || previous == 's' || previous == 't'
                                || previous == 'c' || previous == 'n') {
                            i++;
                        }
                    }
                }
                syllables++;
                break;

            case 'y':
                if (i == 0) {
                    // A leading 'y' acts as a consonant; the character after
                    // it is skipped as well.
                    i++;
                } else {
                    syllables++;
                }
                break;

            default:
                // isVowel() accepted a character without a dedicated rule
                // (only possible if a subclass overrides it): nothing to count.
                break;
            }
        }

        // Every word has at least one syllable.
        if (syllables == 0) {
            syllables = 1;
        }
        return syllables;
    }

    /**
     * Decides whether an 'e' in the last position of a word is silent.
     *
     * @param chars
     *            the characters of the word
     * @param index
     *            index of the final 'e'
     * @return <code>true</code> if the 'e' should not count as a syllable
     */
    private boolean isSilentFinalE(char[] chars, int index) {
        final char previous = (index > 0) ? chars[index - 1] : ' ';
        if (previous != 'h' && previous != 'w' && previous != 'm' && previous != 'b' && previous != 'l'
                && previous != 'r') {
            return true;
        }
        if (index < 2) {
            // Two-letter words such as "he", "we", "me", "be": the 'e' is voiced.
            return false;
        }
        if (previous == 'r') {
            final char secondPrevious = chars[index - 2];
            return secondPrevious != 'd' && secondPrevious != 't' && secondPrevious != 'i';
        }
        return !isVowel(previous);
    }

    /**
     * Calculates the Flesch-Kincaid Grade Level using the standard formula
     * <code>0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59</code>.
     * <p>
     * No guard is applied for zero words or zero sentences; in that case the
     * result follows floating-point division rules (NaN or infinite).
     *
     * @param numberOfWord
     *            number of words in the text
     * @param numberOfSentence
     *            number of sentences in the text
     * @param numberOfSyllable
     *            number of syllables in the text
     * @return Flesch-Kincaid grade level
     */
    public double calculateGradeLevel(double numberOfWord, double numberOfSentence, double numberOfSyllable) {
        return ((0.39 * (numberOfWord / numberOfSentence) + (11.8 * numberOfSyllable / numberOfWord)) - 15.59);
    }

    /**
     * Calculates the Flesch Reading Ease index using the standard formula
     * <code>206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)</code>.
     * <p>
     * No guard is applied for zero words or zero sentences; in that case the
     * result follows floating-point division rules (NaN or infinite).
     *
     * @param numberOfWord
     *            number of words in the text
     * @param numberOfSentence
     *            number of sentences in the text
     * @param numberOfSyllable
     *            number of syllables in the text
     * @return the Flesch Reading Ease index
     */
    public double calculateReadingEase(double numberOfWord, double numberOfSentence, double numberOfSyllable) {
        return 206.835 - (1.015 * (numberOfWord / numberOfSentence)) - (84.6 * numberOfSyllable / numberOfWord);
    }

    /**
     * Calculates the absolute difference between two readability indices.
     * The absolute value is returned because only the size of the difference
     * matters, not which of the two paragraphs scored higher.
     *
     * @param number1
     *            index of the first paragraph
     * @param number2
     *            index of the second paragraph
     * @return the absolute difference between the two indices
     */
    public double calculateDifferent(double number1, double number2) {
        return Math.abs(number1 - number2);
    }

    /**
     * Stores, on every result except the last one, the difference between its
     * readability indices and those of the result that follows it. The last
     * result has no successor and is left untouched.
     *
     * @param results
     *            results of consecutive paragraphs, in document order
     */
    public void differentiate(ArrayList<ReadabilityResult> results) {
        for (int i = 0; i < results.size() - 1; i++) {
            ReadabilityResult current = results.get(i);
            ReadabilityResult following = results.get(i + 1);
            current.setDiffReadingEase(
                    calculateDifferent(current.getFleshReadingEase(), following.getFleshReadingEase()));
            current.setDiffGradeLevel(
                    calculateDifferent(current.getFleshKincaidGradeLevel(), following.getFleshKincaidGradeLevel()));
        }
    }

    /**
     * Runs the operation: for every passage of the corpus it counts sentences,
     * words and syllables, collects the hard words, computes both Flesch
     * indices and finally the differences between consecutive passages.
     * <p>
     * If the corpus is not a {@link ParagraphCorpus} a debug message is logged
     * and the result list stays empty.
     *
     * @see tml.vectorspace.operations.AbstractOperation#start()
     */
    @Override
    public void start() throws Exception {
        super.start();
        this.results = new ArrayList<ReadabilityResult>();
        if (!(corpus instanceof ParagraphCorpus)) {
            logger.debug("This function requires paragraph corpus to meet its purpose");
            return;
        }
        for (int key = 0; key < this.corpus.getPassages().length; key++) {
            double wordInParagraph = 0;
            double sentenceInParagraph = 0;
            double syllableInParagraph = 0;
            TextDocument passage = this.repository.getTextDocument(this.corpus.getPassages()[key]);
            ReadabilityResult result = new ReadabilityResult();
            String currentParagraph = passage.getContent();

            BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(corpus.getRepository().getLocale());
            // One word iterator per passage; it is re-targeted with setText()
            // for every sentence instead of being re-created.
            BreakIterator wordIterator = BreakIterator.getWordInstance(corpus.getRepository().getLocale());

            sentenceIterator.setText(currentParagraph);
            int sentenceStart = sentenceIterator.first();
            int sentenceEnd;
            while ((sentenceEnd = sentenceIterator.next()) != BreakIterator.DONE) {
                String currentSentence = currentParagraph.substring(sentenceStart, sentenceEnd);
                sentenceInParagraph++;

                wordIterator.setText(currentSentence);
                int wordStart = wordIterator.first();
                int wordEnd;
                while ((wordEnd = wordIterator.next()) != BreakIterator.DONE) {
                    // Locale.ENGLISH keeps the case mapping ASCII-safe (e.g. a
                    // Turkish default locale would turn 'I' into a dotless i,
                    // which isWord() rejects).
                    String currentWord = currentSentence.substring(wordStart, wordEnd).trim()
                            .toLowerCase(Locale.ENGLISH);
                    if (isWord(currentWord)) {
                        double syllableInWord = countSyllable(currentWord);
                        wordInParagraph++;
                        syllableInParagraph += syllableInWord;
                        if (syllableInWord >= HARD_WORD_SYLLABLE_THRESHOLD) {
                            result.addHardWord(currentWord);
                        }
                    }
                    wordStart = wordEnd;
                }
                sentenceStart = sentenceEnd;
            }

            result.setFleshKincaidGradeLevel(
                    calculateGradeLevel(wordInParagraph, sentenceInParagraph, syllableInParagraph));
            result.setFleshReadingEase(
                    calculateReadingEase(wordInParagraph, sentenceInParagraph, syllableInParagraph));
            result.setTextPassageContent(passage.getContent());
            result.setTextPassageId(key);
            results.add(result);
        }
        differentiate(results);
    }
}