/**
* <a href="http://www.openolat.org">
* OpenOLAT - Online Learning and Training</a><br>
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at the
* <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache homepage</a>
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Initial code contributed and copyrighted by<br>
* frentix GmbH, http://www.frentix.com
* <p>
*/
package org.olat.core.commons.services.text.impl;
import java.text.BreakIterator;
import java.util.Locale;
import org.olat.core.commons.services.text.TextService;
import org.olat.core.commons.services.text.impl.nutch.LanguageIdentifier;
import org.olat.core.util.i18n.I18nModule;
import org.springframework.stereotype.Service;
/**
* Description:<br>
* The language guess is based on an n-gram algorithm. The implementation
* is based on a plugin of Nutch (an Apache project). To create a profile
* for a new language, collect a sample text in that language. It needs
* a lot of text (the standard profiles were built from more than a million words).
* Then create the profile:<br>
* java org.olat.core.commons.services.text.impl.nutch.NGramProfile -create profile-name filename encoding<br>
* OLAT works with UTF-8, so create the profile with UTF-8. The right encoding is very important.<br>
* Then add the profile-name.ngp file alongside the others in _resources. The profile-name is the name of the language.
*
* <P>
* Initial Date: 25 nov. 2009 <br>
* @author srosse
*/
@Service
public class TextServiceImpl implements TextService {

    /**
     * Divisor applied to the character count of a Chinese text to estimate
     * its number of words (characters per word).
     */
    private static final float CHINESE_RATIO_WORD_CHARACTER = 2.2f;

    private final LanguageIdentifier identifier = new LanguageIdentifier();

    /**
     * [spring only]
     */
    public TextServiceImpl() {
        //
    }

    /**
     * Guesses the language of a text with the n-gram language identifier and
     * maps the resulting language code to one of the JVM's available locales.
     * Detection works best on longer texts (more than about 100 characters).
     * <p>
     * When several available locales share the detected language, the
     * language-only locale (no country, script or variant) is preferred so
     * that the result does not depend on the unspecified order of
     * {@link Locale#getAvailableLocales()}.
     *
     * @param text The text to analyse, may be {@code null}
     * @return A locale with the detected language, or {@code null} if the text
     *         is {@code null} or empty, no language was identified, or the
     *         language matches no available locale
     */
    @Override
    public Locale detectLocale(String text) {
        if (text == null || text.isEmpty()) {
            return null;
        }

        String language = identifier.identify(text);
        // An empty code carries no language information and would otherwise
        // match the root locale if it is listed among the available locales.
        if (language == null || language.isEmpty()) {
            return null;
        }

        Locale firstMatch = null;
        for (Locale candidate : Locale.getAvailableLocales()) {
            if (!language.equals(candidate.getLanguage())) {
                continue;
            }
            if (isLanguageOnly(candidate)) {
                return candidate;
            }
            if (firstMatch == null) {
                firstMatch = candidate;
            }
        }
        return firstMatch;
    }

    /**
     * Counts the letters and digits of a text; whitespace, punctuation and
     * other symbols are ignored.
     *
     * @param text The text, may be {@code null}
     * @param locale The locale used to find character boundaries; the default
     *        locale of {@link I18nModule} is used if {@code null}
     * @return The number of letter or digit characters, 0 for a {@code null}
     *         or empty text
     */
    @Override
    public int characterCount(String text, Locale locale) {
        return countCharacters(text, locale);
    }

    /**
     * Counts the words of a text with a {@link BreakIterator}. Chinese is an
     * exception: its words are not separated by spaces, so the count is
     * estimated as the number of characters divided by 2.2.
     *
     * @param text The text, may be {@code null}
     * @param locale The locale of the text; the default locale of
     *        {@link I18nModule} is used if {@code null}
     * @return The number of words, 0 for a {@code null} or empty text
     */
    @Override
    public int wordCount(String text, Locale locale) {
        Locale effectiveLocale = (locale == null) ? I18nModule.getDefaultLocale() : locale;
        if (Locale.CHINESE.getLanguage().equals(effectiveLocale.getLanguage())) {
            return countChineseWords(text, effectiveLocale);
        }
        return countWords(text, effectiveLocale);
    }

    /** Estimates the word count of a Chinese text from its character count. */
    private int countChineseWords(String text, Locale locale) {
        int characters = countCharacters(text, locale);
        return Math.round(characters / CHINESE_RATIO_WORD_CHARACTER);
    }

    /** Counts the word segments of the text that start with a letter or a digit. */
    private int countWords(String text, Locale locale) {
        return countLetterOrDigitSegments(BreakIterator.getWordInstance(locale), text);
    }

    /** Counts the character segments of the text that start with a letter or a digit. */
    private int countCharacters(String text, Locale locale) {
        Locale effectiveLocale = (locale == null) ? I18nModule.getDefaultLocale() : locale;
        return countLetterOrDigitSegments(BreakIterator.getCharacterInstance(effectiveLocale), text);
    }

    /**
     * Walks the boundaries produced by the iterator and counts the segments
     * whose first code point is a letter or a digit.
     */
    private static int countLetterOrDigitSegments(BreakIterator iterator, String text) {
        if (text == null || text.isEmpty()) {
            return 0;
        }

        iterator.setText(text);
        int count = 0;
        int start = iterator.first();
        for (int end = iterator.next(); end != BreakIterator.DONE; end = iterator.next()) {
            // Read a full code point: a supplementary character starts with a
            // surrogate, which on its own is neither a letter nor a digit.
            if (Character.isLetterOrDigit(text.codePointAt(start))) {
                count++;
            }
            start = end;
        }
        return count;
    }

    /** Tells whether the locale specifies a language only, without country, script or variant. */
    private static boolean isLanguageOnly(Locale locale) {
        return locale.getCountry().isEmpty()
                && locale.getScript().isEmpty()
                && locale.getVariant().isEmpty();
    }
}