/**
*
*/
package uk.bl.wa.extract;
/*
* #%L
* warc-indexer
* $Id:$
* $HeadURL:$
* %%
* Copyright (C) 2013 - 2014 The UK Web Archive
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as
* published by the Free Software Foundation, either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/gpl-2.0.html>.
* #L%
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import uk.bl.wa.analyser.text.lang.LanguageIdentifier;
import uk.bl.wa.util.Instrument;
import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import com.typesafe.config.Config;
/**
 * Detects the language of a piece of text.
 *
 * Detection first tries the {@link LanguageIdentifier} and, when that result
 * is not reasonably certain, falls back to the langdetect library
 * ({@link DetectorFactory}) for texts longer than a minimum length.
 *
 * The langdetect profiles are read from classpath resources under
 * <code>/lang-detect-profiles/</code> when an instance is constructed.
 *
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 *
 */
public class LanguageDetector {

    private static final Log log = LogFactory.getLog(LanguageDetector.class);

    /** Profile names loaded when the configuration does not list any. */
    public static final String[] DEFAULT_LANGDETECT_PROFILES = new String[] {
            "af", "ar", "bg", "bn", "cs", "da", "de", "el", "en", "es", "et", "fa", "fi", "fr", "gu",
            "he", "hi", "hr", "hu", "id", "it", "ja", "kn", "ko", "lt", "lv", "mk", "ml", "mr",
            "ne", "nl", "no", "pa", "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw",
            "ta", "te", "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw"
    };

    // Only take the first 100,000 characters:
    private static final int MAX_TEXT_LEN = 100 * 1000;

    // Texts of this length or shorter are not passed to langdetect, as
    // (very) short texts are not likely to be classified sensibly:
    private static final int MIN_LANGDETECT_TEXT_LEN = 200;

    // Configuration key holding the list of langdetect profile names:
    private static final String PROFILES_CONF_KEY =
            "warc.index.extract.content.language.langdetectprofiles";

    /**
     * Creates a detector and loads the langdetect profiles.
     *
     * @param conf
     *            application configuration; may be null. When it is null or
     *            does not define the langdetect profile list, the
     *            {@link #DEFAULT_LANGDETECT_PROFILES} are used.
     */
    public LanguageDetector(Config conf) {
        final long start = System.nanoTime();
        if (conf != null && conf.hasPath(PROFILES_CONF_KEY)) {
            log.info("Initialising LanguageDetector using app configuration.");
            init(conf.getStringList(PROFILES_CONF_KEY));
        } else {
            log.info("Initialising LanguageDetector using library defaults.");
            init(Arrays.asList(DEFAULT_LANGDETECT_PROFILES));
        }
        Instrument.timeRel("LanguageAnalyzer#total", "LanguageDetector#startup", start);
    }

    /**
     * Loads the named langdetect profiles into the {@link DetectorFactory}.
     *
     * If the factory already reports a non-empty language list, nothing is
     * loaded. Profiles that cannot be found or read are logged and skipped, and
     * a failure to load the profiles into the factory is logged rather than
     * thrown.
     *
     * @param langdetectProfiles
     *            names of the profile resources under
     *            <code>/lang-detect-profiles/</code>
     */
    public void init(List<String> langdetectProfiles) {
        // Set up the langdetect, unless the factory already has profiles:
        if (DetectorFactory.getLangList().size() != 0) {
            log.info("langdetect already initialized with " + DetectorFactory.getLangList().size() + " profiles");
            return;
        }
        List<String> jsonProfiles = new ArrayList<String>();
        for (String lc : langdetectProfiles) {
            String json = readProfile(lc);
            if (json != null) {
                jsonProfiles.add(json);
            }
        }
        try {
            DetectorFactory.loadProfile(jsonProfiles);
            // Report the number of profiles actually read, not the number requested:
            log.info("Initialized with " + jsonProfiles.size() + " langdetect profiles and "
                    + LanguageIdentifier.getSupportedLanguages().size() + " Tika LanguageIdentifier profiles");
        } catch (LangDetectException e) {
            log.error("Error occurred when loading language profiles:" + e + " Language detection will likely fail. ");
        }
    }

    /**
     * Reads one langdetect profile from the classpath.
     *
     * @param lc
     *            profile name, used as the resource name under
     *            <code>/lang-detect-profiles/</code>
     * @return the profile text, or null if the resource is missing or could not
     *         be read
     */
    private String readProfile(String lc) {
        InputStream in = this.getClass().getResourceAsStream("/lang-detect-profiles/" + lc);
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; skip
            // it rather than failing with a NullPointerException.
            log.error("Could not find langdetect profile: " + lc);
            return null;
        }
        BufferedReader reader = null;
        try {
            // Decode explicitly as UTF-8 instead of relying on the platform
            // default charset (assumes the profile resources are UTF-8 JSON).
            reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line).append('\n');
            }
            return sb.toString();
        } catch (IOException e) {
            log.error("Exception while reading langdetect profile: " + lc, e);
            return null;
        } finally {
            try {
                if (reader != null) {
                    reader.close();
                } else {
                    in.close();
                }
            } catch (IOException e) {
                log.warn("Could not close langdetect profile: " + lc, e);
            }
        }
    }

    /**
     * Detect using the langdetect method.
     *
     * @param text
     *            the text to classify
     * @return language code, or null if langdetect could not detect one
     */
    private String getLangdetectLanguage(String text) {
        try {
            Detector detector = DetectorFactory.create();
            detector.append(text);
            return detector.detect();
        } catch (LangDetectException e) {
            log.info("Could not detect language: " + e);
            return null;
        }
    }

    /**
     * Detects the language of the given text.
     *
     * Only the first {@value #MAX_TEXT_LEN} characters are examined.
     *
     * @param text
     *            the text to classify; may be null
     * @return the detected language code, or null if the text is null, too
     *         short for the fallback detector, or no language could be
     *         determined
     */
    public String detectLanguage(String text) {
        // Nothing to classify:
        if (text == null) {
            return null;
        }
        final long start = System.nanoTime();
        try {
            // Avoid ngramming totally massive texts:
            if (text.length() > MAX_TEXT_LEN) {
                text = text.substring(0, MAX_TEXT_LEN);
            }
            // Attempt to use Tika module first (should be good for long texts)
            LanguageIdentifier li = new LanguageIdentifier(text);
            // Only return that result if it's credible:
            if (li.isReasonablyCertain()) {
                return li.getLanguage();
            }
        } finally {
            Instrument.timeRel("LanguageAnalyzer#total", "LanguageDetector.detectLanguage#li", start);
        }
        final long startLD = System.nanoTime();
        try {
            // Otherwise, fall back to the langdetect approach (should be better for short texts)
            // (Having found that (very) short texts are not likely to be classified sensibly.)
            if (text.length() > MIN_LANGDETECT_TEXT_LEN) {
                return this.getLangdetectLanguage(text);
            }
            return null;
        } finally {
            Instrument.timeRel("LanguageAnalyzer#total", "LanguageDetector.detectLanguage#ld", startLD);
        }
    }

    /**
     * Simple command-line check that runs detection on a fixed English sample.
     *
     * @param args
     *            ignored
     * @throws LangDetectException
     */
    public static void main(String[] args) throws LangDetectException {
        LanguageDetector ld = new LanguageDetector(null);
        System.out.println("Lang: " + ld.detectLanguage("I just wanted to double check that the IP address we need to redirect these to is the IP address of www.webarchive.org.uk. which is"));
    }
}