package focusedCrawler.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.Language;

import focusedCrawler.target.model.Page;

/**
 * Detects whether a text (or the clean text of a crawled {@link Page}) is
 * written in English, using the langdetect library.
 *
 * <p>The language profiles are loaded once, from the classpath, when this
 * class is initialized. If any profile cannot be loaded, class initialization
 * fails with an {@link IllegalStateException}.
 */
public class LangDetection {

    private static final Logger logger = LoggerFactory.getLogger(LangDetection.class);

    /** Names of the profile resources expected under {@code profiles/} on the classpath. */
    private static final String[] LANGUAGES = {
        "af", "ar", "bg", "bn", "cs", "da", "de", "el", "en", "es", "et",
        "fa", "fi", "fr", "gu", "he", "hi", "hr", "hu", "id", "it", "ja",
        "kn", "ko", "lt", "lv", "mk", "ml", "mr", "ne", "nl", "no", "pa",
        "pl", "pt", "ro", "ru", "sk", "sl", "so", "sq", "sv", "sw", "ta",
        "te", "th", "tl", "tr", "uk", "ur", "vi", "zh-cn", "zh-tw"
    };

    // Loads every language profile from the classpath and registers them
    // with the langdetect DetectorFactory.
    static {
        try {
            List<String> profiles = new ArrayList<String>(LANGUAGES.length);
            for (String language : LANGUAGES) {
                profiles.add(readProfile(language));
            }
            DetectorFactory.loadProfile(profiles);
        } catch (Exception e) {
            // Keep the original exception as the cause so that the actual
            // failure (missing file, malformed profile, ...) can be diagnosed.
            throw new IllegalStateException("Could not load language profiles.", e);
        }
    }

    /**
     * Reads the first line of the classpath resource {@code profiles/<language>}
     * and returns it as the JSON profile for that language.
     *
     * @param language profile name, e.g. {@code "en"}
     * @return the first line of the profile resource
     * @throws IOException if the resource cannot be read
     * @throws IllegalStateException if the resource is missing or empty
     */
    private static String readProfile(String language) throws IOException {
        String filename = "profiles/" + language;
        InputStream is = LangDetection.class.getClassLoader().getResourceAsStream(filename);
        if (is == null) {
            // getResourceAsStream returns null (rather than throwing) when
            // the resource does not exist.
            throw new IllegalStateException("Language profile not found on classpath: " + filename);
        }
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            // Only the first line is used as the profile.
            // NOTE(review): presumably each profile file is a single-line JSON document - confirm.
            String jsonProfile = br.readLine();
            if (jsonProfile == null) {
                throw new IllegalStateException("Language profile is empty: " + filename);
            }
            return jsonProfile;
        } finally {
            // Closing the underlying stream releases the resource even when
            // constructing or using the reader fails.
            is.close();
        }
    }

    /**
     * Tries to detect whether the given text is in English.
     *
     * <p>The text is considered English when {@code "en"} appears among the
     * languages returned by {@link Detector#getProbabilities()}, regardless of
     * its position in that list.
     * NOTE(review): presumably the library only reports languages above its
     * own probability threshold - confirm against the langdetect documentation.
     *
     * @param content the text to analyze; may be {@code null}
     * @return {@code true} if English is among the detected languages;
     *         {@code false} if it is not, if {@code content} is {@code null}
     *         or empty, or if detection fails
     */
    public Boolean isEnglish(String content) {
        try {
            if (content == null || content.isEmpty()) {
                return false;
            }
            Detector detector = DetectorFactory.create();
            detector.append(content);
            List<Language> langs = detector.getProbabilities();
            for (Language l : langs) {
                if (l.lang.equals("en")) {
                    return true;
                }
            }
            return false;
        } catch (Exception ex) {
            logger.warn("Problem while detecting language in text: " + content, ex);
            return false;
        }
    }

    /**
     * Tries to detect whether the clean text of the given page is in English.
     *
     * @param page the page whose parsed clean text is analyzed
     * @return {@code true} if the page text is detected as English;
     *         {@code false} otherwise, including when the text cannot be
     *         obtained from the page
     */
    public Boolean isEnglish(Page page) {
        try {
            return isEnglish(page.getParsedData().getCleanText());
        } catch (Exception e) {
            // Best effort: report the failure through the logger (with the
            // stack trace) and treat the page as not English.
            logger.warn("Exception in detect_page", e);
            return false;
        }
    }
}