/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.parse.filter; import java.io.IOException; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.w3c.dom.DocumentFragment; import com.digitalpebble.stormcrawler.parse.ParseFilter; import com.digitalpebble.stormcrawler.parse.ParseResult; import com.fasterxml.jackson.databind.JsonNode; import com.google.common.base.Optional; import com.optimaize.langdetect.LanguageDetector; import com.optimaize.langdetect.LanguageDetectorBuilder; import com.optimaize.langdetect.i18n.LdLocale; import com.optimaize.langdetect.ngram.NgramExtractors; import com.optimaize.langdetect.profiles.LanguageProfile; import com.optimaize.langdetect.profiles.LanguageProfileReader; import com.optimaize.langdetect.text.CommonTextObjectFactories; import com.optimaize.langdetect.text.TextObject; import com.optimaize.langdetect.text.TextObjectFactory; /** * Language identification; the language code gets stored in the metadata. <br> * * To use it, just add the module as a dependency in your pom and include * * ```json { "class": "com.digitalpebble.stormcrawler.parse.filter.LanguageID", * "name": "LanguageID", "params": { "key": "lang" } } * * in the parse filter config. ``` **/ public class LanguageID extends ParseFilter { private static LanguageDetector languageDetector; private static final TextObjectFactory textObjectFactory = CommonTextObjectFactories .forDetectingOnLargeText(); private String mdKey = "lang"; static { try { // load all languages: List<LanguageProfile> languageProfiles = new LanguageProfileReader() .readAllBuiltIn(); // build language detector: languageDetector = LanguageDetectorBuilder .create(NgramExtractors.standard()) .withProfiles(languageProfiles).build(); } catch (IOException e) { throw new RuntimeException("Error while loading language profiles", e); } } @Override public void configure(Map stormConf, JsonNode filterParams) { JsonNode node = filterParams.get("key"); if (node != null && node.isTextual()) { mdKey = node.asText("lang"); } } @Override public void filter(String url, byte[] content, DocumentFragment doc, ParseResult parse) { String text = parse.get(url).getText(); if (StringUtils.isBlank(text)) { return; } TextObject textObject = textObjectFactory.forText(text); synchronized (languageDetector) { Optional<LdLocale> lang = languageDetector.detect(textObject); if (lang.isPresent()) { String code = lang.get().getLanguage(); parse.get(url).getMetadata().setValue(mdKey, code); } } } }