/* * Copyright (C) 2015 Stratio (http://stratio.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.stratio.morphlines.nlp; import java.util.Collection; import java.util.Collections; import java.util.Locale; import java.util.Map; import org.kitesdk.morphline.api.Command; import org.kitesdk.morphline.api.CommandBuilder; import org.kitesdk.morphline.api.MorphlineContext; import org.kitesdk.morphline.api.Record; import org.kitesdk.morphline.base.AbstractCommand; import com.google.common.collect.ImmutableMap; import com.typesafe.config.Config; import cue.lang.Counter; import cue.lang.NGramIterator; import cue.lang.stop.StopWords; public class TopWordsBuilder implements CommandBuilder { private final static String COMMAND_NAME = "topWords"; public Collection<String> getNames() { return Collections.singletonList(COMMAND_NAME); } public Command build(Config config, Command parent, Command child, MorphlineContext context) { return new TopWords(this, config, parent, child, context); } private static final class TopWords extends AbstractCommand { private final static String LANGUAGE_FIELD = "language"; private final static String INPUT_FIELD = "input"; private final static String OUTPUT_FIELD = "output"; private final String languageFieldName; private final String inputFieldName; private final String outputFieldName; public TopWords(CommandBuilder builder, Config config, Command parent, Command child, final MorphlineContext context) { super(builder, config, parent, child, context); this.languageFieldName = getConfigs().getString(config, LANGUAGE_FIELD); this.inputFieldName = getConfigs().getString(config, INPUT_FIELD); this.outputFieldName = getConfigs().getString(config, OUTPUT_FIELD); validateArguments(); } @Override protected boolean doProcess(Record record) { String language = (String) record.get(languageFieldName).get(0); String inputText = (String) record.get(inputFieldName).get(0); Locale locale = new Locale(language); StopWords stopwords = StopWordsFinder.find(language); final Counter<String> ngrams = new Counter<String>(); for (final String ngram : new NGramIterator(1, inputText, locale, stopwords)) { ngrams.note(ngram.toLowerCase(locale)); } if (!ngrams.getMostFrequent(1).isEmpty()) record.put(outputFieldName, ngrams.getMostFrequent(1).get(0)); // pass record to next command in chain: return super.doProcess(record); } } private final static class StopWordsFinder { private static Map<String,StopWords> stopWordsMap = ImmutableMap.<String, StopWords>builder() .put("es", StopWords.Spanish) .put("en", StopWords.English) .put("ca", StopWords.Catalan) .put("de", StopWords.German) .put("ar", StopWords.Arabic) .put("hr", StopWords.Croatian) .put("cs", StopWords.Czech) .put("nl", StopWords.Dutch) .put("da", StopWords.Danish) .put("eo", StopWords.Esperanto) .put("fi", StopWords.Finnish) .put("fr", StopWords.French) .put("el", StopWords.Greek) .put("hi", StopWords.Hindi) .put("hu", StopWords.Hungarian) .put("it", StopWords.Italian) .put("la", StopWords.Latin) .put("no", StopWords.Norwegian) .put("pl", StopWords.Polish) .put("pt", StopWords.Portuguese) .put("ro", StopWords.Romanian) .put("ru", StopWords.Russian) .put("sl", StopWords.Slovenian) .put("sk", StopWords.Slovak) .put("sv", StopWords.Swedish) .put("he", StopWords.Hebrew) .put("tk", StopWords.Turkish) .build(); public static StopWords find(String language) { StopWords result = stopWordsMap.get(language); if (result == null) { return StopWords.Custom; } return result; } } }