/* * Copyright (C) 2015 Stratio (http://stratio.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.stratio.morphlines.wikipediacleaner; import info.bliki.wiki.filter.PlainTextConverter; import info.bliki.wiki.model.WikiModel; import java.util.Collection; import java.util.Collections; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.kitesdk.morphline.api.Command; import org.kitesdk.morphline.api.CommandBuilder; import org.kitesdk.morphline.api.MorphlineContext; import org.kitesdk.morphline.api.Record; import org.kitesdk.morphline.base.AbstractCommand; import com.typesafe.config.Config; public class WikipediaCleanerBuilder implements CommandBuilder { private final static String COMMAND_NAME = "wikipediaCleaner"; private final static String TEMPLATE_PATTERN = "\\{\\{[^{}]*\\}\\}"; public Collection<String> getNames() { return Collections.singletonList(COMMAND_NAME); } public Command build(Config config, Command parent, Command child, MorphlineContext context) { return new WikipediaCleaner(this, config, parent, child, context); } private static final class WikipediaCleaner extends AbstractCommand { private final static String INPUT_FIELD = "input"; private final static String OUTPUT_FIELD = "output"; private String inputFieldName; private String outputFieldName; public WikipediaCleaner(CommandBuilder builder, Config config, Command parent, Command child, final MorphlineContext context) { super(builder, config, parent, child, context); this.inputFieldName = getConfigs().getString(config, INPUT_FIELD); this.outputFieldName = getConfigs().getString(config, OUTPUT_FIELD); validateArguments(); } @Override protected boolean doProcess(Record record) { Object value = record.get(inputFieldName).get(0); WikiModel wikiModel = new WikiModel( "http://www.mywiki.com/wiki/${image}", "http://www.mywiki.com/wiki/${title}"); String cleanText = wikiModel.render(new PlainTextConverter(), value.toString()); Pattern p = Pattern.compile(TEMPLATE_PATTERN); Matcher m = p.matcher(cleanText); while (m.find()) { cleanText = m.replaceAll(""); m = p.matcher(cleanText); } record.put(outputFieldName, cleanText.trim()); // pass record to next command in chain: return super.doProcess(record); } } }