/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.streams.regex;
import org.apache.streams.core.StreamsDatum;
import org.apache.streams.core.StreamsProcessor;
import org.apache.streams.jackson.StreamsJacksonMapper;
import org.apache.streams.pojo.extensions.ExtensionUtil;
import org.apache.streams.pojo.json.Activity;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.collect.Sets;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Provides a base implementation for extracting entities from text using regular expressions and then
 * modifying the appropriate {@link org.apache.streams.pojo.json.Activity} extensions object.
 *
 * <p>Subclasses pass the configuration key, the extension key and a default pattern to the
 * constructor, and implement {@link #prepareObject(String)} to turn each matched string into the
 * value that is stored in the extensions object.
 *
 * @param <T> the type of value stored in the extension for each regex match
 */
public abstract class AbstractRegexExtensionExtractor<T> implements StreamsProcessor {

  private static final Logger LOGGER = LoggerFactory.getLogger(AbstractRegexExtensionExtractor.class);
  private static final ObjectMapper mapper = StreamsJacksonMapper.getInstance();

  /** Key under which the pattern is looked up when {@link #prepare(Object)} receives a {@link Map}. */
  private final String patternConfigKey;
  /** Key of the extensions entry that receives the extracted values. */
  private final String extensionKey;
  /** Pattern used when no usable pattern has been configured. */
  private final String defaultPattern;

  /** Pattern currently in effect; assigned by {@link #prepare(Object)}. */
  private String pattern;

  /**
   * Creates an extractor.
   *
   * @param patternConfigKey key used to find the pattern in a {@link Map} configuration
   * @param extensionKey key of the extensions entry that the extracted values are written to
   * @param defaultPattern pattern to use when the configuration does not supply one
   */
  protected AbstractRegexExtensionExtractor(String patternConfigKey, String extensionKey, String defaultPattern) {
    this.patternConfigKey = patternConfigKey;
    this.extensionKey = extensionKey;
    this.defaultPattern = defaultPattern;
  }

  /**
   * Returns the pattern currently in effect.
   *
   * @return the pattern, or {@code null} if {@link #prepare(Object)} has not been called yet
   */
  public String getPattern() {
    return pattern;
  }

  /**
   * Extracts all matches of the pattern from the activity content and merges them into the
   * extension stored under the configured extension key.
   *
   * <p>The datum's document must be an {@link Activity} or an {@link ObjectNode}; an
   * {@link ObjectNode} is converted to an {@link Activity}, and the datum's document is replaced
   * with that {@link Activity}.
   *
   * @param entry the datum to process
   * @return a single-element list holding {@code entry}, or an empty list when {@code entry} is
   *     {@code null} or its document is of an unsupported type
   */
  @Override
  public List<StreamsDatum> process(StreamsDatum entry) {
    // A null datum is handled like an unsupported document instead of failing with an NPE.
    if (entry == null) {
      return new ArrayList<>();
    }
    Object document = entry.getDocument();
    Activity activity;
    if (document instanceof Activity) {
      activity = (Activity) document;
    } else if (document instanceof ObjectNode) {
      activity = mapper.convertValue(document, Activity.class);
    } else {
      return new ArrayList<>();
    }
    // Lazily fall back to the default pattern if prepare() was never called.
    if (StringUtils.isBlank(pattern)) {
      prepare(null);
    }
    Map<String, List<Integer>> matches = RegexUtils.extractMatches(pattern, activity.getContent());
    Collection<T> entities = ensureTargetObject(activity);
    for (String key : matches.keySet()) {
      entities.add(prepareObject(key));
    }
    // ensureTargetObject may be overridden to return a non-Set collection, so de-duplicate here.
    Set<T> unique = new HashSet<>(entities);
    entities.clear();
    entities.addAll(unique);
    entry.setDocument(activity);
    return Collections.singletonList(entry);
  }

  /**
   * Selects the pattern to use.
   *
   * <ul>
   *   <li>{@link Map}: the value stored under the pattern configuration key is used when the key
   *       is present; otherwise any previously selected pattern is kept.</li>
   *   <li>{@link String}: the string itself is used as the pattern.</li>
   *   <li>anything else, including {@code null}: the default pattern is used.</li>
   * </ul>
   *
   * <p>Whenever the outcome is a blank pattern, the default pattern is used instead, so a
   * {@link Map} without the key no longer leaves the extractor without a pattern.
   *
   * @param configurationObject the configuration; may be {@code null}
   * @throws ClassCastException if the {@link Map} value under the pattern key is not a {@link String}
   */
  @Override
  public void prepare(Object configurationObject) {
    if (configurationObject instanceof Map) {
      Map<?, ?> config = (Map<?, ?>) configurationObject;
      if (config.containsKey(patternConfigKey)) {
        pattern = (String) config.get(patternConfigKey);
      }
    } else if (configurationObject instanceof String) {
      pattern = (String) configurationObject;
    } else {
      pattern = defaultPattern;
    }
    if (StringUtils.isBlank(pattern)) {
      pattern = defaultPattern;
    }
  }

  @Override
  public void cleanUp() {
    //NOP
  }

  /**
   * Configures the value to be persisted to the extensions object.
   * @param extracted the value extracted by the regex
   * @return an object representing the appropriate extension
   */
  protected abstract T prepareObject(String extracted);

  /**
   * Returns the mutable collection stored under the extension key of the activity's extensions.
   *
   * <p>A non-null existing value is copied into a new {@link HashSet}; otherwise an empty
   * {@link HashSet} is created. In both cases the new set replaces the entry under the extension
   * key, so changes to the returned collection are visible through the extensions map.
   *
   * @param activity the activity whose extensions are read and updated
   * @return the set now stored under the extension key
   * @throws ClassCastException if the existing value is not an {@link Iterable}
   */
  @SuppressWarnings("unchecked")
  protected Collection<T> ensureTargetObject(Activity activity) {
    // Presumably ensureExtensions creates the extensions map when it is missing - verify in ExtensionUtil.
    Map<String, Object> extensions = ExtensionUtil.getInstance().ensureExtensions(activity);
    Object existing = extensions.get(extensionKey);
    Set<T> entities;
    if (existing != null) {
      entities = Sets.newHashSet((Iterable<T>) existing);
    } else {
      entities = new HashSet<>();
    }
    extensions.put(extensionKey, entities);
    return entities;
  }
}