/*
* Copyright 2016 Christoph Böhme
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.culturegraph.mf.strings;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.culturegraph.mf.framework.FluxCommand;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.culturegraph.mf.framework.helpers.DefaultObjectPipe;
/**
 * Decodes a string based on a regular expression using named capture groups.
 * <p>
 * Named capture groups are denoted by {@literal (?<name>X)}. The name of a
 * matched group ("name") is used as the literal name, the captured content
 * ("X") as the literal value. Group names are composed of the following
 * characters:
 * <ul>
 * <li>The uppercase letters 'A' through 'Z',
 * <li>The lowercase letters 'a' through 'z',
 * <li>The digits '0' through '9'.
 * </ul>
 * The first character of a group name must be a letter.
 * <p>
 * The pattern is matched repeatedly to the input string. On each match the
 * captured content of all named capture groups in the pattern is emitted as
 * literals. Non-matching parts of the input are ignored. If the pattern does
 * not match the input at all, no record is emitted for that input.
 * <p>
 * The regular expression may contain unnamed capture groups. These are
 * ignored.
 * <p>
 * If the pattern contains a capture group named
 * "{@value #ID_CAPTURE_GROUP}", the content captured by this group in the
 * first match of the pattern will be used as record identifier. If there is
 * no such capture group or if it did not take part in that match, the empty
 * string is used as record identifier. The group is still emitted as a
 * literal like every other named capture group.
 * <p>
 * Example: The regex
 * <pre>{@literal
 * a=(?<foo>[0-9]+),b=(?<bar>[x-z]+)
 * }</pre>
 * matched against the input
 * <pre>{@literal
 * a=42,b=xyzzy,c=ignored,a=23,b=xyz,d=ignored
 * }</pre>
 * will produce the sequence of events:
 * <pre>{@literal
 * start-record ""
 * literal "foo": 42
 * literal "bar": xyzzy
 * literal "foo": 23
 * literal "bar": xyz
 * end-record
 * }</pre>
 *
 * @author Thomas Seidel
 * @author Christoph Böhme
 *
 */
@Description("Decodes a string based on a regular expression using named capture groups")
@In(String.class)
@Out(StreamReceiver.class)
@FluxCommand("regex-decode")
public final class RegexDecoder extends DefaultObjectPipe<String, StreamReceiver> {

    /** Name of the capture group whose content is used as record identifier. */
    public static final String ID_CAPTURE_GROUP = "id";

    /** Finds the opening of a named capture group and captures its name. */
    private static final Pattern NAMED_CAPTURE_GROUP_PATTERN =
            Pattern.compile("\\(\\?<([A-Za-z0-9]+)>");

    /** Matcher for the user supplied regex; reset for every input string. */
    private final Matcher matcher;

    /** Names of the named capture groups in order of appearance in the regex. */
    private final List<String> captureGroupNames;

    private final boolean hasRecordIdCaptureGroup;

    /** Name of the raw input literal or null if no such literal is emitted. */
    private String rawInputLiteral;

    /**
     * Creates a decoder which matches its input against {@code regex}.
     *
     * @param regex the regular expression used for decoding. It is compiled
     *              with {@link Pattern#compile(String)}, so an invalid
     *              expression is rejected here.
     */
    public RegexDecoder(final String regex) {
        matcher = Pattern.compile(regex).matcher("");
        captureGroupNames = collectCaptureGroupNames(regex);
        hasRecordIdCaptureGroup = captureGroupNames.contains(ID_CAPTURE_GROUP);
    }

    /**
     * Extracts the names of all named capture groups by scanning the text of
     * the regular expression.
     *
     * @param regex the regular expression to scan
     * @return the group names in order of appearance
     */
    private static List<String> collectCaptureGroupNames(final String regex) {
        final List<String> groupNames = new ArrayList<>();
        final Matcher groupNameMatcher = NAMED_CAPTURE_GROUP_PATTERN.matcher(regex);
        while (groupNameMatcher.find()) {
            groupNames.add(groupNameMatcher.group(1));
        }
        return groupNames;
    }

    /**
     * Sets the name of a literal containing the unmodified input received by
     * {@link RegexDecoder}. If not set, no raw input literals are emitted.
     * <p>
     * The raw input <i>literal</i> event is always the first event emitted
     * after the <i>start-record</i> event.
     * <p>
     * This parameter can be changed at any time during processing. It becomes
     * effective with the next record being processed.
     *
     * @param rawInputLiteral name of the literal which contains the unmodified
     *                        input string. If null, raw input literals will be
     *                        disabled.
     */
    public void setRawInputLiteral(final String rawInputLiteral) {
        this.rawInputLiteral = rawInputLiteral;
    }

    /**
     * Gets the name of the literal containing the unmodified input.
     *
     * @return the name of the raw input literal or null if raw input literals
     *         are disabled
     */
    public String getRawInputLiteral() {
        return this.rawInputLiteral;
    }

    /**
     * Matches the pattern against {@code input} and emits one record holding
     * the literals of all matches. Emits nothing if the pattern does not match.
     *
     * @param input the string to decode
     */
    @Override
    public void process(final String input) {
        matcher.reset(input);
        if (!matcher.find()) {
            return;
        }
        // The matcher is now positioned on the first match. The record id must
        // be read before emitCaptureGroupsAsLiterals() advances the matcher.
        getReceiver().startRecord(getRecordId());
        emitRawInputLiteral(input);
        emitCaptureGroupsAsLiterals();
        getReceiver().endRecord();
    }

    /**
     * Determines the record identifier from the current match.
     *
     * @return the content of the {@value #ID_CAPTURE_GROUP} capture group or
     *         the empty string if the pattern has no such group or the group
     *         did not take part in the current match
     */
    private String getRecordId() {
        if (!hasRecordIdCaptureGroup) {
            return "";
        }
        // Matcher.group(String) returns null if the group exists in the
        // pattern but did not participate in the match (for instance an
        // optional group). The documented contract is to fall back to the
        // empty string in this case instead of passing null downstream.
        final String recordId = matcher.group(ID_CAPTURE_GROUP);
        if (recordId == null) {
            return "";
        }
        return recordId;
    }

    /**
     * Emits the named capture groups of the current match and of all
     * following matches as literals. Expects the matcher to be positioned on
     * a match when called.
     */
    private void emitCaptureGroupsAsLiterals() {
        do {
            for (final String groupName : captureGroupNames) {
                getReceiver().literal(groupName, matcher.group(groupName));
            }
        } while (matcher.find());
    }

    /**
     * Emits the unmodified input as a literal if a raw input literal name has
     * been configured.
     *
     * @param input the unmodified input string
     */
    private void emitRawInputLiteral(final String input) {
        if (rawInputLiteral != null) {
            getReceiver().literal(rawInputLiteral, input);
        }
    }

}