/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.transforms;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
/**
* {@code PTransform}s to use Regular Expressions to process elements in a {@link PCollection}.
*
* <p>{@link Regex#matches(String, int)} can be used to see if an entire line matches a Regex.
* {@link Regex#matchesKV(String, int, int)} can be used to see if an entire line matches a Regex
* and output certain groups as a {@link KV}.
*
* <p>{@link Regex#find(String, int)} can be used to see if a portion of a line matches a Regex.
* {@link Regex#findKV(String, int, int)} can be used to see if a portion of a line matches a
* Regex and output certain groups as a {@link KV}.
*
* <p>Lines that do not match the Regex will not be output.
*/
public class Regex {
/** Private constructor: this class only exposes static factory methods and nested transforms. */
private Regex() {
// do not instantiate
}
/**
 * Creates a {@link Regex.Matches} {@link PTransform} that keeps only the lines the regular
 * expression matches in their entirety and outputs the whole line (group 0).
 *
 * @param regex the regular expression to compile and run against each line
 * @return a transform that emits group 0 of every fully matching line
 */
public static Matches matches(String regex) {
  return new Matches(Pattern.compile(regex), 0);
}
/**
 * Creates a {@link Regex.Matches} {@link PTransform} that keeps only the lines the pattern matches
 * in their entirety and outputs the whole line (group 0).
 *
 * @param pattern the compiled regular expression to run against each line
 * @return a transform that emits group 0 of every fully matching line
 */
public static Matches matches(Pattern pattern) {
  return new Matches(pattern, 0);
}
/**
 * Creates a {@link Regex.Matches} {@link PTransform} that keeps only the lines the regular
 * expression matches in their entirety and outputs the requested group of each match.
 *
 * @param regex the regular expression to compile and run against each line
 * @param group the index of the Regex group to output
 * @return a transform that emits the selected group of every fully matching line
 */
public static Matches matches(String regex, int group) {
  final Pattern compiled = Pattern.compile(regex);
  return matches(compiled, group);
}
/**
 * Creates a {@link Regex.Matches} {@link PTransform} that keeps only the lines the pattern matches
 * in their entirety and outputs the requested group of each match.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param group the index of the Regex group to output
 * @return a transform that emits the selected group of every fully matching line
 */
public static Matches matches(Pattern pattern, int group) {
  return new Matches(pattern, group);
}
/**
 * Creates a {@link Regex.MatchesName} {@link PTransform} that keeps only the lines the regular
 * expression matches in their entirety and outputs the named group of each match.
 *
 * @param regex the regular expression to compile and run against each line
 * @param groupName the name of the Regex group to output
 * @return a transform that emits the named group of every fully matching line
 */
public static MatchesName matches(String regex, String groupName) {
  final Pattern compiled = Pattern.compile(regex);
  return matches(compiled, groupName);
}
/**
 * Creates a {@link Regex.MatchesName} {@link PTransform} that keeps only the lines the pattern
 * matches in their entirety and outputs the named group of each match.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param groupName the name of the Regex group to output
 * @return a transform that emits the named group of every fully matching line
 */
public static MatchesName matches(Pattern pattern, String groupName) {
  return new MatchesName(pattern, groupName);
}
/**
 * Creates a {@link Regex.AllMatches} {@link PTransform} that keeps only the lines the regular
 * expression matches in their entirety and outputs all groups of each match as a
 * {@code List<String>}.
 *
 * @param regex the regular expression to compile and run against each line
 * @return a transform that emits the list of groups of every fully matching line
 */
public static AllMatches allMatches(String regex) {
  final Pattern compiled = Pattern.compile(regex);
  return allMatches(compiled);
}
/**
 * Creates a {@link Regex.AllMatches} {@link PTransform} that keeps only the lines the pattern
 * matches in their entirety and outputs all groups of each match as a {@code List<String>}.
 *
 * @param pattern the compiled regular expression to run against each line
 * @return a transform that emits the list of groups of every fully matching line
 */
public static AllMatches allMatches(Pattern pattern) {
  return new AllMatches(pattern);
}
/**
 * Creates a {@link Regex.MatchesKV} {@link PTransform} that checks if the entire line matches the
 * regular expression and outputs two of its groups as the key and value of a {@link KV}.
 *
 * @param regex the regular expression to compile and run against each line
 * @param keyGroup the index of the Regex group to use as the key
 * @param valueGroup the index of the Regex group to use as the value
 * @return a transform that emits one key/value pair per matching line
 */
public static MatchesKV matchesKV(String regex, int keyGroup, int valueGroup) {
  final Pattern compiled = Pattern.compile(regex);
  return matchesKV(compiled, keyGroup, valueGroup);
}
/**
 * Creates a {@link Regex.MatchesKV} {@link PTransform} that checks if the entire line matches the
 * pattern and outputs two of its groups as the key and value of a {@link KV}.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param keyGroup the index of the Regex group to use as the key
 * @param valueGroup the index of the Regex group to use as the value
 * @return a transform that emits one key/value pair per matching line
 */
public static MatchesKV matchesKV(Pattern pattern, int keyGroup, int valueGroup) {
  return new MatchesKV(pattern, keyGroup, valueGroup);
}
/**
 * Creates a {@link Regex.MatchesNameKV} {@link PTransform} that checks if the entire line matches
 * the regular expression and outputs two of its named groups as the key and value of a
 * {@link KV}.
 *
 * @param regex the regular expression to compile and run against each line
 * @param keyGroupName the name of the Regex group to use as the key
 * @param valueGroupName the name of the Regex group to use as the value
 * @return a transform that emits one key/value pair per matching line
 */
public static MatchesNameKV matchesKV(String regex, String keyGroupName, String valueGroupName) {
  final Pattern compiled = Pattern.compile(regex);
  return matchesKV(compiled, keyGroupName, valueGroupName);
}
/**
 * Creates a {@link Regex.MatchesNameKV} {@link PTransform} that checks if the entire line matches
 * the pattern and outputs two of its named groups as the key and value of a {@link KV}.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param keyGroupName the name of the Regex group to use as the key
 * @param valueGroupName the name of the Regex group to use as the value
 * @return a transform that emits one key/value pair per matching line
 */
public static MatchesNameKV matchesKV(
    Pattern pattern, String keyGroupName, String valueGroupName) {
  return new MatchesNameKV(pattern, keyGroupName, valueGroupName);
}
/**
 * Creates a {@link Regex.Find} {@link PTransform} that keeps only the lines in which the regular
 * expression matches some portion and outputs the matched text (group 0).
 *
 * @param regex the regular expression to compile and run against each line
 * @return a transform that emits group 0 of the first match found in each line
 */
public static Find find(String regex) {
  return new Find(Pattern.compile(regex), 0);
}
/**
 * Creates a {@link Regex.Find} {@link PTransform} that keeps only the lines in which the pattern
 * matches some portion and outputs the matched text (group 0).
 *
 * @param pattern the compiled regular expression to run against each line
 * @return a transform that emits group 0 of the first match found in each line
 */
public static Find find(Pattern pattern) {
  return new Find(pattern, 0);
}
/**
 * Creates a {@link Regex.Find} {@link PTransform} that keeps only the lines in which the regular
 * expression matches some portion and outputs the requested group of the match.
 *
 * @param regex the regular expression to compile and run against each line
 * @param group the index of the Regex group to output
 * @return a transform that emits the selected group of the first match found in each line
 */
public static Find find(String regex, int group) {
  final Pattern compiled = Pattern.compile(regex);
  return find(compiled, group);
}
/**
 * Creates a {@link Regex.Find} {@link PTransform} that keeps only the lines in which the pattern
 * matches some portion and outputs the requested group of the match.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param group the index of the Regex group to output
 * @return a transform that emits the selected group of the first match found in each line
 */
public static Find find(Pattern pattern, int group) {
  return new Find(pattern, group);
}
/**
 * Creates a {@link Regex.FindName} {@link PTransform} that keeps only the lines in which the
 * regular expression matches some portion and outputs the named group of the match.
 *
 * @param regex the regular expression to compile and run against each line
 * @param groupName the name of the Regex group to output
 * @return a transform that emits the named group of the first match found in each line
 */
public static FindName find(String regex, String groupName) {
  final Pattern compiled = Pattern.compile(regex);
  return find(compiled, groupName);
}
/**
 * Creates a {@link Regex.FindName} {@link PTransform} that keeps only the lines in which the
 * pattern matches some portion and outputs the named group of the match.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param groupName the name of the Regex group to output
 * @return a transform that emits the named group of the first match found in each line
 */
public static FindName find(Pattern pattern, String groupName) {
  return new FindName(pattern, groupName);
}
/**
 * Creates a {@link Regex.FindAll} {@link PTransform} that keeps only the lines in which the
 * regular expression matches some portion and outputs all groups of the match as a
 * {@code List<String>}.
 *
 * @param regex the regular expression to compile and run against each line
 * @return a transform that emits the list of groups of the first match found in each line
 */
public static FindAll findAll(String regex) {
  final Pattern compiled = Pattern.compile(regex);
  return findAll(compiled);
}
/**
 * Creates a {@link Regex.FindAll} {@link PTransform} that keeps only the lines in which the
 * pattern matches some portion and outputs all groups of the match as a {@code List<String>}.
 *
 * @param pattern the compiled regular expression to run against each line
 * @return a transform that emits the list of groups of the first match found in each line
 */
public static FindAll findAll(Pattern pattern) {
  return new FindAll(pattern);
}
/**
 * Creates a {@link Regex.FindKV} {@link PTransform} that keeps only the lines in which the regular
 * expression matches some portion and outputs two groups of the match as the key and value of a
 * {@link KV}.
 *
 * @param regex the regular expression to compile and run against each line
 * @param keyGroup the index of the Regex group to use as the key
 * @param valueGroup the index of the Regex group to use as the value
 * @return a transform that emits one key/value pair per line containing a match
 */
public static FindKV findKV(String regex, int keyGroup, int valueGroup) {
  final Pattern compiled = Pattern.compile(regex);
  return findKV(compiled, keyGroup, valueGroup);
}
/**
 * Creates a {@link Regex.FindKV} {@link PTransform} that keeps only the lines in which the pattern
 * matches some portion and outputs two groups of the match as the key and value of a {@link KV}.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param keyGroup the index of the Regex group to use as the key
 * @param valueGroup the index of the Regex group to use as the value
 * @return a transform that emits one key/value pair per line containing a match
 */
public static FindKV findKV(Pattern pattern, int keyGroup, int valueGroup) {
  return new FindKV(pattern, keyGroup, valueGroup);
}
/**
 * Creates a {@link Regex.FindNameKV} {@link PTransform} that keeps only the lines in which the
 * regular expression matches some portion and outputs two named groups of the match as the key
 * and value of a {@link KV}.
 *
 * @param regex the regular expression to compile and run against each line
 * @param keyGroupName the name of the Regex group to use as the key
 * @param valueGroupName the name of the Regex group to use as the value
 * @return a transform that emits one key/value pair per line containing a match
 */
public static FindNameKV findKV(String regex, String keyGroupName, String valueGroupName) {
  final Pattern compiled = Pattern.compile(regex);
  return findKV(compiled, keyGroupName, valueGroupName);
}
/**
 * Creates a {@link Regex.FindNameKV} {@link PTransform} that keeps only the lines in which the
 * pattern matches some portion and outputs two named groups of the match as the key and value of
 * a {@link KV}.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param keyGroupName the name of the Regex group to use as the key
 * @param valueGroupName the name of the Regex group to use as the value
 * @return a transform that emits one key/value pair per line containing a match
 */
public static FindNameKV findKV(Pattern pattern, String keyGroupName, String valueGroupName) {
  return new FindNameKV(pattern, keyGroupName, valueGroupName);
}
/**
 * Creates a {@link Regex.ReplaceAll} {@link PTransform} that replaces every portion of a line
 * matching the regular expression with the replacement String. Each input line produces exactly
 * one output line; lines without a match are output unchanged.
 *
 * @param regex the regular expression to compile and run against each line
 * @param replacement the string to be substituted for each match
 * @return a transform that emits every line with all matches replaced
 */
public static ReplaceAll replaceAll(String regex, String replacement) {
  final Pattern compiled = Pattern.compile(regex);
  return replaceAll(compiled, replacement);
}
/**
 * Creates a {@link Regex.ReplaceAll} {@link PTransform} that replaces every portion of a line
 * matching the pattern with the replacement String. Each input line produces exactly one output
 * line; lines without a match are output unchanged.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param replacement the string to be substituted for each match
 * @return a transform that emits every line with all matches replaced
 */
public static ReplaceAll replaceAll(Pattern pattern, String replacement) {
  return new ReplaceAll(pattern, replacement);
}
/**
 * Creates a {@link Regex.ReplaceFirst} {@link PTransform} that replaces the first portion of a
 * line matching the regular expression with the replacement String. Each input line produces
 * exactly one output line; lines without a match are output unchanged.
 *
 * @param regex the regular expression to compile and run against each line
 * @param replacement the string to be substituted for the first match
 * @return a transform that emits every line with its first match replaced
 */
public static ReplaceFirst replaceFirst(String regex, String replacement) {
  final Pattern compiled = Pattern.compile(regex);
  return replaceFirst(compiled, replacement);
}
/**
 * Creates a {@link Regex.ReplaceFirst} {@link PTransform} that replaces the first portion of a
 * line matching the pattern with the replacement String. Each input line produces exactly one
 * output line; lines without a match are output unchanged.
 *
 * @param pattern the compiled regular expression to run against each line
 * @param replacement the string to be substituted for the first match
 * @return a transform that emits every line with its first match replaced
 */
public static ReplaceFirst replaceFirst(Pattern pattern, String replacement) {
  return new ReplaceFirst(pattern, replacement);
}
/**
 * Creates a {@link Regex.Split} {@link PTransform} that splits each line on the regular expression
 * and outputs every resulting item as its own element. Empty items are not output.
 *
 * @param regex the regular expression to compile and split each line on
 * @return a transform that emits the non-empty items of every split line
 */
public static Split split(String regex) {
  return new Split(Pattern.compile(regex), false);
}
/**
 * Creates a {@link Regex.Split} {@link PTransform} that splits each line on the pattern and
 * outputs every resulting item as its own element. Empty items are not output.
 *
 * @param pattern the compiled regular expression to split each line on
 * @return a transform that emits the non-empty items of every split line
 */
public static Split split(Pattern pattern) {
  return new Split(pattern, false);
}
/**
 * Creates a {@link Regex.Split} {@link PTransform} that splits each line on the regular expression
 * and outputs every resulting item as its own element.
 *
 * @param regex the regular expression to compile and split each line on
 * @param outputEmpty {@code true} to also output empty items, {@code false} to drop them
 * @return a transform that emits the items of every split line
 */
public static Split split(String regex, boolean outputEmpty) {
  final Pattern compiled = Pattern.compile(regex);
  return split(compiled, outputEmpty);
}
/**
 * Creates a {@link Regex.Split} {@link PTransform} that splits each line on the pattern and
 * outputs every resulting item as its own element.
 *
 * @param pattern the compiled regular expression to split each line on
 * @param outputEmpty {@code true} to also output empty items, {@code false} to drop them
 * @return a transform that emits the items of every split line
 */
public static Split split(Pattern pattern, boolean outputEmpty) {
  return new Split(pattern, outputEmpty);
}
/**
 * {@code Regex.Matches} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<String>} holding one Regex group of every input element that the Regex matches in
 * full.
 *
 * <p>Each element is tested with {@link Matcher#matches()}, so the Regex has to cover the entire
 * line. Lines that do not match produce no output; for lines that do, the text of the configured
 * group (selected by index) is output.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<String> values =
 *     words.apply(Regex.matches("myregex (mygroup)", 1));
 * }</pre>
 */
public static class Matches extends PTransform<PCollection<String>, PCollection<String>> {
  final Pattern pattern;
  int group;

  /**
   * @param pattern the compiled Regex every element must match in full
   * @param group the index of the group to output
   */
  public Matches(Pattern pattern, int group) {
    this.pattern = pattern;
    this.group = group;
  }

  /** Applies a {@link ParDo} that emits the selected group of each fully matching element. */
  public PCollection<String> expand(PCollection<String> in) {
    final DoFn<String, String> emitGroupOnFullMatch =
        new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            if (!matcher.matches()) {
              // Not a full-line match: the element is dropped.
              return;
            }
            context.output(matcher.group(group));
          }
        };
    return in.apply(ParDo.of(emitGroupOnFullMatch));
  }
}
/**
 * {@code Regex.MatchesName} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<String>} holding one named Regex group of every input element that the Regex
 * matches in full.
 *
 * <p>Each element is tested with {@link Matcher#matches()}, so the Regex has to cover the entire
 * line. Lines that do not match produce no output; for lines that do, the text of the configured
 * group (selected by name) is output.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<String> values =
 *     words.apply(Regex.matches("myregex (?<namedgroup>mygroup)", "namedgroup"));
 * }</pre>
 */
public static class MatchesName extends PTransform<PCollection<String>, PCollection<String>> {
  final Pattern pattern;
  String groupName;

  /**
   * @param pattern the compiled Regex every element must match in full
   * @param groupName the name of the group to output
   */
  public MatchesName(Pattern pattern, String groupName) {
    this.pattern = pattern;
    this.groupName = groupName;
  }

  /** Applies a {@link ParDo} that emits the named group of each fully matching element. */
  public PCollection<String> expand(PCollection<String> in) {
    final DoFn<String, String> emitNamedGroupOnFullMatch =
        new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            if (!matcher.matches()) {
              // Not a full-line match: the element is dropped.
              return;
            }
            context.output(matcher.group(groupName));
          }
        };
    return in.apply(ParDo.of(emitNamedGroupOnFullMatch));
  }
}
/**
 * {@code Regex.AllMatches} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<List<String>>} holding all the Regex groups of every input element that the Regex
 * matches in full.
 *
 * <p>This transform runs a Regex on the entire input line. If the entire line does not match the
 * Regex, the line will not be output. If it does match the entire line, a list is output that
 * contains group 0 (the whole match) followed by every capturing group, in index order.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<List<String>> values =
 *     words.apply(Regex.allMatches("myregex (mygroup)"));
 * }</pre>
 */
public static class AllMatches
    extends PTransform<PCollection<String>, PCollection<List<String>>> {
  final Pattern pattern;

  /** @param pattern the compiled Regex every element must match in full */
  public AllMatches(Pattern pattern) {
    this.pattern = pattern;
  }

  /** Applies a {@link ParDo} that emits the list of groups of each fully matching element. */
  public PCollection<List<String>> expand(PCollection<String> in) {
    return in.apply(
        ParDo.of(
            new DoFn<String, List<String>>() {
              @ProcessElement
              public void processElement(ProcessContext c) throws Exception {
                Matcher m = pattern.matcher(c.element());
                if (m.matches()) {
                  // groupCount() does not count group 0, so the list holds groupCount() + 1
                  // entries: the whole match plus each capturing group.
                  final int totalGroups = m.groupCount() + 1;
                  List<String> groups = new ArrayList<>(totalGroups);
                  for (int i = 0; i < totalGroups; i++) {
                    groups.add(m.group(i));
                  }
                  c.output(groups);
                }
              }
            }));
  }
}
/**
 * {@code Regex.MatchesKV} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<KV<String, String>>} holding a key and value taken from two Regex groups of every
 * input element that the Regex matches in full.
 *
 * <p>This transform runs a Regex on the entire input line. If the entire line does not match the
 * Regex, the line will not be output. If it does match the entire line, the groups in the Regex
 * will be used. The key will be the key's group and the value will be the value's group.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<KV<String, String>> keysAndValues =
 *     words.apply(Regex.matchesKV("myregex (mykeygroup) (myvaluegroup)", 1, 2));
 * }</pre>
 */
public static class MatchesKV
    extends PTransform<PCollection<String>, PCollection<KV<String, String>>> {
  final Pattern pattern;
  int keyGroup, valueGroup;

  /**
   * @param pattern the compiled Regex every element must match in full
   * @param keyGroup the index of the group to use as the key
   * @param valueGroup the index of the group to use as the value
   */
  public MatchesKV(Pattern pattern, int keyGroup, int valueGroup) {
    this.pattern = pattern;
    this.keyGroup = keyGroup;
    this.valueGroup = valueGroup;
  }

  /** Applies a {@link ParDo} that emits a key/value pair for each fully matching element. */
  public PCollection<KV<String, String>> expand(PCollection<String> in) {
    return in.apply(
        ParDo.of(
            new DoFn<String, KV<String, String>>() {
              @ProcessElement
              public void processElement(ProcessContext c) throws Exception {
                Matcher m = pattern.matcher(c.element());
                // matches(), not find(): the documented contract is a whole-line match, which
                // is what distinguishes this transform from FindKV.
                if (m.matches()) {
                  c.output(KV.of(m.group(keyGroup), m.group(valueGroup)));
                }
              }
            }));
  }
}
/**
 * {@code Regex.MatchesNameKV} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<KV<String, String>>} holding a key and value taken from two named Regex groups of
 * every input element that the Regex matches in full.
 *
 * <p>This transform runs a Regex on the entire input line. If the entire line does not match the
 * Regex, the line will not be output. If it does match the entire line, the groups in the Regex
 * will be used. The key will be the key's group and the value will be the value's group.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<KV<String, String>> keysAndValues =
 *     words.apply(Regex.matchesKV("myregex (?<keyname>mykeygroup) (?<valuename>myvaluegroup)",
 *         "keyname", "valuename"));
 * }</pre>
 */
public static class MatchesNameKV
    extends PTransform<PCollection<String>, PCollection<KV<String, String>>> {
  final Pattern pattern;
  String keyGroupName, valueGroupName;

  /**
   * @param pattern the compiled Regex every element must match in full
   * @param keyGroupName the name of the group to use as the key
   * @param valueGroupName the name of the group to use as the value
   */
  public MatchesNameKV(Pattern pattern, String keyGroupName, String valueGroupName) {
    this.pattern = pattern;
    this.keyGroupName = keyGroupName;
    this.valueGroupName = valueGroupName;
  }

  /** Applies a {@link ParDo} that emits a key/value pair for each fully matching element. */
  public PCollection<KV<String, String>> expand(PCollection<String> in) {
    return in.apply(
        ParDo.of(
            new DoFn<String, KV<String, String>>() {
              @ProcessElement
              public void processElement(ProcessContext c) throws Exception {
                Matcher m = pattern.matcher(c.element());
                // matches(), not find(): the documented contract is a whole-line match, which
                // is what distinguishes this transform from FindNameKV.
                if (m.matches()) {
                  c.output(KV.of(m.group(keyGroupName), m.group(valueGroupName)));
                }
              }
            }));
  }
}
/**
 * {@code Regex.Find} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<String>} holding one Regex group of the first match found in every input element
 * that contains a match.
 *
 * <p>Each element is searched with a single call to {@link Matcher#find()}, so the Regex only has
 * to match a portion of the line. Lines with no match produce no output; for lines with a match,
 * the text of the configured group (selected by index) is output.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<String> values =
 *     words.apply(Regex.find("myregex (mygroup)", 1));
 * }</pre>
 */
public static class Find extends PTransform<PCollection<String>, PCollection<String>> {
  final Pattern pattern;
  int group;

  /**
   * @param pattern the compiled Regex to search for in every element
   * @param group the index of the group to output
   */
  public Find(Pattern pattern, int group) {
    this.pattern = pattern;
    this.group = group;
  }

  /** Applies a {@link ParDo} that emits the selected group of the first match per element. */
  public PCollection<String> expand(PCollection<String> in) {
    final DoFn<String, String> emitGroupOfFirstMatch =
        new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            if (!matcher.find()) {
              // No portion of the element matches: the element is dropped.
              return;
            }
            context.output(matcher.group(group));
          }
        };
    return in.apply(ParDo.of(emitGroupOfFirstMatch));
  }
}
/**
 * {@code Regex.FindName} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<String>} holding one named Regex group of the first match found in every input
 * element that contains a match.
 *
 * <p>Each element is searched with a single call to {@link Matcher#find()}, so the Regex only has
 * to match a portion of the line. Lines with no match produce no output; for lines with a match,
 * the text of the configured group (selected by name) is output.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<String> values =
 *     words.apply(Regex.find("myregex (?<namedgroup>mygroup)", "namedgroup"));
 * }</pre>
 */
public static class FindName extends PTransform<PCollection<String>, PCollection<String>> {
  final Pattern pattern;
  String groupName;

  /**
   * @param pattern the compiled Regex to search for in every element
   * @param groupName the name of the group to output
   */
  public FindName(Pattern pattern, String groupName) {
    this.pattern = pattern;
    this.groupName = groupName;
  }

  /** Applies a {@link ParDo} that emits the named group of the first match per element. */
  public PCollection<String> expand(PCollection<String> in) {
    final DoFn<String, String> emitNamedGroupOfFirstMatch =
        new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            if (!matcher.find()) {
              // No portion of the element matches: the element is dropped.
              return;
            }
            context.output(matcher.group(groupName));
          }
        };
    return in.apply(ParDo.of(emitNamedGroupOfFirstMatch));
  }
}
/**
 * {@code Regex.FindAll} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<List<String>>} holding all the Regex groups of the first match found in every input
 * element that contains a match.
 *
 * <p>This transform runs a Regex on the entire input line. If no portion of the line matches the
 * Regex, the line will not be output. If a portion of the line does match, a list is output that
 * contains group 0 (the matched text) followed by every capturing group, in index order. Only the
 * first match in a line is used.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<List<String>> values =
 *     words.apply(Regex.findAll("myregex (mygroup)"));
 * }</pre>
 */
public static class FindAll extends PTransform<PCollection<String>, PCollection<List<String>>> {
  final Pattern pattern;

  /** @param pattern the compiled Regex to search for in every element */
  public FindAll(Pattern pattern) {
    this.pattern = pattern;
  }

  /** Applies a {@link ParDo} that emits the list of groups of the first match per element. */
  public PCollection<List<String>> expand(PCollection<String> in) {
    return in.apply(
        ParDo.of(
            new DoFn<String, List<String>>() {
              @ProcessElement
              public void processElement(ProcessContext c) throws Exception {
                Matcher m = pattern.matcher(c.element());
                if (m.find()) {
                  // groupCount() does not count group 0, so the list holds groupCount() + 1
                  // entries: the matched text plus each capturing group.
                  final int totalGroups = m.groupCount() + 1;
                  List<String> groups = new ArrayList<>(totalGroups);
                  for (int i = 0; i < totalGroups; i++) {
                    groups.add(m.group(i));
                  }
                  c.output(groups);
                }
              }
            }));
  }
}
/**
 * {@code Regex.FindKV} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<KV<String, String>>} holding a key and value taken from two Regex groups of the
 * first match found in every input element that contains a match.
 *
 * <p>Each element is searched with a single call to {@link Matcher#find()}, so the Regex only has
 * to match a portion of the line. Lines with no match produce no output; for lines with a match,
 * the key is the text of the key group and the value is the text of the value group (both
 * selected by index).
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<KV<String, String>> keysAndValues =
 *     words.apply(Regex.findKV("myregex (mykeygroup) (myvaluegroup)", 1, 2));
 * }</pre>
 */
public static class FindKV
    extends PTransform<PCollection<String>, PCollection<KV<String, String>>> {
  final Pattern pattern;
  int keyGroup;
  int valueGroup;

  /**
   * @param pattern the compiled Regex to search for in every element
   * @param keyGroup the index of the group to use as the key
   * @param valueGroup the index of the group to use as the value
   */
  public FindKV(Pattern pattern, int keyGroup, int valueGroup) {
    this.pattern = pattern;
    this.keyGroup = keyGroup;
    this.valueGroup = valueGroup;
  }

  /** Applies a {@link ParDo} that emits a key/value pair from the first match per element. */
  public PCollection<KV<String, String>> expand(PCollection<String> in) {
    final DoFn<String, KV<String, String>> emitPairOfFirstMatch =
        new DoFn<String, KV<String, String>>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            if (!matcher.find()) {
              // No portion of the element matches: the element is dropped.
              return;
            }
            final String key = matcher.group(keyGroup);
            final String value = matcher.group(valueGroup);
            context.output(KV.of(key, value));
          }
        };
    return in.apply(ParDo.of(emitPairOfFirstMatch));
  }
}
/**
 * {@code Regex.FindNameKV} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<KV<String, String>>} holding a key and value taken from two named Regex groups of
 * the first match found in every input element that contains a match.
 *
 * <p>Each element is searched with a single call to {@link Matcher#find()}, so the Regex only has
 * to match a portion of the line. Lines with no match produce no output; for lines with a match,
 * the key is the text of the key group and the value is the text of the value group (both
 * selected by name).
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<KV<String, String>> keysAndValues =
 *     words.apply(Regex.findKV("myregex (?<keyname>mykeygroup) (?<valuename>myvaluegroup)",
 *         "keyname", "valuename"));
 * }</pre>
 */
public static class FindNameKV
    extends PTransform<PCollection<String>, PCollection<KV<String, String>>> {
  final Pattern pattern;
  String keyGroupName;
  String valueGroupName;

  /**
   * @param pattern the compiled Regex to search for in every element
   * @param keyGroupName the name of the group to use as the key
   * @param valueGroupName the name of the group to use as the value
   */
  public FindNameKV(Pattern pattern, String keyGroupName, String valueGroupName) {
    this.pattern = pattern;
    this.keyGroupName = keyGroupName;
    this.valueGroupName = valueGroupName;
  }

  /** Applies a {@link ParDo} that emits a key/value pair from the first match per element. */
  public PCollection<KV<String, String>> expand(PCollection<String> in) {
    final DoFn<String, KV<String, String>> emitNamedPairOfFirstMatch =
        new DoFn<String, KV<String, String>>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            if (!matcher.find()) {
              // No portion of the element matches: the element is dropped.
              return;
            }
            final String key = matcher.group(keyGroupName);
            final String value = matcher.group(valueGroupName);
            context.output(KV.of(key, value));
          }
        };
    return in.apply(ParDo.of(emitNamedPairOfFirstMatch));
  }
}
/**
 * {@code Regex.ReplaceAll} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<String>} in which every portion of each element that matches the Regex has been
 * replaced with the replacement string.
 *
 * <p>Every input element produces exactly one output element, computed with {@link
 * Matcher#replaceAll(String)}. An element in which nothing matches is output without changes.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<String> values =
 *     words.apply(Regex.replaceAll("myregex", "myreplacement"));
 * }</pre>
 */
public static class ReplaceAll extends PTransform<PCollection<String>, PCollection<String>> {
  final Pattern pattern;
  String replacement;

  /**
   * @param pattern the compiled Regex whose matches are replaced
   * @param replacement the string substituted for each match
   */
  public ReplaceAll(Pattern pattern, String replacement) {
    this.pattern = pattern;
    this.replacement = replacement;
  }

  /** Applies a {@link ParDo} that emits each element with all matches replaced. */
  public PCollection<String> expand(PCollection<String> in) {
    final DoFn<String, String> replaceEveryMatch =
        new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            final String rewritten = matcher.replaceAll(replacement);
            context.output(rewritten);
          }
        };
    return in.apply(ParDo.of(replaceEveryMatch));
  }
}
/**
 * {@code Regex.ReplaceFirst} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<String>} in which the first portion of each element that matches the Regex has been
 * replaced with the replacement string.
 *
 * <p>Every input element produces exactly one output element, computed with {@link
 * Matcher#replaceFirst(String)}. An element in which nothing matches is output without changes.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<String> values =
 *     words.apply(Regex.replaceFirst("myregex", "myreplacement"));
 * }</pre>
 */
public static class ReplaceFirst extends PTransform<PCollection<String>, PCollection<String>> {
  final Pattern pattern;
  String replacement;

  /**
   * @param pattern the compiled Regex whose first match is replaced
   * @param replacement the string substituted for the first match
   */
  public ReplaceFirst(Pattern pattern, String replacement) {
    this.pattern = pattern;
    this.replacement = replacement;
  }

  /** Applies a {@link ParDo} that emits each element with its first match replaced. */
  public PCollection<String> expand(PCollection<String> in) {
    final DoFn<String, String> replaceLeadingMatch =
        new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            final Matcher matcher = pattern.matcher(context.element());
            final String rewritten = matcher.replaceFirst(replacement);
            context.output(rewritten);
          }
        };
    return in.apply(ParDo.of(replaceLeadingMatch));
  }
}
/**
 * {@code Regex.Split} takes a {@code PCollection<String>} and returns a {@code
 * PCollection<String>} with each input string split into individual items. Each item is then
 * output as a separate string.
 *
 * <p>This transform splits the entire input line around matches of the Regex using {@link
 * Pattern#split(CharSequence)}. The split gives back an array of items, and each item is output
 * as a separate element of the {@code PCollection<String>}.
 *
 * <p>Depending on the Regex, an item can be the empty string {@code ""}. The {@code outputEmpty}
 * flag controls whether such empty items are output or skipped.
 *
 * <p>Example of use:
 *
 * <pre>{@code
 * PCollection<String> words = ...;
 * PCollection<String> values =
 *     words.apply(Regex.split("\\W+"));
 * }</pre>
 */
public static class Split extends PTransform<PCollection<String>, PCollection<String>> {
  final Pattern pattern;
  boolean outputEmpty;

  /**
   * @param pattern the compiled Regex to split every element on
   * @param outputEmpty {@code true} to output empty items as well, {@code false} to skip them
   */
  public Split(Pattern pattern, boolean outputEmpty) {
    this.pattern = pattern;
    this.outputEmpty = outputEmpty;
  }

  /** Applies a {@link ParDo} that emits the items obtained by splitting each element. */
  public PCollection<String> expand(PCollection<String> in) {
    final DoFn<String, String> splitAndEmit =
        new DoFn<String, String>() {
          @ProcessElement
          public void processElement(ProcessContext context) throws Exception {
            for (String piece : pattern.split(context.element())) {
              if (piece.isEmpty() && !outputEmpty) {
                // Empty items are skipped unless the caller asked for them.
                continue;
              }
              context.output(piece);
            }
          }
        };
    return in.apply(ParDo.of(splitAndEmit));
  }
}
}