/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.xml;
import static com.google.common.base.Preconditions.checkNotNull;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import java.nio.charset.Charset;
import java.util.Locale;
import javax.annotation.Nullable;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.ValidationEventHandler;
import org.apache.beam.sdk.PipelineRunner;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CompressedSource;
import org.apache.beam.sdk.io.FileBasedSink;
import org.apache.beam.sdk.io.OffsetBasedSource;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PDone;
/** Transforms for reading and writing XML files using JAXB mappers. */
public class XmlIO {
// CHECKSTYLE.OFF: JavadocStyle
/**
* Reads XML files. This source reads one or more XML files and creates a {@link PCollection} of a
* given type. Please note the example given below.
*
* <p>The XML file must be of the following form, where {@code root} and {@code record} are XML
* element names that are defined by the user:
*
* <pre>{@code
* <root>
* <record> ... </record>
* <record> ... </record>
* <record> ... </record>
* ...
* <record> ... </record>
* </root>
* }</pre>
*
* <p>Basically, the XML document should contain a single root element with an inner list
* consisting entirely of record elements. The records may contain arbitrary XML content; however,
* that content <b>must not</b> contain the start {@code <record>} or end {@code </record>} tags.
* This restriction enables reading from large XML files in parallel from different offsets in the
* file.
*
* <p>Root and/or record elements may additionally contain an arbitrary number of XML attributes.
* Additionally users must provide a class of a JAXB annotated Java type that can be used to convert
* records into Java objects and vice versa using JAXB marshalling/unmarshalling mechanisms.
* Reading the source will generate a {@code PCollection} of the given JAXB annotated Java type.
* Optionally users may provide a minimum size of a bundle that should be created for the source.
*
* <p>The following example shows how to use this method in a Beam pipeline:
*
* <pre>{@code
* PCollection<Record> output = p.apply(XmlIO.<Record>read()
* .from(file.toPath().toString())
* .withRootElement("root")
* .withRecordElement("record")
* .withRecordClass(Record.class));
* }</pre>
*
* <p>By default, UTF-8 charset is used. If your file is using a different charset, you have to
* specify the following:
*
* <pre>{@code
* PCollection<Record> output = p.apply(XmlIO.<Record>read()
* .from(file.toPath().toString())
* .withRootElement("root")
* .withRecordElement("record")
* .withRecordClass(Record.class)
* .withCharset(StandardCharsets.ISO_8859_1));
* }</pre>
*
* <p>{@link java.nio.charset.StandardCharsets} provides static references to common charsets.
*
* <p>Currently, only XML files that use single-byte characters are supported. Using a file that
* contains multi-byte characters may result in data loss or duplication.
*
* <h3>Permissions</h3>
*
* <p>Permission requirements depend on the {@link PipelineRunner
* PipelineRunner} that is used to execute the Beam pipeline. Please refer to the documentation of
* corresponding {@link PipelineRunner PipelineRunners} for more details.
*
* @param <T> Type of the objects that represent the records of the XML file. The {@code
* PCollection} generated by this source will be of this type.
*/
// CHECKSTYLE.ON: JavadocStyle
public static <T> Read<T> read() {
  // Begin with the generated (empty) builder, then seed the defaults: the default minimum
  // bundle size, extension-based compression detection, and the UTF-8 charset.
  Read.Builder<T> defaults = new AutoValue_XmlIO_Read.Builder<T>();
  return defaults
      .setMinBundleSize(Read.DEFAULT_MIN_BUNDLE_SIZE)
      .setCompressionType(Read.CompressionType.AUTO)
      .setCharset("UTF-8")
      .build();
}
// CHECKSTYLE.OFF: JavadocStyle
/**
* A {@link FileBasedSink} that outputs records as XML-formatted elements. Writes a {@link
* PCollection} of records from JAXB-annotated classes to a single file location.
*
* <p>Given a PCollection containing records of type T that can be marshalled to XML elements,
* this Sink will produce a single file consisting of a single root element that contains all of
* the elements in the PCollection.
*
* <p>XML Sinks are created with a base filename to write to, a root element name that will be
* used for the root element of the output files, and a class to bind to an XML element. This
* class will be used in the marshalling of records in an input PCollection to their XML
* representation and must be able to be bound using JAXB annotations (checked at pipeline
* construction time).
*
* <p>XML Sinks can be written to using the {@link Write} transform:
*
* <pre>{@code
* p.apply(XmlIO.<Type>write()
* .withRecordClass(Type.class)
* .withRootElement(root_element)
* .to(output_filename));
* }</pre>
*
* <p>For example, consider the following class with JAXB annotations:
*
* <pre>
* {@literal @}XmlRootElement(name = "word_count_result")
* {@literal @}XmlType(propOrder = {"word", "frequency"})
* public class WordFrequency {
* private String word;
* private long frequency;
*
* public WordFrequency() { }
*
* public WordFrequency(String word, long frequency) {
* this.word = word;
* this.frequency = frequency;
* }
*
* public void setWord(String word) {
* this.word = word;
* }
*
* public void setFrequency(long frequency) {
* this.frequency = frequency;
* }
*
* public long getFrequency() {
* return frequency;
* }
*
* public String getWord() {
* return word;
* }
* }
* </pre>
*
* <p>The following will produce XML output with a root element named "words" from a PCollection
* of WordFrequency objects:
*
* <pre>{@code
* p.apply(XmlIO.<WordFrequency>write()
* .withRecordClass(WordFrequency.class)
* .withRootElement("words")
* .to(output_file));
* }</pre>
*
* <p>The output of which will look like:
*
* <pre>{@code
* <words>
*
* <word_count_result>
* <word>decreased</word>
* <frequency>1</frequency>
* </word_count_result>
*
* <word_count_result>
* <word>War</word>
* <frequency>4</frequency>
* </word_count_result>
*
* <word_count_result>
* <word>empress'</word>
* <frequency>14</frequency>
* </word_count_result>
*
* <word_count_result>
* <word>stoops</word>
* <frequency>6</frequency>
* </word_count_result>
*
* ...
* </words>
* }</pre>
*
* <p>By default the UTF-8 charset is used. This can be overridden, for example:
*
* <pre>{@code
* p.apply(XmlIO.<Type>write()
* .withRecordClass(Type.class)
* .withRootElement(root_element)
* .withCharset(StandardCharsets.ISO_8859_1)
* .to(output_filename));
* }</pre>
*/
// CHECKSTYLE.ON: JavadocStyle
public static <T> Write<T> write() {
  // Only the charset has a default (UTF-8); every other property starts out unset.
  Write.Builder<T> defaults = new AutoValue_XmlIO_Write.Builder<T>();
  return defaults.setCharset("UTF-8").build();
}
/**
 * Implementation of {@link #read}.
 *
 * <p>Every {@code from}/{@code with*} method leaves this instance untouched and returns a new
 * {@code Read} obtained from {@link #toBuilder()} with the single property replaced.
 */
@AutoValue
public abstract static class Read<T> extends PTransform<PBegin, PCollection<T>> {
  /** Minimum bundle size that {@link XmlIO#read()} configures when none is set explicitly. */
  private static final long DEFAULT_MIN_BUNDLE_SIZE = 8L * 1024L;

  /** File name or glob pattern to read; {@code null} until {@link #from} is called. */
  @Nullable
  abstract String getFileOrPatternSpec();

  /** Name of the XML root element; {@code null} until {@link #withRootElement} is called. */
  @Nullable
  abstract String getRootElement();

  /** Name of the XML record element; {@code null} until {@link #withRecordElement} is called. */
  @Nullable
  abstract String getRecordElement();

  /** JAXB annotated record class; {@code null} until {@link #withRecordClass} is called. */
  @Nullable
  abstract Class<T> getRecordClass();

  /** Compression strategy used by {@link #createSource()}. */
  abstract CompressionType getCompressionType();

  /** Minimum bundle size of the source. */
  abstract long getMinBundleSize();

  /** Name of the charset of the XML files (stored as {@link Charset#name()}). */
  @Nullable
  abstract String getCharset();

  /** Returns a builder pre-populated with this instance's properties. */
  abstract Builder<T> toBuilder();

  /** Optional JAXB validation event handler. */
  @Nullable
  abstract ValidationEventHandler getValidationEventHandler();

  /** AutoValue builder for {@link Read}. */
  @AutoValue.Builder
  abstract static class Builder<T> {
    abstract Builder<T> setFileOrPatternSpec(String fileOrPatternSpec);

    abstract Builder<T> setRootElement(String rootElement);

    abstract Builder<T> setRecordElement(String recordElement);

    abstract Builder<T> setRecordClass(Class<T> recordClass);

    abstract Builder<T> setMinBundleSize(long minBundleSize);

    abstract Builder<T> setCompressionType(CompressionType compressionType);

    abstract Builder<T> setCharset(String charset);

    abstract Builder<T> setValidationEventHandler(ValidationEventHandler validationEventHandler);

    abstract Read<T> build();
  }

  /** Strategy for determining the compression type of XML files being read. */
  public enum CompressionType {
    /** Automatically determine the compression type based on filename extension. */
    AUTO(""),
    /** Uncompressed (i.e., may be split). */
    UNCOMPRESSED(""),
    /** GZipped. */
    GZIP(".gz"),
    /** BZipped. */
    BZIP2(".bz2"),
    /** Zipped. */
    ZIP(".zip"),
    /** Deflate compressed. */
    DEFLATE(".deflate");

    private final String filenameSuffix;

    CompressionType(String suffix) {
      this.filenameSuffix = suffix;
    }

    /**
     * Determine if a given filename matches a compression type based on its extension.
     *
     * <p>The comparison is case-insensitive and uses {@link Locale#ROOT} so that the result does
     * not depend on the JVM's default locale. {@link #AUTO} and {@link #UNCOMPRESSED} have an
     * empty suffix and therefore match every filename.
     *
     * @param filename the filename to match
     * @return true iff the filename ends with the compression type's known extension.
     */
    public boolean matches(String filename) {
      return filename
          .toLowerCase(Locale.ROOT)
          .endsWith(filenameSuffix.toLowerCase(Locale.ROOT));
    }
  }

  /**
   * Reads a single XML file or a set of XML files defined by a Java "glob" file pattern. Each XML
   * file should be of the form defined in {@link #read}.
   */
  public Read<T> from(String fileOrPatternSpec) {
    return toBuilder().setFileOrPatternSpec(fileOrPatternSpec).build();
  }

  /**
   * Sets name of the root element of the XML document. This will be used to create a valid
   * starting root element when initiating a bundle of records created from an XML document. This
   * is a required parameter.
   */
  public Read<T> withRootElement(String rootElement) {
    return toBuilder().setRootElement(rootElement).build();
  }

  /**
   * Sets name of the record element of the XML document. This will be used to determine offset of
   * the first record of a bundle created from the XML document. This is a required parameter.
   */
  public Read<T> withRecordElement(String recordElement) {
    return toBuilder().setRecordElement(recordElement).build();
  }

  /**
   * Sets a JAXB annotated class that can be populated using a record of the provided XML file.
   * This will be used when unmarshalling record objects from the XML file. This is a required
   * parameter.
   */
  public Read<T> withRecordClass(Class<T> recordClass) {
    return toBuilder().setRecordClass(recordClass).build();
  }

  /**
   * Sets a parameter {@code minBundleSize} for the minimum bundle size of the source. Please
   * refer to {@link OffsetBasedSource} for the definition of minBundleSize. This is an optional
   * parameter.
   */
  public Read<T> withMinBundleSize(long minBundleSize) {
    return toBuilder().setMinBundleSize(minBundleSize).build();
  }

  /**
   * Decompresses all input files using the specified compression type.
   *
   * <p>If no compression type is specified, the default is {@link CompressionType#AUTO}. In this
   * mode, the compression type of the file is determined by its extension. Supports .gz, .bz2,
   * .zip and .deflate compression.
   */
  public Read<T> withCompressionType(CompressionType compressionType) {
    return toBuilder().setCompressionType(compressionType).build();
  }

  /**
   * Sets the XML file charset. The charset is stored by its canonical name ({@link
   * Charset#name()}).
   */
  public Read<T> withCharset(Charset charset) {
    return toBuilder().setCharset(charset.name()).build();
  }

  /**
   * Sets the {@link ValidationEventHandler} to use with JAXB. Calling this with a {@code null}
   * parameter will cause the JAXB unmarshaller event handler to be unspecified.
   */
  public Read<T> withValidationEventHandler(ValidationEventHandler validationEventHandler) {
    return toBuilder().setValidationEventHandler(validationEventHandler).build();
  }

  /**
   * Checks that the root element, record element, record class and charset have all been set.
   *
   * @throws NullPointerException if any of those properties is {@code null}
   */
  @Override
  public void validate(PipelineOptions options) {
    checkNotNull(
        getRootElement(),
        "rootElement is null. Use builder method withRootElement() to set this.");
    checkNotNull(
        getRecordElement(),
        "recordElement is null. Use builder method withRecordElement() to set this.");
    checkNotNull(
        getRecordClass(),
        "recordClass is null. Use builder method withRecordClass() to set this.");
    checkNotNull(
        getCharset(),
        "charset is null. Use builder method withCharset() to set this.");
  }

  /**
   * Registers this transform's configuration as display data.
   *
   * <p>The minimum bundle size is only registered when it differs from the default configured by
   * {@link XmlIO#read()}; all nullable properties (including the file pattern) are only
   * registered when they have been set.
   */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    builder
        .addIfNotDefault(
            DisplayData.item("minBundleSize", getMinBundleSize())
                .withLabel("Minimum Bundle Size"),
            // Compare against the value read() actually installs, not an unrelated literal.
            DEFAULT_MIN_BUNDLE_SIZE)
        // The file pattern is @Nullable like the items below, so it must not be added blindly.
        .addIfNotNull(
            DisplayData.item("filePattern", getFileOrPatternSpec()).withLabel("File Pattern"))
        .addIfNotNull(
            DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element"))
        .addIfNotNull(
            DisplayData.item("recordElement", getRecordElement()).withLabel("XML Record Element"))
        .addIfNotNull(
            DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"))
        .addIfNotNull(
            DisplayData.item("charset", getCharset()).withLabel("Charset"));
  }

  /**
   * Builds the source for this configuration.
   *
   * <p>{@link CompressionType#UNCOMPRESSED} returns the plain {@code XmlSource}; every other type
   * wraps it in a {@link CompressedSource}. For {@link CompressionType#AUTO} no explicit
   * decompression mode is set on the wrapper.
   *
   * @throws IllegalArgumentException if the compression type is not one of the known constants
   */
  @VisibleForTesting
  BoundedSource<T> createSource() {
    XmlSource<T> source = new XmlSource<>(this);
    switch (getCompressionType()) {
      case UNCOMPRESSED:
        return source;
      case AUTO:
        return CompressedSource.from(source);
      case BZIP2:
        return CompressedSource.from(source)
            .withDecompression(CompressedSource.CompressionMode.BZIP2);
      case GZIP:
        return CompressedSource.from(source)
            .withDecompression(CompressedSource.CompressionMode.GZIP);
      case ZIP:
        return CompressedSource.from(source)
            .withDecompression(CompressedSource.CompressionMode.ZIP);
      case DEFLATE:
        return CompressedSource.from(source)
            .withDecompression(CompressedSource.CompressionMode.DEFLATE);
      default:
        throw new IllegalArgumentException("Unknown compression type: " + getCompressionType());
    }
  }

  /** Applies a bounded read of {@link #createSource()} to the pipeline. */
  @Override
  public PCollection<T> expand(PBegin input) {
    return input.apply(org.apache.beam.sdk.io.Read.from(createSource()));
  }
}
/**
 * Implementation of {@link #write}.
 *
 * <p>Each {@code to}/{@code with*} method returns a new {@code Write} derived from {@link
 * #toBuilder()}; the receiver itself is not modified.
 */
@AutoValue
public abstract static class Write<T> extends PTransform<PCollection<T>, PDone> {
  /** Prefix of the output files; {@code null} until {@link #to} is called. */
  @Nullable
  abstract ValueProvider<ResourceId> getFilenamePrefix();

  /** JAXB-bound class of the records; {@code null} until {@link #withRecordClass} is called. */
  @Nullable
  abstract Class<T> getRecordClass();

  /** Name of the enclosing root element; {@code null} until {@link #withRootElement} is called. */
  @Nullable
  abstract String getRootElement();

  /** Name of the output charset (stored as {@link Charset#name()}). */
  @Nullable
  abstract String getCharset();

  /** Returns a builder pre-populated with this instance's properties. */
  abstract Builder<T> toBuilder();

  /** AutoValue builder for {@link Write}. */
  @AutoValue.Builder
  abstract static class Builder<T> {
    abstract Builder<T> setFilenamePrefix(ValueProvider<ResourceId> prefix);

    abstract Builder<T> setRecordClass(Class<T> recordClass);

    abstract Builder<T> setRootElement(String rootElement);

    abstract Builder<T> setCharset(String charset);

    abstract Write<T> build();
  }

  /**
   * Writes to files with the given path prefix.
   *
   * <p>Output files will have the name {@literal {filenamePrefix}-0000i-of-0000n.xml} where n is
   * the number of output bundles.
   */
  public Write<T> to(String filenamePrefix) {
    ValueProvider<ResourceId> prefix =
        StaticValueProvider.of(FileBasedSink.convertToFileResourceIfPossible(filenamePrefix));
    Builder<T> updated = toBuilder().setFilenamePrefix(prefix);
    return updated.build();
  }

  /**
   * Writes objects of the given class mapped to XML elements using JAXB.
   *
   * <p>The specified class must be able to be used to create a JAXB context.
   */
  public Write<T> withRecordClass(Class<T> recordClass) {
    Builder<T> updated = toBuilder().setRecordClass(recordClass);
    return updated.build();
  }

  /** Sets the enclosing root element for the generated XML files. */
  public Write<T> withRootElement(String rootElement) {
    Builder<T> updated = toBuilder().setRootElement(rootElement);
    return updated.build();
  }

  /** Sets the charset used to write the file. */
  public Write<T> withCharset(Charset charset) {
    String charsetName = charset.name();
    return toBuilder().setCharset(charsetName).build();
  }

  /**
   * Verifies that record class, root element, filename prefix and charset are all set, and that
   * a JAXB context can be created for the record class.
   *
   * @throws NullPointerException if a required property is {@code null}
   * @throws RuntimeException if the record class cannot be bound to a JAXB context
   */
  @Override
  public void validate(PipelineOptions options) {
    Class<T> recordClass =
        checkNotNull(getRecordClass(), "Missing a class to bind to a JAXB context.");
    checkNotNull(getRootElement(), "Missing a root element name.");
    checkNotNull(getFilenamePrefix(), "Missing a filename to write to.");
    checkNotNull(getCharset(), "Missing charset");
    verifyJaxbBindable(recordClass);
  }

  /** Attempts to create a JAXB context for {@code recordClass}, rethrowing failures unchecked. */
  private static void verifyJaxbBindable(Class<?> recordClass) {
    try {
      JAXBContext.newInstance(recordClass);
    } catch (JAXBException e) {
      throw new RuntimeException("Error binding classes to a JAXB Context.", e);
    }
  }

  /** Creates the sink that backs this transform. */
  @VisibleForTesting
  XmlSink<T> createSink() {
    return new XmlSink<>(this);
  }

  /** Writes the input through {@code WriteFiles} using {@link #createSink()}. */
  @Override
  public PDone expand(PCollection<T> input) {
    XmlSink<T> sink = createSink();
    return input.apply(org.apache.beam.sdk.io.WriteFiles.to(sink));
  }

  /**
   * Registers the sink's file-based display data, followed by the root element, record class and
   * charset when they are set.
   */
  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    XmlSink<T> sink = createSink();
    sink.populateFileBasedDisplayData(builder);
    builder
        .addIfNotNull(
            DisplayData.item("rootElement", getRootElement()).withLabel("XML Root Element"))
        .addIfNotNull(
            DisplayData.item("recordClass", getRecordClass()).withLabel("XML Record Class"))
        .addIfNotNull(DisplayData.item("charset", getCharset()).withLabel("Charset"));
  }
}
}