/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.data.management.copy.converter; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import com.google.common.base.Function; import gobblin.configuration.WorkUnitState; import gobblin.converter.Converter; import gobblin.converter.DataConversionException; import gobblin.converter.SchemaConversionException; import gobblin.converter.SingleRecordIterable; import gobblin.data.management.copy.CopyableFile; import gobblin.data.management.copy.FileAwareInputStream; import gobblin.util.PathUtils; /** * Abstract class for distcp {@link Converter}. Simply transforms the {@link InputStream} in the * {@link FileAwareInputStream}, and possibly modifies extensions of the output file. */ public abstract class DistcpConverter extends Converter<String, String, FileAwareInputStream, FileAwareInputStream> { @Override public Converter<String, String, FileAwareInputStream, FileAwareInputStream> init(WorkUnitState workUnit) { return super.init(workUnit); } /** * @return A {@link Function} that transforms the {@link InputStream} in the {@link FileAwareInputStream}. */ public abstract Function<InputStream, InputStream> inputStreamTransformation(); /** * @return A list of extensions that should be removed from the output file name, which will be applied in order. * For example, if this method returns ["gz", "tar", "tgz"] then "file.tar.gz" becomes "file". */ public List<String> extensionsToRemove() { return new ArrayList<>(); } /** * TODO: actually use this method and add the extensions. * @return A list of extensions that should be added to the output file name, to be applied in order. * For example, if this method returns ["tar", "gz"] then "file" becomes "file.tar.gz". */ public List<String> extensionsToAdd() { return new ArrayList<>(); } /** * Identity schema converter. */ @Override public String convertSchema(String inputSchema, WorkUnitState workUnit) throws SchemaConversionException { return inputSchema; } /** * Applies the transformation in {@link #inputStreamTransformation} to the {@link InputStream} in the * {@link FileAwareInputStream}. */ @Override public Iterable<FileAwareInputStream> convertRecord(String outputSchema, FileAwareInputStream fileAwareInputStream, WorkUnitState workUnit) throws DataConversionException { modifyExtensionAtDestination(fileAwareInputStream.getFile()); try { InputStream newInputStream = inputStreamTransformation().apply(fileAwareInputStream.getInputStream()); return new SingleRecordIterable<>(new FileAwareInputStream(fileAwareInputStream.getFile(), newInputStream)); } catch (RuntimeException re) { throw new DataConversionException(re); } } private void modifyExtensionAtDestination(CopyableFile file) { if (extensionsToRemove().size() > 0) { file.setDestination(PathUtils.removeExtension(file.getDestination(), extensionsToRemove().toArray(new String[0]))); } } }