/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.hdfstests;

import org.apache.flink.api.common.io.FilePathFilter;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction;
import org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperator;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;
import org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit;
import org.apache.flink.streaming.util.StreamingProgramTestBase;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.junit.Assert.assertEquals;

public class ContinuousFileProcessingITCase extends StreamingProgramTestBase {

	private static final int NO_OF_FILES = 5;
	private static final int LINES_PER_FILE = 100;

	private static final int PARALLELISM = 4;
	private static final long INTERVAL = 100;

	private File baseDir;

	private org.apache.hadoop.fs.FileSystem hdfs;
	private String hdfsURI;
	private MiniDFSCluster hdfsCluster;

	private static Map<Integer, String> expectedContents = new HashMap<>();

	//						PREPARING FOR THE TESTS

	@Before
	public void createHDFS() {
		try {
			baseDir = new File("./target/hdfs/hdfsTesting").getAbsoluteFile();
			FileUtil.fullyDelete(baseDir);

			org.apache.hadoop.conf.Configuration hdConf = new org.apache.hadoop.conf.Configuration();
			hdConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, baseDir.getAbsolutePath());
			hdConf.set("dfs.block.size", String.valueOf(1048576)); // this is the minimum we can set.
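			// build() below starts an in-process MiniDFSCluster backed by the base directory configured above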
			MiniDFSCluster.Builder builder = new MiniDFSCluster.Builder(hdConf);
			hdfsCluster = builder.build();

			hdfsURI = "hdfs://" + hdfsCluster.getURI().getHost() + ":" + hdfsCluster.getNameNodePort() + "/";
			hdfs = new org.apache.hadoop.fs.Path(hdfsURI).getFileSystem(hdConf);

		} catch (Throwable e) {
			e.printStackTrace();
			Assert.fail("Test failed " + e.getMessage());
		}
	}

	@After
	public void destroyHDFS() {
		try {
			FileUtil.fullyDelete(baseDir);
			hdfsCluster.shutdown();
		} catch (Throwable t) {
			throw new RuntimeException(t);
		}
	}

	//						END OF PREPARATIONS

	@Override
	protected void testProgram() throws Exception {

		/*
		 * This test checks the interplay between the monitor and the reader,
		 * and also the failExternally() functionality. To test the latter we
		 * set the parallelism of the sink to 1 so that it is chained with the
		 * reader; the sink throws the SuccessException to signal the end of the test.
		 */

		TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
		format.setFilePath(hdfsURI);
		format.setFilesFilter(FilePathFilter.createDefaultFilter());

		// create the stream execution environment with a parallelism > 1 to test
		final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(PARALLELISM);

		ContinuousFileMonitoringFunction<String> monitoringFunction =
			new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY,
				env.getParallelism(), INTERVAL);

		// the monitor always has DOP 1
		DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
		Assert.assertEquals(1, splits.getParallelism());

		ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
		TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

		// the readers can be multiple
		DataStream<String> content = splits.transform("FileSplitReader", typeInfo, reader);
		Assert.assertEquals(PARALLELISM, content.getParallelism());

		// finally for the sink we set the parallelism to 1 so that we can verify the output
		TestingSinkFunction sink = new TestingSinkFunction();
		content.addSink(sink).setParallelism(1);

		Thread job = new Thread() {
			@Override
			public void run() {
				try {
					env.execute("ContinuousFileProcessingITCase Job.");
				} catch (Exception e) {
					Throwable th = e;
					for (int depth = 0; depth < 20; depth++) {
						if (th instanceof SuccessException) {
							try {
								postSubmit();
							} catch (Exception e1) {
								e1.printStackTrace();
							}
							return;
						} else if (th.getCause() != null) {
							th = th.getCause();
						} else {
							break;
						}
					}
					e.printStackTrace();
					Assert.fail(e.getMessage());
				}
			}
		};
		job.start();

		// The modification time of the last created file.
		long lastCreatedModTime = Long.MIN_VALUE;

		// create the files to be read
		for (int i = 0; i < NO_OF_FILES; i++) {
			Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
			long modTime;
			do {

				// give it some time so that the files have
				// different modification timestamps.
				Thread.sleep(50);

				tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

				modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
				if (modTime <= lastCreatedModTime) {
					// delete the last created file to recreate it with a different timestamp
					hdfs.delete(tmpFile.f0, false);
				}
			} while (modTime <= lastCreatedModTime);
			lastCreatedModTime = modTime;

			// put the contents in the expected results list before the reader picks them
			// this is to guarantee that they are in before the reader finishes (avoid race conditions)
			expectedContents.put(i, tmpFile.f1);

			org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
			hdfs.rename(tmpFile.f0, file);
			Assert.assertTrue(hdfs.exists(file));
		}

		// wait for the job to finish.
		job.join();
	}

	private static class TestingSinkFunction extends RichSinkFunction<String> {

		private int elementCounter = 0;
		private Map<Integer, Set<String>> actualContent = new HashMap<>();

		private transient Comparator<String> comparator;

		@Override
		public void open(Configuration parameters) throws Exception {
			// this sink can only work with DOP 1
			assertEquals(1, getRuntimeContext().getNumberOfParallelSubtasks());

			comparator = new Comparator<String>() {
				@Override
				public int compare(String o1, String o2) {
					return getLineNo(o1) - getLineNo(o2);
				}
			};
		}

		@Override
		public void invoke(String value) throws Exception {
			int fileIdx = getFileIdx(value);

			Set<String> content = actualContent.get(fileIdx);
			if (content == null) {
				content = new HashSet<>();
				actualContent.put(fileIdx, content);
			}

			if (!content.add(value + "\n")) {
				Assert.fail("Duplicate line: " + value);
				System.exit(0);
			}

			elementCounter++;
			if (elementCounter == NO_OF_FILES * LINES_PER_FILE) {
				throw new SuccessException();
			}
		}

		@Override
		public void close() {
			// check if the data that we collected are the ones they are supposed to be.
			Assert.assertEquals(expectedContents.size(), actualContent.size());
			for (Integer fileIdx: expectedContents.keySet()) {
				Assert.assertTrue(actualContent.keySet().contains(fileIdx));

				List<String> cntnt = new ArrayList<>(actualContent.get(fileIdx));
				Collections.sort(cntnt, comparator);

				StringBuilder cntntStr = new StringBuilder();
				for (String line: cntnt) {
					cntntStr.append(line);
				}
				Assert.assertEquals(expectedContents.get(fileIdx), cntntStr.toString());
			}
			expectedContents.clear();
		}

		private int getLineNo(String line) {
			String[] tkns = line.split("\\s");
			return Integer.parseInt(tkns[tkns.length - 1]);
		}

		private int getFileIdx(String line) {
			String[] tkns = line.split(":");
			return Integer.parseInt(tkns[0]);
		}
	}

	/** Create a file and fill it with content. */
	private Tuple2<org.apache.hadoop.fs.Path, String> fillWithData(
			String base, String fileName, int fileIdx, String sampleLine) throws IOException, InterruptedException {

		assert (hdfs != null);

		org.apache.hadoop.fs.Path tmp = new org.apache.hadoop.fs.Path(base + "/." + fileName + fileIdx);

		FSDataOutputStream stream = hdfs.create(tmp);
		StringBuilder str = new StringBuilder();
		for (int i = 0; i < LINES_PER_FILE; i++) {
			String line = fileIdx + ": " + sampleLine + " " + i + "\n";
			str.append(line);
			stream.write(line.getBytes(ConfigConstants.DEFAULT_CHARSET));
		}
		stream.close();
		return new Tuple2<>(tmp, str.toString());
	}

	public static class SuccessException extends Exception {
		private static final long serialVersionUID = -7011865671593955887L;
	}
}