/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.tinkerpop.gremlin.hadoop.structure.io;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tinkerpop.gremlin.TestHelper;
import org.apache.tinkerpop.gremlin.hadoop.HadoopGraphProvider;
import org.apache.tinkerpop.gremlin.structure.Direction;
import org.apache.tinkerpop.gremlin.structure.Vertex;
import org.apache.tinkerpop.gremlin.util.iterator.IteratorUtils;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.UUID;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * @author Daniel Kuppitz (http://gremlin.guru)
 * @author Marko A. Rodriguez (http://markorodriguez.com)
 */
public abstract class RecordReaderWriterTest {

    private static final Logger logger = LoggerFactory.getLogger(RecordReaderWriterTest.class);

    protected abstract String getInputFilename();

    protected abstract Class<? extends InputFormat<NullWritable, VertexWritable>> getInputFormat();

    protected abstract Class<? extends OutputFormat<NullWritable, VertexWritable>> getOutputFormat();
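    // A concrete subclass only has to name the sample file and the two formats under
    // test. A minimal sketch, assuming the Gryo formats that ship with hadoop-gremlin
    // (the class name and file key below are illustrative, not part of this suite):
    //
    //   public class GryoRecordReaderWriterTest extends RecordReaderWriterTest {
    //       protected String getInputFilename() { return "grateful-dead.kryo"; }
    //       protected Class<? extends InputFormat<NullWritable, VertexWritable>> getInputFormat() { return GryoInputFormat.class; }
    //       protected Class<? extends OutputFormat<NullWritable, VertexWritable>> getOutputFormat() { return GryoOutputFormat.class; }
    //   }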
    @Test
    public void shouldSplitFileAndWriteProperSplits() throws Exception {
        for (int numberOfSplits = 1; numberOfSplits < 10; numberOfSplits++) {
            final File testFile = new File(HadoopGraphProvider.PATHS.get(getInputFilename()));
            logger.info("Testing: {} (splits {})", testFile, numberOfSplits);
            final List<FileSplit> splits = generateFileSplits(testFile, numberOfSplits);
            final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass = getInputFormat();
            final Class<? extends OutputFormat<NullWritable, VertexWritable>> outputFormatClass = getOutputFormat();
            final File outputDirectory = TestHelper.makeTestDataPath(inputFormatClass, "hadoop-record-reader-writer-test");
            final Configuration config = configure(outputDirectory);
            validateFileSplits(splits, config, inputFormatClass, Optional.of(outputFormatClass));
        }
    }

    protected Configuration configure(final File outputDirectory) {
        final Configuration configuration = new Configuration(false);
        configuration.set("fs.file.impl", LocalFileSystem.class.getName());
        configuration.set("fs.defaultFS", "file:///");
        configuration.set("mapreduce.output.fileoutputformat.outputdir", "file:///" + outputDirectory.getAbsolutePath());
        return configuration;
    }

    private static List<FileSplit> generateFileSplits(final File file, final int numberOfSplits) {
        final long fileSize = file.length();
        final long splitLength = (long) ((double) fileSize / (double) numberOfSplits);
        final List<FileSplit> splits = new ArrayList<>();
        // Offsets advance by splitLength + 1, so the nominal split boundaries rarely
        // align with record boundaries: the record readers under test must skip a
        // leading partial record and read past their nominal end to finish the last one.
        for (int i = 0; i < fileSize; i = i + (int) splitLength + 1) {
            splits.add(new FileSplit(new Path(file.getAbsoluteFile().toURI().toString()), i, splitLength, null));
        }
        return splits;
    }
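    // Reads every split back through the InputFormat under test and checks that
    // reader progress never decreases, that the aggregate vertex/edge counts match
    // the known totals of the Grateful Dead sample graph, and, when an OutputFormat
    // is supplied, that the file produced by its RecordWriter survives the same
    // split-and-reread cycle (the recursive call below with Optional.empty()).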
    private static void validateFileSplits(final List<FileSplit> fileSplits, final Configuration configuration,
                                           final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass,
                                           final Optional<Class<? extends OutputFormat<NullWritable, VertexWritable>>> outFormatClass) throws Exception {

        final InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, configuration);
        final TaskAttemptContext job = new TaskAttemptContextImpl(configuration, new TaskAttemptID(UUID.randomUUID().toString(), 0, TaskType.MAP, 0, 0));

        int vertexCount = 0;
        int outEdgeCount = 0;
        int inEdgeCount = 0;

        final OutputFormat<NullWritable, VertexWritable> outputFormat = outFormatClass.isPresent() ? ReflectionUtils.newInstance(outFormatClass.get(), configuration) : null;
        final RecordWriter<NullWritable, VertexWritable> writer = null == outputFormat ? null : outputFormat.getRecordWriter(job);

        boolean foundKeyValue = false;
        for (final FileSplit split : fileSplits) {
            logger.info("\treading file split {} ({}...{} bytes)", split.getPath().getName(), split.getStart(), split.getStart() + split.getLength());
            final RecordReader reader = inputFormat.createRecordReader(split, job);
            float lastProgress = -1f;
            while (reader.nextKeyValue()) {
                //System.out.println("" + reader.getProgress() + "> " + reader.getCurrentKey() + ": " + reader.getCurrentValue());
                final float progress = reader.getProgress();
                assertTrue(progress >= lastProgress);
                assertEquals(NullWritable.class, reader.getCurrentKey().getClass());
                final VertexWritable vertexWritable = (VertexWritable) reader.getCurrentValue();
                if (null != writer) writer.write(NullWritable.get(), vertexWritable);
                vertexCount++;
                outEdgeCount = outEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.OUT));
                inEdgeCount = inEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.IN));

                final Vertex vertex = vertexWritable.get();
                assertEquals(Integer.class, vertex.id().getClass());
                // spot-check a known vertex: SUGAR MAGNOLIA has 92 out-edges and 77 in-edges
                if (vertex.value("name").equals("SUGAR MAGNOLIA")) {
                    foundKeyValue = true;
                    assertEquals(92, IteratorUtils.count(vertex.edges(Direction.OUT)));
                    assertEquals(77, IteratorUtils.count(vertex.edges(Direction.IN)));
                }
                lastProgress = progress;
            }
        }

        // the Grateful Dead sample graph contains 808 vertices and 8049 edges in total
        assertEquals(8049, outEdgeCount);
        assertEquals(8049, inEdgeCount);
        assertEquals(outEdgeCount, inEdgeCount);
        assertEquals(808, vertexCount);
        assertTrue(foundKeyValue);

        if (null != writer) {
            writer.close(new TaskAttemptContextImpl(configuration, job.getTaskAttemptID()));
            // re-split and re-read the file the writer just produced to prove it round-trips
            for (int i = 1; i < 10; i++) {
                final File outputDirectory = new File(new URL(configuration.get("mapreduce.output.fileoutputformat.outputdir")).toURI());
                final List<FileSplit> splits = generateFileSplits(new File(outputDirectory.getAbsoluteFile() + "/_temporary/0/_temporary/"
                        + job.getTaskAttemptID().getTaskID().toString().replace("task", "attempt") + "_0" + "/part-m-00000"), i);
                validateFileSplits(splits, configuration, inputFormatClass, Optional.empty());
            }
        }
    }
}