/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.tinkerpop.gremlin.hadoop.structure.io;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.tinkerpop.gremlin.TestHelper;
import org.apache.tinkerpop.gremlin.hadoop.HadoopGraphProvider;
import org.apache.tinkerpop.gremlin.structure.Direction;
import org.apache.tinkerpop.gremlin.structure.Vertex;
import org.apache.tinkerpop.gremlin.util.iterator.IteratorUtils;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.UUID;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * @author Daniel Kuppitz (http://gremlin.guru)
 * @author Marko A. Rodriguez (http://markorodriguez.com)
 */
public abstract class RecordReaderWriterTest {

    private static final Logger logger = LoggerFactory.getLogger(RecordReaderWriterTest.class);

    protected abstract String getInputFilename();

    protected abstract Class<? extends InputFormat<NullWritable, VertexWritable>> getInputFormat();

    protected abstract Class<? extends OutputFormat<NullWritable, VertexWritable>> getOutputFormat();
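    // A concrete subclass only has to name the sample file and the two formats under
    // test. A minimal sketch, assuming the Gryo formats that ship with hadoop-gremlin
    // (the class name and file key below are illustrative, not part of this suite):
    //
    //   public class GryoRecordReaderWriterTest extends RecordReaderWriterTest {
    //       protected String getInputFilename() { return "grateful-dead.kryo"; }
    //       protected Class<? extends InputFormat<NullWritable, VertexWritable>> getInputFormat() { return GryoInputFormat.class; }
    //       protected Class<? extends OutputFormat<NullWritable, VertexWritable>> getOutputFormat() { return GryoOutputFormat.class; }
    //   }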
    @Test
    public void shouldSplitFileAndWriteProperSplits() throws Exception {
        for (int numberOfSplits = 1; numberOfSplits < 10; numberOfSplits++) {
            final File testFile = new File(HadoopGraphProvider.PATHS.get(getInputFilename()));
            logger.info("Testing: {} (splits {})", testFile, numberOfSplits);
            final List<FileSplit> splits = generateFileSplits(testFile, numberOfSplits);
            final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass = getInputFormat();
            final Class<? extends OutputFormat<NullWritable, VertexWritable>> outputFormatClass = getOutputFormat();
            final File outputDirectory = TestHelper.makeTestDataPath(inputFormatClass, "hadoop-record-reader-writer-test");
            final Configuration config = configure(outputDirectory);
            validateFileSplits(splits, config, inputFormatClass, Optional.of(outputFormatClass));
        }
    }

    protected Configuration configure(final File outputDirectory) {
        final Configuration configuration = new Configuration(false);
        configuration.set("fs.file.impl", LocalFileSystem.class.getName());
        configuration.set("fs.defaultFS", "file:///");
        configuration.set("mapreduce.output.fileoutputformat.outputdir", "file:///" + outputDirectory.getAbsolutePath());
        return configuration;
    }

    private static List<FileSplit> generateFileSplits(final File file, final int numberOfSplits) {
        final long fileSize = file.length();
        final long splitLength = (long) ((double) fileSize / (double) numberOfSplits);
        final List<FileSplit> splits = new ArrayList<>();
        // Offsets advance by splitLength + 1, so the nominal split boundaries rarely
        // align with record boundaries: the record readers under test must skip a
        // leading partial record and read past their nominal end to finish the last one.
        for (int i = 0; i < fileSize; i = i + (int) splitLength + 1) {
            splits.add(new FileSplit(new Path(file.getAbsoluteFile().toURI().toString()), i, splitLength, null));
        }
        return splits;
    }
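    // Reads every split back through the InputFormat under test and checks that
    // reader progress never decreases, that the aggregate vertex/edge counts match
    // the known totals of the Grateful Dead sample graph, and, when an OutputFormat
    // is supplied, that the file produced by its RecordWriter survives the same
    // split-and-reread cycle (the recursive call below with Optional.empty()).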
    private static void validateFileSplits(final List<FileSplit> fileSplits, final Configuration configuration,
                                           final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass,
                                           final Optional<Class<? extends OutputFormat<NullWritable, VertexWritable>>> outFormatClass) throws Exception {

        final InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, configuration);
        final TaskAttemptContext job = new TaskAttemptContextImpl(configuration, new TaskAttemptID(UUID.randomUUID().toString(), 0, TaskType.MAP, 0, 0));

        int vertexCount = 0;
        int outEdgeCount = 0;
        int inEdgeCount = 0;

        final OutputFormat<NullWritable, VertexWritable> outputFormat = outFormatClass.isPresent() ? ReflectionUtils.newInstance(outFormatClass.get(), configuration) : null;
        final RecordWriter<NullWritable, VertexWritable> writer = null == outputFormat ? null : outputFormat.getRecordWriter(job);

        boolean foundKeyValue = false;
        for (final FileSplit split : fileSplits) {
            logger.info("\treading file split {} ({}...{} bytes)", split.getPath().getName(), split.getStart(), split.getStart() + split.getLength());
            final RecordReader reader = inputFormat.createRecordReader(split, job);
            float lastProgress = -1f;
            while (reader.nextKeyValue()) {
                //System.out.println("" + reader.getProgress() + "> " + reader.getCurrentKey() + ": " + reader.getCurrentValue());
                final float progress = reader.getProgress();
                assertTrue(progress >= lastProgress);
                assertEquals(NullWritable.class, reader.getCurrentKey().getClass());
                final VertexWritable vertexWritable = (VertexWritable) reader.getCurrentValue();
                if (null != writer) writer.write(NullWritable.get(), vertexWritable);
                vertexCount++;
                outEdgeCount = outEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.OUT));
                inEdgeCount = inEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.IN));

                final Vertex vertex = vertexWritable.get();
                assertEquals(Integer.class, vertex.id().getClass());
                // spot-check a known vertex: SUGAR MAGNOLIA has 92 out-edges and 77 in-edges
                if (vertex.value("name").equals("SUGAR MAGNOLIA")) {
                    foundKeyValue = true;
                    assertEquals(92, IteratorUtils.count(vertex.edges(Direction.OUT)));
                    assertEquals(77, IteratorUtils.count(vertex.edges(Direction.IN)));
                }
                lastProgress = progress;
            }
        }

        // the Grateful Dead sample graph contains 808 vertices and 8049 edges in total
        assertEquals(8049, outEdgeCount);
        assertEquals(8049, inEdgeCount);
        assertEquals(outEdgeCount, inEdgeCount);
        assertEquals(808, vertexCount);
        assertTrue(foundKeyValue);

        if (null != writer) {
            writer.close(new TaskAttemptContextImpl(configuration, job.getTaskAttemptID()));
            // re-split and re-read the file the writer just produced to prove it round-trips
            for (int i = 1; i < 10; i++) {
                final File outputDirectory = new File(new URL(configuration.get("mapreduce.output.fileoutputformat.outputdir")).toURI());
                final List<FileSplit> splits = generateFileSplits(new File(outputDirectory.getAbsoluteFile() + "/_temporary/0/_temporary/"
                        + job.getTaskAttemptID().getTaskID().toString().replace("task", "attempt") + "_0" + "/part-m-00000"), i);
                validateFileSplits(splits, configuration, inputFormatClass, Optional.empty());
            }
        }
    }
}