package mil.nga.giat.geowave.test.mapreduce;
import java.io.File;
import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.geotools.data.DataStoreFinder;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.opengis.feature.simple.SimpleFeature;
import com.vividsolutions.jts.geom.Geometry;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import mil.nga.giat.geowave.adapter.raster.util.ZipUtils;
import mil.nga.giat.geowave.adapter.vector.export.VectorMRExportCommand;
import mil.nga.giat.geowave.adapter.vector.export.VectorMRExportOptions;
import mil.nga.giat.geowave.core.cli.parser.ManualOperationParams;
import mil.nga.giat.geowave.core.index.ByteArrayId;
import mil.nga.giat.geowave.core.index.ByteArrayUtils;
import mil.nga.giat.geowave.core.store.adapter.DataAdapter;
import mil.nga.giat.geowave.core.store.adapter.WritableDataAdapter;
import mil.nga.giat.geowave.core.store.index.PrimaryIndex;
import mil.nga.giat.geowave.core.store.operations.remote.options.DataStorePluginOptions;
import mil.nga.giat.geowave.core.store.query.DistributableQuery;
import mil.nga.giat.geowave.core.store.query.EverythingQuery;
import mil.nga.giat.geowave.core.store.query.QueryOptions;
import mil.nga.giat.geowave.format.gpx.GpxIngestPlugin;
import mil.nga.giat.geowave.mapreduce.GeoWaveConfiguratorBase;
import mil.nga.giat.geowave.mapreduce.GeoWaveWritableInputMapper;
import mil.nga.giat.geowave.mapreduce.dedupe.GeoWaveDedupeJobRunner;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputFormat;
import mil.nga.giat.geowave.mapreduce.input.GeoWaveInputKey;
import mil.nga.giat.geowave.test.GeoWaveITRunner;
import mil.nga.giat.geowave.test.TestUtils;
import mil.nga.giat.geowave.test.TestUtils.DimensionalityType;
import mil.nga.giat.geowave.test.TestUtils.ExpectedResults;
import mil.nga.giat.geowave.test.annotation.Environments;
import mil.nga.giat.geowave.test.annotation.Environments.Environment;
import mil.nga.giat.geowave.test.annotation.GeoWaveTestStore;
import mil.nga.giat.geowave.test.annotation.GeoWaveTestStore.GeoWaveStoreType;
@RunWith(GeoWaveITRunner.class)
@Environments({
    Environment.MAP_REDUCE
})
/**
 * Integration test that ingests GPX test data via MapReduce, then verifies
 * query results by running a dedupe job followed by a verification job whose
 * mapper counts EXPECTED / UNEXPECTED / ERROR features against a set of
 * pre-computed centroid hashes. Also round-trips the data through the
 * vector MapReduce export command and re-ingests it.
 */
public class BasicMapReduceIT
{
    // Test resource locations, all relative to the shared test-case base
    // directory that the zip below is extracted into.
    protected static final String TEST_DATA_ZIP_RESOURCE_PATH = TestUtils.TEST_RESOURCE_PACKAGE
            + "mapreduce-testdata.zip";
    protected static final String TEST_CASE_GENERAL_GPX_BASE = TestUtils.TEST_CASE_BASE + "general_gpx_test_case/";
    protected static final String GENERAL_GPX_FILTER_PACKAGE = TEST_CASE_GENERAL_GPX_BASE + "filter/";
    protected static final String GENERAL_GPX_FILTER_FILE = GENERAL_GPX_FILTER_PACKAGE + "filter.shp";
    protected static final String GENERAL_GPX_INPUT_GPX_DIR = TEST_CASE_GENERAL_GPX_BASE + "input_gpx/";
    protected static final String GENERAL_GPX_EXPECTED_RESULTS_DIR = TEST_CASE_GENERAL_GPX_BASE + "filter_results/";
    protected static final String OSM_GPX_INPUT_DIR = TestUtils.TEST_CASE_BASE + "osm_gpx_test_case/";

    // Wall-clock start of the suite, used by reportTest() to log elapsed time.
    private static long startMillis;

    /**
     * Extracts the bundled test data zip into the shared test-case directory
     * and records the suite start time.
     *
     * NOTE(review): getResource() would return null if the zip resource is
     * missing from the classpath, producing an NPE here rather than a clear
     * assertion; acceptable for a test bootstrap.
     */
    @BeforeClass
    public static void extractTestFiles()
            throws URISyntaxException {
        ZipUtils.unZipFile(
                new File(
                        MapReduceTestEnvironment.class.getClassLoader().getResource(
                                TEST_DATA_ZIP_RESOURCE_PATH).toURI()),
                TestUtils.TEST_CASE_BASE);
        startMillis = System.currentTimeMillis();
        LOGGER.warn("-----------------------------------------");
        LOGGER.warn("*                                       *");
        LOGGER.warn("*         RUNNING BasicMapReduceIT      *");
        LOGGER.warn("*                                       *");
        LOGGER.warn("-----------------------------------------");
    }

    /** Logs a banner with the total elapsed time for the suite. */
    @AfterClass
    public static void reportTest() {
        LOGGER.warn("-----------------------------------------");
        LOGGER.warn("*                                       *");
        LOGGER.warn("*      FINISHED BasicMapReduceIT        *");
        LOGGER
                .warn("*         " + ((System.currentTimeMillis() - startMillis) / 1000)
                        + "s elapsed.                 *");
        LOGGER.warn("*                                       *");
        LOGGER.warn("-----------------------------------------");
    }

    private final static Logger LOGGER = LoggerFactory.getLogger(BasicMapReduceIT.class);
    private static final String TEST_EXPORT_DIRECTORY = "basicMapReduceIT-export";

    /**
     * Counter groups emitted by {@link VerifyExpectedResultsMapper}: one
     * increment per input feature, classified against the expected centroid
     * hash set.
     */
    public static enum ResultCounterType {
        EXPECTED,
        UNEXPECTED,
        ERROR
    }

    // Injected by GeoWaveITRunner with one of the annotated store types.
    @GeoWaveTestStore({
        GeoWaveStoreType.ACCUMULO,
        GeoWaveStoreType.BIGTABLE,
        GeoWaveStoreType.HBASE
    })
    protected DataStorePluginOptions dataStorePluginOptions;

    /**
     * Ingests the general GPX test set with a spatial index, then runs the
     * verification job using a shapefile-based filter query. Expected results
     * are loaded from the "filter_results" directory: each readable GeoTools
     * data store file there is matched to a GPX input by base name (with the
     * "_filtered" suffix stripped).
     */
    @Test
    public void testIngestAndQueryGeneralGpx()
            throws Exception {
        TestUtils.deleteAll(dataStorePluginOptions);
        MapReduceTestUtils.testMapReduceIngest(
                dataStorePluginOptions,
                DimensionalityType.SPATIAL,
                GENERAL_GPX_INPUT_GPX_DIR);
        final File gpxInputDir = new File(
                GENERAL_GPX_INPUT_GPX_DIR);
        final File expectedResultsDir = new File(
                GENERAL_GPX_EXPECTED_RESULTS_DIR);
        final List<URL> expectedResultsResources = new ArrayList<URL>();
        final Map<String, URL> baseNameToExpectedResultURL = new HashMap<String, URL>();
        // Accept only files that GeoTools can actually open as a data store
        // (e.g. shapefiles); sidecar files (.dbf, .shx, ...) are skipped.
        for (final File file : expectedResultsDir.listFiles(new FileFilter() {
            @Override
            public boolean accept(
                    final File pathname ) {
                final Map<String, Object> map = new HashMap<String, Object>();
                try {
                    map.put(
                            "url",
                            pathname.toURI().toURL());
                    return DataStoreFinder.getDataStore(map) != null;
                }
                catch (final IOException e) {
                    LOGGER.warn(
                            "Cannot read file as GeoTools data store",
                            e);
                }
                return false;
            }
        })) {
            baseNameToExpectedResultURL.put(
                    FilenameUtils.getBaseName(
                            file.getName()).replaceAll(
                            "_filtered",
                            ""),
                    file.toURI().toURL());
        }
        // Every GPX input file must have a matching expected-results file.
        for (final String filename : gpxInputDir.list(new FilenameFilter() {
            @Override
            public boolean accept(
                    final File dir,
                    final String name ) {
                return FilenameUtils.isExtension(
                        name,
                        new GpxIngestPlugin().getFileExtensionFilters());
            }
        })) {
            final URL url = baseNameToExpectedResultURL.get(FilenameUtils.getBaseName(filename));
            Assert.assertNotNull(url);
            expectedResultsResources.add(url);
        }
        final ExpectedResults expectedResults = TestUtils.getExpectedResults(expectedResultsResources
                .toArray(new URL[expectedResultsResources.size()]));
        runTestJob(
                expectedResults,
                TestUtils.resourceToQuery(new File(
                        GENERAL_GPX_FILTER_FILE).toURI().toURL()),
                null,
                null);
    }

    /**
     * Ingests the OSM GPX test set into all indices and verifies the
     * map-reduce query path for several adapter/index combinations: each
     * adapter individually, the first two together, all adapters with the
     * spatial-temporal index, and finally with no constraints at all.
     * Also exercises MR export + re-ingest before the query checks.
     */
    @Test
    public void testIngestOsmGpxMultipleIndices()
            throws Exception {
        TestUtils.deleteAll(dataStorePluginOptions);
        // ingest the data set into multiple indices and then try several query
        // methods, by adapter and by index
        MapReduceTestUtils.testMapReduceIngest(
                dataStorePluginOptions,
                DimensionalityType.ALL,
                OSM_GPX_INPUT_DIR);
        final WritableDataAdapter<SimpleFeature>[] adapters = new GpxIngestPlugin().getDataAdapters(null);
        final mil.nga.giat.geowave.core.store.DataStore geowaveStore = dataStorePluginOptions.createDataStore();
        // Capture per-adapter expected results directly from the store before
        // running any map-reduce jobs, so the jobs can be checked against them.
        final Map<ByteArrayId, ExpectedResults> adapterIdToResultsMap = new HashMap<ByteArrayId, ExpectedResults>();
        for (final WritableDataAdapter<SimpleFeature> adapter : adapters) {
            adapterIdToResultsMap.put(
                    adapter.getAdapterId(),
                    TestUtils.getExpectedResults(geowaveStore.query(
                            new QueryOptions(
                                    adapter),
                            new EverythingQuery())));
        }
        final List<DataAdapter<?>> firstTwoAdapters = new ArrayList<DataAdapter<?>>();
        firstTwoAdapters.add(adapters[0]);
        firstTwoAdapters.add(adapters[1]);
        final ExpectedResults firstTwoAdaptersResults = TestUtils.getExpectedResults(geowaveStore.query(
                new QueryOptions(
                        firstTwoAdapters),
                new EverythingQuery()));
        final ExpectedResults fullDataSetResults = TestUtils.getExpectedResults(geowaveStore.query(
                new QueryOptions(),
                new EverythingQuery()));
        // just for sanity verify its greater than 0 (ie. that data was actually
        // ingested in the first place)
        Assert.assertTrue(
                "There is no data ingested from OSM GPX test files",
                fullDataSetResults.count > 0);
        // now that we have expected results, run map-reduce export and
        // re-ingest it
        testMapReduceExportAndReingest(DimensionalityType.ALL);
        // first try each adapter individually
        for (final WritableDataAdapter<SimpleFeature> adapter : adapters) {
            runTestJob(
                    adapterIdToResultsMap.get(adapter.getAdapterId()),
                    null,
                    new DataAdapter[] {
                        adapter
                    },
                    null);
        }
        // then try the first 2 adapters, and may as well try with both indices
        // set (should be the default behavior anyways)
        runTestJob(
                firstTwoAdaptersResults,
                null,
                new DataAdapter[] {
                    adapters[0],
                    adapters[1]
                },
                null);
        // now try all adapters and the spatial temporal index, the result
        // should be the full data set
        runTestJob(
                fullDataSetResults,
                null,
                adapters,
                TestUtils.DEFAULT_SPATIAL_TEMPORAL_INDEX);
        // and finally run with nothing set, should be the full data set
        runTestJob(
                fullDataSetResults,
                null,
                null,
                null);
    }

    /**
     * Exports the current store contents to Avro via the vector MR export
     * command, then wipes the store and re-ingests from the exported files.
     *
     * @param dimensionalityType dimensionality for the re-ingest
     *            (NOTE(review): the re-ingest below hard-codes
     *            DimensionalityType.ALL rather than using this parameter —
     *            parameter kept for interface compatibility)
     * @throws Exception if either the export job or the re-ingest fails
     */
    private void testMapReduceExportAndReingest(
            final DimensionalityType dimensionalityType )
            throws Exception {
        final VectorMRExportCommand exportCommand = new VectorMRExportCommand();
        final VectorMRExportOptions options = exportCommand.getMrOptions();
        exportCommand.setStoreOptions(dataStorePluginOptions);
        final MapReduceTestEnvironment env = MapReduceTestEnvironment.getInstance();
        final String exportPath = env.getHdfsBaseDirectory() + "/" + TEST_EXPORT_DIRECTORY;
        final File exportDir = new File(
                exportPath.replace(
                        "file:",
                        ""));
        if (exportDir.exists()) {
            // The directory can be transiently locked (e.g. by a previous job
            // still releasing handles), so retry the delete a few times.
            boolean deleted = false;
            int attempts = 5;
            while (!deleted && (attempts-- > 0)) {
                try {
                    FileUtils.deleteDirectory(exportDir);
                    deleted = true;
                }
                catch (final Exception e) {
                    // Pass the throwable to the logger so the stack trace is
                    // preserved (string concatenation would discard it).
                    LOGGER.error(
                            "Export directory not deleted, trying again in 10s: ",
                            e);
                    Thread.sleep(10000);
                }
            }
        }
        exportCommand.setParameters(
                env.getHdfs(),
                exportPath,
                null);
        options.setBatchSize(10000);
        options.setMinSplits(MapReduceTestUtils.MIN_INPUT_SPLITS);
        options.setMaxSplits(MapReduceTestUtils.MAX_INPUT_SPLITS);
        options.setResourceManagerHostPort(env.getJobtracker());
        final Configuration conf = MapReduceTestUtils.getConfiguration();
        MapReduceTestUtils.filterConfiguration(conf);
        final int res = ToolRunner.run(
                conf,
                exportCommand.createRunner(new ManualOperationParams()),
                new String[] {});
        Assert.assertTrue(
                "Export Vector Data map reduce job failed",
                res == 0);
        TestUtils.deleteAll(dataStorePluginOptions);
        // Re-ingest the exported Avro files from the local HDFS-backed temp
        // directory.
        MapReduceTestUtils.testMapReduceIngest(
                dataStorePluginOptions,
                DimensionalityType.ALL,
                "avro",
                TestUtils.TEMP_DIR + File.separator + MapReduceTestEnvironment.HDFS_BASE_DIRECTORY + File.separator
                        + TEST_EXPORT_DIRECTORY);
    }

    /**
     * Runs the dedupe + verification job pair and asserts the counters match
     * the expected results.
     *
     * @param expectedResults counts and centroid hashes to verify against
     * @param query optional query constraint (null for everything)
     * @param adapters optional adapter constraint (null/empty for all)
     * @param index optional index constraint (null for default)
     * @throws Exception if job setup or execution fails
     */
    @SuppressFBWarnings(value = "DM_GC", justification = "Memory usage kept low for travis-ci")
    private void runTestJob(
            final ExpectedResults expectedResults,
            final DistributableQuery query,
            final DataAdapter<?>[] adapters,
            final PrimaryIndex index )
            throws Exception {
        final TestJobRunner jobRunner = new TestJobRunner(
                dataStorePluginOptions,
                expectedResults);
        jobRunner.setMinInputSplits(MapReduceTestUtils.MIN_INPUT_SPLITS);
        jobRunner.setMaxInputSplits(MapReduceTestUtils.MAX_INPUT_SPLITS);
        if (query != null) {
            jobRunner.setQuery(query);
        }
        final QueryOptions options = new QueryOptions();
        if ((adapters != null) && (adapters.length > 0)) {
            options.setAdapters(Arrays.asList(adapters));
        }
        if ((index != null)) {
            options.setIndex(index);
        }
        jobRunner.setQueryOptions(options);
        final Configuration conf = MapReduceTestUtils.getConfiguration();
        MapReduceTestUtils.filterConfiguration(conf);
        final int res = ToolRunner.run(
                conf,
                jobRunner,
                new String[] {});
        Assert.assertEquals(
                0,
                res);
        // for travis-ci to run, we want to limit the memory consumption
        System.gc();
    }

    /**
     * Dedupe job runner that chains a second verification job: it serializes
     * the expected centroid hashes into the job configuration, reads the
     * dedupe job's sequence-file output, and asserts the
     * {@link ResultCounterType} counters emitted by
     * {@link VerifyExpectedResultsMapper}.
     */
    private static class TestJobRunner extends
            GeoWaveDedupeJobRunner
    {
        private final ExpectedResults expectedResults;

        public TestJobRunner(
                final DataStorePluginOptions pluginOptions,
                final ExpectedResults expectedResults ) {
            super(
                    pluginOptions);
            this.expectedResults = expectedResults;
        }

        @Override
        protected String getHdfsOutputBase() {
            return MapReduceTestEnvironment.getInstance().getHdfsBaseDirectory();
        }

        @Override
        public int runJob()
                throws Exception {
            final boolean job1Success = (super.runJob() == 0);
            Assert.assertTrue(job1Success);
            // after the first job there should be a sequence file with the
            // filtered results which should match the expected results
            // resources
            final Job job = Job.getInstance(super.getConf());
            final Configuration conf = job.getConfiguration();
            MapReduceTestUtils.filterConfiguration(conf);
            // Encode the expected centroid hashes as [count:int][hash:long]*
            // and pass them to the mapper through the configuration.
            final ByteBuffer buf = ByteBuffer.allocate((8 * expectedResults.hashedCentroids.size()) + 4);
            buf.putInt(expectedResults.hashedCentroids.size());
            for (final Long hashedCentroid : expectedResults.hashedCentroids) {
                buf.putLong(hashedCentroid);
            }
            conf.set(
                    MapReduceTestUtils.EXPECTED_RESULTS_KEY,
                    ByteArrayUtils.byteArrayToString(buf.array()));
            GeoWaveInputFormat.setStoreOptions(
                    conf,
                    dataStoreOptions);
            job.setJarByClass(this.getClass());
            job.setJobName("GeoWave Test (" + dataStoreOptions.getGeowaveNamespace() + ")");
            job.setInputFormatClass(SequenceFileInputFormat.class);
            job.setMapperClass(VerifyExpectedResultsMapper.class);
            job.setMapOutputKeyClass(NullWritable.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setOutputFormatClass(NullOutputFormat.class);
            job.setNumReduceTasks(0);
            // Speculative execution would double-count features in the
            // counters, so it must be off for this verification job.
            job.setSpeculativeExecution(false);
            FileInputFormat.setInputPaths(
                    job,
                    getHdfsOutputPath());
            final boolean job2success = job.waitForCompletion(true);
            final Counters jobCounters = job.getCounters();
            final Counter expectedCnt = jobCounters.findCounter(ResultCounterType.EXPECTED);
            Assert.assertNotNull(expectedCnt);
            Assert.assertEquals(
                    expectedResults.count,
                    expectedCnt.getValue());
            // ERROR/UNEXPECTED counters may not exist if never incremented;
            // if present they must be zero.
            final Counter errorCnt = jobCounters.findCounter(ResultCounterType.ERROR);
            if (errorCnt != null) {
                Assert.assertEquals(
                        0L,
                        errorCnt.getValue());
            }
            final Counter unexpectedCnt = jobCounters.findCounter(ResultCounterType.UNEXPECTED);
            if (unexpectedCnt != null) {
                Assert.assertEquals(
                        0L,
                        unexpectedCnt.getValue());
            }
            return job2success ? 0 : 1;
        }
    }

    /**
     * Mapper that classifies each input feature by hashing its geometry
     * centroid and checking membership in the expected hash set (decoded from
     * the job configuration in {@link #setup}). Emits no output records; the
     * result is carried entirely in the {@link ResultCounterType} counters.
     */
    private static class VerifyExpectedResultsMapper extends
            GeoWaveWritableInputMapper<NullWritable, NullWritable>
    {
        private Set<Long> expectedHashedCentroids = new HashSet<Long>();

        @Override
        protected void mapNativeValue(
                final GeoWaveInputKey key,
                final Object value,
                final Mapper<GeoWaveInputKey, ObjectWritable, NullWritable, NullWritable>.Context context )
                throws IOException,
                InterruptedException {
            // Default to ERROR: non-feature values and empty geometries are
            // both treated as failures.
            ResultCounterType resultType = ResultCounterType.ERROR;
            if (value instanceof SimpleFeature) {
                final SimpleFeature result = (SimpleFeature) value;
                final Geometry geometry = (Geometry) result.getDefaultGeometry();
                if (!geometry.isEmpty()) {
                    resultType = expectedHashedCentroids.contains(TestUtils.hashCentroid(geometry)) ? ResultCounterType.EXPECTED
                            : ResultCounterType.UNEXPECTED;
                }
            }
            context.getCounter(
                    resultType).increment(
                    1);
        }

        @Override
        protected void setup(
                final Mapper<GeoWaveInputKey, ObjectWritable, NullWritable, NullWritable>.Context context )
                throws IOException,
                InterruptedException {
            super.setup(context);
            final Configuration config = GeoWaveConfiguratorBase.getConfiguration(context);
            final String expectedResults = config.get(MapReduceTestUtils.EXPECTED_RESULTS_KEY);
            if (expectedResults != null) {
                expectedHashedCentroids = new HashSet<Long>();
                // Decode the [count:int][hash:long]* layout written by
                // TestJobRunner.runJob().
                final byte[] expectedResultsBinary = ByteArrayUtils.byteArrayFromString(expectedResults);
                final ByteBuffer buf = ByteBuffer.wrap(expectedResultsBinary);
                final int count = buf.getInt();
                for (int i = 0; i < count; i++) {
                    expectedHashedCentroids.add(buf.getLong());
                }
            }
        }
    }
}