// Copyright 2017 JanusGraph Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package org.janusgraph.hadoop.scan;

import com.google.common.base.Preconditions;
import org.janusgraph.diskstorage.Entry;
import org.janusgraph.diskstorage.EntryList;
import org.janusgraph.diskstorage.StaticBuffer;
import org.janusgraph.diskstorage.configuration.ConfigNamespace;
import org.janusgraph.diskstorage.configuration.Configuration;
import org.janusgraph.diskstorage.configuration.ModifiableConfiguration;
import org.janusgraph.diskstorage.keycolumnvalue.SliceQuery;
import org.janusgraph.diskstorage.keycolumnvalue.scan.ScanJob;
import org.janusgraph.diskstorage.util.BufferUtil;
import org.janusgraph.diskstorage.util.EntryArrayList;
import org.janusgraph.hadoop.config.JanusGraphHadoopConfiguration;
import org.janusgraph.hadoop.config.ModifiableHadoopConfiguration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;

import static org.janusgraph.hadoop.compat.HadoopCompatLoader.DEFAULT_COMPAT;

/**
 * Run a {@link org.janusgraph.diskstorage.keycolumnvalue.scan.ScanJob}
 * via a Hadoop {@link org.apache.hadoop.mapreduce.Mapper} over the edgestore.
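 * <p>
 * The lifecycle mirrors the {@link org.janusgraph.diskstorage.keycolumnvalue.scan.ScanJob}
 * worker contract: {@code setup} deserializes the job and its configuration from the Hadoop
 * context and calls {@code workerIterationStart}, {@code map} applies the job's key filter
 * and slice queries to each row and invokes {@code process} on matching rows, and
 * {@code cleanup} calls {@code workerIterationEnd}.
 * <p>
 * For illustration only, a driver might wire this mapper into a mapper-only job roughly as
 * follows (a sketch, not the project's actual runner; the input format named below is
 * hypothetical and depends on the backing store):
 * <pre>{@code
 * Job job = Job.getInstance(hadoopConf);
 * job.setMapperClass(HadoopScanMapper.class);
 * job.setNumReduceTasks(0);                              // map-only scan
 * job.setOutputFormatClass(NullOutputFormat.class);      // mapper emits nothing
 * job.setInputFormatClass(SomeEdgestoreInputFormat.class); // hypothetical input format
 * job.waitForCompletion(true);
 * }</pre>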
 */
public class HadoopScanMapper extends Mapper<StaticBuffer, Iterable<Entry>, NullWritable, NullWritable> {

    private static final Logger log = LoggerFactory.getLogger(HadoopScanMapper.class);

    protected ScanJob job;
    protected HadoopContextScanMetrics metrics;
    protected org.janusgraph.diskstorage.configuration.Configuration jobConf;
    private Predicate<StaticBuffer> keyFilter;
    private SliceQuery initialQuery;
    private List<SliceQuery> subsequentQueries;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        super.setup(context);
        org.apache.hadoop.conf.Configuration hadoopConf = DEFAULT_COMPAT.getContextConfiguration(context);
        ModifiableHadoopConfiguration scanConf =
                ModifiableHadoopConfiguration.of(JanusGraphHadoopConfiguration.MAPRED_NS, hadoopConf);
        job = getJob(scanConf);
        metrics = new HadoopContextScanMetrics(context);
        Configuration graphConf = getJanusGraphConfiguration(context);
        finishSetup(scanConf, graphConf);
    }

    protected void finishSetup(ModifiableHadoopConfiguration scanConf, Configuration graphConf) {
        jobConf = getJobConfiguration(scanConf);
        Preconditions.checkNotNull(metrics);
        // jobConf is allowed to be null for jobs that specify no configuration and no configuration root
        //Preconditions.checkNotNull(jobConf);
        Preconditions.checkNotNull(job);
        job.workerIterationStart(jobConf, graphConf, metrics);
        keyFilter = job.getKeyFilter();
        List<SliceQuery> sliceQueries = job.getQueries();
        Preconditions.checkArgument(null != sliceQueries, "Job cannot specify null query list");
        Preconditions.checkArgument(0 < sliceQueries.size(), "Job must specify at least one query");
        // Assign head of getQueries() to "initialQuery"
        initialQuery = sliceQueries.get(0);
        // Assign tail of getQueries() to "subsequentQueries"
        subsequentQueries = new ArrayList<>(sliceQueries.subList(1, sliceQueries.size()));
        Preconditions.checkState(sliceQueries.size() == subsequentQueries.size() + 1);
        Preconditions.checkNotNull(initialQuery);

        if (0 < subsequentQueries.size()) {
            // If multiple queries exist, the first query is assumed to be the grounding query:
            // it must span the full column range, so that every subsequent query's result is a
            // subset of the columns already fetched for the row (see the worked example
            // preceding cleanup()).
            StaticBuffer start = initialQuery.getSliceStart();
            Preconditions.checkArgument(start.equals(BufferUtil.zeroBuffer(1)),
                    "Expected start of first query to be all 0s: %s", start);
            StaticBuffer end = initialQuery.getSliceEnd();
            Preconditions.checkArgument(end.equals(BufferUtil.oneBuffer(end.length())),
                    "Expected end of first query to be all 1s: %s", end);
        }
    }

    @Override
    protected void map(StaticBuffer key, Iterable<Entry> values, Context context)
            throws IOException, InterruptedException {
        EntryArrayList al = EntryArrayList.of(values);

        // KeyFilter check
        if (!keyFilter.test(key)) {
            log.debug("Skipping key {} based on KeyFilter", key);
            return;
        }

        // InitialQuery check (at least one match is required, or else the key is ignored)
        EntryList initialQueryMatches = findEntriesMatchingQuery(initialQuery, al);
        if (0 == initialQueryMatches.size()) {
            log.debug("Skipping key {} based on InitialQuery ({}) match failure", key, initialQuery);
            return;
        }

        // Both conditions (KeyFilter && InitialQuery) for invoking process are satisfied.
        // Create the entries parameter to be passed into the process method.
        Map<SliceQuery, EntryList> matches = new HashMap<>();
        matches.put(initialQuery, initialQueryMatches);

        // Find matches (if any are present) for the noninitial queries
        for (SliceQuery sq : subsequentQueries) {
            matches.put(sq, findEntriesMatchingQuery(sq, al));
        }

        // Process
        job.process(key, matches, metrics);
    }
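    // A minimal worked example of the grounding-query assumption checked in finishSetup,
    // with illustrative byte values only. Given the job's queries
    //   q0 = slice [0x00, 0xFF)   <- grounding query: covers the full column range
    //   q1 = slice [0x10, 0x20)   <- subsequent query
    // every column q1 could match lies inside q0's range, so it is already present in the
    // Iterable<Entry> handed to map(). q1 is then answered locally by
    // findEntriesMatchingQuery() via binary search, without another trip to the backend.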
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        super.cleanup(context);
        job.workerIterationEnd(metrics);
    }

    /**
     * Return the sublist of {@code sortedEntries} whose columns fall inside {@code query}'s
     * slice bounds (start inclusive, end exclusive), truncated to the query's limit if one
     * is set. Both bounds are located by binary search, so this runs in O(log n) time plus
     * the cost of copying the matching range.
     */
    private EntryList findEntriesMatchingQuery(SliceQuery query, EntryList sortedEntries) {
        int lowestStartMatch = sortedEntries.size(); // Inclusive; sortedEntries.size() is the "no match" sentinel
        int highestEndMatch = -1; // Inclusive; -1 is the "no match" sentinel

        final StaticBuffer queryStart = query.getSliceStart();
        final StaticBuffer queryEnd = query.getSliceEnd();

        // Find the lowest matchStart s.t. query.getSliceStart() <= sortedEntries.get(matchStart)
        int low = 0;
        int high = sortedEntries.size() - 1;

        while (low <= high) {
            int mid = (low + high) >>> 1;
            Entry midVal = sortedEntries.get(mid);

            int cmpStart = queryStart.compareTo(midVal.getColumn());

            if (0 < cmpStart) {
                // query lower bound exceeds entry (no match)
                if (lowestStartMatch == mid + 1) {
                    // lowestStartMatch located
                    break;
                }
                // Move to a higher list index
                low = mid + 1;
            } else /* (0 >= cmpStart) */ {
                // entry equals or exceeds query lower bound (match, but not necessarily lowest match)
                if (mid < lowestStartMatch) {
                    lowestStartMatch = mid;
                }
                // Move to a lower list index
                high = mid - 1;
            }
        }

        // If lowestStartMatch is beyond the end of our list parameter, there cannot possibly be
        // any matches, so we can bypass the highestEndMatch search and just return an empty result.
        if (sortedEntries.size() == lowestStartMatch) {
            return EntryList.EMPTY_LIST;
        }

        // Find the highest matchEnd s.t. sortedEntries.get(matchEnd) < query.getSliceEnd()
        low = 0;
        high = sortedEntries.size() - 1;

        while (low <= high) {
            int mid = (low + high) >>> 1;
            Entry midVal = sortedEntries.get(mid);

            int cmpEnd = queryEnd.compareTo(midVal.getColumn());

            if (0 < cmpEnd) {
                // query upper bound exceeds entry (match, but not necessarily highest)
                if (mid > highestEndMatch) {
                    highestEndMatch = mid;
                }
                // Move to a higher list index
                low = mid + 1;
            } else /* (0 >= cmpEnd) */ {
                // entry equals or exceeds query upper bound (no match)
                if (highestEndMatch == mid - 1) {
                    // highestEndMatch located
                    break;
                }
                // Move to a lower list index
                high = mid - 1;
            }
        }

        if (0 <= highestEndMatch - lowestStartMatch) {
            // Return the sublist between the two indices (inclusive at both)
            int endIndex = highestEndMatch + 1; // This will be passed into subList, which interprets it exclusively
            if (query.hasLimit()) {
                endIndex = Math.min(endIndex, query.getLimit() + lowestStartMatch);
            }
            // TODO avoid unnecessary copy here
            return EntryArrayList.of(sortedEntries.subList(lowestStartMatch, endIndex /* exclusive */));
        } else {
            return EntryList.EMPTY_LIST;
        }
    }

    private ScanJob getJob(Configuration scanConf) {
        String jobClass = scanConf.get(JanusGraphHadoopConfiguration.SCAN_JOB_CLASS);

        try {
            return (ScanJob) Class.forName(jobClass).newInstance();
        } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    static ModifiableConfiguration getJanusGraphConfiguration(Context context) {
        org.apache.hadoop.conf.Configuration hadoopConf = DEFAULT_COMPAT.getContextConfiguration(context);
        return ModifiableHadoopConfiguration.of(JanusGraphHadoopConfiguration.MAPRED_NS, hadoopConf).getJanusGraphConf();
    }

    static Configuration getJobConfiguration(ModifiableHadoopConfiguration scanConf) {
        if (!scanConf.has(JanusGraphHadoopConfiguration.SCAN_JOB_CONFIG_ROOT)) {
            log.debug("No job configuration root provided");
            return null;
        }
        ConfigNamespace jobRoot = getJobRoot(scanConf.get(JanusGraphHadoopConfiguration.SCAN_JOB_CONFIG_ROOT));
        return ModifiableHadoopConfiguration.prefixView(jobRoot,
                JanusGraphHadoopConfiguration.SCAN_JOB_CONFIG_KEYS, scanConf);
    }
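    // The configuration root is expressed as "<class name>#<static field name>", where the
    // named public static field holds the job's ConfigNamespace. A hypothetical example value
    // (class and field names below are made up for illustration):
    //   org.example.MyScanJob#MY_JOB_NS
    // getJobRoot splits on '#', loads the class reflectively, and reads the static field.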
    static ConfigNamespace getJobRoot(String confRootName) {
        String[] tokens = confRootName.split("#");
        String className = tokens[0];
        String fieldName = tokens[1];

        try {
            Field f = Class.forName(className).getField(fieldName);
            return (ConfigNamespace) f.get(null);
        } catch (NoSuchFieldException | ClassNotFoundException | IllegalAccessException e) {
            throw new RuntimeException(e);
        }
    }
}