/*
* This file is part of Gradoop.
*
* Gradoop is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Gradoop is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Gradoop. If not, see <http://www.gnu.org/licenses/>.
*/
package org.gradoop.flink.io.impl.tlf;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.TextInputFormat;
import org.gradoop.flink.io.impl.tlf.functions.GraphTransactionFromTLFGraph;
import org.gradoop.flink.model.impl.GraphTransactions;
import org.gradoop.flink.model.impl.LogicalGraph;
import org.gradoop.flink.model.impl.operators.combination.ReduceCombination;
import org.gradoop.flink.util.GradoopFlinkConfig;
import org.gradoop.flink.io.api.DataSource;
import org.gradoop.flink.io.impl.tlf.functions.Dictionary;
import org.gradoop.flink.io.impl.tlf.functions.DictionaryEntry;
import org.gradoop.flink.io.impl.tlf.functions.EdgeLabelDecoder;
import org.gradoop.flink.io.impl.tlf.functions.TLFFileFormat;
import org.gradoop.flink.io.impl.tlf.functions.TLFGraphFromText;
import org.gradoop.flink.io.impl.tlf.functions.VertexLabelDecoder;
import org.gradoop.flink.io.impl.tlf.inputformats.TLFInputFormat;
import org.gradoop.flink.io.impl.tlf.tuples.TLFGraph;
import org.gradoop.flink.model.impl.GraphCollection;
import org.gradoop.flink.representation.transactional.GraphTransaction;
import java.io.IOException;
/**
 * Data source that reads graphs from one TLF file and offers them as graph
 * transactions, as a graph collection or combined into one logical graph.
 * The exact format is documented in {@link TLFFileFormat}.
 *
 * Optionally, a vertex and an edge dictionary file can be supplied. If the
 * base class reports a dictionary as present, it is read once in the
 * constructor and later broadcast to the label decoders.
 */
public class TLFDataSource extends TLFBase implements DataSource {

  /**
   * Creates a new data source that hands empty dictionary paths to the base
   * class. Paths can be local (file://) or HDFS (hdfs://).
   *
   * @param tlfPath tlf data file
   * @param config Gradoop Flink configuration
   */
  public TLFDataSource(String tlfPath, GradoopFlinkConfig config) {
    super(tlfPath, "", "", config);
  }

  /**
   * Creates a new data source. Paths can be local (file://) or HDFS (hdfs://).
   *
   * For each dictionary the base class reports as present, the dictionary
   * file is read line-wise, every line is mapped by {@link DictionaryEntry}
   * and all entries are reduced by {@link Dictionary}; the resulting dataset
   * is stored via the corresponding setter of the base class.
   *
   * @param tlfPath tlf data file
   * @param tlfVertexDictionaryPath tlf vertex dictionary file
   * @param tlfEdgeDictionaryPath tlf edge dictionary file
   * @param config Gradoop Flink configuration
   */
  public TLFDataSource(String tlfPath, String tlfVertexDictionaryPath,
    String tlfEdgeDictionaryPath, GradoopFlinkConfig config) {
    super(tlfPath, tlfVertexDictionaryPath, tlfEdgeDictionaryPath, config);
    ExecutionEnvironment env = config.getExecutionEnvironment();

    if (hasVertexDictionary()) {
      setVertexDictionary(env
        .readHadoopFile(new TextInputFormat(), LongWritable.class,
          Text.class, getTLFVertexDictionaryPath())
        .map(new DictionaryEntry())
        .reduceGroup(new Dictionary()));
    }

    if (hasEdgeDictionary()) {
      setEdgeDictionary(env
        .readHadoopFile(new TextInputFormat(), LongWritable.class,
          Text.class, getTLFEdgeDictionaryPath())
        .map(new DictionaryEntry())
        .reduceGroup(new Dictionary()));
    }
  }

  /**
   * Reads the TLF file as a graph collection and combines all of its graphs
   * into a single logical graph using {@link ReduceCombination}.
   *
   * @return logical graph combined from all graphs of the TLF file
   * @throws IOException propagated from {@link #getGraphCollection()}
   */
  @Override
  public LogicalGraph getLogicalGraph() throws IOException {
    return getGraphCollection().reduce(new ReduceCombination());
  }

  /**
   * Reads the TLF file as graph transactions and converts them into a graph
   * collection.
   *
   * @return graph collection built from the graph transactions
   * @throws IOException propagated from {@link #getGraphTransactions()}
   */
  @Override
  public GraphCollection getGraphCollection() throws IOException {
    return GraphCollection.fromTransactions(getGraphTransactions());
  }

  /**
   * Reads the TLF file and converts every TLF graph into a graph
   * transaction. If a vertex and/or edge dictionary is present, the integer
   * valued labels are mapped to the strings from the dictionary.
   *
   * @return graph transactions read from the TLF file
   * @throws IOException propagated from {@link #getTLFGraphs()}
   */
  @Override
  public GraphTransactions getGraphTransactions() throws IOException {
    // reuse the single place that knows how to load tlf graphs from file
    // and map them to transactions
    DataSet<GraphTransaction> transactions = getTLFGraphs()
      .map(new GraphTransactionFromTLFGraph(
        getConfig().getGraphHeadFactory(),
        getConfig().getVertexFactory(),
        getConfig().getEdgeFactory()));

    // map the integer valued labels to strings from dictionary
    if (hasVertexDictionary()) {
      transactions = transactions
        .map(new VertexLabelDecoder())
        .withBroadcastSet(
          getVertexDictionary(), VertexLabelDecoder.VERTEX_DICTIONARY);
    }
    if (hasEdgeDictionary()) {
      transactions = transactions
        .map(new EdgeLabelDecoder())
        .withBroadcastSet(
          getEdgeDictionary(), EdgeLabelDecoder.EDGE_DICTIONARY);
    }

    return new GraphTransactions(transactions, getConfig());
  }

  /**
   * Reads the input as dataset of TLFGraphs. The file at the TLF path is
   * read with {@link TLFInputFormat} and every record is converted by
   * {@link TLFGraphFromText}.
   *
   * @return tlf graphs
   * @throws IOException propagated from creating the Hadoop-based input
   */
  public DataSet<TLFGraph> getTLFGraphs() throws IOException {
    ExecutionEnvironment env = getConfig().getExecutionEnvironment();
    return env
      .readHadoopFile(
        new TLFInputFormat(), LongWritable.class, Text.class, getTLFPath())
      .map(new TLFGraphFromText());
  }
}