package io.lumify.wikipedia.mapreduce;
import com.google.inject.Inject;
import io.lumify.core.mapreduce.LumifyMRBase;
import io.lumify.core.model.ontology.Concept;
import io.lumify.core.model.ontology.OntologyRepository;
import io.lumify.core.util.LumifyLogger;
import io.lumify.core.util.LumifyLoggerFactory;
import io.lumify.wikipedia.WikipediaConstants;
import org.apache.accumulo.core.data.Mutation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.securegraph.accumulo.mapreduce.AccumuloElementOutputFormat;
import java.io.File;
/**
 * Driver for the "wikipediaImport" MapReduce job.
 *
 * <p>The job is map-only (zero reduce tasks): it reads its input with
 * {@link TextInputFormat}, runs {@link ImportMRMapper} over it, and writes
 * through {@link AccumuloElementOutputFormat}. Before the job is configured,
 * the ontology is checked for the Wikipedia page concept and the
 * page-to-page internal link relationship.
 *
 * <p>Command line: exactly one argument, the input file name.
 */
public class ImportMR extends LumifyMRBase {
    private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(ImportMR.class);

    /** MIME type constant; not referenced inside this class. */
    public static final String WIKIPEDIA_MIME_TYPE = "text/plain";

    /**
     * Fully qualified name of this class; not referenced inside this class.
     * NOTE(review): presumably used elsewhere as a multi-value property key - confirm.
     */
    public static final String MULTI_VALUE_KEY = ImportMR.class.getName();

    /** Configuration key that carries the input file name from {@code parseArgs} to {@code setupJob}. */
    private static final String CONFIG_INPUT_PATH = "in";

    private OntologyRepository ontologyRepository;

    /**
     * Receives the ontology repository via injection.
     *
     * @param ontologyRepository repository used by {@code setupJob} to verify the ontology
     */
    @Inject
    public void setOntologyRepository(OntologyRepository ontologyRepository) {
        this.ontologyRepository = ontologyRepository;
    }

    /**
     * Command-line entry point: runs this tool through {@link ToolRunner} and
     * exits the JVM with the status code it returns.
     *
     * @param args command-line arguments, forwarded unchanged to the tool
     * @throws Exception whatever {@link ToolRunner#run} propagates
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new ImportMR(), args));
    }

    /**
     * @return the fixed job name, {@code "wikipediaImport"}
     */
    @Override
    protected String getJobName() {
        return "wikipediaImport";
    }

    /**
     * Validates the command line and copies the input location into the job
     * configuration.
     *
     * <p>The full input file name is stored under the {@code "in"} key, and
     * only its last path segment is stored under
     * {@link ImportMRMapper#CONFIG_SOURCE_FILE_NAME}.
     *
     * @param conf job configuration to populate
     * @param args command-line arguments; must contain exactly one element
     * @throws RuntimeException if the number of arguments is not exactly one
     */
    @Override
    protected void parseArgs(JobConf conf, String[] args) {
        final boolean hasSingleArgument = args.length == 1;
        if (!hasSingleArgument) {
            throw new RuntimeException("Required arguments <inputFileName>");
        }

        final String inputFileName = args[0];
        LOGGER.info("inFileName: %s", inputFileName);

        conf.set(CONFIG_INPUT_PATH, inputFileName);
        // Only the bare file name (no parent directories) is passed on as the source file name.
        final String sourceFileName = new File(inputFileName).getName();
        conf.set(ImportMRMapper.CONFIG_SOURCE_FILE_NAME, sourceFileName);
    }

    /**
     * Verifies the required ontology entries and then wires up the job:
     * mapper, input/output formats, zero reducers and the input path taken
     * from the {@code "in"} configuration value.
     *
     * @param job the job to configure
     * @throws Exception if an ontology entry is missing or configuration fails
     */
    @Override
    protected void setupJob(Job job) throws Exception {
        // Fail before touching the job if the ontology is incomplete.
        verifyWikipediaPageConcept(ontologyRepository);
        verifyWikipediaPageInternalLinkWikipediaPageRelationship(ontologyRepository);

        // Map side: no reducers are scheduled, the mapper emits Mutation values.
        job.setJarByClass(ImportMR.class);
        job.setMapperClass(ImportMRMapper.class);
        job.setMapOutputValueClass(Mutation.class);
        job.setNumReduceTasks(0);

        // Input and output formats.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(AccumuloElementOutputFormat.class);

        final Path inputPath = new Path(getConf().get(CONFIG_INPUT_PATH));
        FileInputFormat.addInputPath(job, inputPath);
    }

    /**
     * Ensures the Wikipedia page concept is present in the ontology.
     *
     * @param repository ontology repository to query
     * @throws RuntimeException if the concept lookup returns {@code null}
     */
    private void verifyWikipediaPageConcept(OntologyRepository repository) {
        final Concept pageConcept = repository.getConceptByIRI(WikipediaConstants.WIKIPEDIA_PAGE_CONCEPT_URI);
        if (pageConcept != null) {
            return;
        }
        throw new RuntimeException(WikipediaConstants.WIKIPEDIA_PAGE_CONCEPT_URI + " concept not found");
    }

    /**
     * Ensures the page-to-page internal link relationship is present in the ontology.
     *
     * @param repository ontology repository to query
     * @throws RuntimeException if the repository reports the relationship as absent
     */
    private void verifyWikipediaPageInternalLinkWikipediaPageRelationship(OntologyRepository repository) {
        final String relationshipIri = WikipediaConstants.WIKIPEDIA_PAGE_INTERNAL_LINK_WIKIPEDIA_PAGE_CONCEPT_URI;
        if (repository.hasRelationshipByIRI(relationshipIri)) {
            return;
        }
        throw new RuntimeException(relationshipIri + " relationship not found");
    }
}