package dk.statsbiblioteket.medieplatform.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.statsbiblioteket.medieplatform.autonomous.Batch;
import dk.statsbiblioteket.medieplatform.autonomous.ConfigConstants;
import dk.statsbiblioteket.medieplatform.autonomous.ResultCollector;
import dk.statsbiblioteket.medieplatform.autonomous.TreeProcessorAbstractRunnableComponent;
import dk.statsbiblioteket.util.xml.XSLT;

import javax.xml.transform.TransformerException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.PrivilegedExceptionAction;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/**
 * This is the abstract Hadoop runnable component. It is meant for the more common Hadoop tasks that the
 * autonomous components must perform.
 *
 * The doWorkOnItem method is implemented here; subclasses supply the Hadoop job to run by implementing
 * the abstract getTool() method.
 */
public abstract class AbstractHadoopRunnableComponent extends TreeProcessorAbstractRunnableComponent {
    private static final Logger log = LoggerFactory.getLogger(AbstractHadoopRunnableComponent.class);

    /**
     * Constructor matching super. The superclass requires the properties in order to initialise the tree
     * iterator, if needed. If you do not need the tree iterator, the properties can be ignored.
     *
     * You can also use the properties for your own configuration.
     *
     * @param properties the component properties
     *
     * @see #getProperties()
     */
    public AbstractHadoopRunnableComponent(Properties properties) {
        super(properties);
    }

    /**
     * @return the Hadoop tool (job) to run on each batch
     */
    protected abstract Tool getTool();

    @Override
    public void doWorkOnItem(Batch batch, ResultCollector resultCollector) throws Exception {
        runTool(getTool(), batch, resultCollector);
    }

    private void runTool(Tool tool, Batch batch, ResultCollector resultCollector) throws
            IOException, InterruptedException, TransformerException {
        //create the input as a file on the cluster
        Configuration conf = new Configuration();
        getProperties().setProperty(ConfigConstants.ITERATOR_USE_FILESYSTEM, "False");
        //make all component properties available to the Hadoop job through its configuration
        propertiesToHadoopConfiguration(conf, getProperties());
        conf.set(ConfigConstants.BATCH_ID, batch.getFullID());
        String user = conf.get(ConfigConstants.HADOOP_USER, "newspapr");
        conf.set(MRConfig.FRAMEWORK_NAME, MRConfig.YARN_FRAMEWORK_NAME);

        FileSystem fs = FileSystem.get(FileSystem.getDefaultUri(conf), conf, user);
        long time = System.currentTimeMillis();
        String jobFolder = getProperties().getProperty(ConfigConstants.JOB_FOLDER);
        Path inputFile = createInputFile(batch, fs, time, jobFolder);
        Path outDir = new Path(jobFolder, "output_" + batch.getFullID() + "_" + time);

        runJob(tool, batch, resultCollector, conf, inputFile, outDir, user);
    }

    private Path createInputFile(Batch batch, FileSystem fs, long time, String jobFolder) throws
            IOException, TransformerException {
        Path inputFile = new Path(jobFolder, "input_" + batch.getFullID() + "_" + time + "_files.txt");
        //make file list stream from batch structure
        fs.mkdirs(inputFile.getParent());
        FSDataOutputStream fileoutStream = fs.create(inputFile);
        buildFileList(batch, fileoutStream);
        fileoutStream.close();
        return inputFile;
    }

    /**
     * Copy every property verbatim into the Hadoop configuration.
     */
    private void propertiesToHadoopConfiguration(Configuration conf, Properties properties) {
        for (Map.Entry<Object, Object> entry : properties.entrySet()) {
            conf.set(entry.getKey().toString(), entry.getValue().toString());
        }
    }

    private void runJob(final Tool job, final Batch batch, final ResultCollector resultCollector,
                        final Configuration conf, final Path inputFile, final Path outDir,
                        String username) throws IOException, InterruptedException {
        //upload job to cluster if not already present
        //execute job on file
        UserGroupInformation ugi = UserGroupInformation.createRemoteUser(username);
        ugi.doAs(new PrivilegedExceptionAction<ResultCollector>() {
            public ResultCollector run() throws Exception {
                job.setConf(conf);
                try {
                    int result = ToolRunner.run(conf, job,
                            new String[]{inputFile.toString(), outDir.toString()});
                    if (result != 0) {
                        resultCollector.addFailure(batch.getFullID(), "jp2file", getClass().getName(),
                                "Failed to run on this batch");
                    }
                } catch (Exception e) {
                    resultCollector.addFailure(batch.getFullID(), "exception", getClass().getName(),
                            e.toString());
                }
                return resultCollector;
            }
        });
    }

    /**
     * Transform the batch structure from DOMS into the list of file names that the Hadoop job uses as input.
     */
    private void buildFileList(Batch batch, OutputStream outputStream) throws IOException, TransformerException {
        InputStream structure;
        try {
            structure = retrieveBatchStructure(batch);
        } catch (NullPointerException e) {
            throw new IOException("The batch '" + batch.getFullID() + "' was not found in DOMS");
        }
        if (structure == null) {
            throw new IOException("The structure for the batch '" + batch.getFullID() + "' is not available");
        }
        HashMap<String, String> params = new HashMap<String, String>();
        params.put(ConfigConstants.PREFIX, getProperties().getProperty(ConfigConstants.PREFIX));
        XSLT.transform(
                Thread.currentThread().getContextClassLoader().getResource("fileNamesFromStructure.xslt"),
                structure, outputStream, params);
    }
}
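
/*
 * Usage sketch (illustrative only, not part of the production code): a concrete component extends this
 * class, passes its properties to the super constructor and returns the Hadoop Tool to run per batch
 * from getTool(). The class name and the inline Tool below are assumptions for illustration; a real
 * component returns its actual map/reduce driver Tool, and depending on the autonomous framework
 * version further abstract methods (such as getEventID()) may also have to be implemented.
 *
 * class ExampleHadoopComponent extends AbstractHadoopRunnableComponent {
 *
 *     public ExampleHadoopComponent(Properties properties) {
 *         super(properties);
 *     }
 *
 *     protected Tool getTool() {
 *         return new Tool() {
 *             private Configuration conf;
 *
 *             public void setConf(Configuration conf) { this.conf = conf; }
 *
 *             public Configuration getConf() { return conf; }
 *
 *             public int run(String[] args) throws Exception {
 *                 // args[0] is the file list written by createInputFile, args[1] the output directory.
 *                 // A real Tool would configure and submit its map/reduce job here; 0 means success.
 *                 return 0;
 *             }
 *         };
 *     }
 * }
 */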