/* * This file is part of CoAnSys project. * Copyright (c) 2012-2015 ICM-UW * * CoAnSys is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * CoAnSys is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>. */ package pl.edu.icm.coansys.nlmextraction; import java.io.IOException; import java.io.InputStream; import java.util.Date; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.jdom.Element; import org.jdom.output.XMLOutputter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.protobuf.ByteString; import com.itextpdf.text.ExceptionConverter; import pl.edu.icm.cermine.PdfNLMContentExtractor; import pl.edu.icm.cermine.exception.AnalysisException; import pl.edu.icm.coansys.models.DocumentProtos; import pl.edu.icm.coansys.models.DocumentProtos.DocumentWrapper; import pl.edu.icm.coansys.models.DocumentProtos.Media; import pl.edu.icm.coansys.models.DocumentProtos.MediaContainer; import pl.edu.icm.coansys.models.constants.ProtoConstants; /** * * @author Artur Czeczko <a.czeczko@icm.edu.pl> */ public class NLMExtractionJob implements Tool { private static final String MAX_PDF_SIZE = "MAX_PDF_SIZE"; private static final Logger logger = LoggerFactory.getLogger(NLMExtractionJob.class); private Configuration conf; public static class ExtractMap extends Mapper<Writable, BytesWritable, Text, BytesWritable> { private long maxPdfSize = 0; @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); String maxPdfSizeString = conf.get(MAX_PDF_SIZE); if (maxPdfSizeString != null) { try { maxPdfSize = (long) (Float.parseFloat(maxPdfSizeString) * 1024 * 1024); } catch (NumberFormatException ex) { throw new IOException(maxPdfSizeString + " is not a valid max PDF size", ex); } } } @Override protected void map(Writable key, BytesWritable value, Context context) throws IOException, InterruptedException { DocumentWrapper docWrapper = DocumentProtos.DocumentWrapper.parseFrom(value.copyBytes()); MediaContainer mediaContainer = docWrapper.getMediaContainer(); for (Media media : mediaContainer.getMediaList()) { logger.info("Processing file " + media.getSourcePath()); if (ProtoConstants.mediaTypePdf.equals(media.getMediaType())) { long fileSize; if (media.hasSourceFilesize()) { fileSize = media.getSourceFilesize(); } else { logger.warn("Source file size is not set in " + media.getKey() + ", using getSerializedSize() method"); fileSize = media.getSerializedSize(); } if (maxPdfSize > 0 && fileSize <= maxPdfSize) { InputStream pdfIS = media.getContent().newInput(); try { //PdfNLMContentExtractor.THREADS_NUMBER = 1; PdfNLMContentExtractor nlmExtr = new PdfNLMContentExtractor(); Element nlmContent = nlmExtr.extractContent(pdfIS); XMLOutputter outp = new XMLOutputter(); String nlmString = outp.outputString(nlmContent); Media.Builder nlmMediaBuilder = Media.newBuilder(); nlmMediaBuilder.setCollection(media.getCollection()); nlmMediaBuilder.setKey(media.getKey()); nlmMediaBuilder.setSourceFilesize(nlmString.length()); nlmMediaBuilder.setContent(ByteString.copyFromUtf8(nlmString)); nlmMediaBuilder.setMediaType(ProtoConstants.mediaTypeNlm); DocumentProtos.ProvenanceInfo.Builder provenanceBuilder = DocumentProtos.ProvenanceInfo.newBuilder(); DocumentProtos.ProvenanceInfo.SingleProvenanceInfo.Builder signleProvenance = DocumentProtos.ProvenanceInfo.SingleProvenanceInfo.newBuilder(); signleProvenance.setLastModificationDate(new Date().getTime()); signleProvenance.setLastModificationMarkerId("Coansys NLM extraction (CERMINE)"); provenanceBuilder.setCurrentProvenance(signleProvenance); nlmMediaBuilder.setProvenance(provenanceBuilder); context.write(new Text(media.getKey()), new BytesWritable(nlmMediaBuilder.build().toByteArray())); } catch (AnalysisException ex) { logger.warn("cannot process PDF " + media.getSourcePath(), ex); } catch (ExceptionConverter ex) { logger.warn("cannot process PDF (unknown colorspace?) " + media.getSourcePath(), ex); } } logger.info("Finished " + media.getSourcePath()); } else { logger.info("File " + media.getSourcePath() + " is not a PDF"); } } } } @Override public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException { if (args.length < 3) { logger.error("Usage: NLMExtractionJob <input_seqfile> <output_dir> <max_PDF_size>"); logger.error(" (max_PDF_size -- size in MB; greater files will be ignored)"); return 1; } conf.set(MAX_PDF_SIZE, args[2]); Job job = new Job(conf); job.setJarByClass(NLMExtractionJob.class); job.setInputFormatClass(SequenceFileInputFormat.class); SequenceFileInputFormat.addInputPath(job, new Path(args[0])); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BytesWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BytesWritable.class); SequenceFileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(ExtractMap.class); job.setNumReduceTasks(0); /* * Launch job */ boolean success = job.waitForCompletion(true); return success ? 0 : 1; } @Override public void setConf(Configuration conf) { this.conf = conf; conf.set("dfs.client.socket-timeout", "70000"); } @Override public Configuration getConf() { return conf; } public static void main(String[] args) throws Exception { System.exit(ToolRunner.run(new NLMExtractionJob(), args)); } }