package edu.isi.bmkeg.lapdf.bin; import edu.isi.bmkeg.lapdf.uima.cpe.CommandLineFitPipeline; public class CustomBatchProcessingTool { private static final String OPERATION_PDFEX_BLOCKIFY = "blockify"; private static final String OPERATION_PDFEX_BLOCK_STATISTICS = "blockStatistics"; private static final String OPERATION_PDFEX_BLOCKIFY_CLASSIFY = "blockifyClassify"; private static final String OPERATION_FILTERED_LAYOUTAWARE_FULLTEXT_EXTRACTION = "extractFullText"; public static void main(String args[]) { CommandLineFitPipeline pipeline = null; if (args.length == 0) { printUsage(); System.exit(1); } String operationType = args[0]; String outputFolder = null; String inputFolder = null; String ruleFileLocation = null; Integer itemsToSkip = 0; Integer endIndex = 0; if (OPERATION_PDFEX_BLOCKIFY.equals(operationType)) { if (args.length == 4) { inputFolder = args[1]; outputFolder = args[1]; itemsToSkip = new Integer(args[2]); endIndex = new Integer(args[3]); }else if(args.length==5){ inputFolder = args[1]; outputFolder = args[2]; itemsToSkip = new Integer(args[3]); endIndex = new Integer(args[4]); }else{ printUsage(); System.exit(1); } try { pipeline = new CommandLineFitPipeline(inputFolder, true, false, outputFolder,itemsToSkip,endIndex); } catch (Exception e) { e.printStackTrace(); printUsage(); } } else if (OPERATION_PDFEX_BLOCK_STATISTICS.equals(operationType)) { if (args.length == 4) { inputFolder = args[1]; outputFolder = args[1]; itemsToSkip = new Integer(args[2]); endIndex = new Integer(args[3]); }else if(args.length==5){ inputFolder = args[1]; outputFolder = args[2]; itemsToSkip = new Integer(args[3]); endIndex = new Integer(args[4]); }else{ printUsage(); System.exit(1); } try { pipeline = new CommandLineFitPipeline(inputFolder, true, true, outputFolder,itemsToSkip,endIndex); } catch (Exception e) { e.printStackTrace(); printUsage(); } }else if (OPERATION_PDFEX_BLOCKIFY_CLASSIFY.equals(operationType)) { if (args.length == 5) { inputFolder = args[1]; outputFolder = args[1]; ruleFileLocation = args[2]; itemsToSkip = new Integer(args[3]); endIndex = new Integer(args[4]); }else if(args.length==6){ inputFolder = args[1]; ruleFileLocation = args[2]; outputFolder = args[3]; itemsToSkip = new Integer(args[4]); endIndex = new Integer(args[5]); }else{ printUsage(); System.exit(1); } try { pipeline = new CommandLineFitPipeline(inputFolder, ruleFileLocation, false, true, outputFolder,itemsToSkip,endIndex); } catch (Exception e) { e.printStackTrace(); printUsage(); } } else if (OPERATION_FILTERED_LAYOUTAWARE_FULLTEXT_EXTRACTION.equals(operationType)) { if (args.length != 6 && args.length!=5) { printUsage(); System.exit(1); }else if(args.length==6){ inputFolder = args[1]; outputFolder = args[3]; ruleFileLocation = args[2]; itemsToSkip = new Integer(args[4]); endIndex = new Integer(args[5]); }else{ inputFolder = args[1]; outputFolder = args[1]; ruleFileLocation = args[2]; itemsToSkip = new Integer(args[3]); endIndex = new Integer(args[4]); } try { //in this mode since we have a rule file we set extractUnclassified to false and report blocks to false pipeline = new CommandLineFitPipeline(inputFolder, ruleFileLocation, false, false, outputFolder,itemsToSkip,endIndex); } catch (Exception e) { e.printStackTrace(); printUsage(); } } else { printUsage(); } if(pipeline!=null){ pipeline.run(); } } public static void printUsage() { System.out.println("Usage Guidelines"); System.out.println("1. Blockifying PDF: Use this option if you want to blockify the PDF and output the blocks XML"); System.out.println("Usage\nArgument 1:" + OPERATION_PDFEX_BLOCKIFY + "\nArgument 2: The directory path where the PDFs are located \nArgument 3[Optional]: The directory path where output of blockify will be placed \nArgument 4: Number of items in the input folder to skip \nArgument 5: Item at which iteration will stop"); System.out.println("\n2. Blockifying PDF and reporting Features: Use this option if you want to blockify the PDF and output the blocks XML and generate a report file that serves as a guide in crafting a rule file for the sectionify step."); System.out.println("Usage\nArgument 1:" + OPERATION_PDFEX_BLOCK_STATISTICS + "\nArgument 2: The directory path where the PDFs are located \nArgument 3[Optional]: The directory path where output of blockify and the feature reports will be placed \nArgument 4: Number of items in the input folder to skip \nArgument 5: Item at which iteration will stop"); System.out.println("\n3. Blockifying and sectionifying PDF: Use this option if you want to blockify the PDF and do rhetorical classification of the blocks.It will output an openAccess based XML"); System.out.println("Usage\nArgument 1:" + OPERATION_PDFEX_BLOCKIFY_CLASSIFY + "\nArgument 2: The directory path where the PDFs are located\nArgument 3: The path of the rule file for Drools \nArgument 4[Optional]: The directory path where output of blockify and sectionify will be placed \nArgument 5: Number of items in the input folder to skip \nArgument 6: Item at which iteration will stop"); System.out.println("\n4. Extracting full text from PDF: Use this argument if you want to extract particular section from the openAccess based XML"); System.out.println("Usage\nArgument 1:" + OPERATION_FILTERED_LAYOUTAWARE_FULLTEXT_EXTRACTION + "\nArgument 2: The directory path where the PDFs are located\nArgument 3[Optional]: The directory path where output of blockify and sectionify will be placed \nArgument 4: The path of the rule file for Drools \nArgument 5: Number of items in the input folder to skip \nArgument 6: Item at which iteration will stop"); /*System.out.println("4. Extracting sections from openAccess based XML: Use this argument if you want to extract particular section from the openAccess based XML"); System.out.println("Usage\nArgument 1:" + OPERATION_SECTION_EXTRACTION + "\nArgument 2: The path of the openAccess based XML\nArgument 3: The location where the output file should be created\nArgument 4: Type of the section"); System.out.println("Please use one of the following section type:"); System.out.println("1. " + SectionBasedTextExtractor.ELEMENT_ABSTRACT); System.out.println("2. " + SectionBasedTextExtractor.ELEMENT_INTRODUCTION); System.out.println("3. " + SectionBasedTextExtractor.ELEMENT_MATERIALS_METHODS); System.out.println("4. " + SectionBasedTextExtractor.ELEMENT_DISCUSSION); System.out.println("5. " + SectionBasedTextExtractor.ELEMENT_RESULTS); System.out.println("6. " + SectionBasedTextExtractor.ELEMENT_CONCLUSIONS); */ } }