CustomBatchProcessingTool.java example

Explorer

lapdftext-master
- src
  - main
    - autogen
      - edu
        isi
        bmkeg
        pdf
        DocumentInformation.java
        DocumentInformation_Type.java
    - java
      - edu
        isi
        bmkeg
        lapdf
        bin
        BlockStatistics.java
        Blockify.java
        BlockifyClassify.java
        CommandLineTool.java
        CustomBatchProcessingTool.java
        ImagifyBlocks.java
        ImagifySections.java
        PDFExtractionConstants.java
        ReadSectionText.java
        TextPanel.java
        WatchDirectory.java
        classification
        Classifier.java
        ruleBased
        RuleBasedChunkClassifier.java
        controller
        LapdfEngine.java
        LapdfMode.java
        extraction
        Extractor.java
        JPedalExtractor.java
        JPedalPageImageExtractor.java
        exceptions
        AccessException.java
        ClassificationException.java
        EmptyPDFException.java
        EncryptionException.java
        InvalidPopularSpaceValueException.java
        features
        ChunkFeatures.java
        HorizontalSplitFeature.java
        WordFeatures.java
        model
        Block.java
        ChunkBlock.java
        LapdfDirection.java
        LapdfDocument.java
        PageBlock.java
        RTree
        RTChunkBlock.java
        RTDummyProcedure.java
        RTModelFactory.java
        RTPageBlock.java
        RTProcedure.java
        RTSpatialEntity.java
        RTSpatialRepresentation.java
        RTWordBlock.java
        WordBlock.java
        factory
        AbstractModelFactory.java
        ordering
        SpatialOrdering.java
        spatial
        SpatialEntity.java
        SpatialRepresentation.java
        parser
        Parser.java
        RuleBasedParser.java
        text
        SectionBasedTextExtractor.java
        SectionsTextWriter.java
        SpatialLayoutFeaturesReportGenerator.java
        SpatiallyOrderedChunkTextWriter.java
        SpatiallyOrderedChunkTypeFilteredTextWriter.java
        TextWriter.java
        uima
        ae
        ParserRuleBasedClassfierAE.java
        cpe
        CommandLineFitPipeline.java
        cr
        DirectoryCollectionReader.java
        utils
        JPedalPDFRenderer.java
        PageImageOutlineRenderer.java
        PdfDirWatcher.java
        xml
        OpenAccessXMLWriter.java
        SpatialXMLWriter.java
        XMLWriter.java
        utils
        FileUtils.java
        FrequencyCounter.java
        ISI_UIMA_PDFUtils.java
        IntegerFrequencyCounter.java
        PipelineLauncher.java
        ReflectionUtils.java
        StringCleaner.java
        StringCleanerException.java
  - test
    - java
      - edu
        isi
        bmkeg
        CommandLineToolTest.java
        ladpdf
        bin
        BlockifyClassifyTest.java
        BlockifyTest.java
        BlocksStatisticsTest.java
        ImagifyBlocksTest.java
        ImagifySectionsTest.java
        ReadSectionTextTest.java
        dirWatchers
        WatchDirectory_BLOCKIFY_CLASSIFY_Test.java
        WatchDirectory_BLOCKIFY_Test.java
        WatchDirectory_IMAGIFY_BLOCKS_Test.java
        WatchDirectory_IMAGIFY_SECTIONS_Test.java
        WatchDirectory_READ_SECTION_TEXT_Test.java

package edu.isi.bmkeg.lapdf.bin;

import edu.isi.bmkeg.lapdf.uima.cpe.CommandLineFitPipeline;

/**
 * Command-line entry point that runs a {@link CommandLineFitPipeline} over a
 * slice of the PDFs found in an input folder.
 *
 * <p>The first argument selects the operation ({@code blockify},
 * {@code blockStatistics}, {@code blockifyClassify} or {@code extractFullText});
 * the remaining arguments are described by {@link #printUsage()}. For every
 * operation the last two arguments are the number of items to skip and the
 * item at which iteration stops, and the output folder is optional (it
 * defaults to the input folder).
 */
public class CustomBatchProcessingTool
{
	private static final String OPERATION_PDFEX_BLOCKIFY = "blockify";
	private static final String OPERATION_PDFEX_BLOCK_STATISTICS = "blockStatistics";
	private static final String OPERATION_PDFEX_BLOCKIFY_CLASSIFY = "blockifyClassify";
	private static final String OPERATION_FILTERED_LAYOUTAWARE_FULLTEXT_EXTRACTION = "extractFullText";

	/**
	 * Parses the command line, builds the matching pipeline and runs it.
	 *
	 * <p>Accepted argument layouts (the output folder is optional):
	 * <ul>
	 * <li>{@code blockify | blockStatistics}:
	 *     {@code <operation> <inputFolder> [outputFolder] <itemsToSkip> <endIndex>}</li>
	 * <li>{@code blockifyClassify | extractFullText}:
	 *     {@code <operation> <inputFolder> <ruleFile> [outputFolder] <itemsToSkip> <endIndex>}</li>
	 * </ul>
	 *
	 * <p>On any usage error (no arguments, unknown operation, wrong argument
	 * count, non-numeric skip/end value) or when the pipeline cannot be
	 * constructed, the usage text is printed and the JVM exits with status 1.
	 *
	 * @param args command-line arguments as described above
	 */
	public static void main(String[] args)
	{
		if (args.length == 0)
		{
			failWithUsage();
			return;
		}

		String operationType = args[0];
		boolean isBlockify = OPERATION_PDFEX_BLOCKIFY.equals(operationType);
		boolean isBlockStatistics = OPERATION_PDFEX_BLOCK_STATISTICS.equals(operationType);
		boolean isBlockifyClassify = OPERATION_PDFEX_BLOCKIFY_CLASSIFY.equals(operationType);
		boolean isExtractFullText = OPERATION_FILTERED_LAYOUTAWARE_FULLTEXT_EXTRACTION.equals(operationType);
		if (!(isBlockify || isBlockStatistics || isBlockifyClassify || isExtractFullText))
		{
			failWithUsage();
			return;
		}

		// Rule-based operations take one extra mandatory argument (the rule
		// file) right after the input folder.
		boolean usesRuleFile = isBlockifyClassify || isExtractFullText;
		int firstTrailingIndex = usesRuleFile ? 3 : 2;

		// What follows the mandatory arguments is either
		// "<itemsToSkip> <endIndex>" or "<outputFolder> <itemsToSkip> <endIndex>".
		int trailingCount = args.length - firstTrailingIndex;
		if (trailingCount != 2 && trailingCount != 3)
		{
			failWithUsage();
			return;
		}

		String inputFolder = args[1];
		String ruleFileLocation = usesRuleFile ? args[2] : null;
		// Without an explicit output folder the results go next to the input.
		String outputFolder = (trailingCount == 3) ? args[firstTrailingIndex] : inputFolder;

		Integer itemsToSkip;
		Integer endIndex;
		try
		{
			// The two numeric arguments are always the last two on the line.
			itemsToSkip = Integer.valueOf(args[args.length - 2]);
			endIndex = Integer.valueOf(args[args.length - 1]);
		} catch (NumberFormatException e)
		{
			System.err.println("The items-to-skip and end-index arguments must be integers (" + e.getMessage() + ")");
			failWithUsage();
			return;
		}

		CommandLineFitPipeline pipeline;
		try
		{
			if (usesRuleFile)
			{
				// With a rule file the first flag is always false; the second
				// flag is true only for blockifyClassify. (The original code
				// comment calls these flags "extractUnclassified" and
				// "report blocks" -- confirm against CommandLineFitPipeline.)
				pipeline = new CommandLineFitPipeline(inputFolder, ruleFileLocation, false, isBlockifyClassify, outputFolder, itemsToSkip, endIndex);
			} else
			{
				// Without a rule file the first flag is always true; the second
				// flag is true only for blockStatistics.
				pipeline = new CommandLineFitPipeline(inputFolder, true, isBlockStatistics, outputFolder, itemsToSkip, endIndex);
			}
		} catch (Exception e)
		{
			e.printStackTrace();
			failWithUsage();
			return;
		}

		pipeline.run();
	}

	/**
	 * Prints the usage text and terminates the JVM with exit status 1.
	 */
	private static void failWithUsage()
	{
		printUsage();
		System.exit(1);
	}

	/**
	 * Prints the usage guidelines for all four operations to standard output.
	 */
	public static void printUsage()
	{
		System.out.println("Usage Guidelines");
		System.out.println("1. Blockifying PDF: Use this option if you want to blockify the PDF and output the blocks XML");
		System.out.println("Usage\nArgument 1:" + OPERATION_PDFEX_BLOCKIFY + "\nArgument 2: The directory path where the PDFs are located \nArgument 3[Optional]: The directory path where output of blockify will be placed \nArgument 4: Number of items in the input folder to skip \nArgument 5: Item at which iteration will stop");
		System.out.println("\n2. Blockifying PDF and reporting Features: Use this option if you want to blockify the PDF and output the blocks XML and generate a report file that serves as a guide in crafting a rule file for the sectionify step.");
		System.out.println("Usage\nArgument 1:" + OPERATION_PDFEX_BLOCK_STATISTICS + "\nArgument 2: The directory path where the PDFs are located \nArgument 3[Optional]: The directory path where output of blockify and the feature reports will be placed \nArgument 4: Number of items in the input folder to skip \nArgument 5: Item at which iteration will stop");
		System.out.println("\n3. Blockifying and sectionifying PDF: Use this option if you want to blockify the PDF and do rhetorical classification of the blocks.It will output an openAccess based XML");
		System.out.println("Usage\nArgument 1:" + OPERATION_PDFEX_BLOCKIFY_CLASSIFY + "\nArgument 2: The directory path where the PDFs are located\nArgument 3: The path of the rule file for Drools \nArgument 4[Optional]: The directory path where output of blockify and sectionify will be placed \nArgument 5: Number of items in the input folder to skip \nArgument 6: Item at which iteration will stop");
		System.out.println("\n4. Extracting full text from PDF: Use this argument if you want to extract particular section from the openAccess based XML");
		// The rule file comes BEFORE the optional output folder, matching how main() reads the arguments.
		System.out.println("Usage\nArgument 1:" + OPERATION_FILTERED_LAYOUTAWARE_FULLTEXT_EXTRACTION + "\nArgument 2: The directory path where the PDFs are located\nArgument 3: The path of the rule file for Drools \nArgument 4[Optional]: The directory path where output of blockify and sectionify will be placed \nArgument 5: Number of items in the input folder to skip \nArgument 6: Item at which iteration will stop");
	}

}