package edu.isi.bmkeg.lapdf.bin;
import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import edu.isi.bmkeg.lapdf.controller.LapdfEngine;
import edu.isi.bmkeg.lapdf.controller.LapdfMode;
import edu.isi.bmkeg.lapdf.model.LapdfDocument;
import edu.isi.bmkeg.lapdf.uima.cpe.CommandLineFitPipeline;
import edu.isi.bmkeg.utils.Converters;
/**
 * Command-line entry point that renders block outline images for a single PDF
 * file or for every PDF found (recursively) under a directory.
 *
 * <p>For each PDF named {@code foo.pdf} the images are written to a directory
 * named {@code foo_blockImgs}. In directory mode that directory is placed at
 * the location returned by {@code Converters.mimicDirectoryStructure}; in
 * single-file mode it is placed directly inside the output directory.</p>
 *
 * <p>See {@link #USAGE} for the command-line arguments.</p>
 */
public class ImagifyBlocks
{

    private static final String USAGE = "usage: <input-dir-or-file> [<output-dir>] \n\n"
            + "<input-dir-or-file> - the full path to the PDF file or directory to be extracted \n"
            + "<output-dir> (optional or '-') - the full path to the output directory \n\n"
            + "Running this command on a PDF file or directory will attempt to generate \n"
            + "one image per page with text chunks drawn out with labels describing \n"
            + "the predominant Font + Style of each block. This is helpful in developing\n"
            + "rule files.\n";

    /** Matches a trailing ".pdf"; anchored so that ".pdf" inside a directory name is left alone. */
    private static final Pattern PDF_SUFFIX = Pattern.compile("\\.pdf$");

    /** Suffix that replaces ".pdf" to form the name of the per-document image directory. */
    private static final String IMG_DIR_SUFFIX = "_blockImgs";

    /** Placeholder output argument meaning "write next to the input". */
    private static final String DEFAULT_OUTPUT = "-";

    /**
     * Runs the tool.
     *
     * @param args {@code args[0]} is the input PDF file or directory;
     *             {@code args[1]} (optional) is the output directory, or
     *             {@code "-"} to write next to the input
     * @throws Exception if processing a single input file fails; in directory
     *                   mode per-file failures are reported on stderr and the
     *                   remaining files are still processed
     */
    public static void main(String args[]) throws Exception {

        if (args.length < 1) {
            System.err.println(USAGE);
            System.exit(1);
        }

        String inputFileOrDirPath = args[0];
        File inputFileOrDir = new File(inputFileOrDirPath);
        if (!inputFileOrDir.exists()) {
            System.err.println(USAGE);
            System.err.println("Input file / dir '" + inputFileOrDirPath + "' does not exist.");
            System.err.println("Please include full path");
            System.exit(1);
        }

        String outputArg = (args.length > 1) ? args[1] : DEFAULT_OUTPUT;
        File outDir = resolveOutputDir(inputFileOrDir, outputArg);
        if (!ensureDirectory(outDir)) {
            System.err.println("Could not create output directory '" + outDir.getPath() + "'.");
            System.exit(1);
        }

        // Built only after the arguments are validated, so that bad usage is
        // reported without first paying for (or failing in) engine construction.
        LapdfEngine engine = new LapdfEngine();

        if (inputFileOrDir.isDirectory()) {

            Map<String, File> inputFiles = Converters.recursivelyListFiles(inputFileOrDir, PDF_SUFFIX);
            for (File pdf : inputFiles.values()) {
                String pdfStem = stripPdfSuffix(pdf.getName());
                String mirroredPath =
                        Converters.mimicDirectoryStructure(inputFileOrDir, outDir, pdf).getPath();
                File outImgDir = new File(PDF_SUFFIX.matcher(mirroredPath).replaceAll(IMG_DIR_SUFFIX));

                try {
                    renderBlockImages(engine, pdf, outImgDir, pdfStem);
                } catch (Exception e) {
                    // Best effort: report which file failed and carry on with the rest.
                    System.err.println("Failed to process '" + pdf.getPath() + "'");
                    e.printStackTrace();
                }
            }

        } else {

            String pdfStem = stripPdfSuffix(inputFileOrDir.getName());
            File outImgDir = new File(outDir, pdfStem + IMG_DIR_SUFFIX);
            renderBlockImages(engine, inputFileOrDir, outImgDir, pdfStem);

        }

    }

    /**
     * Picks the output directory: the explicit argument, or, when it is
     * {@code "-"}, the input directory itself or the directory containing the
     * input file.
     */
    private static File resolveOutputDir(File inputFileOrDir, String outputArg) {
        if (!DEFAULT_OUTPUT.equals(outputArg)) {
            return new File(outputArg);
        }
        if (inputFileOrDir.isDirectory()) {
            return inputFileOrDir;
        }
        // getParent() is null for a bare relative name such as "paper.pdf";
        // resolving against the absolute path always yields a parent for a file.
        return new File(inputFileOrDir.getAbsoluteFile().getParent());
    }

    /** Returns true if {@code dir} is a directory, creating it (and any missing parents) if needed. */
    private static boolean ensureDirectory(File dir) {
        return dir.isDirectory() || dir.mkdirs();
    }

    /** Removes a trailing ".pdf" from a file name. */
    private static String stripPdfSuffix(String fileName) {
        return PDF_SUFFIX.matcher(fileName).replaceAll("");
    }

    /** Blockifies one PDF and renders its block outline images into {@code outImgDir}. */
    private static void renderBlockImages(LapdfEngine engine, File pdf, File outImgDir, String pdfStem)
            throws Exception {
        if (!ensureDirectory(outImgDir)) {
            throw new java.io.IOException(
                    "Could not create image directory '" + outImgDir.getPath() + "'");
        }
        LapdfDocument lapdf = engine.blockifyPdfFile(pdf);
        engine.renderImageOutlines(lapdf, outImgDir, pdfStem, LapdfMode.BLOCK_ONLY);
    }

}