package edu.isi.bmkeg.lapdf.bin; import java.io.File; import java.net.URL; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; import edu.isi.bmkeg.lapdf.controller.LapdfEngine; import edu.isi.bmkeg.lapdf.model.Block; import edu.isi.bmkeg.lapdf.model.LapdfDocument; import edu.isi.bmkeg.utils.Converters; public class ReadSectionText { private static String USAGE = "usage: <input-dir-or-file> [<output-dir>] [<rule-file>] [<sec1> ... <secN>]\n\n" + "<input-dir-or-file> - the full path to the PDF file or directory to be extracted \n" + "<output-dir> (optional or '-') - the full path to the output directory \n" + "<rule-file> (optional or '-') - the full path to the rule file \n" + "<sec1> ... <secN> (optional) - a list of section names to be included in the dump \n\n" + "Running this command on a PDF file or directory will attempt to extract uninterrupted\n" + "two-column- formatted text of the main narrative section of the paper with one \n" + "font change (i.e. for papers that use a smaller font for methods sections).\n" + "Figure legends are moved to the end of the paper (but included), and \n" + "tables are dropped.\n\n" + "Please send failure examples where this basic behavior fails to \n" + "'gully@usc.edu' for troubleshooting.\n"; public static void main(String args[]) throws Exception { LapdfEngine engine = new LapdfEngine(); if (args.length < 1) { System.err.println(USAGE); System.exit(1); } String inputFileOrDirPath = args[0]; String outputDirPath = ""; String ruleFilePath = ""; File inputFileOrDir = new File(inputFileOrDirPath); if (!inputFileOrDir.exists()) { System.err.println(USAGE); System.err.println("Input file / dir '" + inputFileOrDirPath + "' does not exist."); System.err.println("Please include full path"); System.exit(1); } // output folder is set. if (args.length > 1) { outputDirPath = args[1]; } else { outputDirPath = "-"; } if (outputDirPath.equals("-")) { if (inputFileOrDir.isDirectory()) { outputDirPath = inputFileOrDirPath; } else { outputDirPath = inputFileOrDir.getParent(); } } File outDir = new File(outputDirPath); if (!outDir.exists()) { outDir.mkdir(); } // output folder is set. File ruleFile = null; if (args.length > 2) { ruleFilePath = args[2]; } else { ruleFilePath = "-"; } if (ruleFilePath.equals("-")) { ruleFile = Converters .extractFileFromJarClasspath("rules/general.drl"); } else { ruleFile = new File(ruleFilePath); } if (!ruleFile.exists()) { System.err.println(USAGE); System.err.println(ruleFilePath + " does not exist."); System.err.println("Please include full path"); System.exit(1); } // section name stack is set. List<Set<String>> stack = new ArrayList<Set<String>>(); String sec1 = ""; if (args.length > 3) { sec1 = args[3]; } else { sec1 = "-"; } // default sections to include are // * headings and body interwoven in the text // * figure legends at the end if (sec1.equals("-")) { Set<String> sections = new HashSet<String>(); sections.add(Block.TYPE_BODY); sections.add(Block.TYPE_HEADING); stack.add(sections); sections = new HashSet<String>(); sections.add(Block.TYPE_FIGURE_LEGEND); stack.add(sections); } else { for (int i = 3; i < args.length; i++) { Set<String> sections = new HashSet<String>(); sections.add(args[i]); stack.add(sections); } } if( inputFileOrDir.isDirectory() ){ Pattern patt = Pattern.compile("\\.pdf$"); Map<String, File> inputFiles = Converters.recursivelyListFiles( inputFileOrDir, patt); Iterator<String> it = inputFiles.keySet().iterator(); while (it.hasNext()) { String key = it.next(); File pdf = inputFiles.get(key); String pdfStem = pdf.getName(); pdfStem = pdfStem.replaceAll("\\.pdf$", ""); String outPath = Converters.mimicDirectoryStructure(inputFileOrDir, outDir, pdf).getPath(); outPath = outPath.replaceAll("\\.pdf$", "") + "_fullText.txt"; File outFile = new File(outPath); try { LapdfDocument lapdf = engine.blockifyPdfFile(pdf); engine.classifyDocument(lapdf, ruleFile); engine.writeTextToFile(lapdf, stack, outFile); } catch (Exception e) { e.printStackTrace(); } } } else { String pdfStem = inputFileOrDir.getName(); pdfStem = pdfStem.replaceAll("\\.pdf$", ""); String outPath = outDir + "/" + pdfStem + "_fullText" + ".txt"; File outFile = new File(outPath); LapdfDocument lapdf = engine.blockifyPdfFile(inputFileOrDir); engine.classifyDocument(lapdf, ruleFile); engine.writeTextToFile(lapdf, stack, outFile); } } }