/*
    Copyright 2005, 2005 Burcu Yildiz
    Contact: burcu.yildiz@gmail.com

    This file is part of pdf2table. pdf2table is free software: you can
    redistribute it and/or modify it under the terms of the GNU General Public
    License as published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version. pdf2table is
    distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
    without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    PARTICULAR PURPOSE. See the GNU General Public License for more details.
    You should have received a copy of the GNU General Public License along
    with pdf2table. If not, see <http://www.gnu.org/licenses/>.
*/
package pdf2xml;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;

/**
 * Entry point of the PDF-to-XML stage: writes the pdf2xml DTD, runs the
 * external <code>pdftohtml</code> tool in XML mode and hands the resulting
 * XML file to {@link FirstClassification}.
 */
public class PDF2XML {

    /** Name of the external converter; it is looked up by the operating system. */
    private static final String PDFTOHTML = "pdftohtml";

    /** Charset matching the encoding declared in the first line of the DTD. */
    private static final String DTD_ENCODING = "ISO-8859-1";

    /**
     * Converts a PDF file to XML with <code>pdftohtml</code> and starts the
     * first classification pass on the generated file.
     *
     * <p>All failures are reported on standard output; no exception leaves
     * this method.</p>
     *
     * @param f base name of the output; the converter is asked to write to
     *        <code>t/f</code> and the classification reads
     *        <code>t/f.xml</code>
     * @param s path of the PDF file, passed to <code>pdftohtml</code> as its
     *        input argument
     * @param t output directory; created if missing, receives
     *        <code>pdf2xml.dtd</code> and the converter output
     * @param from first page to convert (<code>-f</code>); if this or
     *        <code>to</code> is null or blank the whole document is converted
     * @param to last page to convert (<code>-l</code>)
     * @param interactive_extraction forwarded unchanged to the
     *        {@link FirstClassification} constructor
     */
    public static void convert(String f, String s, String t, String from,
            String to, boolean interactive_extraction) {
        try {
            System.out.println(t);

            File outputDir = new File(t);
            outputDir.mkdirs();
            writeDtd(new File(t, "pdf2xml.dtd"));

            String target = t + File.separator + f;

            String[] command;
            try {
                command = buildCommand(s, target, from, to);
            } catch (NumberFormatException nfe) {
                // Without a valid page range no conversion takes place, so
                // there is no fresh XML file to classify.
                System.out.println(nfe);
                return;
            }

            try {
                runConverter(command);

                FirstClassification fc = new FirstClassification(
                        interactive_extraction, t);
                fc.run(target + ".xml");
            } catch (IOException ie) {
                System.out.println("Error: " + ie);
            } catch (InterruptedException ie2) {
                System.out.println("The program pdftohtml was interrupted.");
                // Keep the interrupt visible to the code that called us.
                Thread.currentThread().interrupt();
            }
        } catch (Exception e) {
            System.out.println(
                    "Exception in class: PDF2XML and method: convert. " + e);
        }
    }

    /**
     * Writes the pdf2xml DTD to the character stream.
     *
     * <p>The stream is neither flushed nor closed here; that is left to the
     * caller.</p>
     *
     * @param osw destination of the DTD text
     * @throws IOException if writing to <code>osw</code> fails
     */
    public static void build_dtd(OutputStreamWriter osw) throws IOException {
        String dtd = "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
                + "<!ELEMENT pdf2xml (page+,line*,fontspec*)>\n"
                + "<!ELEMENT page (fontspec*, text*)>\n"
                + "<!ATTLIST page\n"
                + "number CDATA #REQUIRED\n"
                + "position CDATA #REQUIRED\n"
                + "top CDATA #REQUIRED\n"
                + "left CDATA #REQUIRED\n"
                + "height CDATA #REQUIRED\n"
                + "width CDATA #REQUIRED\n"
                + ">\n"
                + "<!ELEMENT fontspec EMPTY>\n"
                + "<!ATTLIST fontspec\n"
                + "id CDATA #REQUIRED\n"
                + "size CDATA #REQUIRED\n"
                + "family CDATA #REQUIRED\n"
                + "color CDATA #REQUIRED\n"
                + ">\n"
                + "<!ELEMENT text (#PCDATA | b | i)*>\n"
                + "<!ATTLIST text\n"
                + "top CDATA #REQUIRED\n"
                + "left CDATA #REQUIRED\n"
                + "width CDATA #REQUIRED\n"
                + "height CDATA #REQUIRED\n"
                + "font CDATA #REQUIRED\n"
                + ">\n"
                + "<!ELEMENT b (#PCDATA)>\n"
                + "<!ELEMENT i (#PCDATA)>\n"
                + "<!ELEMENT line (text+)>\n"
                + "<!ATTLIST line\n"
                + "typ CDATA #REQUIRED\n"
                + "top CDATA #REQUIRED\n"
                + "left CDATA #REQUIRED\n"
                + "font CDATA #REQUIRED\n"
                + ">";
        osw.write(dtd);
    }

    /** Creates the DTD file, encoded as declared inside the DTD itself. */
    private static void writeDtd(File dtdFile) throws IOException {
        FileOutputStream fos = new FileOutputStream(dtdFile);
        try {
            OutputStreamWriter osw = new OutputStreamWriter(fos, DTD_ENCODING);
            build_dtd(osw);
            // Closing the writer flushes the encoder into the file stream.
            osw.close();
        } finally {
            // Also releases the file when the writer could not be created or
            // writing failed; closing an already closed stream is harmless.
            fos.close();
        }
    }

    /**
     * Builds the pdftohtml argument vector, one array element per argument so
     * that paths containing whitespace reach the tool intact.
     *
     * @throws NumberFormatException if a non-blank page bound is not an integer
     */
    private static String[] buildCommand(String source, String target,
            String from, String to) {
        String first = (from == null) ? "" : from.trim();
        String last = (to == null) ? "" : to.trim();

        if (first.length() == 0 || last.length() == 0) {
            return new String[] { PDFTOHTML, "-xml", source, target };
        }

        int firstPage = Integer.parseInt(first);
        int lastPage = Integer.parseInt(last);
        return new String[] { PDFTOHTML, "-f", String.valueOf(firstPage),
                "-l", String.valueOf(lastPage), "-xml", source, target };
    }

    /** Echoes the command, runs it and waits until it has finished. */
    private static void runConverter(String[] command)
            throws IOException, InterruptedException {
        StringBuffer shown = new StringBuffer();
        for (int i = 0; i < command.length; i++) {
            if (i > 0) {
                shown.append(' ');
            }
            shown.append(command[i]);
        }
        System.out.println(shown);

        ProcessBuilder builder = new ProcessBuilder(command);
        // One merged stream is enough to drain on a single thread.
        builder.redirectErrorStream(true);
        Process process = builder.start();
        try {
            // Consume the tool's output; an unread pipe can fill up and block
            // the child process, which would make waitFor() hang.
            InputStream output = process.getInputStream();
            byte[] scratch = new byte[4096];
            while (output.read(scratch) != -1) {
                // output is intentionally discarded
            }

            int status = process.waitFor();
            if (status != 0) {
                System.out.println(
                        "pdftohtml exited with status " + status + ".");
            }
        } finally {
            // Releases the pipes and stops the child if we were interrupted.
            process.destroy();
        }
    }
}