/*
Copyright 2005, 2005 Burcu Yildiz
Contact: burcu.yildiz@gmail.com
This file is part of pdf2table.
pdf2table is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pdf2table is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with pdf2table. If not, see <http://www.gnu.org/licenses/>.
*/
package pdf2xml;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.List;
/**
 * Writes the result of a table extraction into a target directory as three
 * sibling files:
 * <ul>
 *   <li>{@code table_view.xsl} - an XSLT stylesheet that renders the tables as HTML,</li>
 *   <li>{@code tables.dtd} - a DTD describing the table markup,</li>
 *   <li>{@code output.xml} - the extracted fonts and tables themselves.</li>
 * </ul>
 * None of the public methods throws: every failure is reported on
 * {@code System.out} and the method returns.
 */
public class XmlOutput {

    /**
     * Writes the stylesheet, the DTD and the XML output (in that order) into
     * the given directory.
     *
     * @param table_list tables to serialise into {@code output.xml}
     * @param font_list  fonts to serialise as {@code fontspec} elements
     * @param path       directory the three files are created in
     */
    public static void create(List<Table> table_list, List<Font> font_list, String path) {
        try {
            create_stylesheet(path);
            create_tables_dtd(path, table_list, font_list);
            create_output(path, font_list, table_list);
        } catch (Exception e) {
            report("create", e);
        }
    }

    /**
     * Writes {@code table_view.xsl} into {@code path}. The stylesheet turns
     * every {@code tables/table} element into an HTML table with the page and
     * title as captions, the header lines as {@code th} rows and the data rows
     * as {@code td} rows, honouring the {@code bold}, {@code italic},
     * {@code bolditalic} and empty values of a cell's {@code style} attribute.
     *
     * @param path directory the stylesheet is created in
     */
    public static void create_stylesheet(String path) {
        String xsl_value = "<?xml version=\"1.0\" encoding=\"iso-8859-1\" ?>\n" +
            "<xsl:stylesheet xmlns:xsl=\"http://www.w3.org/1999/XSL/Transform\" version=\"1.0\">\n" +
            "<xsl:output method=\"html\" />\n" +
            "<xsl:template match=\"/\">" +
            "<html>\n" +
            "<body>\n" +
            "<xsl:for-each select=\"tables/table\">\n" +
            "<table border=\"1\">\n" +
            "<caption>\n" +
            // TODO: Next 3 lines windows version only - need to resolve discrepancy
            "<xsl:value-of select=\"page\"/>\n" +
            "</caption>\n" +
            "<caption>\n" +
            // End Windows version only code
            "<xsl:value-of select=\"title\"/>\n" +
            "</caption>\n" +
            "<xsl:for-each select=\"header/header_line\">\n" +
            "<tr>\n" +
            "<xsl:for-each select=\"header_element\">\n" +
            "<th bgcolor=\"#ccdddd\" colspan=\"{@colspan}\">\n" +
            "<xsl:value-of select=\".\" /> \n" +
            "</th>\n" +
            "</xsl:for-each>\n" +
            "</tr>\n" +
            "</xsl:for-each>\n" +
            "<xsl:for-each select=\"tbody/data_row\">\n" +
            "<tr>\n" +
            "<xsl:for-each select=\"cell\">\n" +
            "<td colspan=\"{@colspan}\">\n" +
            "<xsl:if test=\"@style='bold'\">\n" +
            "<b>\n" +
            "<xsl:value-of select=\".\" />\n" +
            "</b>\n" +
            "</xsl:if>\n" +
            "<xsl:if test=\"@style='italic'\">\n" +
            "<i>\n" +
            "<xsl:value-of select=\".\" />\n" +
            "</i>\n" +
            "</xsl:if>\n" +
            "<xsl:if test=\"@style='bolditalic'\">\n" +
            "<b><i>\n" +
            "<xsl:value-of select=\".\" />\n" +
            "</i></b>\n" +
            "</xsl:if>\n" +
            "<xsl:if test=\"@style=''\">\n" +
            "<xsl:value-of select=\".\" />\n" +
            "</xsl:if>\n" +
            "</td>\n" +
            "</xsl:for-each>\n" +
            "</tr>\n" +
            "</xsl:for-each>\n" +
            "<BR> </BR>\n" +
            "<BR> </BR>\n" +
            "<BR> </BR>\n" +
            "</table>\n" +
            "</xsl:for-each>\n" +
            "</body>\n" +
            "</html>\n" +
            "</xsl:template>\n" +
            "</xsl:stylesheet>\n";
        write_latin1_file(new File(path, "table_view.xsl"), xsl_value, "create_stylesheet");
    }

    /**
     * Writes {@code tables.dtd} into {@code path}. The DTD text is fixed; the
     * two list parameters are accepted but not read.
     *
     * @param path       directory the DTD is created in
     * @param table_list unused
     * @param font_list  unused
     */
    public static void create_tables_dtd(String path, List<Table> table_list, List<Font> font_list) {
        // NOTE(review): this content model does not match what create_output
        // emits - output.xml writes fontspec elements BEFORE the tables and
        // also writes page, title and header_line elements that are not
        // declared here. output.xml carries no DOCTYPE, so nothing validates
        // against this file today; reconcile before anything does.
        String dtd_value = "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n" +
            // Whitespace between the element name and its content model is mandatory.
            "<!ELEMENT tables (table+,fontspec*)>\n" +
            "<!ELEMENT fontspec EMPTY>\n" +
            "<!ATTLIST fontspec\n" +
            "id CDATA #REQUIRED\n" +
            "size CDATA #REQUIRED\n" +
            "family CDATA #REQUIRED\n" +
            "color CDATA #REQUIRED\n" +
            ">\n" +
            "<!ELEMENT table (header,tbody)>\n" +
            "<!ELEMENT header (header_element)*>\n" +
            "<!ELEMENT header_element (#PCDATA)>\n" +
            "<!ATTLIST header_element\n" +
            "id CDATA #REQUIRED\n" +
            "sh CDATA #REQUIRED\n" +
            "font CDATA #REQUIRED\n" +
            "colspan CDATA #REQUIRED\n" +
            ">\n" +
            "<!ELEMENT tbody (data_row)*>\n" +
            "<!ELEMENT data_row (cell)*>\n" +
            "<!ELEMENT cell (#PCDATA)>\n" +
            "<!ATTLIST cell\n" +
            "sh CDATA #REQUIRED\n" +
            "font CDATA #REQUIRED\n" +
            "colspan CDATA #REQUIRED\n" +
            "style CDATA #REQUIRED\n" +
            ">\n";
        write_latin1_file(new File(path, "tables.dtd"), dtd_value, "create_tables_dtd");
    }

    /**
     * Writes {@code output.xml} (UTF-16) into {@code path}: one
     * {@code fontspec} element per font followed by one {@code table} element
     * per table. The file path and a summary line are printed to
     * {@code System.out}.
     * <p>
     * Side effect: the {@code header} field of every column visited while
     * writing header lines is overwritten.
     *
     * @param path   directory the XML file is created in
     * @param fonts  fonts to serialise
     * @param tables tables to serialise
     */
    public static void create_output(String path, List<Font> fonts,
            List<Table> tables) {
        PrintStream ps = null;
        try {
            File my_file = new File(path, "output.xml");
            System.out.println(my_file.toString());
            // UTF-16 is a charset every Java platform must support, so the
            // constructor cannot fail after the file has been opened.
            ps = new PrintStream(new FileOutputStream(my_file), true, "UTF-16");
            ps.println("<?xml version=\"1.0\" encoding=\"UTF-16\" ?>");
            ps.println("<?xml-stylesheet href=\"table_view.xsl\" type=\"text/xsl\" ?>");
            ps.println("<tables>");
            // NOTE(review): attribute values and the table title are written
            // without XML escaping; confirm upstream never yields '<', '&' or '"'.
            for (Font f : fonts) {
                ps.println("<fontspec id=\"" + f.id + "\" size=\"" + f.size
                        + "\" family=\"" + f.family + "\" color=\"" + f.color
                        + "\"/>");
            }
            for (Table c_table : tables) {
                write_table(ps, c_table);
            }
            ps.println("</tables>");
            // PrintStream never throws on write; ask it explicitly.
            if (ps.checkError()) {
                throw new IOException("write to " + my_file + " failed");
            }
            System.out.println("TableExtractor extracted " + tables.size()
                    + " table(s)!");
        } catch (Exception e) {
            report("create_output", e);
        } finally {
            if (ps != null) {
                ps.close();
            }
        }
    }

    /** Writes one complete {@code table} element (page, title, header, tbody). */
    private static void write_table(PrintStream ps, Table c_table) {
        ps.println("<table>");
        ps.println("<page>" + "TABLE ON PAGE " + c_table.page + "</page>");
        ps.println("<title>" + c_table.title + "</title>");
        // The Mac variant of this code wrote "TABLE ON PAGE <page>" into
        // <title> instead of emitting a separate <page> element.

        // Seed the row count from the first column so that a table without
        // header rows (datarow_begin == 0) still gets its data rows written;
        // the header loop below refines it exactly as before.
        int row_count = 0;
        if (c_table.columns.size() > 0) {
            Column first = c_table.columns.get(0);
            row_count = first.cells.size();
        }

        ps.println("<header>");
        for (int j = 0; j < c_table.datarow_begin; j++) {
            ps.println("<header_line>");
            int p = 0;
            while (p < c_table.columns.size()) {
                Column cc1 = c_table.columns.get(p);
                row_count = cc1.cells.size();
                cc1.header = p + j;
                Text_Element t1 = cc1.cells.get(j);
                // print, not println: a line break here would land inside the
                // value of the sh attribute.
                ps.print("<header_element id=\"" + (p + j) + "\" sh=\"" + cc1.header);
                ps.println("\" font=\"" + t1.font + "\" colspan=\"" + t1.colspan + "\">");
                write_cdata(ps, t1.value);
                ps.println("</header_element>");
                // Always advance, even for a non-positive colspan.
                p = p + Math.max(1, t1.colspan);
            }
            ps.println("</header_line>");
        }
        ps.println("</header>");

        ps.println("<tbody>");
        for (int j = c_table.datarow_begin; j < row_count; j++) {
            ps.println("<data_row>");
            int k = 0;
            while (k < c_table.columns.size()) {
                Column cc = c_table.columns.get(k);
                Text_Element t = cc.cells.get(j);
                ps.print("<cell sh=\"" + cc.header + "\" font=\"" + t.font);
                ps.println("\" colspan=\"" + t.colspan + "\" style=\"" + t.style + "\">");
                write_cdata(ps, t.value);
                ps.println("</cell>");
                // Always advance, even for a non-positive colspan.
                k = k + Math.max(1, t.colspan);
            }
            ps.println("</data_row>");
        }
        ps.println("</tbody>");
        ps.println("</table>");
    }

    /**
     * Writes a CDATA section holding {@code value}. A {@code null} value and
     * the literal text {@code "null"} both produce an empty section. The
     * parameter is typed {@code Object} because the declared type of the
     * element's value field is not visible from this class.
     */
    private static void write_cdata(PrintStream ps, Object value) {
        ps.println("<![CDATA[");
        String text = String.valueOf(value); // null becomes "null"
        if (!text.equals("null")) {
            // "]]>" would end the section early; split it over two sections.
            ps.println(text.replace("]]>", "]]]]><![CDATA[>"));
        }
        ps.println("]]>");
    }

    /**
     * Writes {@code content} to {@code target} as ISO-8859-1 (the encoding the
     * stylesheet and DTD declare) and always closes the stream. Failures are
     * reported under the given method name.
     */
    private static void write_latin1_file(File target, String content, String method) {
        PrintStream ps = null;
        try {
            // ISO-8859-1 is a charset every Java platform must support, so the
            // constructor cannot fail after the file has been opened.
            ps = new PrintStream(new FileOutputStream(target), false, "ISO-8859-1");
            ps.print(content);
            // PrintStream never throws on write; ask it explicitly.
            if (ps.checkError()) {
                throw new IOException("write to " + target + " failed");
            }
        } catch (Exception e) {
            report(method, e);
        } finally {
            if (ps != null) {
                ps.close();
            }
        }
    }

    /** Prints the class-wide failure message for {@code method} to {@code System.out}. */
    private static void report(String method, Exception e) {
        System.out.println("Exception in class: XmlOutput and method: " + method + ". " + e);
    }
}