/*
Copyright 2005, 2005 Burcu Yildiz
Contact: burcu.yildiz@gmail.com
This file is part of pdf2table.
pdf2table is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
pdf2table is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with pdf2table. If not, see <http://www.gnu.org/licenses/>.
*/
package pdf2xml;
import java.util.ArrayList;
import java.util.List;
/**
 * Second class of table builder. Naming and partitioning are historical
 * artifacts and are only preserved for the benefit of those familiar with the
 * old code base. All control flow is now in
 * {@link FirstClassification#run(String)}.
 *
 * <p>The class turns each sufficiently long multiline block into a
 * {@link Table}: the text elements of the block's lines are inserted into a
 * {@link Node} tree, the tree is flattened into columns, adjacent overlapping
 * columns are merged, and the first data row is determined.
 */
public class SecondClassification {

    /**
     * Minimum number of lines (inclusive of both {@code begin} and
     * {@code end}) a multiline block must span; blocks with fewer lines are
     * ignored.
     */
    private static final int MIN_BLOCK_LINES = 3;

    /**
     * Analyze a list of multiline blocks and one of lines, return a list of
     * tables that they represent.
     *
     * @param blocks multiline blocks; {@code begin} and {@code end} of each
     *               block are used as inclusive indices into {@code lines}
     * @param lines  the lines the blocks refer to
     * @return one table per block that spans at least
     *         {@link #MIN_BLOCK_LINES} lines, in block order
     */
    static List<Table> decompose_tables(List<Multiline_Block> blocks, List<Line> lines) {
        List<Table> tables = new ArrayList<Table>();
        for (Multiline_Block mlb : blocks) {
            int line_count = mlb.end - mlb.begin + 1;
            if (line_count < MIN_BLOCK_LINES) {
                continue;
            }

            Node root = build_tree(mlb, lines);
            // NOTE(review): print_tree() is defined elsewhere and presumably
            // emits debug output; kept to preserve behavior - confirm it is
            // still wanted.
            root.print_tree();

            Table new_table = new Table();
            convert_to_table(root, null, new_table.columns, line_count);
            merge_overlapping_columns(new_table.columns);
            new_table.datarow_begin = find_datarow_begin(new_table.columns, mlb);
            new_table.page = mlb.page;
            tables.add(new_table);
        }
        return tables;
    }

    /**
     * Inserts every text element of the block's lines into a fresh tree.
     * Each element is inserted together with the zero-based offset of its
     * line inside the block.
     */
    private static Node build_tree(Multiline_Block mlb, List<Line> lines) {
        Node root = new Node("root", -1);
        for (int b = mlb.begin; b <= mlb.end; b++) {
            int row = b - mlb.begin;
            for (Text_Element t : lines.get(b).texts) {
                root.insert(t, row);
            }
        }
        return root;
    }

    /**
     * Replaces each pair of adjacent overlapping columns by a single merged
     * column, provided the pair can be merged (see
     * {@link #merge_columns(Column, Column)}).
     *
     * <p>After a successful merge the scan continues with the column that
     * follows the merged one; the merged column itself is not compared with
     * its new right neighbour.
     */
    private static void merge_overlapping_columns(List<Column> columns) {
        for (int k = 0; k < columns.size() - 1; k++) {
            Column c1 = columns.get(k);
            Column c2 = columns.get(k + 1);
            // c2 starts inside the horizontal extent of c1
            boolean overlap = c1.left <= c2.left && c1.right >= c2.left;
            if (!overlap) {
                continue;
            }
            Column merged = merge_columns(c1, c2);
            if (merged != null) {
                // the merged column takes the place of c1, c2 is dropped
                columns.set(k, merged);
                columns.remove(k + 1);
            }
        }
    }

    /**
     * Tries to merge two columns row by row. A merge is only possible when,
     * in every row, at least one of the two cells has the value
     * {@code "null"}.
     *
     * <p>Assumes {@code c2} has at least as many cells as {@code c1}.
     *
     * @return the merged column (built from a clone of {@code c1}), or
     *         {@code null} if some row holds a non-"null" value in both
     *         columns
     */
    private static Column merge_columns(Column c1, Column c2) {
        // NOTE(review): if Column.clone() does not copy the individual cells,
        // the writes to nt below also modify the cells of c1, even when the
        // merge is abandoned - confirm the clone depth.
        Column nc = (Column) c1.clone();
        for (int j = 0; j < c1.cells.size(); j++) {
            Text_Element t1 = c1.cells.get(j);
            Text_Element t2 = c2.cells.get(j);
            Text_Element nt = nc.cells.get(j);

            boolean t1_empty = t1.value.equals("null");
            if (!t1_empty && !t2.value.equals("null")) {
                // both columns hold a value in this row: not mergeable
                return null;
            }

            // take c1's cell when it holds a value, c2's cell otherwise
            Text_Element source = t1_empty ? t2 : t1;
            nt.value = source.value;
            if (source.colspan > 1) {
                nt.colspan--;
            } else {
                nc.add(source);
            }
        }
        return nc;
    }

    /**
     * Determines the index of the first data row of a table.
     *
     * <p>Rows are scanned top-down; the colspans of all non-artificial cells
     * are added to a running total, and the row in which the total reaches
     * {@code mlb.max_elements} is taken to be the last header row.
     *
     * <p>Assumes every column has at least as many cells as the first one.
     *
     * @return index of the row after the last header row, or 0 if there are
     *         no columns or the threshold is never reached
     */
    private static int find_datarow_begin(List<Column> columns, Multiline_Block mlb) {
        int datarow_begin = 0;
        if (columns.isEmpty()) {
            return datarow_begin;
        }

        Column first = columns.get(0);
        boolean header = true;
        // NOTE(review): the total is not reset per row, so it accumulates
        // over all rows scanned so far - confirm this is intended.
        int sum = 0;
        for (int k = 0; header && k < first.cells.size(); k++) {
            // the current row is always scanned to its end, even after the
            // threshold has been reached
            for (Column current_c : columns) {
                Text_Element t = current_c.cells.get(k);
                if (!t.artificial) {
                    sum += t.colspan;
                    if (sum >= mlb.max_elements) {
                        header = false;
                        datarow_begin = k + 1;
                    }
                }
            }
        }
        return datarow_begin;
    }

    /**
     * Recursively flattens the tree below {@code n} into columns.
     *
     * <p>For the root ({@code c == null}) one new column is created per
     * child. For any other node a cell is appended to {@code c}: the node's
     * own text element, or a fresh {@link Text_Element} when the node's
     * content is {@code "null"}. The first child continues in {@code c};
     * every further child gets a new column that starts with a copy of the
     * cells {@code c} held right after this node's cell was appended.
     *
     * @param n node to convert
     * @param c column the node's cell is appended to; {@code null} marks the
     *          root call
     * @param v list that receives every newly created column
     * @param l number of cells a column is padded to when a leaf is reached
     * @return number of leaves below {@code n} (1 for a leaf itself); for an
     *         inner node this value is also stored as the colspan of the
     *         node's cell
     */
    private static int convert_to_table(Node n, Column c, List<Column> v, int l) {
        if (c == null) {
            // root node: every child opens a column of its own
            int spanning = 0;
            for (int i = 0; i < n.nodes.size(); i++) {
                Column new_column = new Column();
                v.add(new_column);
                spanning += convert_to_table(n.nodes.get(i), new_column, v, l);
            }
            return spanning;
        }

        // not root node: append this node's cell and remember its position
        int pos;
        if (!n.content.equals("null")) {
            c.cells.add(n.text_element);
            pos = c.cells.size();
            if (n.text_element.colspan == 1) {
                // NOTE(review): Column.add() is defined elsewhere; it is
                // called in addition to cells.add(), presumably to update
                // column metadata - confirm.
                c.add(n.text_element);
            }
        } else {
            c.cells.add(new Text_Element());
            pos = c.cells.size();
        }

        if (n.nodes.size() >= 1) {
            // snapshot of the cells so far, used as prefix of sibling columns
            Column store = (Column) c.clone();
            int spanning = convert_to_table(n.nodes.get(0), c, v, l);
            for (int i = 1; i < n.nodes.size(); i++) {
                Column new_column = new Column();
                new_column.cells.addAll(store.cells);
                v.add(new_column);
                spanning += convert_to_table(n.nodes.get(i), new_column, v, l);
            }
            // this node's cell spans all columns produced below it
            c.cells.get(pos - 1).colspan = spanning;
            return spanning;
        }

        // no children means that we are at the leaf of a branch:
        // pad the column to the full number of rows
        while (c.cells.size() < l) {
            c.cells.add(new Text_Element());
        }
        return 1;
    }
}