package edu.stanford.nlp.semparse.open.model.tree;
import java.util.*;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import fig.basic.LogInfo;
/**
 * Fix problematic HTML structures that decrease our accuracy.
 *
 * Two independent fixes are offered: {@link #fixAllTables()} expands
 * colspan / rowspan into explicit cells, and {@link #fixAllBRs()} replaces
 * BR line breaks with P wrappers.
 *
 * All fixes are done in place, so the document will be mutated.
 */
public class HTMLFixer {

  /**
   * Largest colspan that will be expanded. The HTML table processing model
   * clamps colspan to this value; the cap also keeps a hostile attribute
   * value from creating an unbounded number of cells.
   */
  private static final int MAX_COLSPAN = 1000;

  private final Document document;

  /**
   * @param doc the document to fix; it is mutated by the fix methods
   */
  public HTMLFixer(Document doc) {
    this.document = doc;
  }

  // ============================================================
  // Fix Table (colspan / rowspan)
  // ============================================================

  /**
   * Normalize colspan and rowspan in every "tbody" element of the document.
   */
  public void fixAllTables() {
    LogInfo.begin_track("Fix table ...");
    try {
      for (Element table : document.getElementsByTag("tbody")) {
        fixTable(table);
      }
    } finally {
      // Keep the log tracks balanced even if a fix throws.
      LogInfo.end_track();
    }
  }

  /**
   * Normalize colspan and rowspan in the table
   * @param tbody An Element with tag name "tbody"
   */
  private void fixTable(Element tbody) {
    int numColumns = expandColspans(tbody);
    expandRowspans(tbody, numColumns);
  }

  /**
   * Replace each cell having colspan > 1 with that cell followed by
   * (colspan - 1) empty cells of the same tag. The original value is kept in
   * the "old-colspan" attribute. If the cell also has rowspan > 1, the new
   * cells carry the same rowspan so that the rowspan pass expands them too.
   *
   * @param tbody An Element with tag name "tbody"
   * @return the largest number of cells found in any row after expansion
   */
  private int expandColspans(Element tbody) {
    int numColumns = 0;
    for (Element tr : tbody.children()) {
      // Iterate over a snapshot because cells are inserted into the row.
      for (Element cell : new ArrayList<>(tr.children())) {
        int colspan = Math.min(parseIntHard(cell.attr("colspan")), MAX_COLSPAN);
        if (colspan <= 1) {
          continue;
        }
        int rowspan = parseIntHard(cell.attr("rowspan"));
        cell.attr("old-colspan", cell.attr("colspan"));
        cell.removeAttr("colspan");
        for (int i = 2; i <= colspan; i++) {
          Element filler = document.createElement(cell.tagName());
          if (rowspan > 1) {
            filler.attr("rowspan", String.valueOf(rowspan));
          }
          cell.after(filler);
        }
      }
      numColumns = Math.max(numColumns, tr.children().size());
    }
    return numColumns;
  }

  /**
   * Replace each cell having rowspan > 1 with that cell plus one empty cell of
   * the same tag, in the same column, in each of the following (rowspan - 1)
   * rows (as far as rows exist). The original value is kept in the
   * "old-rowspan" attribute. Assumes colspan has already been expanded, so
   * every cell occupies exactly one column.
   *
   * @param tbody An Element with tag name "tbody"
   * @param numColumns initial number of columns to track; columns beyond it
   *     are tracked on demand
   */
  private void expandRowspans(Element tbody, int numColumns) {
    // For each column, how many upcoming rows still need a filler cell ...
    int[] pendingRows = new int[numColumns];
    // ... and which tag that filler cell should have.
    String[] pendingTags = new String[numColumns];
    for (Element tr : tbody.children()) {
      // Last cell (original or filler) placed in this row so far.
      Element previousCell = null;
      List<Element> cells = new ArrayList<>(tr.children());
      int next = 0;
      // Fillers can push original cells past numColumns, so keep going until
      // both the tracked columns and the row's own cells are exhausted.
      for (int col = 0; col < pendingRows.length || next < cells.size(); col++) {
        if (col < pendingRows.length && pendingRows[col] > 0) {
          // Create a new element caused by rowspan
          Element filler = document.createElement(pendingTags[col]);
          if (previousCell == null) {
            tr.prependChild(filler);
          } else {
            previousCell.after(filler);
          }
          // Advance so that adjacent fillers keep their column order.
          previousCell = filler;
          pendingRows[col]--;
          continue;
        }
        if (next >= cells.size()) {
          continue; // Unfilled row
        }
        Element cell = cells.get(next++);
        previousCell = cell;
        int rowspan = parseIntHard(cell.attr("rowspan"));
        if (rowspan <= 1) {
          continue;
        }
        if (col >= pendingRows.length) {
          int newLength = Math.max(col + 1, pendingRows.length * 2);
          pendingRows = Arrays.copyOf(pendingRows, newLength);
          pendingTags = Arrays.copyOf(pendingTags, newLength);
        }
        pendingRows[col] = rowspan - 1;
        pendingTags[col] = cell.tagName();
        cell.attr("old-rowspan", cell.attr("rowspan"));
        cell.removeAttr("rowspan");
      }
    }
  }

  /**
   * Parse an integer attribute value leniently.
   *
   * @param s the raw attribute value
   * @return the parsed integer, or 0 if the value is null, blank, or not a
   *     valid integer
   */
  private static int parseIntHard(String s) {
    if (s == null) {
      return 0;
    }
    String trimmed = s.trim();
    if (trimmed.isEmpty()) {
      return 0;
    }
    try {
      return Integer.parseInt(trimmed);
    } catch (NumberFormatException e) {
      // Malformed values are treated the same as a missing attribute.
      return 0;
    }
  }

  // ============================================================
  // Fix BR
  // ============================================================

  /**
   * Repeatedly take the first remaining BR in the document and rewrite its
   * parent with {@link #fixBR(Element)} until no BR is left.
   */
  public void fixAllBRs() {
    LogInfo.begin_track("Fix BR ...");
    try {
      Elements brList;
      while (!(brList = document.getElementsByTag("br")).isEmpty()) {
        fixBR(brList.get(0).parent());
      }
    } finally {
      // Keep the log tracks balanced even if a fix throws.
      LogInfo.end_track();
    }
  }

  /**
   * Fix BR tags by wrapping each part in P tag instead
   *
   * The direct children of parent are split at every direct BR child; each
   * segment (possibly empty, e.g. when a BR is the first or last child) is
   * moved into a new P element and the BR elements themselves are dropped.
   *
   * @param parent the element whose direct BR children should be replaced
   */
  private void fixBR(Element parent) {
    // Snapshot the children first, since the live child list changes as
    // nodes are detached.
    List<Node> childNodes = new ArrayList<>(parent.childNodes());
    for (Node child : childNodes) {
      child.remove();
    }
    Element currentChild = document.createElement("p");
    for (Node node : childNodes) {
      // Compare case-insensitively: a BR that is found by the tag lookup in
      // fixAllBRs but not recognized here would be wrapped instead of removed
      // and the caller's loop would never terminate.
      if (node instanceof Element && "br".equalsIgnoreCase(((Element) node).tagName())) {
        parent.appendChild(currentChild);
        currentChild = document.createElement("p");
      } else {
        currentChild.appendChild(node);
      }
    }
    parent.appendChild(currentChild);
  }
}