package info.aaronland.extruder; import java.util.List; import java.util.ArrayList; import java.lang.StringBuilder; import org.apache.commons.lang3.StringEscapeUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class Document { private static final Logger LOGGER = LoggerFactory.getLogger(Document.class); private ArrayList<String> blocks; private String title; public Document(String doc_text, String doc_title){ blocks = parseText(doc_text); title = doc_title; } public ArrayList<String> getBlocks(){ return this.blocks; } public String getTitle(){ return this.title; } public String toString(){ ArrayList<String> blocks = this.getBlocks(); StringBuilder sb = new StringBuilder(); for (Object obj : blocks) { sb.append(obj.toString()); sb.append("\n\n"); } return sb.toString(); } public String toHTML(){ ArrayList<String> blocks = this.getBlocks(); StringBuilder sb = new StringBuilder(); for (Object obj : blocks) { String html = "<p>" + StringEscapeUtils.escapeXml(obj.toString()) + "</p>"; sb.append(html); } return sb.toString(); } // Please make me better and if possible make the unwrapText method // in TikaResource redundant... (20130903/straup) private static ArrayList<String> parseText(String text){ String[] raw = text.split(System.getProperty("line.separator")); ArrayList<String> blocks = new ArrayList<String>(); String buffer = ""; for (String ln : raw){ ln = ln.trim(); blocks.add(ln); buffer = ""; } if (buffer.length() > 0){ blocks.add(buffer); } return blocks; } }