package edu.isi.bmkeg.lapdf.xml;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;

import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import edu.isi.bmkeg.lapdf.model.ChunkBlock;
import edu.isi.bmkeg.lapdf.model.LapdfDocument;
import edu.isi.bmkeg.lapdf.model.PageBlock;
import edu.isi.bmkeg.lapdf.model.WordBlock;
import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering;
import edu.isi.bmkeg.lapdf.model.spatial.SpatialEntity;

/**
 * Serializes a {@link LapdfDocument} to an XML file as a nested
 * {@code Document > Page > Chunk > Word} structure.
 *
 * <p>Each page, chunk and word element carries its coordinates as
 * {@code x1}/{@code y1}/{@code x2}/{@code y2} attributes; pages and chunks
 * also carry a {@code type}, pages additionally carry their number and
 * chunk/word counts, and words carry their font and font style. The text of
 * each word is written as the character content of its {@code Word} element.
 */
public class SpatialXMLWriter implements XMLWriter {

	private static final String NEWLINE = System.getProperty("line.separator");
	private static final String ENCODING = "UTF-8";

	private static final String ELEMENT_NAME_DOCUMENT = "Document";
	private static final String ELEMENT_NAME_PAGE = "Page";
	private static final String ELEMENT_NAME_CHUNK = "Chunk";
	private static final String ELEMENT_NAME_WORD = "Word";

	private static final String BLOCK_ATTRIBUTE_X1 = "x1";
	private static final String BLOCK_ATTRIBUTE_X2 = "x2";
	private static final String BLOCK_ATTRIBUTE_Y1 = "y1";
	private static final String BLOCK_ATTRIBUTE_Y2 = "y2";
	private static final String BLOCK_ATTRIBUTE_TYPE = "type";

	private static final String PAGE_ATTRIBUTE_PAGENUMBER = "pageNumber";
	private static final String PAGE_ATTRIBUTE_CHUNK_COUNT = "chunkCount";
	private static final String PAGE_ATTRIBUTE_WORD_COUNT = "wordCount";

	private static final String WORD_ATTRIBUTE_WORD_FONT = "font";
	private static final String WORD_ATTRIBUTE_WORD_STYLE = "style";

	private static final String ELEMENT_NAME_TYPE = "type";

	/**
	 * Writes the given document to {@code outputFilename} as indented UTF-8 XML.
	 *
	 * <p>Pages are requested from the document by 1-based index, from 1 up to
	 * and including {@code document.getTotalNumberOfPages()}. I/O and SAX
	 * failures are not propagated; their stack traces are printed and the
	 * method returns. The output stream is closed in every case, including
	 * when serialization fails part-way through.
	 *
	 * @param document       the document to serialize
	 * @param outputFilename path of the file to create or overwrite
	 */
	public void write(LapdfDocument document, String outputFilename) {
		FileOutputStream xmlOutputFileStream = null;
		try {
			xmlOutputFileStream = new FileOutputStream(outputFilename);
			OutputFormat xmlOutputFormat = new OutputFormat("XML", ENCODING, true);
			XMLSerializer serializer = new XMLSerializer(xmlOutputFileStream, xmlOutputFormat);
			ContentHandler handler = serializer.asContentHandler();

			handler.startDocument();
			handler.startElement("", "", ELEMENT_NAME_DOCUMENT, new AttributesImpl());

			int totalNumberOfPages = document.getTotalNumberOfPages();
			for (int i = 1; i <= totalNumberOfPages; i++) {
				writePage(handler, document.getPage(i));
			}

			handler.endElement("", "", ELEMENT_NAME_DOCUMENT);
			handler.endDocument();
		} catch (FileNotFoundException e) {
			// The output file could not be opened for writing.
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		} finally {
			// Always release the file handle, even if serialization failed.
			if (xmlOutputFileStream != null) {
				try {
					xmlOutputFileStream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/** Emits one {@code Page} element together with all of its chunks. */
	private void writePage(ContentHandler handler, PageBlock page) throws SAXException {
		AttributesImpl pageAttribute = new AttributesImpl();
		addAttribute(pageAttribute, BLOCK_ATTRIBUTE_X1, page.getMargin()[0]);
		addAttribute(pageAttribute, BLOCK_ATTRIBUTE_Y1, page.getMargin()[1]);
		addAttribute(pageAttribute, BLOCK_ATTRIBUTE_X2, page.getMargin()[2]);
		addAttribute(pageAttribute, BLOCK_ATTRIBUTE_Y2, page.getMargin()[3]);
		addAttribute(pageAttribute, BLOCK_ATTRIBUTE_TYPE, page.getType());
		addAttribute(pageAttribute, PAGE_ATTRIBUTE_CHUNK_COUNT, page.getAllChunkBlocks(null).size());
		addAttribute(pageAttribute, PAGE_ATTRIBUTE_PAGENUMBER, page.getPageNumber());
		addAttribute(pageAttribute, PAGE_ATTRIBUTE_WORD_COUNT, page.getAllWordBlocks(null).size());
		handler.startElement("", "", ELEMENT_NAME_PAGE, pageAttribute);

		List<ChunkBlock> chunks = page.getAllChunkBlocks(SpatialOrdering.COLUMN_AWARE_MIXED_MODE);
		for (ChunkBlock chunk : chunks) {
			writeChunk(handler, page, chunk);
		}

		handler.endElement("", "", ELEMENT_NAME_PAGE);
	}

	/** Emits one {@code Chunk} element and the words the page reports inside it. */
	private void writeChunk(ContentHandler handler, PageBlock page, ChunkBlock chunk)
			throws SAXException {
		AttributesImpl chunkAttribute = new AttributesImpl();
		addAttribute(chunkAttribute, BLOCK_ATTRIBUTE_X1, chunk.getX1());
		addAttribute(chunkAttribute, BLOCK_ATTRIBUTE_Y1, chunk.getY1());
		addAttribute(chunkAttribute, BLOCK_ATTRIBUTE_X2, chunk.getX2());
		addAttribute(chunkAttribute, BLOCK_ATTRIBUTE_Y2, chunk.getY2());
		addAttribute(chunkAttribute, BLOCK_ATTRIBUTE_TYPE, chunk.getType());
		handler.startElement("", "", ELEMENT_NAME_CHUNK, chunkAttribute);

		List<SpatialEntity> words = page.containsByType(chunk,
				SpatialOrdering.MIXED_MODE, WordBlock.class);
		for (SpatialEntity entity : words) {
			writeWord(handler, (WordBlock) entity);
		}

		handler.endElement("", "", ELEMENT_NAME_CHUNK);
	}

	/** Emits one {@code Word} element whose character content is the word text. */
	private void writeWord(ContentHandler handler, WordBlock word) throws SAXException {
		AttributesImpl wordAttribute = new AttributesImpl();
		addAttribute(wordAttribute, BLOCK_ATTRIBUTE_X1, word.getX1());
		addAttribute(wordAttribute, BLOCK_ATTRIBUTE_Y1, word.getY1());
		addAttribute(wordAttribute, BLOCK_ATTRIBUTE_X2, word.getX2());
		addAttribute(wordAttribute, BLOCK_ATTRIBUTE_Y2, word.getY2());
		addAttribute(wordAttribute, WORD_ATTRIBUTE_WORD_FONT, word.getFont());
		addAttribute(wordAttribute, WORD_ATTRIBUTE_WORD_STYLE, word.getFontStyle());
		handler.startElement("", "", ELEMENT_NAME_WORD, wordAttribute);

		// Convert once; the array supplies both the data and its length.
		char[] text = word.getWord().toCharArray();
		handler.characters(text, 0, text.length);

		handler.endElement("", "", ELEMENT_NAME_WORD);
	}

	/**
	 * Appends a namespace-less {@code CDATA} attribute whose value is the
	 * string form of {@code value} (a {@code null} value is written as "null").
	 */
	private static void addAttribute(AttributesImpl attributes, String name, Object value) {
		attributes.addAttribute("", "", name, "CDATA", String.valueOf(value));
	}

}