package de.unigoettingen.sub.commons.ocr.util.merge; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.util.HashMap; import java.util.List; import java.util.Map; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLOutputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import javax.xml.stream.XMLStreamWriter; import org.w3c.tidy.Tidy; public class HocrMerger extends Merger { @Override public void merge(List<InputStream> inputs, OutputStream output) { try { int fileCounter = 1; Boolean docStarted = false; Boolean htmlStarted = false; Boolean insideHead = false; Map<String, String> nsPrefixes = new HashMap<String, String>(); XMLOutputFactory outFactory = XMLOutputFactory.newInstance(); XMLStreamWriter writer = outFactory.createXMLStreamWriter(output); XMLInputFactory inFactory = XMLInputFactory.newInstance(); inFactory.setProperty("javax.xml.stream.supportDTD", Boolean.FALSE); for (InputStream html : inputs) { InputStream xhtml = tidy(html); XMLStreamReader parser = inFactory.createXMLStreamReader(xhtml); while (parser.hasNext()) { int event = parser.next(); if (event == XMLStreamConstants.START_DOCUMENT) { if (!docStarted) { writer.writeStartDocument("UTF-8", "1.0"); docStarted = true; } } else if (event == XMLStreamConstants.COMMENT) { writer.writeComment(parser.getText()); } else if (event == XMLStreamConstants.START_ELEMENT) { String elementName = parser.getLocalName(); if (elementName.equals("head")) { insideHead = true; } boolean isFirstHtml = elementName.equals("html") && fileCounter == 1; boolean isFirstBody = elementName.equals("body") && fileCounter == 1; boolean ignoreMode = fileCounter > 1 && insideHead || elementName.equals("html") || elementName.equals("body"); if (!ignoreMode || isFirstHtml || isFirstBody) { for (int i = 0; i < parser.getNamespaceCount(); i++) { String prefix = parser.getNamespacePrefix(i); String uri = parser.getNamespaceURI(i); if (prefix == null) { prefix = "default"; writer.setDefaultNamespace(parser .getNamespaceURI(i)); } else { writer.setPrefix(prefix, uri); } nsPrefixes.put(prefix, uri); } if (!htmlStarted || !parser.getLocalName().equalsIgnoreCase( "html")) { if (parser.getNamespaceURI() != null) { writer.writeStartElement( parser.getNamespaceURI(), parser.getLocalName()); } else { writer.writeStartElement(parser.getLocalName()); } if (!htmlStarted) { writer.writeDefaultNamespace(nsPrefixes .get("default")); for (Map.Entry<String, String> entry : nsPrefixes.entrySet()) { String namespace = entry.getKey(); if (!namespace.equalsIgnoreCase("default")) { writer.writeNamespace(namespace, entry.getValue()); } } htmlStarted = true; } for (int i = 0; i < parser.getAttributeCount(); i++) { String name = parser.getAttributeLocalName(i); String value = parser.getAttributeValue(i); boolean isId = name.equals("id"); if (isId && value.equals("page_1")) { value = "page_" + fileCounter; } else if (isId && (value.startsWith("block_") || value.startsWith("line_") || value.startsWith("word_") || value .startsWith("xword_"))) { String[] parts = value.split("_"); value = parts[0] + "_" + fileCounter + "_" + (parts.length==3?parts[2]:parts[1]); } if (parser.getAttributeNamespace(i) != null) { writer.writeAttribute( parser.getAttributeNamespace(i), name, value); } else { writer.writeAttribute(name, value); } } } } } else if (event == XMLStreamConstants.CHARACTERS) { writer.writeCharacters(parser.getText()); } else if (event == XMLStreamConstants.END_ELEMENT) { String elementName = parser.getLocalName(); boolean isLastHtml = elementName.equals("html") && fileCounter == inputs.size(); boolean isLastBody = elementName.equals("body") && fileCounter == inputs.size(); boolean ignoreMode = fileCounter > 1 && insideHead || elementName.equals("html") || elementName.equals("body"); if (!ignoreMode || isLastHtml || isLastBody) { writer.writeEndElement(); } if (elementName.equals("head")) { insideHead = false; } } else if (event == XMLStreamConstants.END_DOCUMENT) { if (fileCounter == inputs.size()) { writer.writeEndDocument(); } } } parser.close(); fileCounter++; } writer.flush(); writer.close(); } catch(XMLStreamException e) { throw new IllegalStateException("Error while merging files.", e); } catch(IOException e) { throw new IllegalStateException("Error while merging files.", e); } } private InputStream tidy(InputStream html) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); Tidy tidy = new Tidy(); tidy.setXmlOut(true); tidy.setXHTML(true); tidy.setNumEntities(true); tidy.setQuiet(true); tidy.setShowWarnings(false); tidy.parse(html, out); return (InputStream) new ByteArrayInputStream(out.toByteArray()); } }