package de.uni_goettingen.sub.commons.ocr.abbyy.server; /* © 2010, SUB Göttingen. All rights reserved. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ import java.io.IOException; import java.io.OutputStream; import java.math.BigInteger; import java.net.URI; import java.net.URISyntaxException; import java.util.HashMap; import java.util.Locale; import java.util.Map; import java.util.Set; import org.apache.xmlbeans.XmlOptions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.abbyy.recognitionServer10Xml.xmlTicketV1.ExportParams; import com.abbyy.recognitionServer10Xml.xmlTicketV1.ImageProcessingParams; import com.abbyy.recognitionServer10Xml.xmlTicketV1.InputFile; import com.abbyy.recognitionServer10Xml.xmlTicketV1.MSWordExportSettings; import com.abbyy.recognitionServer10Xml.xmlTicketV1.OutputFileFormatSettings; import com.abbyy.recognitionServer10Xml.xmlTicketV1.PDFAExportSettings; import com.abbyy.recognitionServer10Xml.xmlTicketV1.PDFExportSettings; import com.abbyy.recognitionServer10Xml.xmlTicketV1.RecognitionParams; import com.abbyy.recognitionServer10Xml.xmlTicketV1.TextExportSettings; import com.abbyy.recognitionServer10Xml.xmlTicketV1.XMLExportSettings; import com.abbyy.recognitionServer10Xml.xmlTicketV1.XmlTicketDocument; import com.abbyy.recognitionServer10Xml.xmlTicketV1.XmlTicketDocument.XmlTicket; import de.uni_goettingen.sub.commons.ocr.api.OcrFormat; import de.uni_goettingen.sub.commons.ocr.api.OcrPriority; import de.uni_goettingen.sub.commons.ocr.api.OcrTextType; import de.unigoettingen.sub.commons.ocr.util.abbyy.ToAbbyyMapper; public class AbbyyTicket { private final static Logger logger = LoggerFactory.getLogger(AbbyyTicket.class); /** The namespace used for the AbbyyTicket files. */ private final static String NAMESPACE = "http://www.abbyy.com/RecognitionServer1.0_xml/XmlTicket-v1.xsd"; /** * A Map containing predefined fragments (read settings) for different * formats */ private final Map<OcrFormat, OutputFileFormatSettings> FORMAT_FRAGMENTS; private static String encoding = "UTF8"; private Long processTimeout = null; private XmlOptions opts = new XmlOptions(); private AbbyyProcess ocrProcess; private URI remoteInputFolder; private URI remoteErrorFolder; { opts.setSavePrettyPrint(); Map<String, String> namespaces = new HashMap<String, String>(); namespaces.put("", NAMESPACE); opts.setSaveImplicitNamespaces(namespaces); opts.setUseDefaultNamespace(); FORMAT_FRAGMENTS = new HashMap<OcrFormat, OutputFileFormatSettings>(); PDFExportSettings pdfSettings = PDFExportSettings.Factory .newInstance(opts); pdfSettings.setPictureResolution(BigInteger.valueOf(300)); pdfSettings.setQuality(BigInteger.valueOf(50)); pdfSettings.setUseImprovedCompression(true); pdfSettings.setExportMode("ImageOnText"); FORMAT_FRAGMENTS.put(OcrFormat.PDF, (OutputFileFormatSettings) pdfSettings .changeType(OutputFileFormatSettings.type)); PDFAExportSettings pdfaSettings = PDFAExportSettings.Factory .newInstance(opts); pdfaSettings.setPictureResolution(BigInteger.valueOf(300)); pdfaSettings.setQuality(BigInteger.valueOf(50)); pdfaSettings.setUseImprovedCompression(true); pdfaSettings.setExportMode("ImageOnText"); FORMAT_FRAGMENTS.put(OcrFormat.PDFA, (OutputFileFormatSettings) pdfaSettings .changeType(OutputFileFormatSettings.type)); TextExportSettings txtSettings = TextExportSettings.Factory .newInstance(opts); txtSettings.setEncodingType(encoding); FORMAT_FRAGMENTS.put(OcrFormat.TXT, (OutputFileFormatSettings) txtSettings .changeType(OutputFileFormatSettings.type)); MSWordExportSettings docSettings = MSWordExportSettings.Factory .newInstance(opts); FORMAT_FRAGMENTS.put(OcrFormat.DOC, (OutputFileFormatSettings) docSettings .changeType(OutputFileFormatSettings.type)); FORMAT_FRAGMENTS.put(OcrFormat.HTML, null); FORMAT_FRAGMENTS.put(OcrFormat.XHTML, null); } public AbbyyTicket(AbbyyProcess initProcess) { ocrProcess = initProcess; } public void write(final OutputStream out) throws IOException { if (out == null) { logger.error("OutputStream is not set! (" + ocrProcess.getName() + ")"); throw new IllegalStateException("OutputStream is not set!"); } if (!ocrProcess.hasImagesAndOutputs()) { logger.error("No images or outputs given! (" + ocrProcess.getName() + ")"); throw new IllegalStateException("No images or outputs given!"); } XMLExportSettings xmlSettings = XMLExportSettings.Factory.newInstance(opts); // coordinates for each character in output abbyy xml xmlSettings.setWriteCharactersFormatting(true); xmlSettings.setWriteCharAttributes(true); // We have to change the type here, or else the server does not accept // the ticket containing the xsi:type attribute. // In effect, the ticket cannot be validated. FORMAT_FRAGMENTS.put(OcrFormat.XML, (OutputFileFormatSettings) xmlSettings .changeType(OutputFileFormatSettings.type)); XmlTicketDocument ticketDoc = XmlTicketDocument.Factory .newInstance(opts); XmlTicket ticket = ticketDoc.addNewXmlTicket(); if (processTimeout != null) { ticket.setOCRTimeout(BigInteger.valueOf(processTimeout)); } OcrPriority priority = ocrProcess.getPriority(); if (priority != null) { ticket.setPriority(ToAbbyyMapper.getPriority(priority)); } else { ticket.setPriority("Normal"); } for (String imageFileName : ocrProcess.getRemoteImageNames()) { InputFile inputFile = ticket.addNewInputFile(); inputFile.setName(imageFileName); } ImageProcessingParams imageProcessingParams = ticket.addNewImageProcessingParams(); imageProcessingParams.setDeskew(false); RecognitionParams recognitionParams = ticket.addNewRecognitionParams(); recognitionParams.setRecognitionQuality(ToAbbyyMapper.getQuality(ocrProcess.getQuality())); OcrTextType textType = ocrProcess.getTextType(); if (textType != null) { recognitionParams.addTextType(ToAbbyyMapper.getTextType(textType)); } Set<Locale> langs = ocrProcess.getLanguages(); for (Locale l : langs) { recognitionParams.addLanguage(ToAbbyyMapper.getLanguage(l)); } ExportParams exportParams = ticket.addNewExportParams(); exportParams.setDocumentSeparationMethod("MergeIntoSingleFile"); int i = 0; for (OcrFormat outputFormat : ocrProcess.getAllOutputFormats()) { if (outputFormat == OcrFormat.METADATA) { continue; } OutputFileFormatSettings exportFormat = FORMAT_FRAGMENTS.get(outputFormat); if (exportFormat == null) { logger.warn("The server can't handle the format " + outputFormat.toString() + ", ignoring it. (" + ocrProcess.getName() + ")"); continue; } exportFormat.setOutputFlowType("SharedFolder"); exportFormat.setOutputFileFormat(ToAbbyyMapper.getOutputFormat(outputFormat)); String[] localUriParts = ocrProcess.getOutputUriForFormat(outputFormat).toString().split("/"); String fileName = localUriParts[localUriParts.length - 1]; exportFormat.setNamingRule(fileName); exportFormat.setOutputLocation(ocrProcess.getWindowsPathForServer()); exportParams.addNewExportFormat(); exportParams.setExportFormatArray(i, exportFormat); i++; } // goes into the global temp directory ticketDoc.save(out, opts); } public Long getProcessTimeout() { return processTimeout; } public void setProcessTimeout(Long newTimeout) { processTimeout = newTimeout; } public void setRemoteInputFolder(URI newFolder) { remoteInputFolder = newFolder; } public URI getRemoteInputUri() throws URISyntaxException { return new URI(remoteInputFolder.toString() + ocrProcess.getName() + ".xml"); } public void setRemoteErrorFolder(URI newFolder) { remoteErrorFolder = newFolder; } public URI getRemoteErrorUri() throws URISyntaxException { return new URI(remoteErrorFolder.toString() + ocrProcess.getName() + ".xml"); } }