package de.unigoettingen.sub.commons.ocr.servlet;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import de.unigoettingen.sub.commons.ocr.util.FileAccess;
import de.unigoettingen.sub.ocr.controller.OcrEngineStarter;
import de.unigoettingen.sub.ocr.controller.OcrParameters;

/**
 * Servlet that runs OCR on a range of numbered TIFF images and returns the
 * recognized text embedded in a small HTML page.
 *
 * <p>Request parameters read by {@link #doGet}:
 * <ul>
 * <li>{@code path} (required) - directory of the images, relative to the
 * {@code sourceRootDir} init parameter</li>
 * <li>{@code imgrange} (required) - a single image number ({@code 5}) or an
 * inclusive range ({@code 1-10})</li>
 * <li>{@code lang} (optional) - OCR language, falls back to the
 * {@code defaultLanguage} init parameter</li>
 * </ul>
 *
 * <p>The root directories are concatenated with the relative path as-is, so the
 * {@code sourceRootDir} and {@code tempRootDir} init parameters are presumably
 * expected to end with a path separator - verify against the web.xml.
 */
public class SimpleOcrServlet extends HttpServlet {

    private static final long serialVersionUID = -6874162548956424669L;

    private static final String TITLE = "GDZ Simple-OCR 0.0.3 - Java";

    /** Image file names consist of this many zero-padded digits plus the suffix. */
    private static final String FILENAME_ZEROS = "00000000";
    private static final String FILENAME_SUFFIX = ".tif";

    private static final String MISSING_PATH_MESSAGE = "Fehlende Eingabedaten, z. B. ?path=myimages/ocr";
    private static final String MISSING_RANGE_MESSAGE = "Fehlende Eingabedaten, z. B. ?imgrange=1-10";
    private static final String INVALID_RANGE_MESSAGE = "Ungültige Eingabedaten, z. B. ?imgrange=1-10";

    private String defaultLanguage;
    private String tempRootDir;
    private String sourceRootDir;
    private String fileExtension;

    /**
     * Creates the OCR engine starter. Overridable so that unit tests can
     * substitute a test double.
     *
     * @return a new engine starter
     */
    protected OcrEngineStarter getEngineStarter() {
        return new OcrEngineStarter();
    }

    /**
     * Creates the file access helper. Overridable so that unit tests can
     * substitute a test double.
     *
     * @return a new file access helper
     */
    protected FileAccess getFileAccess() {
        return new FileAccess();
    }

    /**
     * Reads the four mandatory init parameters from the servlet configuration.
     *
     * @throws ServletException if one of the parameters is not configured
     */
    @Override
    public void init() throws ServletException {
        defaultLanguage = getInitParam("defaultLanguage");
        tempRootDir = getInitParam("tempRootDir");
        sourceRootDir = getInitParam("sourceRootDir");
        fileExtension = getInitParam("fileExtension");
    }

    /**
     * Looks up a mandatory init parameter.
     *
     * @param paramName name of the init parameter
     * @return the configured value, never {@code null}
     * @throws ServletException if the parameter is missing
     */
    private String getInitParam(String paramName) throws ServletException {
        // No toString() on the result: it would raise a NullPointerException
        // for a missing parameter before the check below can report it.
        String initParam = getServletConfig().getInitParameter(paramName);
        if (initParam == null) {
            throw new ServletException("Kein Init-Parameter in der web.xml: " + paramName);
        }
        return initParam;
    }

    /**
     * Copies the requested images into the temp directory, runs the OCR engine
     * on them, writes the resulting text to the response and finally removes
     * the temp directory again.
     *
     * @param request the request carrying {@code path}, {@code imgrange} and optionally {@code lang}
     * @param response the response that receives the HTML page
     * @throws IOException if file access or writing the response fails
     * @throws ServletException if a required parameter is missing or invalid
     */
    @Override
    public void doGet(HttpServletRequest request, HttpServletResponse response)
            throws IOException, ServletException {
        String lang = request.getParameter("lang");
        if (lang == null) {
            lang = defaultLanguage;
        }

        String restOfSourceDir = request.getParameter("path");
        if (restOfSourceDir == null) {
            throw new ServletException(MISSING_PATH_MESSAGE);
        }
        restOfSourceDir = normalizePath(restOfSourceDir);

        String sourceImagesDir = sourceRootDir + restOfSourceDir;
        String tempImagesDir = tempRootDir + restOfSourceDir;
        // Images and results share the same temp directory.
        String tempResultsDir = tempImagesDir;

        String imagesRange = request.getParameter("imgrange");
        if (imagesRange == null) {
            throw new ServletException(MISSING_RANGE_MESSAGE);
        }
        List<String> imageNames = createFileList(imagesRange);

        FileAccess fileAccess = getFileAccess();
        try {
            for (String imageName : imageNames) {
                fileAccess.copyFile(new File(sourceImagesDir + imageName), new File(tempImagesDir + imageName));
            }

            OcrParameters params = new OcrParameters();
            params.inputFolder = tempImagesDir;
            params.outputFolder = tempResultsDir;
            params.inputLanguages = new String[] { lang };
            params.inputTextType = "NORMAL";
            params.ocrEngine = "abbyy-multiuser";
            params.outputFormats = new String[] { "TXT" };
            params.priority = "1";

            OcrEngineStarter engineStarter = getEngineStarter();
            engineStarter.startOcrWithParams(params);

            // The result file is looked up under the name of the innermost source directory.
            File ocrTextResult = new File(tempResultsDir + getJobName(sourceImagesDir) + fileExtension);
            fillResponse(response, ocrTextResult);
        } finally {
            // Clean up even when copying or OCR failed, so copied images do not pile up.
            fileAccess.deleteDir(new File(tempImagesDir));
        }
    }

    /**
     * Turns the user-supplied relative path into the form {@code a/b/} using
     * the platform separator: no leading separator, exactly one trailing
     * separator, runs of slashes and backslashes collapsed.
     *
     * @param path raw value of the {@code path} request parameter
     * @return the normalized relative path, never empty
     * @throws ServletException if the path is empty or contains a {@code ..} segment
     */
    private String normalizePath(String path) throws ServletException {
        StringBuilder normalized = new StringBuilder();
        for (String segment : path.split("[/\\\\]+")) {
            if (segment.length() == 0) {
                // A leading separator yields an empty first token.
                continue;
            }
            if ("..".equals(segment)) {
                // The path comes straight from the request; never let it leave the root directories.
                throw new ServletException("Ungültiger Pfad: '..' ist nicht erlaubt");
            }
            normalized.append(segment).append(File.separator);
        }
        if (normalized.length() == 0) {
            // An empty path would make the temp directory equal to the temp
            // root, which is deleted at the end of the request.
            throw new ServletException(MISSING_PATH_MESSAGE);
        }
        return normalized.toString();
    }

    /**
     * Expands the {@code imgrange} parameter into image file names.
     *
     * @param range a single number ({@code 5}) or an inclusive range ({@code 1-10})
     * @return the file names in ascending order
     * @throws ServletException if the value is malformed or the start exceeds the end
     */
    private List<String> createFileList(String range) throws ServletException {
        List<String> files = new ArrayList<String>();
        if (range.contains("-")) {
            // Limit -1 keeps trailing empty tokens so that "5-" is rejected instead of overrunning the array.
            String[] bounds = range.split("-", -1);
            if (bounds.length != 2) {
                throw new ServletException(INVALID_RANGE_MESSAGE);
            }
            int from = parseImageNumber(bounds[0]);
            int to = parseImageNumber(bounds[1]);
            if (from > to) {
                throw new ServletException("Startwert größer als Endwert");
            }
            for (int i = from; i <= to; i++) {
                files.add(genFilename(i));
            }
        } else {
            files.add(genFilename(parseImageNumber(range)));
        }
        return files;
    }

    /**
     * Parses one image number. Only one to eight decimal digits are accepted;
     * leading zeros are read as decimal (not octal), so {@code 010} means ten.
     *
     * @param text user-supplied number, surrounding whitespace is ignored
     * @return the parsed, non-negative number
     * @throws ServletException if the text is not a number that fits the file name pattern
     */
    private int parseImageNumber(String text) throws ServletException {
        String digits = text.trim();
        if (!digits.matches("\\d{1," + FILENAME_ZEROS.length() + "}")) {
            throw new ServletException(INVALID_RANGE_MESSAGE);
        }
        // At most eight digits, so this cannot overflow or fail.
        return Integer.parseInt(digits);
    }

    /**
     * Builds the file name for an image number, e.g. {@code 12} becomes
     * {@code 00000012.tif}.
     *
     * @param n image number between 0 and 99999999
     * @return the zero-padded file name
     */
    private String genFilename(int n) {
        String digits = Integer.toString(n);
        return FILENAME_ZEROS.substring(digits.length()) + digits + FILENAME_SUFFIX;
    }

    /**
     * Derives the job name from the source directory.
     *
     * @param sourceImagesDir full path of the directory holding the source images
     * @return the name of the innermost directory
     */
    private String getJobName(String sourceImagesDir) {
        return new File(sourceImagesDir).getName();
    }

    /**
     * Writes an HTML page containing the OCR text to the response.
     *
     * @param response the response to write to
     * @param ocrTextResult text file produced by the OCR engine
     * @throws IOException if the result cannot be read or the response cannot be written
     */
    private void fillResponse(HttpServletResponse response, File ocrTextResult) throws IOException {
        // NOTE(review): no charset is set here, so the container default applies - confirm
        // that it matches the characters the OCR engine can produce.
        response.setContentType("text/html");
        PrintWriter out = response.getWriter();
        out.println("<html><head>");
        out.println("<title>" + TITLE + "</title></head><body>");
        out.println("<h1>Ergebnis:</h1><hr>");
        out.println("<pre>");
        String fileContents = getFileAccess().readFileToString(ocrTextResult);
        // Recognized text may contain '<' or '&'; escape it so it cannot break or inject markup.
        out.println(escapeHtml(fileContents));
        out.println("</pre><hr/></body></html>");
    }

    /**
     * Escapes the characters that are significant in HTML text content.
     *
     * @param text raw text, may be {@code null}
     * @return the escaped text, or an empty string for {@code null}
     */
    private static String escapeHtml(String text) {
        if (text == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            default:
                escaped.append(c);
                break;
            }
        }
        return escaped.toString();
    }
}