package edu.berkeley.cs.nlp.ocular.data; import static edu.berkeley.cs.nlp.ocular.data.textreader.Charset.SPACE; import java.awt.image.BufferedImage; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import edu.berkeley.cs.nlp.ocular.data.textreader.Charset; import edu.berkeley.cs.nlp.ocular.image.ImageUtils; import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType; import edu.berkeley.cs.nlp.ocular.image.Visualizer; import edu.berkeley.cs.nlp.ocular.preprocessing.Binarizer; import edu.berkeley.cs.nlp.ocular.preprocessing.Cropper; import edu.berkeley.cs.nlp.ocular.preprocessing.LineExtractor; import edu.berkeley.cs.nlp.ocular.preprocessing.Straightener; import edu.berkeley.cs.nlp.ocular.util.FileUtil; import static edu.berkeley.cs.nlp.ocular.util.CollectionHelper.last; import tberg.murphy.fileio.f; /** * A document that reads a file only as it is needed (and then stores * the contents in memory for later use). * * @author Dan Garrette (dhgarrette@gmail.com) */ public abstract class LazyRawImageDocument implements Document { private final String inputPath; private final int lineHeight; private final double binarizeThreshold; private final boolean crop; private PixelType[][][] observations = null; private String extractedLinesPath = null; private String[][] diplomaticTextLines = null; private boolean diplomaticTextLinesLoaded = false; private String[][] normalizedTextLines = null; private boolean normalizedTextLinesLoaded = false; private List<String> normalizedText = null; private boolean normalizedTextLoaded = false; public LazyRawImageDocument(String inputPath, int lineHeight, double binarizeThreshold, boolean crop, String extractedLinesPath) { this.inputPath = inputPath; this.lineHeight = lineHeight; this.binarizeThreshold = binarizeThreshold; this.crop = crop; this.extractedLinesPath = extractedLinesPath; } final public PixelType[][][] loadLineImages() { if (observations == null) { // file has already been loaded in this Ocular run if (extractedLinesPath == null) { // no pre-extraction path given observations = doLoadObservationsFromFile(); // load data from original file } else { // a pre-extraction path was given if (extractionFilesPresent()) { // pre-extracted lines exist at the specified location observations = doLoadObservationsFromLineExtractionFiles(); // load data from pre-extracted line files } else { // pre-extraction has not been done yet; do it now. observations = doLoadObservationsFromFile(); // load data from original file writeExtractedLineImagesAggregateFile(); writeIndividualExtractedLineImageFiles(); // write extracted lines to files so they don't have to be re-extracted next time } } } return observations; } private PixelType[][][] doLoadObservationsFromFile() { BufferedImage bi = doLoadBufferedImage(); double[][] levels = ImageUtils.getLevels(bi); double[][] rotLevels = Straightener.straighten(levels); double[][] cropLevels = crop ? Cropper.crop(rotLevels, binarizeThreshold) : rotLevels; Binarizer.binarizeGlobal(binarizeThreshold, cropLevels); List<double[][]> lines = LineExtractor.extractLines(cropLevels); PixelType[][][] loadedObservations = new PixelType[lines.size()][][]; for (int i = 0; i < lines.size(); ++i) { loadedObservations[i] = imageToObservation(ImageUtils.makeImage(lines.get(i))); } return loadedObservations; } private PixelType[][][] doLoadObservationsFromLineExtractionFiles() { System.out.println("Loading pre-extracted line images from " + leLineDir()); final Pattern pattern = Pattern.compile("line(\\d+)\\." + ext()); File[] lineImageFiles = new File(leLineDir()).listFiles(new FilenameFilter() { public boolean accept(File dir, String name) { return pattern.matcher(name).matches(); } }); if (lineImageFiles == null) throw new RuntimeException("lineImageFiles is null"); if (lineImageFiles.length == 0) throw new RuntimeException("lineImageFiles.length == 0"); Arrays.sort(lineImageFiles); PixelType[][][] loadedObservations = new PixelType[lineImageFiles.length][][]; for (int i = 0; i < lineImageFiles.length; ++i) { Matcher m = pattern.matcher(lineImageFiles[i].getName()); if (m.find() && Integer.valueOf(m.group(1)) != i) throw new RuntimeException("Trying to load lines from "+leLineDir()+" but the file for line "+i+" is missing (found "+m.group(1)+" instead)."); String lineImageFile = fullLeLinePath(i); System.out.println(" Loading pre-extracted line from " + lineImageFile); try { loadedObservations[i] = imageToObservation(f.readImage(lineImageFile)); } catch (Exception e) { throw new RuntimeException("Couldn't read line image from: " + lineImageFile, e); } } return loadedObservations; } private PixelType[][] imageToObservation(BufferedImage image) { if (lineHeight >= 0) { return ImageUtils.getPixelTypes(ImageUtils.resampleImage(image, lineHeight)); } else { return ImageUtils.getPixelTypes(image); } } /** * Write all extracted lines to a single file for easy viewing * * @multilineExtractionImagePath The path of the file to write to. */ public void writeExtractedLineImagesAggregateFile(String multilineExtractionImagePath) { System.out.println("Writing file line-extraction image to: " + multilineExtractionImagePath); new File(multilineExtractionImagePath).getAbsoluteFile().getParentFile().mkdirs(); f.writeImage(multilineExtractionImagePath, Visualizer.renderLineExtraction(observations)); } /** * Write all extracted lines to a single file for easy viewing */ public void writeExtractedLineImagesAggregateFile() { writeExtractedLineImagesAggregateFile(multilineExtractionImagePath()); } public void writeIndividualExtractedLineImageFiles() { new File(leLineDir()).mkdirs(); for (int l = 0; l < observations.length; ++l) { PixelType[][] observationLine = observations[l]; String linePath = fullLeLinePath(l); System.out.println(" Writing individual line-extraction image to: " + linePath); f.writeImage(linePath, Visualizer.renderLineExtraction(observationLine)); } } private boolean extractionFilesPresent() { File f = new File(fullLeLinePath(0)); System.out.println("Looking for extractions in ["+f+"]. "+(f.exists() ? "Found" : "Not found")+"."); return f.exists(); } private String[][] loadTextFile(File textFile, String name) { if (textFile.exists()) { System.out.println("Evaluation "+name+" text found at " + textFile); List<List<String>> textList = new ArrayList<List<String>>(); try { BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), "UTF-8")); while (in.ready()) { textList.add(Charset.readNormalizeCharacters(in.readLine())); } in.close(); } catch (IOException e) { throw new RuntimeException(e); } String[][] textLines = new String[textList.size()][]; for (int i = 0; i < textLines.length; ++i) { List<String> line = textList.get(i); textLines[i] = line.toArray(new String[line.size()]); } return textLines; } else { System.out.println("No evaluation "+name+" text found at " + textFile + " (This is only a problem if you were trying to provide a gold "+name+" transcription to check accuracy.)"); return null; } } public String[][] loadDiplomaticTextLines() { if (!diplomaticTextLinesLoaded) { diplomaticTextLines = loadTextFile(new File(baseName().replaceAll("\\.[^.]*$", "") + ".txt"), "diplomatic"); } diplomaticTextLinesLoaded = true; return diplomaticTextLines; } public String[][] loadNormalizedTextLines() { if (!normalizedTextLinesLoaded) { normalizedTextLines = loadTextFile(new File(baseName().replaceAll("\\.[^.]*$", "") + "_normalized.txt"), "normalized"); } normalizedTextLinesLoaded = true; return normalizedTextLines; } public List<String> loadNormalizedText() { if (!normalizedTextLoaded) { String[][] normalizedTextLines = loadNormalizedTextLines(); if (normalizedTextLines != null) { normalizedText = new ArrayList<String>(); for (String[] lineChars : loadNormalizedTextLines()) { for (String c : lineChars) { if (SPACE.equals(c) && (normalizedText.isEmpty() || SPACE.equals(last(normalizedText)))) { // do nothing -- collapse spaces } else { normalizedText.add(c); } } if (!normalizedText.isEmpty() && !SPACE.equals(last(normalizedText))) { normalizedText.add(SPACE); } } if (SPACE.equals(last(normalizedText))) { normalizedText.remove(normalizedText.size()-1); } } } normalizedTextLoaded = true; return normalizedText; } private String multilineExtractionImagePath() { return fullLePreExt() + "." + ext(); } private String leLineDir() { return fullLePreExt() + "_" + ext(); } private String fileParent() { return FileUtil.removeCommonPathPrefixOfParents(new File(inputPath), file())._2; } private String fullLePreExt() { return extractedLinesPath + "/" + fileParent() + "/" + preext() + "-line_extract"; } private String fullLeLinePath(int lineNum) { return String.format(leLineDir() + "/line%02d." + ext(), lineNum); } abstract protected File file(); abstract protected BufferedImage doLoadBufferedImage(); abstract protected String preext(); abstract protected String ext(); }