package edu.berkeley.cs.nlp.ocular.data;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import tberg.murphy.fileio.f;
import edu.berkeley.cs.nlp.ocular.image.ImageUtils;
import edu.berkeley.cs.nlp.ocular.image.ImageUtils.PixelType;

/**
 * Loads documents that are described by an index file: each non-blank line of
 * the index names a set of pre-cropped line images plus an optional text file.
 *
 * <p>Each index line holds at least five whitespace-separated fields:
 * <ol>
 *   <li>image path prefix (resolved against the index file's directory),</li>
 *   <li>image name suffix,</li>
 *   <li>text file path (resolved against the index file's directory),</li>
 *   <li>long-s flag, parsed with {@link Boolean#parseBoolean(String)},</li>
 *   <li>number of lines, parsed with {@link Integer#parseInt(String)}.</li>
 * </ol>
 * Any further fields on a line are ignored.
 *
 * @author Taylor Berg-Kirkpatrick (tberg@eecs.berkeley.edu)
 */
public class TextAndLineImagesLoader {

  /** Minimum number of whitespace-separated fields on an index line. */
  private static final int REQUIRED_FIELDS = 5;

  /**
   * A document whose line images are stored one per file, named
   * {@code imgPathPrefix + lineIndex + imgNameSuffix} with a zero-based index.
   */
  public static class TextAndLineImagesDocument implements Document {
    private final String imgPathPrefix;
    private final String imgNameSuffix;
    private final String textPath;
    private final boolean useLongS;
    private final int numLines;
    private final int lineHeight;

    /**
     * @param imgPathPrefix path prefix shared by every line image file
     * @param imgNameSuffix suffix appended after the line index
     * @param textPath path of the text file for this document
     * @param useLongS value reported by {@link #useLongS()}
     * @param numLines number of line images (and text lines) to load
     * @param lineHeight height passed to the resampler; a negative value
     *     disables resampling
     */
    public TextAndLineImagesDocument(String imgPathPrefix, String imgNameSuffix, String textPath,
        boolean useLongS, int numLines, int lineHeight) {
      this.imgPathPrefix = imgPathPrefix;
      this.imgNameSuffix = imgNameSuffix;
      this.textPath = textPath;
      this.useLongS = useLongS;
      this.numLines = numLines;
      this.lineHeight = lineHeight;
    }

    /**
     * Reads all {@code numLines} line images and converts them to pixel types.
     * When {@code lineHeight >= 0} each image is first passed through
     * {@code ImageUtils.resampleImage} with that height.
     *
     * @return one pixel-type matrix per line, indexed by line number
     * @throws RuntimeException if any image cannot be read or converted; the
     *     underlying failure is attached as the cause
     */
    public PixelType[][][] loadLineImages() {
      final PixelType[][][] observations = new PixelType[numLines][][];
      for (int i = 0; i < numLines; ++i) {
        final String imgPath = imgPathPrefix + i + imgNameSuffix;
        try {
          if (lineHeight >= 0) {
            observations[i] =
                ImageUtils.getPixelTypes(ImageUtils.resampleImage(f.readImage(imgPath), lineHeight));
          } else {
            observations[i] = ImageUtils.getPixelTypes(f.readImage(imgPath));
          }
        } catch (Exception e) {
          // Keep the original exception as the cause so the real failure is not lost.
          throw new RuntimeException("Couldn't read doc from: " + imgPath, e);
        }
      }
      return observations;
    }

    /**
     * Reads the text file character by character via
     * {@code f.readDocumentByCharacter(textPath, numLines)}.
     *
     * @return the text lines, or {@code null} when the text file does not exist
     */
    public String[][] loadDiplomaticTextLines() {
      if (!new File(textPath).exists()) {
        return null;
      }
      return f.readDocumentByCharacter(textPath, numLines);
    }

    /** @return always {@code null}; this loader provides no normalized text lines */
    public String[][] loadNormalizedTextLines() {
      return null;
    }

    /** @return always {@code null}; this loader provides no normalized text */
    public List<String> loadNormalizedText() {
      return null;
    }

    /**
     * Derives a name from the image path prefix: the last {@code '/'}-separated
     * segment with its final character removed.
     * NOTE(review): the final character is presumably a separator such as a
     * hyphen between the base name and the line index; confirm against the
     * index files in use.
     *
     * @return the last path segment of the prefix minus its final character
     */
    public String baseName() {
      String[] segments = imgPathPrefix.split("/");
      String baseNamePlusHyphen = segments[segments.length - 1];
      return baseNamePlusHyphen.substring(0, baseNamePlusHyphen.length() - 1);
    }

    /** @return the long-s flag supplied at construction */
    public boolean useLongS() {
      return useLongS;
    }
  }

  /**
   * Builds one document per non-blank line of the index file at
   * {@code inputPath}. Relative paths inside the index are resolved against the
   * directory containing the index file.
   *
   * @param inputPath path of the index file
   * @param lineHeight line height handed to every created document
   * @return the documents in index order
   * @throws IllegalArgumentException if a non-blank line has fewer than five fields
   * @throws NumberFormatException if the line-count field is not an integer
   */
  public static List<Document> loadDocuments(String inputPath, int lineHeight) {
    List<String> lines = f.readLines(inputPath);
    List<Document> docs = new ArrayList<Document>();

    // Resolve to an absolute file first: getParentFile() on a bare relative
    // name such as "index.txt" returns null.
    String baseDir = new File(inputPath).getAbsoluteFile().getParentFile().getAbsolutePath();

    for (String line : lines) {
      // Trim before splitting so leading whitespace does not yield an empty first field.
      String trimmed = line.trim();
      if (trimmed.equals("")) {
        continue;
      }
      String[] fields = trimmed.split("\\s+");
      if (fields.length < REQUIRED_FIELDS) {
        throw new IllegalArgumentException("Expected at least " + REQUIRED_FIELDS
            + " whitespace-separated fields in " + inputPath + " but found " + fields.length
            + ": " + line);
      }
      docs.add(new TextAndLineImagesDocument(
          baseDir + "/" + fields[0],
          fields[1],
          baseDir + "/" + fields[2],
          Boolean.parseBoolean(fields[3]),
          Integer.parseInt(fields[4]),
          lineHeight));
    }
    return docs;
  }
}