package edu.berkeley.cs.nlp.ocular.data;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Locale;

import edu.berkeley.cs.nlp.ocular.model.CharacterTemplate;
import edu.berkeley.cs.nlp.ocular.util.FileUtil;
/**
 * A dataset loader that reads files recursively, in lexicographical
 * order. Images are loaded only as they are needed (lazily), and then stored
 * in memory for later use.
 *
 * @author Dan Garrette (dhgarrette@gmail.com)
 */
public class LazyRawImageLoader {

    /** Binarization threshold used by the convenience overloads that do not take one explicitly. */
    private static final double DEFAULT_BINARIZE_THRESHOLD = 0.12;

    /** Static utility class; not instantiable. */
    private LazyRawImageLoader() {}

    /**
     * Loads documents found recursively under a single input path, using default
     * settings: uniform line height, binarization threshold 0.12, no cropping.
     *
     * @param inputPath          directory to search recursively for page images
     * @param extractedLinesPath path used by the lazy documents for extracted line images
     * @param numDocs            maximum number of documents to use; values &lt;= 0 mean "no limit"
     * @param numDocsToSkip      number of leading documents to skip
     * @return the selected lazy documents
     */
    public static List<Document> loadDocuments(String inputPath, String extractedLinesPath, int numDocs, int numDocsToSkip) {
        return loadDocuments(inputPath, extractedLinesPath, numDocs, numDocsToSkip, true, DEFAULT_BINARIZE_THRESHOLD, false);
    }

    /**
     * Loads documents found recursively under a single input path.
     *
     * @param inputPath          directory to search recursively for page images
     * @param extractedLinesPath path used by the lazy documents for extracted line images
     * @param numDocs            maximum number of documents to use; values &lt;= 0 mean "no limit"
     * @param numDocsToSkip      number of leading documents to skip
     * @param uniformLineHeight  if true, lines are normalized to {@link CharacterTemplate#LINE_HEIGHT}
     * @param binarizeThreshold  threshold for image binarization
     * @param crop               whether to crop page images
     * @return the selected lazy documents
     */
    public static List<Document> loadDocuments(String inputPath, String extractedLinesPath, int numDocs, int numDocsToSkip, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
        return loadDocuments(Arrays.asList(inputPath), extractedLinesPath, numDocs, numDocsToSkip, uniformLineHeight, binarizeThreshold, crop);
    }

    /**
     * Loads documents found recursively under several input paths, using default
     * settings: uniform line height, binarization threshold 0.12, no cropping.
     *
     * @param inputPaths         directories to search recursively for page images
     * @param extractedLinesPath path used by the lazy documents for extracted line images
     * @param numDocs            maximum number of documents to use; values &lt;= 0 mean "no limit"
     * @param numDocsToSkip      number of leading documents to skip
     * @return the selected lazy documents
     */
    public static List<Document> loadDocuments(List<String> inputPaths, String extractedLinesPath, int numDocs, int numDocsToSkip) {
        return loadDocuments(inputPaths, extractedLinesPath, numDocs, numDocsToSkip, true, DEFAULT_BINARIZE_THRESHOLD, false);
    }

    /**
     * Loads documents found recursively under several input paths. Documents are
     * sorted lexicographically by base name within each input path; the input paths
     * themselves are kept in the order given. The skip/use window is then applied
     * to the combined list.
     *
     * @param inputPaths         directories to search recursively for page images
     * @param extractedLinesPath path used by the lazy documents for extracted line images
     * @param numDocs            maximum number of documents to use; values &lt;= 0 mean "no limit"
     * @param numDocsToSkip      number of leading documents to skip
     * @param uniformLineHeight  if true, lines are normalized to {@link CharacterTemplate#LINE_HEIGHT}
     * @param binarizeThreshold  threshold for image binarization
     * @param crop               whether to crop page images
     * @return the selected lazy documents
     */
    public static List<Document> loadDocuments(List<String> inputPaths, String extractedLinesPath, int numDocs, int numDocsToSkip, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
        List<Document> lazyDocs = new ArrayList<Document>();
        for (String inputPath : inputPaths) {
            lazyDocs.addAll(loadDocumentsFromDir(inputPath, extractedLinesPath, uniformLineHeight, binarizeThreshold, crop));
        }

        // Clamp the skip/use counts so we never index past the end of the list.
        int actualNumDocsToSkip = Math.min(lazyDocs.size(), numDocsToSkip);
        int actualNumDocsToUse = Math.min(lazyDocs.size() - actualNumDocsToSkip, numDocs <= 0 ? Integer.MAX_VALUE : numDocs);
        System.out.println("Using " + actualNumDocsToUse + " documents (skipping " + actualNumDocsToSkip + ")");

        // Report each skipped document by name (using the clamped count, which may
        // be smaller than the requested numDocsToSkip).
        for (int docNum = 0; docNum < actualNumDocsToSkip; ++docNum) {
            System.out.println("  Skipping " + lazyDocs.get(docNum).baseName());
        }

        List<Document> documents = new ArrayList<Document>(actualNumDocsToUse);
        for (int docNum = actualNumDocsToSkip; docNum < actualNumDocsToSkip + actualNumDocsToUse; ++docNum) {
            Document lazyDoc = lazyDocs.get(docNum);
            System.out.println("  Using " + lazyDoc.baseName());
            documents.add(lazyDoc);
        }
        return documents;
    }

    /**
     * Creates lazy document wrappers for every file found recursively under
     * {@code inputPath}. Files with a {@code .txt} extension are skipped
     * (presumably transcriptions, not images — TODO confirm); each page of a
     * {@code .pdf} becomes its own document; every other file is treated as a
     * single page image. Extension matching is case-insensitive, so
     * {@code PAGE.PDF} is handled the same as {@code page.pdf}. The result is
     * sorted lexicographically by base name.
     */
    private static List<Document> loadDocumentsFromDir(String inputPath, String extractedLinesPath, boolean uniformLineHeight, double binarizeThreshold, boolean crop) {
        int lineHeight = uniformLineHeight ? CharacterTemplate.LINE_HEIGHT : -1;
        File dir = new File(inputPath);
        System.out.println("Reading data from [" + dir + "], which " + (dir.exists() ? "exists" : "does not exist"));

        List<Document> lazyDocs = new ArrayList<Document>();
        for (File f : FileUtil.recursiveFiles(dir)) {
            // Locale.ROOT keeps the extension check stable regardless of the default locale.
            String lowerName = f.getName().toLowerCase(Locale.ROOT);
            if (lowerName.endsWith(".txt")) {
                continue;
            }
            else if (lowerName.endsWith(".pdf")) {
                // One Document per PDF page; PDF page numbers are 1-based.
                int numPages = PdfImageReader.numPagesInPdf(f);
                for (int pageNumber = 1; pageNumber <= numPages; ++pageNumber) {
                    lazyDocs.add(new LazyRawPdfImageDocument(f, pageNumber, inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath));
                }
            }
            else {
                lazyDocs.add(new LazyRawSingleImageDocument(f, inputPath, lineHeight, binarizeThreshold, crop, extractedLinesPath));
            }
        }

        // Deterministic lexicographical ordering by base name.
        Collections.sort(lazyDocs, new Comparator<Document>() {
            public int compare(Document o1, Document o2) {
                return o1.baseName().compareTo(o2.baseName());
            }
        });
        return lazyDocs;
    }
}