package com.aspose.pdf.examples.AsposePdfExamples.DocumentObject;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;
import javax.imageio.ImageIO;
import com.aspose.pdf.Document;
import com.aspose.pdf.Document.CallBackGetHocr;
/**
 * Example: turns an image-only PDF into a searchable one by handing the document an OCR
 * callback that shells out to the external {@code tesseract} command and returns its hOCR
 * output.
 *
 * <p>Requirements visible from this code: a {@code tesseract} executable reachable through the
 * process search path, and a data directory that is writable (temporary files {@code test.jpg}
 * and {@code out.hocr}/{@code out.html} are created and removed there).
 */
public class ConvertingNonSearchablePDFToSearchablePDFDocument {

    /**
     * Loads {@code outFile.pdf}, converts it using the Tesseract-backed hOCR callback and saves
     * the result as {@code output971.pdf}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Placeholder for the data directory. It is used as a plain string prefix for every file
        // name below, so a real value must end with a path separator.
        final String myDir = "PathToDir";
        Document doc = new Document(myDir + "outFile.pdf");
        // Create the callback that recognizes text in the PDF's images. Any external OCR engine
        // that supports the hOCR standard (http://en.wikipedia.org/wiki/HOCR) can be used here;
        // this example uses the free Tesseract OCR
        // (http://en.wikipedia.org/wiki/Tesseract_%28software%29).
        CallBackGetHocr cbgh = new CallBackGetHocr() {
            @Override
            public String invoke(java.awt.image.BufferedImage img) {
                return recognizeWithTesseract(img, myDir);
            }
        };
        // End of callback
        doc.convert(cbgh);
        doc.save(myDir + "output971.pdf");
    }

    /**
     * Writes the image to a temporary JPEG, runs Tesseract on it in hOCR mode and returns the
     * produced markup. Temporary files are removed whether or not recognition succeeded.
     *
     * @param img image to recognize
     * @param dir string prefix (directory, including trailing separator) for the temp files
     * @return the hOCR markup, or an empty string when any step failed
     */
    private static String recognizeWithTesseract(java.awt.image.BufferedImage img, String dir) {
        String imagePath = dir + "test.jpg";
        String outputBase = dir + "out";
        File imageFile = new File(imagePath);
        // NOTE(review): older Tesseract releases write "<base>.html" for the hocr config while
        // newer ones write "<base>.hocr" (believed to be 3.03+ - confirm against the installed
        // version). Both names are handled so the example works with either.
        File hocrFile = new File(outputBase + ".hocr");
        File htmlFile = new File(outputBase + ".html");

        // Remove leftovers from an earlier run so stale OCR output is never returned.
        deleteIfPresent(hocrFile);
        deleteIfPresent(htmlFile);
        try {
            if (!writeJpeg(img, imageFile)) {
                return "";
            }
            if (!runTesseract(imagePath, outputBase)) {
                return "";
            }
            return readText(hocrFile.exists() ? hocrFile : htmlFile);
        } finally {
            // deleting temp files
            deleteIfPresent(hocrFile);
            deleteIfPresent(htmlFile);
            deleteIfPresent(imageFile);
        }
    }

    /**
     * Encodes the image as JPEG into the given file.
     *
     * @return {@code true} when the file was written; {@code false} when no JPEG writer accepted
     *         the image or an I/O error occurred (the error is reported on stderr)
     */
    private static boolean writeJpeg(java.awt.image.BufferedImage img, File target) {
        try {
            // ImageIO.write signals "no suitable writer" through its return value, not an
            // exception, so the result must be checked.
            if (!ImageIO.write(img, "jpg", target)) {
                System.err.println("No JPEG writer could encode the image for " + target);
                return false;
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Runs {@code tesseract <imagePath> <outputBase> hocr} and waits for it to finish.
     *
     * @return {@code false} when the process could not be started or the wait was interrupted;
     *         {@code true} otherwise (a non-zero exit code is only reported on stderr)
     */
    private static boolean runTesseract(String imagePath, String outputBase) {
        System.out.println("tesseract" + " " + imagePath + " " + outputBase + " hocr");
        // Passing the arguments as separate strings avoids the whitespace tokenizing done by
        // Runtime.exec(String), which breaks for directories containing spaces.
        ProcessBuilder builder = new ProcessBuilder("tesseract", imagePath, outputBase, "hocr");
        // Merge stderr into stdout so a single drain loop is enough.
        builder.redirectErrorStream(true);
        Process process = null;
        try {
            process = builder.start();
            // The child's output must be consumed, otherwise it can block on a full pipe and
            // waitFor() never returns.
            drain(process.getInputStream());
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                System.err.println("tesseract exited with code " + exitCode);
            }
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            return false;
        } catch (InterruptedException e) {
            // Restore the interrupt flag for the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
            return false;
        } finally {
            if (process != null) {
                process.destroy();
            }
        }
    }

    /** Reads the stream to its end, discarding the data, and closes it. */
    private static void drain(InputStream in) throws IOException {
        byte[] buffer = new byte[4096];
        try {
            while (in.read(buffer) != -1) {
                // output is intentionally discarded
            }
        } finally {
            in.close();
        }
    }

    /**
     * Reads the whole file as UTF-8 text, terminating every line with the platform line
     * separator.
     *
     * @return the file contents, or an empty string when the file does not exist
     */
    private static String readText(File file) {
        StringBuilder fileContents = new StringBuilder();
        Scanner scanner = null;
        try {
            // NOTE(review): UTF-8 is assumed for the hOCR file rather than the platform default
            // charset - confirm against the Tesseract version in use.
            scanner = new Scanner(file, "UTF-8");
            String lineSeparator = System.getProperty("line.separator");
            while (scanner.hasNextLine()) {
                fileContents.append(scanner.nextLine()).append(lineSeparator);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            if (scanner != null) {
                scanner.close();
            }
        }
        return fileContents.toString();
    }

    /** Deletes the file when it exists and reports on stderr if the deletion fails. */
    private static void deleteIfPresent(File file) {
        if (file.exists() && !file.delete()) {
            System.err.println("Could not delete temporary file " + file);
        }
    }
}