/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;
import static org.apache.tika.parser.ocr.TesseractOCRParser.getTesseractProg;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;
public class TesseractOCRParserTest extends TikaTest {
public static boolean canRun() {
TesseractOCRConfig config = new TesseractOCRConfig();
TesseractOCRParserTest tesseractOCRTest = new TesseractOCRParserTest();
return tesseractOCRTest.canRun(config);
}
private boolean canRun(TesseractOCRConfig config) {
String[] checkCmd = {config.getTesseractPath() + getTesseractProg()};
// If Tesseract is not on the path, do not run the test.
return ExternalParser.check(checkCmd);
}
/*
Check that if Tesseract is not found, the TesseractOCRParser claims to not support
any file types. So, the standard image parser is called instead.
*/
@Test
public void offersNoTypesIfNotFound() throws Exception {
TesseractOCRParser parser = new TesseractOCRParser();
DefaultParser defaultParser = new DefaultParser();
MediaType png = MediaType.image("png");
// With an invalid path, will offer no types
TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
invalidConfig.setTesseractPath("/made/up/path");
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, invalidConfig);
// No types offered
assertEquals(0, parser.getSupportedTypes(parseContext).size());
// And DefaultParser won't use us
assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
}
/*
If Tesseract is found, test we retrieve the proper number of supporting Parsers.
*/
@Test
public void offersTypesIfFound() throws Exception {
TesseractOCRParser parser = new TesseractOCRParser();
DefaultParser defaultParser = new DefaultParser();
ParseContext parseContext = new ParseContext();
MediaType png = MediaType.image("png");
// Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.
assumeTrue(canRun());
assertEquals(8, parser.getSupportedTypes(parseContext).size());
assertTrue(parser.getSupportedTypes(parseContext).contains(png));
// DefaultParser will now select the TesseractOCRParser.
assertEquals(TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
}
@Test
public void testPDFOCR() throws Exception {
String resource = "/test-documents/testOCR.pdf";
String[] nonOCRContains = new String[0];
testBasicOCR(resource, nonOCRContains, 2);
}
@Test
public void testDOCXOCR() throws Exception {
String resource = "/test-documents/testOCR.docx";
String[] nonOCRContains = {
"This is some text.",
"Here is an embedded image:"
};
testBasicOCR(resource, nonOCRContains, 3);
}
@Test
public void testPPTXOCR() throws Exception {
String resource = "/test-documents/testOCR.pptx";
String[] nonOCRContains = {
"This is some text"
};
testBasicOCR(resource, nonOCRContains, 3);
}
@Test
public void testOCROutputsHOCR() throws Exception {
assumeTrue(canRun());
String resource = "/test-documents/testOCR.pdf";
String[] nonOCRContains = new String[0];
String contents = runOCR(resource, nonOCRContains, 2,
BasicContentHandlerFactory.HANDLER_TYPE.XML,
TesseractOCRConfig.OUTPUT_TYPE.HOCR);
assertContains("<span class=\"ocrx_word\" id=\"word_1_1\"", contents);
assertContains("Happy</span>", contents);
}
private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas) throws Exception{
String contents = runOCR(resource, nonOCRContains, numMetadatas,
BasicContentHandlerFactory.HANDLER_TYPE.TEXT, TesseractOCRConfig.OUTPUT_TYPE.TXT);
if (canRun()) {
if(resource.substring(resource.lastIndexOf('.'), resource.length()).equals(".jpg")) {
assertTrue(contents.toString().contains("Apache"));
} else {
assertTrue(contents.toString().contains("Happy New Year 2003!"));
}
}
}
private String runOCR(String resource, String[] nonOCRContains, int numMetadatas,
BasicContentHandlerFactory.HANDLER_TYPE handlerType,
TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
TesseractOCRConfig config = new TesseractOCRConfig();
config.setOutputType(outputType);
Parser parser = new RecursiveParserWrapper(new AutoDetectParser(),
new BasicContentHandlerFactory(
handlerType, -1));
PDFParserConfig pdfConfig = new PDFParserConfig();
pdfConfig.setExtractInlineImages(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
parseContext.set(Parser.class, parser);
parseContext.set(PDFParserConfig.class, pdfConfig);
try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
}
List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
assertEquals(numMetadatas, metadataList.size());
StringBuilder contents = new StringBuilder();
for (Metadata m : metadataList) {
contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
}
for (String needle : nonOCRContains) {
assertContains(needle, contents.toString());
}
assertTrue(metadataList.get(0).names().length > 10);
assertTrue(metadataList.get(1).names().length > 10);
//test at least one value
assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
return contents.toString();
}
@Test
public void testSingleImage() throws Exception {
assumeTrue(canRun());
String xml = getXML("testOCR.jpg").xml;
assertContains("OCR Testing", xml);
//test metadata extraction
assertContains("<meta name=\"Image Width\" content=\"136 pixels\" />", xml);
//TIKA-2169
assertContainsCount("<html", xml, 1);
assertContainsCount("<title", xml, 1);
assertContainsCount("</title", xml, 1);
assertContainsCount("<body", xml, 1);
assertContainsCount("</body", xml, 1);
assertContainsCount("</html", xml, 1);
}
@Test
public void testImageMagick() throws Exception {
InputStream stream = TesseractOCRConfig.class.getResourceAsStream(
"/test-properties/TesseractOCR.properties");
TesseractOCRConfig config = new TesseractOCRConfig(stream);
String[] CheckCmd = {config.getImageMagickPath() + TesseractOCRParser.getImageMagickProg()};
assumeTrue(ExternalParser.check(CheckCmd));
}
@Test
public void getNormalMetadataToo() throws Exception {
//this should be successful whether or not TesseractOCR is installed/active
//If tesseract is installed, the internal metadata extraction parser should
//work; and if tesseract isn't installed, the regular parsers should take over.
//gif
Metadata m = getXML("testGIF.gif").metadata;
assertTrue(m.names().length > 20);
assertEquals("RGB", m.get("Chroma ColorSpaceType"));
//jpg
m = getXML("testOCR.jpg").metadata;
assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
assertEquals(null, m.get(Metadata.SAMPLES_PER_PIXEL));
assertContains("This is a test Apache Tika imag", m.get(Metadata.COMMENTS));
//bmp
m = getXML("testBMP.bmp").metadata;
assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
//png
m = getXML("testPNG.png").metadata;
assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));
//tiff
m = getXML("testTIFF.tif").metadata;
assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
assertEquals("72 dots per inch", m.get("Y Resolution"));
}
//TODO: add unit tests for jp2/jpx/ppm TIKA-2174
@Test
public void testInterwordSpacing() throws Exception {
assumeTrue(canRun());
//default
String xml = getXML("testOCR_spacing.png").xml;
assertContains("The quick", xml);
TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
tesseractOCRConfigconfig.setPreserveInterwordSpacing(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
//with preserve interwordspacing "on"
//allow some flexibility in case Tesseract is computing spaces
//somewhat differently in different versions/OS's, etc.
xml = getXML("testOCR_spacing.png", parseContext).xml;
Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
assertTrue(m.find());
}
}