package org.icij.extract.parser.ocr; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ocr.TesseractOCRParser; import java.util.HashSet; import java.util.Set; /** * This class is a temporary shim for <a href="https://issues.apache.org/jira/browse/TIKA-2174">TIKA-2174</a> until * Tika 1.15 is released. * * It works by creating a new parser that wraps the standard Tesseract parser and declares support for the {@literal * jpx}, {@literal jp2} and {@literal x-portable-pixmap} image mime-types. */ public class ExtendedTesseractOCRParser extends TesseractOCRParser { private static final long serialVersionUID = -2625994530917375952L; @Override public Set<MediaType> getSupportedTypes(final ParseContext context) { Set<MediaType> types = super.getSupportedTypes(context); if (types.isEmpty()) { return types; } types = new HashSet<>(); types.add(MediaType.image("jpx")); types.add(MediaType.image("jp2")); types.add(MediaType.image("x-portable-pixmap")); return types; } }