package mj.ocraptor.extraction.tika.parser.pdf; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.Serializable; import java.util.Properties; import mj.ocraptor.configuration.Config; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * Config for PDFParser. * * This allows parameters to be set programmatically: * <ol> * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() * (as before)</li> * <li>Constructor of PDFParser</li> * <li>Passing to PDFParser through a ParseContext: * context.set(PDFParserConfig.class, config);</li> * </ol> * * Parameters can also be set by modifying the PDFParserConfig.properties file, * which lives in the expected places, in trunk: * tika-parsers/src/main/resources/org/apache/tika/parser/pdf * * Or, in tika-app-x.x.jar or tika-parsers-x.x.jar: org/apache/tika/parser/pdf * */ public class PDFParserConfig implements Serializable { private static final long serialVersionUID = 6492570218190936986L; // True if we let PDFBox "guess" where spaces should go: private boolean enableAutoSpace = true; // True if we let PDFBox remove duplicate overlapping text: private boolean suppressDuplicateOverlappingText; // True if we extract annotation text ourselves // (workaround for PDFBOX-1143): private boolean extractAnnotationText = true; // True if we should sort text tokens by position // (necessary for some PDFs, but messes up other PDFs): private boolean sortByPosition = false; // True if we should use PDFBox's NonSequentialParser private boolean useNonSequentialParser = false; // True if acroform content should be extracted private boolean extractAcroFormContent = true; public PDFParserConfig() { // init(this.getClass().getResourceAsStream("PDFParser.properties")); File configFile = new File(Config.getTikaPDFParserPropertiesFilePath()); try { init(new FileInputStream(configFile)); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * Loads properties from InputStream and then tries to close InputStream. If * there is an IOException, this silently swallows the exception and goes * back to the default. * * @param is */ public PDFParserConfig(InputStream is) { init(is); } // initializes object and then tries to close inputstream private void init(InputStream is) { if (is == null) { return; } Properties props = new Properties(); try { props.load(is); } catch (IOException e) { } finally { if (is != null) { try { is.close(); } catch (IOException e) { // swallow } } } setEnableAutoSpace(getProp(props.getProperty("enableAutoSpace"), getEnableAutoSpace())); setSuppressDuplicateOverlappingText(getProp(props.getProperty("suppressDuplicateOverlappingText"), getSuppressDuplicateOverlappingText())); setExtractAnnotationText(getProp(props.getProperty("extractAnnotationText"), getExtractAnnotationText())); setSortByPosition(getProp(props.getProperty("sortByPosition"), getSortByPosition())); setUseNonSequentialParser(getProp(props.getProperty("useNonSequentialParser"), getUseNonSequentialParser())); setExtractAcroFormContent(getProp(props.getProperty("extractAcroFormContent"), getExtractAcroFormContent())); } /** * If true (the default), extract content from AcroForms at the end of the * document. * * @param b */ public void setExtractAcroFormContent(boolean extractAcroFormContent) { this.extractAcroFormContent = extractAcroFormContent; } /** @see #setExtractAcroFormContent(boolean) */ public boolean getExtractAcroFormContent() { return extractAcroFormContent; } /** @see #setEnableAutoSpace. */ public boolean getEnableAutoSpace() { return enableAutoSpace; } /** * If true (the default), the parser should estimate where spaces should be * inserted between words. For many PDFs this is necessary as they do not * include explicit whitespace characters. */ public void setEnableAutoSpace(boolean enableAutoSpace) { this.enableAutoSpace = enableAutoSpace; } /** @see #setSuppressDuplicateOverlappingText(boolean) */ public boolean getSuppressDuplicateOverlappingText() { return suppressDuplicateOverlappingText; } /** * If true, the parser should try to remove duplicated text over the same * region. This is needed for some PDFs that achieve bolding by re-writing * the same text in the same area. Note that this can slow down extraction * substantially (PDFBOX-956) and sometimes remove characters that were not * in fact duplicated (PDFBOX-1155). By default this is disabled. */ public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText) { this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText; } /** @see #setExtractAnnotationText(boolean) */ public boolean getExtractAnnotationText() { return extractAnnotationText; } /** * If true (the default), text in annotations will be extracted. */ public void setExtractAnnotationText(boolean extractAnnotationText) { this.extractAnnotationText = extractAnnotationText; } /** @see #setSortByPosition(boolean) */ public boolean getSortByPosition() { return sortByPosition; } /** * If true, sort text tokens by their x/y position before extracting text. * This may be necessary for some PDFs (if the text tokens are not rendered * "in order"), while for other PDFs it can produce the wrong result (for * example if there are 2 columns, the text will be interleaved). Default is * false. */ public void setSortByPosition(boolean sortByPosition) { this.sortByPosition = sortByPosition; } /** @see #setUseNonSequentialParser(boolean) */ public boolean getUseNonSequentialParser() { return useNonSequentialParser; } /** * If true, uses PDFBox's non-sequential parser. The non-sequential parser * should be much faster than the traditional full doc parser. However, * until PDFBOX-XXX is fixed, the non-sequential parser fails to extract * some document metadata. * <p> * Default is false (use the traditional parser) * * @param useNonSequentialParser */ public void setUseNonSequentialParser(boolean useNonSequentialParser) { this.useNonSequentialParser = useNonSequentialParser; } private boolean getProp(String p, boolean defaultMissing) { if (p == null) { return defaultMissing; } if (p.toLowerCase().equals("true")) { return true; } else if (p.toLowerCase().equals("false")) { return false; } else { return defaultMissing; } } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + (enableAutoSpace ? 1231 : 1237); result = prime * result + (extractAcroFormContent ? 1231 : 1237); result = prime * result + (extractAnnotationText ? 1231 : 1237); result = prime * result + (sortByPosition ? 1231 : 1237); result = prime * result + (suppressDuplicateOverlappingText ? 1231 : 1237); result = prime * result + (useNonSequentialParser ? 1231 : 1237); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; PDFParserConfig other = (PDFParserConfig) obj; if (enableAutoSpace != other.enableAutoSpace) return false; if (extractAcroFormContent != other.extractAcroFormContent) return false; if (extractAnnotationText != other.extractAnnotationText) return false; if (sortByPosition != other.sortByPosition) return false; if (suppressDuplicateOverlappingText != other.suppressDuplicateOverlappingText) return false; if (useNonSequentialParser != other.useNonSequentialParser) return false; return true; } @Override public String toString() { return "PDFParserConfig [enableAutoSpace=" + enableAutoSpace + ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText + ", extractAnnotationText=" + extractAnnotationText + ", sortByPosition=" + sortByPosition + ", useNonSequentialParser=" + useNonSequentialParser + ", extractAcroFormContent=" + extractAcroFormContent + "]"; } }