/* * Copyright 2010 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.pdf; import java.io.IOException; import java.io.InputStream; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.util.TextPosition; import org.apache.uima.cas.CAS; import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; /** * Converts a PDF to a CAS. Uses a substitution table. * */ public class Pdf2CasConverter extends PdfLayoutEventStripper { private final Log log = LogFactory.getLog(getClass()); private Trie<String> substitutionTable; private CAS cas; private StringBuilder text; private Style regionStyle; private StringBuilder regionText; private String paragraphType; private String headingType; public Pdf2CasConverter() throws IOException { super(); } public void writeText(final CAS aCas, final InputStream aIs) throws IOException { final PDDocument doc = PDDocument.load(aIs); try { if (doc.isEncrypted()) { throw new IOException("Encrypted documents currently not supported"); } cas = aCas; text = new StringBuilder(); writeText(doc); } finally { doc.close(); } } @Override protected void startDocument(final PDDocument aPdf) throws IOException { if (log.isTraceEnabled()) { log.trace("<document>"); } } @Override protected void endDocument(final PDDocument aPdf) throws IOException { cas.setDocumentText(text.toString()); if (log.isTraceEnabled()) { log.trace("</document>"); } } @Override protected void processLineSeparator() throws IOException { if (log.isTraceEnabled()) { log.trace("<br/>"); } if (regionText == null) { throw new IllegalStateException("No region started"); } regionText.append("\n"); } @Override protected void processWordSeparator() throws IOException { if (log.isTraceEnabled()) { log.trace("< >"); } if (regionText == null) { throw new IllegalStateException("No region started"); } regionText.append(" "); } @Override protected void startPage(final int aFirstPage, final int aLastPage, final int aCurrentPage, final PDPage page) throws IOException { if (log.isTraceEnabled()) { log.trace("<page>"); } if (log.isDebugEnabled()) { log.debug("Decoding page " + aCurrentPage + " of " + (aLastPage - aFirstPage + 1)); } } @Override protected void endPage(final int aStartPage, final int aEndPage, final int aCurrentPage, final PDPage page) throws IOException { if (log.isTraceEnabled()) { log.trace("</page>"); } } @Override protected void startRegion(final Style aStyle) throws IOException { if (log.isTraceEnabled()) { log.trace("<" + aStyle + ">"); } regionStyle = aStyle; regionText = new StringBuilder(); } @Override protected void endRegion(final Style aStyle) throws IOException { if (log.isTraceEnabled()) { log.trace("</" + aStyle + ">"); } if (regionText == null) { throw new IllegalStateException("No region started"); } if (regionStyle != aStyle) { throw new IllegalStateException("Current region has style " + regionStyle + ", but closing region has style " + aStyle); } // Append text int begin = text.length(); sanitize(regionText); text.append(regionText.toString()); int end = text.length(); text.append('\n'); // Add annotation switch (aStyle) { case HEADING: if (headingType != null) { Type t = cas.getTypeSystem().getType(headingType); AnnotationFS a = cas.createAnnotation(t, begin, end); cas.addFsToIndexes(a); } break; case PARAGRAPH: if (paragraphType != null) { Type t = cas.getTypeSystem().getType(paragraphType); AnnotationFS a = cas.createAnnotation(t, begin, end); cas.addFsToIndexes(a); } break; default: throw new IllegalStateException("Unknown region style: " + aStyle); } regionStyle = null; regionText = null; } @Override protected void writeCharacters(final TextPosition aText) throws IOException { if (log.isTraceEnabled()) { log.trace("[" + aText.getCharacter() + "]"); } if (regionText == null) { throw new IllegalStateException("No region started"); } regionText.append(aText.getCharacter()); } private static boolean isValidXMLChar(final int aCodePoint) { return (aCodePoint == 0x0009) || (aCodePoint == 0x000A) || (aCodePoint == 0x000D) || ((0x0020 <= aCodePoint) && (aCodePoint <= 0xD7FF)) || ((0xE000 <= aCodePoint) && (aCodePoint <= 0xFFFD)); } private StringBuilder sanitize(final StringBuilder aContent) { int i = 0; int lastBreak = 0; while (i < aContent.length()) { // Check valid unicode char if (!isValidXMLChar(aContent.codePointAt(i))) { aContent.setCharAt(i, ' '); i++; continue; } // Set up how many characters we want to skip int seek = i + 1; // Do we maybe have an entity? if (aContent.charAt(i) == '&') { // REC 2006-10-21 Some PDFs seem to have entities and others // don't // so we may encounter &'s that do not introduce an entity and // just ignore them. final int end = aContent.indexOf(";", i); if (end != -1) { final String cand = aContent.substring(i, end + 1); String r = null; try { if (cand.startsWith("&#x")) { final int cp = Integer.parseInt(cand.substring(2, cand.length() - 1), 16); r = isValidXMLChar(cp) ? String.valueOf(Character.toChars(cp)) : " "; } else if (cand.startsWith("&#")) { final int cp = Integer.parseInt(cand.substring(2, cand.length() - 1)); r = isValidXMLChar(cp) ? String.valueOf(Character.toChars(cp)) : " "; } else { // RE 2006-10-22 The chance that there is a & and a // ; // together in a string is quite big. Let's be // tolerant. } } catch (final NumberFormatException e) { log.warn("Invalid numeric entity in fragment [" + cand + "] - Dropping it."); } // Expand the entity and set proper skip (if found) if (r != null) { aContent.replace(i, i + cand.length(), r); seek = i + r.length(); } } } // Match against the Trie after numeric entity expansion is over if (substitutionTable != null) { final Trie<String>.Node match = substitutionTable.getNode(aContent, i); if (match != null) { aContent.replace(i, i + match.level, match.value); seek = i + match.value.length(); } } // Check line breaks while (i < seek) { if (aContent.charAt(i) == '\n') { lastBreak = i; } else if (Character.isWhitespace(aContent.codePointAt(i)) && (i > (lastBreak + 79))) { lastBreak = i; aContent.replace(i, i + 1, "\n"); } i++; } } return aContent; } public void setSubstitutionTable(Trie<String> aSubstitutionTable) { substitutionTable = aSubstitutionTable; } public Trie<String> getSubstitutionTable() { return substitutionTable; } public String getParagraphType() { return paragraphType; } public void setParagraphType(String aParagraphType) { paragraphType = aParagraphType; } public String getHeadingType() { return headingType; } public void setHeadingType(String aHeadingType) { headingType = aHeadingType; } }