/*
 * Copyright 2007 T-Rank AS
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package no.trank.openpipe.parse.pdf;

import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.io.Writer;
import java.util.Arrays;

import org.pdfbox.io.RandomAccess;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

import no.trank.openpipe.parse.api.ParseData;
import no.trank.openpipe.parse.api.Parser;
import no.trank.openpipe.parse.api.ParserException;
import no.trank.openpipe.parse.api.ParserResult;
import no.trank.openpipe.parse.api.ParserResultImpl;

/**
 * A {@link Parser} that extracts the text and the title of a PDF document using PDFBox.
 * <p>
 * One text buffer and one scratch file are created per instance and reused by every call to
 * {@link #parse(ParseData)}. No synchronization is performed, so a single instance must not be
 * used by several threads at the same time. Call {@link #close()} when the parser is no longer
 * needed to release the scratch file.
 *
 * @version $Revision$
 */
public class PDFParser implements Parser, Closeable {
  /** Number of bytes the scratch storage keeps in memory before spilling to its temp file. */
  private static final int SCRATCH_BUFFER_SIZE = 1024 * 1024;
  /** Upper bound on the text-buffer capacity that is kept between two parse calls. */
  private static final int MAX_RETAINED_CHARS = 1024 * 64;

  private final PDFTextStripper stripper;
  private final SBWriter writer;
  private final RandomAccessImpl scratchFile;

  /**
   * Creates the parser together with its text stripper (configured to sort by position) and its
   * scratch storage.
   *
   * @throws RuntimeException wrapping the {@link IOException} if the stripper or the scratch
   *         storage cannot be created
   */
  public PDFParser() {
    try {
      stripper = new PDFTextStripper();
      scratchFile = new RandomAccessImpl(SCRATCH_BUFFER_SIZE);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    stripper.setSortByPosition(true);
    writer = new SBWriter();
  }

  /**
   * Loads the PDF from the input stream of {@code data} and returns its text and title.
   *
   * @param data the source of the PDF bytes
   * @return a result carrying the stripped text and the title taken from the document information
   * @throws IOException if the document cannot be loaded or its text cannot be written
   * @throws ParserException declared by the {@link Parser} contract; not thrown directly here
   */
  @Override
  public ParserResult parse(ParseData data) throws IOException, ParserException {
    final PDDocument doc = PDDocument.load(data.getInputStream(), scratchFile);
    try {
      writer.reset();
      try {
        stripper.writeText(doc, writer);
        final ParserResultImpl result = new ParserResultImpl();
        result.setText(writer.toString());
        result.setTitle(doc.getDocumentInformation().getTitle());
        return result;
      } finally {
        // Do not keep a huge buffer alive just because one document was large.
        writer.trimToMaxSize(MAX_RETAINED_CHARS);
      }
    } finally {
      try {
        doc.close();
      } catch (IOException ignored) {
        // Best effort: the result (or the primary exception) must not be masked by a failure
        // while closing the document.
      }
    }
  }

  /**
   * Releases the scratch storage held by this parser. The parser must not be used afterwards.
   *
   * @throws IOException if the scratch file cannot be closed
   */
  @Override
  public void close() throws IOException {
    scratchFile.release();
  }

  /**
   * A {@link Writer} that collects everything written to it in a reusable {@link StringBuilder}.
   */
  private static class SBWriter extends Writer {
    private final StringBuilder buf = new StringBuilder(4096);

    @Override
    public void write(char[] cbuf, int off, int len) {
      buf.append(cbuf, off, len);
    }

    /**
     * Writes a single character. The cast is required: {@code append(int)} would append the
     * decimal digits of the value instead of the character itself.
     */
    @Override
    public void write(int c) {
      buf.append((char) c);
    }

    @Override
    public void write(char[] cbuf) {
      buf.append(cbuf);
    }

    @Override
    public void write(String str) {
      buf.append(str);
    }

    @Override
    public void write(String str, int off, int len) {
      // StringBuilder takes (start, end) whereas Writer takes (offset, length).
      buf.append(str, off, off + len);
    }

    @Override
    public Writer append(CharSequence csq) {
      buf.append(csq);
      return this;
    }

    @Override
    public Writer append(CharSequence csq, int start, int end) {
      buf.append(csq, start, end);
      return this;
    }

    @Override
    public Writer append(char c) {
      buf.append(c);
      return this;
    }

    /** Nothing to flush; all data lives in memory. */
    @Override
    public void flush() {
    }

    /** Nothing to close; the writer stays usable. */
    @Override
    public void close() {
    }

    /** Returns everything written since the last {@link #reset()}. */
    @Override
    public String toString() {
      return buf.toString();
    }

    /**
     * Shrinks the underlying buffer when its capacity exceeds {@code maxSize}. Content beyond
     * {@code maxSize} characters is discarded; shorter content is kept unchanged (it is never
     * padded).
     *
     * @param maxSize the largest capacity, in characters, to keep
     */
    public void trimToMaxSize(int maxSize) {
      if (buf.capacity() > maxSize) {
        if (buf.length() > maxSize) {
          buf.setLength(maxSize);
        }
        buf.trimToSize();
      }
    }

    /** Returns the current capacity of the underlying buffer. */
    public int capacity() {
      return buf.capacity();
    }

    /** Discards the collected text while keeping the allocated buffer. */
    public void reset() {
      buf.setLength(0);
    }
  }

  /**
   * Reusable scratch storage for PDFBox: the first {@code bufferSize} bytes are held in a byte
   * array, everything beyond that is stored in a temp file.
   * <p>
   * {@code pointer} is the position inside the in-memory part and is pinned to
   * {@code buf.length} once the position is in the file part, where the file's own pointer takes
   * over. {@code size} is the number of valid bytes in the in-memory part.
   */
  private static class RandomAccessImpl implements RandomAccess {
    private final byte[] buf;
    private final File file;
    private final RandomAccessFile ra;
    private int pointer;
    private int size;

    private RandomAccessImpl(int bufferSize) throws IOException {
      buf = new byte[bufferSize];
      file = File.createTempFile("pdfParser", ".dat");
      // Fallback in case release() is never called.
      file.deleteOnExit();
      ra = new RandomAccessFile(file, "rw");
    }

    /**
     * Resets the storage to empty instead of releasing it, so the same instance can be handed to
     * the next document. Use {@link #release()} to actually free the resources.
     * NOTE(review): presumably invoked by PDFBox when the document is closed -- confirm.
     */
    @Override
    public void close() throws IOException {
      pointer = 0;
      size = 0;
      Arrays.fill(buf, (byte) 0);
      ra.setLength(0);
      ra.seek(0);
    }

    /**
     * Moves to the absolute {@code position}, which is mapped either into the in-memory part or
     * to an offset in the temp file.
     */
    @Override
    public void seek(long position) throws IOException {
      final long raSeek = position - buf.length;
      if (raSeek > 0) {
        ra.seek(raSeek);
        pointer = buf.length;
      } else {
        // Rewind the file so that a read or write running past the buffer continues at its start.
        ra.seek(0);
        pointer = (int) position;
      }
    }

    /**
     * Reads one byte.
     *
     * @return the byte as a value in the range 0-255, or -1 at the end of the data
     */
    @Override
    public int read() throws IOException {
      if (pointer >= buf.length) {
        return ra.read();
      } else if (pointer >= size) {
        // pointer == size is already one past the last valid byte.
        return -1;
      }
      return buf[pointer++] & 0xff;
    }

    /**
     * Reads up to {@code length} bytes into {@code b}, first from the in-memory part and then
     * from the temp file.
     *
     * @return the number of bytes read; -1 if nothing was read because the temp file is
     *         exhausted; 0 if nothing was read and no data has spilled to the temp file
     */
    @Override
    public int read(byte[] b, int offset, int length) throws IOException {
      // Clamp at zero: the pointer may have been moved past the valid data by seek().
      final int inMemory = Math.max(0, Math.min(length, size - pointer));
      if (inMemory > 0) {
        System.arraycopy(buf, pointer, b, offset, inMemory);
        pointer += inMemory;
      }
      if (size >= buf.length) {
        final int remaining = length - inMemory;
        if (remaining > 0) {
          final int fromFile = ra.read(b, offset + inMemory, remaining);
          if (fromFile < 0) {
            // Never add the EOF marker to the count of bytes already copied.
            return inMemory > 0 ? inMemory : fromFile;
          }
          return inMemory + fromFile;
        }
      }
      return inMemory;
    }

    /** Returns the total number of bytes stored in memory and in the temp file. */
    @Override
    public long length() throws IOException {
      if (size >= buf.length) {
        return buf.length + ra.length();
      }
      return size;
    }

    /** Writes one byte at the current position, spilling to the temp file when memory is full. */
    @Override
    public void write(int b) throws IOException {
      if (pointer >= buf.length) {
        size = buf.length;
        ra.write(b);
      } else {
        buf[pointer++] = (byte) b;
        if (pointer > size) {
          size = pointer;
        }
      }
    }

    /**
     * Writes {@code length} bytes from {@code b}; whatever does not fit in the in-memory part
     * goes to the temp file.
     */
    @Override
    public void write(byte[] b, int offset, int length) throws IOException {
      final int len = Math.min(length, buf.length - pointer);
      if (len > 0) {
        System.arraycopy(b, offset, buf, pointer, len);
        pointer += len;
        if (pointer > size) {
          size = pointer;
        }
      }
      final int remaining = length - len;
      if (remaining > 0) {
        ra.write(b, offset + len, remaining);
        size = buf.length;
      }
    }

    /**
     * Closes the temp file and removes it from disk. The instance must not be used afterwards.
     *
     * @throws IOException if the temp file cannot be closed
     */
    public void release() throws IOException {
      try {
        ra.close();
      } finally {
        // Best effort; deleteOnExit() registered in the constructor covers a failed delete.
        file.delete();
      }
    }
  }
}