package gutenberg.itext;
import com.google.common.collect.Lists;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.ContentByteUtils;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.LineSegment;
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
/**
* @author <a href="http://twitter.com/aloyer">@aloyer</a>
*/
public class TextStripper {
/** Logger shared by every instance; it carries no per-instance state, so one per class is enough. */
private static final Logger log = LoggerFactory.getLogger(TextStripper.class);
/** Page currently being filled by the render listener; reassigned for each page during extraction. */
private Page currentPage;
/** Divisor used to convert a horizontal gap between two chunks into a number of spaces (default 5). */
private float charWidth = 5f;
/**
 * Returns the width currently used to convert horizontal gaps into spaces.
 *
 * @return the configured character width
 */
public float getCharWidth() {
    return this.charWidth;
}
/**
 * Sets the width used to convert horizontal gaps between text chunks into spaces
 * when a page is rendered as text.
 *
 * @param charWidth width of one space; must be greater than zero
 * @return this stripper, to allow call chaining
 * @throws IllegalArgumentException if {@code charWidth} is zero, negative or NaN
 */
public TextStripper charWidth(float charWidth) {
    // The gap-to-spaces conversion divides by this value. A zero width makes the
    // quotient infinite, which saturates to Integer.MAX_VALUE when cast to int and
    // would request an enormous padding string; negative or NaN widths yield
    // meaningless padding. Fail fast instead. (!(x > 0) also catches NaN.)
    if (!(charWidth > 0f)) {
        throw new IllegalArgumentException("charWidth must be greater than zero, got " + charWidth);
    }
    this.charWidth = charWidth;
    return this;
}
/**
 * Extracts the text of every page of a PDF document.
 * <p>
 * One {@link Page} is produced per page of the document, in page order. The page
 * being processed is kept in an instance field while the content stream is parsed,
 * so a single {@code TextStripper} must not run several extractions concurrently.
 *
 * @param src the original PDF document
 * @return the extracted pages, one entry per page of the document
 * @throws java.io.IOException if the document cannot be read
 */
public List<Page> extractText(InputStream src) throws IOException {
    PdfReader reader = new PdfReader(src);
    try {
        List<Page> pages = Lists.newArrayList();
        RenderListener listener = new InternalListener();
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
        int pageCount = reader.getNumberOfPages();
        // PDF page numbers are 1-based.
        for (int i = 1; i <= pageCount; i++) {
            // The listener appends to whatever page is current, so switch before parsing.
            currentPage = new Page();
            pages.add(currentPage);
            PdfDictionary pageDic = reader.getPageN(i);
            PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
            processor.processContent(ContentByteUtils.getContentBytesForPage(reader, i), resourcesDic);
        }
        return pages;
    } finally {
        // Release the reader even when parsing a page fails.
        reader.close();
    }
}
/**
 * Text collected for one page, grouped into rows by baseline y coordinate.
 */
public class Page {
    private List<Row> rows = Lists.newArrayList();

    /**
     * Finds the row whose y coordinate is within 1e-3 of the given one, creating
     * and registering a new row when none matches.
     *
     * @param y baseline y coordinate
     * @return the existing or newly created row
     */
    Row rowAt(float y) {
        Row found = null;
        for (int i = 0; found == null && i < rows.size(); i++) {
            Row candidate = rows.get(i);
            if (Math.abs(candidate.y - y) < 1e-3) {
                found = candidate;
            }
        }
        if (found == null) {
            found = new Row(y);
            rows.add(found);
        }
        return found;
    }

    /**
     * Renders the page as plain text: one line per row (rows in descending y order),
     * chunks ordered by x, each chunk preceded by padding proportional to the gap
     * since the previous chunk and followed by a single space.
     * <p>
     * Note that the rows and their chunks are sorted in place.
     *
     * @return the rendered text, each row terminated by a newline
     */
    public String renderedText() {
        Collections.sort(rows);
        StringBuilder out = new StringBuilder();
        for (Row line : rows) {
            Collections.sort(line.chunks);
            // Right edge of the previous chunk; every row starts from x = 0.
            float cursor = 0;
            for (Text piece : line.chunks) {
                int gap = toNumberOfEM(piece.x - cursor);
                out.append(StringUtils.repeat(" ", gap));
                out.append(piece.text);
                out.append(" ");
                cursor = piece.x + piece.width;
            }
            out.append("\n");
        }
        return out.toString();
    }

    /**
     * Converts a horizontal distance into a count of character widths, truncating
     * toward zero.
     */
    private int toNumberOfEM(float delta) {
        float widths = delta / charWidth;
        return (int) widths;
    }
}
/**
 * A chunk of text positioned on a row; chunks are naturally ordered by their
 * x coordinate.
 */
public static class Text implements Comparable<Text> {
    /** x coordinate at which the chunk starts. */
    public final float x;
    /** The chunk's textual content. */
    public final String text;
    /** Horizontal extent of the chunk. */
    public float width;

    /**
     * @param x     x coordinate at which the chunk starts
     * @param text  textual content
     * @param width horizontal extent of the chunk
     */
    public Text(float x, String text, float width) {
        this.width = width;
        this.text = text;
        this.x = x;
    }

    /** Orders chunks by ascending x coordinate. */
    @Override
    public int compareTo(Text other) {
        return Float.compare(this.x, other.x);
    }
}
/**
 * A line of text: every chunk sharing one baseline y coordinate. Rows are
 * naturally ordered by descending y.
 */
public static class Row implements Comparable<Row> {
    /** Baseline y coordinate shared by the chunks of this row. */
    public final float y;
    /** Chunks added to this row, in insertion order until sorted by a caller. */
    public final List<Text> chunks = Lists.newArrayList();

    /**
     * @param y baseline y coordinate of the row
     */
    public Row(float y) {
        this.y = y;
    }

    /**
     * Appends a chunk to this row.
     *
     * @param x     x coordinate at which the chunk starts
     * @param text  textual content
     * @param width horizontal extent of the chunk
     * @return this row, to allow call chaining
     */
    public Row text(float x, String text, float width) {
        Text chunk = new Text(x, text, width);
        chunks.add(chunk);
        return this;
    }

    /** Orders rows by descending y coordinate. */
    @Override
    public int compareTo(Row other) {
        // Swapping the operands reverses the natural float ordering.
        return Float.compare(other.y, this.y);
    }
}
public class InternalListener implements RenderListener {
public InternalListener() {
}
/**
* @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
*/
public void beginTextBlock() {
}
/**
* @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
*/
public void endTextBlock() {
}
/**
* @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(
*com.itextpdf.text.pdf.parser.ImageRenderInfo)
*/
public void renderImage(ImageRenderInfo renderInfo) {
}
/**
* @see com.itextpdf.text.pdf.parser.RenderListener#renderText(
*com.itextpdf.text.pdf.parser.TextRenderInfo)
*/
public void renderText(TextRenderInfo renderInfo) {
LineSegment baseline = renderInfo.getBaseline();
float x = baseline.getStartPoint().get(0);
float y = baseline.getStartPoint().get(1);
float w = baseline.getLength();
String text = renderInfo.getText();
log.debug("Text: @({}, {}) width: {} '{}'", x, y, w, text);
currentPage().rowAt(y).text(x, text, w);
}
}
/** Returns the page that is currently receiving rendered text. */
private Page currentPage() {
    return this.currentPage;
}
}