/************************************************************************************************** * Copyright (c) 2010 Mihail Atanassov and others. All rights reserved. This program and the * accompanying materials are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html * <p/> * Contributors: <br/> * Mihail Atanassov - initial API and implementation <br/> * Fabian Steeg - Refactored for PdfBox *************************************************************************************************/ package de.uni_koeln.ub.drc.reader; import java.util.ArrayList; import java.util.List; /** * @author Mihail Atanssov <saeko.bjagai@gmail.com>(original version) <br/> * Fabian Steeg <fsteeg@gmail.com> (Refactored for PdfBox) */ public final class PageInfo { private List<Paragraph> paragraphs = new ArrayList<Paragraph>(); private List<ExtractedWord> words; /** * @param words * The words extracted from the PDF document */ public PageInfo(final List<ExtractedWord> words) { this.words = words; toParagraphs(); } /** * @param index * Position of a paragraph in the PDF document * @return The paragraph from index */ public Paragraph getParagraphAt(final int index) { if (index >= 0 && index <= paragraphs.size()) return paragraphs.get(index); return null; } /** * @return All paragraphs from the PDF document */ public List<Paragraph> getParagraphs() { return new ArrayList<Paragraph>(paragraphs); } private void toParagraphs() { Paragraph paragraph = new Paragraph(); paragraphs.add(paragraph); for (ExtractedWord word : words) { if (word.isParagraphStart()) { paragraph = new Paragraph(); paragraph.addWord(word); paragraphs.add(paragraph); } else { paragraph.addWord(word); } } } @Override public String toString() { return String.format("%s with %s paragraphs, %s words", getClass() //$NON-NLS-1$ .getSimpleName(), paragraphs.size(), words.size()); } }