/* * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elacin.pdfextract.physical; import org.apache.log4j.Logger; import org.elacin.pdfextract.content.*; import org.elacin.pdfextract.geom.MathUtils; import org.elacin.pdfextract.geom.Rectangle; import org.elacin.pdfextract.geom.RectangleCollection; import org.elacin.pdfextract.geom.Sorting; import org.elacin.pdfextract.physical.column.ColumnFinder; import org.elacin.pdfextract.physical.column.WhitespaceFinder; import org.elacin.pdfextract.physical.graphics.CategorizedGraphics; import org.elacin.pdfextract.physical.graphics.GraphicSegmentator; import org.elacin.pdfextract.physical.graphics.GraphicSegmentatorImpl; import org.elacin.pdfextract.physical.line.LineSegmentator; import org.elacin.pdfextract.physical.paragraph.ParagraphSegmentator; import org.elacin.pdfextract.style.Style; import org.elacin.pdfextract.tree.GraphicsNode; import org.elacin.pdfextract.tree.LineNode; import org.elacin.pdfextract.tree.PageNode; import org.elacin.pdfextract.tree.ParagraphNode; import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.Nullable; import java.util.*; import static org.elacin.pdfextract.Constants.COLUMNS_ENABLE_COLUMN_DETECTION; import static org.elacin.pdfextract.geom.Sorting.createSmallestFirstQueue; import static org.elacin.pdfextract.physical.PageRegionSplitBySpacing.splitOfTopTextOfPage; /** * Created by IntelliJ IDEA. User: elacin Date: 09.12.10 Time: 23.24 To change this template use * File | Settings | File Templates. */ public class PageSegmentator { // ------------------------------ FIELDS ------------------------------ @NotNull private static final Logger log = Logger.getLogger(PageSegmentator.class); /* */ private static final ParagraphSegmentator paragraphSegmentator = new ParagraphSegmentator(); // -------------------------- PUBLIC STATIC METHODS -------------------------- public static PageNode analyzePage(@NotNull PhysicalPage page) { final PhysicalPageRegion mainRegion = page.getMainRegion(); final ParagraphNumberer numberer = new ParagraphNumberer(page.getPageNumber()); GraphicSegmentator graphicSegmentator = new GraphicSegmentatorImpl(mainRegion.getPos()); final CategorizedGraphics categorizedGraphics = graphicSegmentator.categorizeGraphics( page.getAllGraphics(), mainRegion); mainRegion.addContents(categorizedGraphics.getContents()); /* first separate out what is contained by graphics */ extractGraphicalRegions(categorizedGraphics, mainRegion); mainRegion.ensureAllContentInLeafNodes(); splitOfTopTextOfPage(page, 0.4f); PageRegionSplitBySeparators.splitRegionBySeparators(mainRegion, categorizedGraphics); /* This will detect column boundaries and split up all regions */ recursivelyDivide(mainRegion); /* * this is to make text ordering work, if it was in the main region it would destroy * the sorting */ mainRegion.ensureAllContentInLeafNodes(); divideRegionsByLargeHorizontalBands(mainRegion); /* first create the page node which will hold everything */ final PageNode ret = new PageNode(page.getPageNumber()); if (log.isDebugEnabled()) { StringBuffer sb = new StringBuffer(); printRegions(sb, mainRegion, 0); log.debug(sb); } createParagraphsForRegion(ret, mainRegion, numberer, false); if (log.isInfoEnabled()) { log.info("LOG00940:Page had " + ret.getChildren().size() + " paragraphs"); } return ret; } // -------------------------- STATIC METHODS -------------------------- private static void createParagraphsForRegion(final PageNode page, final PhysicalPageRegion region, final ParagraphNumberer numberer, boolean wasContainedInGraphic) { numberer.newRegion(); paragraphSegmentator.setMedianVerticalSpacing(region.getMedianOfVerticalDistances()); final ContentGrouper contentGrouper = new ContentGrouper(region); final List<RectangleCollection> blocks = contentGrouper.findBlocksOfContent(); Collections.sort(blocks, Sorting.regionComparator); for (RectangleCollection block : blocks) { /** * start by separating all the graphical content in a block. * This is surely an oversimplification, but it think it should work for our * purposes. This very late combination of graphics will be used to grab all text * which is contained within */ @Nullable Rectangle graphicBounds = extractBoundOfPlainGraphics(block, region.getContainingGraphic()); final List<LineNode> lines = LineSegmentator.createLinesFromBlocks(block); /** * separate out everything related to graphics in this part of the page into a single * paragraph */ if (graphicBounds != null) { GraphicsNode graphical = null; if (wasContainedInGraphic) { for (GraphicsNode graphicsNode : page.getGraphics()) { if (graphicBounds.getPos().containedBy(graphicsNode.getPos())) { graphical = graphicsNode; break; } } if (graphical == null) { graphical = page.getGraphics().get(page.getGraphics().size() - 1); graphical.setGraphicsPos(graphical.getGraphicsPos().union(graphicBounds)); } } else { graphical = new GraphicsNode(graphicBounds); page.addGraphics(graphical); } ParagraphNode paragraph = new ParagraphNode(numberer.getParagraphId(false)); for (Iterator<LineNode> iterator = lines.iterator(); iterator.hasNext(); ) { final LineNode line = iterator.next(); if (region.isGraphicalRegion() || graphicBounds.intersectsWith(line.getPos())) { paragraph.addChild(line); iterator.remove(); } } if (!paragraph.getChildren().isEmpty()) { graphical.addChild(paragraph); } } /* then add the rest of the paragraphs */ page.addChildren(paragraphSegmentator.segmentParagraphsByStyleAndDistance(lines, numberer)); } Collections.sort(region.getSubregions(), Sorting.regionComparator); for (int i = 0; i < region.getSubregions().size(); i++) { final PhysicalPageRegion subregion = region.getSubregions().get(i); createParagraphsForRegion(page, subregion, numberer, region.isGraphicalRegion()); } } private static void divideRegionsByLargeHorizontalBands(final PhysicalPageRegion region) { for (int i = 0; i < region.getSubregions().size(); i++) { final PhysicalPageRegion sub = region.getSubregions().get(i); if (PageRegionSplitBySpacing.splitRegionHorizontally(sub)) { PhysicalPageRegion newSub = sub.getSubregions().get(sub.getSubregions().size() - 1); sub.removeSubRegion(newSub); region.addSubRegion(newSub); i = -1; } } for (PhysicalPageRegion subRegion : region.getSubregions()) { divideRegionsByLargeHorizontalBands(subRegion); } } @Nullable private static Rectangle extractBoundOfPlainGraphics(final RectangleCollection block, final GraphicContent containingGraphic) { List<PhysicalContent> nontextualContent = new ArrayList<PhysicalContent>(); for (Iterator<PhysicalContent> iterator = block.getContents().iterator(); iterator.hasNext(); ) { final PhysicalContent content = iterator.next(); if (!content.isGraphic()) { continue; } final GraphicContent g = content.getGraphicContent(); if (g.isMathBar() || g.isSeparator()) { continue; } nontextualContent.add(content); iterator.remove(); } if (containingGraphic != null) { nontextualContent.add(containingGraphic); } @Nullable Rectangle nonTextualBound = null; if (!nontextualContent.isEmpty()) { nonTextualBound = MathUtils.findBounds(nontextualContent); assert !nonTextualBound.equals(Rectangle.EMPTY_RECTANGLE); // wtf? if (nonTextualBound.equals(Rectangle.EMPTY_RECTANGLE)) { nonTextualBound = null; } } return nonTextualBound; } /** * separate out the content which is contained within a graphic. sort the graphics by smallest, * because they might overlap. * * @param graphics * @param r */ private static void extractGraphicalRegions(@NotNull CategorizedGraphics graphics, @NotNull PhysicalPageRegion r) { PriorityQueue<GraphicContent> queue = createSmallestFirstQueue(graphics.getContainers()); while (!queue.isEmpty()) { final GraphicContent graphic = queue.remove(); try { r.extractSubRegionFromGraphic(graphic, false); } catch (Exception e) { log.info("LOG00320:Could not divide page::" + e.getMessage()); if (graphic.getPos().area() < r.getPos().area() * 0.4f) { if (log.isInfoEnabled()) { log.info("LOG00690:Adding " + graphic + " as content"); } graphic.setCanBeAssigned(true); graphic.setStyle(Style.GRAPHIC_IMAGE); r.addContent(graphic); } else { graphics.getGraphicsToRender().remove(graphic); } } } } private static void printRegions(final StringBuffer sb, final PhysicalPageRegion region, final int indent) { Collections.sort(region.getSubregions(), Sorting.regionComparator); for (int i = 0; i < indent; i++) { sb.append(" "); } sb.append("region:").append(region.getPos()).append(", size: "); sb.append(region.getContents().size()); if (region.isGraphicalRegion()) { sb.append(", graphical "); } sb.append("\n"); for (PhysicalPageRegion sub : region.getSubregions()) { printRegions(sb, sub, indent + 4); } } private static void recursivelyDivide(@NotNull PhysicalPageRegion region) { final List<WhitespaceRectangle> whitespaces = WhitespaceFinder.findWhitespace(region); region.addWhitespace(whitespaces); if (!COLUMNS_ENABLE_COLUMN_DETECTION) { return; } final List<WhitespaceRectangle> columnBoundaries = ColumnFinder.extractColumnBoundaries(region, whitespaces); for (WhitespaceRectangle column : columnBoundaries) { if (log.isInfoEnabled()) { log.info("LOG01050:Column boundary at " + column + " found for region " + region); } } region.addWhitespace(columnBoundaries); for (PhysicalPageRegion subRegion : region.getSubregions()) { recursivelyDivide(subRegion); } Collections.sort(columnBoundaries, Sorting.sortByHigherX); for (WhitespaceRectangle boundary : columnBoundaries) { Rectangle right = new Rectangle(boundary.getPos().getMiddleX(), boundary.getPos().y + 1, region.getPos().endX - boundary.getPos().getMiddleX(), boundary.getPos().height - 1); region.extractSubRegionFromBound(right, false); } } }