/* * Copyright 2010-2011 Øyvind Berg (elacin@gmail.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.elacin.pdfextract.physical; import org.apache.log4j.Logger; import org.elacin.pdfextract.content.PhysicalContent; import org.elacin.pdfextract.content.PhysicalPage; import org.elacin.pdfextract.content.PhysicalPageRegion; import org.elacin.pdfextract.geom.Rectangle; import org.elacin.pdfextract.style.Style; import org.elacin.pdfextract.style.StyleComparator; import org.elacin.pdfextract.style.StyleDifference; import org.elacin.pdfextract.style.TextUtils; import org.jetbrains.annotations.NotNull; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; /** * Created by IntelliJ IDEA. User: elacin Date: 08.12.10 Time: 01.27 To change this template use * File | Settings | File Templates. */ public class PageRegionSplitBySpacing { // ------------------------------ FIELDS ------------------------------ private static final Logger log = Logger.getLogger(PageRegionSplitBySpacing.class); // -------------------------- PUBLIC STATIC METHODS -------------------------- @NotNull public static boolean splitOfTopTextOfPage(@NotNull PhysicalPage page, float fractionToConsider) { PhysicalPageRegion r = page.getMainRegion(); final Rectangle realDims = r.getPage().getPageDimensions(); final int minimumDistanceToSplit = 10; return tryHorizontalSplit(r, realDims, fractionToConsider, minimumDistanceToSplit); } @NotNull public static boolean splitRegionHorizontally(@NotNull PhysicalPageRegion region) { final int minimumDistanceToSplit = 20; return tryHorizontalSplit(region, region.getPos(), 1.0f, minimumDistanceToSplit); } // -------------------------- STATIC METHODS -------------------------- private static boolean sameStyleOverAndUnderHorizontalLine(final PhysicalPageRegion r, final float y, final Set<PhysicalContent> over) { List<PhysicalContent> under = new ArrayList<PhysicalContent>(); float yIndex = y; while (under.isEmpty() && (yIndex < r.getPos().endY)) { under.addAll(r.findContentAtYIndex(yIndex)); yIndex += 1.0f; } final Style styleOver = TextUtils.findDominatingStyle(over); final Style styleUnder = TextUtils.findDominatingStyle(under); return StyleComparator.styleCompare(styleOver, styleUnder) == StyleDifference.SAME_STYLE; } private static boolean tryHorizontalSplit(final PhysicalPageRegion r, final Rectangle dims, final float fractionToConsider, final int minimumDistanceToSplit) { final float startY = dims.y; final float endY = Math.min(r.getPos().endY, startY + dims.height * fractionToConsider); float lastBoundary = -1000.0f; float minX = Float.MAX_VALUE, maxX = Float.MIN_VALUE; PhysicalPageRegion activeRegion = r; Set<PhysicalContent> workingSet = new HashSet<PhysicalContent>(); for (float y = startY; y <= endY; y++) { if (y < activeRegion.getPos().y) { continue; } final List<PhysicalContent> row = activeRegion.findContentAtYIndex(y); workingSet.addAll(row); for (PhysicalContent content : row) { minX = Math.min(content.getPos().x, minX); maxX = Math.max(content.getPos().endX, maxX); } if (row.isEmpty()) { if (!TextUtils.listContainsStyledText(workingSet)) { continue; } if ((y - lastBoundary < minimumDistanceToSplit)) { continue; } if (sameStyleOverAndUnderHorizontalLine(activeRegion, y, workingSet)) { continue; } if (log.isInfoEnabled()) { log.info(String.format("LOG00530:split/hor at y=%s for %s ", y, activeRegion)); } boolean success = PageRegionSplitBySeparators.splitRegionAtY(activeRegion, y); if (!success) { break; } PhysicalPageRegion lowerNewSubRegion = activeRegion.getSubregions().get( activeRegion.getSubregions().size() - 1); activeRegion = lowerNewSubRegion; // tryHorizontalSplit(lowerNewSubRegion, dims, fractionToConsider, minimumDistanceToSplit); // return r.extractSubRegionFromContent(workingSet); workingSet.clear(); lastBoundary = y; /* only extract once */ // return true; } else { lastBoundary = y; } } return false; } }