package technology.tabula;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * Builds horizontal and vertical projection profiles of the rectangles that
 * lie inside a page area and derives column / row separator positions from
 * them.
 *
 * <p>Internally every coordinate is converted to a fixed-point integer
 * ({@code value * 10^DECIMAL_PLACES}) which is used as an index into the
 * projection arrays. Index 0 of the horizontal projection corresponds to the
 * left edge of the area, index 0 of the vertical projection to its top edge.
 */
public class ProjectionProfile {

    public static final int DECIMAL_PLACES = 1; // fixed <-> float conversion precision

    /** Multiplier used by {@link #toFixed(double)} and {@link #toDouble(int)}. */
    private static final double FIXED_SCALE = Math.pow(10, DECIMAL_PLACES);

    /** Elements wider than this fraction of the text bounds are ignored. */
    private static final double MAX_ELEMENT_WIDTH_RATIO = 0.8;

    /** A ruling must span at least this fraction of the text bounds to count as a separator. */
    private static final double MIN_RULING_LENGTH_RATIO = 0.95;

    /** Smoothing factor of the low pass filter applied to the first derivative. */
    private static final float DERIVATIVE_FILTER_ALPHA = 0.1f;

    private final Page area;
    private final Rectangle textBounds;
    private float[] verticalProjection;
    private float[] horizontalProjection;
    private final double areaWidth, areaHeight, areaTop, areaLeft;
    private float minCharWidth = Float.MAX_VALUE, minCharHeight = Float.MAX_VALUE,
            horizontalKernelSize, verticalKernelSize;
    private float maxHorizontalProjection = 0, maxVerticalProjection = 0;

    /**
     * Computes both projection profiles for {@code elements} and smooths them.
     *
     * @param area                 the page area the profiles are computed for
     * @param elements             rectangles to project; those not contained in
     *                             {@code area} are skipped
     * @param horizontalKernelSize smoothing kernel size for the horizontal
     *                             projection, converted to fixed-point before use
     * @param verticalKernelSize   smoothing kernel size for the vertical
     *                             projection, converted to fixed-point before use
     */
    public ProjectionProfile(Page area, List<? extends Rectangle> elements,
                             float horizontalKernelSize, float verticalKernelSize) {
        this.area = area;
        this.areaWidth = area.getWidth();
        this.areaHeight = area.getHeight();
        this.areaTop = area.getTop();
        this.areaLeft = area.getLeft();
        this.verticalProjection = new float[toFixed(areaHeight)];
        this.horizontalProjection = new float[toFixed(areaWidth)];
        this.horizontalKernelSize = horizontalKernelSize;
        this.verticalKernelSize = verticalKernelSize;
        this.textBounds = area.getTextBounds();

        for (Rectangle element : elements) {
            // Exclude elements that take more than 80% of the width of the
            // text bounds. They won't contribute to determining columns.
            if (element.getWidth() / this.textBounds.getWidth() > MAX_ELEMENT_WIDTH_RATIO) {
                continue;
            }
            this.addRectangle(element);
        }

        this.verticalProjection = smooth(this.verticalProjection, toFixed(verticalKernelSize));
        this.horizontalProjection = smooth(this.horizontalProjection, toFixed(horizontalKernelSize));
    }

    /**
     * Adds one rectangle to both projection profiles. Rectangles that are not
     * contained in the area are ignored.
     */
    private void addRectangle(Rectangle element) {
        if (!area.contains(element)) {
            return;
        }

        this.minCharHeight = (float) Math.min(this.minCharHeight, element.getHeight());
        this.minCharWidth = (float) Math.min(this.minCharWidth, element.getWidth());

        // The edges and the area size are rounded to fixed-point independently,
        // so (round(right) - round(left)) can exceed round(width) by one.
        // Clamp the index range so an element touching the area border cannot
        // run past the end of the projection array.
        int leftOrigin = toFixed(areaLeft);
        int colStart = Math.max(0, toFixed(element.getLeft()) - leftOrigin);
        int colEnd = Math.min(this.horizontalProjection.length,
                toFixed(element.getRight()) - leftOrigin);
        for (int col = colStart; col < colEnd; col++) {
            this.horizontalProjection[col] += element.getHeight();
            this.maxHorizontalProjection =
                    Math.max(this.maxHorizontalProjection, this.horizontalProjection[col]);
        }

        int topOrigin = toFixed(areaTop);
        int rowStart = Math.max(0, toFixed(element.getTop()) - topOrigin);
        int rowEnd = Math.min(this.verticalProjection.length,
                toFixed(element.getBottom()) - topOrigin);
        for (int row = rowStart; row < rowEnd; row++) {
            this.verticalProjection[row] += element.getWidth();
            this.maxVerticalProjection =
                    Math.max(this.maxVerticalProjection, this.verticalProjection[row]);
        }
    }

    /**
     * @return the smoothed vertical projection (the internal array, not a copy)
     */
    public float[] getVerticalProjection() {
        return verticalProjection;
    }

    /**
     * @return the smoothed horizontal projection (the internal array, not a copy)
     */
    public float[] getHorizontalProjection() {
        return horizontalProjection;
    }

    /**
     * Finds vertical separators by combining long vertical rulings with
     * separators detected in the horizontal projection.
     *
     * @param minColumnWidth a projection-derived separator is dropped when it
     *                       lies within this distance of an already accepted one
     * @return the separator positions in ascending order, measured from the
     *         left edge of the area
     */
    public float[] findVerticalSeparators(float minColumnWidth) {
        List<Integer> verticalSeparators = new ArrayList<Integer>();
        for (Ruling r : area.getVerticalRulings()) {
            if (r.length() / this.textBounds.getHeight() >= MIN_RULING_LENGTH_RATIO) {
                verticalSeparators.add(toFixed(r.getPosition() - this.areaLeft));
            }
        }

        List<Integer> seps = findSeparatorsFromProjection(
                filter(getFirstDeriv(this.horizontalProjection), DERIVATIVE_FILTER_ALPHA));
        return mergeSeparators(verticalSeparators, seps, minColumnWidth);
    }

    /**
     * Finds horizontal separators by combining long horizontal rulings with
     * separators detected in the vertical projection.
     *
     * @param minRowHeight a projection-derived separator is dropped when it
     *                     lies within this distance of an already accepted one
     * @return the separator positions in ascending order, measured from the
     *         top edge of the area
     */
    public float[] findHorizontalSeparators(float minRowHeight) {
        List<Integer> horizontalSeparators = new ArrayList<Integer>();
        for (Ruling r : area.getHorizontalRulings()) {
            if (r.length() / this.textBounds.getWidth() >= MIN_RULING_LENGTH_RATIO) {
                horizontalSeparators.add(toFixed(r.getPosition() - this.areaTop));
            }
        }

        List<Integer> seps = findSeparatorsFromProjection(
                filter(getFirstDeriv(this.verticalProjection), DERIVATIVE_FILTER_ALPHA));
        return mergeSeparators(horizontalSeparators, seps, minRowHeight);
    }

    /**
     * Appends to {@code accepted} every candidate that is farther than
     * {@code minDistance} from all separators accepted so far (including
     * candidates accepted earlier in this call), then returns the sorted
     * result converted back from fixed-point.
     *
     * @param accepted    fixed-point separators that are always kept; modified in place
     * @param candidates  fixed-point separators to add when not too close to an accepted one
     * @param minDistance minimum distance, in non-fixed units, between a
     *                    candidate and any accepted separator
     */
    private static float[] mergeSeparators(List<Integer> accepted, List<Integer> candidates,
                                           float minDistance) {
        for (Integer candidate : candidates) {
            boolean tooClose = false;
            for (Integer existing : accepted) {
                if (Math.abs(toDouble(candidate - existing)) <= minDistance) {
                    tooClose = true;
                    break;
                }
            }
            if (!tooClose) {
                accepted.add(candidate);
            }
        }

        Collections.sort(accepted);
        float[] rv = new float[accepted.size()];
        for (int i = 0; i < rv.length; i++) {
            rv[i] = (float) toDouble(accepted.get(i));
        }
        return rv;
    }

    /**
     * Scans a derivative and records one separator each time the slope turns
     * positive. The recorded index is the last index seen with a negative
     * slope, or the current index when no negative slope has been seen yet.
     */
    private static List<Integer> findSeparatorsFromProjection(float[] derivative) {
        List<Integer> separators = new ArrayList<Integer>();
        Integer lastNeg = null;
        boolean positiveSlope = false;

        for (int i = 0; i < derivative.length; i++) {
            float slope = derivative[i];
            if (slope > 0 && !positiveSlope) {
                positiveSlope = true;
                separators.add(lastNeg != null ? lastNeg : i);
            } else if (slope < 0) {
                lastNeg = i;
                positiveSlope = false;
            }
        }
        return separators;
    }

    /**
     * Smooths {@code data} with a box kernel. For each index {@code i} the
     * samples in {@code [i - kernelSize/2, i + kernelSize/2)} (clipped to the
     * array) are summed, divided by {@code kernelSize} and floored.
     *
     * <p>NOTE: because the upper bound is exclusive and {@code kernelSize/2}
     * is an integer division, a {@code kernelSize} of 1 yields an empty window
     * and therefore an all-zero result.
     *
     * @param data       the samples to smooth; not modified
     * @param kernelSize the kernel width in samples; when it is not positive
     *                   no smoothing is applied and a copy of {@code data} is
     *                   returned (avoids a division by zero)
     * @return a new array of the same length as {@code data}
     */
    public static float[] smooth(float[] data, int kernelSize) {
        if (kernelSize <= 0) {
            return data.clone();
        }

        float[] rv = new float[data.length];
        int half = kernelSize / 2;
        for (int i = 0; i < data.length; i++) {
            float sum = 0;
            int end = Math.min(i + half, data.length);
            for (int j = Math.max(0, i - half); j < end; j++) {
                sum += data[j];
            }
            rv[i] = (float) Math.floor(sum / (float) kernelSize);
        }
        return rv;
    }

    /**
     * Simple low pass filter:
     * {@code rv[i] = rv[i-1] + alpha * (data[i] - rv[i-1])}, seeded with
     * {@code rv[0] = data[0]}.
     *
     * @param data  the samples to filter; not modified
     * @param alpha the smoothing factor
     * @return a new array of the same length as {@code data} (empty for empty input)
     */
    public static float[] filter(float[] data, float alpha) {
        float[] rv = new float[data.length];
        if (data.length == 0) {
            return rv;
        }

        rv[0] = data[0];
        for (int i = 1; i < data.length; i++) {
            rv[i] = rv[i - 1] + alpha * (data[i] - rv[i - 1]);
        }
        return rv;
    }

    /**
     * Returns, for each index {@code i >= 1}, the product of sample {@code i}
     * and its predecessor divided by 100. Element 0 is left at 0.
     *
     * @param projection the input samples; not modified
     * @return a new array of length {@code projection.length - 1}
     *         (empty for empty input)
     */
    public static float[] getAutocorrelation(float[] projection) {
        float[] rv = new float[Math.max(0, projection.length - 1)];
        for (int i = 1; i < projection.length - 1; i++) {
            rv[i] = (projection[i] * projection[i - 1]) / 100f;
        }
        return rv;
    }

    /**
     * Approximates the first derivative: a one-sided difference at both ends
     * and {@code projection[i+1] - projection[i-1]} everywhere else.
     *
     * @param projection the input samples; not modified
     * @return a new array of the same length as {@code projection}; all zeros
     *         when fewer than two samples are given
     */
    public static float[] getFirstDeriv(float[] projection) {
        int n = projection.length;
        float[] rv = new float[n];
        if (n < 2) {
            // No neighbour to take a difference with.
            return rv;
        }

        rv[0] = projection[1] - projection[0];
        for (int i = 1; i < n - 1; i++) {
            rv[i] = projection[i + 1] - projection[i - 1];
        }
        rv[n - 1] = projection[n - 1] - projection[n - 2];
        return rv;
    }

    // pretty lame fixed precision math here

    /** Converts a value to fixed-point by scaling with 10^DECIMAL_PLACES and rounding. */
    private static int toFixed(double value) {
        return (int) Math.round(value * FIXED_SCALE);
    }

    /** Converts a fixed-point value back by dividing by 10^DECIMAL_PLACES. */
    private static double toDouble(int value) {
        return (double) value / FIXED_SCALE;
    }
}