package edu.isi.bmkeg.lapdf.model.RTree;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.isi.bmkeg.lapdf.model.Block;
import edu.isi.bmkeg.lapdf.model.ChunkBlock;
import edu.isi.bmkeg.lapdf.model.LapdfDirection;
import edu.isi.bmkeg.lapdf.model.PageBlock;
import edu.isi.bmkeg.lapdf.model.WordBlock;
import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering;
import edu.isi.bmkeg.lapdf.model.spatial.SpatialEntity;

/**
 * {@link ChunkBlock} implementation backed by {@link RTSpatialEntity}.
 *
 * <p>A chunk block is a rectangle (x1, y1, x2, y2) that lives inside a
 * container block. Every method that inspects words or neighbouring chunks
 * casts the container to {@link PageBlock} and queries it, so those methods
 * require the container to have been set first.
 */
public class RTChunkBlock extends RTSpatialEntity implements ChunkBlock {

    private Block container;
    private int mostPopularWordHeight;
    private int mostPopularWordSpaceWidth;
    private String mostPopularWordFont;
    private String mostPopularWordStyle;

    // Result stored by readLeftRightMedLine(); null until a value is stored.
    private String alignment = null;

    private String type = Block.TYPE_UNCLASSIFIED;

    // Stays null until setHeaderOrFooter(boolean) has been called.
    private Boolean headerOrFooter = null;

    /**
     * Creates a chunk block with the given bounding coordinates, which are
     * passed straight to the {@link RTSpatialEntity} constructor.
     *
     * @param x1 first x coordinate
     * @param y1 first y coordinate
     * @param x2 second x coordinate
     * @param y2 second y coordinate
     */
    public RTChunkBlock(int x1, int y1, int x2, int y2) {
        super(x1, y1, x2, y2);
    }

    /** @return the id reported by the superclass */
    @Override
    public int getId() {
        return super.getId();
    }

    /** @return the block this chunk belongs to, or {@code null} if none was set */
    @Override
    public Block getContainer() {
        return container;
    }

    /** @return the stored most popular word height */
    @Override
    public int getMostPopularWordHeight() {
        return mostPopularWordHeight;
    }

    /** @return the stored most popular word-space width */
    public int getMostPopularWordSpaceWidth() {
        return mostPopularWordSpaceWidth;
    }

    /** @param mostPopularWordSpaceWidth the most popular word-space width to store */
    public void setMostPopularWordSpaceWidth(int mostPopularWordSpaceWidth) {
        this.mostPopularWordSpaceWidth = mostPopularWordSpaceWidth;
    }

    /** @return the stored most popular word font */
    public String getMostPopularWordFont() {
        return mostPopularWordFont;
    }

    /** @param mostPopularWordFont the most popular word font to store */
    public void setMostPopularWordFont(String mostPopularWordFont) {
        this.mostPopularWordFont = mostPopularWordFont;
    }

    /** @param height the most popular word height to store */
    public void setMostPopularWordHeight(int height) {
        this.mostPopularWordHeight = height;
    }

    /** @return the stored most popular word style */
    @Override
    public String getMostPopularWordStyle() {
        return mostPopularWordStyle;
    }

    /** @param style the most popular word style to store */
    @Override
    public void setMostPopularWordStyle(String style) {
        this.mostPopularWordStyle = style;
    }

    /**
     * @return the header/footer flag, or {@code null} if
     *         {@link #setHeaderOrFooter(boolean)} has never been called
     */
    @Override
    public Boolean isHeaderOrFooter() {
        return headerOrFooter;
    }

    /** @param headerOrFooter the header/footer flag to store */
    @Override
    public void setHeaderOrFooter(boolean headerOrFooter) {
        this.headerOrFooter = headerOrFooter;
    }

    /**
     * Sets the containing block.
     *
     * @param block the container; it is cast to {@link PageBlock}, so any
     *              other non-null {@link Block} implementation raises a
     *              {@link ClassCastException}
     */
    @Override
    public void setContainer(Block block) {
        this.container = (PageBlock) block;
    }

    /** @return the classification type, initially {@code Block.TYPE_UNCLASSIFIED} */
    @Override
    public String getType() {
        return type;
    }

    /** @param type the classification type to store */
    @Override
    public void setType(String type) {
        this.type = type;
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // Layout queries
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    /**
     * Classifies this chunk as {@code LEFT}, {@code RIGHT} or {@code MIDLINE}
     * relative to the median of its page.
     *
     * <ul>
     * <li>{@code LEFT} when the chunk starts before the median and ends before
     * the median plus the document's most popular word height;</li>
     * <li>{@code RIGHT} when the chunk starts after the median;</li>
     * <li>otherwise the widths on each side of the median are compared: if one
     * side is less than 5% of the other, the chunk is assigned to the larger
     * side, else it is {@code MIDLINE}.</li>
     * </ul>
     *
     * <p>NOTE(review): only the result of the ratio comparison is stored in
     * {@code alignment}; the two early returns are recomputed on every call.
     * This asymmetry is kept as-is because it is not visible here whether the
     * geometry can change between calls - confirm before caching more.
     *
     * @return {@code LEFT}, {@code RIGHT} or {@code MIDLINE}
     */
    @Override
    public String readLeftRightMedLine() {
        if (alignment != null) {
            return alignment;
        }

        PageBlock parent = (PageBlock) this.getContainer();
        int median = parent.getMedian();
        int x1 = this.getX1();
        int width = this.getWidth();
        int documentWordHeight = parent.getDocument().readMostPopularWordHeight();

        // Conditions for left.
        if (x1 < median && (x1 + width) < (median + documentWordHeight)) {
            return LEFT;
        }

        // Conditions for right.
        if (x1 > median) {
            return RIGHT;
        }

        // Conditions for midline: how much of the chunk lies on each side.
        int left = median - x1;
        int right = x1 + width - median;

        // The original author left a disabled, self-described "doubtful" guard
        // here ("if (right <= 0) return LEFT;"). It remains disabled. The
        // divisions below are floating point, so a zero denominator gives
        // Infinity or NaN rather than an exception.
        double leftIsToRight = (double) left / (double) right;
        double rightIsToLeft = (double) right / (double) left;

        if (leftIsToRight < 0.05) {
            alignment = RIGHT;
        } else if (rightIsToLeft < 0.05) {
            alignment = LEFT;
        } else {
            alignment = MIDLINE;
        }
        return alignment;
    }

    /**
     * Tests whether this chunk is flush against the page median or against a
     * page margin.
     *
     * <p>NOTE(review): the right margin is read from index 3 of
     * {@code getMargin()} and the left margin from index 0; the layout of that
     * array is not visible in this file - confirm index 3 is the intended one.
     *
     * @param condition which edge to test: {@code MIDLINE}, {@code LEFT} or
     *                  {@code RIGHT}; any other value yields {@code false}
     * @param value     tolerance; the edge must be strictly closer than this
     * @return {@code true} if the relevant edge of this chunk lies within
     *         {@code value} of the requested reference line
     */
    public boolean isFlush(String condition, int value) {
        PageBlock parent = (PageBlock) this.getContainer();
        int median = parent.getMedian();
        String side = this.readLeftRightMedLine();
        int x1 = this.getX1();
        int x2 = this.getX2();
        int marginX1 = parent.getMargin()[0];
        int marginX2 = parent.getMargin()[3];

        if (condition.equals(MIDLINE)) {
            if (side.equals(MIDLINE)) {
                // A chunk straddling the median cannot be flush against it.
                return false;
            }
            if (side.equals(LEFT)) {
                return isWithin(x2, median, value);
            }
            if (side.equals(RIGHT)) {
                return isWithin(x1, median, value);
            }
            return false;
        }

        if (condition.equals(LEFT)) {
            boolean eligible = side.equals(MIDLINE) || side.equals(LEFT);
            return eligible && isWithin(x1, marginX1, value);
        }

        if (condition.equals(RIGHT)) {
            boolean eligible = side.equals(MIDLINE) || side.equals(RIGHT);
            return eligible && isWithin(x2, marginX2, value);
        }

        return false;
    }

    /** Returns true if {@code a} and {@code b} differ by strictly less than {@code tolerance}. */
    private static boolean isWithin(int a, int b, int tolerance) {
        return Math.abs(a - b) < tolerance;
    }

    /**
     * Counts the text lines in this chunk.
     *
     * <p>The words contained in this chunk are fetched from the page in
     * {@code SpatialOrdering.MIXED_MODE} order. A new line is counted each time
     * a word's vertical centre lies more than half that word's height below
     * the previous word's vertical centre.
     *
     * @return the number of lines, or 0 if the chunk contains no words
     */
    @Override
    public int readNumberOfLine() {
        PageBlock parent = (PageBlock) this.container;
        List<SpatialEntity> words = parent.containsByType(this,
                SpatialOrdering.MIXED_MODE, WordBlock.class);

        if (words.isEmpty()) {
            return 0;
        }

        WordBlock first = (WordBlock) words.get(0);
        int lineCount = 1;
        int previousCentre;
        int currentCentre = first.getY1() + first.getHeight() / 2;

        for (SpatialEntity entity : words) {
            WordBlock word = (WordBlock) entity;
            previousCentre = currentCentre;
            currentCentre = word.getY1() + word.getHeight() / 2;
            if (currentCentre > previousCentre + word.getHeight() / 2) {
                lineCount++;
            }
        }
        return lineCount;
    }

    /**
     * Concatenates the words contained in this chunk, in
     * {@code SpatialOrdering.MIXED_MODE} order.
     *
     * <p>Words are separated by a single space, except that no space is added
     * after a word ending in {@code "-"}, so the next word is joined directly
     * to it. Words whose text is {@code null} are skipped.
     *
     * @return the chunk text with leading and trailing whitespace trimmed
     */
    @Override
    public String readChunkText() {
        List<SpatialEntity> words = ((PageBlock) container).containsByType(
                this, SpatialOrdering.MIXED_MODE, WordBlock.class);

        StringBuilder text = new StringBuilder();
        for (SpatialEntity entity : words) {
            String word = ((WordBlock) entity).getWord();
            if (word == null) {
                // Previously this appended the literal "null" and then threw
                // a NullPointerException on endsWith.
                continue;
            }
            text.append(word);
            if (!word.endsWith("-")) {
                text.append(" ");
            }
        }
        return text.toString().trim();
    }

    /**
     * Finds the chunk that immediately precedes this one in the page's
     * {@code SpatialOrdering.COLUMN_AWARE_MIXED_MODE} ordering.
     *
     * @return the preceding chunk, or {@code null} if this chunk is first in
     *         the ordering or the binary search does not find it
     */
    @Override
    public ChunkBlock readLastChunkBlock() {
        PageBlock parent = (PageBlock) this.getContainer();
        List<ChunkBlock> ordered =
                parent.getAllChunkBlocks(SpatialOrdering.COLUMN_AWARE_MIXED_MODE);

        int index = Collections.binarySearch(ordered, this,
                new SpatialOrdering(SpatialOrdering.COLUMN_AWARE_MIXED_MODE));

        // binarySearch returns a negative value when the key is absent; index 0
        // means there is no predecessor.
        if (index <= 0) {
            return null;
        }
        return ordered.get(index - 1);
    }

    /**
     * Returns true if the chunk block contains text that matches the input
     * regex. The match may be anywhere in the text ({@link Matcher#find()}).
     *
     * @param regex the regular expression to search for
     * @return {@code true} if some part of {@link #readChunkText()} matches
     */
    @Override
    public boolean isMatchingRegularExpression(String regex) {
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(this.readChunkText());
        return matcher.find();
    }

    /**
     * Returns true if the chunk block has at least one neighbour of the given
     * type in the given direction. Neighbours are found with
     * {@link #getOverlappingNeighbors(int, PageBlock, ChunkBlock)}.
     *
     * @param type the type to look for, compared ignoring case
     * @param nsew one of the {@link LapdfDirection} constants
     * @return {@code true} if a neighbour of that type exists
     */
    @Override
    public boolean hasNeighboursOfType(String type, int nsew) {
        List<ChunkBlock> neighbours = getOverlappingNeighbors(nsew,
                (PageBlock) this.getContainer(), (ChunkBlock) this);

        for (ChunkBlock neighbour : neighbours) {
            if (sameType(type, neighbour.getType())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns true if a single-line chunk of the given type sits directly
     * above this one with both edges roughly aligned.
     *
     * <p>A northern neighbour qualifies when its x1 and x2 each differ from
     * this chunk's by less than twice this chunk's most popular word height,
     * it has exactly one line, and its type matches ignoring case.
     *
     * @param type the type to look for
     * @return {@code true} if such a neighbour exists
     */
    @Override
    public boolean isUnderOneLineFlushNeighboursOfType(String type) {
        List<ChunkBlock> neighbours = getOverlappingNeighbors(
                LapdfDirection.NORTH, (PageBlock) this.getContainer(),
                (ChunkBlock) this);

        double threshold = this.getMostPopularWordHeight() * 2;

        for (ChunkBlock neighbour : neighbours) {
            int leftDelta = Math.abs(neighbour.getX1() - this.getX1());
            int rightDelta = Math.abs(neighbour.getX2() - this.getX2());

            // Cheap geometric tests first; readNumberOfLine() queries the page.
            if (leftDelta < threshold
                    && rightDelta < threshold
                    && neighbour.readNumberOfLine() == 1
                    && sameType(type, neighbour.getType())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Null-safe, case-insensitive type comparison; false if either side is null.
     */
    private static boolean sameType(String wanted, String actual) {
        return wanted != null && wanted.equalsIgnoreCase(actual);
    }

    /**
     * Collects the chunk blocks of {@code parent} that intersect a search
     * rectangle derived from {@code chunkBlock} and a direction.
     *
     * <p>The search rectangle starts as the chunk's own bounds and is then
     * adjusted:
     * <ul>
     * <li>{@code NORTH}: half the height, placed directly above;</li>
     * <li>{@code SOUTH}: half the height, placed directly below;</li>
     * <li>{@code EAST}: half the width, placed directly to the right;</li>
     * <li>{@code WEST}: half the width, placed directly to the left;</li>
     * <li>{@code NORTH_SOUTH}: double height, shifted up by half the height;</li>
     * <li>{@code EAST_WEST}: double width, shifted left by half the width;</li>
     * <li>any other value: the chunk's own bounds unchanged.</li>
     * </ul>
     *
     * @param nsew       one of the {@link LapdfDirection} constants
     * @param parent     the page to search
     * @param chunkBlock the chunk whose neighbours are wanted
     * @return a new list of the intersecting chunk blocks, possibly empty
     */
    public List<ChunkBlock> getOverlappingNeighbors(int nsew, PageBlock parent,
            ChunkBlock chunkBlock) {

        int topX = chunkBlock.getX1();
        int topY = chunkBlock.getY1();
        int width = chunkBlock.getWidth();
        int height = chunkBlock.getHeight();

        // Statement order inside each branch matters: some branches use the
        // already-halved extent for the shift, others use the original one.
        if (nsew == LapdfDirection.NORTH) {
            height = height / 2;
            topY = topY - height;
        } else if (nsew == LapdfDirection.SOUTH) {
            topY = topY + height;
            height = height / 2;
        } else if (nsew == LapdfDirection.EAST) {
            topX = topX + width;
            width = width / 2;
        } else if (nsew == LapdfDirection.WEST) {
            width = width / 2;
            topX = topX - width;
        } else if (nsew == LapdfDirection.NORTH_SOUTH) {
            topY = topY - height / 2;
            height = height * 2;
        } else if (nsew == LapdfDirection.EAST_WEST) {
            topX = topX - width / 2;
            width = width * 2;
        }

        SpatialEntity searchArea = new RTChunkBlock(topX, topY,
                topX + width, topY + height);

        List<ChunkBlock> neighbours = new ArrayList<ChunkBlock>();
        Iterator<SpatialEntity> hits = parent.intersectsByType(
                searchArea, null, ChunkBlock.class).iterator();
        while (hits.hasNext()) {
            neighbours.add((ChunkBlock) hits.next());
        }
        return neighbours;
    }
}