package edu.isi.bmkeg.lapdf.parser; import java.io.File; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.TreeSet; import java.util.concurrent.LinkedBlockingQueue; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import edu.isi.bmkeg.lapdf.extraction.JPedalExtractor; import edu.isi.bmkeg.lapdf.extraction.exceptions.InvalidPopularSpaceValueException; import edu.isi.bmkeg.lapdf.features.HorizontalSplitFeature; import edu.isi.bmkeg.lapdf.model.Block; import edu.isi.bmkeg.lapdf.model.ChunkBlock; import edu.isi.bmkeg.lapdf.model.LapdfDocument; import edu.isi.bmkeg.lapdf.model.PageBlock; import edu.isi.bmkeg.lapdf.model.WordBlock; import edu.isi.bmkeg.lapdf.model.factory.AbstractModelFactory; import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering; import edu.isi.bmkeg.lapdf.model.spatial.SpatialEntity; import edu.isi.bmkeg.lapdf.utils.PageImageOutlineRenderer; import edu.isi.bmkeg.utils.FrequencyCounter; import edu.isi.bmkeg.utils.IntegerFrequencyCounter; public class RuleBasedParser implements Parser { private static Logger logger = Logger.getLogger(RuleBasedParser.class); private boolean debugImages = false; private ArrayList<PageBlock> pageList; private JPedalExtractor extractor; private int idGenerator; private IntegerFrequencyCounter avgHeightFrequencyCounter; private FrequencyCounter fontFrequencyCounter; private int northSouthSpacing; private int eastWestSpacing; protected AbstractModelFactory modelFactory; protected String path; // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public String getPath() { return path; } public void setPath(String path) { this.path = path; } private boolean isDebugImages() { return debugImages; } private void setDebugImages(boolean debugImages) { this.debugImages = debugImages; } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public RuleBasedParser(AbstractModelFactory modelFactory) throws Exception { pageList = new ArrayList<PageBlock>(); extractor = new JPedalExtractor(modelFactory); idGenerator = 1; this.avgHeightFrequencyCounter = new IntegerFrequencyCounter(1); this.fontFrequencyCounter = new FrequencyCounter(); this.modelFactory = modelFactory; } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @Override public LapdfDocument parse(File file) throws Exception { LapdfDocument document = null; init(file); List<WordBlock> pageWordBlockList = null; PageBlock pageBlock = null; int pageCounter = 1; document = new LapdfDocument(file); document.setjPedalDecodeFailed(true); while (extractor.hasNext()) { document.setjPedalDecodeFailed(false); pageBlock = modelFactory.createPageBlock( pageCounter++, extractor.getCurrentPageBoxWidth(), extractor.getCurrentPageBoxHeight(), document); pageList.add(pageBlock); pageWordBlockList = extractor.next(); idGenerator = pageBlock.initialize(pageWordBlockList, idGenerator); this.eastWestSpacing = pageBlock.getMostPopularWordHeightPage() + pageBlock.getMostPopularHorizontalSpaceBetweenWordsPage(); logger.debug(this.eastWestSpacing); this.northSouthSpacing = pageBlock.getMostPopularWordHeightPage() + pageBlock.getMostPopularVerticalSpaceBetweenWordsPage(); buildChunkBlocks(pageWordBlockList, pageBlock); if (isDebugImages()) { PageImageOutlineRenderer.createPageImage( pageBlock, file, file.getName() + "afterBuildBlocks" + pageBlock.getPageNumber() + ".png", 0); } } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if (!document.hasjPedalDecodeFailed()) { // initial parse is commplete. String s = file.getName().replaceAll("\\.pdf", ""); Pattern p = Pattern.compile("(\\d+)"); Matcher m = p.matcher(file.getName()); if( m.find() ) { s = m.group(1); } for (PageBlock page : pageList) { if (isDebugImages()) { PageImageOutlineRenderer.createPageImage(page, file, file.getName() + "beforeBuildBlocksOverlapDeletion_" + s + "_" + page.getPageNumber() + ".png", 0); } this.deleteHighlyOverlappedChunkBlocks(page); if (isDebugImages()) { PageImageOutlineRenderer.createPageImage(page, file, file.getName() + "afterBuildBlocksOverlapDeletion_" + s + "_" + page.getPageNumber() + ".png", 0); } this.divideBlocksVertically(page); if (isDebugImages()) { PageImageOutlineRenderer.createPageImage(page, file, file.getName() + "afterVerticalDivide_" + s + "_" + page.getPageNumber() + ".png", 0); } this.joinLines(page); if (isDebugImages()) { PageImageOutlineRenderer.createPageImage(page, file, file.getName() + "afterJoinLines_" + s + "_" + page.getPageNumber() + ".png", 0); } this.divideBlocksHorizontally(page); if (isDebugImages()) { PageImageOutlineRenderer.createPageImage(page, file, file.getName() + "afterHorizontalDivide_" + s + "_" + page.getPageNumber() + ".png", 0); } this.deleteHighlyOverlappedChunkBlocks(page); if (isDebugImages()) { PageImageOutlineRenderer.createPageImage(page, file, file.getName() + "/afterOverlapDeletion_" + s + "_" + page.getPageNumber() + ".png", 0); } } document.addPages(pageList); document.calculateBodyTextFrame(); document.calculateMostPopularFontStyles(); } return document; } private void init(File file) throws Exception { extractor.init(file); idGenerator = 1; this.avgHeightFrequencyCounter.reset(); this.fontFrequencyCounter.reset(); pageList.clear(); } private void buildChunkBlocks(List<WordBlock> pageWordBlockList, PageBlock pageBlock) { ChunkBlock chunkBlock = null; LinkedBlockingQueue<WordBlock> wordBlockList = new LinkedBlockingQueue<WordBlock>(); ArrayList<WordBlock> seenList = new ArrayList<WordBlock>(); List tempList; int counter; ArrayList<ChunkBlock> chunkBlockList = new ArrayList<ChunkBlock>(); while (pageWordBlockList.size() > 0) { wordBlockList.clear(); wordBlockList.add(pageWordBlockList.get(0)); counter = 0; int extra; seenList.clear(); while (wordBlockList.size() != 0) { WordBlock wordBlock = wordBlockList.peek(); pageBlock.getDocument().getAvgHeightFrequencyCounter().add( wordBlock.getHeight() ); pageBlock.getDocument().getFontFrequencyCounter().add( wordBlock.getFont() + ";" + wordBlock.getFontStyle() ); pageWordBlockList.remove(wordBlock); tempList = this.getOverlappingNeighbors(pageBlock, wordBlock, pageWordBlockList); tempList.removeAll(wordBlockList); tempList.removeAll(seenList); wordBlockList.addAll(tempList); seenList.add(wordBlockList.poll()); } pageWordBlockList.removeAll(seenList); chunkBlock = buildChunkBlock(seenList, pageBlock); chunkBlockList.add(chunkBlock); } idGenerator = pageBlock.addAll(new ArrayList<SpatialEntity>( chunkBlockList), idGenerator); } private List<WordBlock> getOverlappingNeighbors( PageBlock pageBlock, WordBlock wordBlock, List<WordBlock> pageWordList) { // expand the current word. int topX = wordBlock.getX1() - this.eastWestSpacing; int topY = wordBlock.getY1() - this.northSouthSpacing; int bottomX = wordBlock.getX2() + this.eastWestSpacing; int bottomY = wordBlock.getY2() + this.northSouthSpacing; SpatialEntity expandedWord = modelFactory.createWordBlock( topX, topY, bottomX, bottomY, 0, null, null, null); // find all overlapping words TreeSet listOfInteresectingBlock = new TreeSet<SpatialEntity>( new SpatialOrdering(SpatialOrdering.MIXED_MODE) ); listOfInteresectingBlock.addAll(pageBlock.intersects(expandedWord, null)); listOfInteresectingBlock.retainAll(pageWordList); List<WordBlock> overlappingNeighbors = new ArrayList<WordBlock>(listOfInteresectingBlock); return overlappingNeighbors; } private void divideBlocksVertically(PageBlock page) throws InvalidPopularSpaceValueException { List<ChunkBlock> chunkBlockList; String leftRightMidline; boolean leftFlush; boolean rightFlush; chunkBlockList = new ArrayList<ChunkBlock>(page.getAllChunkBlocks(null)); for (ChunkBlock chunky : chunkBlockList) { leftRightMidline = chunky.readLeftRightMedLine(); leftFlush = chunky.isFlush(chunky.LEFT, chunky.getMostPopularWordHeight() * 2); rightFlush = chunky.isFlush(chunky.RIGHT, chunky.getMostPopularWordHeight() * 2); int deltaH = chunky.getMostPopularWordHeight() - page.getDocument().readMostPopularWordHeight(); if (chunky.MIDLINE.equalsIgnoreCase(leftRightMidline) && (leftFlush || rightFlush) && deltaH < 3) { if (verticalSplitCandidate(chunky)) this.splitBlockDownTheMiddle(chunky); } } } private boolean verticalSplitCandidate(ChunkBlock block) throws InvalidPopularSpaceValueException { // 0:x,1:width ArrayList<Integer[]> spaceList = new ArrayList<Integer[]>(); int previousX = 0; int previousWidth = 0; int currentX = 0; int currentY = 0; int currentWidth = 0; Integer[] currentSpace = new Integer[] { -1, -1 }; Integer[] currentWidestSpace = new Integer[] { -1, -1 }; PageBlock parent = (PageBlock) block.getContainer(); List<SpatialEntity> wordBlockList = parent.containsByType(block, SpatialOrdering.MIXED_MODE, WordBlock.class); int pageWidth = parent.getMargin()[2] - parent.getMargin()[0]; int marginHeight = parent.getMargin()[3] - parent.getMargin()[1]; int averageWidth = 0; float spaceWidthToPageWidth = 0; for (int i = 0; i < wordBlockList.size(); i++) { WordBlock wordBlock = (WordBlock) wordBlockList.get(i); // New line started if (i == 0 || Math.abs(((double) (wordBlock.getY1() - currentY) / (double) marginHeight)) > 0.01) { currentY = wordBlock.getY1(); currentX = wordBlock.getX1(); currentWidth = wordBlock.getWidth(); if (currentWidestSpace[1] > 0) { spaceList.add(new Integer[] { currentWidestSpace[0], currentWidestSpace[1] }); } currentWidestSpace[0] = -1; currentWidestSpace[1] = -1; continue; } // Continuing current line previousX = currentX; previousWidth = currentWidth; currentY = wordBlock.getY1(); currentX = wordBlock.getX1(); currentWidth = wordBlock.getWidth(); currentSpace[1] = currentX - (previousX + previousWidth); currentSpace[0] = currentX + currentWidth; if (currentWidestSpace[1] == -1 || currentSpace[1] > currentWidestSpace[1]) { currentWidestSpace[0] = currentSpace[0]; currentWidestSpace[1] = currentSpace[1]; } } // Criterium for whether the widest spaces are properly lined up: // At least 20% of them have an x position within that differ with less // than 1% to the x position of the previous space. // The average x position doesn't matter! if (spaceList.size() <= 0) return false; // Find average width of the widest spaces and make sure it's at least // as wide as 2.5% of the page width. for (int i = 0; i < spaceList.size(); i++) averageWidth += spaceList.get(i)[1]; averageWidth = averageWidth / spaceList.size(); // spaceWidthToPageWidth = (float) averageWidth / (float) pageWidth; /* * if (spaceWidthToPageWidth > 0.015) return true; else return false; */ if (averageWidth > parent .getMostPopularHorizontalSpaceBetweenWordsPage()) return true; else return false; } private void splitBlockDownTheMiddle(ChunkBlock block) { PageBlock parent = (PageBlock) block.getContainer(); int median = parent.getMedian(); ArrayList<WordBlock> leftBlocks = new ArrayList<WordBlock>(); ArrayList<WordBlock> rigthBlocks = new ArrayList<WordBlock>(); List<SpatialEntity> wordBlockList = parent.containsByType(block, SpatialOrdering.MIXED_MODE, WordBlock.class); String wordBlockLeftRightMidLine; for (int i = 0; i < wordBlockList.size(); i++) { WordBlock wordBlock = (WordBlock) wordBlockList.get(i); wordBlockLeftRightMidLine = wordBlock.readLeftRightMedLine(); if (wordBlockLeftRightMidLine.equals(Block.LEFT)) leftBlocks.add(wordBlock); else if (wordBlockLeftRightMidLine.equals(Block.RIGHT)) rigthBlocks.add(wordBlock); else if (wordBlockLeftRightMidLine.equals(Block.MIDLINE)) { // Assign the current word to the left or right side depending // upon // whether most of the word is on the left or right side of the // median. if (Math.abs(median - wordBlock.getX1()) > Math.abs(wordBlock .getX2() - median)) { wordBlock.resize(wordBlock.getX1(), wordBlock.getY1(), median - wordBlock.getX1(), wordBlock.getHeight()); } else { wordBlock.resize(median, wordBlock.getY1(), wordBlock.getX2() - median, wordBlock.getHeight()); rigthBlocks.add(wordBlock); } } }// END for if (leftBlocks.size() == 0 || rigthBlocks.size() == 0) return; ChunkBlock leftChunkBlock = buildChunkBlock(leftBlocks, parent); ChunkBlock rightChunkBlock = buildChunkBlock(rigthBlocks, parent); SpatialEntity entity = modelFactory.createWordBlock( leftChunkBlock.getX2() + 1, leftChunkBlock.getY1(), rightChunkBlock.getX1() - 1, rightChunkBlock.getY2(), 0, null, null, null); if (parent.intersectsByType(entity, null, WordBlock.class).size() >= 1) { if (block == null) { logger.info("null null"); } for (SpatialEntity wordBlockEntity : wordBlockList) ((Block) wordBlockEntity).setContainer(block); return; } double relative_overlap = leftChunkBlock .getRelativeOverlap(rightChunkBlock); if (relative_overlap < 0.1) { parent.delete(block, block.getId()); parent.add(leftChunkBlock, idGenerator++); parent.add(rightChunkBlock, idGenerator++); } } private ChunkBlock buildChunkBlock(List<WordBlock> wordBlockList, PageBlock pageBlock) { ChunkBlock chunkBlock = null; IntegerFrequencyCounter lineHeightFrequencyCounter = new IntegerFrequencyCounter(1); IntegerFrequencyCounter spaceFrequencyCounter = new IntegerFrequencyCounter(0); FrequencyCounter fontFrequencyCounter = new FrequencyCounter(); FrequencyCounter styleFrequencyCounter = new FrequencyCounter(); for (WordBlock wordBlock : wordBlockList) { lineHeightFrequencyCounter.add(wordBlock.getHeight()); spaceFrequencyCounter.add(wordBlock.getSpaceWidth()); avgHeightFrequencyCounter.add(wordBlock.getHeight()); fontFrequencyCounter.add(wordBlock.getFont()); styleFrequencyCounter.add(wordBlock.getFontStyle()); if (chunkBlock == null) { chunkBlock = modelFactory .createChunkBlock(wordBlock.getX1(), wordBlock.getY1(), wordBlock.getX2(), wordBlock.getY2()); } else { SpatialEntity spatialEntity = chunkBlock.union(wordBlock); chunkBlock.resize(spatialEntity.getX1(), spatialEntity.getY1(), spatialEntity.getWidth(), spatialEntity.getHeight()); } wordBlock.setContainer(chunkBlock); } chunkBlock.setMostPopularWordFont( (String) fontFrequencyCounter.getMostPopular() ); chunkBlock.setMostPopularWordStyle( (String) styleFrequencyCounter.getMostPopular() ); chunkBlock.setMostPopularWordHeight( lineHeightFrequencyCounter.getMostPopular() ); chunkBlock.setMostPopularWordSpaceWidth( spaceFrequencyCounter.getMostPopular() ); chunkBlock.setContainer(pageBlock); return chunkBlock; } private void divideBlocksHorizontally(PageBlock page) { List<ChunkBlock> chunkBlockList; ArrayList<Integer> breaks; chunkBlockList = page.getAllChunkBlocks(SpatialOrdering.MIXED_MODE); for (ChunkBlock chunky : chunkBlockList) { breaks = this.getBreaks(chunky); if (breaks.size() > 0) this.splitBlockByBreaks(chunky, breaks); } } private ArrayList<Integer> getBreaks(ChunkBlock block) { ArrayList<Integer> breaks = new ArrayList<Integer>(); PageBlock parent = (PageBlock) block.getContainer(); int mostPopulareWordHeightOverCorpora = parent.getDocument() .readMostPopularWordHeight(); List<SpatialEntity> wordBlockList = parent.containsByType(block, SpatialOrdering.MIXED_MODE, WordBlock.class); WordBlock firstWordOnLine = (WordBlock) wordBlockList.get(0); WordBlock lastWordOnLine = firstWordOnLine; int lastY = firstWordOnLine.getY1() + firstWordOnLine.getHeight() / 2; int currentY = lastY; String chunkBlockString = ""; ArrayList<Integer> breakCandidates = new ArrayList<Integer>(); ArrayList<HorizontalSplitFeature> featureList = new ArrayList<HorizontalSplitFeature>(); HorizontalSplitFeature feature = new HorizontalSplitFeature(); for (SpatialEntity entity : wordBlockList) { lastY = currentY; WordBlock wordBlock = (WordBlock) entity; currentY = wordBlock.getY1() + wordBlock.getHeight() / 2; if (currentY > lastY + wordBlock.getHeight() / 2) { feature.calculateFeatures(block, firstWordOnLine, lastWordOnLine, chunkBlockString); featureList.add(feature); feature = new HorizontalSplitFeature(); breakCandidates .add((lastWordOnLine.getY2() + wordBlock.getY1()) / 2); firstWordOnLine = wordBlock; lastWordOnLine = wordBlock; chunkBlockString = ""; } feature.addToFrequencyCounters(wordBlock.getFont(), wordBlock.getFontStyle()); chunkBlockString = chunkBlockString + " " + wordBlock.getWord(); lastWordOnLine = wordBlock; } feature.calculateFeatures(block, firstWordOnLine, lastWordOnLine, chunkBlockString); featureList.add(feature); feature = null; HorizontalSplitFeature featureMinusOne; // What kind of column is this? // // a. Titles and large-font blocks // b. centered titles // c. centered blocks // d. text & titles in left or right columns // e. references // f. figure legends for (int i = 1; i < featureList.size(); i++) { featureMinusOne = featureList.get(i - 1); feature = featureList.get(i); if (featureMinusOne.isAllCapitals() && !feature.isAllCapitals()) { breaks.add(breakCandidates.get(i - 1)); } else if (!featureMinusOne.isAllCapitals() && feature.isAllCapitals()) { breaks.add(breakCandidates.get(i - 1)); } else if (featureMinusOne.getMostPopularFont() != null && feature.getMostPopularFont() == null) { breaks.add(breakCandidates.get(i - 1)); } else if (featureMinusOne.getMostPopularFont() == null && feature.getMostPopularFont() != null) { breaks.add(breakCandidates.get(i - 1)); } else if (!featureMinusOne.getMostPopularFont().equals( feature.getMostPopularFont()) && !feature.isMixedFont() && !featureMinusOne.isMixedFont()) { breaks.add(breakCandidates.get(i - 1)); } else if (Math.abs(feature.getFirstWordOnLineHeight() - featureMinusOne.getFirstWordOnLineHeight()) > 2) { breaks.add(breakCandidates.get(i - 1)); } else if (Math.abs(feature.getMidYOfLastWordOnLine() - featureMinusOne.getMidYOfLastWordOnLine()) > (feature .getFirstWordOnLineHeight() + featureMinusOne .getFirstWordOnLineHeight()) * 0.75) { breaks.add(breakCandidates.get(i - 1)); } else if (Math.abs(featureMinusOne.getFirstWordOnLineHeight() - mostPopulareWordHeightOverCorpora) <= 2 && Math.abs(feature.getFirstWordOnLineHeight() - mostPopulareWordHeightOverCorpora) <= 2 && Math.abs(featureMinusOne.getMidOffset()) < 10 && Math.abs(featureMinusOne.getExtremLeftOffset()) > 10 && Math.abs(featureMinusOne.getExtremeRightOffset()) > 10 && Math.abs(feature.getExtremLeftOffset()) < 20 && Math.abs(feature.getExtremeRightOffset()) < 10) { breaks.add(breakCandidates.get(i - 1)); } else if (Math.abs(feature.getFirstWordOnLineHeight() - mostPopulareWordHeightOverCorpora) <= 2 && Math.abs(feature.getMidOffset()) < 10 && Math.abs(feature.getExtremLeftOffset()) > 10 && Math.abs(feature.getExtremeRightOffset()) > 10 && Math.abs(featureMinusOne.getExtremLeftOffset()) < 10) { breaks.add(breakCandidates.get(i - 1)); } else if (featureMinusOne.isEndOFLine() && Math.abs(featureMinusOne.getFirstWordOnLineHeight() - mostPopulareWordHeightOverCorpora) <= 2 && (Math.abs(featureMinusOne.getExtremeRightOffset()) > 10 || Math .abs(feature.getExtremLeftOffset()) > 10)) { breaks.add(breakCandidates.get(i - 1)); } } return breaks; } private void splitBlockByBreaks(ChunkBlock block, ArrayList<Integer> breaks) { Collections.sort(breaks); PageBlock parent = (PageBlock) block.getContainer(); List<SpatialEntity> wordBlockList = parent.containsByType(block, SpatialOrdering.MIXED_MODE, WordBlock.class); int y; int breakIndex; ArrayList<ArrayList<WordBlock>> bigBlockList = new ArrayList<ArrayList<WordBlock>>(); for (int j = 0; j < breaks.size() + 1; j++) { ArrayList<WordBlock> littleBlockList = new ArrayList<WordBlock>(); bigBlockList.add(littleBlockList); } for (SpatialEntity entity : wordBlockList) { WordBlock wordBlock = (WordBlock) entity; y = wordBlock.getY1() + wordBlock.getHeight() / 2; breakIndex = Collections.binarySearch(breaks, y); if (breakIndex < 0) { breakIndex = -1 * breakIndex - 1; bigBlockList.get(breakIndex).add(wordBlock); } else { bigBlockList.get(breakIndex).add(wordBlock); } } ChunkBlock chunky; TreeSet<ChunkBlock> chunkBlockList = new TreeSet<ChunkBlock>( new SpatialOrdering(SpatialOrdering.MIXED_MODE)); for (ArrayList<WordBlock> list : bigBlockList) { if (list.size() == 0) continue; chunky = this.buildChunkBlock(list, parent); chunkBlockList.add(chunky); } parent.delete(block, block.getId()); idGenerator = parent.addAll( new ArrayList<SpatialEntity>(chunkBlockList), idGenerator); } private void joinLines(PageBlock page) { LinkedBlockingQueue<ChunkBlock> chunkBlockList = new LinkedBlockingQueue<ChunkBlock>( page.getAllChunkBlocks(SpatialOrdering.MIXED_MODE)); List wordBlockList; int midY; ChunkBlock chunky = null; List<SpatialEntity> neighbouringChunkBlockList; ChunkBlock neighbouringChunkBlock; ArrayList<SpatialEntity> removalList = new ArrayList<SpatialEntity>(); while (chunkBlockList.size() > 0) { chunky = chunkBlockList.peek(); wordBlockList = page.containsByType(chunky, null, WordBlock.class); if (wordBlockList.size() < 4 && chunky.readNumberOfLine() == 1) { neighbouringChunkBlockList = page.intersectsByType( calculateBoundariesForJoin(chunky, page), SpatialOrdering.MIXED_MODE, ChunkBlock.class); if (neighbouringChunkBlockList.size() <= 1) { chunkBlockList.poll(); continue; } for (SpatialEntity entity : neighbouringChunkBlockList) { neighbouringChunkBlock = (ChunkBlock) entity; if (neighbouringChunkBlock.equals(chunky)) continue; midY = chunky.getY1() + chunky.getHeight() / 2; if (neighbouringChunkBlock.getY1() < midY && neighbouringChunkBlock.getY2() > midY && ((neighbouringChunkBlock.getX2() < chunky .getX1() && neighbouringChunkBlock .readNumberOfLine() < 3) || (neighbouringChunkBlock .getX1() > chunky.getX2() && neighbouringChunkBlock .readNumberOfLine() == 1))) { removalList.add(neighbouringChunkBlock); wordBlockList.addAll(page.containsByType( neighbouringChunkBlock, null, WordBlock.class)); } } if (removalList.size() > 0) { ChunkBlock newChunkBlock = this.buildChunkBlock( wordBlockList, page); page.add(newChunkBlock, idGenerator++); page.delete(chunky, chunky.getId()); chunkBlockList.removeAll(removalList); for (SpatialEntity forDeleteEntity : removalList) { page.delete(forDeleteEntity, forDeleteEntity.getId()); } } } removalList.clear(); chunkBlockList.poll(); } } private SpatialEntity calculateBoundariesForJoin(ChunkBlock chunk, PageBlock parent) { SpatialEntity entity = null; int x1 = 0, x2 = 0, y1 = 0, y2 = 0; int width = parent.getMargin()[2] - parent.getMargin()[0]; int height = parent.getMargin()[3] - parent.getMargin()[1]; String lrm = chunk.readLeftRightMedLine(); width = (int) (width * 0.25); y1 = chunk.getY1(); y2 = chunk.getY2(); if (Block.LEFT.equalsIgnoreCase(lrm)) { // TODO:Use reflection x1 = (chunk.getX1() - width <= 0) ? parent.getMargin()[0] : chunk .getX1() - width; x2 = (chunk.getX2() + width >= parent.getMedian()) ? parent .getMedian() : chunk.getX2() + width; entity = modelFactory.createChunkBlock(x1, y1, x2, y2); } else if (Block.RIGHT.equalsIgnoreCase(lrm)) { x1 = (chunk.getX1() - width <= parent.getMedian()) ? parent .getMedian() : chunk.getX1() - width; x2 = (chunk.getX2() + width >= parent.getMargin()[2]) ? parent .getMargin()[2] : chunk.getX2() + width; entity = modelFactory.createChunkBlock(x1, y1, x2, y2); } else { x1 = (chunk.getX1() - width <= 0) ? parent.getMargin()[0] : chunk .getX1() - width; x2 = (chunk.getX2() + width >= parent.getMargin()[2]) ? parent .getMargin()[2] : chunk.getX2() + width; entity = modelFactory.createChunkBlock(x1, y1, x2, y2); } return entity; } private void deleteHighlyOverlappedChunkBlocks(PageBlock page) { List<ChunkBlock> chunkBlockList = page.getAllChunkBlocks( SpatialOrdering.MIXED_MODE ); ChunkBlock chunky; ChunkBlock neighbourChunk; List<SpatialEntity> neighbouringChunkBlockList; List<SpatialEntity> wordList; SpatialEntity intersectingRectangle; double property1, property2; for (SpatialEntity entity : chunkBlockList) { chunky = (ChunkBlock) entity; neighbouringChunkBlockList = page.intersectsByType(chunky, SpatialOrdering.MIXED_MODE, ChunkBlock.class); for (SpatialEntity neighbourEntity : neighbouringChunkBlockList) { neighbourChunk = (ChunkBlock) neighbourEntity; intersectingRectangle = chunky .getIntersectingRectangle(neighbourChunk); property1 = (intersectingRectangle.getHeight() * intersectingRectangle .getWidth()) / (double) (chunky.getWidth() * chunky.getHeight()); property2 = (double) (intersectingRectangle.getHeight() * intersectingRectangle .getWidth()) / (double) (neighbourChunk.getWidth() * neighbourChunk .getHeight()); if (property1 > property2 && property1 > 0.9) { wordList = page.containsByType(chunky, null, WordBlock.class); for (SpatialEntity wordEntity : wordList) ((Block) wordEntity).setContainer(neighbourChunk); page.delete(chunky, chunky.getId()); } if (property2 > property1 && property2 > 0.9) { wordList = page.containsByType(neighbourChunk, null, WordBlock.class); for (SpatialEntity wordEntity : wordList) ((Block) wordEntity).setContainer(chunky); page.delete(neighbourChunk, neighbourChunk.getId()); } } } } }