package edu.isi.bmkeg.lapdf.model.RTree; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Properties; import com.infomatiq.jsi.Rectangle; import com.infomatiq.jsi.rtree.RTree; import edu.isi.bmkeg.lapdf.extraction.exceptions.InvalidPopularSpaceValueException; import edu.isi.bmkeg.lapdf.model.ChunkBlock; import edu.isi.bmkeg.lapdf.model.WordBlock; import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering; import edu.isi.bmkeg.lapdf.model.spatial.SpatialEntity; import edu.isi.bmkeg.lapdf.model.spatial.SpatialRepresentation; import edu.isi.bmkeg.utils.IntegerFrequencyCounter; public class RTSpatialRepresentation implements SpatialRepresentation { private Map<Integer, WordBlock> indexToWordBlockMap; private Map<Integer, ChunkBlock> indexToChunkBlockMap; private int mostPopularHorizontalSpaceBetweenWords = -1; private int mostPopularWordWidth = -1; private int mostPopularVerticalSpaceBetweenWords = -1; private int mostPopularWordHeightPerPage = -1; private int[] margin = null; private List<WordBlock> list = null; private RTree tree; private int maxNode = 1500; private int minNode = 1; protected RTSpatialRepresentation() { this.indexToWordBlockMap = new HashMap<Integer, WordBlock>(); this.indexToChunkBlockMap = new HashMap<Integer, ChunkBlock>(); Properties prp = new Properties(); prp.setProperty("MaxNodeEntries", "" + maxNode); prp.setProperty("MinNodeEntries", "" + minNode); tree = new RTree(); tree.init(prp); } @Override public void add(SpatialEntity entity, int id) { RTSpatialEntity rtSpatialEntity = (RTSpatialEntity) entity; rtSpatialEntity.setId(id); if (rtSpatialEntity instanceof ChunkBlock) { this.indexToChunkBlockMap.put(id, (ChunkBlock) rtSpatialEntity); } else { this.indexToWordBlockMap.put(id, (WordBlock) rtSpatialEntity); } tree.add(rtSpatialEntity, id); } @Override public int addAll(List<SpatialEntity> list, int startId) { for (SpatialEntity entity : list) this.add(entity, startId++); return startId; } public List<SpatialEntity> intersects(SpatialEntity entity, String ordering) { return this.intersectsByType(entity, ordering, null); } @Override public SpatialEntity getEntity(int id) { if (indexToWordBlockMap.containsKey(id)) return indexToWordBlockMap.get(id); return indexToChunkBlockMap.get(id); } @Override public List<ChunkBlock> getAllChunkBlocks(String ordering) { List<ChunkBlock> list = new ArrayList<ChunkBlock>( indexToChunkBlockMap.values()); if (ordering != null) { Collections.sort(list, new SpatialOrdering(ordering)); } return list; } @Override public int[] getMargin() { if (margin == null) { margin = new int[4]; Rectangle marginRect = tree.getBounds(); margin[0] = (int) marginRect.minX; margin[1] = (int) marginRect.minY; margin[2] = (int) marginRect.maxX; margin[3] = (int) marginRect.maxY; return margin; } return margin; } @Override public int getMedian() { if( margin == null ) this.getMargin(); return margin[0] + (margin[2] - margin[0]) / 2; } @Override public List<SpatialEntity> contains(SpatialEntity entity, String ordering) { return this.containsByType(entity, ordering, null); } @Override public boolean delete(SpatialEntity entity, int id) { RTSpatialEntity rtSpatialEntity = (RTSpatialEntity) entity; if (indexToChunkBlockMap.containsKey(id)) indexToChunkBlockMap.remove(id); else indexToWordBlockMap.remove(id); boolean treeDel = tree.delete(rtSpatialEntity, id); return treeDel; } @Override public List<SpatialEntity> intersectsByType(SpatialEntity entity, String ordering, Class classType) { RTProcedure procedure = new RTProcedure(this, ordering, (RTSpatialEntity) entity, classType, false); tree.intersects((RTSpatialEntity) entity, procedure); return procedure.getIntersectionList(); } @Override public List<WordBlock> getAllWordBlocks(String ordering) { List<WordBlock> list = new ArrayList<WordBlock>( indexToWordBlockMap.values()); if (ordering != null) { Collections.sort(list, new SpatialOrdering(ordering)); } return list; } @Override public List<SpatialEntity> containsByType(SpatialEntity entity, String ordering, Class classType) { RTProcedure procedure = new RTProcedure(this, ordering, (RTSpatialEntity) entity, classType, true); tree.contains((RTSpatialEntity) entity, procedure); if (procedure.getIntersectionList().size() == 0) { List<SpatialEntity> intersectList = this.intersectsByType(entity, ordering, classType); List<SpatialEntity> returnList = new ArrayList<SpatialEntity>(); for (SpatialEntity loopEntity : intersectList) { if (entity.getX1() <= loopEntity.getX1() && entity.getX2() >= loopEntity.getX2() && entity.getY1() <= loopEntity.getY1() && entity.getY2() >= loopEntity.getY2()) returnList.add(loopEntity); return returnList; } } return procedure.getIntersectionList(); } @Override public int getMostPopularHorizontalSpaceBetweenWordsPage() throws InvalidPopularSpaceValueException { if (mostPopularHorizontalSpaceBetweenWords != -1) { return mostPopularHorizontalSpaceBetweenWords; } IntegerFrequencyCounter avgHorizontalSpaceBetweenWordFrequencyCounter = new IntegerFrequencyCounter( 1); if (list == null) list = this.getAllWordBlocks(SpatialOrdering.MIXED_MODE); int lastX2 = list.get(0).getX2(); int space; for (WordBlock block : list) { space = block.getX1() - lastX2; if (space > 0) { avgHorizontalSpaceBetweenWordFrequencyCounter.add(space); } lastX2 = block.getX2(); } int mostPopular = avgHorizontalSpaceBetweenWordFrequencyCounter .getMostPopular(); double mostPopularCount = avgHorizontalSpaceBetweenWordFrequencyCounter .getCount(mostPopular); int secondMostPopular = avgHorizontalSpaceBetweenWordFrequencyCounter .getNextMostPopular(); double secondMostPopularCount = avgHorizontalSpaceBetweenWordFrequencyCounter .getCount(secondMostPopular); double ratio = secondMostPopularCount / mostPopularCount; if (secondMostPopular > mostPopular && ratio > 0.8) { mostPopularHorizontalSpaceBetweenWords = secondMostPopular; } else { mostPopularHorizontalSpaceBetweenWords = mostPopular; } if (mostPopularHorizontalSpaceBetweenWords == -1) { throw new InvalidPopularSpaceValueException( "RTSpatialRepresentation.getMostPopularHorizontalSpaceBetweenWordsPage"); } propagateCalculation(); // System.out.println("Returning mostPopularHorizontalSpaceBetweenWords"+mostPopularHorizontalSpaceBetweenWords); return mostPopularHorizontalSpaceBetweenWords; } private void propagateCalculation() throws InvalidPopularSpaceValueException { if (mostPopularHorizontalSpaceBetweenWords == -1) { getMostPopularHorizontalSpaceBetweenWordsPage(); } if (mostPopularWordWidth == -1) { getMostPopularWordWidthPage(); } if (mostPopularVerticalSpaceBetweenWords == -1) { getMostPopularVerticalSpaceBetweenWordsPage(); } if (mostPopularWordHeightPerPage == -1) { getMostPopularWordHeightPage(); } list = null; } @Override public int getMostPopularVerticalSpaceBetweenWordsPage() throws InvalidPopularSpaceValueException { if (mostPopularVerticalSpaceBetweenWords != -1) { return mostPopularVerticalSpaceBetweenWords; } IntegerFrequencyCounter verticalSpaceBetweenWordFrequencyCounter = new IntegerFrequencyCounter( 1); if (list == null) list = this.getAllWordBlocks(SpatialOrdering.MIXED_MODE); int lastX2 = list.get(0).getX2(); int firstY2 = list.get(0).getY2(); int space; for (WordBlock block : list) { space = block.getX1() - lastX2; if (space < 0) { verticalSpaceBetweenWordFrequencyCounter.add(block.getY1() - firstY2); firstY2 = block.getY2(); } lastX2 = block.getX2(); } int mostPopular = verticalSpaceBetweenWordFrequencyCounter .getMostPopular(); double mostPopularCount = verticalSpaceBetweenWordFrequencyCounter .getCount(mostPopular); int secondMostPopular = verticalSpaceBetweenWordFrequencyCounter .getNextMostPopular(); double secondMostPopularCount = verticalSpaceBetweenWordFrequencyCounter .getCount(secondMostPopular); double ratio = secondMostPopularCount / mostPopularCount; if (secondMostPopular > mostPopular && ratio > 0.8) { mostPopularVerticalSpaceBetweenWords = secondMostPopular; } else { mostPopularVerticalSpaceBetweenWords = mostPopular; } if (mostPopularVerticalSpaceBetweenWords == -1) { throw new InvalidPopularSpaceValueException( "RTSpatialRepresentation.getMostPopularVerticalSpaceBetweenWordsPage"); } propagateCalculation(); // System.out.println("Returning mostPopularVerticalSpaceBetweenWords"+mostPopularVerticalSpaceBetweenWords); return mostPopularVerticalSpaceBetweenWords; } @Override public int getMostPopularWordWidthPage() { if (mostPopularWordWidth != -1) { return mostPopularWordWidth; } IntegerFrequencyCounter avgWordWidthFrequencyCounter = new IntegerFrequencyCounter( 1); if (list == null) list = this.getAllWordBlocks(null); for (WordBlock block : list) avgWordWidthFrequencyCounter.add(block.getWidth()); int mostPopular = avgWordWidthFrequencyCounter.getMostPopular(); double mostPopularCount = avgWordWidthFrequencyCounter .getCount(mostPopular); int secondMostPopular = avgWordWidthFrequencyCounter .getNextMostPopular(); double secondMostPopularCount = avgWordWidthFrequencyCounter .getCount(secondMostPopular); double ratio = secondMostPopularCount / mostPopularCount; if (secondMostPopular > mostPopular && ratio > 0.8) { mostPopularWordWidth = secondMostPopular; } else { mostPopularWordWidth = mostPopular; } propagateWordBasedCalculation(); return mostPopularWordWidth; } @Override public int getMostPopularWordHeightPage() { if (mostPopularWordHeightPerPage != -1) { return mostPopularWordHeightPerPage; } IntegerFrequencyCounter avgWordWidthFrequencyCounter = new IntegerFrequencyCounter( 1); if (list == null) list = this.getAllWordBlocks(null); for (WordBlock block : list) avgWordWidthFrequencyCounter.add(block.getHeight()); int mostPopular = avgWordWidthFrequencyCounter.getMostPopular(); double mostPopularCount = avgWordWidthFrequencyCounter .getCount(mostPopular); int secondMostPopular = avgWordWidthFrequencyCounter .getNextMostPopular(); double secondMostPopularCount = avgWordWidthFrequencyCounter .getCount(secondMostPopular); double ratio = secondMostPopularCount / mostPopularCount; if (secondMostPopular > mostPopular && ratio > 0.8) { mostPopularWordHeightPerPage = secondMostPopular; } else { mostPopularWordHeightPerPage = mostPopular; } propagateWordBasedCalculation(); return mostPopularWordHeightPerPage; } private void propagateWordBasedCalculation() { if (mostPopularWordWidth == -1) { getMostPopularWordWidthPage(); } if (mostPopularWordHeightPerPage == -1) { getMostPopularWordHeightPage(); } list = null; } public void packForSerialization() { this.tree = null; } public void unpackFromSerialization() { Properties prp = new Properties(); prp.setProperty("MaxNodeEntries", "" + maxNode); prp.setProperty("MinNodeEntries", "" + minNode); tree = new RTree(); tree.init(prp); Iterator<Integer> it = this.indexToChunkBlockMap.keySet().iterator(); while( it.hasNext() ) { Integer id = it.next(); ChunkBlock r = this.indexToChunkBlockMap.get(id); this.tree.add( (RTSpatialEntity) r, id ); } it = this.indexToWordBlockMap.keySet().iterator(); while( it.hasNext() ) { Integer id = it.next(); WordBlock r = this.indexToWordBlockMap.get(id); this.tree.add( (RTSpatialEntity) r, id ); } } }