package edu.isi.bmkeg.lapdf.model; import java.io.File; import java.io.Serializable; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import edu.isi.bmkeg.lapdf.extraction.exceptions.InvalidPopularSpaceValueException; import edu.isi.bmkeg.lapdf.model.RTree.RTPageBlock; import edu.isi.bmkeg.lapdf.model.RTree.RTSpatialEntity; import edu.isi.bmkeg.lapdf.model.ordering.SpatialOrdering; import edu.isi.bmkeg.lapdf.model.spatial.SpatialEntity; import edu.isi.bmkeg.utils.FrequencyCounter; import edu.isi.bmkeg.utils.IntegerFrequencyCounter; public class LapdfDocument implements Serializable { private File pdfFile; private ArrayList<PageBlock> pageList; private IntegerFrequencyCounter avgHeightFrequencyCounter; private FrequencyCounter fontFrequencyCounter; private int mostPopularWordHeight = -1; private String mostPopularFontStyle = ""; private String nextMostPopularFontStyle = ""; private String mostPopularFontStyleOnLastPage = ""; // This the rectangle that holds the text of the main 'panel' // across the whole document (excluding footers and headers) private SpatialEntity bodyTextFrame; private boolean jPedalDecodeFailed; // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public LapdfDocument(File pdfFile) { this.setPdfFile(pdfFile); this.setAvgHeightFrequencyCounter(new IntegerFrequencyCounter(1)); this.setFontFrequencyCounter(new FrequencyCounter()); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public boolean hasjPedalDecodeFailed() { return jPedalDecodeFailed; } public void setjPedalDecodeFailed(boolean jPedalDecodeFailed) { this.jPedalDecodeFailed = jPedalDecodeFailed; } public int getTotalNumberOfPages() { return this.pageList.size(); } public File getPdfFile() { return pdfFile; } public void setPdfFile(File pdfFile) { this.pdfFile = pdfFile; } public IntegerFrequencyCounter getAvgHeightFrequencyCounter() { return avgHeightFrequencyCounter; } public void setAvgHeightFrequencyCounter(IntegerFrequencyCounter avgHeightFrequencyCounter) { this.avgHeightFrequencyCounter = avgHeightFrequencyCounter; } public FrequencyCounter getFontFrequencyCounter() { return fontFrequencyCounter; } public void setFontFrequencyCounter(FrequencyCounter fontFrequencyCounter) { this.fontFrequencyCounter = fontFrequencyCounter; } public SpatialEntity getBodyTextFrame() { return bodyTextFrame; } public void setBodyTextFrame(SpatialEntity bodyTextFrame) { this.bodyTextFrame = bodyTextFrame; } public String getMostPopularFontStyle() { return mostPopularFontStyle; } public void setMostPopularFontStyle(String mostPopularFontStyle) { this.mostPopularFontStyle = mostPopularFontStyle; } public String getNextMostPopularFontStyle() { return nextMostPopularFontStyle; } public void setNextMostPopularFontStyle(String nextMostPopularFontStyle) { this.nextMostPopularFontStyle = nextMostPopularFontStyle; } public String getMostPopularFontStyleOnLastPage() { return mostPopularFontStyleOnLastPage; } public void setMostPopularFontStyleOnLastPage( String mostPopularFontStyleOnLastPage) { this.mostPopularFontStyleOnLastPage = mostPopularFontStyleOnLastPage; } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public void addPages(List<PageBlock> pageList) { this.pageList = new ArrayList<PageBlock>(pageList); } public PageBlock getPage(int pageNumber) { return pageList.get(pageNumber - 1); } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public ChunkBlock getLastChunkBlock(ChunkBlock chunk) throws InvalidPopularSpaceValueException { int pageNumber = ((PageBlock) chunk.getContainer()).getPageNumber(); PageBlock page = this.getPage(pageNumber); if (page.getMostPopularVerticalSpaceBetweenWordsPage() < 0 && page.getMostPopularWordHeightPage() > page .getMostPopularWordWidthPage() * 2) { // page.getMostPopularWordHeightPage()>page.getMostPopularWordWidthPage()*2 System.err.println( "Possible page with vertical text flow at page number +" + pageNumber); // throw new // InvalidPopularSpaceValueException("Possible page with vertical text flow at page number +"+pageNumber); } if (chunk.readLastChunkBlock() != null) { // System.out.println("Same page"); return chunk.readLastChunkBlock(); } else { pageNumber = ((PageBlock) chunk.getContainer()).getPageNumber() - 1; if (pageNumber == 0) { return null; } page = this.getPage(pageNumber); List<ChunkBlock> sortedChunkBlockList = page .getAllChunkBlocks(SpatialOrdering.COLUMN_AWARE_MIXED_MODE); // System.out.println("Page:"+ pageNumber); return sortedChunkBlockList.get(sortedChunkBlockList.size() - 1); } } public int readMostPopularWordHeight() { if( this.mostPopularWordHeight != -1 ) return this.mostPopularWordHeight; int mp = this.avgHeightFrequencyCounter.getMostPopular(); double mpCount = this.avgHeightFrequencyCounter.getCount(mp); int nmp = this.avgHeightFrequencyCounter.getNextMostPopular(); double nmpCount = this.avgHeightFrequencyCounter.getCount(nmp); double ratio = nmpCount / mpCount; // Sneaky check for long reference sections if (nmp > mp && ratio > 0.8) { mostPopularWordHeight = nmp; } else { mostPopularWordHeight = mp; } return mostPopularWordHeight; } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ public void packForSerialization() { this.setBodyTextFrame(null); for (int i = 1; i <= this.getTotalNumberOfPages(); i++) { RTPageBlock page = (RTPageBlock) this.getPage(i); page.packForSerialization(); } } public void unpackFromSerialization() { for (int i = 1; i <= this.getTotalNumberOfPages(); i++) { RTPageBlock page = (RTPageBlock) this.getPage(i); page.unpackFromSerialization(); } this.calculateBodyTextFrame(); } public void calculateBodyTextFrame() { String mp = (String) this.fontFrequencyCounter.getMostPopular(); String[] mpArray = mp.split(";"); int x_min = 10000; int y_min = 10000; int x_max = -1; int y_max = -1; Iterator<PageBlock> pgIt = this.pageList.iterator(); while( pgIt.hasNext() ) { PageBlock pg = pgIt.next(); Iterator<WordBlock> wdIt = pg.getAllWordBlocks(SpatialOrdering.MIXED_MODE).iterator(); while( wdIt.hasNext() ) { WordBlock wd = wdIt.next(); if( wd.getFont() == null || wd.getFontStyle() == null) continue; if( wd.getFont().equals(mpArray[0]) && wd.getFontStyle().equals(mpArray[1]) ) { if( wd.getX1() < x_min ) x_min = wd.getX1(); if( wd.getX2() > x_max ) x_max = wd.getX2(); if( wd.getY1() < y_min ) y_min = wd.getY1(); if( wd.getY2() > y_max ) y_max = wd.getY2(); } } } this.setBodyTextFrame(new RTSpatialEntity( (float) x_min, (float) y_min, (float) x_max, (float) y_max )); } public void calculateMostPopularFontStyles() { String lastPage = this.readMostPopularFontStyleOnLastPage(); String mp = (String) this.fontFrequencyCounter.getMostPopular(); String nmp = (String) this.fontFrequencyCounter.getNextMostPopular(); String nnmp = (String) this.fontFrequencyCounter.getThirdMostPopular(); if( mp.equals( lastPage ) ) { this.setMostPopularFontStyle(nmp); this.setNextMostPopularFontStyle(nnmp); } else if( nmp.equals( lastPage ) ) { this.setMostPopularFontStyle(mp); this.setNextMostPopularFontStyle(nnmp); } else { this.setMostPopularFontStyle(mp); this.setNextMostPopularFontStyle(nmp); } } public String readMostPopularFontStyleOnLastPage() { if( this.getMostPopularFontStyleOnLastPage() != null && this.getMostPopularFontStyleOnLastPage().length() > 0 ) { return this.getMostPopularFontStyleOnLastPage(); } this.setMostPopularFontStyle((String) this.fontFrequencyCounter.getMostPopular() ); FrequencyCounter freq = new FrequencyCounter (); Iterator<PageBlock> pgIt = this.pageList.iterator(); while( pgIt.hasNext() ) { PageBlock pg = pgIt.next(); if( pg.getPageNumber() < this.pageList.size() ) { continue; } Iterator<WordBlock> wdIt = pg.getAllWordBlocks(SpatialOrdering.MIXED_MODE).iterator(); while( wdIt.hasNext() ) { WordBlock wd = wdIt.next(); if( wd.getFont() == null || wd.getFontStyle() == null) continue; freq.add( wd.getFont() + ";" + wd.getFontStyle() ); } } this.setMostPopularFontStyleOnLastPage((String) freq.getMostPopular() ); return this.getMostPopularFontStyleOnLastPage(); } }