package edu.isi.bmkeg.lapdf.features;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.uimafit.component.xwriter.XWriterFileNamer;
import edu.isi.bmkeg.lapdf.extraction.exceptions.InvalidPopularSpaceValueException;
import edu.isi.bmkeg.lapdf.model.Block;
import edu.isi.bmkeg.lapdf.model.ChunkBlock;
import edu.isi.bmkeg.lapdf.model.PageBlock;
import edu.isi.bmkeg.lapdf.model.WordBlock;
import edu.isi.bmkeg.lapdf.model.factory.AbstractModelFactory;
import edu.isi.bmkeg.lapdf.model.spatial.SpatialEntity;
public class ChunkFeatures {
private ChunkBlock chunk;
private PageBlock parent;
private static Pattern patternLowerCase = Pattern.compile("[a-z]");
private static Pattern patternUpperCase = Pattern.compile("[A-Z]");
private static AbstractModelFactory modelFactory;
public ChunkFeatures(ChunkBlock chunk, AbstractModelFactory modelFactory) {
this.chunk = chunk;
this.parent = (PageBlock) chunk.getContainer();
this.modelFactory = modelFactory;
}
public boolean isMostPopularFontInDocument() {
String ds = parent.getDocument().getMostPopularFontStyle();
String s = chunk.getMostPopularWordFont()
+ ";" + chunk.getMostPopularWordStyle();
if( s.equals(ds) )
return true;
return false;
}
/**
* Note that we screen out the most popular font on the last page
* from this calculation since we expect that to be the font of the
* references.
* @return
*/
public boolean isNextMostPopularFontInDocument() {
String ds = parent.getDocument().getNextMostPopularFontStyle();
String s = chunk.getMostPopularWordFont()
+ ";" + chunk.getMostPopularWordStyle();
if( s.equals(ds) )
return true;
return false;
}
/**
* returns the difference between the most popular font size in the in the current chunk
* and the most popular font size in the document.
* @return
*/
public int getHeightDifferenceBetweenChunkWordAndDocumentWord() {
int i = chunk.getMostPopularWordHeight();
int j = parent.getDocument().readMostPopularWordHeight();
return (i-j);
}
/**
* returns true if chunk block is left aligned
* @return
*/
public boolean isAlignedLeft() {
if (Block.LEFT.equalsIgnoreCase(chunk.readLeftRightMedLine()))
return true;
return false;
}
/**
* returns true if chunk block starts in the top half of the page
* @return
*/
public boolean isInTopHalf() {
// x1, y1, x2, y2
int top = parent.getMargin()[1];
int bottom = parent.getMargin()[3];
double middle = (top + bottom) / 2.0;
if( chunk.getY1() < middle )
return true;
return false;
}
/**
* returns the most popular font size in the chunk block
* @return
*/
public int getMostPopularFontSize() {
String fontStyle = chunk.getMostPopularWordStyle();
if(fontStyle==null)
return chunk.getMostPopularWordHeight();
int fontSizeIndex = fontStyle.indexOf("font-size");
int colonIndex = fontStyle.indexOf(":", fontSizeIndex);
int ptIndex = fontStyle.indexOf("pt", colonIndex);
return Integer.parseInt(fontStyle.substring(colonIndex + 1, ptIndex));
}
/**
* returns true if chunk block is right aligned
* @return
*/
public boolean isAlignedRight() {
if (Block.RIGHT.equalsIgnoreCase(chunk.readLeftRightMedLine()))
return true;
return false;
}
/**
* returns true if chunk block is center aligned
* @return
*/
public boolean isAlignedMiddle() {
if (Block.MIDLINE.equalsIgnoreCase(chunk.readLeftRightMedLine()))
return true;
return false;
}
/**
* returns true if chunk block contains mostly capitalized text
* @return
*/
public boolean isAllCapitals() {
String chunkText = chunk.readChunkText();
Matcher matcher = patternLowerCase.matcher(chunkText);
if (matcher.find()) {
return false;
} else {
matcher = patternUpperCase.matcher(chunkText);
if (matcher.find()) {
return true;
} else {
return false;
}
}
}
/**
* returns true if chunk block contains mostly bold face text
* @return
*/
public boolean isMostPopularFontModifierBold() {
if ((chunk.getMostPopularWordStyle() != null && chunk
.getMostPopularWordStyle().indexOf("Bold") != -1)
|| (chunk.getMostPopularWordFont() != null && (chunk
.getMostPopularWordFont().indexOf("Bold") != -1 || chunk
.getMostPopularWordFont().indexOf("-B") != -1))) {
return true;
}
return false;
}
/**
* returns true if chunk block contains mostly italicized text
* @return
*/
public boolean isMostPopularFontModifierItalic() {
if ((chunk.getMostPopularWordStyle() != null && chunk
.getMostPopularWordStyle().indexOf("Italic") != -1)
|| (chunk.getMostPopularWordFont() != null && chunk
.getMostPopularWordFont().indexOf("Italic") != -1)) {
return true;
}
return false;
}
/**
* returns true if chunk block contains the first line of a page's text
* @return
*/
public boolean isContainingFirstLineOfPage() {
if (Math.abs(chunk.getY1() - parent.getMargin()[1]) < parent
.getDocument().readMostPopularWordHeight())
return true;
else
return false;
}
/**
* returns true if chunk block contains the last line of a page's text
* @return
*/
public boolean isContainingLastLineOfPage() {
if (Math.abs(chunk.getY2() - parent.getMargin()[3]) < parent
.getDocument().readMostPopularWordHeight())
return true;
else
return false;
}
/**
* returns true if chunk block is an outlier or stray block
* @return
*/
public boolean isOutlier() {
// TODO - UseReflections
ChunkBlock block = modelFactory.createChunkBlock(
chunk.getX1(),
chunk.getY1() - 30,
chunk.getX2(),
chunk.getY2() + 60);
int neighbouringChunksCount = parent.intersectsByType(block, null,
ChunkBlock.class).size();
int wordBlockCount = parent.containsByType(chunk, null, WordBlock.class).size();
int sizeAfterTrunc = chunk.readChunkText().
replaceAll("[A-Za-z0-9]", "").length();
if ( (wordBlockCount < 10 && neighbouringChunksCount < 10)
|| (sizeAfterTrunc < 10 && neighbouringChunksCount < 10)
|| chunk.getMostPopularWordHeight() > 50)
return true;
return false;
}
public int getChunkTextLength() {
return chunk.readChunkText().length();
}
/**
* returns the word block density in a chunk block
* @return
*/
public double getDensity() {
List<SpatialEntity> wordBlockList = parent.containsByType(chunk, null,
WordBlock.class);
double areaCoveredByWordBlocks = 0;
for (SpatialEntity entity : wordBlockList)
areaCoveredByWordBlocks = areaCoveredByWordBlocks
+ (entity.getHeight() * entity.getWidth());
return areaCoveredByWordBlocks / (chunk.getHeight() * chunk.getWidth());
}
/**
* returns true if the chunk block is aligned with column boundaries
* @return
*/
public boolean isAlignedWithColumnBoundaries() {
String lrm = chunk.readLeftRightMedLine();
int columnLeft = 0;
int columnRight = 0;
// double threshold = chunk.getMostPopularWordHeight() * 1.5;
double threshold = chunk.getMostPopularWordHeight() * 3;
int l = parent.getDocument().getBodyTextFrame().getX1();
int r = parent.getDocument().getBodyTextFrame().getX2();
int m = (int) Math.round( (l+r)/2.0);
if (Block.MIDLINE.equalsIgnoreCase(lrm)) {
return false;
} else if (Block.LEFT.equalsIgnoreCase(lrm)) {
columnLeft = l;
columnRight = m;
} else if (Block.RIGHT.equalsIgnoreCase(lrm)) {
columnLeft = m;
// columnRight = parent.getMargin()[2];
columnRight = r;
}
int leftDiff = Math.abs(chunk.getX1() - columnLeft);
int rightDiff = Math.abs(chunk.getX2() - columnRight);
if (chunk.readNumberOfLine() > 1
&& leftDiff < threshold
&& rightDiff < threshold) {
return true;
} else if (chunk.readNumberOfLine() == 1
&& leftDiff < threshold) {
return true;
}
return false;
}
/**
* returns the classification assigned to previous chunk block
* @return
*/
public String getlastClassification() {
ChunkBlock lastBlock = chunk.readLastChunkBlock();
return (lastBlock == null) ? null : lastBlock.getType();
}
/**
* returns the section label of chunk
* @return
* @throws InvalidPopularSpaceValueException
*/
public String getSection() throws InvalidPopularSpaceValueException {
ChunkBlock lastBlock = null;
lastBlock = parent.getDocument().getLastChunkBlock(chunk);
/*String section = (lastBlock == null) ? null : (lastBlock.getType()
.contains(".")) ? lastBlock.getType().substring(0,
lastBlock.getType().indexOf(".")) : lastBlock.getType();*/
String section;
if(lastBlock==null){
section=null;
}else if(lastBlock.getType().contains(".")){
section= lastBlock.getType().substring(0,lastBlock.getType().indexOf("."));
}else{
section=lastBlock.getType();
}
if (section == null)
return null;
else if (isMainSection(section))
return section;
ChunkBlock prev = null;
while (section != null) {
/**
* introducing a special check to see if the call to getLastChunkBlock returns
* the same block i.e. lastBlock if so we break the loop and exit with section = lastBlock.getType()
*/
prev = lastBlock;
lastBlock = parent.getDocument().getLastChunkBlock(lastBlock);
/*if (lastBlock!=null)
{
System.out.println(prev.getchunkText());
System.out.println(lastBlock.getchunkText());
System.out.println("---------------");
}
section = (lastBlock == null) ? null : (lastBlock.getType()
.contains(".")) ? lastBlock.getType().substring(0,
lastBlock.getType().indexOf(".")) : lastBlock.getType();*/
if(lastBlock==null){
section=null;
}else if(lastBlock.getType().contains(".")){
section= lastBlock.getType().substring(0,lastBlock.getType().indexOf("."));
if(lastBlock.equals(prev)){
break;
}
}else{
section=lastBlock.getType();
if(lastBlock.equals(prev)){
break;
}
}
if (isMainSection(section))
return section;
}
return section;
}
private boolean isMainSection(String section) {
boolean result = !(chunk.TYPE_AFFLIATION.equals(section)
|| chunk.TYPE_CITATION.equals(section)
|| chunk.TYPE_FIGURE_LEGEND.equals(section)
|| chunk.TYPE_FOOTER.equals(section)
|| chunk.TYPE_HEADER.equals(section)
|| chunk.TYPE_KEYWORDS.equals(section)
|| chunk.TYPE_TABLE.equals(section) || chunk.TYPE_UNCLASSIFIED
.equals(section));
return result;
}
/**
* returns the page number where the block is located
* @return
*/
public int getPageNumber() {
return this.parent.getPageNumber();
}
/**
* returns true if the chunk is a single column centered on the page else returns false
* @return
*/
public boolean isColumnCentered() {
int chunkMedian = chunk.getX1() + chunk.getWidth() / 2;
int pageMedian = parent.getMedian();
String lrm = chunk.readLeftRightMedLine();
if (chunk.MIDLINE.equalsIgnoreCase(lrm)) {
if (Math.abs(pageMedian - chunkMedian) < parent.getDocument()
.readMostPopularWordHeight() * 2)
return true;
return false;
}
int pageMedianLeftRight = 0;
if (chunk.LEFT.equalsIgnoreCase(lrm)) {
pageMedianLeftRight = parent.getMargin()[0]
+ (pageMedian - parent.getMargin()[0]) / 2;
} else if (chunk.RIGHT.equalsIgnoreCase(lrm)) {
pageMedianLeftRight = pageMedian
+ (parent.getMargin()[2] - pageMedian) / 2;
}
if (Math.abs(chunkMedian - pageMedianLeftRight) < parent.getDocument()
.readMostPopularWordHeight() * 2)
return true;
return false;
}
public boolean isWithinBodyTextFrame() {
SpatialEntity btf = parent.getDocument().getBodyTextFrame();
double threshold = chunk.getMostPopularWordHeight() * 3;
if( chunk.getX1() + threshold > btf.getX1() &&
chunk.getX2() - threshold < btf.getX2() &&
chunk.getY1() + threshold > btf.getY1() &&
chunk.getY2() - threshold < btf.getY2() ) {
return true;
} else {
return false;
}
}
}