package edu.isi.bmkeg.lapdf.features;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import edu.isi.bmkeg.lapdf.model.ChunkBlock;
import edu.isi.bmkeg.lapdf.model.WordBlock;
import edu.isi.bmkeg.utils.FrequencyCounter;
/**
 * Holds the features computed for one line of words inside a
 * {@link ChunkBlock}: font statistics, simple text properties of the line
 * and a few geometric offsets of the line relative to its chunk.
 *
 * <p>Usage, as far as this class shows: call
 * {@link #addToFrequencyCounters(String, String)} once per word, then
 * {@link #calculateFeatures(ChunkBlock, WordBlock, WordBlock, String)} once
 * for the line, then read the results through the getters.
 *
 * <p>NOTE(review): the two frequency counters are {@code static}, i.e. shared
 * by every instance of this class, and are reset at the end of
 * {@code calculateFeatures}. Interleaving two instances (or using the class
 * from several threads) would mix their font statistics - confirm callers
 * process one line at a time.
 */
public class HorizontalSplitFeature {

    /** '.', '!' or '?' followed only by optional newlines at the end of the text. */
    private static final Pattern eop = Pattern.compile("[\\.\\!\\?]\\n*$");

    /** Any ASCII lower-case letter. */
    private static final Pattern patternLowerCase = Pattern.compile("[a-z]");

    /** Any ASCII upper-case letter. */
    private static final Pattern patternUpperCase = Pattern.compile("[A-Z]");

    /** Font names seen since the last reset; shared across all instances. */
    private static final FrequencyCounter fontFrequencyCounter = new FrequencyCounter();

    /** Font styles seen since the last reset; shared across all instances. */
    private static final FrequencyCounter fontStyleFrequencyCounter = new FrequencyCounter();

    private boolean mixedFont = false;
    private boolean endOFLine = false;
    private boolean allCapitals = false;
    private String mostPopularFont = null;
    private int midYOfLastWordOnLine;
    private int firstWordOnLineHeight;
    private int wordCount = 0;
    private int extremLeftOffset;
    private int extremeRightOffset;
    private int midOffset;

    /** Creates a feature holder with all features at their default values. */
    public HorizontalSplitFeature() {
    }

    /**
     * Computes every feature of the line and then resets the shared font
     * counters so that they are empty for the next line.
     *
     * @param chunky         the chunk that contains the line
     * @param firstWordBlock the first word on the line
     * @param lastWordBlock  the last word on the line
     * @param completeString the text of the line
     */
    public void calculateFeatures(ChunkBlock chunky, WordBlock firstWordBlock,
            WordBlock lastWordBlock, String completeString) {
        calculateIntegerFeatures(chunky, firstWordBlock, lastWordBlock);
        calculateStringFeatures(completeString);
        // The string features have consumed the counters; clear them for the next line.
        fontFrequencyCounter.reset();
        fontStyleFrequencyCounter.reset();
    }

    /**
     * Records the font and style of one word and increments the word count.
     * The word count is per instance and is never reset by this class.
     *
     * @param font  the font of the word
     * @param style the font style of the word
     */
    public void addToFrequencyCounters(String font, String style) {
        fontFrequencyCounter.add(font);
        fontStyleFrequencyCounter.add(style);
        this.wordCount++;
    }

    /** Derives the geometric features from the chunk and the line's first and last words. */
    private void calculateIntegerFeatures(ChunkBlock chunky,
            WordBlock firstWordBlock, WordBlock lastWordBlock) {
        firstWordOnLineHeight = firstWordBlock.getHeight();
        extremLeftOffset = firstWordBlock.getX1() - chunky.getX1();
        extremeRightOffset = chunky.getX2() - lastWordBlock.getX2();
        midYOfLastWordOnLine = lastWordBlock.getY1()
                + lastWordBlock.getHeight() / 2;

        int chunkBlockMidLine = chunky.getX1() + chunky.getWidth() / 2;
        // NOTE(review): this is half the line's width, not the x coordinate of
        // the line's midpoint, so midOffset subtracts a length from a
        // coordinate. Kept as-is because downstream consumers may depend on
        // the current values - confirm whether firstWordBlock.getX1() should
        // be added here.
        int median = (lastWordBlock.getX2() - firstWordBlock.getX1()) / 2;
        midOffset = chunkBlockMidLine - median;
    }

    /** Derives the text and font features from the line's text and the shared counters. */
    private void calculateStringFeatures(String completeString) {
        endOFLine = eop.matcher(completeString).find();

        // All capitals: no lower-case letter anywhere and at least one upper-case letter.
        Matcher lowerCase = patternLowerCase.matcher(completeString);
        allCapitals = !lowerCase.find()
                && patternUpperCase.matcher(completeString).find();

        mostPopularFont = (String) fontFrequencyCounter.getMostPopular();
        mixedFont = fontStyleFrequencyCounter.countOptions() > 1;
    }

    /** @return {@code true} if more than one font style was counted for the line */
    public boolean isMixedFont() {
        return mixedFont;
    }

    /**
     * @return {@code true} if the line's text ends with '.', '!' or '?'
     *         (optionally followed by newlines)
     */
    public boolean isEndOFLine() {
        // Previously hard-coded to false, which discarded the computed value.
        return endOFLine;
    }

    /**
     * @return {@code true} if the line's text contains at least one ASCII
     *         upper-case letter and no ASCII lower-case letter
     */
    public boolean isAllCapitals() {
        return allCapitals;
    }

    /** @return the value the font counter reported as most popular for the line */
    public String getMostPopularFont() {
        return mostPopularFont;
    }

    /** @return the last word's {@code y1} plus half of its height */
    public int getMidYOfLastWordOnLine() {
        return midYOfLastWordOnLine;
    }

    /** @return the height of the first word on the line */
    public int getFirstWordOnLineHeight() {
        return firstWordOnLineHeight;
    }

    /** @return how many times {@link #addToFrequencyCounters} was called on this instance */
    public int getWordCount() {
        return wordCount;
    }

    /** @return the first word's {@code x1} minus the chunk's {@code x1} */
    public int getExtremLeftOffset() {
        return extremLeftOffset;
    }

    /** @return the chunk's {@code x2} minus the last word's {@code x2} */
    public int getExtremeRightOffset() {
        return extremeRightOffset;
    }

    /**
     * @return the chunk's horizontal midpoint ({@code x1 + width / 2}) minus
     *         half of the distance from the first word's {@code x1} to the
     *         last word's {@code x2}
     */
    public int getMidOffset() {
        return midOffset;
    }
}