package doser.entitydisambiguation.table.logic;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import doser.entitydisambiguation.table.celldisambiguation.CellDisAlgorithm_Standard;
import doser.entitydisambiguation.table.celldisambiguation.CellDisambiguationInterface;
import doser.entitydisambiguation.table.columndisambiguation.ColumnDisAlgorithm;
/**
 * Organizes the table disambiguation algorithms: first the cells of every
 * column are disambiguated, then the column types of the whole table.
 *
 * @author Stefan Zwicklbauer
 *
 */
public class DisambiguateTable {

    /**
     * Fraction of numeric cells a column must exceed to be treated as a
     * "mostly numeric" column.
     */
    private static final float NUMBERSTHRESHOLD = 0.5f;

    /**
     * Matches a plain unsigned number with an optional decimal separator
     * ('.' or ','), e.g. "12", "12.", "12.5", "12,5", ".5". At least one digit
     * is required, so "", "." and "," do not match. Compiled once because it
     * is applied to every cell of every column.
     */
    private static final Pattern NUMBER_PATTERN =
            Pattern.compile("^(?:\\d+[.,]?\\d*|[.,]\\d+)$");

    /** Optional column ground truth; {@code null} until set by the caller. */
    private List<String> groundtruth;

    public DisambiguateTable() {
        // Nothing to initialize.
    }

    /**
     * Disambiguates the cells of every column and afterwards the column
     * types of the table.
     *
     * @param table
     *            The table to disambiguate
     * @return the same table instance that was passed in
     */
    public Table disambiguateTable(final Table table) {
        final int nrOfCols = table.getNumberofColumns();
        for (int i = 0; i < nrOfCols; i++) {
            final TableColumn col = table.getColumn(i);
            final CellDisambiguationInterface cellDisAlgo = decideCellDisambiguationAlgorithm(col);
            cellDisAlgo.disambiguateCells(col);
        }
        final ColumnDisAlgorithm colDisAlgo = new ColumnDisAlgorithm();
        // The ground truth is handed over as-is; it is null when none was set.
        colDisAlgo.disambiguateTypes(table, this.groundtruth);
        return table;
    }

    /**
     * Crude heuristic to pick the necessary cell disambiguation algorithm. If
     * more than 50 percent of the cells of a column are numbers, the computer
     * science cell disambiguation algorithm is meant to be used. That
     * algorithm is currently disabled (see the TODO below), so the standard
     * algorithm is returned in every case.
     *
     * @param col
     *            The respective table column
     * @return the cell disambiguation algorithm
     */
    private CellDisambiguationInterface decideCellDisambiguationAlgorithm(
            final TableColumn col) {
        final List<TableCell> cellList = col.getCellList();
        if (cellList.isEmpty()) {
            // No cells, no ratio to compute; avoids relying on 0f / 0f == NaN.
            return CellDisAlgorithm_Standard.getInstance();
        }

        int amountOfNrs = 0;
        for (final TableCell cell : cellList) {
            if (isNumber(cell.getCellContent())) {
                amountOfNrs++;
            }
        }

        final float numberRatio = (float) amountOfNrs / (float) cellList.size();
        if (numberRatio > NUMBERSTHRESHOLD) {
            // TODO: change this back to CellDisAlgorithm_CSDomain.getInstance().
            // For now it has been reset to the standard algorithm for the
            // data extractor.
            return CellDisAlgorithm_Standard.getInstance();
        }
        return CellDisAlgorithm_Standard.getInstance();
    }

    /**
     * Checks whether the given cell content is a plain unsigned number with
     * an optional '.' or ',' decimal separator.
     *
     * @param str
     *            The cell content, may be {@code null}
     * @return {@code true} if the content contains at least one digit and
     *         consists only of digits and at most one decimal separator;
     *         {@code false} otherwise, including for {@code null}, empty
     *         content and a lone separator
     */
    private boolean isNumber(final String str) {
        if (str == null) {
            return false;
        }
        return NUMBER_PATTERN.matcher(str).find();
    }

    /**
     * Sets the column ground truth that is passed to the column type
     * disambiguation on the next call of {@link #disambiguateTable(Table)}.
     *
     * @param columnGroundTruth
     *            The ground truth entries, or {@code null} for none
     */
    public void setGroundtruth(final List<String> columnGroundTruth) {
        this.groundtruth = columnGroundTruth;
    }
}