package com.cognitionis.nlp_files;
import java.util.*;
import com.cognitionis.utils_basickit.*;
/**
* @author Hector Llorens
* @since 2009
*/
public class Stat {
private int elemscol;
private int attribscol;
private String[] coldesc_arr;
private Integer coldesc_arr_count;
private HashMap<String, HashMap> elements; // TIMEX3, EVENT | EVENT-OCURRENCE... (PRIMARY KEY)
private Integer totalDataAdded;
private Integer totalGlobalDataAdded;
public Stat() {
this(null, null, null);
}
public Stat(String[] coldesc) {
this(coldesc, null, null);
}
public Stat(String[] coldesc, String elemscol_re) {
this(coldesc, elemscol_re, null);
}
public Stat(String[] coldesc, String elemscol_re, String attribscol_re) {
coldesc_arr = null;
coldesc_arr_count = 0;
totalDataAdded = 0;
totalGlobalDataAdded = 0;
elemscol = -1;
attribscol = -1;
try {
if (coldesc == null || coldesc.length < 1) {
throw new Exception("Column description is null or empty");
}
//coldesc_arr = coldesc;
coldesc_arr = new String[coldesc.length + 1];
for(int i=0;i<coldesc.length;i++){
coldesc_arr[i]=coldesc[i];
}
coldesc_arr[coldesc.length]="span";
coldesc_arr_count = coldesc.length + 1;
if (elemscol_re != null) {
elemscol = getColumn(elemscol_re);
}
if (attribscol_re != null) {
attribscol = getColumn(attribscol_re);
}
if (elemscol == -1) {
elemscol = coldesc.length - 1;
}
/*numtokcol = getColumn("numtok");*/
// luego siempre comprovar si attribscol!=-1 y si lo es pues usar el combinado BASE-CLASS
// crear estructura vacia para la estadistica teniendo en cuenta las columnas
elements = new HashMap<String, HashMap>();
} catch (Exception e) {
System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
}
}
public int getColsCount(){
return coldesc_arr_count;
}
public int getElemsCol() {
return elemscol;
}
public int getAttribsCol() {
return attribscol;
}
public int getColumn(String colname_re) {
for (int i = 0; i < coldesc_arr_count; i++) {
if (coldesc_arr[i].matches(colname_re)) {
return i;
}
}
return -1;
}
public String getColumnStr(int colpos) {
try {
if (colpos >= 0 && coldesc_arr.length > colpos) {
return coldesc_arr[colpos];
} else {
throw new Exception("Column position < 0 or > total columns");
}
} catch (Exception e) {
System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
return null;
}
}
public void addData(final String[] data) {
totalDataAdded++;
try {
HashMap<String, HashMap> dataMap = new HashMap<String, HashMap>();
if (data.length != coldesc_arr_count) {
throw new Exception("Malformed data");
}
String datakey;
if (attribscol != -1) {
if(data[attribscol].matches(".*=\".*") || data[attribscol].matches(".*\\s+.*") || data[attribscol].equals("*")){ // multiple XML attribs
if(data[elemscol].equalsIgnoreCase("EVENT") && data[attribscol].matches(".*class=\".*")){
datakey=data[elemscol] + "-" + data[attribscol].substring(data[attribscol].indexOf("class=\"")+7, data[attribscol].indexOf('"', data[attribscol].indexOf("class=\"")+7));
}else{
if(data[elemscol].equalsIgnoreCase("TIMEX3") && data[attribscol].matches(".*type=\".*")){
datakey=data[elemscol] + "-" + data[attribscol].substring(data[attribscol].indexOf("type=\"")+6, data[attribscol].indexOf('"', data[attribscol].indexOf("type=\"")+6));
}else{
datakey = data[elemscol];
}
}
}else{
datakey = data[elemscol] + "-" + data[attribscol];
}
} else {
datakey = data[elemscol];
}
// PUT SPECIFIC ELEMENT
if (elements.containsKey(datakey)) {
HashMap<String, HashMap> colsMap = elements.get(datakey);
for (int col = 0; col < coldesc_arr_count; col++) {
if (col != elemscol && col != attribscol) {
HashMap<String, Integer> colMap = colsMap.get(coldesc_arr[col]);
Integer colMapValue = colMap.get(data[col]);
if (colMapValue != null) {
colMapValue++;
} else {
colMapValue = 1;
}
colMap.put(data[col], colMapValue);
//colsMap.put(coldesc_arr[col], colMap); // java modifica obj por valor
}
}
final Integer totalValue = ((Integer) (colsMap.get("total")).get("total")) + 1;
colsMap.put("total", new HashMap<String, Integer>() {
{
put("total", totalValue);
}
});
//elements.put(datakey, colsMap); // java modifica obj por valor
} else {
// fill the dataMap
for (int col = 0; col < coldesc_arr_count; col++) {
if (col != elemscol && col != attribscol) {
final String tempdata = data[col];
dataMap.put(coldesc_arr[col], new HashMap<String, Integer>() {
{
put(tempdata, 1);
}
});
}
}
dataMap.put("total", new HashMap<String, Integer>() {
{
put("total", 1);
}
});
elements.put(datakey, dataMap);
}
// GENERAL STATS FOR NON-GENERAL CASES (e.g., EVENT-OCCURRENCE)
if(!datakey.equals(data[elemscol])){
HashMap<String, HashMap> dataMapG = new HashMap<String, HashMap>();
if (elements.containsKey("0_GENERAL_"+data[elemscol])) {
HashMap<String, HashMap> colsMapG = elements.get("0_GENERAL_"+data[elemscol]);
for (int col = 0; col < coldesc_arr_count; col++) {
if (col != elemscol && col != attribscol) {
HashMap<String, Integer> colMapG = colsMapG.get(coldesc_arr[col]);
Integer colMapValue = colMapG.get(data[col]);
if (colMapValue != null) {
colMapValue++;
} else {
colMapValue = 1;
}
colMapG.put(data[col], colMapValue);
//colsMap.put(coldesc_arr[col], colMap); // java modifica obj por valor
}
}
final Integer totalValue = ((Integer) (colsMapG.get("total")).get("total")) + 1;
colsMapG.put("total", new HashMap<String, Integer>() {
{
put("total", totalValue);
}
});
//elements.put(datakey, colsMap); // java modifica obj por valor
} else {
// fill the dataMap
for (int col = 0; col < coldesc_arr_count; col++) {
if (col != elemscol && col != attribscol) {
final String tempdata = data[col];
dataMapG.put(coldesc_arr[col], new HashMap<String, Integer>() {
{
put(tempdata, 1);
}
});
}
}
dataMapG.put("total", new HashMap<String, Integer>() {
{
put("total", 1);
}
});
elements.put("0_GENERAL_"+data[elemscol], dataMapG);
}
}
} catch (Exception e) {
System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
}
}
/**
* adds 1 to the value of GLOBAL statistics subkey in key if it is found
* (i.e., [pos,VBZ], if it is found and its value is 5 then become 6
*
* @param statkey
* @param statsubkey
*/
public void addGLOBALdata(String statkey, String statsubkey) {
try {
Integer colMapValue=1;
totalGlobalDataAdded++;
if (!elements.containsKey("GLOBAL")) {
this.createGLOBALelement();
}
HashMap<String, HashMap> colsMapGlobal = elements.get("GLOBAL");
if (colsMapGlobal.containsKey(statkey)) { // word, pos, etc.
HashMap<String, Integer> colMapGlobal = colsMapGlobal.get(statkey);
if (colMapGlobal.containsKey(statsubkey)) { // la, perro, ser, etc.
colMapValue = colMapGlobal.get(statsubkey);
if (colMapValue != null) {
colMapValue++;
} else {
colMapValue = 1;
}
colMapGlobal.put(statsubkey, colMapValue); // Necessary because Integer is a basic type and then not a reference
}
else{ // create first time (la, perro, etc.)
colMapGlobal.put(statsubkey, 1);
}
}
final Integer totalValue = ((Integer) (colsMapGlobal.get("total")).get("total")) + 1;
colsMapGlobal.put("total", new HashMap<String, Integer>() {
{
put("total", totalValue);
}
});
} catch (Exception e) {
System.err.println("Errors found (" + this.getClass().getSimpleName() + "):\n\t" + e.toString() + "\n");
if (System.getProperty("DEBUG") != null && System.getProperty("DEBUG").equalsIgnoreCase("true")) {
e.printStackTrace(System.err);
System.exit(1);
}
}
}
public void print() {
print(1,0);
}
public void print(int minpercent, int minpercentCorpus) {
System.out.println("\nPrinting Statistics (total Data " + totalDataAdded + ")\n-------------------------");
//System.out.println(elements);
if (elements.get("GLOBAL") != null) {
printGlobal(minpercent,minpercentCorpus);
} else {
printSimple(minpercent);
}
}
public void printSimple(int minpercent) {
for (String current_key : elements.keySet()) {
HashMap<String, HashMap> colsMap = elements.get(current_key);
Integer elementTotal = (Integer) (colsMap.get("total")).get("total");
System.out.println(current_key + "(" + elementTotal + ")");
for (String cols_key : colsMap.keySet()) {
//if(!cols_key.equalsIgnoreCase(coldesc_arr[elemscol]) && !cols_key.equalsIgnoreCase(coldesc_arr[attribscol]) && !cols_key.equalsIgnoreCase("total")){
if (!cols_key.equalsIgnoreCase("total")) {
System.out.println("\t" + cols_key);
HashMap<String, Integer> colMap = colsMap.get(cols_key);
TreeMap<String, Integer> sortedColMap = new TreeMap(new DescStringIntMapComparator(colMap));
sortedColMap.putAll(colMap);
for (String col_key : sortedColMap.keySet()) {
Integer colValue = colMap.get(col_key);
//double colPercent=((double) colValue/(double) elementTotal)*100.0;
int colPercent = (int) Math.round(((double) colValue / (double) elementTotal) * 100.0);
if (colPercent >= minpercent) {
System.out.println("\t\t" + col_key + "\t" + colPercent + "%" + " (" + colValue + ")"); //+" ("+colValue+")"
}
}
System.out.println();
}
}
System.out.println();
}
System.out.println();
}
public void printGlobal(int minpercent, int minpercentCorpus) {
HashMap<String, HashMap> colsMapGlobal = elements.get("GLOBAL");
String[] keylist=elements.keySet().toArray(new String[0]);
Arrays.sort(keylist);
for (String current_key : keylist) { // elements.keySet()
if (!current_key.equals("GLOBAL")) {
HashMap<String, HashMap> colsMap = elements.get(current_key);
Integer elementTotal = (Integer) (colsMap.get("total")).get("total");
System.out.println(current_key + "(" + elementTotal + ")");
for (String cols_key : colsMap.keySet()) {
if (!cols_key.equals("total") && !cols_key.equals("attribs")) {
System.out.println("\t" + cols_key);
HashMap<String, Integer> colMap = colsMap.get(cols_key);
HashMap<String, Integer> colMapGlobal = colsMapGlobal.get(cols_key);
TreeMap<String, Integer> sortedColMap = new TreeMap(new DescStringIntMapComparator(colMap));
sortedColMap.putAll(colMap);
for (String col_key : sortedColMap.keySet()) {
Integer colValue = colMap.get(col_key);
Integer globalValue = colMapGlobal.get(col_key);
if(globalValue==null){
globalValue=colValue*1000; // non significant value
}
//double colPercent=((double) colValue/(double) elementTotal)*100.0;
int colPercent = (int) Math.round(((double) colValue / (double) elementTotal) * 100.0);
if (colPercent >= minpercent) {
double globPercent = ((int) Math.round(((double) colValue / (double) globalValue) * 1000.0)) / 10.0;
System.out.print("\t\t" + col_key + "\t" + colPercent + "%" + " (" + colValue + ")");
if (globPercent >= minpercentCorpus && globPercent > 0.1 && !cols_key.equals("span")) {
//System.out.print(" ----> " + globPercent + "% of corpus \"" + col_key + "\" (" + globalValue + ")");
System.out.print(" ----> " + globPercent + "% of corpus (" + globalValue + ")");
}
System.out.println();
}
}
System.out.println();
}
}
System.out.println();
}
}
System.out.println();
}
public void createGLOBALelement() {
HashMap<String, HashMap> colsMapGlobal = new HashMap<String, HashMap>();
for(int currcol=0;currcol < coldesc_arr_count;currcol++){
if(currcol!=elemscol && currcol!=attribscol){
colsMapGlobal.put(coldesc_arr[currcol], new HashMap<String, Integer>());
}
}
colsMapGlobal.put("total", new HashMap<String, Integer>() {
{
put("total", 1);
}
});
elements.put("GLOBAL", colsMapGlobal);
}
}