package com.logica.oam.ktree.statistics;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.camel.Body;
import com.villemos.ispace.ktree.folder.Item;
/**
 * Computes summary statistics over the extracted documents carried in a message body
 * and stores them back into the same map under the {@code "Statistics"} key.
 *
 * <p>The following statistics are reported, in this order:</p>
 * <ul>
 *   <li>Total number of documents.</li>
 *   <li>Unreadable documents (language is not {@code "en"}, or the language
 *       probability is below 0.9).</li>
 *   <li>Documents with a reference ID in the body reference fields
 *       ({@code "Reference ID (body doc ref)"} or {@code "Reference ID (body misc 1)"}).</li>
 *   <li>Documents with a reference ID in {@code "Reference ID (property doc ref)"}.</li>
 *   <li>Documents with a reference ID in {@code "Reference ID (body hits)"}.</li>
 *   <li>Documents with a reference ID in {@code "Reference ID"} (metadata).</li>
 *   <li>Documents with a reference ID in {@code "Reference ID (Title)"}.</li>
 *   <li>Documents lacking a reference ID in all of the fields above.</li>
 *   <li>Documents with a reference ID in at least one of the fields above.</li>
 * </ul>
 *
 * <p>Version statistics and reference ID / version consistency statistics are not
 * reported, so they are not computed either. {@link #compare(String, String, String)}
 * is kept so that consistency checks can be added back or used by subclasses.</p>
 */
public class ExtractorStatistics {

    /** Map key that holds the computed statistics; its entries are never counted as documents. */
    private static final String STATISTICS_KEY = "Statistics";

    /** Documents whose language probability is below this value are counted as unreadable. */
    private static final double MIN_LANGUAGE_PROBABILITY = 0.9d;

    /**
     * Counts the documents found in {@code data} and stores the resulting list of
     * {@code Statistic} objects in {@code data} under the {@code "Statistics"} key,
     * replacing any previous value.
     *
     * <p>Every element of every list in the map is expected to be an {@link Item},
     * except for the list stored under {@code "Statistics"}, which is skipped.
     * A field that is missing (null) on an item is treated the same as an empty field.</p>
     *
     * @param data the message body: lists of {@link Item} objects keyed by name; must not be null
     * @throws ClassCastException if a counted list contains an element that is not an {@link Item}
     */
    public void process(@Body Map<String, List<Object>> data) {
        int totalDocuments = 0;
        int unreadable = 0;
        int lackingRefId = 0;
        int withRefId = 0;
        int refIdMetadata = 0;
        int refIdBodyHeader = 0;
        int refIdBodyProperties = 0;
        int refIdBodyCount = 0;
        int refIdTitle = 0;

        for (Entry<String, List<Object>> entry : data.entrySet()) {
            // Results of an earlier run are stored in the same map; they are not documents.
            if (STATISTICS_KEY.equals(entry.getKey())) {
                continue;
            }

            for (Object object : entry.getValue()) {
                Item item = (Item) object;
                totalDocuments++;

                if (isUnreadable(item)) {
                    unreadable++;
                }

                boolean inMetadata = hasValue(item, "Reference ID");
                boolean inBodyHeader = hasValue(item, "Reference ID (body doc ref)")
                        || hasValue(item, "Reference ID (body misc 1)");
                boolean inBodyCount = hasValue(item, "Reference ID (body hits)");
                boolean inBodyProperties = hasValue(item, "Reference ID (property doc ref)");
                boolean inTitle = hasValue(item, "Reference ID (Title)");

                if (inMetadata) {
                    refIdMetadata++;
                }
                if (inBodyHeader) {
                    refIdBodyHeader++;
                }
                if (inBodyCount) {
                    refIdBodyCount++;
                }
                if (inBodyProperties) {
                    refIdBodyProperties++;
                }
                if (inTitle) {
                    refIdTitle++;
                }

                // A document "has" a reference ID as soon as any one source provides it.
                if (inMetadata || inBodyHeader || inBodyCount || inBodyProperties || inTitle) {
                    withRefId++;
                } else {
                    lackingRefId++;
                }
            }
        }

        List<Object> statistics = new ArrayList<Object>();
        statistics.add(new Statistic("Total number of documents: (doc#)", totalDocuments));
        statistics.add(new Statistic("Unreadable: (doc#)", unreadable));
        statistics.add(new Statistic("Reference ID found in body (body ref): (doc#)", refIdBodyHeader));
        statistics.add(new Statistic("Reference ID found in body (body properties): (doc#)", refIdBodyProperties));
        statistics.add(new Statistic("Reference ID found in body (body count): (doc#)", refIdBodyCount));
        statistics.add(new Statistic("Reference ID found in metadata: (doc#)", refIdMetadata));
        statistics.add(new Statistic("Reference ID found in title: (doc#)", refIdTitle));
        statistics.add(new Statistic("Lacking Reference ID: (doc#)", lackingRefId));
        statistics.add(new Statistic("With Reference ID: (doc#)", withRefId));

        data.put(STATISTICS_KEY, statistics);
    }

    /**
     * Checks that the values found in the metadata, body and title do not contradict
     * each other. An empty or null value means "not found" and never causes a conflict.
     *
     * @param meta  value found in the metadata; may be empty or null
     * @param body  value found in the body; may be empty or null
     * @param title value found in the title; may be empty or null
     * @return {@code false} if any two non-empty values differ, {@code true} otherwise
     */
    protected boolean compare(String meta, String body, String title) {
        return agree(meta, body) && agree(meta, title) && agree(title, body);
    }

    /** Returns true unless both values are non-empty and different from each other. */
    private static boolean agree(String first, String second) {
        if (first == null || second == null || first.equals("") || second.equals("")) {
            return true;
        }
        return first.equals(second);
    }

    /** Returns true if the named field of the item is present and not the empty string. */
    private static boolean hasValue(Item item, String field) {
        Object value = item.get(field);
        return value != null && !value.equals("");
    }

    /**
     * Returns true if the item is not recognised as English with sufficient probability.
     * A missing language, or a missing or non-numeric probability, counts as unreadable.
     */
    private static boolean isUnreadable(Item item) {
        Object language = item.get("Language");
        if (language == null || !language.equals("en")) {
            return true;
        }
        Object probability = item.get("Language Probability");
        if (!(probability instanceof Number)) {
            return true;
        }
        return ((Number) probability).doubleValue() < MIN_LANGUAGE_PROBABILITY;
    }
}