package org.lobid.lodmill;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map.Entry;
import org.culturegraph.mf.framework.DefaultStreamReceiver;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Field statistics. Counts how often each field occurs in the received stream.
 * For fields whose name starts with "log:" the values are collected as a comma
 * separated list instead of being counted.
 * <p>
 * When the stream is closed, the statistics are logged and written to a file
 * as a textile table (see {@link #setFilename(String)}).
 *
 * @author Pascal Christoph (dr0i)
 * @author Fabian Steeg (fsteeg)
 *
 */
@Description("Sorted field statistics. May have appended a list of all values which "
    + "are part of a field. The parameter 'filename' defines the place to store the"
    + " stats on disk.")
@In(StreamReceiver.class)
@Out(Void.class)
public final class Stats extends DefaultStreamReceiver {
  // Log under this class' own name, not under the superclass' name.
  private static final Logger LOG = LoggerFactory.getLogger(Stats.class);
  /** Fields with this name prefix have their values collected, not counted. */
  private static final String LOG_FIELD_PREFIX = "log:";
  /** Maps a field name to the number of times it occurred. */
  final HashMap<String, Integer> occurenceMap = new HashMap<>();
  /** Maps a "log:" field name to its comma separated values. */
  final HashMap<String, StringBuilder> valueMap = new HashMap<>();
  private String filename;

  /**
   * Default constructor. Initializes the filename to
   * {@code stats.<seconds since epoch>.csv}.
   */
  public Stats() {
    this.filename =
        "stats." + (Calendar.getInstance().getTimeInMillis() / 1000) + ".csv";
  }

  /**
   * Sets the filename for writing the statistics.
   *
   * @param filename the filename
   */
  public void setFilename(final String filename) {
    this.filename = filename;
  }

  /**
   * Schedules the file named by the current filename for deletion when the JVM
   * exits. Since the default file name this class produces is rather unique, it
   * should be removable, especially when running as a test.
   */
  public void removeTestFile() {
    new File(this.filename).deleteOnExit();
  }

  /**
   * Counts occurrences of fields. If the field name starts with "log:", the
   * occurrences are not counted but the values are concatenated, separated by
   * commas. A {@code null} value is recorded as the text "null".
   *
   * @param name the field name
   * @param value the field value
   */
  @Override
  public void literal(final String name, final String value) {
    if (name.startsWith(LOG_FIELD_PREFIX)) {
      final StringBuilder collected = valueMap.get(name);
      if (collected == null) {
        // String.valueOf keeps the first value consistent with later ones,
        // where append(null) also yields "null".
        valueMap.put(name, new StringBuilder(String.valueOf(value)));
      } else {
        collected.append(',').append(value);
      }
    } else {
      occurenceMap.merge(name, 1, Integer::sum);
    }
  }

  /**
   * Logs the collected statistics and writes them to the configured file. An
   * I/O failure is logged and otherwise ignored, so closing the stream never
   * throws because of it.
   */
  @Override
  public void closeStream() {
    try {
      writeTextileMappingTable(sortedByValuesDescending(),
          new ArrayList<>(valueMap.entrySet()), new File(this.filename));
    } catch (IOException e) {
      LOG.error("Could not write statistics to " + this.filename, e);
    }
  }

  /**
   * Logs the given entries and writes them as a textile table to the given
   * file. The occurrence entries come first, followed by the value entries.
   * The file is written using the platform default charset.
   *
   * @param occurenceEntries field names with their counts, in output order
   * @param valueEntries field names with their comma separated values
   * @param textileMappingFile the file to write the table to
   * @throws IOException if the file cannot be opened or written
   */
  static void writeTextileMappingTable(
      final List<Entry<String, Integer>> occurenceEntries,
      final List<Entry<String, StringBuilder>> valueEntries,
      final File textileMappingFile) throws IOException {
    final StringBuilder textileBuilder = new StringBuilder(
        "|*field*|*frequency or values separated with commata*|\n");
    LOG.info("Field\tFrequency or comma separated values");
    LOG.info("----------------");
    createCsv(occurenceEntries, textileBuilder);
    createCsv(valueEntries, textileBuilder);
    // A local writer in try-with-resources: no state shared between calls, and
    // the writer is closed (and thereby flushed) even if writing fails.
    try (FileWriter textileWriter = new FileWriter(textileMappingFile)) {
      textileWriter.write(textileBuilder.toString());
    }
  }

  /**
   * Logs every entry and appends it as one table row {@code |key|value|} to
   * the given builder.
   *
   * @param entries the entries to output
   * @param textileBuilder the builder receiving the table rows
   */
  private static <T, I> void createCsv(final List<Entry<T, I>> entries,
      final StringBuilder textileBuilder) {
    for (final Entry<T, I> entry : entries) {
      LOG.info("{}\t{}", entry.getKey(), entry.getValue());
      textileBuilder.append('|').append(entry.getKey()).append('|')
          .append(entry.getValue()).append("|\n");
    }
  }

  /**
   * Returns the counted fields ordered by their count, highest count first.
   *
   * @return a new list of the entries of the occurrence map, sorted descending
   *         by value
   */
  List<Entry<String, Integer>> sortedByValuesDescending() {
    final List<Entry<String, Integer>> entries =
        new ArrayList<>(occurenceMap.entrySet());
    Collections.sort(entries,
        Entry.comparingByValue(Comparator.reverseOrder()));
    return entries;
  }
}