// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.consumers;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.jsoup.nodes.Element;
import uk.gov.dstl.baleen.consumers.utils.AbstractHtmlConsumer;
import uk.gov.dstl.baleen.types.semantic.Entity;
/**
 * Creates HTML5 versions of the document, with entities annotated as spans. The original formatting
 * of the document is lost, and only the content is kept.
 *
 * <p>Document text and entity values are HTML-escaped before being handed to the HTML parser, so
 * characters such as {@code <} and {@code &} in the source document are rendered as text rather
 * than being interpreted as markup.
 *
 * <p>Relationships are not currently supported.
 *
 * @baleen.javadoc
 */
public class Html5 extends AbstractHtmlConsumer {
  /** Closing tag emitted for every entity span. */
  private static final String SPAN_END = "</span>";

  /**
   * Works out which markup has to be inserted at which character offset of the document text.
   *
   * <p>At any given offset the closing tags of entities ending there are emitted first, followed by
   * the opening tags of entities starting there. Entities that start at the same offset are nested
   * so that the one ending last is outermost; where two such entities also end at the same offset,
   * the one encountered later is placed outside the one encountered earlier.
   *
   * <p>An entity whose end is not after its begin is written as an empty span (opening tag
   * immediately followed by its closing tag) so that the generated markup stays balanced.
   *
   * @param jCas the CAS holding the entities to mark up
   * @return the markup to insert, keyed by character offset and sorted by ascending offset
   */
  private Map<Integer, String> getEntityInsertPositions(JCas jCas) {
    // Entities grouped by start offset, each list kept ordered from outermost to innermost
    Map<Integer, List<Entity>> entityStartPositions = new HashMap<>();
    // Number of closing tags required at each offset
    Map<Integer, Integer> closingCounts = new HashMap<>();

    for (Entity e : JCasUtil.select(jCas, Entity.class)) {
      List<Entity> starting =
          entityStartPositions.computeIfAbsent(e.getBegin(), k -> new ArrayList<>());

      // Entities that finish after this one must wrap it, so it goes directly after them
      int index = (int) starting.stream().filter(e2 -> e2.getEnd() > e.getEnd()).count();
      starting.add(index, e);

      // Empty entities are closed together with their opening tag further down
      if (e.getEnd() > e.getBegin()) {
        closingCounts.merge(e.getEnd(), 1, Integer::sum);
      }
    }

    Map<Integer, String> insertPositions = new TreeMap<>();

    for (Entry<Integer, Integer> closing : closingCounts.entrySet()) {
      StringBuilder tags = new StringBuilder();
      for (int i = 0; i < closing.getValue(); i++) {
        tags.append(SPAN_END);
      }
      insertPositions.put(closing.getKey(), tags.toString());
    }

    for (Entry<Integer, List<Entity>> opening : entityStartPositions.entrySet()) {
      // Closing tags (if any) stay in front so spans that end here are shut before new ones open
      StringBuilder tags = new StringBuilder(insertPositions.getOrDefault(opening.getKey(), ""));
      for (Entity e : opening.getValue()) {
        tags.append(generateSpanStart(e));
        if (e.getEnd() <= e.getBegin()) {
          tags.append(SPAN_END);
        }
      }
      insertPositions.put(opening.getKey(), tags.toString());
    }

    return insertPositions;
  }

  /**
   * Builds the opening span tag for an entity.
   *
   * <p>Double quotes in the entity value are replaced by single quotes and the characters
   * {@code &}, {@code <} and {@code >} are escaped, so the value cannot terminate the attribute or
   * be decoded as a character reference when the markup is parsed.
   *
   * @param e the entity to create the tag for
   * @return the opening span tag carrying the entity's class name, external id, value and referent
   */
  private String generateSpanStart(Entity e) {
    String value = e.getValue() == null ? "" : escapeHtml(e.getValue().replace('"', '\''));
    String referent =
        e.getReferent() == null ? "" : Long.toString(e.getReferent().getInternalId());

    return String.format(
        "<span class=\"baleen %s\" id=\"%s\" value=\"%s\" data-referent=\"%s\">",
        e.getClass().getSimpleName(), e.getExternalId(), value, referent);
  }

  /**
   * Escapes the characters that are significant to an HTML parser.
   *
   * @param s the raw text, must not be null
   * @return the text with {@code &}, {@code <} and {@code >} replaced by character references
   */
  private static String escapeHtml(String s) {
    StringBuilder escaped = new StringBuilder(s.length());
    for (int i = 0; i < s.length(); i++) {
      char c = s.charAt(i);
      switch (c) {
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        default:
          escaped.append(c);
          break;
      }
    }
    return escaped.toString();
  }

  /**
   * Writes the document text into a {@code div} appended to the body, with the entity spans
   * inserted at their offsets.
   *
   * @param jCas the CAS providing the document text and entities
   * @param body the body element to append the content to
   */
  @Override
  protected void writeBody(JCas jCas, Element body) {
    // Entities
    Map<Integer, String> insertPositions = getEntityInsertPositions(jCas);

    Element div = body.appendElement("div");
    div.attr("style", "white-space: pre-line");

    String text = jCas.getDocumentText();
    if (text == null) {
      text = "";
    }

    // Single pass over the (sorted) insert positions: escaped text up to the position, then markup
    StringBuilder html = new StringBuilder(text.length());
    int last = 0;
    for (Entry<Integer, String> pos : insertPositions.entrySet()) {
      // Keep offsets inside the text so a stray annotation cannot abort the whole document
      int position = Math.min(Math.max(pos.getKey(), last), text.length());
      html.append(escapeHtml(text.substring(last, position)));
      html.append(pos.getValue());
      last = position;
    }
    html.append(escapeHtml(text.substring(last)));

    div.append(html.toString());
  }
}