//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentmanipulators;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator;
/**
* Convert JSP101 style paragraph text to headings.
*
* Supports only subject headings (mapped to h1) and group/main headings (both mapped to h2)
*
* See notes in source code (or JSP101 itself) to definitions of each heading type.
*
*/
public class Jsp101Headings implements ContentManipulator {
@Override
public void manipulate(Document document) {
// Descriptions taken from the JSP101 Chapter 6, Paragraph 23
// Documents start with a subject heading, which helps the
// reader know at a glance the general subject of the document. However, you may
// omit a subject heading from a letter to a member of the public. The subject heading
// is written in bold capitals (not underlined and not followed by a full stop).
// => H1 if bold and in CAPS, no full stop.
addHeading(document, true, "h1");
// A main heading introduces 2 or more groups of paragraphs
// relating to the same general topic. Use main headings only when the document is
// lengthy or complicated. A main heading shows the general content as far as the
// next main heading. Centre a main heading above the text and use bold letters (not
// numbered, not underlined and not followed by a full stop)
// => H2 if bold, no full stop
addHeading(document, false, "h2");
// A group heading introduces one or more paragraphs relating
// to the same general topic. It shows the content as far as the next group or main
// heading. Starting at the left margin, write a group heading in bold letters (not
// numbered, not underlined and not followed by a full stop).
// => h3 if bold, no full stop and centred
// BUT can't do centred, so the same as the main heading covered above
// A paragraph heading indicates the content only of its
// own paragraph (including any sub-paragraphs and further subdivisions), but not of
// any following text. Once you use a paragraph heading, give all following paragraphs
// a heading until the next main or group heading. Put a paragraph heading on the
// same line as the opening words of the paragraph, preceded by a paragraph number
// if appropriate. Write a paragraph heading in bold (but not underlined), with a full
// stop to show where the heading ends.
// Ditto sub paragraph heading.
// => h4 if p starts (after a para number eg 1. / a.) with a bold sentence, ending in full stop.
// BUT seems little reason to treat this differently to the a real sentence.
// Plus there's the complexity of list item vs paragraph number.
}
/**
* Adds the heading for all bold paragraphs which don't end in full stop (which are captials
* depending on the boolean)
*
* @param document the document
* @param capitals does the heading need to be in capitals
* @param headingTag the HTML heading tag to use
*/
private void addHeading(Document document, boolean capitals, String headingTag) {
document.select("p").forEach(p -> {
String text = p.text().trim();
if (!isBold(p, text))
return;
// No full stop (or similar) at the end of a title
if(text.isEmpty() || text.substring(text.length() - 1).matches("[\\.:!\\?,;]"))
return;
if(!capitals || isAllUpperCase(text))
p.tagName(headingTag);
});
}
private boolean isAllUpperCase(String text) {
return text.toUpperCase().equals(text);
}
private boolean isBold(Element e, String text) {
// Bold if one of the parents is bold
if (!e.getElementsByTag("b").select("b").isEmpty()) {
return true;
}
// Or if Bold child which has the same text.
Elements boldChildren = e.select("b");
if (boldChildren.isEmpty()) {
return false;
}
return boldChildren.first().text().equals(text);
}
}