//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentmanipulators; import java.util.Collections; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator; /** * Converts BR tags into new paragraphs. * * If BR tags exist in a paragraph or other tag we probably want Baleen to process these are as * separate block of text. This manipulator uses BR tags to introduce new paragraphs. * * If the BR tag occurs within a paragraph, then the paragraph is split into multiple sub * paragraphs. If the br occurs elsewhere (eg in a td or li) then the a set of paragraphs are * introduced into the element. */ public class NewLineToNewParagraph implements ContentManipulator { @Override public void manipulate(Document document) { // Find elements which need to be spilt up Set<Element> elementsWithBr = new HashSet<>(); document.select("br").forEach(e -> elementsWithBr.add(e.parent())); // For each parent elementsWithBr.forEach(e -> { List<Element> runs = collectRuns(document, e); if (!runs.isEmpty()) { addRunsToDom(e, runs); } }); } /** * Collect tags which are on the same line (unbroken by BRs) * * @param document the document * @param e the e * @return the list */ private List<Element> collectRuns(Document document, Element e) { List<Element> runs = new LinkedList<>(); Element run = null; for (Node c : e.childNodesCopy()) { if (c instanceof Element && ("br".equalsIgnoreCase(((Element) c).tagName()))) { // If we hit a br then add the old run and start a new one if (run != null) { runs.add(run); run = null; } } else { // If not a br then add this node to the other if (run == null) { run = document.createElement("p"); } run.appendChild(c); } } // Add the last run if (run != null) { runs.add(run); } return runs; } /** * Adds each new line (a run) to the documnet as a paragraph. * * @param e the element at which to add the runs. * @param runs the runs */ private void addRunsToDom(Element e, List<Element> runs) { // Add these new spans into the DOM if ("p".equalsIgnoreCase(e.tagName())) { // If this is a p, then just add below it // reverse order so the first element of runs ends up closest to p as it should be Collections.reverse(runs); runs.forEach(e::after); // Delete the old paragraph e.remove(); } else { // If we aren't in a p (eg in a li) then lets add paragraphs to this element // But first clear it out e.children().remove(); runs.forEach(e::appendChild); } } }