//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.templates;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import org.apache.commons.lang.StringUtils;

import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.types.structure.Structure;
import uk.gov.dstl.baleen.types.structure.TableBody;
import uk.gov.dstl.baleen.types.templates.TemplateField;
import uk.gov.dstl.baleen.types.templates.TemplateRecord;
import uk.gov.dstl.baleen.uima.utils.SelectorPart;
import uk.gov.dstl.baleen.uima.utils.SelectorPath;
import uk.gov.dstl.baleen.uima.utils.select.ItemHierarchy;
import uk.gov.dstl.baleen.uima.utils.select.Node;

/**
 * This class is used to search for {@link SelectorPath}s for the creation of
 * {@link TemplateRecord}s and {@link TemplateField}s.
 * <p>
 * It records matches found for repeating units; subsequent calls compensate the
 * path for these repeated structures. It is therefore important to process the
 * document in order, so that later items are correctly compensated for earlier
 * repeating annotations. The order property of the record definitions can be
 * used for this.
 */
public class RecordStructureManager {

  /**
   * Index adjustments to apply when selecting children, keyed by parent
   * structure and then by child structure type. A positive value means extra
   * (repeated) children of that type were found, a negative value means an
   * expected child of that type was missing.
   */
  private final Map<Structure, Map<Class<?>, Integer>> data = new HashMap<>();

  /** The structure hierarchy to process */
  private final ItemHierarchy<Structure> structureHierarchy;

  /**
   * Constructor for the record structure manager
   *
   * @param structureHierarchy
   *            the structure hierarchy
   */
  public RecordStructureManager(ItemHierarchy<Structure> structureHierarchy) {
    this.structureHierarchy = structureHierarchy;
  }

  /**
   * Get the structure annotation for the given path if found
   *
   * @see SelectorPath
   * @param path
   *            the selector path, in the string form accepted by
   *            {@link SelectorPath#parse(String)}
   * @return optional of the structure annotation
   * @throws InvalidParameterException
   *             if the path can not be parsed
   */
  public Optional<Structure> select(String path) throws InvalidParameterException {
    return select(SelectorPath.parse(path));
  }

  /**
   * Get the structure annotation for the given selector path if found
   *
   * @see SelectorPath
   * @param selectorPath
   *            the selector path
   * @return optional of the structure annotation
   */
  public Optional<Structure> select(SelectorPath selectorPath) {
    return select(structureHierarchy.getRoot(), selectorPath);
  }

  /**
   * Internal recursive method to select the path. This compensates the path
   * for recorded repeating (or missing) structures.
   *
   * @param node
   *            the current node
   * @param selectorParts
   *            the (remaining) path to select
   * @return optional of the structure annotation, empty if no child matches
   *         the compensated index
   */
  private Optional<Structure> select(Node<Structure> node, SelectorPath selectorParts) {
    if (selectorParts.isEmpty()) {
      return Optional.ofNullable(node.getItem());
    }

    SelectorPart current = selectorParts.get(0);
    SelectorPath remaining = selectorParts.step();
    Class<?> type = current.getType();

    Map<Class<?>, Integer> adjustments = data.get(node.getItem());
    int adjustment = adjustments == null ? 0 : adjustments.getOrDefault(type, 0);

    // Recorded missing structures give a negative adjustment; Stream.skip
    // rejects negative values, and a negative position can not exist anyway.
    int toSkip = Math.max(0, current.getIndex() - 1) + adjustment;
    if (toSkip < 0) {
      return Optional.empty();
    }

    return node.getChildren().stream()
        .filter(child -> type.equals(child.getItem().getClass()))
        .skip(toSkip)
        .findFirst()
        .flatMap(child -> select(child, remaining));
  }

  /**
   * Create a definition of the possible repeating structures to search for
   *
   * @param recordDefinition
   *            the record definition
   * @return The repeat search object derived from the given record definition
   * @throws InvalidParameterException
   *             if any of the paths are not valid
   */
  public RepeatSearch createRepeatSearch(TemplateRecordConfiguration recordDefinition)
      throws InvalidParameterException {
    List<SelectorPath> coveredRepeat = createRepeatUnit(recordDefinition.getCoveredPaths());
    SelectorPath minimalRepeat = SelectorPath.parse(recordDefinition.getMinimalRepeat());
    return new RepeatSearch(coveredRepeat, minimalRepeat);
  }

  /**
   * Create the repeat unit by parsing each of the given paths
   *
   * @param paths
   *            the paths to generate the repeat from
   * @return a list of the repeating paths, in the order given
   * @throws InvalidParameterException
   *             if one of the paths is invalid
   */
  private List<SelectorPath> createRepeatUnit(List<String> paths) throws InvalidParameterException {
    List<SelectorPath> repeat = new ArrayList<>(paths.size());
    for (String path : paths) {
      repeat.add(SelectorPath.parse(path));
    }
    return repeat;
  }

  /**
   * Try to find the given repeating structure after the given preceding
   * structure. The minimal repeat is tried first (if it has any depth), then
   * the full list of covered paths.
   *
   * @param preceding
   *            the preceding structure
   * @param repeatSearch
   *            the repeat search
   * @return optional of the found repeating structures; when present the list
   *         is never empty
   */
  private Optional<List<Structure>> getRepeat(Optional<Structure> preceding, RepeatSearch repeatSearch) {
    SelectorPath minimalRepeat = repeatSearch.getMinimalRepeat();
    Optional<List<Structure>> match = Optional.empty();
    if (minimalRepeat.getDepth() > 0) {
      match = checkNextForMatch(minimalRepeat, preceding);
    }
    if (!match.isPresent()) {
      match = searchParts(preceding, repeatSearch.getCoveredRepeat());
    }
    return match;
  }

  /**
   * Try to find every path of the given repeat, one after another, starting
   * after the given preceding structure.
   *
   * @param preceding
   *            the preceding structure
   * @param repeat
   *            the repeat to search for
   * @return optional of the found repeating structures; empty if any path
   *         fails to match or if there are no paths to search for
   */
  private Optional<List<Structure>> searchParts(Optional<Structure> preceding, List<SelectorPath> repeat) {
    // Nothing to search for means nothing repeats; returning an empty list
    // here would make callers index into an empty list.
    if (repeat.isEmpty()) {
      return Optional.empty();
    }

    Optional<Structure> current = preceding;
    List<Structure> matched = new ArrayList<>();
    for (SelectorPath covered : repeat) {
      Optional<List<Structure>> match = checkNextForMatch(covered, current);
      if (!match.isPresent()) {
        return Optional.empty();
      }
      List<Structure> nextMatched = match.get();
      // continue the search from the last structure consumed by this path
      current = Optional.of(nextMatched.get(nextMatched.size() - 1));
      matched.addAll(nextMatched);
    }
    return Optional.of(matched);
  }

  /**
   * Check the next structure to see if it matches the expected repeating
   * part. Structures that do not match but are empty or whitespace are
   * skipped over and included in the result before the matching structure.
   *
   * @param path
   *            the path to match with
   * @param structure
   *            the current structure; if absent the search starts at the
   *            first child of the hierarchy root
   * @return optional of the skipped whitespace structures followed by the
   *         matched structure; empty if the path is null or empty, or no
   *         match is found
   */
  private Optional<List<Structure>> checkNextForMatch(SelectorPath path, Optional<Structure> structure) {
    if (path == null || path.isEmpty()) {
      return Optional.empty();
    }

    List<Structure> matching = new ArrayList<>();
    Optional<Structure> cursor = structure;
    while (true) {
      Optional<Structure> nextCheck = nextStructure(cursor);
      if (!nextCheck.isPresent()) {
        return Optional.empty();
      }

      Structure next = nextCheck.get();
      Optional<Structure> match = match(structureHierarchy.getPath(next), path);
      if (match.isPresent()) {
        matching.add(match.get());
        return Optional.of(matching);
      }
      if (!isWhitespace(next)) {
        return Optional.empty();
      }

      // tolerate empty structures between the preceding point and the match
      matching.add(next);
      cursor = nextCheck;
    }
  }

  /**
   * Get the structure following the given one, or the first child of the
   * hierarchy root when no structure is given.
   *
   * @param structure
   *            the optional current structure
   * @return optional of the next structure
   */
  private Optional<Structure> nextStructure(Optional<Structure> structure) {
    if (structure.isPresent()) {
      return structureHierarchy.getNext(structure.get());
    }
    return structureHierarchy.getRoot().getChildren().stream().map(Node::getItem).findFirst();
  }

  /**
   * Check if the annotation contains anything
   *
   * @param structure
   *            the structure
   * @return true if the covered text is null, empty or only white space
   */
  private boolean isWhitespace(Structure structure) {
    return StringUtils.isBlank(structure.getCoveredText());
  }

  /**
   * Check if the annotation contains anything
   *
   * @param structure
   *            the structure
   * @return true if not empty or white space
   */
  private boolean isNotWhitespace(Structure structure) {
    return !isWhitespace(structure);
  }

  /**
   * Try to match the given structure path to the selector path given. We
   * first check the path matches the given list, then if there are remaining
   * elements of the path we check if the first child at each level matches
   * the path.
   *
   * @param structurePath
   *            the starting structure path to match
   * @param path
   *            the path to match
   * @return optional of the matching structure; empty if the types differ,
   *         the structure path is empty, or it is deeper than the selector
   *         path
   */
  private Optional<Structure> match(List<Structure> structurePath, SelectorPath path) {
    if (structurePath.isEmpty()) {
      return Optional.empty();
    }

    Iterator<SelectorPart> iterator = path.getParts().iterator();
    for (Structure s : structurePath) {
      // a structure nested deeper than the selector path can not match it
      if (!iterator.hasNext() || !iterator.next().getType().equals(s.getClass())) {
        return Optional.empty();
      }
    }

    Structure current = structurePath.get(structurePath.size() - 1);
    while (iterator.hasNext()) {
      List<Structure> children = structureHierarchy.getChildren(current);
      if (children.isEmpty()) {
        return Optional.empty();
      }
      current = children.get(0);
      if (!iterator.next().getType().equals(current.getClass())) {
        return Optional.empty();
      }
    }
    return Optional.of(current);
  }

  /**
   * Check if the next structures are compatible with the given covered paths.
   * <p>
   * The matched structures may contain additional empty structures. This aims
   * to make the record extraction more robust to additional whitespace
   * particularly around repeating units.
   * <p>
   * A match on a call that is not the first is recorded as a repeat; a
   * failure to match on the first call is recorded as missing. Both are used
   * to compensate later calls to {@link #select(String)}.
   *
   * @param preceding
   *            the optional preceding point in the structure
   * @param repeatSearch
   *            the repeat search
   * @param isFirst
   *            set true if this is the first call of a repeating record
   * @return optional of the final matched structure if we can repeat, empty
   *         if not
   */
  public Optional<Structure> repeatRecord(Optional<Structure> preceding, RepeatSearch repeatSearch,
      boolean isFirst) {
    Optional<List<Structure>> repeat = getRepeat(preceding, repeatSearch);
    if (!repeat.isPresent()) {
      // At least one match would be expected without repetition, so a
      // failure on the first attempt means the record itself is missing
      if (isFirst) {
        recordMissing(repeatSearch.getCoveredRepeat());
      }
      return Optional.empty();
    }

    List<Structure> structures = repeat.get();
    // At least one match would be expected without repetition, so only
    // matches after the first are additional structures
    if (!isFirst) {
      recordMatch(structures);
    }
    return Optional.of(structures.get(structures.size() - 1));
  }

  /**
   * Check if the next structures are compatible with the given path.
   * <p>
   * A match is only accepted (and recorded) if the last matched structure
   * ends at or before the given end offset.
   *
   * @param preceding
   *            the optional preceding point in the structure
   * @param path
   *            the repeating path
   * @param end
   *            the end of the record
   * @return optional of the first matched structure that is not empty or
   *         whitespace if we can repeat, empty if not
   */
  public Optional<Structure> repeatField(Optional<Structure> preceding, SelectorPath path, int end) {
    Optional<List<Structure>> repeat = checkNextForMatch(path, preceding);
    if (repeat.isPresent()) {
      List<Structure> structures = repeat.get();
      Structure lastStructure = structures.get(structures.size() - 1);
      if (lastStructure.getEnd() <= end) {
        recordMatch(structures);
        return structures.stream().filter(this::isNotWhitespace).findFirst();
      }
    }
    return Optional.empty();
  }

  /**
   * Record the repeating match of the given structures. These will be used to
   * compensate future calls to {@link #select(String)}.
   *
   * @param structures
   *            the repeating structures
   */
  private void recordMatch(List<Structure> structures) {
    for (Structure s : structures) {
      recordMatch(s);
    }
  }

  /**
   * Record the repeating match of the given structure. These will be used to
   * compensate future calls to {@link #select(String)}.
   * <p>
   * A {@link TableBody} is recorded as a repeat of its parent structure, when
   * it has one.
   *
   * @param record
   *            the repeating structure
   */
  public void recordMatch(Structure record) {
    Structure target = record;
    if (record instanceof TableBody) {
      target = structureHierarchy.getParent(record).orElse(record);
    }
    record(target);
  }

  /**
   * Record one additional structure of the given structure's type under its
   * parent. Nothing is recorded if the structure has no parent.
   *
   * @param current
   *            the repeating structure
   */
  private void record(Structure current) {
    structureHierarchy.getParent(current)
        .ifPresent(parent -> adjust(parent, current.getClass(), 1));
  }

  /**
   * Record the missing first repeat of the given selector paths. These will
   * be used to compensate future calls to {@link #select(String)}.
   *
   * @param paths
   *            the paths of the missing repeating structures
   */
  private void recordMissing(List<SelectorPath> paths) {
    for (SelectorPath s : paths) {
      recordMissing(s);
    }
  }

  /**
   * Record the missing first repeat of the given selector path. This will be
   * used to compensate future calls to {@link #select(String)}.
   * <p>
   * Nothing is recorded if the path has no parts or its parent path can not
   * be selected.
   *
   * @param path
   *            the path of the missing repeating structure
   */
  public void recordMissing(SelectorPath path) {
    int depth = path.getDepth();
    if (depth < 1) {
      // no final part, so there is no type to record as missing
      return;
    }

    Class<?> type = path.get(depth - 1).getType();
    select(path.toDepth(depth - 1)).ifPresent(parent -> adjust(parent, type, -1));
  }

  /**
   * Add the given delta to the recorded adjustment for children of the given
   * type under the given parent.
   *
   * @param parent
   *            the parent structure
   * @param type
   *            the child structure type
   * @param delta
   *            the amount to add to the current adjustment
   */
  private void adjust(Structure parent, Class<?> type, int delta) {
    data.computeIfAbsent(parent, key -> new HashMap<>()).merge(type, delta, Integer::sum);
  }
}