//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.structural;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import uk.gov.dstl.baleen.types.structure.Structure;
import uk.gov.dstl.baleen.types.structure.Table;
import uk.gov.dstl.baleen.types.structure.TableCell;
import uk.gov.dstl.baleen.uima.utils.StructureHierarchy;
import uk.gov.dstl.baleen.uima.utils.StructureUtil;
import uk.gov.dstl.baleen.uima.utils.select.ItemHierarchy;
import uk.gov.dstl.baleen.uima.utils.select.Node;
import uk.gov.dstl.baleen.uima.utils.select.Nodes;

/**
 * Helper class for working with {@link Table}s.
 *
 * <p>
 * This is an example of how structure selectors can be used. An instance starts
 * with every "Table" node selected from the document's structure hierarchy and
 * is narrowed by successive {@code withColumn} calls; the getters then expose
 * the remaining tables, their rows and their cells.
 */
public class Tables {

    /** Column patterns in the order they were added through {@code withColumn}. */
    private final List<Pattern> columnPatterns = new ArrayList<>();

    /** The table nodes that pass every column filter applied so far. */
    private Nodes<Structure> nodes;

    /**
     * Constructor for tables helper
     *
     * @param jCas
     *            the jcas
     * @throws AnalysisEngineProcessException
     *             if class creation error
     */
    public Tables(JCas jCas) throws AnalysisEngineProcessException {
        try {
            ItemHierarchy<Structure> hierarchy = StructureHierarchy.build(jCas,
                    StructureUtil.getStructureClasses());
            nodes = hierarchy.getRoot().select("Table");
        } catch (ResourceInitializationException e) {
            throw new AnalysisEngineProcessException("Can not create structure helper", null, e);
        }
    }

    /**
     * Filter to tables with a column matching the given name.
     * <p>
     * This uses the table header to find column names. The string is compiled
     * with {@link Pattern#compile(String)} and passed to
     * {@link #withColumn(Pattern)}.
     *
     * @param p
     *            the pattern to match
     * @return this for builder pattern
     */
    public Tables withColumn(String p) {
        return withColumn(Pattern.compile(p));
    }

    /**
     * Filter to tables with a column matching the given name.
     * <p>
     * This uses the table header to find column names. The pattern is also
     * remembered, in call order, so that {@link #getFilteredRows()} can pick out
     * the corresponding columns.
     *
     * @param p
     *            the pattern to match
     * @return this for builder pattern
     */
    public Tables withColumn(Pattern p) {
        // NOTE(review): the raw pattern text is spliced into the selector string;
        // patterns containing selector metacharacters (e.g. unbalanced
        // parentheses) may not parse as intended - confirm against the selector
        // grammar.
        nodes = nodes.select(":has(TableHeader:matches(" + p.pattern() + "))");
        columnPatterns.add(p);
        return this;
    }

    /**
     * Get the rows of the tables filtered to the specified columns and in given
     * column order.
     * <p>
     * Each table is processed on its own: the column positions are looked up in
     * that table's header and the cells are taken from that table's body only.
     * A table contributes no rows when no column patterns have been added, when
     * one of the patterns does not match an individual header cell, or when the
     * selected columns do not all contain the same number of cells.
     *
     * @return a stream of the filtered rows (ie list of TableCells)
     */
    public Stream<List<TableCell>> getFilteredRows() {
        return nodes.stream().flatMap(this::filteredRowsOf);
    }

    /**
     * Build the filtered rows of a single table.
     *
     * @param table
     *            the table node to read
     * @return the rows of that table restricted to the requested columns, or an
     *         empty stream if the columns cannot be resolved consistently
     */
    private Stream<List<TableCell>> filteredRowsOf(Node<Structure> table) {
        List<Nodes<Structure>> columns = new ArrayList<>(columnPatterns.size());
        for (Pattern pattern : columnPatterns) {
            // Resolve the column inside THIS table; selecting from the full node
            // set would mix header positions and cells of different tables.
            Nodes<Structure> headerCells = table
                    .select("TableHeader TableCell:matches(" + pattern.pattern() + ")");
            if (headerCells.size() == 0) {
                // The header text matched as a whole but no single cell did, so
                // there is no column to extract from this table.
                return Stream.empty();
            }
            int columnIndex = headerCells.get(0).getSiblingIndex();
            // The sibling index is used zero-based here, nth-child is one-based.
            columns.add(table.select(
                    "TableBody > TableRow > TableCell:nth-child(" + (columnIndex + 1) + ")"));
        }

        if (columns.isEmpty()) {
            return Stream.empty();
        }

        // validate that the input lists are all the same size.
        int numItems = columns.get(0).size();
        for (int i = 1; i < columns.size(); i++) {
            if (columns.get(i).size() != numItems) {
                // non-uniform-length list at index i
                return Stream.empty();
            }
        }

        List<List<TableCell>> result = new ArrayList<>(numItems);
        for (int i = 0; i < numItems; i++) {
            // create a tuple of the i-th entries of each list
            List<TableCell> row = new ArrayList<>(columns.size());
            for (Nodes<Structure> column : columns) {
                row.add((TableCell) column.get(i).getItem());
            }
            result.add(row);
        }
        return result.stream();
    }

    /**
     * Get the cells of the tables filtered to the specified columns.
     *
     * @return a stream of the filtered table cells
     */
    public Stream<TableCell> getFilteredCells() {
        return getFilteredRows().flatMap(List::stream);
    }

    /**
     * Get the tables which pass the specified column filters.
     *
     * @return a stream of the filtered tables
     */
    public Stream<Table> getTables() {
        return nodes.stream().map(t -> (Table) t.getItem());
    }

    /**
     * Get the full rows of the tables which pass the specified column filters.
     *
     * @return a stream of rows, each row being the list of all its table cells
     */
    public Stream<List<TableCell>> getRows() {
        return nodes.select("TableBody > TableRow").stream()
                .map(tr -> tr.getChildren().stream()
                        .map(tc -> (TableCell) tc.getItem())
                        .collect(Collectors.toList()));
    }

    /**
     * Get all the cells of the tables which pass the specified column filters.
     *
     * @return a stream of the table cells
     */
    public Stream<TableCell> getCells() {
        return getRows().flatMap(List::stream);
    }
}