//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentmappers.structure; import java.util.Set; import org.apache.uima.UIMAException; import org.apache.uima.jcas.JCas; import org.jsoup.Jsoup; import org.reflections.Reflections; import com.google.common.collect.ImmutableList; import uk.gov.dstl.baleen.contentextractors.helpers.DocumentToJCasConverter; import uk.gov.dstl.baleen.contentmappers.StructuralAnnotations; import uk.gov.dstl.baleen.types.structure.Structure; import uk.gov.dstl.baleen.uima.testing.JCasSingleton; import uk.gov.dstl.baleen.uima.utils.StructureHierarchy; import uk.gov.dstl.baleen.uima.utils.select.Node; public class AbstractHtmlToStructureTest { private static final Set<Class<? extends Structure>> structuralClasses; private static final DocumentToJCasConverter converter = new DocumentToJCasConverter(ImmutableList.of(new StructuralAnnotations())); static { Reflections reflections = new Reflections(Structure.class.getPackage().getName()); structuralClasses = reflections.getSubTypesOf(Structure.class); } public Node<Structure> createStructure(String html) throws UIMAException { JCas jCas = JCasSingleton.getJCasInstance(); converter.apply(Jsoup.parse(html), jCas); return StructureHierarchy.build(jCas, structuralClasses).getRoot(); } }