package ecologylab.bigsemantics.html; import java.util.ArrayList; import org.w3c.dom.Node; import ecologylab.bigsemantics.html.documentstructure.AnchorContext; import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure; import ecologylab.bigsemantics.metadata.builtins.ImageClipping; import ecologylab.net.ParsedURL; /** * Interface for connecting our DOMImageText extraction code to various DOM Parsers, starting w Tidy. * * @author eunyee, aaron, andruid */ public interface DOMParserInterface { public void setTitle(Node node); public void setBold(boolean on) ; public void setItalic(boolean on); public void generateCandidateContainersFromContexts(ArrayList<AnchorContext> anchorContexts, boolean fromContentBody); /** * Construct a TextClipping, with text not associated with an ImageClipping. * Associate with this. * * @param paraText */ public void constructTextClipping(ParagraphText paraText); public int numExtractedClippings(); public void removeTheContainerFromCandidates(ParsedURL containerPURL); /** * Construct a clipping for a different Document. * * @param imgNode * @param anchorHref * @return */ public ImageClipping constructAnchorImageClipping(ImgElement imgNode, ParsedURL anchorHref); public ImageClipping constructImageClipping(ImgElement imgNode, ParsedURL anchorHref); public void setIndexPage ( ); public void setContent ( ); public void setRecognizedDocumentStructure(Class<? extends RecognizedDocumentStructure> pageType); // public void setDocument(org.w3c.dom.Document node); }