package ecologylab.bigsemantics.html.documentstructure; import java.util.ArrayList; import java.util.TreeMap; import org.w3c.dom.Node; import ecologylab.bigsemantics.html.DOMParserInterface; import ecologylab.bigsemantics.html.ImgElement; import ecologylab.bigsemantics.html.ParagraphText; import ecologylab.net.ParsedURL; /** * This is for the text-only page. * This page might contain images but those images are not informative images, thus we recognize those pages with non-informative images * as text-only pages. * * @author eunyee * @author andruid */ public class TextOnlyPage extends RecognizedDocumentStructure { public TextOnlyPage(ParsedURL purl) { super(purl); } private static final int MIN_PARA_TEXT_LENGTH = 25; public static final int MAX_TEXT_SURROGATES = 5; /** * Generate only text surrogates */ @Override public void generateSurrogates(Node articleMain, ArrayList<ImgElement> imgNodes, int totalTxtLeng, TreeMap<Integer, ParagraphText> paraTexts, DOMParserInterface htmlType) { int count = 0; while (!paraTexts.isEmpty() && count++<MAX_TEXT_SURROGATES) { ParagraphText paraText = paraTexts.remove(paraTexts.lastKey()); generateTextSurrogate(paraText, htmlType); paraText.recycle(); // releases to pool } } private void generateTextSurrogate(ParagraphText paraText, DOMParserInterface htmlType) { if (paraText.length()>MIN_PARA_TEXT_LENGTH) { paraText.unescapeXML(); // Creates a TextElement using the buffy in paraText htmlType.constructTextClipping(paraText);//, paraTexts.node ); } } }