package ecologylab.bigsemantics.html.documentstructure;
import java.util.ArrayList;
import java.util.TreeMap;
import org.w3c.dom.Node;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.net.ParsedURL;
/**
* Document structure for text-only pages.
* Such a page may still contain images, but none of them are informative, so the page is
* recognized as text-only.
*
* @author eunyee
* @author andruid
*/
public class TextOnlyPage extends RecognizedDocumentStructure
{
  /**
   * Length threshold for a paragraph: a surrogate is only generated when
   * {@code paraText.length()} is strictly greater than this value.
   */
  private static final int MIN_PARA_TEXT_LENGTH = 25;

  /**
   * Upper bound on how many paragraphs a single call to
   * {@link #generateSurrogates} takes out of the paragraph map. Paragraphs that turn out to be
   * too short still count against this bound.
   */
  public static final int MAX_TEXT_SURROGATES = 5;

  /**
   * Creates the structure for the given location.
   *
   * @param purl location handed straight to the superclass constructor
   */
  public TextOnlyPage(ParsedURL purl)
  {
    super(purl);
  }

  /**
   * Generates text surrogates only; image nodes are not used.
   * <p>
   * Entries are removed from {@code paraTexts} starting with the highest key and working
   * downwards, until either the map is empty or {@link #MAX_TEXT_SURROGATES} entries have been
   * taken. Each removed paragraph is offered to the text-surrogate step and then recycled.
   * Entries that are not taken stay in the map untouched.
   *
   * @param articleMain   not used by this implementation
   * @param imgNodes      not used by this implementation
   * @param totalTxtLeng  not used by this implementation
   * @param paraTexts     candidate paragraphs; modified in place (processed entries are removed)
   * @param htmlType      parser callback that receives each accepted paragraph
   */
  @Override
  public void generateSurrogates(Node articleMain, ArrayList<ImgElement> imgNodes, int totalTxtLeng,
      TreeMap<Integer, ParagraphText> paraTexts, DOMParserInterface htmlType)
  {
    for (int taken = 0; taken < MAX_TEXT_SURROGATES && !paraTexts.isEmpty(); taken++)
    {
      Integer highestKey = paraTexts.lastKey();
      ParagraphText candidate = paraTexts.remove(highestKey);

      generateTextSurrogate(candidate, htmlType);

      // The paragraph is no longer needed here; the original author notes that
      // recycle() releases it back to a pool.
      candidate.recycle();
    }
  }

  /**
   * Passes one paragraph to the parser as a text clipping, provided it is long enough.
   * Paragraphs whose length does not exceed {@link #MIN_PARA_TEXT_LENGTH} are ignored.
   *
   * @param paraText  paragraph to consider; its XML escapes are undone before it is handed over
   * @param htmlType  parser callback whose {@code constructTextClipping} receives the paragraph
   */
  private void generateTextSurrogate(ParagraphText paraText, DOMParserInterface htmlType)
  {
    if (paraText.length() <= MIN_PARA_TEXT_LENGTH)
    {
      return;
    }

    paraText.unescapeXML();
    htmlType.constructTextClipping(paraText);
  }
}