package ecologylab.bigsemantics.html.documentstructure;
import java.util.ArrayList;
import java.util.Collection;
import java.util.TreeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;
/**
 * Generates surrogates (image clippings) for documents that have been recognized as
 * image-collection pages.
 *
 * @author eunyee
 */
public class ImageCollectionPage extends RecognizedDocumentStructure
{
	/**
	 * @param purl location of the document; passed straight through to the superclass.
	 */
	public ImageCollectionPage(ParsedURL purl)
	{
		super(purl);
	}

	/**
	 * Generate surrogates for the images inside the image-collection page.
	 * <p>
	 * For every image, in order:
	 * <ol>
	 * <li>if the image has no usable alt text, try to derive a caption from the text near it in
	 * the DOM and store that as the alt text;</li>
	 * <li>look up the anchor the image is associated with and build an image clipping through
	 * {@code htmlType}; when the anchor points to something that is not an image, first try to
	 * associate the image with the paragraph that contains it;</li>
	 * <li>recycle the image element.</li>
	 * </ol>
	 *
	 * @param articleMain  not used by this implementation.
	 * @param imgElements  images found in the document; each one is recycled after it is processed.
	 * @param totalTxtLeng not used by this implementation.
	 * @param paraTextMap  paragraph candidates; a snapshot of its values is taken on entry.
	 * @param htmlType     receiver of the generated image clippings.
	 */
	@Override
	public void generateSurrogates(Node articleMain, ArrayList<ImgElement> imgElements, int totalTxtLeng,
			TreeMap<Integer, ParagraphText> paraTextMap, DOMParserInterface htmlType)
	{
		// Snapshot the paragraphs up front: the htmlType callbacks below may touch the candidate
		// collections, so we do not iterate the live view of the map.
		ParagraphText[] paraTexts = paraTextMap.values().toArray(new ParagraphText[0]);

		// Index-based loop on purpose (size re-read on every pass), for the same reason.
		for (int i = 0; i < imgElements.size(); i++)
		{
			ImgElement imgElement = imgElements.get(i);

			if (imgElement.getNonBogusAlt() == null)
				fillAltFromNearbyText(imgElement);

			ParsedURL anchorPurl = findAnchorPURL(imgElement);
			// images in the image-collection pages won't have anchors
			// If there is an anchor, it should be pointing to the bigger image.
			if (anchorPurl == null)
			{
				htmlType.constructImageClipping(imgElement, null);
			}
			else if (anchorPurl.isImg())
			{
				htmlType.constructImageClipping(imgElement, anchorPurl);
				htmlType.removeTheContainerFromCandidates(anchorPurl);
			}
			else // if (anchorPurl.isHTML() || anchorPurl.isPDF() || anchorPurl.isRSS())
			{
				// TODO find the anchorContext for this purl
				bindImageToEnclosingParagraph(imgElement, paraTexts);
				// TODO: sashi - Do something with the tv and InterestModel
				htmlType.constructImageClipping(imgElement, anchorPurl);
			}
			imgElement.recycle();
		}
	}

	/**
	 * Tries to derive a caption for an image that has no usable alt text: asks
	 * {@code getLongestTxtinSubTree} for text under the image's grandparent and, if that yields
	 * nothing, under its great-grandparent. A caption that is found is XML-unescaped, stored as
	 * the image's alt text, and its buffer is released.
	 * <p>
	 * Ancestors that do not exist (image close to the document root, or detached) are skipped
	 * instead of being dereferenced.
	 */
	private void fillAltFromNearbyText(ImgElement imgElement)
	{
		Node imageNode = imgElement.getNode();

		StringBuilder extractedCaption = null;
		// Widen the search one level at a time: grandparent first, then great-grandparent.
		for (int levelsUp = 2; levelsUp <= 3 && extractedCaption == null; levelsUp++)
		{
			Node searchRoot = ancestorOrNull(imageNode, levelsUp);
			if (searchRoot == null)
				break; // no higher ancestors exist either
			extractedCaption = getLongestTxtinSubTree(searchRoot, null); // returns null in worst case
		}

		if (extractedCaption != null)
		{
			XMLTools.unescapeXML(extractedCaption);
			imgElement.setAlt(StringTools.toString(extractedCaption));
			StringBuilderUtils.release(extractedCaption);
		}
	}

	/**
	 * Associates the image with the first paragraph whose element node is the image's parent,
	 * grandparent or great-grandparent, by calling
	 * {@link ParagraphText#setImgElementTextContext}. Does nothing when no paragraph matches.
	 */
	private static void bindImageToEnclosingParagraph(ImgElement imgElement, ParagraphText[] paraTexts)
	{
		Node imageNode = imgElement.getNode();
		Node parent = ancestorOrNull(imageNode, 1);
		Node gParent = ancestorOrNull(imageNode, 2);
		Node ggParent = ancestorOrNull(imageNode, 3);

		for (ParagraphText paraText : paraTexts)
		{
			if (paraText == null)
				continue;
			Node paraTextNode = paraText.getElementNode();
			if (paraTextNode == null)
				continue; // a missing ancestor must never "match" a paragraph without a node

			// NOTE(review): the previous version also looped over paraTextNode's children comparing
			// each child with paraTextNode itself. A node is never its own child, so that loop could
			// not match anything and has been dropped. TODO confirm what comparison was intended.
			if (paraTextNode == parent || paraTextNode == gParent || paraTextNode == ggParent)
			{
				paraText.setImgElementTextContext(imgElement);
				break;
			}
		}
	}

	/**
	 * Walks {@code levelsUp} parent links up from {@code node}.
	 *
	 * @return the ancestor, or null if {@code node} is null or the chain ends before that level.
	 */
	private static Node ancestorOrNull(Node node, int levelsUp)
	{
		Node current = node;
		for (int level = 0; level < levelsUp && current != null; level++)
			current = current.getParentNode();
		return current;
	}
}