ImageCollectionPage.java example

Explorer
BigSemanticsJava-master
package ecologylab.bigsemantics.html.documentstructure;

import java.util.ArrayList;
import java.util.Collection;
import java.util.TreeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import ecologylab.bigsemantics.html.DOMParserInterface;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.ParagraphText;
import ecologylab.bigsemantics.html.utils.StringBuilderUtils;
import ecologylab.generic.StringTools;
import ecologylab.net.ParsedURL;
import ecologylab.serialization.XMLTools;

/**
 * Generate surrogates for the documents that are determined as Image-Collection Pages.
 * 
 * @author eunyee
 * 
 */
public class ImageCollectionPage extends RecognizedDocumentStructure
{
	public ImageCollectionPage(ParsedURL purl)
	{
		super(purl);
	}

	/**
	 * Generate surrogates for the images inside the image-collection pages.
	 * <p>
	 * For every image: if it has no usable alt text, try to recover a caption from the text
	 * surrounding it; then construct an image clipping through <code>htmlType</code>, passing along
	 * the anchor the image is wrapped in (if any). Each image element is recycled once processed.
	 * 
	 * @param articleMain		not used by this implementation.
	 * @param imgElements		images to build clippings for; each one has recycle() called on it.
	 * @param totalTxtLeng	not used by this implementation.
	 * @param paraTextMap		paragraph texts that may be associated with an image as its text context.
	 * @param htmlType			receiver of the constructed image clippings.
	 */
	@Override
	public void generateSurrogates(Node articleMain, ArrayList<ImgElement> imgElements, int totalTxtLeng, 
			TreeMap<Integer, ParagraphText> paraTextMap, DOMParserInterface htmlType)
	{
		// Snapshot the paragraphs once; the same array is scanned for every image.
		Collection<ParagraphText> paraTextValues	= paraTextMap.values();
		ParagraphText[] paraTexts	= paraTextValues.toArray(new ParagraphText[paraTextValues.size()]);

		// Index loop with size() re-read each pass, so the callbacks below never trip a fail-fast iterator.
		for (int i = 0; i < imgElements.size(); i++)
		{
			ImgElement imgElement	= imgElements.get(i);

			if (imgElement.getNonBogusAlt() == null)
				fillAltFromNearbyText(imgElement);

			ParsedURL anchorPurl	= findAnchorPURL(imgElement);

			// images in the image-collection pages won't have anchors
			// If there is an anchor, it should be pointing to the bigger image.
			if (anchorPurl == null)
			{
				htmlType.constructImageClipping(imgElement, null);
			}
			else if (anchorPurl.isImg())
			{
				htmlType.constructImageClipping(imgElement, anchorPurl);
				htmlType.removeTheContainerFromCandidates(anchorPurl);
			}
			else // anchor points at something other than an image (e.g. HTML, PDF, RSS)
			{
				// TODO find the anchorContext for this purl
				attachTextContext(imgElement, paraTexts);

				//TODO: sashi - Do something with the tv and InterestModel 
				htmlType.constructImageClipping(imgElement, anchorPurl);
			}
			imgElement.recycle();
		}
	}

	/**
	 * Set the image's alt text from the longest text found near it in the DOM: first in the subtree
	 * of its grandparent, then, if that yields nothing, in the subtree of its great-grandparent.
	 * Leaves the image untouched when no caption is found or when the ancestors do not exist.
	 */
	private void fillAltFromNearbyText(ImgElement imgElement)
	{
		Node gParent	= parentOrNull(parentOrNull(imgElement.getNode()));
		Node ggParent	= parentOrNull(gParent);

		// A missing ancestor has no subtree to search, so it is skipped instead of dereferenced.
		StringBuilder extractedCaption	= (gParent == null) ? null : getLongestTxtinSubTree(gParent, null);	// returns null in worst case
		if (extractedCaption == null && ggParent != null)
			extractedCaption	= getLongestTxtinSubTree(ggParent, null);	// returns null in worst case

		if (extractedCaption == null)
			return;

		XMLTools.unescapeXML(extractedCaption);
		imgElement.setAlt(StringTools.toString(extractedCaption));

		// The alt String has been taken above; the builder itself is no longer needed here.
		StringBuilderUtils.release(extractedCaption);
	}

	/**
	 * Associate the image with the first paragraph (in array order) whose element node is the
	 * image's parent, grandparent or great-grandparent. Only ancestor identity is tested;
	 * descendants of the paragraph node are not inspected. Does nothing when no paragraph matches.
	 */
	private void attachTextContext(ImgElement imgElement, ParagraphText[] paraTexts)
	{
		Node parent		= parentOrNull(imgElement.getNode());
		Node gParent	= parentOrNull(parent);
		Node ggParent	= parentOrNull(gParent);

		for (ParagraphText paraText : paraTexts)
		{
			Node paraTextNode	= paraText.getElementNode();
			// Skip paragraphs without a node: null must never "match" a missing ancestor.
			if (paraTextNode == null)
				continue;

			if (paraTextNode == parent || paraTextNode == gParent || paraTextNode == ggParent)
			{
				paraText.setImgElementTextContext(imgElement);
				break;
			}
		}
	}

	/**
	 * Null-safe parent lookup.
	 * 
	 * @return the parent of <code>node</code>, or null if <code>node</code> is null or has no parent.
	 */
	private static Node parentOrNull(Node node)
	{
		return (node == null) ? null : node.getParentNode();
	}
}