package ecologylab.bigsemantics.html.standalone;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;
import org.w3c.dom.Node;
import ecologylab.bigsemantics.html.DOMWalkInformationTagger;
import ecologylab.bigsemantics.html.ImgElement;
import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure;
import ecologylab.net.PURLConnection;
import ecologylab.net.ParsedURL;
/**
 * Standalone harness around {@link OldHTMLDOMParser} that tags a parsed DOM with
 * {@link DOMWalkInformationTagger} and asks
 * {@link RecognizedDocumentStructure#recognizeContentBody} for the content-body node.
 * <p>
 * {@link #main(String[])} runs this over a list of test-collection sites and compares the
 * result against each site's <code>label.xml</code>. The label comparison itself is currently
 * disabled (see {@link #getContentBody(ParsedURL, String)}), so the statistics counters are
 * never incremented.
 * <p>
 * The class is marked deprecated; no replacement is visible from this file.
 */
@Deprecated
public class ContentBodyRecognize extends OldHTMLDOMParser
{
    /**
     * Images whose known width or height is below this many pixels are not kept as article
     * images.
     */
    private static final int MIN_ARTICLE_IMAGE_DIMENSION = 100;

    /**
     * Base URL of the labeled test collection walked by {@link #main(String[])}.
     * NOTE: the identifier is misspelled ("COLLECITON"); it is kept as is because it is
     * package-visible and may be referenced elsewhere.
     */
    static final ParsedURL TEST_COLLECITON_BASE = ParsedURL.getAbsolute("http://csdll.cs.tamu.edu:9080/TestCollections/websites/ResearchArticle/");

    /**
     * Document structure for the page being processed. Only assigned by
     * {@link #parse(PURLConnection)}; it stays null when the DOM is obtained another way.
     */
    RecognizedDocumentStructure recPageType;

    /** Number of labeled documents seen. Only the disabled label comparison incremented it. */
    int totalLabeledDocument = 0;

    /** Number of correctly recognized content bodies. Only the disabled label comparison incremented it. */
    int correctContentBody = 0;

    /** Number of correctly recognized images. Only the disabled label comparison incremented it. */
    int correctImage = 0;

    /**
     * Creates a fresh {@link RecognizedDocumentStructure} for the connection's URL, then
     * delegates the actual parsing to the superclass.
     *
     * @param purlConnection connection to the document to parse
     * @return the document produced by the superclass parser
     * @throws IOException if the superclass parser fails
     */
    @Override
    public org.w3c.dom.Document parse(PURLConnection purlConnection) throws IOException
    {
        recPageType = new RecognizedDocumentStructure(purlConnection.getPurl());
        return super.parse(purlConnection);
    }

    /**
     * Tags the document tree and returns the node recognized as its content body. When a
     * content body is found, {@link #informativeImages()} is run as well.
     *
     * @param doc  parsed document; a null document, or one without a document element,
     *             yields null
     * @param out  unused; kept so existing callers keep compiling
     * @param purl location of the document, handed to the {@link DOMWalkInformationTagger}
     * @return the recognized content-body node, or null if none was recognized
     */
    public Node pprint(org.w3c.dom.Document doc, OutputStream out, ParsedURL purl)
    {
        if (doc == null)
        {
            return null;
        }
        // Declared as Node (not Element) so the same tagTree overload as before is selected.
        Node document = doc.getDocumentElement();
        if (document == null)
        {
            return null;
        }

        DOMWalkInformationTagger tagger = new DOMWalkInformationTagger(purl, null);
        tagger.tagTree(document);

        Node articleMain = RecognizedDocumentStructure.recognizeContentBody(tagger);
        if (articleMain != null)
        {
            // Disabled in the original:
            // recPageType.findImgsInContentBodySubTree(articleMain.parent(), imgNodes);
            informativeImages();
        }
        return articleMain;
    }

    /**
     * Reduces {@code recPageType.getImgNodesInContentBody()} to the images that pass
     * {@link #isArticleImage(ImgElement)}.
     * <p>
     * The previous implementation appended every passing image to the very list it was
     * iterating over while re-reading {@code size()} on each pass, so a single passing image
     * made the loop run until memory was exhausted. Rejected images are now removed in place
     * instead. This assumes the getter returns the live, mutable list, which the original
     * code already relied on when it called {@code add} on it.
     * <p>
     * Does nothing when {@link #recPageType} has not been set, which is the case when the
     * DOM was not produced through {@link #parse(PURLConnection)}.
     */
    protected void informativeImages()
    {
        if (recPageType == null || recPageType.getImgNodesInContentBody() == null)
        {
            return;
        }
        // Walk backwards so removing an element does not shift the indices still to visit.
        for (int i = recPageType.getImgNodesInContentBody().size() - 1; i >= 0; i--)
        {
            ImgElement imgElement = recPageType.getImgNodesInContentBody().get(i);
            if (!isArticleImage(imgElement))
            {
                recPageType.getImgNodesInContentBody().remove(i);
            }
        }
    }

    /**
     * Decides whether an image should be kept as an article image.
     *
     * @param imgElement image to inspect; null is rejected
     * @return false if the alt text contains "advertis" (case-insensitive) or a known
     *         dimension is below {@link #MIN_ARTICLE_IMAGE_DIMENSION}; true otherwise
     */
    private static boolean isArticleImage(ImgElement imgElement)
    {
        if (imgElement == null)
        {
            return false;
        }

        // Advertisement keyword in the "alt" value. A fixed locale keeps the match stable,
        // e.g. under a Turkish default locale "ADVERTIS" would otherwise lower-case with a
        // dotless i and never match.
        String altStr = imgElement.getAlt();
        if (altStr != null && altStr.toLowerCase(java.util.Locale.ENGLISH).contains("advertis"))
        {
            return false;
        }

        //FIXME -- andruid -- restore this!!!
        /*
        if( imgUrl!=null )
        {
            //FIXME -- use compiled regex!
            String urlChunks[] = imgUrl.split("/");
            for (int j=0; j<urlChunks.length; j++)
            {
                String temp = urlChunks[j].toLowerCase();
                if (temp.equals("adv") || temp.contains("advertis") ) // || temp.equals("ad")
                {
                    articleImg = false;
                    break;
                }
            }
        }
        */

        // -1 is exempt from the size test; presumably it marks an unknown dimension
        // (TODO confirm against ImgElement).
        int width = imgElement.getWidth();
        int height = imgElement.getHeight();
        boolean tooNarrow = width != -1 && width < MIN_ARTICLE_IMAGE_DIMENSION;
        boolean tooShort = height != -1 && height < MIN_ARTICLE_IMAGE_DIMENSION;
        return !(tooNarrow || tooShort);
    }

    /**
     * Reports whether the recognized content body matches the hand-made label file.
     * <p>
     * The comparison is disabled, so this always returns "no". The disabled implementation
     * translated the label file into a document state, compared {@code contentBodyID} with
     * the tag id of the first partition and of each of its informative-text entries, and
     * incremented {@link #totalLabeledDocument} and {@link #correctContentBody}; a further
     * disabled section matched informative-image tag ids and incremented
     * {@link #correctImage}.
     *
     * @param labelFilePurl location of the label file (currently unused)
     * @param contentBodyID tag id of the recognized content-body node (currently unused)
     * @return always "no"
     */
    protected String getContentBody(ParsedURL labelFilePurl, String contentBodyID)
    {
        return "no";
    }

    /**
     * Reads site names from <code>researchSites.txt</code> in the working directory, one per
     * line, evaluates each site below {@link #TEST_COLLECITON_BASE}, and prints the counters
     * at the end.
     *
     * @param args unused
     */
    public static void main(String args[])
    {
        ContentBodyRecognize cbr = new ContentBodyRecognize();
        BufferedReader siteList = null;
        try
        {
            siteList = new BufferedReader(
                    new InputStreamReader(new FileInputStream(new File("researchSites.txt")))); //"folderList.txt"
            String line;
            while ((line = siteList.readLine()) != null)
            {
                String siteName = line.trim();
                if (siteName.length() == 0)
                {
                    // A blank line would resolve to "/", i.e. the host root, not a test site.
                    continue;
                }
                evaluateSite(cbr, TEST_COLLECITON_BASE.getRelative(siteName + "/"));
            }
            System.out.println("STAT : " + cbr.totalLabeledDocument + " : " + cbr.correctContentBody
                    + " : images = " + cbr.correctImage );
        }
        catch (IOException e)
        {
            // Also covers MalformedURLException, which the original handled identically.
            e.printStackTrace();
        }
        finally
        {
            // The reader (and the file stream under it) was never closed before.
            if (siteList != null)
            {
                try
                {
                    siteList.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Recognizes the content body of one test site and checks it against the site's
     * <code>label.xml</code>. Both connections are recycled however the method exits.
     *
     * @param cbr  recognizer to run
     * @param purl URL of the site directory
     * @throws IOException if fetching or parsing fails; a missing label file is not an
     *                     error, the site is skipped silently
     */
    private static void evaluateSite(ContentBodyRecognize cbr, ParsedURL purl) throws IOException
    {
        ParsedURL labelPurl = purl.getRelative("label.xml");
        System.out.println(purl.toString());

        PURLConnection purlConnection = purl.connect();
        if (purlConnection == null)
        {
            // The original dereferenced this unchecked although it null-checks the label
            // connection obtained the same way.
            System.out.println("Could not connect to " + purl);
            return;
        }

        PURLConnection labelConnection = null;
        try
        {
            Node contentBodyNode = cbr.pprint(cbr.parseDOM(purlConnection.inputStream(), null), null, purl);
            labelConnection = labelPurl.connect();
            try
            {
                // Fetch the label content first, as before, so a missing label file still
                // surfaces as FileNotFoundException.
                boolean labelAvailable = (labelConnection != null)
                        && (labelConnection.urlConnection().getContent() != null);
                Node tagId = labelAvailable ? tagIdAttribute(contentBodyNode) : null;
                if (tagId != null)
                {
                    String returnVal = cbr.getContentBody(labelPurl, tagId.getNodeValue());
                    if (returnVal.equals("no"))
                    {
                        System.out.println("WHY NOT THIS!!!!!!!!!!! " + purl);
                    }
                }
                else
                {
                    System.out.println("NO LABE for this document");
                }
            }
            catch (FileNotFoundException e)
            {
                // No label file for this site: skip it without the trailing blank line,
                // matching the original `continue`.
                return;
            }
        }
        finally
        {
            try
            {
                purlConnection.recycle();
            }
            finally
            {
                if (labelConnection != null)
                {
                    labelConnection.recycle();
                }
            }
        }
        System.out.println("\n");
    }

    /**
     * Looks up the <code>tag_id</code> attribute of a node.
     *
     * @param node node to inspect, may be null
     * @return the attribute node, or null if the node is null, has no attribute map
     *         (non-element nodes), or has no such attribute
     */
    private static Node tagIdAttribute(Node node)
    {
        if (node == null)
        {
            return null;
        }
        org.w3c.dom.NamedNodeMap attributes = node.getAttributes();
        return (attributes == null) ? null : attributes.getNamedItem("tag_id");
    }
}