package ecologylab.bigsemantics.html.standalone; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.StringWriter; import java.net.MalformedURLException; import java.net.URL; import org.w3c.dom.Node; import ecologylab.bigsemantics.html.DOMWalkInformationTagger; import ecologylab.bigsemantics.html.documentstructure.RecognizedDocumentStructure; import ecologylab.net.PURLConnection; import ecologylab.net.ParsedURL; /** * * @author eunyee */ @Deprecated public class ArticlePageRecognize extends OldHTMLDOMParser { private static final String PORTAL_PURL = "http://portal.acm.org/browse_dl.cfm?coll=ACM&dl=ACM&idx=J961&linked=1&part=transaction"; /** * Pretty-prints a DOM Document. * Extract Image and Text Surrogates while walk through DOM */ public void pprint(org.w3c.dom.Document doc, OutputStream out, ParsedURL purl) { // Out o = new OutJavaImpl(this.getConfiguration(), null); StringWriter o = new StringWriter(); Node document; // if (!(doc instanceof DOMDocumentImpl)) { // return; // } document = doc.getDocumentElement(); // o.state = StreamIn.FSM_ASCII; // o.encoding = configuration.CharEncoding; // if (out != null) // { // Instantiate PPrint constructor that connects to combinFormation DOMWalkInformationTagger pprint = new DOMWalkInformationTagger(purl, null); // o.out = out; // if (configuration.xmlTags) // pprint.printXMLTree(o, (short)0, 0, null, document); // else pprint.tagTree(document); Node articleMain = RecognizedDocumentStructure.recognizeContentBody(pprint); //System.out.println("ArticleMain: " + articleMain ); if( articleMain == null ) { nonArticlePage++; System.out.println("NON ARTICLE PAGE!!!!!!!!!!!!! "); } else { articlePage++; System.out.println("YES!!!!!!!! ARTICLE PAGE!!!!!!!!!!!"); } } int articlePage = 0; int nonArticlePage = 0; // /* public static void main(String args[]) { ArticlePageRecognize apr = new ArticlePageRecognize(); URL url; try { /* File ff = new File( "NonArticle-folderList.txt" ); // "researchIndex.txt"); // File ff = new File("folderList.txt"); InputStream ii = new FileInputStream(ff); BufferedReader myInput = new BufferedReader(new InputStreamReader(ii)); String temp = null; while( (temp=myInput.readLine())!=null ) { String urlString = "http://csdll.cs.tamu.edu:9080/TestCollections/websites/ResearchIndex/" + temp + "/"; url = new URL(urlString); System.out.println(urlString); String labelURLStr = urlString + "label.xml"; System.out.println("\n\n" + urlString ); // URL labelURL = new URL(labelURLStr); // if( (labelURL!=null) && (labelURL.openConnection()!=null) && (labelURL.getContent()!=null) ) // { InputStream in = url.openConnection().getInputStream(); apr.pprint( apr.parseDOM(in, null), null, urlString); System.out.println("DONE \n"); // } } System.out.println("ArticlePage: " + apr.articlePage + " NonArticlePage: " + apr.nonArticlePage ); double sum = apr.articlePage + apr.nonArticlePage; System.out.println("ArticlePage: " + (double)apr.articlePage/sum); System.out.println("NonArticlePage: " + (double)apr.nonArticlePage/sum); */ ParsedURL purl = ParsedURL.getAbsolute(PORTAL_PURL); PURLConnection purlConnection = purl.connect(); InputStream in = purlConnection.inputStream(); apr.pprint( apr.parseDOM(in, null), null, purl); in.close(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } // */ /* public static void main(String args[]) { ArticlePageRecognize apr = new ArticlePageRecognize(); String urlString = "http://csdll.cs.tamu.edu:9080/TestCollections/websites/News/1178399895044/"; URL url; try { url = new URL(urlString); InputStream in = url.openConnection().getInputStream(); apr.pprint(apr.parseDOM(in, null), null, urlString); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } */ }