package ecologylab.bigsemantics.html.standalone; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; import java.io.StringWriter; import org.w3c.dom.Node; import ecologylab.bigsemantics.html.DOMWalkInformationTagger; import ecologylab.net.ParsedURL; /** * * @author eunyee * */ @Deprecated public class GenerateSVMData extends OldHTMLDOMParser { public void pprint(org.w3c.dom.Document doc, OutputStream out, ParsedURL purl, String mainPartitionID) { // Out o = new OutJavaImpl(this.getConfiguration(), null); Node document = null; // o.state = StreamIn.FSM_ASCII; // o.encoding = configuration.CharEncoding; // if (out != null) // { // Instantiate PPrint constructor that connects to combinFormation DOMWalkInformationTagger pprint = new DOMWalkInformationTagger(purl, null); // To generate SVM data pprint.setPartitionID(mainPartitionID); FileOutputStream outFile; try { outFile = new FileOutputStream("svmData.csv", true); pprint.setFileOutputStream(outFile); StringWriter o = new StringWriter(); // o.out = out; // if (configuration.xmlTags) // pprint.printXMLTree(o, (short)0, 0, null, document); // else pprint.tagTree(document); outFile.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /* protected String getContentBody(URL labelFile) { try { DocumentState ds = (DocumentState) ElementState.translateFromXML(labelFile, TranslationScope.get("collectionBrowseServlet", "collectionBrowseServlet")); if( ds!=null ) { //totalLabeledDocument++; PartitionState partitionState = ds.getPartitionSet().get(0); String mainPartitionTag_ID = partitionState.getTag_id(); System.out.println(" mainPartitionTag_ID:" + mainPartitionTag_ID); return mainPartitionTag_ID; } } catch (XMLTranslationException e) { e.printStackTrace(); } return null; } public static void main(String args[]) { GenerateSVMData cbr = new GenerateSVMData(); URL url; try { File ff = new File( "folderList.txt" ); InputStream ii = new FileInputStream(ff); BufferedReader myInput = new BufferedReader(new InputStreamReader(ii)); String temp = null; while( (temp=myInput.readLine())!=null ) { String urlString = "http://csdll.cs.tamu.edu:9080/TestCollections/websites/NewsContent/" + temp.trim() + "/"; String labelURLStr = urlString + "label.xml"; url = new URL(urlString); System.out.println(urlString); System.out.println("\n\n" + urlString ); URL labelURL = new URL(labelURLStr); try { if( (labelURL!=null) && (labelURL.openConnection()!=null) && (labelURL.getContent()!=null) ) { String returnVal=cbr.getContentBody(labelURL); if( returnVal != null ) { InputStream in = url.openConnection().getInputStream(); cbr.pprint( cbr.parseDOM(in, null), null, urlString, returnVal); } } else System.out.println("NO LABEL for this document"); } catch(FileNotFoundException e) { continue; } System.out.println("\n"); } } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } */ }