import net.htmlparser.jericho.*;
import java.util.*;
import java.io.*;
import java.net.*;
/**
 * Sample program that copies an HTML document segment by segment using a
 * {@link StreamedSource}, writing the result to {@code StreamedSourceCopyOutput.html}
 * in the current working directory.
 *
 * <p>The loop body is a template: the marked "HANDLE ..." branches are the places to
 * add custom processing for tags, character references and plain text. With no
 * custom processing every segment is written out verbatim.
 */
public class StreamedSourceCopy {
    /** Name of the file the copy is written to; also used in the completion message. */
    private static final String OUTPUT_FILE_NAME = "StreamedSourceCopyOutput.html";

    /**
     * Copies the document named by the first argument to the output file.
     *
     * @param args optional; {@code args[0]} is the source URL or file path. When it
     *             contains no {@code ':'} it is treated as a file path and prefixed
     *             with {@code file:}. Defaults to {@code data/test.html}.
     * @throws Exception if the source cannot be opened or read, or the output cannot
     *                   be written; failures are propagated rather than hidden so
     *                   that a partial copy is never mistaken for a successful one.
     */
    public static void main(String[] args) throws Exception {
        String sourceUrlString = "data/test.html";
        if (args.length == 0) {
            System.err.println("Using default argument of \"" + sourceUrlString + '"');
        } else {
            sourceUrlString = args[0];
        }
        if (sourceUrlString.indexOf(':') == -1) {
            sourceUrlString = "file:" + sourceUrlString;
        }
        StreamedSource streamedSource = new StreamedSource(new URL(sourceUrlString));
        Writer writer = null;
        try {
            // NOTE(review): FileWriter encodes with the platform default charset, which
            // may differ from the encoding of the source document - confirm this is acceptable.
            writer = new FileWriter(OUTPUT_FILE_NAME);
            System.out.println("Processing segments:");
            int lastSegmentEnd = 0;
            for (Segment segment : streamedSource) {
                System.out.println(segment.getDebugInfo());
                // if this tag is inside the previous tag (e.g. a server tag) then ignore it
                // as it was already output along with the previous tag.
                if (segment.getEnd() <= lastSegmentEnd) {
                    continue;
                }
                lastSegmentEnd = segment.getEnd();
                if (segment instanceof Tag) {
                    Tag tag = (Tag) segment;
                    // HANDLE TAG
                    // Uncomment the following line to ensure each tag is valid XML:
                    // writer.write(tag.tidy()); continue;
                } else if (segment instanceof CharacterReference) {
                    CharacterReference characterReference = (CharacterReference) segment;
                    // HANDLE CHARACTER REFERENCE
                    // Uncomment the following line to decode all character references instead of copying them verbatim:
                    // characterReference.appendCharTo(writer); continue;
                } else {
                    // HANDLE PLAIN TEXT
                }
                // unless specific handling has prevented getting to here, simply output the segment as is:
                writer.write(segment.toString());
            }
            // Close explicitly on the success path so that a failure to flush the
            // remaining output is reported instead of being silently discarded.
            writer.close();
            System.err.println("\nA copy of the source document has been output to " + OUTPUT_FILE_NAME);
        } finally {
            // Closing an already closed Writer has no effect, so this only matters when
            // the copy failed part way through; the original failure keeps propagating.
            closeQuietly(writer);
            try {
                streamedSource.close();
            } catch (Exception ignored) {
                // Best-effort cleanup of the input; must not mask a failure from the copy itself.
            }
        }
    }

    /** Closes the given resource if it is non-null, ignoring any failure to do so. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best-effort cleanup; the caller is already finished with or abandoning the resource.
        }
    }
}