package br.com.citframework.html; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import javax.swing.text.MutableAttributeSet; import javax.swing.text.html.HTML; import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.parser.ParserDelegator; public class LeitorHTML { protected static HTMLParseLister process(final String nomeArquivo, final boolean debugParser) throws IOException { final FileInputStream fis = new FileInputStream(nomeArquivo); final InputStreamReader isr = new InputStreamReader(fis); final BufferedReader br = new BufferedReader(isr); final HTMLEditorKit.Parser parser = new ParserDelegator(); final HTMLParseLister h = new HTMLParseLister(debugParser); parser.parse(br, h, true); br.close(); return h; } /** * Obtem o texto nao formatado de um arquivo HTML (sem as tags). * * @param nomeArquivo * @param debugParser * @return * @throws IOException */ public static StringBuilder getTextoFromArquivoHTML(final String nomeArquivo, final boolean debugParser) throws IOException { final HTMLParseLister h = process(nomeArquivo, debugParser); if (debugParser) { System.out.println(">>>> LeitorHTML:: TEXTO EXTRA�DO: " + h.getStrBuffTextData().toString()); } return h.getStrBuffTextData(); } /** * Obtem uma cole��o de elementos string - nao formatado de um arquivo HTML (sem as tags). * * @param nomeArquivo * @param debugParser * @return * @throws IOException */ public static Collection getCollectionFromArquivoHTML(final String nomeArquivo, final boolean debugParser) throws IOException { final HTMLParseLister h = process(nomeArquivo, debugParser); if (debugParser) { if (h.getColBuffTextData() != null) { int i = 0; for (final Iterator it = h.getColBuffTextData().iterator(); it.hasNext();) { final String element = (String) it.next(); System.out.println(">>>> LeitorHTML:: TEXTO EXTRA�DO (" + i + "): " + element); i++; } } } return h.getColBuffTextData(); } } class HTMLParseLister extends HTMLEditorKit.ParserCallback { int indentSize = 0; boolean debug = false; StringBuilder strBuffTextData = null; Collection colBuffTextData = null; public HTMLParseLister() { debug = false; } public HTMLParseLister(final boolean debugParm) { debug = debugParm; } protected void indent() { indentSize += 3; } protected void unIndent() { indentSize -= 3; if (indentSize < 0) { indentSize = 0; } } protected void pIndent() { if (debug) { for (int i = 0; i < indentSize; i++) { System.out.print(" "); } } } @Override public void handleText(final char[] data, final int pos) { this.pIndent(); // System.out.println("Text(" + data.length + " chars)"); if (debug) { System.out.println(data); } if (strBuffTextData == null) { strBuffTextData = new StringBuilder(); } if (colBuffTextData == null) { colBuffTextData = new ArrayList<>(); } strBuffTextData.append(data); strBuffTextData.append("\n"); colBuffTextData.add("" + new String(data)); } @Override public void handleComment(final char[] data, final int pos) { this.pIndent(); if (debug) { System.out.println("Comment(" + data.length + " chars)"); } } @Override public void handleStartTag(final HTML.Tag t, final MutableAttributeSet a, final int pos) { this.pIndent(); if (debug) { System.out.println("Tag start(<" + t.toString() + ">, " + a.getAttributeCount() + " attrs)"); } this.indent(); } @Override public void handleEndTag(final HTML.Tag t, final int pos) { this.unIndent(); this.pIndent(); if (debug) { System.out.println("Tag end<" + t.toString() + ">"); } } @Override public void handleSimpleTag(final HTML.Tag t, final MutableAttributeSet a, final int pos) { this.pIndent(); if (debug) { System.out.println("Tag(<" + t.toString() + ">, " + a.getAttributeCount() + " attrs)"); } } @Override public void handleError(final String errorMsg, final int pos) { if (debug) { System.out.println("Parsing error: " + errorMsg + " at " + pos); } } public StringBuilder getStrBuffTextData() { return strBuffTextData; } public void setStrBuffTextData(final StringBuilder strBuffTextData) { this.strBuffTextData = strBuffTextData; } public Collection getColBuffTextData() { return colBuffTextData; } public void setColBuffTextData(final Collection colBuffTextData) { this.colBuffTextData = colBuffTextData; } }