package org.juxtasoftware.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

import org.apache.commons.io.IOUtils;
import org.juxtasoftware.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static helpers for reducing HTML sources to plain text and for removing
 * the head, script and style sections from an HTML file.
 */
public class HtmlUtils {
    private static final Logger LOG = LoggerFactory.getLogger( Constants.WS_LOGGER_NAME );

    /**
     * Tags whose end tag is rendered as a line break in the plain text output.
     * Built once; never modified after class initialisation.
     * NOTE(review): H6 is not listed although H1-H5 are - confirm whether that is intentional.
     */
    private static final List<HTML.Tag> LINEFEED_TAGS = new ArrayList<HTML.Tag>();
    static {
        LINEFEED_TAGS.add(HTML.Tag.DIV);
        LINEFEED_TAGS.add(HTML.Tag.H1);
        LINEFEED_TAGS.add(HTML.Tag.H2);
        LINEFEED_TAGS.add(HTML.Tag.H3);
        LINEFEED_TAGS.add(HTML.Tag.H4);
        LINEFEED_TAGS.add(HTML.Tag.H5);
        LINEFEED_TAGS.add(HTML.Tag.LI);
        LINEFEED_TAGS.add(HTML.Tag.P);
        LINEFEED_TAGS.add(HTML.Tag.PRE);
        LINEFEED_TAGS.add(HTML.Tag.TH);
        LINEFEED_TAGS.add(HTML.Tag.TR);
    }

    /** Names of the elements that {@link #strip(File)} removes together with their content. */
    private static final String[] STRIPPED_ELEMENTS = { "head", "script", "style" };

    /**
     * Transform source html stream into a plain text file.
     * <p>
     * The stream is decoded as UTF-8 and the text is written, UTF-8 encoded, to a
     * newly created temporary file. Block level end tags, BR and HR are rendered as
     * a newline; the end of a P gets two. The input stream is not closed here.
     * <p>
     * Write failures raised while the parser is calling back are logged and parsing
     * continues; a failure of the final flush/close is propagated to the caller.
     *
     * @param htmlStream HTML content to convert
     * @return the temporary file holding the extracted text
     * @throws IOException if the temporary file cannot be created or written, or the
     *         HTML cannot be read
     */
    public static File toTxt( InputStream htmlStream ) throws IOException {
        final File out = File.createTempFile("totxt", "dat");
        boolean completed = false;
        FileOutputStream fos = null;
        OutputStreamWriter writer = null;
        try {
            fos = new FileOutputStream(out);
            writer = new OutputStreamWriter(fos, "UTF-8");
            final OutputStreamWriter osw = writer;

            HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
                @Override
                public void handleText(char[] data, int pos) {
                    try {
                        osw.write(data);
                    } catch (IOException e) {
                        LOG.error("Error writing HTML text data", e);
                    }
                }

                @Override
                public void handleEndTag(HTML.Tag t, int pos) {
                    if ( HtmlUtils.isLinefeedTag(t) ) {
                        try {
                            osw.append( "\n" );
                            if ( t.equals(HTML.Tag.P)) {
                                // double break p tags to get space between paragraphs
                                osw.append( "\n" );
                            }
                        } catch (IOException e) {
                            LOG.error("Error writing simple HTML tag linebreak", e);
                        }
                    }
                }

                @Override
                public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                    if ( t.equals(HTML.Tag.BR) || t.equals(HTML.Tag.HR) ) {
                        try {
                            osw.append( "\n" );
                        } catch (IOException e) {
                            LOG.error("Error writing simple HTML tag linebreak", e);
                        }
                    }
                }
            };

            new ParserDelegator().parse( new InputStreamReader(htmlStream, "UTF-8"), callback, false);

            // Close explicitly so that a failed final flush is reported instead of
            // silently handing back a truncated file.
            writer.close();
            completed = true;
            return out;
        } finally {
            // No-ops after a successful close; release the handles on every error path.
            IOUtils.closeQuietly(writer);
            IOUtils.closeQuietly(fos);
            if ( !completed ) {
                // Nobody receives the file on failure, so do not leave it behind.
                out.delete();
            }
        }
    }

    /**
     * Check whether the end of the given tag is rendered as a line break.
     *
     * @param tag tag reported by the HTML parser
     * @return true if the tag is one of the line breaking tags
     */
    private static boolean isLinefeedTag( HTML.Tag tag ) {
        return LINEFEED_TAGS.contains(tag);
    }

    /**
     * Strip header, javascript and css from HTML source file.
     * <p>
     * The file is read as UTF-8. Every head, script and style element is removed
     * together with its content, including several elements on one line and elements
     * spanning many lines. Lines that are blank after stripping are dropped. Tag
     * names are matched case-insensitively and must be followed by whitespace,
     * '/', '>' or the end of the line, so that e.g. &lt;header&gt; is not mistaken
     * for &lt;head&gt;. Closing tags are matched in their exact form, e.g. &lt;/script&gt;.
     * <p>
     * The result is first written to a temporary file and only copied over the
     * original once it has been written completely.
     *
     * @param srcFile the HTML file to strip in place
     * @throws IOException if the file cannot be read or rewritten
     */
    public static void strip( File srcFile ) throws IOException {
        final File out = File.createTempFile("stripped", "dat");
        try {
            writeStripped( srcFile, out );
            // copy the stripped file back over the original
            copyFile( out, srcFile );
        } finally {
            out.delete();
        }
    }

    /** Write the stripped version of src to dest, line by line. */
    private static void writeStripped( File src, File dest ) throws IOException {
        BufferedReader reader = null;
        OutputStreamWriter writer = null;
        try {
            reader = new BufferedReader( new InputStreamReader( new FileInputStream(src), "UTF-8") );
            writer = new OutputStreamWriter( new FileOutputStream(dest), "UTF-8");

            // Closing tag of the element currently being skipped; null when not skipping.
            String pendingClose = null;
            String line;
            while ( (line = reader.readLine()) != null ) {
                final StringBuilder kept = new StringBuilder();
                pendingClose = stripLine( line, pendingClose, kept );
                if ( kept.toString().trim().length() > 0 ) {
                    kept.append('\n');
                    writer.write( kept.toString() );
                }
            }

            // Propagate flush errors: the caller must not overwrite the original
            // with a partially written file.
            writer.close();
        } finally {
            IOUtils.closeQuietly( reader );
            IOUtils.closeQuietly( writer );
        }
    }

    /**
     * Append the parts of one line that lie outside stripped elements to kept.
     *
     * @param line the line to scan
     * @param pendingClose closing tag still awaited from a previous line, or null
     * @param kept receives the text that survives stripping
     * @return the closing tag still awaited when the line ends, or null
     */
    private static String stripLine( String line, String pendingClose, StringBuilder kept ) {
        final int len = line.length();
        int pos = 0;
        while ( pos < len ) {
            if ( pendingClose != null ) {
                final int end = indexOfIgnoreCase( line, pendingClose, pos );
                if ( end < 0 ) {
                    // the element continues on the next line
                    return pendingClose;
                }
                pos = end + pendingClose.length();
                pendingClose = null;
            } else {
                // find the stripped element that opens first in the rest of the line
                int start = -1;
                String element = null;
                for ( String candidate : STRIPPED_ELEMENTS ) {
                    final int idx = indexOfOpenTag( line, candidate, pos );
                    if ( idx > -1 && (start < 0 || idx < start) ) {
                        start = idx;
                        element = candidate;
                    }
                }
                if ( start < 0 ) {
                    kept.append( line, pos, len );
                    return null;
                }
                kept.append( line, pos, start );
                pendingClose = "</" + element + ">";
                pos = start + element.length() + 1;
            }
        }
        return pendingClose;
    }

    /**
     * Find the opening tag of element at or after from. The name must be followed by
     * whitespace, '/', '>' or the end of the line. Returns -1 if there is none.
     */
    private static int indexOfOpenTag( String line, String element, int from ) {
        final String prefix = "<" + element;
        int idx = indexOfIgnoreCase( line, prefix, from );
        while ( idx > -1 ) {
            final int after = idx + prefix.length();
            if ( after >= line.length() ) {
                // tag is split across lines, e.g. "<script" followed by attributes on the next line
                return idx;
            }
            final char c = line.charAt(after);
            if ( c == '>' || c == '/' || Character.isWhitespace(c) ) {
                return idx;
            }
            idx = indexOfIgnoreCase( line, prefix, idx + 1 );
        }
        return -1;
    }

    /**
     * Case-insensitive, locale independent indexOf. Works on the original text so the
     * returned offset is always valid for it. Returns -1 if target is not found.
     */
    private static int indexOfIgnoreCase( String text, String target, int from ) {
        final int last = text.length() - target.length();
        for ( int i = Math.max(from, 0); i <= last; i++ ) {
            if ( text.regionMatches(true, i, target, 0, target.length()) ) {
                return i;
            }
        }
        return -1;
    }

    /** Copy the content of one file over another, closing both streams. */
    private static void copyFile( File from, File to ) throws IOException {
        FileInputStream in = null;
        FileOutputStream dst = null;
        try {
            in = new FileInputStream(from);
            dst = new FileOutputStream(to);
            IOUtils.copy( in, dst );
            dst.close();
        } finally {
            IOUtils.closeQuietly( in );
            IOUtils.closeQuietly( dst );
        }
    }
}