/*
* Autopsy Forensic Browser
*
* Copyright 2012 Basis Technology Corp.
* Contact: carrier <at> sleuthkit <dot> org
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.sleuthkit.autopsy.keywordsearch;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.logging.Level;
import org.sleuthkit.autopsy.coreutils.Logger;
import net.htmlparser.jericho.Attributes;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
/**
* Uses Jericho HTML Parser to create a Reader for output, consisting of the
* text, comments, tag attributes, and other important information found in the
* HTML.
*/
class JerichoParserWrapper {

    private static final Logger logger = Logger.getLogger(JerichoParserWrapper.class.getName());

    /** HTML input to parse; this class never closes it itself. */
    private final InputStream in;

    /** Result of the last successful parse(), or null if there is none. */
    private Reader reader;

    /**
     * Creates a wrapper around the given HTML input. Nothing is read until
     * parse() is called.
     *
     * @param in stream containing the HTML document to parse
     */
    JerichoParserWrapper(InputStream in) {
        this.in = in;
    }

    /**
     * Returns the reader, initialized in parse(), which will be null if parse()
     * has not been called or if the most recent call to parse() failed.
     *
     * @return Reader over the extracted text, or null
     */
    public Reader getReader() {
        return reader;
    }

    /**
     * Parses the InputStream and initializes the reader with the extracted
     * text. The output is the document rendered as plain text, followed by a
     * "NONVISIBLE TEXT" part listing, in numbered sections, the scripts,
     * links, images, comments and any other tags that carry attributes.
     * Sections with no entries are omitted.
     *
     * If the stream cannot be read, a warning is logged and the reader is left
     * null.
     */
    public void parse() {
        // Drop the result of any earlier call so that a failure below cannot
        // leave a stale reader visible through getReader().
        reader = null;

        try {
            Source source = new Source(in);
            source.fullSequentialParse();

            StringBuilder scripts = new StringBuilder();
            StringBuilder links = new StringBuilder();
            StringBuilder images = new StringBuilder();
            StringBuilder comments = new StringBuilder();
            StringBuilder others = new StringBuilder();

            // Entry numbers are 1-based within each section.
            int numScripts = 1;
            int numLinks = 1;
            int numImages = 1;
            int numComments = 1;
            int numOthers = 1;

            String text = renderHTMLAsPlainText(source);

            // Get all the tags in the source
            List<StartTag> tags = source.getAllStartTags();
            for (StartTag tag : tags) {
                String name = tag.getName();
                if (name.equals("script")) { //NON-NLS
                    scripts.append(numScripts).append(") ");
                    // If the <script> tag has attributes, list them first
                    if (tag.getTagContent().length() > 0) {
                        scripts.append(tag.getTagContent()).append(" ");
                    }
                    // Get what is between the <script> .. </script> tags
                    scripts.append(tag.getElement().getContent()).append("\n");
                    numScripts++;
                } else if (name.equals("a")) { //NON-NLS
                    links.append(numLinks).append(") ");
                    links.append(tag.getTagContent()).append("\n");
                    numLinks++;
                } else if (name.equals("img")) { //NON-NLS
                    images.append(numImages).append(") ");
                    images.append(tag.getTagContent()).append("\n");
                    numImages++;
                } else if (tag.getTagType().equals(StartTagType.COMMENT)) {
                    comments.append(numComments).append(") ");
                    comments.append(tag.getTagContent()).append("\n");
                    numComments++;
                } else {
                    // Only record other tags when they have at least one attribute
                    Attributes atts = tag.getAttributes();
                    if (atts != null && atts.length() > 0) {
                        others.append(numOthers).append(") ");
                        others.append(name).append(":");
                        others.append(tag.getTagContent()).append("\n");
                        numOthers++;
                    }
                }
            }

            // The buffer is local so it can be reclaimed once the reader
            // holds its own copy of the text.
            StringBuilder out = new StringBuilder();
            out.append(text).append("\n\n");
            out.append("----------NONVISIBLE TEXT----------\n\n"); //NON-NLS
            appendSection(out, "---Scripts---\n", scripts); //NON-NLS
            appendSection(out, "---Links---\n", links); //NON-NLS
            appendSection(out, "---Images---\n", images); //NON-NLS
            appendSection(out, "---Comments---\n", comments); //NON-NLS
            appendSection(out, "---Others---\n", others); //NON-NLS

            // All done, now make it a reader
            reader = new StringReader(out.toString());
        } catch (IOException ex) {
            logger.log(Level.WARNING, "Unable to parse the HTML file", ex); //NON-NLS
        }
    }

    /**
     * Appends a section (header, entries, then a blank line) to the output,
     * or nothing at all when the section has no entries.
     *
     * @param out     buffer receiving the final output
     * @param header  section header line, including its trailing newline
     * @param entries numbered entries collected for the section
     */
    private static void appendSection(StringBuilder out, String header, StringBuilder entries) {
        // Every recorded entry starts with its number, so a non-empty buffer
        // means at least one entry was collected.
        if (entries.length() > 0) {
            out.append(header);
            out.append(entries.toString()).append("\n");
        }
    }

    /**
     * Extracts text from the source using Jericho's Renderer, with "\n" as
     * the newline and with hyperlink URLs, font-style decoration and
     * alternate text all turned off.
     *
     * @param source parsed HTML source
     *
     * @return the rendered plain text
     */
    private String renderHTMLAsPlainText(Source source) {
        Renderer renderer = source.getRenderer();
        renderer.setNewLine("\n");
        renderer.setIncludeHyperlinkURLs(false);
        renderer.setDecorateFontStyles(false);
        renderer.setIncludeAlternateText(false);
        return renderer.toString();
    }
}