/*
* Encog(tm) Core v3.4 - Java Version
* http://www.heatonresearch.com/encog/
* https://github.com/encog/encog-java-core
* Copyright 2008-2016 Heaton Research, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* For more information on Heaton Research copyrights, licenses
* and trademarks visit:
* http://www.heatonresearch.com/copyright
*/
package org.encog.parse.tags.read;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import org.encog.parse.PeekableInputStream;
import org.encog.parse.tags.Tag;
import org.encog.parse.tags.Tag.Type;
import org.encog.parse.tags.TagConst;
/**
* Base class used to read tags. This base class is used by both the XML and
* HTML parsing.
*
* @author jheaton
*
*/
public class ReadTags {
/**
* The bullet character.
*/
public static final int CHAR_BULLET = 149;
/**
* The bullet character.
*/
public static final int CHAR_TRADEMARK = 129;
/**
* Maximum length string to read.
*/
public static final int MAX_LENGTH = 10000;
/**
* A mapping of certain HTML encoded values(i.e. ) to their actual
* character values.
*/
private static Map<String, Character> charMap;
/**
* The stream that we are parsing from.
*/
private final PeekableInputStream source;
/**
* The current HTML tag. Access this property if the read function returns
* 0.
*/
private final Tag tag = new Tag();
/**
* Are we locked, looking for an end tag? Such as the end of a comment?
*/
private String lockedEndTag;
/**
* Does a "fake" end-tag need to be added, because of a compound tag (i.e.
* <br/>)? If so, this will hold a string for that tag.
*/
private String insertEndTag = null;
/**
* The constructor should be passed an InputStream that we will parse from.
*
* @param is
* An InputStream to parse from.
*/
public ReadTags(final InputStream is) {
this.source = new PeekableInputStream(is);
synchronized(ReadTags.class) {
if (ReadTags.charMap == null) {
ReadTags.charMap = new HashMap<String, Character>();
ReadTags.charMap.put("nbsp", ' ');
ReadTags.charMap.put("lt", '<');
ReadTags.charMap.put("gt", '>');
ReadTags.charMap.put("amp", '&');
ReadTags.charMap.put("quot", '\"');
ReadTags.charMap.put("bull", (char) ReadTags.CHAR_BULLET);
ReadTags.charMap.put("trade", (char) ReadTags.CHAR_TRADEMARK);
}
}
}
/**
* Remove any whitespace characters that are next in the InputStream.
*
*/
protected void eatWhitespace() {
while (Character.isWhitespace((char) this.source.peek())) {
this.source.read();
}
}
/**
* Return the last tag found, this is normally called just after the read
* function returns a zero.
*
* @return The last HTML tag found.
*/
public Tag getTag() {
return this.tag;
}
/**
* Checks to see if the next tag is the tag specified.
*
* @param name
* The name of the tag desired.
* @param start
* True if a starting tag is desired.
* @return True if the next tag matches these criteria.
*/
public boolean is(final String name, final boolean start) {
if (!getTag().getName().equals(name)) {
return false;
}
if (start) {
return getTag().getType() == Type.BEGIN;
} else {
return getTag().getType() == Type.END;
}
}
/**
* Parse an attribute name, if one is present.
*
* @return Return the attribute name, or null if none present.
*/
protected String parseAttributeName() {
eatWhitespace();
if ("\"\'".indexOf(this.source.peek()) == -1) {
final StringBuilder buffer = new StringBuilder();
while (!Character.isWhitespace(this.source.peek())
&& (this.source.peek() != '=')
&& (this.source.peek() != '>')
&& (this.source.peek() != -1)) {
final int ch = parseSpecialCharacter();
buffer.append((char) ch);
}
return buffer.toString();
} else {
return (parseString());
}
}
/**
* Parse any special characters(i.e.  );
*
* @return The character that was parsed.
*/
private char parseSpecialCharacter() {
char result = (char) this.source.read();
int advanceBy = 0;
// is there a special character?
if (result == '&') {
int ch = 0;
final StringBuilder buffer = new StringBuilder();
// loop through and read special character
do {
ch = this.source.peek(advanceBy++);
if ((ch != '&') && (ch != ';') && !Character.isWhitespace(ch)) {
buffer.append((char) ch);
}
} while ((ch != ';') && (ch != -1) && !Character.isWhitespace(ch));
final String b = buffer.toString().trim().toLowerCase();
// did we find a special character?
if (b.length() > 0) {
if (b.charAt(0) == '#') {
try {
result = (char) Integer.parseInt(b.substring(1));
} catch (final NumberFormatException e) {
advanceBy = 0;
}
} else {
if (ReadTags.charMap.containsKey(b)) {
result = ReadTags.charMap.get(b);
} else {
advanceBy = 0;
}
}
} else {
advanceBy = 0;
}
}
while (advanceBy > 0) {
read();
advanceBy--;
}
return result;
}
/**
* Called to parse a double or single quote string.
*
* @return The string parsed.
*/
protected String parseString() {
final StringBuilder result = new StringBuilder();
eatWhitespace();
if ("\"\'".indexOf(this.source.peek()) != -1) {
final int delim = this.source.read();
while ((this.source.peek() != delim)
&& (this.source.peek() != -1)) {
if (result.length() > ReadTags.MAX_LENGTH) {
break;
}
final int ch = parseSpecialCharacter();
if ((ch == '\r') || (ch == '\n')) {
continue;
}
result.append((char) ch);
}
if ("\"\'".indexOf(this.source.peek()) != -1) {
this.source.read();
}
} else {
while (!Character.isWhitespace(this.source.peek())
&& (this.source.peek() != -1)
&& (this.source.peek() != '>')) {
result.append(parseSpecialCharacter());
}
}
return result.toString();
}
/**
* Called when a tag is detected. This method will parse the tag.
*
*/
protected void parseTag() {
this.tag.clear();
this.insertEndTag = null;
final StringBuilder tagName = new StringBuilder();
this.source.read();
// Is it a comment?
if (this.source.peek(TagConst.COMMENT_BEGIN)) {
this.source.skip(TagConst.COMMENT_BEGIN.length());
while (!this.source.peek(TagConst.COMMENT_END)) {
final int ch = this.source.read();
if (ch != -1) {
tagName.append((char) ch);
} else {
break;
}
}
this.source.skip(TagConst.COMMENT_END.length());
this.tag.setType(Type.COMMENT);
this.tag.setName(tagName.toString());
return;
}
// Is it CDATA?
if (this.source.peek(TagConst.CDATA_BEGIN)) {
this.source.skip(TagConst.CDATA_BEGIN.length());
while (!this.source.peek(TagConst.CDATA_END)) {
final int ch = this.source.read();
if (ch != -1) {
tagName.append((char) ch);
} else {
break;
}
}
this.source.skip(TagConst.CDATA_END.length());
this.tag.setType(Type.CDATA);
this.tag.setName(tagName.toString());
return;
}
// Find the tag name
while (this.source.peek() != -1) {
// if this is the end of the tag, then stop
if (Character.isWhitespace((char) this.source.peek())
|| (this.source.peek() == '>')) {
break;
}
// if this is both a begin and end tag then stop
if ((tagName.length() > 0) && (this.source.peek() == '/')) {
break;
}
tagName.append((char) this.source.read());
}
eatWhitespace();
if (tagName.charAt(0) == '/') {
this.tag.setName(tagName.substring(1).toString());
this.tag.setType(Tag.Type.END);
} else {
this.tag.setName(tagName.toString());
this.tag.setType(Tag.Type.BEGIN);
}
// get the attributes
while ((this.source.peek() != '>') && (this.source.peek() != -1)) {
final String attributeName = parseAttributeName();
String attributeValue = null;
if (attributeName.equals("/")) {
eatWhitespace();
if (this.source.peek() == '>') {
this.insertEndTag = this.tag.getName();
break;
}
}
// is there a value?
eatWhitespace();
if (this.source.peek() == '=') {
this.source.read();
attributeValue = parseString();
}
this.tag.setAttribute(attributeName, attributeValue);
}
this.source.read();
}
/**
* Check to see if the ending tag is present.
*
* @param name
* The type of end tag being sought.
* @return True if the ending tag was found.
*/
private boolean peekEndTag(final String name) {
int i = 0;
// pass any whitespace
while ((this.source.peek(i) != -1)
&& Character.isWhitespace(this.source.peek(i))) {
i++;
}
// is a tag beginning
if (this.source.peek(i) != '<') {
return false;
} else {
i++;
}
// pass any whitespace
while ((this.source.peek(i) != -1)
&& Character.isWhitespace(this.source.peek(i))) {
i++;
}
// is it an end tag
if (this.source.peek(i) != '/') {
return false;
} else {
i++;
}
// pass any whitespace
while ((this.source.peek(i) != -1)
&& Character.isWhitespace(this.source.peek(i))) {
i++;
}
// does the name match
for (int j = 0; j < name.length(); j++) {
if (Character.toLowerCase(this.source.peek(i)) != Character
.toLowerCase(name.charAt(j))) {
return false;
}
i++;
}
return true;
}
/**
* Read a single character from the HTML source, if this function returns
* zero(0) then you should call getTag to see what tag was found. Otherwise
* the value returned is simply the next character found.
*
* @return The character read, or zero if there is an HTML tag. If zero is
* returned, then call getTag to get the next tag.
*
*/
public int read() {
// handle inserting a "virtual" end tag
if (this.insertEndTag != null) {
this.tag.clear();
this.tag.setName(this.insertEndTag);
this.tag.setType(Type.END);
this.insertEndTag = null;
return 0;
}
// handle locked end tag
if (this.lockedEndTag != null) {
if (peekEndTag(this.lockedEndTag)) {
this.lockedEndTag = null;
} else {
return this.source.read();
}
}
// look for next tag
if (this.source.peek() == '<') {
parseTag();
if ((this.tag.getType() == Tag.Type.BEGIN)
&& (this.tag.getName().equalsIgnoreCase("script")
|| this.tag
.getName().equalsIgnoreCase("style"))) {
this.lockedEndTag = this.tag.getName().toLowerCase();
}
return 0;
} else if (this.source.peek() == '&') {
return parseSpecialCharacter();
} else {
return (this.source.read());
}
}
/**
* Read until we reach the next tag.
*
* @return True if a tag was found, false on EOF.
*/
public boolean readToTag() {
int ch;
while ((ch = read()) != -1) {
if (ch == 0) {
return true;
}
}
return false;
}
/**
* {@inheritDoc}
*/
@Override
public String toString() {
final StringBuilder result = new StringBuilder();
result.append("[ReadTags: currentTag=");
if (this.tag != null) {
result.append(this.tag.toString());
}
result.append("]");
return result.toString();
}
}