/**********************************************************************************
 * $URL: https://source.sakaiproject.org/svn/search/trunk/search-util/src/java/org/sakaiproject/search/util/HTMLParser.java $
 * $Id: HTMLParser.java 105078 2012-02-24 23:00:38Z ottenhoff@longsight.com $
 ***********************************************************************************
 *
 * Copyright (c) 2003, 2004, 2005, 2006, 2007, 2008 The Sakai Foundation
 *
 * Licensed under the Educational Community License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *       http://www.opensource.org/licenses/ECL-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **********************************************************************************/

package org.sakaiproject.search.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Iterates over the text runs of an HTML/XML string.
 * <p>
 * Each call to {@link #next()} consumes the text that precedes the next tag
 * plus the tag itself, and returns that text with character references
 * decoded. Text that lies inside a <code>script</code>, <code>head</code> or
 * <code>style</code> element is replaced by an empty string for as long as the
 * open/close tags seen so far are consistent; once an inconsistent closing tag
 * is met the parser stops tracking elements and returns all text.
 * <p>
 * A tag is taken to end at the first <code>'&gt;'</code> after its
 * <code>'&lt;'</code>; this also applies to comments, so a comment that
 * contains <code>'&gt;'</code> is cut short at that character.
 * <p>
 * Instances hold mutable cursor state and are not safe for concurrent use.
 *
 * @author ieb
 */
public class HTMLParser implements Iterator<String>
{
	private static final Log log = LogFactory.getLog(HTMLParser.class);

	/** Lower-case names of the elements whose text content is suppressed. */
	private static final char[][] IGNORE_TAGS = new char[][] { "script".toCharArray(),
			"head".toCharArray(), "style".toCharArray() };

	/** Classpath location of the <code>name=decimal-code</code> entity table. */
	private static final String ENTITY_RESOURCE = "/org/sakaiproject/search/util/htmlentities.config";

	/**
	 * Named character references, keyed by the full reference text including
	 * the leading '&amp;' and trailing ';'. Populated once by the static
	 * initializer and only read afterwards.
	 */
	private static final Map<String, String> entities = new HashMap<String, String>();

	static
	{
		loadEntities();
	}

	/**
	 * Open-element stack stored as (nameStart, nameEnd) index pairs into
	 * {@link #cbuf}; grown on demand.
	 */
	private int[] elementStack = new int[1024];

	/**
	 * Stack height at or above which text is suppressed. Text is emitted
	 * while <code>ignore &gt; endstack</code>; any value greater than zero
	 * works as the initial "not ignoring" state.
	 */
	private int ignore = Integer.MAX_VALUE;

	/** Set once an inconsistent closing tag is seen; disables element tracking. */
	private boolean notxml = false;

	/** The content being parsed. */
	private char[] cbuf;

	/** Scan position: index of the next '&lt;', or {@link #clen} when none is left. */
	private int current = 0;

	/** Length of {@link #cbuf}. */
	private int clen = 0;

	/** Number of ints in use in {@link #elementStack} (two per open element). */
	private int endstack = 0;

	/** Index of the first character that has not been consumed yet. */
	private int last = 0;

	/**
	 * Creates a parser over the given content.
	 *
	 * @param content
	 *        the markup to parse; <code>null</code> is treated as empty
	 */
	public HTMLParser(String content)
	{
		cbuf = (content == null) ? new char[0] : content.toCharArray();
		current = 0;
		clen = cbuf.length;
	}

	/**
	 * Loads the entity table from {@link #ENTITY_RESOURCE}. Lines starting
	 * with '#' and blank lines are skipped. A malformed line is logged and
	 * skipped on its own instead of aborting the whole load. Never throws, so
	 * that class initialisation cannot fail.
	 */
	private static void loadEntities()
	{
		InputStream in = HTMLParser.class.getResourceAsStream(ENTITY_RESOURCE);
		if (in == null)
		{
			log.error("Unable to load HTML Entities, resource not found: " + ENTITY_RESOURCE);
			return;
		}
		try
		{
			// Explicit charset so the result does not depend on the platform default.
			BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
			for (String line = br.readLine(); line != null; line = br.readLine())
			{
				if (line.startsWith("#") || line.trim().length() == 0)
				{
					continue;
				}
				String[] parts = line.split("=");
				if (parts.length < 2)
				{
					log.warn("Ignoring malformed HTML entity definition: " + line);
					continue;
				}
				try
				{
					int code = Integer.parseInt(parts[1].trim());
					// toChars rejects invalid code points with IllegalArgumentException,
					// of which NumberFormatException is a subclass.
					entities.put(parts[0].trim(), new String(Character.toChars(code)));
				}
				catch (IllegalArgumentException ex)
				{
					log.warn("Ignoring malformed HTML entity definition: " + line, ex);
				}
			}
		}
		catch (Exception ex)
		{
			// Broad on purpose: an exception escaping a static initializer
			// would make the class unusable.
			log.error("Unable to load HTML Entities", ex);
		}
		finally
		{
			try
			{
				in.close();
			}
			catch (IOException ex)
			{
				log.debug("Failed to close " + ENTITY_RESOURCE, ex);
			}
		}
	}

	/**
	 * Returns the first whitespace-delimited word of <code>tag</code>, starting
	 * at offset <code>start</code>.
	 *
	 * @param tag
	 *        the tag text
	 * @param start
	 *        offset of the first character of the name within <code>tag</code>
	 * @return the text from <code>start</code> up to the first whitespace
	 *         character, or to the end of <code>tag</code> if there is none
	 */
	public String getTagName(String tag, int start)
	{
		String rest = tag.substring(start);
		String[] words = rest.split("\\s", 2);
		return (words.length == 0) ? rest : words[0];
	}

	/**
	 * Reports whether unconsumed content remains and positions the scan cursor
	 * on the next '&lt;' (or at the end of the content when only trailing text
	 * is left). Safe to call repeatedly.
	 *
	 * @see java.util.Iterator#hasNext()
	 */
	public boolean hasNext()
	{
		if (last >= clen)
		{
			return false;
		}
		for (int i = current; i < clen; i++)
		{
			if (cbuf[i] == '<')
			{
				current = i;
				return true;
			}
		}
		// No further tag: everything from 'last' to the end is trailing text.
		// The cursor is parked on clen (not clen - 1) so that the final
		// character is part of the text returned by next().
		current = clen;
		return true;
	}

	/**
	 * Consumes the text up to the next tag, and the tag itself, and returns
	 * the text with character references decoded. Returns an empty string when
	 * the text lies inside an ignored element or when two tags are adjacent.
	 *
	 * @return the decoded text preceding the next tag, possibly empty
	 * @throws NoSuchElementException
	 *         if all content has been consumed
	 * @see java.util.Iterator#next()
	 */
	public String next()
	{
		// Also positions 'current', so next() works without a prior hasNext().
		if (!hasNext())
		{
			throw new NoSuchElementException("No more content to parse");
		}

		// Index of the '<' that starts the tag, or clen when only text remains.
		final int tagPos = current;

		// Locate the end of the tag ('>') and the end of the element name
		// (first whitespace inside the tag, else the '>').
		int tagend = clen - 1;
		int elend = -1;
		for (int i = tagPos; i < clen; i++)
		{
			if (elend == -1 && Character.isWhitespace(cbuf[i]))
			{
				elend = i;
			}
			if (cbuf[i] == '>')
			{
				tagend = i;
				if (elend == -1)
				{
					elend = i;
				}
				break;
			}
		}
		if (elend == -1)
		{
			// Unterminated tag with no whitespace: the name runs to the end of input.
			elend = clen;
		}

		final boolean ignoreBefore = !(ignore > endstack);
		boolean ignoreAfter = ignoreBefore;

		final int tagstart = tagPos + 1;
		if (tagstart < clen)
		{
			if (cbuf[tagend - 1] == '/')
			{
				// Self-closing tag: the element stack is unaffected.
			}
			else if (tagstart + 2 < clen && cbuf[tagstart] == '!' && cbuf[tagstart + 1] == '-'
					&& cbuf[tagstart + 2] == '-')
			{
				// Comment: the element stack is unaffected.
			}
			else if (cbuf[tagstart] == '/')
			{
				if (!notxml)
				{
					ignoreAfter = closeElement(tagstart + 1, elend, ignoreAfter);
				}
			}
			else if (!notxml)
			{
				ignoreAfter = openElement(tagstart, elend, ignoreAfter);
			}
		}

		// Evaluated after tag handling on purpose: a closing tag that has just
		// switched the parser to notxml releases the text that precedes it.
		String t = "";
		if (notxml || !ignoreBefore)
		{
			t = decodeEntities(last, tagPos);
		}

		last = tagend + 1;
		current = last;
		if (ignoreAfter)
		{
			if (!ignoreBefore)
			{
				// Just entered an ignored element: remember its stack height.
				ignore = endstack;
			}
		}
		else
		{
			ignore = endstack + 2;
		}
		return t;
	}

	/**
	 * Pushes the element named by <code>cbuf[nameStart, nameEnd)</code>.
	 *
	 * @return the new "ignore following text" state
	 */
	private boolean openElement(int nameStart, int nameEnd, boolean ignoreAfter)
	{
		ensureStackCapacity(endstack + 2);
		elementStack[endstack] = nameStart;
		elementStack[endstack + 1] = nameEnd;
		endstack += 2;
		return ignoreAfter || isIgnoredTag(nameStart, nameEnd);
	}

	/**
	 * Handles a closing tag named by <code>cbuf[nameStart, nameEnd)</code>:
	 * pops the top element when it is accepted as a match, otherwise flags the
	 * content as not well-formed.
	 *
	 * @return the new "ignore following text" state
	 */
	private boolean closeElement(int nameStart, int nameEnd, boolean ignoreAfter)
	{
		if (endstack < 2)
		{
			// Closing tag with no open element: the content is not well-formed.
			notxml = true;
			return ignoreAfter;
		}
		final int openStart = elementStack[endstack - 2];
		final int openEnd = elementStack[endstack - 1];
		final int len = nameEnd - nameStart;

		// NOTE(review): names of different length are accepted as a match;
		// only same-length names that differ in a character count as a
		// mismatch. This leniency is what lets unclosed tags such as <br>
		// be popped by the next closing tag, so it is kept as found.
		// Confirm whether it is intended.
		boolean match = true;
		if (len == openEnd - openStart)
		{
			for (int i = 0; i < len; i++)
			{
				if (Character.toLowerCase(cbuf[nameStart + i]) != Character
						.toLowerCase(cbuf[openStart + i]))
				{
					match = false;
					break;
				}
			}
		}
		if (!match)
		{
			notxml = true;
			return ignoreAfter;
		}
		endstack -= 2;
		return !(ignore > endstack);
	}

	/** Grows the element stack so that it can hold <code>required</code> ints. */
	private void ensureStackCapacity(int required)
	{
		if (required > elementStack.length)
		{
			int[] grown = new int[Math.max(required, elementStack.length * 2)];
			System.arraycopy(elementStack, 0, grown, 0, endstack);
			elementStack = grown;
		}
	}

	/** Tests <code>cbuf[nameStart, nameEnd)</code> against {@link #IGNORE_TAGS}, ignoring case. */
	private boolean isIgnoredTag(int nameStart, int nameEnd)
	{
		final int len = nameEnd - nameStart;
		for (int i = 0; i < IGNORE_TAGS.length; i++)
		{
			char[] candidate = IGNORE_TAGS[i];
			if (candidate.length != len)
			{
				continue;
			}
			boolean same = true;
			for (int j = 0; j < len; j++)
			{
				if (candidate[j] != Character.toLowerCase(cbuf[nameStart + j]))
				{
					same = false;
					break;
				}
			}
			if (same)
			{
				return true;
			}
		}
		return false;
	}

	/**
	 * Copies <code>cbuf[from, to)</code>, replacing numeric and known named
	 * character references with the characters they denote. Anything that
	 * starts with '&amp;' but is not a recognisable reference (bare ampersand,
	 * unknown name, malformed number) is copied through unchanged, so no text
	 * is lost.
	 */
	private String decodeEntities(int from, int to)
	{
		StringBuilder sb = new StringBuilder(to - from);
		int i = from;
		while (i < to)
		{
			char c = cbuf[i];
			if (c == '&')
			{
				int semi = indexOf(';', i + 1, to);
				if (semi != -1)
				{
					// semi > i guarantees that cbuf[i + 1] is inside the run.
					if (cbuf[i + 1] == '#')
					{
						int codePoint = parseCodePoint(new String(cbuf, i + 2, semi - (i + 2)));
						if (codePoint >= 0)
						{
							sb.appendCodePoint(codePoint);
							i = semi + 1;
							continue;
						}
					}
					else
					{
						String replacement = entities.get(new String(cbuf, i, semi - i + 1));
						if (replacement != null)
						{
							sb.append(replacement);
							i = semi + 1;
							continue;
						}
					}
				}
				// Not a reference: fall through and keep the '&' literally.
				// Scanning resumes at the next character, so a real reference
				// later in the same run is still decoded.
			}
			sb.append(c);
			i++;
		}
		return sb.toString();
	}

	/**
	 * Parses the body of a numeric character reference (the part between
	 * "&amp;#" and ";"): decimal digits, or 'x'/'X' followed by hex digits.
	 *
	 * @return the code point, or -1 if the text is not a valid reference
	 */
	private static int parseCodePoint(String body)
	{
		int radix = 10;
		String digits = body;
		if (body.length() > 0 && (body.charAt(0) == 'x' || body.charAt(0) == 'X'))
		{
			radix = 16;
			digits = body.substring(1);
		}
		// Reject empty bodies and signs, which parseInt would otherwise accept.
		if (digits.length() == 0 || Character.digit(digits.charAt(0), radix) < 0)
		{
			return -1;
		}
		try
		{
			int codePoint = Integer.parseInt(digits, radix);
			return Character.isValidCodePoint(codePoint) ? codePoint : -1;
		}
		catch (NumberFormatException ex)
		{
			return -1;
		}
	}

	/** Returns the index of the first <code>c</code> in <code>cbuf[from, to)</code>, or -1. */
	private int indexOf(char c, int from, int to)
	{
		for (int i = from; i < to; i++)
		{
			if (cbuf[i] == c)
			{
				return i;
			}
		}
		return -1;
	}

	/**
	 * Removal is not supported by this iterator.
	 *
	 * @throws UnsupportedOperationException
	 *         always
	 * @see java.util.Iterator#remove()
	 */
	public void remove()
	{
		throw new UnsupportedOperationException("remove");
	}
}