package;

import android.util.Pair;
import;

/**
 * A very small, cursor-based HTML scanner built on {@link String#indexOf} and
 * {@link String#lastIndexOf}. It keeps a single {@code position} inside the HTML string and
 * moves it forward as tags and text are skipped or read.
 *
 * A position of {@code -1} is the marker for "end of content" (see {@link #isEndOfContent()}).
 * Parsing failures are reported with {@code JbmnplsParsingException}.
 */
public class SimpleHtmlParser {
    private int position;
    private final String html;

    /**
     * Creates a parser that starts at the beginning of the HTML.
     * @param html the HTML to scan
     */
    public SimpleHtmlParser(String html) {
        this(html, 0);
    }

    /**
     * Creates a parser that starts at the given position.
     * @param html the HTML to scan
     * @param pos the initial position
     */
    public SimpleHtmlParser(String html, int pos) {
        this.position = pos;
        this.html = html;
    }

    /**
     * @return the current position, or -1 when the end of content was reached
     */
    public int getPosition() {
        return position;
    }

    /**
     * Moves the cursor. A position at or beyond the end of the HTML is stored as -1.
     * @param newPosition the position to move to
     */
    public void setPosition(int newPosition) {
        if (html.length() <= newPosition) {
            position = -1;
        } else {
            position = newPosition;
        }
    }

    /**
     * @return true when the position is the end-of-content marker (-1)
     */
    public boolean isEndOfContent() {
        return position == -1;
    }

    /**
     * Skips the given number of {@code td} elements, starting at the current position,
     * without parsing their insides.
     * @param numberOf how many columns to skip
     * @throws JbmnplsParsingException when a column to skip does not exist
     */
    public void skipColumns(int numberOf) {
        for (int i = 0; i < numberOf; i++) {
            skipTag("td");
            // skipTag() already throws when the tag is missing; this check is kept as a
            // safety net in case skipTag() is overridden.
            if (position == -1) {
                throw new JbmnplsParsingException("Cannot skip column when no columns left.");
            }
        }
    }

    /**
     * Finds the next opening of the tag at or after the current position, then its closing
     * tag, and moves the position just past the closing tag. When either lookup fails the
     * position is left at -1 and an exception is thrown.
     * @param tag the tag name, without angle brackets
     * @throws JbmnplsParsingException when the opening or the closing tag cannot be found
     */
    public void skipTag(String tag) {
        String open = "<" + tag, closing = "</" + tag + ">";
        position = html.indexOf(open, position);
        if (position == -1) {
            throw new JbmnplsParsingException("Cannot skip tag because open " + tag + " doesnt exist.");
        }
        position = html.indexOf(closing, position);
        if (position == -1) {
            throw new JbmnplsParsingException("Cannot skip tag because closing " + tag + " doesnt exist.");
        }
        position += closing.length();
    }

    /**
     * Used to find the text inside the element (goes further down the children until it
     * reaches text). This is not smart enough to pick up trailing text after embedded
     * elements or the same element nested in itself (such as a div inside another div).
     * The position is moved past the closing tag of the element that was read.
     * @param tag the tag name to look for from the current position
     * @return text
     * @throws JbmnplsParsingException when the element cannot be found
     */
    public String getTextInNextElement(String tag) {
        return getTextInNextElement(html, tag, position);
    }

    /**
     * Recursively crawls an element's children and returns its text. Not smart enough for
     * major parsing but simple enough for plain tables.
     * @param text the markup to search in
     * @param tag the tag name to look for
     * @param pos where in {@code text} to start searching
     * @return text in the child node; empty when the element has no content
     * @throws JbmnplsParsingException when the element cannot be found
     */
    private String getTextInNextElement(String text, String tag, int pos) {
        Pair<Integer, String> result = htmlInTag(text, tag, pos);
        if (result == null) {
            throw new JbmnplsParsingException("Cannot find " + tag + " in html.");
        }
        int holdPosition = result.first;
        String inner = result.second;

        // Guard the charAt(0): an empty element (e.g. <td></td>) has no first character.
        if (inner.length() > 0 && inner.charAt(0) == '<') {
            // The content starts with a child element, descend into it.
            String childTag = findCurrentTag(inner, 0);
            inner = getTextInNextElement(inner, childTag, 0);
        } else {
            // NOTE(review): the pattern literal is reproduced exactly as found in the file;
            // confirm it is the intended character/entity to strip.
            inner = inner.replaceAll(" ", "").trim();
        }

        // The recursive call moved the position relative to the inner markup; restore the
        // position that belongs to the outermost element.
        position = holdPosition;
        return inner;
    }

    /**
     * Finds the attribute's value inside the current element. As a side effect the position
     * is moved back to the start of the element's opening tag.
     * @param attribute to find
     * @return the value inside the attribute or null if the attribute does not exist
     * @throws JbmnplsParsingException when no tag precedes the position or the attribute
     *         value is not a properly quoted string
     */
    public String getAttributeInCurrentElement(String attribute) {
        int lessThan = html.lastIndexOf("<", position);
        if (lessThan == -1) {
            throw new JbmnplsParsingException("Cannot find attribute in current element");
        }
        position = lessThan;

        // See if we are in the closing tag, go to the last opening tag of the same tag
        if (position + 1 < html.length() && html.charAt(position + 1) == '/') {
            String tag = findCurrentTag(html, position);
            position = html.lastIndexOf("<" + tag, position);
            if (position == -1) {
                throw new JbmnplsParsingException("Cannot find attribute in current element");
            }
        }

        // Find either the end of the tag or the attribute, whichever comes first
        Pair<Integer, String> result = indexOfFirstOccurance(html, position, attribute + "=", ">");
        if (result == null) {
            return null;
        }
        // The tag closed before the attribute showed up: the attribute is not in this tag
        if (">".equals(result.second)) {
            return null;
        }
        int attrStart = result.first + result.second.length();
        if (attrStart >= html.length()) {
            throw new JbmnplsParsingException("Cannot find attribute in current element. (Cannot parse attribute)");
        }

        // See what type of quotes it is using and find the other quote that surrounds the value
        char quoteChar = html.charAt(attrStart);
        if (quoteChar != '"' && quoteChar != '\'') {
            throw new JbmnplsParsingException("Cannot find attribute in current element. (Cannot parse attribute)");
        }
        attrStart++;
        int attrEnd = html.indexOf(quoteChar, attrStart);
        if (attrEnd == -1) {
            throw new JbmnplsParsingException("Cannot find attribute in current element. (Cannot parse attribute)");
        }
        return html.substring(attrStart, attrEnd);
    }

    /**
     * Finds the text inside the current html tag. Like getTextInNextElement, it will
     * recursively look for the text inside the tag. The current tag is the one found by
     * searching backwards for a "&lt;" from the current position.
     * @return text inside the current element
     * @throws JbmnplsParsingException when no tag precedes the position
     */
    public String getTextInCurrentElement() {
        int lessThan = html.lastIndexOf("<", position);
        if (lessThan == -1) {
            throw new JbmnplsParsingException("Cannot find text in current element");
        }
        position = lessThan;
        String tag = findCurrentTag(html, position);
        return getTextInNextElement(tag);
    }

    /**
     * Gets the tag name at the given position inside the text.
     * If you call this within text (not an html tag), it will look at the last tag before it.
     * If you call this within a tag definition, it will find the name of that tag.
     * This is not smart for complex html, you must use this with valid html syntax.
     * @param text the markup to look in
     * @param pos position to search backwards from
     * @return tag name (without the leading slash for a closing tag)
     * @throws JbmnplsParsingException when no tag can be found
     */
    private String findCurrentTag(String text, int pos) {
        int lessThan = text.lastIndexOf("<", pos);
        if (lessThan == -1 || text.length() <= lessThan + 1) {
            throw new JbmnplsParsingException("Cannot find last tag in html.");
        }

        // If captured the ending tag then skip the slash but find the tag name
        if (text.charAt(lessThan + 1) == '/') {
            lessThan++;
        }

        // The name ends at the first space (attributes follow) or at the end of the tag
        Pair<Integer, String> result = indexOfFirstOccurance(text, lessThan, " ", ">");
        if (result == null) {
            throw new JbmnplsParsingException("Cannot find last tag in html.");
        }
        return text.substring(lessThan + 1, result.first);
    }

    /**
     * Gets the text inside the next td element; shorthand for
     * {@code getTextInNextElement("td")}.
     * @return String
     * @throws JbmnplsParsingException when no further td element exists
     */
    public String getTextInNextTD() {
        return getTextInNextElement("td");
    }

    /**
     * Moves the position after the searched text and returns the new position.
     * If you have multiple texts to skip, then add them as arguments; order does matter
     * as it will go from one text to the next.
     * @param textArr the texts to skip, in order
     * @return position after the last skipped text
     * @throws JbmnplsParsingException when one of the texts is not found
     */
    public int skipText(String... textArr) {
        for (int i = 0; i < textArr.length; i++) {
            String text = textArr[i];
            int index = html.indexOf(text, position);
            if (index == -1) {
                throw new JbmnplsParsingException("Cannot find " + text + " in html.");
            }
            position = index + text.length();
        }
        return position;
    }

    /**
     * Finds the content of the next occurrence of the tag in the given text. Does not touch
     * the parser position; the position just past the closing tag is returned instead.
     * @param text the markup to search in
     * @param tag the tag name to look for
     * @param pos where in {@code text} to start searching
     * @return Pair (of position after the closing tag and the inner markup), null if not found
     */
    private Pair<Integer, String> htmlInTag(String text, String tag, int pos) {
        String open = "<" + tag, closing = "</" + tag + ">";
        int start = text.indexOf(open, pos);
        if (start == -1) {
            return null;
        }
        // Jump to the end of the opening tag (skips over any attributes)
        start = text.indexOf(">", start);
        if (start == -1) {
            return null;
        }
        int end = text.indexOf(closing, start);
        if (end == -1) {
            return null;
        }
        String inner = text.substring(start + 1, end);
        return new Pair<Integer, String>(end + closing.length(), inner);
    }

    /**
     * This internal function will find the first occurrence of one of the specified strings
     * passed in.
     * For example, if you pass indexOfFirstOccurance(text, 0, "foo", "bar", "thing");
     * it will look in the text for each and return the position and string that appears first.
     *
     * For a sentence like "I am Matthew and I like foo and bar with thing"
     * the first occurrence would be "foo" at index 24.
     *
     * Strings that do not occur in the text are ignored; when two strings are found at the
     * same index the one listed first wins.
     * @param text This is the string to search
     * @param indexFrom Like indexOf, this index is where searching starts from
     * @param strings A list of words to search
     * @return a pair of the position and found text, null if cannot find any
     */
    private Pair<Integer, String> indexOfFirstOccurance(String text, int indexFrom, String... strings) {
        int bestPosition = -1;
        String bestMatch = null;
        for (String candidate : strings) {
            int found = text.indexOf(candidate, indexFrom);
            // A miss (-1) must never be treated as "earliest"; only real hits compete.
            if (found != -1 && (bestMatch == null || found < bestPosition)) {
                bestPosition = found;
                bestMatch = candidate;
            }
        }

        // Could not find any of the strings in the text
        if (bestMatch == null) {
            return null;
        }
        return new Pair<Integer, String>(bestPosition, bestMatch);
    }
}