package org.iswc.iswc2012main; import java.util.ArrayList; import java.util.List; import java.util.TreeMap; import java.util.TreeSet; public class ToolHtmlParser { private String content =null; private int index = -1; private List<String> result = new ArrayList<String>(); private List<String> listTag = new ArrayList<String>(); public List<String> getResult() { return result; } public void initTag(String tag){ this.listTag.add(tag); } public void run(String content, String tagFirst){ this.content = content; this.index = 0; String tagNext = tagFirst; while (null!=tagNext){ String fragment = extract(tagNext); if (null==fragment) break; tagNext = lookupNextTag(); } } private String lookupNextTag(){ String markupBegin = "<"; String markupEnd= ">"; while (index>=0){ int indexSearchBegin = content.indexOf(markupBegin, index); int indexSearchEnd = content.indexOf(markupEnd, indexSearchBegin); if (indexSearchBegin>=0 && indexSearchEnd>=0){ String szTemp = content.substring(indexSearchBegin, indexSearchEnd+markupEnd.length()); for (String tag : this.listTag){ String pattern = "<"+tag+"[\\s>]"; if (szTemp.matches(pattern)){ return tag; } } //advance index index = indexSearchEnd +1; }else{ index = -1; //no more tag } } return null; } private String extract(String nextTag){ String markupBegin = String.format("<%s", nextTag); String markupEnd= String.format("</%s>", nextTag); int indexSearchBegin = content.indexOf(markupBegin, index); // String temp = content.substring(index); int indexSearchEnd = -1; if (indexSearchBegin>=0){ indexSearchEnd = content.indexOf(markupEnd, index); if (indexSearchEnd>=0){ String fragment = content.substring(indexSearchBegin, indexSearchEnd); //update this.index = indexSearchEnd+ markupEnd.length(); this.result.add(String.format("%s%s%03d%s%s",nextTag, SEPARATOR, this.result.size(), SEPARATOR, fragment)); return fragment; } } return null; } public static final String SEPARATOR = "----"; public static final int IDX_TAG =0; public static final int IDX_LINE =1; public static final int IDX_FRAGMENT =2; public static String[] parseLine(String line){ return line.split(SEPARATOR); } }