package org.iswc.iswc2012main; import java.io.File; import java.util.ArrayList; import java.util.List; import java.util.TreeMap; import org.iswc.iswc2012main.Config.FILE; import sw4j.util.Sw4jException; import sw4j.util.ToolIO; public class TaskParseHtml { public static void main(String[] args) { run(); } public static void test(){ debug =true; try { ToolIO.pipeStringToFile(getCsvHeader(), Config.FILE.csv_paper_cleanup.getFile(), false, false); runOne(Config.FILE.html_industry, 12, Config.FILE.csv_paper_cleanup); } catch (Sw4jException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void run(){ try { ToolIO.pipeStringToFile(getCsvHeader(), Config.FILE.csv_paper_cleanup.getFile(), false, false); if (!runOne(Config.FILE.html_research, 41, Config.FILE.csv_paper_cleanup)){ return; } if (!runOne(Config.FILE.html_inuse, 17, Config.FILE.csv_paper_cleanup)){ return; } if (!runOne(Config.FILE.html_evaluation, 8, Config.FILE.csv_paper_cleanup)){ return; } if (!runOne(Config.FILE.html_doctoral_consortium, 15, Config.FILE.csv_paper_cleanup)){ return; } if (!runOne(Config.FILE.html_poster_demo, 31, Config.FILE.csv_paper_cleanup)){ return; } if (!runOne(Config.FILE.html_industry, 12, Config.FILE.csv_paper_cleanup)){ return; } if (!runOne(Config.FILE.html_semantic_web_challenge, 24, Config.FILE.csv_paper_cleanup)){ return; } logInfo("All Done!"); } catch (Sw4jException e) { // TODO Auto-generated catch block e.printStackTrace(); } } static boolean debug = false; private static void logInfo(Object o){ System.out.println(o); } private static void logV(Object o){ if (debug){ System.out.println(o); } } public static String getCsvHeader(){ String line = ""; for (EnumPaper p: EnumPaper.values()){ line += String.format("%s,", p); } line+="EOL\n"; return line; } private static boolean runOne(FILE input, int expectedSize, FILE output) throws Sw4jException { List<TreeMap<EnumPaper,String>> ret = null; if (Config.FILE.html_doctoral_consortium.equals(input)){ ret = parseHtmlDc(input.getFile()); }else if (Config.FILE.html_poster_demo.equals(input)){ ret = parseHtmlPd(input.getFile()); }else if (Config.FILE.html_inuse.equals(input)){ ret = parseHtmlRegular(input.getFile()); }else if (Config.FILE.html_evaluation.equals(input)){ ret = parseHtmlRegular(input.getFile()); }else if (Config.FILE.html_research.equals(input)){ ret = parseHtmlRegular(input.getFile()); }else if (Config.FILE.html_industry.equals(input)){ ret = parseHtmlIndustry(input.getFile()); }else if (Config.FILE.html_semantic_web_challenge.equals(input)){ ret = parseHtmlSwc(input.getFile()); } logInfo("=============final list of paper ("+ret.size()+ ") for "+ input.name()+" ================="); for (TreeMap<EnumPaper,String> p: ret){ logInfo(p); } //validate if (ret.size()!=expectedSize){ logInfo("FAILED"); System.exit(0); return false; } //save for (TreeMap<EnumPaper, String> paper: ret){ //append track key paper.put(EnumPaper.track, input.getKeyTrack()); //append local pdf String urlPdf = paper.get(EnumPaper.paperPdfLink); if (null!=urlPdf && urlPdf.length()>0){ String filenamePdf = urlPdf.substring(urlPdf.lastIndexOf("/")+1); paper.put(EnumPaper.paperPdfLinkFile, "pdf/"+filenamePdf); File f = new File (Config.PATH.local_stick_pdf.getFile(), filenamePdf); if (!f.exists()){ File fSubmission = new File (Config.PATH.local_iswc2012submission.getFile(), filenamePdf); if (fSubmission.exists()){ fSubmission.renameTo(f); }else{ logInfo("missing file "+ f.getName()); System.exit(0); } } } String line = ""; for (EnumPaper p: EnumPaper.values()){ String value = paper.get(p); if (null==value) value=""; else value= value.replaceAll("\"", "\\\""); line += String.format("\"%s\",", value); } line+="\n"; ToolIO.pipeStringToFile(line, output.getFile(), false, true); } return true; } enum StateRegular{ ready(""), sessionTimeTitle ("h3"), sessionChair("p"), paper("li"), finish(""), groupH3("h3"), paperDc("li"), sessionTimeTitleIndustry("h2"), paperIndustry("li"), groupH2("h2"), paperSwc("li"), ; String tag; StateRegular(String mark){ this.tag = mark; } } private static List<TreeMap<EnumPaper,String>> parseHtmlRegular(File f) throws Sw4jException{ String content = ToolIO.pipeFileToString(f); //parse html ToolHtmlParser parser = new ToolHtmlParser(); parser.initTag(StateRegular.sessionTimeTitle.tag); parser.initTag(StateRegular.sessionChair.tag); parser.initTag(StateRegular.paper.tag); parser.run(content, StateRegular.sessionTimeTitle.tag); //extact results StateRegular state = StateRegular.ready; List<TreeMap<EnumPaper,String>> listPaper = new ArrayList<TreeMap<EnumPaper,String>>(); TreeMap<EnumPaper,String> paper = null; int indexPaperInSession=0; int indexPaperInTrack =0; int indexSessionInTrack=0; for (String line: parser.getResult()){ String [] lineData = ToolHtmlParser.parseLine(line); //skip if (line.indexOf("http://www.w3.org/")>=0){ continue; } if (StateRegular.ready.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.sessionTimeTitle, paperNew )){ state = StateRegular.sessionTimeTitle; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } }else if (StateRegular.sessionTimeTitle.equals(state)){ if (tryNextState (lineData, StateRegular.sessionChair, paper)){ state = StateRegular.sessionChair; //do nothing } }else if (StateRegular.sessionChair.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paper, paper)){ state = StateRegular.paper; paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); } }else if (StateRegular.paper.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paper, paper)){ paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); }else if (tryNextState (lineData, StateRegular.sessionTimeTitle, paperNew)){ state = StateRegular.sessionTimeTitle; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } } } return listPaper; } private static String formatIndex(int value) { return String.format("%02d", value); } private static List<TreeMap<EnumPaper,String>> parseHtmlDc(File f) throws Sw4jException{ String content = ToolIO.pipeFileToString(f); //parse html ToolHtmlParser parser = new ToolHtmlParser(); parser.initTag(StateRegular.groupH3.tag); parser.initTag(StateRegular.paperDc.tag); parser.run(content, StateRegular.groupH3.tag); //extact results StateRegular state = StateRegular.ready; int indexSessionInTrack=0; int indexPaperInSession=0; int indexPaperInTrack =0; List<TreeMap<EnumPaper,String>> listPaper = new ArrayList<TreeMap<EnumPaper,String>>(); TreeMap<EnumPaper,String> paper = null; for (String line: parser.getResult()){ String [] lineData = ToolHtmlParser.parseLine(line); if (line.indexOf("Whittier")>0) continue; //skip line if (StateRegular.ready.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.groupH3, paperNew )){ state = StateRegular.groupH3; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } }else if (StateRegular.groupH3.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paperDc, paper)){ state = StateRegular.paperDc; paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); } }else if (StateRegular.paperDc.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paperDc, paper)){ paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); }else if (tryNextState (lineData, StateRegular.groupH3, paperNew)){ state = StateRegular.groupH3; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } } } return listPaper; } private static List<TreeMap<EnumPaper,String>> parseHtmlIndustry(File f) throws Sw4jException{ String content = ToolIO.pipeFileToString(f); //parse html ToolHtmlParser parser = new ToolHtmlParser(); parser.initTag(StateRegular.sessionTimeTitleIndustry.tag); parser.initTag(StateRegular.sessionChair.tag); parser.initTag(StateRegular.paperIndustry.tag); parser.run(content, StateRegular.sessionTimeTitleIndustry.tag); //extact results StateRegular state = StateRegular.ready; int indexSessionInTrack=0; int indexPaperInSession=0; int indexPaperInTrack =0; List<TreeMap<EnumPaper,String>> listPaper = new ArrayList<TreeMap<EnumPaper,String>>(); TreeMap<EnumPaper,String> paper = null; for (String line: parser.getResult()){ String [] lineData = ToolHtmlParser.parseLine(line); //skip special line if (line.indexOf("ISWC2012")>0) continue; if (StateRegular.ready.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.sessionTimeTitleIndustry, paperNew )){ state = StateRegular.sessionTimeTitleIndustry; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } }else if (StateRegular.sessionTimeTitleIndustry.equals(state)){ if (tryNextState (lineData, StateRegular.sessionChair, paper)){ state = StateRegular.sessionChair; //do nothing } }else if (StateRegular.sessionChair.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paperIndustry, paper)){ state = StateRegular.paperIndustry; paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); } }else if (StateRegular.paperIndustry.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paperIndustry, paper)){ paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); }else if (tryNextState (lineData, StateRegular.sessionTimeTitleIndustry, paperNew)){ state = StateRegular.sessionTimeTitleIndustry; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } } } return listPaper; } private static List<TreeMap<EnumPaper,String>> parseHtmlPd(File f) throws Sw4jException{ String content = ToolIO.pipeFileToString(f); //parse html ToolHtmlParser parser = new ToolHtmlParser(); parser.initTag(StateRegular.groupH3.tag); parser.initTag(StateRegular.paper.tag); parser.run(content, StateRegular.groupH3.tag); //extact results StateRegular state = StateRegular.ready; int indexSessionInTrack=0; int indexPaperInSession=0; int indexPaperInTrack =0; List<TreeMap<EnumPaper,String>> listPaper = new ArrayList<TreeMap<EnumPaper,String>>(); TreeMap<EnumPaper,String> paper = null; for (String line: parser.getResult()){ String [] lineData = ToolHtmlParser.parseLine(line); //skip special line if (StateRegular.ready.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.groupH3, paperNew )){ state = StateRegular.groupH3; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } }else if (StateRegular.groupH3.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paper, paper)){ state = StateRegular.paper; paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); } }else if (StateRegular.paper.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paper, paper)){ paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); }else if (tryNextState (lineData, StateRegular.groupH3, paperNew)){ state = StateRegular.groupH3; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } } } return listPaper; } private static List<TreeMap<EnumPaper,String>> parseHtmlSwc(File f) throws Sw4jException{ String content = ToolIO.pipeFileToString(f); //parse html ToolHtmlParser parser = new ToolHtmlParser(); parser.initTag(StateRegular.groupH2.tag); parser.initTag(StateRegular.paperSwc.tag); parser.run(content, StateRegular.groupH2.tag); //extact results StateRegular state = StateRegular.ready; int indexSessionInTrack=0; int indexPaperInSession=0; int indexPaperInTrack =0; List<TreeMap<EnumPaper,String>> listPaper = new ArrayList<TreeMap<EnumPaper,String>>(); TreeMap<EnumPaper,String> paper = null; for (String line: parser.getResult()){ String [] lineData = ToolHtmlParser.parseLine(line); //skip special line if (line.indexOf("Submissions")>0) continue; //skip line if (StateRegular.ready.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.groupH2, paperNew )){ state = StateRegular.groupH2; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } }else if (StateRegular.groupH2.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paperSwc, paper)){ state = StateRegular.paperSwc; paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); } }else if (StateRegular.paperSwc.equals(state)){ TreeMap<EnumPaper,String> paperNew = new TreeMap<EnumPaper,String>(); if (tryNextState (lineData, StateRegular.paperSwc, paper)){ paperNew = new TreeMap<EnumPaper,String>(); paperNew.putAll(paper); listPaper.add(paperNew); indexPaperInSession++; indexPaperInTrack++; paperNew.put(EnumPaper.paperIndexInTrack, formatIndex(indexPaperInTrack)); paperNew.put(EnumPaper.paperIndexInSession, formatIndex(indexPaperInSession)); paperNew.put(EnumPaper.sessionIndexInTrack, formatIndex(indexSessionInTrack)); }else if (tryNextState (lineData, StateRegular.groupH2, paperNew)){ state = StateRegular.groupH2; paper = paperNew; indexSessionInTrack++; indexPaperInSession =0; } } } return listPaper; } private static boolean tryNextState(String [] lineData, StateRegular stateNext, TreeMap<EnumPaper,String> paper){ if (stateNext.tag.equals(lineData[ToolHtmlParser.IDX_TAG])){ String fragment = lineData[ToolHtmlParser.IDX_FRAGMENT]; logV("[FRAGMENT]"+fragment); String temp = fragment; if (StateRegular.sessionTimeTitle.equals(stateNext)){ temp = filterSessionTitle(temp); logV(temp); //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length==4){ int indexTemp =0; paper.put(EnumPaper.sessionTimeStart, aryTemp[indexTemp]); indexTemp++; paper.put(EnumPaper.sessionTimeEnd, aryTemp[indexTemp]); indexTemp++; paper.put(EnumPaper.sessionTitle, aryTemp[indexTemp]); indexTemp++; paper.put(EnumPaper.sessionRoom, aryTemp[indexTemp]); indexTemp++; return true; }else if (aryTemp.length==3){ int indexTemp =0; paper.put(EnumPaper.sessionTimeStart, aryTemp[indexTemp]); indexTemp++; int indexSeparator = aryTemp[indexTemp].indexOf(" "); paper.put(EnumPaper.sessionTimeEnd, aryTemp[indexTemp].substring(0, indexSeparator)); paper.put(EnumPaper.sessionTitle, aryTemp[indexTemp].substring(indexSeparator).trim()); indexTemp++; paper.put(EnumPaper.sessionRoom, aryTemp[indexTemp]); return true; }else{ for (char c :temp.toCharArray()){ logV( (int)c+" - "+ c); } logV("Teminated due to mismatch -- " + stateNext); return false; } }else if (StateRegular.sessionTimeTitleIndustry.equals(stateNext)){ temp = temp.replaceAll(": ","|"); temp = filterSessionTitle(temp); logV(temp); //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length==4){ int indexTemp =0; paper.put(EnumPaper.sessionTimeStart, aryTemp[indexTemp]); indexTemp++; paper.put(EnumPaper.sessionTimeEnd, aryTemp[indexTemp]); indexTemp++; paper.put(EnumPaper.sessionTitle, aryTemp[indexTemp]); indexTemp++; paper.put(EnumPaper.sessionRoom, aryTemp[indexTemp]); indexTemp++; return true; }else{ for (char c :temp.toCharArray()){ logV( (int)c+" - "+ c); } logV("Teminated due to mismatch -- " + stateNext); return false; } }else if (StateRegular.sessionChair.equals(stateNext)){ temp = temp.substring(temp.indexOf(":")+1); temp =temp.trim(); logV(temp); //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length==1){ int indexTemp =0; paper.put(EnumPaper.sessionChair, aryTemp[indexTemp]); indexTemp++; return true; }else{ logV("Teminated due to mismatch -- " + stateNext); return false; } }else if (StateRegular.paper.equals(stateNext)){ temp = temp.replaceAll("[\\(\\)]+", "|"); //spotlight temp = filterPaper(temp); logV(temp); //skip special presentation if (temp.indexOf("Special presentation")>=0) return false; //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length==3 ){ int indexTemp =0; paper.put(EnumPaper.paperPdfLink, cleanUrl(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperTitle, cleanTitle(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperAuthorList, cleanAuthor(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperSpotlight, ""); return true; }else if (aryTemp.length==4 ){ int indexTemp =0; paper.put(EnumPaper.paperPdfLink, cleanUrl(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperTitle, cleanTitle(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperSpotlight, aryTemp[indexTemp]); indexTemp++; paper.put(EnumPaper.paperAuthorList, cleanAuthor(aryTemp[indexTemp])); indexTemp++; return true; }else{ for (char c :temp.toCharArray()){ logV( (int)c+" - "+ c); } logV("Teminated due to mismatch -- " + stateNext); return false; } }else if (StateRegular.paperIndustry.equals(stateNext)){ temp = temp.replaceAll("[\\(\\)]+", "|"); temp = filterPaper(temp); logV(temp); //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length==3 ){ int indexTemp =0; paper.put(EnumPaper.paperTitle, cleanTitle(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperAuthorList, cleanAuthor(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperAuthorAffiliation, aryTemp[indexTemp]); indexTemp++; return true; }else{ for (char c :temp.toCharArray()){ logV( (int)c+" - "+ c); } logV("Teminated due to mismatch -- " + stateNext); return false; } }else if (StateRegular.groupH3.equals(stateNext) || StateRegular.groupH2.equals(stateNext)){ temp =filterSessionTitle(temp); logV(temp); //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length==1){ int indexTemp =0; paper.put(EnumPaper.group, aryTemp[indexTemp]); indexTemp++; return true; }else{ logV("Teminated due to mismatch -- " + stateNext); return false; } }else if (StateRegular.paperDc.equals(stateNext)){ temp = filterPaperDc(temp); logV(temp); //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length==2){ int indexTemp =0; paper.put(EnumPaper.paperAuthorList, cleanAuthor(aryTemp[indexTemp])); indexTemp++; paper.put(EnumPaper.paperTitle, cleanTitle(aryTemp[indexTemp])); indexTemp++; return true; }else{ logV("Teminated due to mismatch -- " + stateNext); return false; } }else if (StateRegular.paperSwc.equals(stateNext)){ temp = filterPaper(temp); logV(temp); //update paper String [] aryTemp = temp.split("\\|"); if (aryTemp.length >= 5){ paper.put(EnumPaper.paperDemoLink, cleanUrl(aryTemp[0])); paper.put(EnumPaper.paperAuthorList, cleanAuthor(aryTemp[aryTemp.length-3])); paper.put(EnumPaper.paperPdfLink, cleanUrl(aryTemp[aryTemp.length-2])); paper.put(EnumPaper.paperTitle, cleanTitle(aryTemp[aryTemp.length-1])); return true; }else{ logV("Teminated due to mismatch -- " + stateNext); return false; } } } // logV("Teminated due to failed matching -- " + stateNext); return false; } private static String filterSessionTitle(String line){ String temp = line; temp = temp.replaceAll("-", "|"); temp = temp.replaceAll("[\\(\\)]+", "|"); temp = filterPaper(temp); return temp; } private static String filterPaperDc(String line){ String temp = line; temp = temp.replaceAll("\\.", "|"); temp = filterPaper(temp); return temp; } private static String filterPaper(String line){ String temp = line; temp = temp.replaceAll("[\\s\\xA0]+", " "); temp = temp.replaceAll("<a href=\"", "|"); temp = temp.replaceAll("<a href='", "|"); temp = temp.replaceAll("\">", "|"); temp = temp.replaceAll("'>", "|"); temp = temp.replaceAll("<[^>]+>", "|"); temp = temp.replaceAll("[\\|\\s]*\\|[\\|\\s]*", "|"); temp = temp.replaceAll("\\|.\\|", "|"); temp = temp.trim(); temp = temp.replaceAll("\\|$", ""); temp = temp.replaceAll("^\\|", ""); temp = temp.trim(); return temp; } private static String cleanUrl(String url){ if (null==url || url.length()==0) return null; String ret =url; int index = url.indexOf("#"); if (index>=0){ ret = url.substring(0, index); } if (ret.startsWith("/sites")){ ret = "http://iswc2012.semanticweb.org"+ret; }else if (ret.startsWith("swc2012_")){ ret = "http://challenge.semanticweb.org/2012/submissions/"+ret; } if (!ret.startsWith("http")){ logInfo("wrong url"); System.exit(0); } return ret; } public static String cleanTitle(String title){ String temp = title; temp = temp.replace("’", "'"); temp = temp.replace("–", "-"); return temp; } public static String cleanAuthor(String authorList){ String temp = authorList; temp = temp.replaceAll("\\s*,\\s*and", " and"); temp = temp.replaceAll(",$", ""); String [] authors = authorList.split("\\s*,\\s*"); if (authors.length>1){ if (authors[authors.length-1].indexOf(" and ")<0){ temp = temp.replaceAll("\\s*,\\s*"+authors[authors.length-1], " and "+authors[authors.length-1]); } } return temp; } }