package org.iswc.iswc2012main.dev; import java.io.File; import java.io.FileNotFoundException; import java.util.HashSet; import java.util.TreeMap; import java.util.TreeSet; import org.apache.commons.lang3.StringEscapeUtils; import org.iswc.iswc2012main.Config; import org.iswc.iswc2012main.Config.FILE; import sw4j.util.Sw4jException; import sw4j.util.ToolIO; import sw4j.util.ToolString; public class TaskParseProceedings { public static void main(String[] args) throws Sw4jException{ File fData = Config.FILE.iswc2012_front.getFile(); String content = ToolIO.pipeFileToString(fData); extractSpecialChars(content); } public static void extractSpecialChars(String content){ TreeMap<String,String> mapCharEncoding= new TreeMap<String,String>(); // mapCharEncoding.put("¨o", "�"); // mapCharEncoding.put("´","�"); // mapCharEncoding.put("¸","�"); // mapCharEncoding.put("ı",""); // mapCharEncoding.put("ˆ","�"); // mapCharEncoding.put("ˇ",""); // mapCharEncoding.put("&", "&"); mapCharEncoding.put("–", "-"); mapCharEncoding.put("’", "'"); mapCharEncoding.put("“", ""); mapCharEncoding.put("”", ""); mapCharEncoding.put("",""); mapCharEncoding.put("fl","fl"); mapCharEncoding.put("ffi","ffl"); mapCharEncoding.put("fi","fi"); mapCharEncoding.put("ff","ff"); for (String key: mapCharEncoding.keySet()){ String replacee = mapCharEncoding.get(key); content = content.replaceAll(key, replacee); } int index =-1; TreeSet<String> setCharSpecial = new TreeSet<String>(); TreeSet<String> setCharSpecialPlus = new TreeSet<String>(); while (0<(index=content.indexOf("&",index+1))){ int index_temp1 = content.indexOf(";",index)+1; int index_temp2 = content.indexOf(" ",index); int index_next =Math.min(index_temp1, index_temp2); if (index_next<=index) continue; // not found if (index_next==index+1) continue; //length is 1 String temp = content.substring(index, index_next); setCharSpecial.add(temp); if (index_next-index==6){ String temp1 = content.substring(index, index_next+1); setCharSpecialPlus.add(temp1); } } for(String t: setCharSpecial){ String unescaped = StringEscapeUtils.unescapeHtml4(t); System.out.println("["+t+"] => ["+unescaped+"]"); } System.out.println(setCharSpecial.size()); for(String t: setCharSpecialPlus){ String unescaped = StringEscapeUtils.unescapeHtml4(t); System.out.println("["+t+"] => ["+unescaped+"]"); } System.out.println(setCharSpecialPlus.size()); } }