/*
 * To change this template, choose Tools | Templates
 */
package context.core.task.lexisnexis;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang3.text.WordUtils;

/**
 * A single codebook entry parsed from a semicolon-separated line such as
 * {@code "MUAM, MAR ,GADD,AFI (53%); OSAMA BIN LADEN (52%)"}.
 *
 * <p>Each entry carries the cleaned-up original text, a normalized name derived
 * from that text, the integer found in front of the percent sign, and the
 * type/subtype labels supplied by the caller of {@link #parseLine}.
 *
 * <p>Two entities are equal when their {@link #getName() names} are equal; no
 * other field takes part in {@link #equals(Object)} or {@link #hashCode()}.
 *
 * <p>The extraction pattern and the reject list are static, shared by all
 * callers, and not synchronized.
 *
 * @author jkim362-admin
 */
public class CodebookEntity {

    /** Leading run of characters U+0000..U+0020 or U+00A0 (no-break space). */
    private static final Pattern LEADING_BLANKS = Pattern.compile("^[\\x00-\\x20\\xA0]+");

    /** Trailing run of characters U+0000..U+0020 or U+00A0 (no-break space). */
    private static final Pattern TRAILING_BLANKS = Pattern.compile("[\\x00-\\x20\\xA0]+$");

    /**
     * Regex applied to every segment: group 1 is the entity text, group 2 the
     * digits in front of the percent sign. Replaceable via {@link #setPattern}.
     */
    private static String pattern = "(.+?)\\((\\d+)%\\).*";

    /** Descriptions of lines whose percent value could not be parsed. */
    private static final List<String> rejectList = new ArrayList<String>();

    private String text;
    private String name;
    private int percent;
    private String type;
    private String subtype;

    /**
     * Returns the shared reject list.
     *
     * @return the live, mutable list of rejected-line descriptions (not a copy)
     */
    public static List<String> getRejectList() {
        return rejectList;
    }

    /**
     * Removes every entry from the shared reject list.
     */
    public static void clearRejectList() {
        rejectList.clear();
    }

    /**
     * Appends a description to the shared reject list.
     *
     * @param line the description to record
     */
    public static void addToRejectList(String line) {
        rejectList.add(line);
    }

    /**
     * Returns the regex currently used by {@link #parseLine}.
     *
     * @return the pattern source string
     */
    public static String getPattern() {
        return pattern;
    }

    /**
     * Replaces the regex used by {@link #parseLine}. The pattern is expected to
     * define capture group 1 (entity text) and group 2 (percent digits), since
     * {@code parseLine} substitutes {@code $1} and {@code $2}.
     *
     * @param pattern the new pattern source string
     */
    public static void setPattern(String pattern) {
        CodebookEntity.pattern = pattern;
    }

    /**
     * Returns the trimmed entity text.
     *
     * @return the text, or {@code null} if never set
     */
    public String getText() {
        return text;
    }

    /**
     * Sets the entity text.
     *
     * @param text the new text
     */
    public void setText(String text) {
        this.text = text;
    }

    /**
     * Returns the subtype label.
     *
     * @return the subtype, or {@code null} if never set
     */
    public String getSubtype() {
        return subtype;
    }

    /**
     * Sets the subtype label.
     *
     * @param subtype the new subtype
     */
    public void setSubtype(String subtype) {
        this.subtype = subtype;
    }

    /**
     * Returns the normalized name; this is the only field used by
     * {@link #equals(Object)} and {@link #hashCode()}.
     *
     * @return the name, or {@code null} if never set
     */
    public String getName() {
        return name;
    }

    /**
     * Sets the normalized name.
     *
     * @param name the new name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * Returns the integer parsed from in front of the percent sign.
     *
     * @return the percent value; 0 if none was parsed or set
     */
    public int getPercent() {
        return percent;
    }

    /**
     * Sets the percent value.
     *
     * @param percent the new percent value
     */
    public void setPercent(int percent) {
        this.percent = percent;
    }

    /**
     * Returns the type label.
     *
     * @return the type, or {@code null} if never set
     */
    public String getType() {
        return type;
    }

    /**
     * Sets the type label.
     *
     * @param type the new type
     */
    public void setType(String type) {
        this.type = type;
    }

    /**
     * Splits a line on {@code ';'} and builds one entity per segment.
     *
     * <p>Before splitting, commas are replaced by spaces, U+00A0 characters are
     * removed and whitespace runs are collapsed to a single space. For each
     * segment the current {@link #getPattern() pattern} is applied: the text
     * becomes the {@code $1} substitution (trimmed with {@link #itrim}), the
     * name is {@link #toNormalize} of that text, and the percent is the
     * {@code $2} substitution parsed as an integer.
     *
     * <p>A segment that does not match the pattern is left unchanged by both
     * substitutions, so its text is the whole segment and the percent parse is
     * attempted on the whole segment. When that parse fails, the percent stays
     * 0, the entity is still returned, and a description of the normalized line
     * is added to the reject list.
     *
     * @param line    the raw line; {@code null} or empty yields an empty list
     * @param type    type label copied onto every entity
     * @param subtype subtype label copied onto every entity
     * @param textID  identifier used only in reject-list descriptions
     * @return a new mutable list of entities in segment order, never {@code null}
     */
    public static List<CodebookEntity> parseLine(String line, String type, String subtype, String textID) {
        List<CodebookEntity> codebooks = new ArrayList<CodebookEntity>();
        if (line == null || line.length() == 0) {
            return codebooks;
        }

        String normalized = line.replace(',', ' ');
        normalized = normalized.replace("\u00A0", "");
        normalized = normalized.replaceAll("\\s+", " ");

        String[] segments = normalized.split(";");
        if (segments.length == 0) {
            // A line made only of ';' splits into nothing; do not touch the pattern.
            return codebooks;
        }

        // The pattern is mutable static state, so compile it once per call
        // instead of twice per segment.
        final Pattern compiled = Pattern.compile(pattern);

        for (String segment : segments) {
            CodebookEntity cb = new CodebookEntity();
            cb.text = itrim(compiled.matcher(segment).replaceFirst("$1"));
            cb.name = toNormalize(cb.text);

            final String percentStr = compiled.matcher(segment).replaceFirst("$2");
            if (percentStr.length() > 0) {
                try {
                    cb.percent = Integer.parseInt(percentStr);
                } catch (NumberFormatException ex) {
                    // Unparseable percent: keep the entity with percent 0 and
                    // record the offending line for later inspection.
                    addToRejectList("ID:" + textID + " Type:" + type + " Text:" + normalized);
                }
            }

            cb.type = type;
            cb.subtype = subtype;
            codebooks.add(cb);
        }
        return codebooks;
    }

    /**
     * Builds a normalized name: every whitespace-separated word is capitalized
     * via {@code WordUtils.capitalizeFully}, then spaces become underscores.
     *
     * @param text the text to normalize; may be {@code null}
     * @return the normalized name, or {@code null} when {@code text} is {@code null}
     */
    public static String toNormalize(String text) {
        if (text == null) {
            return null;
        }
        return WordUtils.capitalizeFully(text).replace(' ', '_');
    }

    /**
     * Trims a string, treating U+00A0 (no-break space) like ordinary blanks.
     *
     * <p>Leading and trailing runs of U+0000..U+0020 and U+00A0 are removed,
     * any remaining U+00A0 is turned into a regular space, and the result is
     * trimmed once more. Digits, including leading zeros, are preserved.
     *
     * @param str the string to trim; may be {@code null}
     * @return the trimmed string, or {@code null} when {@code str} is {@code null}
     */
    public static String itrim(String str) {
        if (str == null) {
            return null;
        }
        String stripped = LEADING_BLANKS.matcher(str).replaceFirst("");
        stripped = TRAILING_BLANKS.matcher(stripped).replaceFirst("");
        return stripped.replace(String.valueOf((char) 160), " ").trim();
    }

    /**
     * Returns a debug representation listing text, name, percent and type.
     *
     * @return the formatted field values
     */
    @Override
    public String toString() {
        return "Text=" + this.text + " Name=" + this.name + " Percent=" + this.percent + " Type=" + this.type;
    }

    /**
     * Hashes on {@code name} only, consistent with {@link #equals(Object)}.
     *
     * @return the hash code; a {@code null} name contributes 0
     */
    @Override
    public int hashCode() {
        int hash = 3;
        hash = 31 * hash + (this.name == null ? 0 : this.name.hashCode());
        return hash;
    }

    /**
     * Compares by {@code name} only; two entities with {@code null} names are
     * considered equal.
     *
     * @param obj the object to compare with
     * @return {@code true} if {@code obj} is a {@code CodebookEntity} of the
     *         same class with an equal name
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final CodebookEntity other = (CodebookEntity) obj;
        if (this.name == null) {
            return other.name == null;
        }
        return this.name.equals(other.name);
    }

    /**
     * Parses a sample line and prints the resulting entities.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String line = "MUAM, MAR ,GADD,AFI (53%); OSAMA BIN LADEN (52%)";
        final List<CodebookEntity> cbs = parseLine(line, "geo", "specific", "1");
        System.out.println(cbs);
    }
}