package context.core.task.lexisnexis;

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashSet;
import java.util.Locale;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts metadata fields and the article body from one plain-text news
 * document.
 *
 * <p>The parser relies on the following layout of the text:
 * <ul>
 * <li>paragraphs ("chunks") are separated by a blank line ({@code "\n\n"});</li>
 * <li>the first paragraph is the source, the first line of the second
 * paragraph is the date and the third paragraph is the title;</li>
 * <li>a labelled field such as {@code "BYLINE: "} is only recognised when the
 * label is the very beginning of a paragraph; the value is the rest of that
 * paragraph. When a label occurs in several paragraphs the last one wins.</li>
 * </ul>
 *
 * <p>Instances are immutable once constructed: the text is split into
 * paragraphs a single time in the constructor.
 *
 * @author Jana Diesner
 */
public class LxNxTextParser {

    /** Value returned by {@link #getDateAsDate(String)} when parsing fails. */
    private static final String DATE_NOT_PARSABLE = "dateNotParsable";

    /** Column separator used by {@link #toCSV()}. */
    private static final String FIELD_SEPARATOR = ",";

    /** First run of ASCII digits in a string; compiled once and shared. */
    private static final Pattern FIRST_NUMBER = Pattern.compile("([0-9]+)");

    /** English month names, index == {@link GregorianCalendar} month number. */
    private static final String[] MONTH_NAMES = { "january", "february",
            "march", "april", "may", "june", "july", "august", "september",
            "october", "november", "december" };

    /** Header labels; the body starts after the last one of these. */
    static HashSet<String> hsFirstKeyWords = new HashSet<String>();

    /** Footer labels; the body ends before the first one of these. */
    static HashSet<String> hsLastKeyWords = new HashSet<String>();

    /** Line prefixes that mark a body line as noise to be dropped. */
    static HashSet<String> hsLineKeyWords = new HashSet<String>();

    /**
     * Kept only so that existing package-level references keep compiling;
     * the keyword sets are now filled by the static initializer below, so
     * this is always {@code true} once the class is loaded.
     */
    static boolean setupComplete = false;

    static {
        // Filled once at class-load time (instead of lazily from the
        // constructor) so concurrent construction cannot race on the sets.
        register(hsFirstKeyWords, "\nBYLINE:", "\nSECTION:", "\nLENGTH:");
        register(hsLastKeyWords, "\nSUBJECT:", "\nCOMPANY:", "\nCOUNTRY:",
                "\nCITY:", "\nLOAD-DATE:", "\nPERSON:", "\nSTATE:",
                "\nORGANIZATION:", "\nGEOGRAPHIC:", "\nLANGUAGE:",
                "\nPUBLICATION-TYPE:", "\nGRAPHIC:", "\nPHOTO:", "\nPhoto:",
                "\nColor Photo:", "\nELEMENT-WITNESS:", "\nBLOCK-TIME:",
                "\n(c) Copyright");
        register(hsLineKeyWords, "Text of report by", "Text of report in",
                "Excerpt from report by", "DATELINE: ", "HIGHLIGHT: ",
                "SOURCE: ", "Source: ", "IN BRIEF");
        setupComplete = true;
    }

    /** Raw document text, never {@code null}. */
    private final String text;

    /** Caller-supplied identifier, returned unchanged by {@link #getTextID()}. */
    private final String id;

    /** Trimmed, non-empty paragraphs of {@link #text}, computed once. */
    private final Vector<String> chunks;

    /**
     * Creates a parser for one document.
     *
     * @param textToParse the document text; {@code null} is treated as an
     *                    empty document
     * @param idperText   identifier to associate with this document
     */
    public LxNxTextParser(String textToParse, String idperText) {
        this.text = textToParse == null ? "" : textToParse;
        this.id = idperText;
        this.chunks = splitIntoChunks(this.text);
    }

    /** Adds every keyword to the given set. */
    private static void register(HashSet<String> target, String... keywords) {
        for (String keyword : keywords) {
            target.add(keyword);
        }
    }

    /** Splits text on blank lines and keeps the trimmed, non-empty parts. */
    private static Vector<String> splitIntoChunks(String document) {
        Vector<String> result = new Vector<String>();
        for (String part : document.trim().split("\n\n")) {
            String trimmed = part.trim();
            if (trimmed.length() > 0) {
                result.add(trimmed);
            }
        }
        return result;
    }

    /**
     * Returns the value of a labelled field.
     *
     * @param label the label including its trailing colon and space, e.g.
     *              {@code "BYLINE: "}
     * @return the trimmed remainder of the last paragraph starting with the
     *         label, or an empty string when no paragraph does
     */
    private String fieldValue(String label) {
        String value = "";
        for (String chunk : chunks) {
            if (chunk.startsWith(label)) {
                value = chunk.substring(label.length()).trim();
            }
        }
        return value;
    }

    /**
     * Parses the first run of ASCII digits in a string.
     *
     * @return the parsed number, or {@code fallback} when the string is
     *         {@code null}, holds no digits, or the digits overflow an int
     */
    private static int firstInt(String s, int fallback) {
        if (s == null) {
            return fallback;
        }
        Matcher m = FIRST_NUMBER.matcher(s);
        if (!m.find()) {
            return fallback;
        }
        try {
            return Integer.parseInt(m.group(1));
        } catch (NumberFormatException tooLarge) {
            return fallback;
        }
    }

    /**
     * Returns the source line(s): every line of the trimmed text up to and
     * including the first non-empty line that is followed by a blank line,
     * concatenated without a separator. If the text contains no blank line,
     * all lines are used.
     *
     * @return the trimmed source, possibly empty
     */
    public String getSource() {
        String[] lines = text.trim().split("\n");
        StringBuilder source = new StringBuilder();
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i];
            source.append(line);
            boolean nextIsBlank = i + 1 < lines.length
                    && lines[i + 1].trim().length() == 0;
            if (nextIsBlank && line.length() > 0) {
                break;
            }
        }
        return source.toString().trim();
    }

    /**
     * @return the third non-empty paragraph, or an empty string when the
     *         document has fewer than three paragraphs
     */
    public String getTitle() {
        return chunks.size() > 2 ? chunks.get(2) : "";
    }

    /** @return value of the {@code BYLINE} field, or an empty string */
    public String getAuthor() {
        return fieldValue("BYLINE: ");
    }

    /** @return value of the {@code SECTION} field, or an empty string */
    public String getSection() {
        return fieldValue("SECTION: ");
    }

    /** @return raw value of the {@code LENGTH} field, or an empty string */
    public String getLength() {
        return fieldValue("LENGTH: ");
    }

    /** @return value of the {@code SUBJECT} field, or an empty string */
    public String getSubject() {
        return fieldValue("SUBJECT: ");
    }

    /** @return value of the {@code ORGANIZATION} field, or an empty string */
    public String getOrganization() {
        return fieldValue("ORGANIZATION: ");
    }

    /** @return value of the {@code GEOGRAPHIC} field, or an empty string */
    public String getGeo() {
        return fieldValue("GEOGRAPHIC: ");
    }

    /** @return value of the {@code COUNTRY} field, or an empty string */
    public String getCountry() {
        return fieldValue("COUNTRY: ");
    }

    /** @return value of the {@code LANGUAGE} field, or an empty string */
    public String getLanguage() {
        return fieldValue("LANGUAGE: ");
    }

    /** @return value of the {@code PUBLICATION-TYPE} field, or an empty string */
    public String getPubType() {
        return fieldValue("PUBLICATION-TYPE: ");
    }

    /** @return value of the {@code PERSON} field, or an empty string */
    public String getPerson() {
        return fieldValue("PERSON: ");
    }

    /** @return value of the {@code COMPANY} field, or an empty string */
    public String getCompany() {
        return fieldValue("COMPANY: ");
    }

    /**
     * Collects graphic information. A paragraph starting with
     * {@code "GRAPHIC: "} replaces whatever was collected so far with its
     * value; paragraphs starting with {@code "Photo:"} or
     * {@code "Color Photo:"} are appended in full (label included).
     *
     * @return the collected text, or an empty string
     */
    public String getGraphic() {
        String graphic = "";
        for (String chunk : chunks) {
            if (chunk.startsWith("GRAPHIC: ")) {
                graphic = chunk.substring("GRAPHIC: ".length()).trim();
            }
            if (chunk.startsWith("Photo:")) {
                graphic = graphic + chunk;
            }
            if (chunk.startsWith("Color Photo:")) {
                graphic = graphic + chunk;
            }
        }
        return graphic;
    }

    /** @return value of the {@code ELEMENT-WITNESS} field, or an empty string */
    public String getElementWitness() {
        return fieldValue("ELEMENT-WITNESS: ");
    }

    /** @return value of the {@code BLOCK-TIME} field, or an empty string */
    public String getBlockTime() {
        return fieldValue("BLOCK-TIME: ");
    }

    /** @return value of the {@code TICKER} field, or an empty string */
    public String getTicker() {
        return fieldValue("TICKER: ");
    }

    /** @return value of the {@code JOURNAL-CODE} field, or an empty string */
    public String getJournalCode() {
        return fieldValue("JOURNAL-CODE: ");
    }

    /** @return value of the {@code INDUSTRY} field, or an empty string */
    public String getIndustry() {
        return fieldValue("INDUSTRY: ");
    }

    /** @return value of the {@code CITY} field, or an empty string */
    public String getCity() {
        return fieldValue("CITY: ");
    }

    /**
     * @return a fresh list of the trimmed, non-empty paragraphs of the
     *         document; modifying it does not affect this parser
     */
    public Vector<String> getTextChunks() {
        return new Vector<String>(chunks);
    }

    /**
     * Builds the article text: the title followed by {@code ".\n\n"} and the
     * body. The body is the part of the text between the end of the line
     * holding the last header label (BYLINE/SECTION/LENGTH) and the first
     * footer label, with noise lines and the phrases "passage omitted" /
     * "Passage omitted" removed. If a footer label occurs before the header
     * ends, or there is no line break after the header, the body is empty.
     *
     * @return title and cleaned body
     */
    public String getTextBody() {
        // Position of the header label that occurs furthest into the text.
        int lastHeaderPos = 0;
        for (String keyword : hsFirstKeyWords) {
            int pos = text.indexOf(keyword);
            if (pos > lastHeaderPos) {
                lastHeaderPos = pos;
            }
        }
        // The body starts at the line break ending that header line. With no
        // such line break there is no body; clamp instead of passing -1 to
        // substring().
        int bodyStart = text.indexOf("\n", lastHeaderPos + 1);
        if (bodyStart < 0) {
            bodyStart = text.length();
        }

        // The body ends at the earliest footer label.
        int bodyEnd = text.length();
        for (String keyword : hsLastKeyWords) {
            int pos = text.indexOf(keyword);
            if (pos != -1 && pos < bodyEnd) {
                bodyEnd = pos;
            }
        }
        if (bodyEnd < bodyStart) {
            bodyEnd = bodyStart;
        }

        StringBuilder body = new StringBuilder();
        for (String line : text.substring(bodyStart, bodyEnd).split("\n")) {
            if (!isNoiseLine(line)) {
                body.append(line).append("\n");
            }
        }

        String result = getTitle().trim() + "." + "\n\n" + body;
        result = result.replace("passage omitted", "");
        result = result.replace("Passage omitted", "");
        return result;
    }

    /** True when the line, as is or trimmed, starts with a noise prefix. */
    private static boolean isNoiseLine(String line) {
        String trimmed = line.trim();
        for (String prefix : hsLineKeyWords) {
            // Both forms are needed: some prefixes end in a space that
            // trim() would strip from a line consisting of the prefix only.
            if (line.startsWith(prefix) || trimmed.startsWith(prefix)) {
                return true;
            }
        }
        return false;
    }

    /** @return the identifier passed to the constructor */
    public String getTextID() {
        return id;
    }

    /**
     * @return the first number found in the {@code LENGTH} field, or 0 when
     *         the field is missing or holds no usable number
     */
    public int getCleanLength() {
        return firstInt(getLength(), 0);
    }

    /** @return value of the {@code LOAD-DATE} field, or an empty string */
    public String getLoadDate() {
        return fieldValue("LOAD-DATE: ");
    }

    /**
     * Returns the first line of the second paragraph.
     *
     * <p>NOTE(review): when the second paragraph consists of a single line
     * an empty string is returned; this mirrors the long-standing behaviour
     * and should be confirmed as intended.
     *
     * @return the trimmed line, or an empty string
     */
    public String getDate() {
        if (chunks.size() > 1) {
            String second = chunks.get(1);
            if (second.indexOf("\n") != -1) {
                return second.split("\n")[0].trim();
            }
        }
        return "";
    }

    /**
     * @return {@link #getDate()} converted with {@link #getDateAsDate(String)};
     *         if that is not parsable, the converted {@link #getLoadDate()}
     *         (which may itself be {@code "dateNotParsable"})
     */
    public String getBestDate() {
        String reply = getDateAsDate(getDate());
        if (reply.equals(DATE_NOT_PARSABLE)) {
            reply = getDateAsDate(getLoadDate());
        }
        return reply;
    }

    /**
     * Converts a date written as {@code "<Month> <day>, <year>..."} (full
     * English month name, any letter case) to {@code yyyy-MM-dd}.
     *
     * @param dateToParse the text to convert, e.g. "January 5, 2009 Monday"
     * @return the ISO style date, or {@code "dateNotParsable"} when the text
     *         does not have the expected shape, the month name is unknown,
     *         or the day does not exist in that month
     */
    public String getDateAsDate(String dateToParse) {
        if (dateToParse == null) {
            return DATE_NOT_PARSABLE;
        }
        try {
            int firstSpace = dateToParse.indexOf(" ");
            int secondSpace = dateToParse.indexOf(" ", firstSpace + 1);
            int comma = dateToParse.indexOf(", ");

            int month = monthIndex(dateToParse.substring(0, firstSpace));
            // The character just before the second space is the comma.
            int day = Integer.parseInt(dateToParse.substring(firstSpace,
                    secondSpace - 1).trim());
            int year = firstInt(dateToParse.substring(comma), 0);
            if (month < 0) {
                return DATE_NOT_PARSABLE;
            }

            GregorianCalendar gc = new GregorianCalendar(year, month, day);
            // The calendar is lenient and would silently roll "February 30"
            // over to March; reject any date that does not round-trip.
            if (gc.get(GregorianCalendar.YEAR) != year
                    || gc.get(GregorianCalendar.MONTH) != month
                    || gc.get(GregorianCalendar.DAY_OF_MONTH) != day) {
                return DATE_NOT_PARSABLE;
            }
            Date d = gc.getTime();
            return new SimpleDateFormat("yyyy-MM-dd", Locale.US).format(d);
        } catch (RuntimeException malformed) {
            // Missing spaces or comma, non-numeric day, and similar.
            return DATE_NOT_PARSABLE;
        }
    }

    /** Zero-based month number for a full English month name, or -1. */
    private static int monthIndex(String name) {
        for (int i = 0; i < MONTH_NAMES.length; i++) {
            if (MONTH_NAMES[i].equalsIgnoreCase(name)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Scans the {@code GEOGRAPHIC} field (line breaks and tabs removed) for
     * every occurrence of "SUDAN" and reads the digits found in the up to
     * four characters that start six characters after the match, e.g. the
     * "95" of "SUDAN (95%)".
     *
     * @return the largest number read, or 0 when none was found
     */
    public int getGeoSudanRatio() {
        String geotext = getGeo().replace("\n", "").replace("\t", "");
        int best = 0;
        int from = 0;
        int pos;
        while ((pos = geotext.indexOf("SUDAN", from)) != -1) {
            int windowStart = pos + 6;
            if (windowStart < geotext.length()) {
                int windowEnd = Math.min(geotext.length(), pos + 10);
                String digits = getNumberString(geotext.substring(
                        windowStart, windowEnd));
                if (digits.length() > 0) {
                    best = Math.max(best, Integer.parseInt(digits));
                }
            }
            from = pos + 1;
        }
        return best;
    }

    /** Returns only the digit characters of the given string. */
    private String getNumberString(String s) {
        StringBuilder digits = new StringBuilder();
        for (char c : s.toCharArray()) {
            if (Character.isDigit(c)) {
                digits.append(c);
            }
        }
        return digits.toString();
    }

    /**
     * @return {@link #getSudanRatio(String, String)} applied to the
     *         {@code COUNTRY} field with keyword "SUDAN"
     */
    public int getCountrySudanRatio() {
        return getSudanRatio(getCountry(), "SUDAN");
    }

    /**
     * Reads the first number that follows a keyword inside a
     * semicolon-separated list, e.g. 95 for keyword "SUDAN" in
     * "EGYPT (60%); SUDAN (95%)". Only the list entry starting at the first
     * occurrence of the keyword is examined.
     *
     * @param input   the list text; {@code null} yields 0
     * @param keyword the keyword to look for; {@code null} yields 0
     * @return the number, or 0 when the keyword or a number is missing
     */
    public int getSudanRatio(String input, String keyword) {
        if (input == null || keyword == null) {
            return 0;
        }
        int start = input.indexOf(keyword);
        if (start < 0) {
            return 0;
        }
        int end = input.indexOf(";", start);
        if (end < 0) {
            // Last entry of the list: it runs to the end of the input.
            end = input.length();
        }
        return firstInt(input.substring(start, end), 0);
    }

    /**
     * Splits the {@code SUBJECT} field on ';', cuts each entry at its first
     * '(' (unless that is the first character) and cleans it with
     * {@link #cleanString(String)}.
     *
     * @return the distinct non-empty subject names
     */
    public HashSet<String> getSubjectsPerText() {
        HashSet<String> subjectsPerText = new HashSet<String>();
        String subjectField = getSubject().replace("\n", " ")
                .replace("\t", "").trim();
        for (String entry : subjectField.split(";")) {
            entry = entry.trim();
            int paren = entry.indexOf("(");
            String name = paren > 0 ? entry.substring(0, paren) : entry;
            name = cleanString(name).trim();
            if (name.length() > 0) {
                subjectsPerText.add(name);
            }
        }
        return subjectsPerText;
    }

    /**
     * Reduces a string to letters and spaces. Characters that are neither a
     * letter nor a space character are dropped. Of the remaining ones, a
     * character is kept when {@link Character#getNumericValue(char)} is
     * non-negative and not 9 (ASCII letters report 10 to 35), replaced by a
     * plain space when that value is -1 (the case for the space character
     * itself), and dropped otherwise.
     *
     * @param input the text to clean
     * @return the cleaned, trimmed text
     */
    public String cleanString(String input) {
        StringBuilder cleaned = new StringBuilder();
        for (char c : input.toCharArray()) {
            if (!Character.isLetter(c) && !Character.isSpaceChar(c)) {
                continue;
            }
            int numericValue = Character.getNumericValue(c);
            if (numericValue >= 0 && numericValue != 9) {
                cleaned.append(c);
            } else if (numericValue == -1) {
                cleaned.append(' ');
            }
        }
        return cleaned.toString().trim();
    }

    /**
     * Quotes one CSV field when necessary: a field containing the separator,
     * a double quote or a line break is wrapped in double quotes with
     * embedded quotes doubled; any other field is returned unchanged.
     */
    private static String csvField(String value) {
        String s = String.valueOf(value);
        boolean needsQuoting = s.indexOf(FIELD_SEPARATOR) >= 0
                || s.indexOf('"') >= 0 || s.indexOf('\n') >= 0
                || s.indexOf('\r') >= 0;
        if (!needsQuoting) {
            return s;
        }
        return "\"" + s.replace("\"", "\"\"") + "\"";
    }

    /**
     * Renders the document metadata as one comma-separated row. Column
     * order: source, best date, publication type, section, clean length,
     * journal code, language, load date, title, author, graphic, geographic,
     * city, organization, person, company, industry, ticker, subject,
     * country, geo Sudan ratio, country Sudan ratio, text id.
     *
     * @return the row without a trailing line break
     */
    public String toCSV() {
        String[] columns = { getSource(), getBestDate(), getPubType(),
                getSection(), String.valueOf(getCleanLength()),
                getJournalCode(), getLanguage(), getLoadDate(), getTitle(),
                getAuthor(), getGraphic(), getGeo(), getCity(),
                getOrganization(), getPerson(), getCompany(), getIndustry(),
                getTicker(), getSubject(), getCountry(),
                String.valueOf(getGeoSudanRatio()),
                String.valueOf(getCountrySudanRatio()), getTextID() };
        StringBuilder row = new StringBuilder();
        for (int i = 0; i < columns.length; i++) {
            if (i > 0) {
                row.append(FIELD_SEPARATOR);
            }
            row.append(csvField(columns[i]));
        }
        return row.toString();
    }
}