package com.transmem.doc;

import java.util.logging.Logger;
import java.io.FileInputStream;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.sql.SQLException;

/**
 * Parser for plain text files that may contain a number of paragraphs.
 * The text is broken into sentences, and a run of sentences that ends with a
 * double line feed forms a paragraph.
 *
 * @author Ted Wen
 * @date May, 2007
 */
public class TextParser extends FileParser {

    /** Buffer size constant; not referenced by {@link #parse(String, ITextSaver)}. */
    public static final int BUFSIZE = 512;

    /** Once more than this many sentences have been saved, a new paragraph is started. */
    public static final int MAX_SENTS_PER_PARA = 50;

    private static final Logger log_ = Logger.getLogger(TextParser.class.getName());

    public TextParser() {
    }

    /**
     * Parses the text in the given file, extracts paragraphs and sentences, and
     * hands them to the {@link ITextSaver} so they can be stored elsewhere.
     * <p>
     * A sentence ends at a '.' that is followed (after optional whitespace) by an
     * uppercase letter or a character of type {@code OTHER_LETTER}. A period is
     * not treated as a sentence end when it follows a digit near the start of the
     * collected text (list numbering such as "1." or "12.") or when it closes one
     * of the titles "Mr.", "Mrs." or "Dr.". Every sentence is trimmed before it is
     * saved, and texts of two characters or fewer are merged into the following
     * sentence instead of being saved on their own (the final flush at end of
     * file saves any non-empty remainder).
     * <p>
     * The file is read one byte at a time and each byte is converted to a
     * {@code char} directly, so the positions passed to the saver count bytes read.
     * NOTE(review): because of this, multi-byte encodings are not decoded and the
     * CJK stop-mark comparison below can never match - confirm whether a
     * charset-aware reader is wanted; that would change the reported positions.
     *
     * @param filename path and name of the text file; must not be null or empty
     * @param saver    receiver of the paragraph and sentence callbacks; must not be null
     * @throws IOException  if an argument is missing or the file cannot be read
     * @throws SQLException if the saver fails; SQLState, error code and cause are preserved
     */
    public void parse(String filename, ITextSaver saver) throws IOException, SQLException {
        if (filename == null || filename.equals("")) {
            log_.severe("parse(filename, ITextSaver): filename is null");
            throw new IOException("TextParser.parse(null,saver)");
        }
        if (saver == null) {
            log_.severe("TextParser.parse(String,ITextSaver),ITextSaver object is null");
            throw new IOException("ITextSaver is null");
        }

        int sents = 0; // sentences saved in the current paragraph
        int fp = 0;    // current file pointer, advanced once per read
        int bp = 0;    // value of fp when the previous sentence was saved
        BufferedInputStream bis = null;
        try {
            saver.startParagraph(0);
            bis = new BufferedInputStream(new FileInputStream(filename));
            StringBuffer sb = new StringBuffer();  // text of the sentence being collected
            StringBuffer bsb = new StringBuffer(); // look-ahead text read after a '.'
            while (true) {
                int cn = bis.read();
                fp++;
                if (cn == '\r') {
                    // A carriage return is dropped; the byte after it is examined instead.
                    cn = bis.read();
                    fp++;
                }
                if (cn < 0) {
                    // End of file (possibly right after a CR): flush what is left.
                    String rest = sb.toString().trim();
                    if (rest.length() > 0) {
                        saver.saveSentence(rest, bp, fp);
                    }
                    break;
                }
                char c = (char) cn;

                if (c == '\n') {
                    sb.append(' ');
                    // Check for a second line feed; if present the paragraph ends here.
                    cn = bis.read();
                    if (cn >= 0) {
                        fp++;
                    }
                    if (cn == '\r') {
                        cn = bis.read();
                        if (cn >= 0) {
                            fp++;
                        }
                    }
                    if (cn < 0 || cn == '\n') {
                        String last = sb.toString().trim();
                        if (last.length() > 2) {
                            saver.saveSentence(last, bp, fp);
                            bp = fp;
                            sents++;
                            sb = new StringBuffer(); // the look-ahead buffer is always empty here
                        }
                        if (sents > 0) {
                            saver.endParagraph(fp);
                            saver.startParagraph(fp + 1);
                            sents = 0;
                        }
                    } else {
                        sb.append((char) cn);
                    }
                    continue;
                }

                boolean stop = false;
                sb.append(c);
                if (c == '.') {
                    if (!isListNumber(sb) && !endsWithTitle(sb)) {
                        // Look ahead for an uppercase letter or CJK character.
                        // NOTE(review): only CR LF CR LF is recognised as a paragraph
                        // break in this look-ahead; a bare LF counts as plain whitespace,
                        // and a single CR LF is consumed without adding a space.
                        bsb = new StringBuffer();
                        while (true) {
                            cn = bis.read();
                            if (cn < 0) {
                                break;
                            }
                            fp++;
                            char look = (char) cn;
                            if (look == '\r') {
                                cn = bis.read();
                                if (cn < 0) {
                                    break; // CR was the last byte of the file
                                }
                                fp++;
                                look = (char) cn;
                                if (look == '\n') {
                                    cn = bis.read();
                                    if (cn < 0) {
                                        break;
                                    }
                                    fp++;
                                    look = (char) cn;
                                    if (look == '\r') {
                                        cn = bis.read();
                                        if (cn < 0) {
                                            break;
                                        }
                                        fp++;
                                        look = (char) cn;
                                        if (look == '\n') {
                                            // Double line feed: end of paragraph.
                                            String last = sb.toString().trim();
                                            if (last.length() > 2) {
                                                saver.saveSentence(last, bp, fp);
                                                bp = fp;
                                                sents++;
                                                sb = new StringBuffer(bsb.toString());
                                            }
                                            if (sents > 0) {
                                                saver.endParagraph(fp);
                                                // NOTE(review): the other paragraph breaks
                                                // pass fp + 1 here - confirm which is intended.
                                                saver.startParagraph(fp);
                                                sents = 0;
                                            }
                                            continue;
                                        }
                                    }
                                }
                            }
                            if (!Character.isWhitespace(look)) {
                                int type = Character.getType(look);
                                if (type == Character.UPPERCASE_LETTER
                                        || type == Character.OTHER_LETTER) {
                                    // Uppercase or CJK: the next sentence starts here.
                                    bsb.append(look);
                                    stop = true;
                                } else if ("\"')]}>".indexOf(look) >= 0) {
                                    sb.append(look); // closing quote/bracket belongs to this sentence
                                } else {
                                    bsb.append(look);
                                }
                                break;
                            } else {
                                bsb.append(' '); // any whitespace is replaced by a plain space
                            }
                        }
                    }
                } else if (c == '\u3002' || c == '\uFF0E') {
                    // Stop mark.
                    // NOTE(review): the two literals in the original source were unreadable
                    // (encoding damage); U+3002 and U+FF0E are a best-guess reconstruction.
                    stop = true;
                }

                if (stop) {
                    // Trim like every other save site so a buffer holding only look-ahead
                    // whitespace is never stored as a sentence.
                    String sentence = sb.toString().trim();
                    if (sentence.length() > 2) {
                        saver.saveSentence(sentence, bp, fp);
                        bp = fp;
                        sents++;
                        if (sents > MAX_SENTS_PER_PARA) {
                            saver.endParagraph(fp);
                            saver.startParagraph(fp + 1);
                            sents = 0;
                        }
                        sb = new StringBuffer(bsb.toString());
                    } else {
                        sb.append(bsb);
                    }
                } else if (bsb.length() > 0) {
                    sb.append(bsb);
                }
                bsb = new StringBuffer();
            }
            saver.endParagraph(fp);
        } catch (IOException ioe) {
            log_.severe("IOException while parsing("+filename+"): "+ioe.getMessage());
            IOException wrapped = new IOException("IOException parsing "+filename+","+ioe.getMessage());
            wrapped.initCause(ioe);
            throw wrapped;
        } catch (SQLException se) {
            log_.severe("SQLException saving paragraph and sentences: "+se.getMessage());
            SQLException wrapped = new SQLException("SQLException at ITextSaver call: "+se.getMessage(),
                    se.getSQLState(), se.getErrorCode());
            wrapped.initCause(se);
            throw wrapped;
        } finally {
            if (bis != null) {
                try {
                    bis.close();
                } catch (IOException ignored) {
                    // Best-effort close: nothing useful can be done at this point.
                }
            }
        }
    }

    /**
     * Tells whether the period just appended to {@code sb} follows a digit while
     * the buffer is still shorter than five characters, e.g. "1." or "12.".
     */
    private static boolean isListNumber(StringBuffer sb) {
        int n = sb.length();
        return n >= 2 && n < 5 && Character.isDigit(sb.charAt(n - 2));
    }

    /**
     * Tells whether the last space-delimited token of {@code sb} is one of the
     * titles "Mr.", "Mrs." or "Dr." (case-insensitive). A token that starts at
     * the very beginning of the buffer is handled as well.
     */
    private static boolean endsWithTitle(StringBuffer sb) {
        String token = sb.substring(sb.lastIndexOf(" ") + 1);
        return token.equalsIgnoreCase("Mr.")
                || token.equalsIgnoreCase("Mrs.")
                || token.equalsIgnoreCase("Dr.");
    }
}