/** * SentenceReader * Copyright 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany * first published 09.02.2011 on http://yacy.net * * $LastChangedDate$ * $LastChangedRevision$ * $LastChangedBy$ * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.document; import java.util.Iterator; public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> { // read sentences from a given input stream // this enumerates StringBuilder objects private StringBuilder buffer; private String text; private int pos; private boolean pre = false; public SentenceReader(final String text) { assert text != null; this.text = text; this.pos = 0; this.pre = false; this.buffer = nextElement0(); } public SentenceReader(final String text, final boolean pre) { this(text); this.pre = pre; } public void pre(final boolean x) { this.pre = x; } private StringBuilder nextElement0() { final StringBuilder s = new StringBuilder(80); int nextChar; char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' ' // find sentence end while (this.pos < this.text.length() && (nextChar = this.text.charAt(this.pos++)) > 0) { c = (char) nextChar; if (this.pre && (nextChar == 10 || nextChar == 13)) break; if (c < ' ') c = ' '; if (lc == ' ' && c == ' ') continue; // ignore double spaces s.append(c); if (punctuation(lc) && invisible(c)) break; lc = c; } if (s.length() == 0) return null; if (s.charAt(s.length() - 1) == ' ') { s.trimToSize(); s.deleteCharAt(s.length() - 1); } return s; } public final static boolean invisible(final char c) { // first check average simple case if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) return false; // then check more complex case which applies to all character sets final int type = Character.getType(c); return !(type == Character.LOWERCASE_LETTER || type == Character.DECIMAL_DIGIT_NUMBER || type == Character.UPPERCASE_LETTER || type == Character.MODIFIER_LETTER || type == Character.OTHER_LETTER || type == Character.TITLECASE_LETTER || punctuation(c)); } public final static boolean punctuation(final char c) { return c == '.' || c == '!' || c == '?'; } @Override public boolean hasNext() { return this.buffer != null; } @Override public StringBuilder next() { if (this.buffer == null) { return null; } final StringBuilder r = this.buffer; this.buffer = nextElement0(); return r; } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public Iterator<StringBuilder> iterator() { return this; } public synchronized void close() { this.text = null; } public static void main(String[] args) { String s = "a b ccc d"; SentenceReader sr = new SentenceReader(s); for (StringBuilder a: sr) { System.out.println(a); } } }