/* * SmartDoc : Ultimate document format based on XML * Copyright (C) 1998-2003 ASAMI, Tomoharu (asami@XMLSmartDoc.org) * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.xmlsmartdoc.SmartDoc.normalizer; import java.util.*; import org.xmlsmartdoc.SmartDoc.*; /** * NaturalNormalizer * * @since Mar. 31, 1999 * @version Jun. 23, 2003 * @author ASAMI, Tomoharu (asami@XMLSmartDoc.org) */ public class NaturalNormalizer extends AbstractNormalizer { protected final static int INIT = 0; protected final static int AFTER_CR = 1; protected final static int AFTER_LF = 2; protected final static int AFTER_CRLF = 3; protected final static int AFTER_SPACE = 4; protected final static int BEFORE_SENTENCE = 5; protected final static int BEFORE_PARAGRAPH = 6; protected final static int LANG_INIT = 0; protected final static int LANG_WORD_SEPARATE = 1; protected final static int LANG_OTHER = 2; transient protected int langState_ = LANG_INIT; protected Content[] _normalize( Content[] contents, Content parent, DocContext context ) { if ("preserve".equals(parent.getSpace())) { return (contents); } Paragraph paragraph = null; Sentence sentence = null; int state = INIT; List list = new ArrayList(); for (int i = 0;i < contents.length;i++) { Content content = contents[i]; if (content instanceof Sentence) { if (paragraph == null) { paragraph = new Paragraph(); } if (sentence != null) { sentence.setDone(); paragraph.addContent(sentence); sentence = null; } paragraph.addContent(content); } else if (content instanceof CharBlock) { CharBlock cblock = (CharBlock)content; String text = cblock.getText(); if (cblock.isPreserve()) { if (sentence == null) { sentence = new Sentence(); } sentence.addString(text); continue; } int tsize = text.length(); for (int ti = 0;ti < tsize;ti++) { char c = text.charAt(ti); switch (state) { case INIT: switch (c) { case '\r': state = AFTER_CR; break; case '\n': state = AFTER_LF; break; case ' ': case '\t': // state = INIT; state = AFTER_SPACE; break; default: if (paragraph == null) { paragraph = new Paragraph(); } if (sentence == null) { sentence = new Sentence(); } /* if (sentence.isDone()) { list.add(sentence); sentence = new Sentence(); } */ sentence.addChar(c); _setLangState(c); } break; case AFTER_CR: switch (c) { case '\r': if (sentence != null) { sentence.setDone(); if (paragraph == null) { paragraph = new Paragraph(); } paragraph.addContent(sentence); list.add(paragraph); paragraph = null; sentence = null; } state = BEFORE_PARAGRAPH; break; case '\n': state = AFTER_CRLF; break; case ' ': case '\t': state = AFTER_CR; // state = AFTER_SPACE; break; default: if (sentence == null) { sentence = new Sentence(); } else { if (_isWordSeparate(c)) { sentence.addChar(' '); } } sentence.addChar(c); _setLangState(c); state = INIT; } break; case AFTER_LF: switch (c) { case '\r': state = BEFORE_PARAGRAPH; break; case '\n': if (sentence != null) { sentence.setDone(); if (paragraph == null) { paragraph = new Paragraph(); } paragraph.addContent(sentence); list.add(paragraph); paragraph = null; sentence = null; } break; case ' ': case '\t': state = AFTER_LF; // state = AFTER_SPACE; break; default: if (sentence == null) { sentence = new Sentence(); } else { if (_isWordSeparate(c)) { sentence.addChar(' '); } } sentence.addChar(c); _setLangState(c); state = INIT; } break; case AFTER_SPACE: switch (c) { case '\r': state = AFTER_CR; break; case '\n': state = AFTER_LF; break; case ' ': case '\t': // do nothing break; default: if (paragraph == null) { paragraph = new Paragraph(); } if (sentence == null) { sentence = new Sentence(); } else if (_isWordSeparate(c)) { sentence.addChar(' '); } sentence.addChar(c); _setLangState(c); state = INIT; } break; case BEFORE_PARAGRAPH: switch (c) { case '\r': case '\n': case ' ': case '\t': // do nothing break; default: if (paragraph == null) { paragraph = new Paragraph(); } if (sentence == null) { sentence = new Sentence(); } else { if (_isWordSeparate(c)) { sentence.addChar(' '); } } sentence.addChar(c); _setLangState(c); state = INIT; } break; default: throw (new InternalError()); } } } else { switch (content.getEntityType()) { case Content.ENTITY_BLOCK: // continue case Content.ENTITY_CONTAINER: if (sentence != null) { if (paragraph == null) { paragraph = new Paragraph(); } sentence.setDone(); paragraph.addContent(sentence); sentence = null; } if (paragraph != null) { list.add(paragraph); paragraph = null; } list.add(content); langState_ = LANG_INIT; state = INIT; break; case Content.ENTITY_INLINE: // continue case Content.ENTITY_CONTROL: if (sentence == null) { sentence = new Sentence(); } switch (state) { case INIT: break; case AFTER_CR: // continue case AFTER_LF: // continue case AFTER_SPACE: if (_isWordSeparate(content.getFirstChar())) { sentence.addChar(' '); } break; case BEFORE_PARAGRAPH: break; default: throw (new InternalError()); } sentence.addContent(content); _setLangState(content.getLastChar()); state = INIT; break; default: throw (new InternalError()); } state = INIT; } } if (sentence != null) { sentence.setDone(); if (paragraph == null) { paragraph = new Paragraph(); } paragraph.addContent(sentence); list.add(paragraph); } Content[] result = new Content[list.size()]; return ((Content[])list.toArray(result)); } protected void _setLangState(char c) { if (Character.isISOControl(c)) { return; } if (USmartDoc.isWordSeparateLang(c)) { langState_ = LANG_WORD_SEPARATE; } else { langState_ = LANG_OTHER; } } protected boolean _isWordSeparate(char c) { if (langState_ == LANG_WORD_SEPARATE && USmartDoc.isWordSeparateLang(c)) { return (true); } else { return (false); } } }