/* * Cleaner.java * Copyright (C) 2007 David Milne, d.n.milne@gmail.com * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ package org.wikipedia.miner.util.text; /** * This class provides very conservative morphology, and is intended to only resolve small variations * in capitalization and punctuation usage. It casefolds all terms and discards unneeded punctuation. * * This involves adding spaces when underscores or camelcasing is used, converting characters to * lowercase, and discarding whitespace and disambiguation information (the text found within * parentheses in many wikipedia titles). */ public class Cleaner extends TextProcessor{ private boolean disallowInternalPeriods = false ; /** * Returns a cleaned copy of the argument text, with capitalization and stopwords removed. * * @param text the text to be processed. * @return the processed version of this text. */ public String processText(String text) { String t = text ; t = cleanPunctuation(t).replace('\n', ' ') ; t = t.replaceAll("\\'", ""); //aly added return t.replace('\"', ' ').trim().toLowerCase() ; } private String cleanPunctuation(String text) { StringBuffer resultStr = new StringBuffer(); int j = 0; boolean phraseStart = true; boolean seenNewLine = false; boolean haveSeenHyphen = false; boolean haveSeenSlash = false; while (j < text.length()) { boolean isWord = false; boolean potNumber = false; int startj = j; while (j < text.length()) { char ch = text.charAt(j); if (Character.isLetterOrDigit(ch)) { potNumber = true; isWord = true; //aly: allowing digits as words if (Character.isLetter(ch)) { isWord = true; } j++; } else if ((!disallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_') || (ch == '&') || (ch == '/') || (ch == '-')) { if ((j > 0) && (j + 1 < text.length()) && Character.isLetterOrDigit(text.charAt(j - 1)) && Character.isLetterOrDigit(text.charAt(j + 1))) { j++; } else { break; } } else if (ch == '\'') { if ((j > 0) && Character.isLetterOrDigit(text.charAt(j - 1))) { j++; } else { break; } } else { break; } } if (isWord == true) { if (!phraseStart) { if (haveSeenHyphen) { resultStr.append('-'); } else if (haveSeenSlash) { resultStr.append('/'); } else { resultStr.append(' '); } } resultStr.append(text.substring(startj, j)); if (j == text.length()) { break; } phraseStart = false; seenNewLine = false; haveSeenHyphen = false; haveSeenSlash = false; if (Character.isWhitespace(text.charAt(j))) { if (text.charAt(j) == '\n') { seenNewLine = true; } } else if (text.charAt(j) == '-') { haveSeenHyphen = true; } else if (text.charAt(j) == '/') { haveSeenSlash = true; } else { phraseStart = true; resultStr.append('\n'); } j++; } else if (j == text.length()) { break; } else if (text.charAt(j) == '\n') { if (seenNewLine) { if (phraseStart == false) { resultStr.append('\n'); phraseStart = true; } } else if (potNumber) { if (phraseStart == false) { phraseStart = true; resultStr.append('\n'); } } seenNewLine = true; j++; } else if (Character.isWhitespace(text.charAt(j))) { if (potNumber) { if (phraseStart == false) { phraseStart = true; resultStr.append('\n'); } } j++; } else { if (phraseStart == false) { resultStr.append('\n'); phraseStart = true; } j++; } } return resultStr.toString() ; } }