GenericTransformator.java example

Explorer
openflexo-master
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
*/
package com.swabunga.spell.engine;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;

import com.swabunga.util.StringUtility;

/**
 * A generic {@link Transformator} that reads an aspell phonetics file and builds a table of transformation rules from it (see the
 * nested class {@code TransformationRule}). The table is then used to compute the phonetic code of a word.
 * 
 * @author Robert Gustavsson (robert@lindesign.se)
 */
public class GenericTransformator implements Transformator {

	/**
	 * This replace list is used if no phonetic file is supplied or it doesn't contain the alphabet.
	 */
	private static final char[] defaultEnglishAlphabet = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
			'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' };

	public static final char ALPHABET_START = '[';
	public static final char ALPHABET_END = ']';
	// The misspelled name is kept as is because the constant is public.
	public static final String KEYWORD_ALPHBET = "alphabet";
	public static final String[] IGNORED_KEYWORDS = { "version", "followup", "collapse_result" };

	public static final char STARTMULTI = '(';
	public static final char ENDMULTI = ')';
	public static final String DIGITCODE = "0";
	public static final String REPLACEVOID = "_";

	// Rules in the order they were read from the phonetics file; the first matching rule wins in transform().
	private TransformationRule[] ruleArray = null;
	// Starts as the default English alphabet, may be replaced by the file's alphabet line, and is finally "washed".
	private char[] alphabetString = defaultEnglishAlphabet;

	/**
	 * Builds the transformation table from a phonetics file, read with the platform default charset.
	 * 
	 * @param phonetic
	 *            The phonetics file to read.
	 * @throws IOException
	 *             If the file cannot be opened or read.
	 */
	public GenericTransformator(File phonetic) throws IOException {
		Reader in = new FileReader(phonetic);
		try {
			init(in);
		} finally {
			// The reader is opened here, so it has to be released here as well.
			in.close();
		}
	}

	/**
	 * Builds the transformation table from a phonetics file, read with the given character encoding.
	 * 
	 * @param phonetic
	 *            The phonetics file to read.
	 * @param encoding
	 *            Name of the charset used to decode the file.
	 * @throws IOException
	 *             If the file cannot be opened or read, or the encoding is not supported.
	 */
	public GenericTransformator(File phonetic, String encoding) throws IOException {
		FileInputStream stream = new FileInputStream(phonetic);
		try {
			init(new InputStreamReader(stream, encoding));
		} finally {
			// Closing the stream also covers the case where creating the InputStreamReader itself failed.
			stream.close();
		}
	}

	/**
	 * Builds the transformation table from an already opened reader. The reader is read to its end but not closed; closing it is left
	 * to the caller.
	 * 
	 * @param phonetic
	 *            Reader delivering the content of a phonetics file.
	 * @throws IOException
	 *             If reading fails.
	 */
	public GenericTransformator(Reader phonetic) throws IOException {
		init(phonetic);
	}

	// Shared constructor logic: read all rules, then reduce the alphabet to its phonetically distinct letters.
	private void init(Reader phonetic) throws IOException {
		buildRules(new BufferedReader(phonetic));
		alphabetString = washAlphabetIntoReplaceList(alphabetString);
	}

	/**
	 * Goes through an alphabet and makes sure that only one of those letters that are coded equally will be in the replace list. In other
	 * words, it removes any letters in the alphabet that are redundant phonetically.
	 * 
	 * This is done to improve speed in the getSuggestion method.
	 * 
	 * @param alphabet
	 *            The complete alphabet to wash.
	 * @return The washed alphabet to be used as replace list.
	 */
	private char[] washAlphabetIntoReplaceList(char[] alphabet) {
		// Keyed by phonetic code; the first letter seen for a code is the one that is kept.
		Map<String, Character> letters = new HashMap<String, Character>(alphabet.length);

		for (int i = 0; i < alphabet.length; i++) {
			String code = transform(String.valueOf(alphabet[i]));
			if (!letters.containsKey(code)) {
				letters.put(code, Character.valueOf(alphabet[i]));
			}
		}

		char[] washedArray = new char[letters.size()];
		int next = 0;
		for (Character letter : letters.values()) {
			washedArray[next++] = letter.charValue();
		}
		return washedArray;
	}

	/**
	 * Takes out all single character replacements and put them in a char array. This array can later be used for adding or changing letters
	 * in getSuggestion().
	 * 
	 * @return char[] An array of chars with replacements characters, one entry per rule whose replacement is exactly one character
	 *         long, or null if no rules have been built.
	 */
	public char[] getCodeReplaceList() {
		if (ruleArray == null) {
			return null;
		}
		StringBuilder singles = new StringBuilder();
		for (int i = 0; i < ruleArray.length; i++) {
			String replacement = ruleArray[i].getReplaceExp();
			if (replacement.length() == 1) {
				singles.append(replacement.charAt(0));
			}
		}
		return singles.toString().toCharArray();
	}

	/**
	 * Returns the replace list: the alphabet of the language with phonetically redundant letters removed. The alphabet is the one given
	 * by the alphabet tag in the phonetic file, or the default English alphabet if the file had no such tag.
	 * 
	 * @return char[] The internal array holding the replace list (not a copy).
	 */
	@Override
	public char[] getReplaceList() {
		return alphabetString;
	}

	/**
	 * Returns the phonetic code of the word.
	 * 
	 * The word is upper-cased and then scanned from left to right. A digit is replaced by {@link #DIGITCODE}. At any other position the
	 * first rule that matches is applied and scanning continues after the inserted replacement; if no rule matches, the character is
	 * left as it is.
	 * 
	 * @param word
	 *            The word to transform.
	 * @return The phonetic code, or null if no rules have been built.
	 */
	@Override
	public String transform(String word) {

		if (ruleArray == null) {
			return null;
		}

		StringBuffer str = new StringBuffer(word.toUpperCase());
		int strLength = str.length();
		int startPos = 0;

		while (startPos < strLength) {

			if (Character.isDigit(str.charAt(startPos))) {
				StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE);
				startPos += DIGITCODE.length();
				continue;
			}

			// Number of positions to move forward; stays 1 when no rule matches.
			int add = 1;
			for (int i = 0; i < ruleArray.length; i++) {
				TransformationRule rule = ruleArray[i];
				if (rule.startsWithExp() && startPos > 0) {
					continue;
				}
				if (startPos + rule.lengthOfMatch() > strLength) {
					continue;
				}
				if (rule.isMatching(str, startPos)) {
					String replaceExp = rule.getReplaceExp();
					int takeOut = rule.getTakeOut();

					StringUtility.replace(str, startPos, startPos + takeOut, replaceExp);
					strLength += replaceExp.length() - takeOut;
					add = replaceExp.length();
					if (add == 0 && takeOut == 0) {
						// Nothing was removed and nothing inserted: step over the character, otherwise the same rule
						// would match at the same position forever.
						add = 1;
					}
					break;
				}
			}
			startPos += add;
		}
		return str.toString();
	}

	// Used to build up the transformation table: every line of the input is trimmed and handed to buildRule.
	private void buildRules(BufferedReader in) throws IOException {
		List<TransformationRule> ruleList = new ArrayList<TransformationRule>();
		String read;
		while ((read = in.readLine()) != null) {
			buildRule(realTrimmer(read), ruleList);
		}
		ruleArray = ruleList.toArray(new TransformationRule[ruleList.size()]);
	}

	// Here is where the real work of reading the phonetics file is done.
	// A line is either empty, an ignored keyword, the alphabet definition, or a rule of the form "<match> <replace>".
	private void buildRule(String str, List<TransformationRule> ruleList) {
		if (str.length() < 1) {
			return;
		}
		for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
			if (str.startsWith(IGNORED_KEYWORDS[i])) {
				return;
			}
		}

		// A different alphabet is used for this language, will be read into
		// the alphabetString variable.
		if (str.startsWith(KEYWORD_ALPHBET)) {
			int open = str.indexOf(ALPHABET_START);
			int close = str.lastIndexOf(ALPHABET_END);
			// Require "[" before "]"; a malformed line leaves the current alphabet untouched.
			if (open != -1 && close > open) {
				alphabetString = str.substring(open + 1, close).toCharArray();
			}
			return;
		}

		StringBuilder matchExp = new StringBuilder();
		StringBuilder replaceExp = new StringBuilder();
		boolean start = false, end = false;
		int takeOutPart = 0, matchLength = 0;
		// match is true while reading the match expression; the first whitespace switches to the replace expression.
		boolean match = true, inMulti = false;
		for (int i = 0; i < str.length(); i++) {
			char ch = str.charAt(i);
			if (Character.isWhitespace(ch)) {
				match = false;
			} else if (!match) {
				replaceExp.append(ch);
			} else if (!isReservedChar(ch)) {
				matchExp.append(ch);
				// A whole "(...)" group counts as one character: only its opening parenthesis is counted.
				if (!inMulti) {
					takeOutPart++;
					matchLength++;
				}
				if (ch == STARTMULTI || ch == ENDMULTI) {
					inMulti = !inMulti;
				}
			} else if (ch == '-') {
				// Each '-' means one character less of the match is replaced.
				takeOutPart--;
			} else if (ch == '^') {
				start = true;
			} else if (ch == '$') {
				end = true;
			}
		}

		String replacement = replaceExp.toString();
		if (replacement.equals(REPLACEVOID)) {
			replacement = "";
		}
		// More '-' than match characters would give a negative count; treat that as "replace nothing".
		int takeOut = Math.max(takeOutPart, 0);
		ruleList.add(new TransformationRule(matchExp.toString(), replacement, takeOut, matchLength, start, end));
	}

	// Chars with special meaning to aspell. Not everyone is implemented here.
	private static boolean isReservedChar(char ch) {
		return ch == '<' || ch == '>' || ch == '^' || ch == '$' || ch == '-' || Character.isDigit(ch);
	}

	// Trims off everything we don't care about: a trailing '#' comment and surrounding whitespace.
	private static String realTrimmer(String row) {
		int pos = row.indexOf('#');
		if (pos != -1) {
			row = row.substring(0, pos);
		}
		return row.trim();
	}

	// Nested Classes
	/*
	* Holds the match string and the replace string and all the rule attributes.
	* Is responsible for indicating matches.
	*/
	private static final class TransformationRule {

		private final String replace;
		// Match expression without reserved characters; groups are kept as "(...)".
		private final char[] match;
		// takeOut=number of chars to replace;
		// matchLength=length of matching string counting multies as one.
		private final int takeOut, matchLength;
		// start=rule only applies at the beginning of the word; end=match has to reach the end of the word.
		private final boolean start, end;

		// Constructor
		public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) {
			this.match = match.toCharArray();
			this.replace = replace;
			this.takeOut = takeout;
			this.matchLength = matchLength;
			this.start = start;
			this.end = end;
		}

		/*
		* Returns true if word from pos and forward matches the match string.
		* A "(...)" group matches if the word character at that position equals any
		* character of the group, and consumes exactly one word character.
		* Precondition: wordPos+matchLength<=word.length()
		*/
		public boolean isMatching(StringBuffer word, int wordPos) {
			boolean inMulti = false, multiMatch = false;

			for (int matchPos = 0; matchPos < match.length; matchPos++) {
				char matchCh = match[matchPos];
				if (matchCh == STARTMULTI || matchCh == ENDMULTI) {
					inMulti = !inMulti;
					if (inMulti) {
						multiMatch = false;
					} else {
						if (!multiMatch) {
							return false;
						}
						// The group as a whole consumed one character of the word.
						wordPos++;
					}
				} else if (inMulti) {
					if (matchCh == word.charAt(wordPos)) {
						multiMatch = true;
					}
				} else {
					if (matchCh != word.charAt(wordPos)) {
						return false;
					}
					wordPos++;
				}
			}
			return !end || wordPos == word.length();
		}

		public String getReplaceExp() {
			return replace;
		}

		public int getTakeOut() {
			return takeOut;
		}

		public boolean startsWithExp() {
			return start;
		}

		public int lengthOfMatch() {
			return matchLength;
		}

		// Just for debugging purposes.
		@Override
		public String toString() {
			return "Match:" + String.valueOf(match) + " Replace:" + replace + " TakeOut:" + takeOut + " MatchLength:" + matchLength
					+ " Start:" + start + " End:" + end;
		}

	}
}