package edu.stanford.nlp.ie.pascal;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.*;
import java.util.*;
import edu.stanford.nlp.util.StringUtils;
/**
* Hyphenates words according to the TeX algorithm.
* @author Jamie Nicolson (nicolson@cs.stanford.edu)
*/
public class TeXHyphenator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TeXHyphenator.class);

  /**
   * One node of the pattern trie. The path of characters from the root to a
   * node spells the letters of a hyphenation pattern (digits stripped).
   */
  private static class Node {
    /** Child nodes, keyed by the next character of the pattern. */
    final Map<Character, Node> children = new HashMap<Character, Node>();
    /**
     * Scores of the pattern that ends at this node, or {@code null} if no
     * pattern ends here. Has one more entry than the node's depth:
     * entry {@code k} is the score for breaking before the {@code k}-th
     * pattern character, and the last entry is the score after the final one.
     */
    int[] pattern = null;
  }

  /** Root of the pattern trie (the empty character sequence). */
  private final Node head = new Node();

  /**
   * Loads the default hyphenation rules in DefaultTeXHyphenData.
   */
  public void loadDefault() {
    try {
      load(new BufferedReader(new StringReader(DefaultTeXHyphenData.hyphenData)));
    } catch (IOException e) {
      // shouldn't happen: the rules are read from an in-memory string
      throw new RuntimeException(e);
    }
  }

  /**
   * Loads custom hyphenation rules. You probably want to use
   * loadDefault() instead.
   * <p>
   * Each line is one pattern: its non-digit characters are the characters to
   * match and each digit is the score of the inter-character position at
   * which it appears (positions without a digit score 0). Blank lines and
   * lines whose first non-whitespace character is {@code %} are skipped.
   * This method reads until end of input and does not close the reader.
   *
   * @param input source of pattern lines
   * @throws IOException if reading from {@code input} fails
   */
  public void load(BufferedReader input) throws IOException {
    String line;
    while ((line = input.readLine()) != null) {
      if (StringUtils.matches(line, "\\s*(%.*)?")) {
        // comment or blank line
        log.info("Skipping: " + line);
        continue;
      }
      char[] linechars = line.toCharArray();
      // A pattern of n characters has n+1 score slots (one before each
      // character and one after the last). A line without any digit keeps
      // all of its characters, so the buffer needs linechars.length + 1
      // entries; with only linechars.length the copy below overran it.
      int[] pattern = new int[linechars.length + 1];
      char[] chars = new char[linechars.length];
      int c = 0;
      for (char linechar : linechars) {
        if (Character.isDigit(linechar)) {
          // score for the position just before the next pattern character
          pattern[c] = Character.digit(linechar, 10);
        } else {
          chars[c++] = linechar;
        }
      }
      // trim both buffers down to the number of characters actually kept
      char[] shortchars = new char[c];
      int[] shortpattern = new int[c + 1];
      System.arraycopy(chars, 0, shortchars, 0, c);
      System.arraycopy(pattern, 0, shortpattern, 0, c + 1);
      insertHyphPattern(shortchars, shortpattern);
    }
  }

  /**
   * Concatenates the decimal representations of the given ints, with no
   * separator.
   *
   * @param i the values to render
   * @return the concatenated string; empty for an empty array
   */
  public static String toString(int[] i) {
    StringBuilder sb = new StringBuilder();
    for (int anI : i) {
      sb.append(anI);
    }
    return sb.toString();
  }

  /**
   * Stores a pattern in the trie, creating intermediate nodes as needed.
   *
   * @param chars the characters of the pattern, digits removed
   * @param pattern the scores of the pattern ({@code chars.length + 1} entries)
   */
  private void insertHyphPattern(char[] chars, int[] pattern) {
    // find target node, building as we go
    Node cur = head;
    for (char aChar : chars) {
      Character key = Character.valueOf(aChar);
      Node next = cur.children.get(key);
      if (next == null) {
        next = new Node();
        cur.children.put(key, next);
      }
      cur = next;
    }
    // each character sequence is expected to be inserted only once
    assert (cur.pattern == null);
    cur.pattern = pattern;
  }

  /**
   * Collects the score arrays of every stored pattern that matches a prefix
   * of {@code chars} beginning at {@code startingIdx}.
   *
   * @param chars the text to match against
   * @param startingIdx index in {@code chars} at which matching starts
   * @return the matching score arrays, shortest pattern first
   */
  private List<int[]> getMatchingPatterns(char[] chars, int startingIdx) {
    List<int[]> matchingPatterns = new ArrayList<int[]>();
    Node cur = head;
    if (cur.pattern != null) {
      matchingPatterns.add(cur.pattern);
    }
    for (int c = startingIdx; c < chars.length; ++c) {
      cur = cur.children.get(Character.valueOf(chars[c]));
      if (cur == null) {
        // no stored pattern continues with this character
        break;
      }
      if (cur.pattern != null) {
        matchingPatterns.add(cur.pattern);
      }
    }
    return matchingPatterns;
  }

  /**
   * Marks the break points of a single word inside {@code phrase}.
   * The start of the word is always marked; other positions are marked when
   * the highest score of any matching pattern at that position is odd.
   *
   * @param phrase the full text
   * @param start index of the first character of the word
   * @param end index one past the last character of the word
   * @param breakPoints output array, indexed like {@code phrase}; entries are
   *        only ever switched to {@code true}
   */
  private void labelWordBreakPoints(char[] phrase, int start, int end,
                                    boolean[] breakPoints) {
    // the word is wrapped in '.' markers so patterns can anchor on word edges
    char[] word = new char[end - start + 2];
    System.arraycopy(phrase, start, word, 1, end - start);
    word[0] = '.';
    word[word.length - 1] = '.';

    // breakScore[i] is the score for breaking before word[i]
    int[] breakScore = new int[word.length + 1];
    for (int c = 0; c < word.length; ++c) {
      for (int[] pattern : getMatchingPatterns(word, c)) {
        for (int i = 0; i < pattern.length; ++i) {
          // keep the maximum score seen for each position
          if (breakScore[c + i] < pattern[i]) {
            breakScore[c + i] = pattern[i];
          }
        }
      }
    }

    breakPoints[start] = true;
    for (int i = start + 1; i < end; i++) {
      // remember that breakPoints is offset by one because we introduced
      // the leading "."
      breakPoints[i - 1] |= (breakScore[i - start] % 2 == 1);
    }
  }

  /**
   * Finds the permissible hyphenation points of every word in the input.
   * A word is a maximal run of characters for which
   * {@link Character#isLetter(char)} is true.
   *
   * @param lcphrase Some English text in lowercase.
   * @return An array of booleans, one per character of the input,
   * indicating whether it would be OK to insert a hyphen before that
   * character. The first character of each word is always marked;
   * non-letter characters are never marked.
   */
  public boolean[] findBreakPoints(char[] lcphrase) {
    boolean[] breakPoints = new boolean[lcphrase.length];
    boolean inWord = false;
    int wordStart = 0;
    int c = 0;
    for (; c < lcphrase.length; ++c) {
      boolean letter = Character.isLetter(lcphrase[c]);
      if (!inWord && letter) {
        wordStart = c;
        inWord = true;
      } else if (inWord && !letter) {
        inWord = false;
        labelWordBreakPoints(lcphrase, wordStart, c, breakPoints);
      }
    }
    if (inWord) {
      // the input ended in the middle of a word
      labelWordBreakPoints(lcphrase, wordStart, c, breakPoints);
    }
    return breakPoints;
  }

  /**
   * Demo entry point: for each argument, prints the argument followed by a
   * line with one mark per character, {@code ^} where a break is allowed
   * before that character and {@code -} elsewhere.
   *
   * @param args the words or phrases to hyphenate
   */
  public static void main(String[] args) throws Exception {
    TeXHyphenator hyphenator = new TeXHyphenator();
    hyphenator.loadDefault();
    for (String arg : args) {
      // fixed locale: default-locale lowercasing can change letters
      // (e.g. Turkish dotless i) so that they no longer match the patterns
      char[] chars = arg.toLowerCase(Locale.ENGLISH).toCharArray();
      boolean[] breakPoints = hyphenator.findBreakPoints(chars);
      System.out.println(arg);
      StringBuilder sb = new StringBuilder();
      for (boolean breakPoint : breakPoints) {
        sb.append(breakPoint ? "^" : "-");
      }
      System.out.println(sb.toString());
    }
  }
}