package chipmunk.segmenter;

import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;

/**
 * A word string together with the collection of {@link SegmentationReading}s
 * that have been attached to it.
 *
 * <p>Readings are stored in a {@link HashSet}, so adding a reading that is
 * equal to one already present has no effect. Two words are equal when both
 * their strings and their reading collections are equal.
 */
public class Word {

    /** A word longer than this many characters is counted as "very long" (see report label). */
    private static final int VERY_LONG_WORD_THRESHOLD = 15;

    /** A word with more readings than this is counted as having "many readings" (see report label). */
    private static final int MANY_READINGS_THRESHOLD = 3;

    private final String word_;
    private final Collection<SegmentationReading> readings_;

    /**
     * Creates a word with an initially empty set of readings.
     *
     * @param word the word string
     */
    public Word(String word) {
        word_ = word;
        readings_ = new HashSet<>();
    }

    /**
     * Combines the hashes of the readings and the word string.
     * {@code Objects.hash} yields the same 31-based fold as a hand-written
     * {@code result = 31 * result + fieldHash} sequence over the same fields.
     */
    @Override
    public int hashCode() {
        return Objects.hash(readings_, word_);
    }

    /**
     * Two words are equal when they have exactly the same runtime class,
     * equal reading collections and equal word strings.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        Word other = (Word) obj;
        return Objects.equals(readings_, other.readings_)
                && Objects.equals(word_, other.word_);
    }

    /**
     * @return the length of the word string, as reported by {@link String#length()}
     */
    public int getLength() {
        return word_.length();
    }

    /**
     * @return the word string passed to the constructor
     */
    public String getWord() {
        return word_;
    }

    /**
     * @return the word and its readings in the form {@code [word readings]}
     */
    @Override
    public String toString() {
        return "[" + word_ + " " + readings_ + "]";
    }

    /**
     * Attaches a reading to this word.
     *
     * @param reading the reading to add
     */
    public void add(SegmentationReading reading) {
        readings_.add(reading);
    }

    /**
     * @return the live, mutable collection of readings held by this word
     *         (not a copy)
     */
    public Collection<SegmentationReading> getReadings() {
        return readings_;
    }

    /**
     * Prints summary statistics about the given words to {@code System.err}:
     * average and maximum word length, number of very long words, average and
     * maximum segment length, segments per reading, readings per word and the
     * number of words with many readings.
     *
     * <p>Averages are computed in floating point, so a zero denominator
     * (for example an empty list) produces {@code NaN} in the output instead
     * of throwing.
     *
     * @param words the words to summarize
     */
    public static void printStats(List<Word> words) {
        int totalWordLength = 0;
        int veryLongWords = 0;
        int maxWordLength = 0;
        int maxSegmentLength = 0;
        int totalSegmentLength = 0;
        int numSegments = 0;
        int numWords = 0;
        int numReadings = 0;
        int wordsWithManyReadings = 0;

        for (Word word : words) {
            int length = word.getLength();

            // Track the longest word against the running word maximum; the
            // previous code compared against the segment maximum by mistake.
            maxWordLength = Math.max(maxWordLength, length);
            totalWordLength += length;
            if (length > VERY_LONG_WORD_THRESHOLD) {
                veryLongWords++;
            }

            Collection<SegmentationReading> readings = word.getReadings();
            for (SegmentationReading reading : readings) {
                for (String segment : reading.getSegments()) {
                    int segmentLength = segment.length();
                    maxSegmentLength = Math.max(maxSegmentLength, segmentLength);
                    totalSegmentLength += segmentLength;
                    numSegments++;
                }
                numReadings++;
            }

            if (readings.size() > MANY_READINGS_THRESHOLD) {
                wordsWithManyReadings++;
            }
            numWords++;
        }

        // "* 1.0" forces floating-point division for the averages.
        System.err.format("Avg word length: %d/%d = %g\n", totalWordLength, numWords,
                totalWordLength * 1.0 / numWords);
        System.err.format("Num very long words (> 15): %d\n", veryLongWords);
        System.err.format("Max word length: %d\n", maxWordLength);
        System.err.format("Avg segment length: %d/%d = %g\n", totalSegmentLength, numSegments,
                totalSegmentLength * 1.0 / numSegments);
        System.err.format("Max segment length: %d\n", maxSegmentLength);
        System.err.format("Segments / reading: %d/%d = %g\n", numSegments, numReadings,
                numSegments * 1.0 / numReadings);
        System.err.format("Readings / word: %d/%d = %g\n", numReadings, numWords,
                numReadings * 1.0 / numWords);
        System.err.format("Words with many readings (> 3): %d\n", wordsWithManyReadings);
    }
}