package chipmunk.segmenter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import marmot.util.LineIterator;
/**
 * Reads words and their segmentation readings from a file that is iterated
 * with a {@link LineIterator} using a tab separator.
 *
 * <p>The first column of a line is the word form; the second column, when
 * present, is a list of readings separated by {@code ", "}. Each reading is a
 * sequence of space-separated segments, and each segment may carry a tag in
 * the form {@code segment:tag}.
 *
 * <p>The reader can be used in two ways: streaming via {@link #iterator()},
 * or fully loaded and cached via {@link #getData()}.
 */
public class SegmentationDataReader implements Iterable<Word> {

    /** Cached result of {@link #getData()}; {@code null} until a load has completed. */
    private List<Word> words_;

    /** Level passed to {@link TagSet#getTag} for every tag that is read. */
    private final int tag_level_;

    /** Normalizer applied to word forms and to every segment. */
    private final StringNormalizer normalizer_;

    /**
     * Maps normalized word strings to their {@link Word} instance;
     * {@code null} until {@link #getData()} starts loading.
     */
    private Map<String, Word> vocab_;

    /** Path handed to the {@link LineIterator} on every call to {@link #iterator()}. */
    private final String filepath_;

    /**
     * Creates a reader. The file is not opened until the data is iterated or
     * loaded.
     *
     * @param filepath  path of the data file
     * @param lang      language used to create the {@link StringNormalizer}
     * @param tag_level level passed to {@link TagSet#getTag} for every tag
     */
    public SegmentationDataReader(String filepath, String lang, int tag_level) {
        normalizer_ = StringNormalizer.labeledCreate(lang);
        tag_level_ = tag_level;
        filepath_ = filepath;
        words_ = null;
        vocab_ = null;
    }

    /**
     * Replaces every given word by the instance from this reader's vocabulary
     * that has the same word string. Loads the data first if necessary.
     *
     * @param words words to look up by {@code getWord()}
     * @return a new list holding the vocabulary instances, in input order
     * @throws IllegalArgumentException if a word is not part of this reader's
     *         vocabulary
     */
    public List<Word> map(List<Word> words) {
        getData();
        List<Word> new_words = new LinkedList<>();
        for (Word word : words) {
            Word new_word = vocab_.get(word.getWord());
            // An assert alone would silently put null into the result when
            // assertions are disabled (the JVM default), so fail explicitly.
            if (new_word == null) {
                throw new IllegalArgumentException(
                        "Word not in vocabulary: " + word.getWord());
            }
            new_words.add(new_word);
        }
        return new_words;
    }

    /**
     * Loads the whole file on first use and returns the cached list on every
     * later call.
     *
     * <p>One list entry is added per line; lines that share a normalized word
     * string contribute the same {@link Word} instance again.
     *
     * @return the loaded words, in file order
     */
    public List<Word> getData() {
        if (words_ == null) {
            List<Word> loaded = new LinkedList<Word>();
            // The iterator registers words in vocab_ only while it is non-null,
            // so the map has to exist before iteration starts.
            vocab_ = new HashMap<>();
            boolean complete = false;
            try {
                for (Word word : this) {
                    loaded.add(word);
                }
                complete = true;
            } finally {
                if (!complete) {
                    // Do not keep a half-filled vocabulary after a failed load;
                    // the next call starts from scratch.
                    vocab_ = null;
                }
            }
            // Publish the cache only after the file was read completely.
            words_ = loaded;
        }
        return words_;
    }

    /**
     * Opens the file and returns an iterator that yields one {@link Word} per
     * line.
     *
     * <p>While a vocabulary exists, words are looked up in it and new words
     * are registered, so equal word strings resolve to the same instance.
     * Without a vocabulary every line produces a fresh {@link Word}.
     *
     * <p>NOTE(review): iterating again after {@link #getData()} hands the
     * cached instances to {@code addReading} a second time; whether that
     * duplicates readings depends on {@code Word.add} - confirm.
     *
     * @return a read-only iterator; {@code remove()} is unsupported
     */
    @Override
    public Iterator<Word> iterator() {
        final LineIterator iterator = new LineIterator(filepath_, "\t");
        return new Iterator<Word>() {

            @Override
            public boolean hasNext() {
                return iterator.hasNext();
            }

            @Override
            public Word next() {
                List<String> line = iterator.next();
                // Expected format is exactly two columns. With assertions
                // disabled, a line with a single column still yields a word,
                // just without adding a reading (see the guard below).
                assert line.size() == 2;

                String word_string = line.get(0);
                word_string = normalizer_.normalize(word_string);

                Word word = null;
                if (vocab_ != null) {
                    word = vocab_.get(word_string);
                }

                if (word == null) {
                    word = new Word(word_string);
                    if (vocab_ != null) {
                        vocab_.put(word_string, word);
                    }
                }

                if (line.size() > 1) {
                    addReading(word, line.get(1));
                }

                return word;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

    /**
     * Parses the readings column and adds one {@link SegmentationReading} per
     * reading to the word.
     *
     * <p>Within a segment the first {@code ':'} separates the segment text
     * from its tag. If that colon is directly followed by a second colon, the
     * first one is kept as part of the segment text and the second one is the
     * separator (so {@code "::X"} is the segment {@code ":"} with tag
     * {@code "X"}). A segment without a colon, or whose separator is its last
     * character, is taken verbatim and passes a {@code null} tag to
     * {@link TagSet#getTag}.
     *
     * @param word           word that receives the readings
     * @param reading_string readings separated by {@code ", "}, each made of
     *                       space-separated segments
     */
    private void addReading(Word word, String reading_string) {
        String[] word_readings = reading_string.split(", ");

        for (String word_reading : word_readings) {
            String[] readings = word_reading.split(" ");

            List<String> segments = new LinkedList<>();
            List<String> tags = new LinkedList<>();

            for (String reading : readings) {
                // Position of the separator colon, or -1 if there is none.
                int index = reading.indexOf(':');
                if (index >= 0 && index + 1 < reading.length()
                        && reading.charAt(index + 1) == ':') {
                    // Doubled colon: the first one is literal segment text.
                    index++;
                }

                String segment;
                String tag;
                if (index < 0 || index == reading.length() - 1) {
                    // No separator, or nothing after it: untagged segment.
                    segment = reading;
                    tag = null;
                } else {
                    segment = reading.substring(0, index);
                    tag = reading.substring(index + 1);
                }

                segment = normalizer_.normalize(segment);
                tag = TagSet.getTag(tag, tag_level_);

                segments.add(segment);
                tags.add(tag);
            }

            // Sanity check: the normalized segments must spell the word.
            StringBuilder rejoint_word = new StringBuilder(word.getLength());
            for (String segment : segments) {
                rejoint_word.append(segment);
            }
            assert rejoint_word.toString().equals(word.getWord()) : reading_string
                    + " " + segments;

            word.add(new SegmentationReading(segments, tags));
        }
    }
}