// Copyright 2013 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package experimental.morfessor; import java.io.BufferedReader; import java.io.IOException; import java.io.Serializable; import java.util.Collections; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import marmot.util.FileUtils; /* This is a java port of the perl code in bin/expandmorphsegmentations.pl * and two lines of test/Makefile of morfessor_catmap0.9.2 */ public class Expander implements Serializable { private static final long serialVersionUID = 1L; private static final Pattern LINE_PATTERN_ = Pattern .compile("^1 (\\*[1-4]?) (.+)$"); private Map<String, List<Morpheme>> sub_morphs_; public Expander(String segmentation_file) { init(segmentation_file); } private void init(String segmentation_file) { sub_morphs_ = new HashMap<String, List<Morpheme>>(); try { BufferedReader reader = FileUtils.openFile(segmentation_file); while (reader.ready()) { String line = reader.readLine(); Matcher m = LINE_PATTERN_.matcher(line); if (m.matches()) { String type = m.group(1); List<Morpheme> morphs = Morpheme.split(m.group(2)); sub_morphs_.put(Morpheme.join(morphs, false, false, "") + type, morphs); } } } catch (IOException e) { throw new RuntimeException(e); } } public List<Morpheme> expand(List<Morpheme> morphs) { return expand(morphs, false); } public List<Morpheme> expand(List<Morpheme> morphs, boolean split_all ) { List<Morpheme> morph_list = new LinkedList<Morpheme>(); for (Morpheme morph : morphs) { morph_list.addAll(expand(morph, split_all)); } ListIterator<Morpheme> iterator = morph_list.listIterator(); // Merge consecutive non-morphemes while (iterator.hasNext()) { Morpheme morph = iterator.next(); if (morph.isNonMorpheme() && iterator.hasNext()) { Morpheme next_morph = iterator.next(); if (next_morph.isNonMorpheme()) { morph.setMorpheme(morph.getMorpheme() + next_morph.getMorpheme()); morph.setAsterisk(null); iterator.remove(); iterator.previous(); } } } for (Morpheme morph : morph_list) { if (morph.isNonMorpheme()) { morph.setTag(Morpheme.STEM); } } return morph_list; } public List<Morpheme> expand(Morpheme morph, boolean split_all) { if (morph.isNonMorpheme()) { return Collections.singletonList(morph); } List<Morpheme> sub_morphs = sub_morphs_.get(morph.getMorphAsterisk()); if (sub_morphs == null) { return Collections.singletonList(morph); } List<Morpheme> morph_list = new LinkedList<Morpheme>(); for (Morpheme sub_morph : sub_morphs) { if (sub_morph.isNonMorpheme() && (!split_all)) { // Don't expand to non-morphemes! return Collections.singletonList(morph); } morph_list.addAll(expand(sub_morph, split_all)); } return morph_list; } }