/**
 * Copyright 2010 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.util.data.text;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import marytts.signalproc.analysis.Label;
import marytts.signalproc.analysis.Labels;

/**
 * A class to read and parse labels in a text file. The file format should conform to that used by ESPS Xwaves and the many other
 * labeling programs which support that format.
 * <p>
 * The whole input is consumed and the underlying reader is closed by the constructor; afterwards the parsed end times, label
 * symbols and header lines are available through the getters.
 *
 * @author Ingmar Steiner
 */
public class XwavesLabelfileReader {

    /**
     * Pattern for a single label line. Compiled once, since it is the same for every instance.
     * <p>
     * Legend for the regular expression:
     *
     * <pre>
     * ^        start of line
     * \\s*     leading whitespace
     * (        start of first captured group (time)
     *   \\d+   one or more digits
     *   (?:    followed by a non-capturing group containing
     *     \\.  a period and
     *     \\d+ one or more digits
     *   )?     this group is optional
     * )        end of first captured group
     * \\s+     whitespace
     * .+?      second column, which is ignored (not captured)
     * \\s+?    whitespace
     * (.*)     second captured group (label)
     * $        end of line
     * </pre>
     */
    private static final Pattern LABEL_LINE_PATTERN = Pattern.compile("^\\s*(\\d+(?:\\.\\d+)?)\\s+.+?\\s+?(.*)$");

    // main class variables (reader, times, labels, header lines)
    protected BufferedReader reader;
    protected Double[] times;
    protected String[] labels;
    protected String[] header;

    /**
     * Read data from a Label file.
     *
     * @param filename
     *            Label filename as a String
     * @throws IOException
     *             if the file cannot be opened or read, or if it contains a malformed line after the header
     */
    public XwavesLabelfileReader(String filename) throws IOException {
        this(new FileReader(filename));
    }

    /**
     * Read data from a Label file. The reader is wrapped in a {@link BufferedReader}, read to the end, and closed before this
     * constructor returns, whether or not parsing succeeded.
     *
     * @param reader
     *            Label file as a Reader
     * @throws IOException
     *             if the input cannot be read, or if it contains a malformed line after the header
     */
    public XwavesLabelfileReader(Reader reader) throws IOException {
        this.reader = new BufferedReader(reader);
        boolean parsed = false;
        try {
            parseLabels();
            parsed = true;
        } finally {
            if (parsed) {
                // normal path: a failure to close is reported to the caller
                this.reader.close();
            } else {
                // parsing already failed; release the reader but keep the original exception visible
                try {
                    this.reader.close();
                } catch (IOException ignored) {
                    // intentionally ignored so that the parse failure is the one that propagates
                }
            }
        }
    }

    /**
     * Read lines from the label file and parse them. As each line is parsed, the label in that line and its end time are
     * collected, and the initial header lines are collected separately; at the end all three are stored in the
     * {@link #times}, {@link #labels} and {@link #header} arrays.
     * <p>
     * A line that matches the label pattern always counts as label data (this also supports headerless files). Before any such
     * line, a line starting with <code>#</code> ends the header and is itself discarded; every other line is kept as a header
     * line. Once the header is complete, any line that does not match the label pattern is an error.
     *
     * @throws IOException
     *             if reading fails, or if a line outside of the header cannot be parsed
     */
    private void parseLabels() throws IOException {
        List<Double> timesList = new ArrayList<Double>();
        List<String> labelsList = new ArrayList<String>();
        List<String> headersList = new ArrayList<String>();
        boolean headerComplete = false;

        // read the file line by line
        String line;
        while ((line = reader.readLine()) != null) {
            Matcher lineMatcher = LABEL_LINE_PATTERN.matcher(line);
            if (lineMatcher.matches()) {
                // some label files might be headerless;
                // in that case, a well-formed line indicates that we are already seeing label data
                headerComplete = true;
                // the first group is the label's end time; the regex guarantees it is a plain decimal number
                timesList.add(Double.valueOf(lineMatcher.group(1)));
                // the second group is the label itself
                labelsList.add(lineMatcher.group(2));
            } else if (headerComplete) {
                // header was already complete, or we are dealing with a headerless label file,
                // but we found a line that could not be parsed!
                throw new IOException("Malformed line found outside of header:\n" + line);
            } else if (line.trim().startsWith("#")) {
                // hash line signals end of header (but is not itself part of the header)
                headerComplete = true;
            } else {
                // no hash line seen so far, line seems to be part of header
                headersList.add(line);
            }
        }

        // it should never happen that times and labels do not have the same number of elements!
        assert timesList.size() == labelsList.size() : "number of times and labels differ";

        times = timesList.toArray(new Double[timesList.size()]);
        labels = labelsList.toArray(new String[labelsList.size()]);
        header = headersList.toArray(new String[headersList.size()]);
    }

    /**
     * getter method for times
     *
     * @return label end times as an array of Doubles, in file order
     */
    public Double[] getTimes() {
        return times;
    }

    /**
     * getter method for labels
     *
     * @return label symbols as an array of Strings, in file order
     */
    public String[] getLabelSymbols() {
        return labels;
    }

    /**
     * Build a {@link Labels} object from the parsed data, creating one {@link Label} per parsed line from its end time and its
     * label symbol.
     *
     * @return the parsed times and label symbols wrapped in a Labels object
     */
    public Labels getLabels() {
        assert times.length == labels.length;
        Label[] items = new Label[labels.length];
        for (int i = 0; i < items.length; i++) {
            items[i] = new Label(times[i], labels[i]);
        }
        return new Labels(items);
    }

    /**
     * getter method for header
     *
     * @return header lines as an array of Strings, in file order
     */
    public String[] getHeader() {
        return header;
    }
}