/*
* This file is part of ALOE.
*
* ALOE is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
* ALOE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
* You should have received a copy of the GNU General Public License
* along with ALOE. If not, see <http://www.gnu.org/licenses/>.
*
* Copyright (c) 2012 SCCL, University of Washington (http://depts.washington.edu/sccl)
*/
package etc.aloe.data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
/**
* Represents a collection of segments. Knows how to transform itself into Weka
* Instances with basic feature information.
*
* @author Michael Brooks <mjbrooks@uw.edu>
*/
public class SegmentSet {
public static final String DURATION_ATTR_NAME = "duration";
public static final String LENGTH_ATTR_NAME = "length";
public static final String CPS_ATTR_NAME = "cps";
public static final String RATE_ATTR_NAME = "rate";
private List<Segment> segments = new ArrayList<Segment>();
/**
* Add a segment to the set.
*
* @param segment
*/
public void add(Segment segment) {
this.segments.add(segment);
}
/**
* Get the size of the segment set.
*
* @return
*/
public int size() {
return this.segments.size();
}
/**
* Get the underlying list of segments.
*
* @return
*/
public List<Segment> getSegments() {
return segments;
}
/**
* Add a bunch of segments to the set.
*
* @param segments
*/
public void addAll(List<Segment> segments) {
this.segments.addAll(segments);
}
/**
* Set the underlying list of segments.
*
* @param segments
*/
public void setSegments(List<Segment> segments) {
this.segments = segments;
}
/**
* Convert the segment set into an ExampleSet (ready for feature
* extraction). The returned example set includes an id attribute, the
* message text, a label attribute, and several basic features extracted
* from the segment.
*
* @return
*/
public ExampleSet getBasicExamples() {
ArrayList<Attribute> attributes = new ArrayList<Attribute>();
attributes.add(new Attribute(ExampleSet.ID_ATTR_NAME));
attributes.add(new Attribute(ExampleSet.MESSAGE_ATTR_NAME, (List<String>) null));
attributes.add(new Attribute(ExampleSet.LABEL_ATTR_NAME, Arrays.asList(new String[]{"false", "true"})));
attributes.add(new Attribute(ExampleSet.PARTICIPANT_ATTR_NAME, (List<String>) null));
attributes.add(new Attribute(DURATION_ATTR_NAME));
attributes.add(new Attribute(LENGTH_ATTR_NAME));
attributes.add(new Attribute(CPS_ATTR_NAME));
attributes.add(new Attribute(RATE_ATTR_NAME));
Instances instances = new Instances("BasicExamples", attributes, 0);
instances.setClassIndex(2);
Attribute idAttr = instances.attribute(ExampleSet.ID_ATTR_NAME);
Attribute messageAttr = instances.attribute(ExampleSet.MESSAGE_ATTR_NAME);
Attribute labelAttr = instances.attribute(ExampleSet.LABEL_ATTR_NAME);
Attribute participantAttr = instances.attribute(ExampleSet.PARTICIPANT_ATTR_NAME);
Attribute durationAttr = instances.attribute(DURATION_ATTR_NAME);
Attribute lengthAttr = instances.attribute(LENGTH_ATTR_NAME);
Attribute cpsAttr = instances.attribute(CPS_ATTR_NAME);
Attribute rateAttr = instances.attribute(RATE_ATTR_NAME);
for (int i = 0; i < size(); i++) {
Segment segment = get(i);
Instance instance = new DenseInstance(instances.numAttributes());
String messageStr = segment.concatMessages();
String participantStr = segment.concatParticipants();
instance.setValue(idAttr, segment.getId());
instance.setValue(messageAttr, messageStr);
instance.setValue(participantAttr, participantStr);
if (segment.hasTrueLabel()) {
instance.setValue(labelAttr, segment.getTrueLabel() ? "true" : "false");
}
computeRateValues(segment, instance, messageStr, durationAttr, lengthAttr, cpsAttr, rateAttr);
instances.add(instance);
}
return new ExampleSet(instances);
}
/**
* Get the ith segment.
*
* @param i
* @return
*/
public Segment get(int i) {
return this.segments.get(i);
}
/**
* Return a new segment set containing only the labeled segments.
*
* @return
*/
public SegmentSet onlyLabeled() {
SegmentSet labeled = new SegmentSet();
for (Segment segment : segments) {
if (segment.hasTrueLabel()) {
labeled.add(segment);
}
}
return labeled;
}
/**
* Computes the basic timing-related features about a segment and applies
* them to the given instance.
*
* @param segment
* @param instance
* @param messageStr
* @param durationAttr
* @param lengthAttr
* @param cpsAttr
* @param rateAttr
*/
private void computeRateValues(Segment segment, Instance instance, String messageStr, Attribute durationAttr, Attribute lengthAttr, Attribute cpsAttr, Attribute rateAttr) {
double duration = segment.getDurationInSeconds();
double length = segment.getMessages().size();
//If the length is 1, then we correct the duration.
//Assume average typing speed (35 words per minute, 5 char/word)
if (length <= 1) {
double averageCharPerSecond = 35.0 * 5.0 / 60.0;
//[seconds] = [chars] / ([chars]/[seconds])
duration = (1 + messageStr.length()) / averageCharPerSecond;
}
if (duration > 100000) {
System.err.println("Wacky segment id: " + segment.getId() + " has duration: " + duration);
}
double cps = messageStr.length() / duration;
double rate = segment.getMessages().size() / duration;
instance.setValue(durationAttr, duration);
instance.setValue(lengthAttr, length);
instance.setValue(cpsAttr, cps);
instance.setValue(rateAttr, rate);
}
/**
* Counts the number of segments that have the given label (true, false, or
* null).
*
* @param label
* @return
*/
public int getCountWithTrueLabel(Boolean label) {
int count = 0;
for (Segment segment : segments) {
if (segment.getTrueLabel() == label) {
count++;
}
}
return count;
}
/**
* Get all the messages in the segments in this segment set.
* @return
*/
public MessageSet getMessages(MessageSet template) {
MessageSet messages = new MessageSet();
messages.setDateFormat(template.getDateFormat());
for (Segment s : segments) {
messages.addAll(s.getMessages());
}
return messages;
}
}