/*
* Copyright (C) 2014 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.android.exoplayer.text.webvtt;
import com.google.android.exoplayer.C;
import com.google.android.exoplayer.ParserException;
import com.google.android.exoplayer.text.Cue;
import com.google.android.exoplayer.text.SubtitleParser;
import com.google.android.exoplayer.util.MimeTypes;
import android.text.Html;
import android.text.Layout.Alignment;
import android.util.Log;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A simple WebVTT parser.
 *
 * <p>The parser validates the {@code WEBVTT} file header, skips the optional metadata header
 * lines, and then reads cue blocks. For every cue it extracts the start and end timestamps, the
 * optional {@code line}, {@code align}, {@code position} and {@code size} cue settings, and the
 * cue text, which is converted through {@link Html#fromHtml(String)}.
 *
 * <p>Instances reuse an internal {@link StringBuilder} while parsing cue text.
 *
 * @see <a href="http://dev.w3.org/html5/webvtt">WebVTT specification</a>
 */
public final class WebvttParser implements SubtitleParser {

  private static final String TAG = "WebvttParser";

  // Optional BOM, the literal "WEBVTT", then optionally a space or tab followed by anything.
  private static final String WEBVTT_FILE_HEADER_STRING = "^\uFEFF?WEBVTT((\\u0020|\u0009).*)?$";
  private static final Pattern WEBVTT_FILE_HEADER =
      Pattern.compile(WEBVTT_FILE_HEADER_STRING);

  // A metadata header line must contain a "name:value" or "name=value" token.
  private static final String WEBVTT_METADATA_HEADER_STRING = "\\S*[:=]\\S*";
  private static final Pattern WEBVTT_METADATA_HEADER =
      Pattern.compile(WEBVTT_METADATA_HEADER_STRING);

  // Any line that does not contain "-->" is considered a cue identifier (this includes "").
  private static final String WEBVTT_CUE_IDENTIFIER_STRING = "^(?!.*(-->)).*$";
  private static final Pattern WEBVTT_CUE_IDENTIFIER =
      Pattern.compile(WEBVTT_CUE_IDENTIFIER_STRING);

  // [hours:]mm:ss.SSS
  private static final String WEBVTT_TIMESTAMP_STRING = "(\\d+:)?[0-5]\\d:[0-5]\\d\\.\\d{3}";
  private static final Pattern WEBVTT_TIMESTAMP = Pattern.compile(WEBVTT_TIMESTAMP_STRING);

  // A single "name:value" cue setting.
  private static final String WEBVTT_CUE_SETTING_STRING = "\\S*:\\S*";
  private static final Pattern WEBVTT_CUE_SETTING = Pattern.compile(WEBVTT_CUE_SETTING_STRING);

  // Matches any string that contains at least one character that is not an ASCII digit.
  private static final String NON_NUMERIC_STRING = ".*[^0-9].*";

  private final StringBuilder textBuilder;
  private final boolean strictParsing;

  /**
   * Equivalent to {@code WebvttParser(false)}.
   */
  public WebvttParser() {
    this(false);
  }

  /**
   * @param strictParsing If true, {@link #parse(InputStream)} will throw a {@link ParserException}
   *     if the stream contains invalid data. If false, the parser will make a best effort to ignore
   *     minor errors in the stream. Note however that a {@link ParserException} will still be
   *     thrown when this is not possible.
   */
  public WebvttParser(boolean strictParsing) {
    this.strictParsing = strictParsing;
    textBuilder = new StringBuilder();
  }

  /**
   * Parses a WebVTT document from the given stream, decoding it as UTF-8.
   *
   * @param inputStream The stream to read the WebVTT document from.
   * @return A {@link WebvttSubtitle} holding the cues in the order they were read.
   * @throws ParserException If the file header is missing or malformed, if the header is not
   *     terminated by an empty line, if (in strict mode) a header line is not a metadata entry, or
   *     if a cue's timing line does not contain two valid timestamps.
   * @throws IOException If reading from the stream fails.
   */
  @Override
  public final WebvttSubtitle parse(InputStream inputStream) throws IOException {
    ArrayList<WebvttCue> subtitles = new ArrayList<>();
    BufferedReader webvttData = new BufferedReader(new InputStreamReader(inputStream, C.UTF8_NAME));
    String line;

    // file should start with "WEBVTT"
    line = webvttData.readLine();
    if (line == null || !WEBVTT_FILE_HEADER.matcher(line).matches()) {
      throw new ParserException("Expected WEBVTT. Got " + line);
    }

    // parse the remainder of the header
    while (true) {
      line = webvttData.readLine();
      if (line == null) {
        // we reached EOF before finishing the header
        throw new ParserException("Expected an empty line after webvtt header");
      } else if (line.isEmpty()) {
        // we've read the newline that separates the header from the body
        break;
      }

      if (strictParsing) {
        Matcher matcher = WEBVTT_METADATA_HEADER.matcher(line);
        if (!matcher.find()) {
          throw new ParserException("Unexpected line: " + line);
        }
      }
    }

    // process the cues and text
    while ((line = webvttData.readLine()) != null) {
      if (line.isEmpty()) {
        // Blocks may be separated by more than one blank line (and the file may end with
        // several). Without this check a blank line would be mistaken for a cue identifier.
        continue;
      }

      // parse the cue identifier (if present)
      if (WEBVTT_CUE_IDENTIFIER.matcher(line).find()) {
        // ignore the identifier (we currently don't use it) and read the next line
        line = webvttData.readLine();
        if (line == null) {
          // The stream ended right after the identifier: there is no cue left to parse.
          break;
        }
      }

      int lineNum = Cue.UNSET_VALUE;
      int position = Cue.UNSET_VALUE;
      Alignment alignment = null;
      int size = Cue.UNSET_VALUE;

      // parse the cue timestamps: the first match is the start time, the second the end time
      Matcher timestampMatcher = WEBVTT_TIMESTAMP.matcher(line);
      if (!timestampMatcher.find()) {
        throw new ParserException("Expected cue start time: " + line);
      }
      String startTimeString = timestampMatcher.group();
      if (!timestampMatcher.find()) {
        throw new ParserException("Expected cue end time: " + line);
      }
      String endTimeString = timestampMatcher.group();
      // Offset just past the end timestamp. Using the matcher's position (rather than searching
      // for the end time text) stays correct when that text also occurs in the start timestamp.
      int settingsStart = timestampMatcher.end();

      long startTime;
      long endTime;
      try {
        startTime = parseTimestampUs(startTimeString);
        endTime = parseTimestampUs(endTimeString);
      } catch (NumberFormatException e) {
        // e.g. an hour field too large for a long; report it as a parse failure instead of
        // letting an unchecked exception escape.
        throw new ParserException("Invalid cue timestamp: " + line);
      }

      // parse the (optional) cue setting list
      Matcher settingMatcher = WEBVTT_CUE_SETTING.matcher(line.substring(settingsStart));
      while (settingMatcher.find()) {
        // The pattern guarantees that the match contains ':', so the split yields two parts.
        String[] parts = settingMatcher.group().split(":", 2);
        String name = parts[0];
        String value = parts[1];

        try {
          if ("line".equals(name)) {
            if (value.endsWith("%")) {
              lineNum = parseIntPercentage(value);
            } else if (value.matches(NON_NUMERIC_STRING)) {
              Log.w(TAG, "Invalid line value: " + value);
            } else {
              lineNum = Integer.parseInt(value);
            }
          } else if ("align".equals(name)) {
            // TODO: handle for RTL languages
            if ("start".equals(value)) {
              alignment = Alignment.ALIGN_NORMAL;
            } else if ("middle".equals(value)) {
              alignment = Alignment.ALIGN_CENTER;
            } else if ("end".equals(value)) {
              alignment = Alignment.ALIGN_OPPOSITE;
            } else if ("left".equals(value)) {
              alignment = Alignment.ALIGN_NORMAL;
            } else if ("right".equals(value)) {
              alignment = Alignment.ALIGN_OPPOSITE;
            } else {
              Log.w(TAG, "Invalid align value: " + value);
            }
          } else if ("position".equals(name)) {
            position = parseIntPercentage(value);
          } else if ("size".equals(name)) {
            size = parseIntPercentage(value);
          } else {
            Log.w(TAG, "Unknown cue setting " + name + ":" + value);
          }
        } catch (NumberFormatException e) {
          // An invalid setting value is logged and the setting is left unset.
          Log.w(TAG, name + " contains an invalid value " + value, e);
        }
      }

      // parse text: every line up to the next empty line (or EOF), joined with <br>
      textBuilder.setLength(0);
      while (((line = webvttData.readLine()) != null) && (!line.isEmpty())) {
        if (textBuilder.length() > 0) {
          textBuilder.append("<br>");
        }
        textBuilder.append(line.trim());
      }
      CharSequence text = Html.fromHtml(textBuilder.toString());

      WebvttCue cue = new WebvttCue(startTime, endTime, text, lineNum, position, alignment, size);
      subtitles.add(cue);
    }

    return new WebvttSubtitle(subtitles);
  }

  /**
   * Returns whether the given mime type equals {@link MimeTypes#TEXT_VTT}.
   *
   * @param mimeType The mime type to check.
   * @return True if {@code mimeType} equals {@link MimeTypes#TEXT_VTT}, false otherwise.
   */
  @Override
  public final boolean canParse(String mimeType) {
    return MimeTypes.TEXT_VTT.equals(mimeType);
  }

  /**
   * Parses an integer percentage such as {@code "50%"}.
   *
   * @param s The string to parse. Must consist of ASCII digits followed by {@code '%'}.
   * @return The percentage, in the range [0, 100].
   * @throws NumberFormatException If {@code s} does not end with {@code '%'}, contains a
   *     non-digit character before it, has no digits, or is outside [0, 100].
   */
  private static int parseIntPercentage(String s) throws NumberFormatException {
    if (!s.endsWith("%")) {
      throw new NumberFormatException(s + " doesn't end with '%'");
    }

    s = s.substring(0, s.length() - 1);
    if (s.matches(NON_NUMERIC_STRING)) {
      throw new NumberFormatException(s + " contains an invalid character");
    }

    int value = Integer.parseInt(s);
    if (value < 0 || value > 100) {
      throw new NumberFormatException(value + " is out of range [0-100]");
    }
    return value;
  }

  /**
   * Converts a WebVTT timestamp of the form {@code [hours:]mm:ss.SSS} to microseconds.
   *
   * @param s The timestamp string.
   * @return The timestamp in microseconds.
   * @throws NumberFormatException If {@code s} is not a well formed timestamp, or if one of its
   *     numeric fields cannot be parsed as a {@code long}.
   */
  private static long parseTimestampUs(String s) throws NumberFormatException {
    // Reuse the precompiled pattern instead of recompiling the expression on every call.
    if (!WEBVTT_TIMESTAMP.matcher(s).matches()) {
      throw new NumberFormatException("has invalid format");
    }

    String[] parts = s.split("\\.", 2);
    // Fold [hours:]minutes:seconds into a number of seconds.
    long value = 0;
    for (String group : parts[0].split(":")) {
      value = value * 60 + Long.parseLong(group);
    }
    // seconds -> milliseconds (adding the fractional part) -> microseconds
    return (value * 1000 + Long.parseLong(parts[1])) * 1000;
  }
}