package io.lumify.youtube; import io.lumify.core.ingest.video.VideoTranscript; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; public class YoutubeccReader { public static VideoTranscript read(File file) throws Exception { FileInputStream in = new FileInputStream(file); try { return read(in); } finally { in.close(); } } public static VideoTranscript read(InputStream in) throws Exception { VideoTranscript videoTranscript = new VideoTranscript(); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document doc = db.parse(in); NodeList textElements = doc.getElementsByTagName("text"); for (int i = 0; i < textElements.getLength(); i++) { Element textElement = (Element) textElements.item(i); double start = Double.parseDouble(textElement.getAttribute("start")); double duration = Double.parseDouble(textElement.getAttribute("dur")); String text = textElement.getTextContent().trim(); VideoTranscript.Time time = new VideoTranscript.Time((long) (start * 1000), (long) ((start + duration) * 1000)); videoTranscript.add(time, text); } return videoTranscript; } }