/**
* YoutubeScraper
* Copyright 22.03.2016 by Michael Peter Christen, @0rb1t3r
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package org.loklak.harvester;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import org.loklak.tools.CharacterCoding;
/**
 * Line-oriented scraper for the HTML of a YouTube video page.
 *
 * The page is read one line at a time and every recognized fragment is
 * written into a {@link JSONObject}. Most facts are stored as
 * subject/predicate/object triples: the JSON key is
 * {@code subject + "_" + predicate} and the value is a {@link JSONArray}
 * holding the distinct objects seen for that key. Counters (views, likes,
 * dislikes, subscribers) and the description are stored as plain values.
 */
public class YoutubeScraper {

    /** Shared worker pool; it is not used by any method of this class. */
    public final static ExecutorService executor = Executors.newFixedThreadPool(40);

    /** Plain HTML tags whose inner text is stored under the key {@code html_<tag>}. */
    private final static String[] html_tags = new String[]{"title"};

    /** Prefixes of meta attributes ({@code property="og:..."}, {@code name="twitter:..."}) to collect. */
    private final static String[] microformat_vocabularies = new String[]{"og", "twitter"};

    /** Thousands separators that are stripped before a counter is parsed as a long. */
    private final static Pattern numberfix = Pattern.compile(",|\\.");

    /**
     * Opening and closing paragraph tags. The character class {@code [^>]*}
     * keeps each match inside a single tag; a greedy {@code .*} would also
     * swallow the text between the first tag and the last '>' on the line.
     */
    private final static Pattern paragraph = Pattern.compile("<p[^>]*>|</p[^>]*>");

    /** Line break tag, turned into a newline inside the description. */
    private final static Pattern brend = Pattern.compile("<br />");

    /** An anchor element; group 1 is the link text. */
    private final static Pattern anchor_pattern = Pattern.compile("<a .*?>(.*?)</a>");

    /**
     * Parses a video page stored in a file.
     *
     * @param file the file holding the HTML of the page
     * @return the extracted metadata
     * @throws IOException if the file cannot be opened or read
     */
    public static JSONObject parseVideo(File file) throws IOException {
        // try-with-resources closes the stream even if parsing fails
        try (FileInputStream fis = new FileInputStream(file)) {
            return parseVideo(fis);
        }
    }

    /**
     * Parses a video page from a stream, decoding it as UTF-8.
     * The stream is closed when this method returns or throws.
     *
     * @param is the stream delivering the HTML of the page
     * @return the extracted metadata
     * @throws IOException if reading from the stream fails
     */
    public static JSONObject parseVideo(InputStream is) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
            return parseVideo(reader);
        }
    }

    /**
     * Parses a video page from a reader, one line at a time.
     *
     * A line that cannot be parsed (for example a counter that is not a
     * number) is reported on {@code System.err} and skipped; the remaining
     * lines are still processed. The reader is always closed before this
     * method returns or throws.
     *
     * @param br the reader delivering the HTML of the page
     * @return the extracted metadata
     * @throws IOException if reading from the reader fails
     */
    public static JSONObject parseVideo(final BufferedReader br) throws IOException {
        JSONObject json = new JSONObject(true);
        // parser modes: inside a <span itemprop=...> element / behind the license-category list header
        boolean parse_span = false, parse_license = false;
        String itemprop = "", itemtype = ""; // attributes of the span that is currently open
        try {
            String input;
            while ((input = br.readLine()) != null) {
                try {
                    input = input.trim();
                    // to debug or to add new fields, print 'input' here
                    int p;

                    if (parse_license) {
                        // the first list item after the header carries either a category link or the license text
                        if ((p = input.indexOf("<li")) >= 0) {
                            String tag = parseTag(input, p);
                            if (tag == null) continue;
                            if (tag.startsWith("<a ")) {
                                addRDF(new String[]{"youtube", "category", parseTag(tag, 0)}, json);
                            } else {
                                addRDF(new String[]{"youtube", "license", tag}, json);
                            }
                            parse_license = false;
                        }
                        continue;
                    }

                    if (parse_span) {
                        if ((p = input.indexOf("itemprop=\"")) >= 0) {
                            String[] token = parseItemprop(input, p, new String[]{"href", "content"}, "");
                            if (token == null) continue;
                            int q = itemtype.indexOf("//");
                            if (q < 0) continue;
                            // the subject is the itemtype url without its scheme, with '.' and '/' flattened to '_'
                            String subject = itemtype.substring(q + 2).replace('.', '_').replace('/', '_');
                            String predicate = itemprop + "_" + token[1];
                            addRDF(new String[]{subject, predicate, token[2]}, json);
                        } else if (input.indexOf("</span>") >= 0) {
                            parse_span = false;
                        }
                        continue;
                    }

                    for (String tag : html_tags) {
                        if ((p = input.indexOf("<" + tag)) >= 0) {
                            addRDF(new String[]{"html", tag, parseTag(input, p)}, json);
                        }
                    }
                    for (String subject : microformat_vocabularies) {
                        if ((p = input.indexOf("property=\"" + subject + ":")) >= 0) {
                            addRDF(parseMicroformat(input, "property", p), json);
                        } else if ((p = input.indexOf("name=\"" + subject + ":")) >= 0) {
                            addRDF(parseMicroformat(input, "name", p), json);
                        }
                    }
                    if ((p = input.indexOf("span itemprop=\"")) >= 0) {
                        String[] token = parseItemprop(input, p, new String[]{"itemtype"}, "");
                        if (token == null) continue;
                        itemprop = token[1];
                        itemtype = token[2];
                        parse_span = true;
                        continue;
                    }
                    if ((p = input.indexOf("itemprop=\"")) >= 0) {
                        String[] token = parseItemprop(input, p, new String[]{"content"}, "youtube");
                        if (token == null) continue;
                        addRDF(token, json);
                        continue;
                    }
                    if (input.indexOf("class=\"content watch-info-tag-list") >= 0) {
                        parse_license = true;
                        continue;
                    }
                    if ((p = input.indexOf("yt-subscriber-count")) >= 0) {
                        String subscriber_string = parseProp(input, p, "title");
                        if (subscriber_string == null) continue;
                        json.put("youtube_subscriber", parseNumber(subscriber_string));
                        continue;
                    }
                    if (input.indexOf("\"like this") > 0 && (p = input.indexOf("yt-uix-button-content")) >= 0) {
                        String likes_string = parseTag(input, p);
                        if (likes_string == null || likes_string.length() == 0) continue;
                        json.put("youtube_likes", parseNumber(likes_string));
                        continue;
                    }
                    if (input.indexOf("\"dislike this") > 0 && (p = input.indexOf("yt-uix-button-content")) >= 0) {
                        String dislikes_string = parseTag(input, p);
                        if (dislikes_string == null || dislikes_string.length() == 0) continue;
                        json.put("youtube_dislikes", parseNumber(dislikes_string));
                        continue;
                    }
                    if ((p = input.indexOf("watch-view-count")) >= 0) {
                        String viewcount_string = parseTag(input, p);
                        if (viewcount_string == null) continue;
                        viewcount_string = viewcount_string.replace(" views", "");
                        if (viewcount_string.length() == 0) continue;
                        long viewcount = 0;
                        // a video without views shows a localized word ("No") instead of a number;
                        // it cannot be recognized in every language, so a parse failure counts as zero views
                        try {
                            viewcount = parseNumber(viewcount_string);
                        } catch (NumberFormatException ignored) {
                            // deliberately ignored, see above
                        }
                        json.put("youtube_viewcount", viewcount);
                        continue;
                    }
                    if ((p = input.indexOf("watch?v=")) >= 0) {
                        p += 8; // length of "watch?v="
                        int q = input.indexOf("\"", p);
                        if (q > 0) {
                            String videoid = input.substring(p, q);
                            int r = videoid.indexOf('&');
                            if (r > 0) videoid = videoid.substring(0, r); // drop further url parameters
                            addRDF(new String[]{"youtube", "next", videoid}, json);
                            continue;
                        }
                    }
                    if ((p = input.indexOf("playlist-header-content")) >= 0) {
                        String playlist_title = parseProp(input, p, "data-list-title");
                        if (playlist_title == null) continue;
                        addRDF(new String[]{"youtube", "playlist_title", playlist_title}, json);
                        continue;
                    }
                    if ((p = input.indexOf("yt-uix-scroller-scroll-unit")) >= 0) {
                        String playlist_videoid = parseProp(input, p, "data-video-id");
                        if (playlist_videoid == null) continue;
                        addRDF(new String[]{"youtube", "playlist_videoid", playlist_videoid}, json);
                        continue;
                    }
                    if ((p = input.indexOf("watch-description-text")) >= 0) {
                        int open = input.indexOf('>', p);
                        if (open < 0) continue; // the opening tag does not end on this line
                        int q = input.indexOf("</div", open);
                        String text = input.substring(open + 1, q < 0 ? input.length() : q);
                        json.put("youtube_description", cleanDescription(text));
                        continue;
                    }
                } catch (RuntimeException e) {
                    // a single malformed line must neither abort the scrape nor terminate the JVM
                    System.err.println("YoutubeScraper: skipping line that could not be parsed: " + input);
                    e.printStackTrace();
                }
            }
        } finally {
            br.close();
        }
        return json;
    }

    /**
     * Turns the HTML of the description box into plain text: line breaks
     * become newlines, paragraph tags are removed, anchors are replaced by
     * their link text followed by a space, and HTML entities are decoded
     * by {@code CharacterCoding.html2unicode}.
     *
     * @param html the raw inner HTML of the description element
     * @return the plain text description
     */
    private static String cleanDescription(String html) {
        String text = paragraph.matcher(brend.matcher(html).replaceAll("\n")).replaceAll("").trim();
        Matcher m;
        while ((m = anchor_pattern.matcher(text)).find()) {
            // quoteReplacement: link texts may contain '$' or '\', which are special in a replacement string
            text = m.replaceFirst(Matcher.quoteReplacement(m.group(1) + " "));
        }
        return CharacterCoding.html2unicode(text);
    }

    /**
     * Parses a counter such as {@code 1,234,567} or {@code 1.234.567}.
     *
     * @param n the number as shown on the page
     * @return the numeric value with all ',' and '.' removed
     * @throws NumberFormatException if the remaining text is not a long
     */
    private static long parseNumber(String n) throws NumberFormatException {
        return Long.parseLong(numberfix.matcher(n).replaceAll(""));
    }

    /**
     * Reads a meta element of the form
     * {@code <key>="<subject>:<predicate>" ... content="<object>"}.
     *
     * @param line  the line to scan
     * @param key   the attribute holding the vocabulary term ("property" or "name")
     * @param start the position where the scan begins
     * @return the triple {subject, predicate, object} with ':' replaced by '_'
     *         in subject and predicate, or null if the line does not have that form
     */
    private static String[] parseMicroformat(String line, String key, int start) {
        int p = line.indexOf(key + "=\"", start);
        if (p < 0) return null;
        p += key.length() + 2; // skip key, '=' and the opening quote
        int c = line.indexOf(":", p);
        if (c < 0) return null;
        int q = line.indexOf("\"", c);
        if (q < 0) return null;
        int r = line.indexOf("content=\"", q);
        if (r < 0) return null;
        r += 9; // length of content="
        int s = line.indexOf("\"", r);
        if (s < 0) return null;
        String subject = line.substring(p, c).replace(':', '_');
        String predicate = line.substring(c + 1, q).replace(':', '_');
        String object = line.substring(r, s);
        return new String[]{subject, predicate, object};
    }

    /**
     * Reads an element carrying an {@code itemprop} attribute and takes the
     * object from the first attribute of {@code objectnames} that occurs
     * behind the itemprop value.
     *
     * @param line        the line to scan
     * @param start       the position where the scan begins
     * @param objectnames candidate attribute names for the object, in order of preference
     * @param subject     the subject to put into the triple
     * @return the triple {subject, itemprop value, object}, or null if the
     *         itemprop or none of the object attributes is found
     */
    private static String[] parseItemprop(String line, int start, String[] objectnames, String subject) {
        int p = line.indexOf("itemprop=\"", start);
        if (p < 0) return null;
        p += 10; // length of itemprop="
        int q = line.indexOf("\"", p);
        if (q < 0) return null;
        int r = -1;
        for (String objectname : objectnames) {
            r = line.indexOf(objectname + "=\"", q);
            if (r >= 0) {
                r += objectname.length() + 2;
                break;
            }
        }
        if (r < 0) return null;
        int s = line.indexOf("\"", r);
        if (s < 0) return null;
        String predicate = line.substring(p, q).replace(':', '_');
        String object = line.substring(r, s);
        return new String[]{subject, predicate, object};
    }

    /**
     * Stores a triple in the result. The object is entity-decoded and
     * appended to the array stored under {@code subject_predicate} unless
     * an equal string is already in that array. Triples that are null or
     * have a null or empty element are ignored.
     *
     * @param spo  the triple {subject, predicate, object}; may be null
     * @param json the result object to write into
     */
    private static void addRDF(String[] spo, JSONObject json) {
        if (spo == null || spo[0] == null || spo[1] == null || spo[2] == null) return;
        String subject = spo[0];
        String predicate = spo[1];
        String object = CharacterCoding.html2unicode(spo[2]);
        if (object == null || subject.length() == 0 || predicate.length() == 0 || object.length() == 0) return;
        String key = subject + "_" + predicate;
        JSONArray objects = json.optJSONArray(key);
        if (objects == null) {
            // first object for this key (or the key held something that is not an array)
            objects = new JSONArray();
            json.put(key, objects);
        }
        // the array acts as a set: do not store the same string twice
        for (Object o : objects) {
            if (object.equals(o)) return;
        }
        objects.put(object);
    }

    /**
     * Reads the value of the attribute {@code key="..."}.
     *
     * @param line  the line to scan
     * @param start the position where the scan begins
     * @param key   the attribute name
     * @return the attribute value, or null if the attribute or its closing quote is missing
     */
    private static String parseProp(String line, int start, String key) {
        int p = line.indexOf(key + "=\"", start);
        if (p < 0) return null;
        int valueStart = p + key.length() + 2; // skip key, '=' and the opening quote
        int q = line.indexOf('"', valueStart);
        if (q < 0) return null;
        return line.substring(valueStart, q);
    }

    /**
     * Returns the inner content of the element whose opening tag ends at
     * the first '>' at or behind {@code start}. Nested tags are counted so
     * that the content ends at the matching closing tag.
     *
     * @param line  the line to scan
     * @param start a position inside or before the opening tag
     * @return the trimmed inner content; the empty string if no matching
     *         closing tag is found on the line; null if there is no '>'
     */
    private static String parseTag(String line, int start) {
        int p = line.indexOf('>', start);
        if (p < 0) return null;
        int c = 1; // number of open tags; the opening tag that was just passed counts as one
        int q = p + 1; // start the scan right behind the opening tag
        while (c > 0 && q < line.length() - 1) {
            if (line.charAt(q) == '<') {
                char next = line.charAt(q + 1);
                // tags starting with 'i' are not counted as opened
                // NOTE(review): presumably meant for void tags such as <img>/<input>; confirm
                if (next == '/') {
                    c--;
                } else if (next != 'i') {
                    c++;
                }
            }
            q++;
        }
        if (c != 0) return "";
        // q is one past the '<' of the matching closing tag
        return line.substring(p + 1, q - 1).trim();
    }
}