/*
* This file is part of Flicklib.
*
* Copyright (C) Francis De Brabandere
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.flicklib.service.movie.imdb;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.flicklib.api.Parser;
import com.flicklib.domain.MoviePage;
import com.flicklib.domain.MovieType;
import com.flicklib.tools.AdvancedTextExtractor;
import com.flicklib.tools.ElementOnlyTextExtractor;
import com.flicklib.tools.SimpleXPath;
import com.google.inject.Inject;
import com.google.inject.Singleton;
/**
 * Fills a {@link MoviePage} from the HTML of an IMDb movie page.
 * <p>
 * The page is inspected with several alternative strategies (for example
 * {@code onclick}-tagged links versus {@code itemprop}-tagged links, and three
 * different places where a rating may be found); later strategies are only
 * consulted when earlier ones did not yield a value. Markup that is missing is
 * skipped instead of raising an exception, and values that cannot be converted
 * to numbers are logged and left unset.
 *
 * @author francisdb
 */
@Singleton
public class ImdbParser implements Parser {

    private static final Logger LOGGER = LoggerFactory.getLogger(ImdbParser.class);

    /**
     * Matches a title-header span of the form {@code (<non-digits><digits>)},
     * e.g. {@code (Video 2009)}: group 1 is the optional type label, group 2 the year.
     */
    private static final Pattern TYPE_AND_YEAR = Pattern.compile("\\((\\D*)(\\d+)\\)");

    /** Lower-cased type label found in the title header mapped to its {@link MovieType}. */
    private static final Map<String, MovieType> MOVIE_TYPES = new HashMap<String, MovieType>();
    static {
        MOVIE_TYPES.put("video", MovieType.VIDEO_MOVIE);
    }

    @Inject
    public ImdbParser() {
    }

    /**
     * Parses the given page into {@code movieSite}.
     *
     * @param htmlSource the fetched page; both its raw content and its Jericho source are read
     * @param movieSite the page object that receives the extracted values
     */
    @Override
    public final void parse(com.flicklib.service.Source htmlSource, MoviePage movieSite) {
        parse(htmlSource.getContent(), htmlSource.getJerichoSource(), movieSite);
    }

    /**
     * Runs every extraction step in a fixed order; the order matters because
     * later steps only run when an earlier one left the corresponding field unset.
     */
    private void parse(final String html, Source source, MoviePage movie) {
        ImdbParserRegex regexParser = new ImdbParserRegex(html);
        movie.setType(regexParser.getType());

        // A page without <h1> has no title; fall through so that the
        // "Not found" marker below is set instead of failing on get(0).
        List<Element> headers = source.getAllElements(HTMLElementName.H1);
        if (!headers.isEmpty()) {
            parseTitleHeader(headers.get(0), movie);
        }

        parseLinks(source, movie);

        parseBoldUserRating(source, movie);
        if (movie.getScore() == null) {
            parseStarbarRating(source, movie);
        }
        if (movie.getScore() == null) {
            parseStarBoxRating(source, movie);
        }

        parseInfoSections(source, movie);
        if (movie.getPlot() == null) {
            parseDescription(source, movie);
        }
        if (movie.getTitle() == null) {
            movie.setPlot("Not found");
        }
        if (movie.getImgUrl() == null) {
            parsePrimaryImage(source, movie);
        }
    }

    /** Extracts title, original title, year and (optionally) movie type from the first {@code <h1>}. */
    private void parseTitleHeader(Element titleHeader, MoviePage movie) {
        String title = new ElementOnlyTextExtractor(titleHeader.getContent()).toString();
        movie.setTitle(ImdbParserRegex.cleanTitle(title));

        List<Element> extraTitle = titleHeader.getAllElements("class", "title-extra", false);
        if (!extraTitle.isEmpty()) {
            AdvancedTextExtractor at = new AdvancedTextExtractor(extraTitle.get(0), false).addExcludedTagName("i");
            movie.setOriginalTitle(at.toString());
        }

        // Preferred source of the year: the first link inside the header.
        List<Element> yearLinks = titleHeader.getAllElements(HTMLElementName.A);
        if (!yearLinks.isEmpty()) {
            setYear(movie, yearLinks.get(0).getContent().getTextExtractor().toString());
            return;
        }

        // Otherwise look for a span such as "(2009)" or "(Video 2009)".
        for (Element span : titleHeader.getAllElements(HTMLElementName.SPAN)) {
            String txt = span.getTextExtractor().toString();
            Matcher m = TYPE_AND_YEAR.matcher(txt);
            if (!m.matches()) {
                continue;
            }
            // Locale.ROOT keeps the lookup key stable regardless of the default locale.
            String type = m.group(1).trim().toLowerCase(Locale.ROOT);
            MovieType mType = MOVIE_TYPES.get(type);
            String year = m.group(2);
            LOGGER.info("found element with year :" + txt + " -> " + year + " type:" + type + ", " + mType);
            if (mType != null) {
                movie.setType(mType);
            }
            setYear(movie, year);
        }
    }

    /** Scans every link for poster image, genres, languages, actors and directors. */
    private void parseLinks(Source source, MoviePage movie) {
        for (Element link : source.getAllElements(HTMLElementName.A)) {
            if ("poster".equals(link.getAttributeValue("name"))) {
                // The poster anchor wraps the image; tolerate an anchor without one.
                List<Element> imgs = link.getContent().getAllElements(HTMLElementName.IMG);
                if (!imgs.isEmpty()) {
                    movie.setImgUrl(imgs.get(0).getAttributeValue("src"));
                }
            }

            final String linkContent = link.getContent().getTextExtractor().toString();

            String href = link.getAttributeValue("href");
            if (href != null) {
                if (href.contains("/Sections/Genres/") || href.contains("/genre/")) {
                    // TODO find a better way to parse these out, make sure these are only the movie genres
                    if (!linkContent.toLowerCase(Locale.ROOT).contains("imdb")) {
                        movie.addGenre(linkContent);
                    }
                }
                if (href.contains("/Sections/Languages/")) {
                    movie.addLanguage(linkContent);
                }
            }

            // e.g. <a href="/name/nm0000206/" onclick="(new Image()).src='/rg/castlist/position-1/images/b.gif?link=/name/nm0000206/';">Keanu Reeves</a>
            String onclick = link.getAttributeValue("onclick");
            if (onclick != null) {
                if (onclick.contains("castlist")) {
                    movie.getActors().add(linkContent);
                }
                // e.g. <a href="/name/nm0905154/" onclick="(new Image()).src='/rg/directorlist/position-2/images/b.gif?link=name/nm0905154/';">Larry Wachowski</a>
                if (onclick.contains("directorlist")) {
                    movie.getDirectors().add(linkContent);
                }
            }

            String itemprop = link.getAttributeValue("itemprop");
            if ("director".equals(itemprop)) {
                movie.getDirectors().add(linkContent);
            }
            if ("actors".equals(itemprop)) {
                movie.getActors().add(linkContent);
            }
        }
    }

    /** Reads rating and votes from the elements following a {@code <b>} containing "User Rating:". */
    private void parseBoldUserRating(Source source, MoviePage movie) {
        for (Element bold : source.getAllElements(HTMLElementName.B)) {
            if (!bold.getContent().getTextExtractor().toString().contains("User Rating:")) {
                continue;
            }
            // getEnd() equals the end of the end tag when one exists and, unlike
            // getEndTag().getEnd(), does not fail when the end tag is missing.
            Element ratingElement = source.getNextElement(bold.getEnd());
            if (ratingElement == null) {
                continue;
            }
            String rating = ratingElement.getContent().getTextExtractor().toString();
            // skip (awaiting 5 votes)
            if (!rating.contains("awaiting")) {
                parseRatingString(movie, rating);
                parseVotes(movie, source.getNextElement(ratingElement.getEnd()));
            }
        }
    }

    /** Reads rating and votes from elements with class {@code starbar-meta}. */
    private void parseStarbarRating(Source source, MoviePage movie) {
        for (Element meta : source.getAllElements("class", "starbar-meta", false)) {
            List<Element> boldElements = meta.getAllElements(HTMLElementName.B);
            if (!boldElements.isEmpty()) {
                String rating = boldElements.get(0).getTextExtractor().toString();
                if (!rating.contains("awaiting")) {
                    parseRatingString(movie, rating);
                }
            }
            List<Element> anchors = meta.getAllElements(HTMLElementName.A);
            if (!anchors.isEmpty()) {
                Element votesLink = anchors.get(0);
                if (votesLink.getTextExtractor().toString().contains("votes")) {
                    parseVotes(movie, votesLink);
                }
            }
        }
    }

    /** Reads rating and votes from {@code itemprop} spans inside elements with class {@code star-box-details}. */
    private void parseStarBoxRating(Source source, MoviePage movie) {
        for (Element box : source.getAllElements("class", "star-box-details", false)) {
            for (Element span : box.getAllElements("span")) {
                String itemprop = span.getAttributeValue("itemprop");
                if ("ratingValue".equals(itemprop)) {
                    parseRatingString(movie, span.getTextExtractor().toString());
                }
                if ("ratingCount".equals(itemprop)) {
                    String value = span.getTextExtractor().toString().replace(",", "").trim();
                    if (value.length() > 0) {
                        setVotes(movie, value);
                    }
                }
            }
        }
    }

    /** Walks the {@code <h5>} headings and extracts plot, runtime and rating from what follows them. */
    private void parseInfoSections(Source source, MoviePage movie) {
        for (Element heading : source.getAllElements(HTMLElementName.H5)) {
            String headingText = heading.getContent().getTextExtractor().toString();
            int end = heading.getEnd();
            if (headingText.contains("Plot Outline")) {
                // The plot is the raw text directly after the heading.
                setPlotFromTextAt(source, end, movie);
            } else if (headingText.contains("Plot:")) {
                // The plot is the leading text of the element after the heading.
                Element container = source.getNextElement(end);
                if (container != null) {
                    setPlotFromTextAt(source, container.getStartTag().getEnd(), movie);
                }
            } else if (headingText.contains("Runtime")) {
                Element container = source.getNextElement(end);
                if (container != null) {
                    Integer runtime = parseRuntime(container.getTextExtractor().toString());
                    if (runtime != null) {
                        movie.setRuntime(runtime);
                    }
                }
            } else if (headingText.contains("User Rating")) {
                Element container = source.getNextElement(end);
                if (container == null) {
                    continue;
                }
                List<Element> boldOnes = container.getAllElements(HTMLElementName.B);
                if (!boldOnes.isEmpty()) {
                    Element bold = boldOnes.get(0);
                    String rating = bold.getTextExtractor().toString();
                    if (!rating.contains("awaiting")) {
                        parseRatingString(movie, rating);
                        parseVotes(movie, source.getNextElement(bold.getEnd()));
                    }
                }
            }
        }
    }

    /**
     * Sets the plot to the trimmed source text between {@code start} and the
     * next start tag; does nothing when no start tag follows.
     */
    private void setPlotFromTextAt(Source source, int start, MoviePage movie) {
        StartTag nextTag = source.getNextStartTag(start);
        if (nextTag != null) {
            movie.setPlot(source.subSequence(start, nextTag.getBegin()).toString().trim());
        }
    }

    /** Uses the text of {@code itemprop="description"} elements as plot; the last non-empty one wins. */
    private void parseDescription(Source source, MoviePage movie) {
        for (Element desc : source.getAllElements("itemprop", "description", false)) {
            String txt = desc.getTextExtractor().toString().trim();
            if (txt.length() > 0) {
                movie.setPlot(txt);
            }
        }
    }

    /** Looks for {@code itemprop="image"} tags below the element with id {@code img_primary}. */
    private void parsePrimaryImage(Source source, MoviePage movie) {
        Element container = source.getElementById("img_primary");
        if (container == null) {
            // No such element on this page, so there is nothing to search.
            return;
        }
        for (Element e : new SimpleXPath(container).getAllTagByAttributes("itemprop", "image")) {
            final String src = e.getAttributeValue("src");
            if (src != null) {
                LOGGER.info("found image : " + src);
                movie.setImgUrl(src);
            }
        }
    }

    /** Stores {@code year} on the movie; an unparsable value is logged and ignored. */
    private void setYear(MoviePage movie, String year) {
        try {
            movie.setYear(Integer.valueOf(year));
        } catch (NumberFormatException ex) {
            LOGGER.error("Could not parse year '" + year + "' to integer", ex);
        }
    }

    /**
     * Extracts the vote count from text such as {@code (12,345 votes)}.
     *
     * @param movieSite receives the vote count
     * @param element element holding the votes text; {@code null} is ignored
     */
    private void parseVotes(MoviePage movieSite, Element element) {
        if (element == null) {
            return;
        }
        String votes = element.getContent().getTextExtractor().toString();
        votes = votes.replaceAll("\\(", "");
        votes = votes.replaceAll("votes(\\))*", "");
        votes = votes.replaceAll(",", "");
        setVotes(movieSite, votes.trim());
    }

    /** Stores the already cleaned-up vote count; an unparsable value is logged and ignored. */
    private void setVotes(MoviePage movieSite, String votes) {
        try {
            movieSite.setVotes(Integer.valueOf(votes));
        } catch (NumberFormatException ex) {
            LOGGER.error("Could not parse the votes '" + votes + "' to Integer", ex);
        }
    }

    /**
     * Converts a rating such as {@code 8.7/10} into a rounded score of ten times
     * the value (here 87) and stores it; an unparsable value is logged and ignored.
     */
    private void parseRatingString(MoviePage movieSite, String rating) {
        // to percentage
        rating = rating.replace("/10", "");
        try {
            int theScore = Math.round(Float.parseFloat(rating) * 10);
            movieSite.setScore(theScore);
        } catch (NumberFormatException ex) {
            LOGGER.error("Could not parse rating '" + rating + "' to Float", ex);
        }
    }

    /**
     * Extracts the number in front of the first {@code min} of a runtime text,
     * dropping a leading {@code <label>:} prefix if present.
     *
     * @param runtimeString text such as {@code 136 min} or {@code USA:120 min}
     * @return the runtime, or {@code null} when the text contains no {@code min}
     *         or the value before it is not an integer
     */
    private Integer parseRuntime(String runtimeString) {
        int minIndex = runtimeString.indexOf("min");
        if (minIndex < 0) {
            LOGGER.error("Could not find 'min' in runtime '" + runtimeString + "'");
            return null;
        }
        String runtime = runtimeString.substring(0, minIndex).trim();
        int colonIndex = runtime.indexOf(":");
        if (colonIndex != -1) {
            // Trim again: there may be whitespace between the colon and the number.
            runtime = runtime.substring(colonIndex + 1).trim();
        }
        try {
            return Integer.valueOf(runtime);
        } catch (NumberFormatException ex) {
            LOGGER.error("Could not parse runtime '" + runtimeString + "' to Integer", ex);
            return null;
        }
    }
}