/* * This file is part of Flicklib. * * Copyright (C) Francis De Brabandere * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.flicklib.service.movie.google; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.HTMLElementName; import net.htmlparser.jericho.Source; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.flicklib.api.AbstractMovieInfoFetcher; import com.flicklib.api.Parser; import com.flicklib.domain.Movie; import com.flicklib.domain.MoviePage; import com.flicklib.domain.MovieSearchResult; import com.flicklib.domain.MovieService; import com.flicklib.service.SourceLoader; import com.flicklib.tools.Param; import com.flicklib.tools.SimpleXPath; import com.google.inject.Inject; import com.google.inject.Singleton; /** * * @author francisdb */ @Singleton public class GoogleInfoFetcher extends AbstractMovieInfoFetcher { private static final Logger LOGGER = LoggerFactory.getLogger(GoogleInfoFetcher.class); /** * http://www.google.com/movies */ private final static MovieService GOOGLE = new MovieService("GOOGLE", "Google movies", "http://www.google.com", "Google"); private final Parser googleParser; private final SourceLoader httpLoader; /** * Constructs a new GoogleInfoFetcher * @param googleParser * @param httpLoader */ @Inject public GoogleInfoFetcher(final @Google Parser googleParser, SourceLoader httpLoader) { this.googleParser = googleParser; this.httpLoader = httpLoader; } public GoogleInfoFetcher(SourceLoader loader) { this(new GoogleParser(), loader); } @Override public MoviePage getMovieInfo(String id) throws IOException { if (id.startsWith("http://www.google.com/movies/reviews")) { MoviePage site = new MoviePage(GOOGLE); site.setUrl(id); com.flicklib.service.Source source = httpLoader.loadSource(id); googleParser.parse(source, site); return site; } return null; } @Override public List<MovieSearchResult> search(String title) throws IOException { String url = GOOGLE.getUrl() + "/movies" + Param.paramString("q", title); com.flicklib.service.Source sourceString = httpLoader.loadSource(url); Source source = sourceString.getJerichoSource(); SimpleXPath xp = new SimpleXPath(source.getElementById("movie_results")).getAllTagByAttributes("itemtype", "http://schema.org/Movie"); List<MovieSearchResult> result = new ArrayList<MovieSearchResult>(); for (Element e : xp) { final SimpleXPath xpath = new SimpleXPath(e); String foundTitle = xpath.getAllTagByAttributes("itemprop", "name").getValue(); int on = 0; int off = 0; for (Element img : xpath.getTags(HTMLElementName.IMG)) { String src = img.getAttributeValue("src"); if ("/images/sy-star-on.gif".equals(src)) { on ++; } else if ("/images/sy-star-off.gif".equals(src)) { off ++; } } MovieSearchResult m; if (on + off > 0) { double score = ((double) on*100/(on + off)); LOGGER.info("score for "+foundTitle+" is "+score); MoviePage mp = new MoviePage(); mp.setScore((int) score); m = mp; } else { m = new MovieSearchResult(); } m.setService(GOOGLE); m.setTitle(foundTitle); Element link = xpath.getAllTagByAttributes("class", "info links").getTags(HTMLElementName.A).firstElement(); if (link != null) { String href = link.getAttributeValue("href"); m.setUrl(GOOGLE.getUrl() + href); Matcher matcher = Pattern.compile("mid=(\\w+)").matcher(href); if (matcher.find()) { m.setIdForSite(matcher.group(1)); } } else { // no critics ... just write something in it m.setUrl(url); m.setIdForSite(foundTitle); } String desc = xpath.getAllTagByAttributes("itemprop", "description").getValue(); if (desc != null) { String moreDesc = xpath.getAllTagByAttributes("id", "SynopsisSecond0").getValue(); if (moreDesc != null) { desc = desc + moreDesc; } m.setDescription(desc); } result.add(m); } return result; } @Deprecated public MoviePage fetch(Movie movie, String id) { MoviePage site = new MoviePage(); //site.setMovie(movie); site.setService(GOOGLE); try { String params = Param.paramString("q", movie.getTitle()); com.flicklib.service.Source httpSource = httpLoader.loadSource("http://www.google.com/movies" + params); Source source = httpSource.getJerichoSource(); //source.setLogWriter(new OutputStreamWriter(System.err)); // send log messages to stderr //Element titleElement = (Element)source.getAllElements(HTMLElementName.TITLE).get(0); //System.out.println(titleElement.getContent().extractText()); // <div id="bubble_allCritics" class="percentBubble" style="display:none;"> 57% </div> String movieUrl = null; List<?> aElements = source.getAllElements(HTMLElementName.A); for (Iterator<?> i = aElements.iterator(); i.hasNext() && movieUrl == null;) { Element aElement = (Element) i.next(); String url = aElement.getAttributeValue("href"); // /movies/reviews?cid=b939f27b219eb36f&fq=Pulp+Fiction&hl=en if (url != null && url.startsWith("/movies/reviews?cid=")) { movieUrl = "http://www.google.com" + url; String movieName = aElement.getContent().getTextExtractor().toString(); LOGGER.info("taking first result: " + movieName + " -> " + movieUrl); } } if (movieUrl == null) { throw new IOException("Movie not found on Google: " + movie.getTitle()); } site.setUrl(movieUrl); httpSource = httpLoader.loadSource(movieUrl); googleParser.parse(httpSource, site); } catch (IOException ex) { LOGGER.error("Loading from Google failed", ex); } return site; } @Override public MovieService getService() { return GOOGLE; } }