/* * This file is part of Flicklib. * * Copyright (C) Francis De Brabandere * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.flicklib.service.movie.ofdb; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import net.htmlparser.jericho.Element; import net.htmlparser.jericho.HTMLElementName; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.flicklib.api.Parser; import com.flicklib.domain.MoviePage; import com.flicklib.service.Source; public class OfdbParser implements Parser{ private static final Logger LOGGER = LoggerFactory.getLogger(OfdbParser.class); @Override public void parse(Source source, MoviePage page) { net.htmlparser.jericho.Source jerichoSource = source.getJerichoSource(); // <span class="movieMainTitle">The Matrix</span> List<Element> h2Elements = jerichoSource.getAllElements(HTMLElementName.H2); for(Element h2Element: h2Elements){ // TODO get all titles String germanTitle = h2Element.getContent().getTextExtractor().toString(); germanTitle = OfdbTools.handleType(germanTitle, page); page.setAlternateTitle(germanTitle); } // <tr valign="top"> // <td nowrap><font face="Arial,Helvetica,sans-serif" size="2" class="Normal">Originaltitel:</font></td> // <td>  </td> // <td width="99%"><font face="Arial,Helvetica,sans-serif" size="2" class="Daten"><b>Dune</b></font></td> // </tr> List<Element> fontElements = jerichoSource.getAllElements(HTMLElementName.FONT); Iterator<Element> fontIterator = fontElements.iterator(); Element fontElement; while(fontIterator.hasNext()){ fontElement = fontIterator.next(); String txt = fontElement.getContent().getTextExtractor().toString(); if("Originaltitel:".equals(txt)){ fontElement = fontIterator.next(); String title = fontElement.getContent().getTextExtractor().toString().trim(); page.setTitle(title); page.setOriginalTitle(title); }else if("Erscheinungsjahr:".equals(txt)){ fontElement = fontIterator.next(); try{ page.setYear(Integer.parseInt(fontElement.getContent().getTextExtractor().toString())); }catch(NumberFormatException ex){ LOGGER.warn("Could not parse year: "+ex.getMessage()); } }else if("Regie:".equals(txt)){ Element tr = jerichoSource.getNextElement(fontElement.getEnd(), HTMLElementName.TR); Element a = jerichoSource.getNextElement(fontElement.getEnd(), HTMLElementName.A); while(a != null && tr != null && a.getEnd() < tr.getStartTag().getBegin()){ page.getDirectors().add(a.getContent().getTextExtractor().toString()); a = jerichoSource.getNextElement(a.getEnd(), HTMLElementName.A); } }else if("Darsteller:".equals(txt)){ Element tr = jerichoSource.getNextElement(fontElement.getEnd(), HTMLElementName.TR); Element a = jerichoSource.getNextElement(fontElement.getEnd(), HTMLElementName.A); while(a != null && tr != null && a.getEnd() < tr.getStartTag().getBegin()){ page.getActors().add(a.getContent().getTextExtractor().toString()); a = jerichoSource.getNextElement(a.getEnd(), HTMLElementName.A); } } } //<img src="http://img.ofdb.de/film/3/3635.jpg" alt="Dune - Der Wüstenplanet" border="0" width="120" height="168"><br><br> List<Element> imgElements = jerichoSource.getAllElements(HTMLElementName.IMG); Iterator<Element> imgIterator = imgElements.iterator(); Element imgElement; String imgUrl = null; while(imgIterator.hasNext() && imgUrl == null){ imgElement = imgIterator.next(); String src = imgElement.getAttributeValue("src"); if(src != null && src.startsWith("http://img.ofdb.de/film/")){ imgUrl = src; } } page.setImgUrl(imgUrl); //<a href="view.php?page=genre&Genre=Fantasy">Fantasy</a><br><a href="view.php?page=genre&Genre=Krieg">Krieg</a><br>.... List<Element> aElements = jerichoSource.getAllElements(HTMLElementName.A); Iterator<Element> aIterator = aElements.iterator(); Element aElement; Set<String> genres = new HashSet<String>(); while(aIterator.hasNext()){ aElement = aIterator.next(); String href = aElement.getAttributeValue("href"); if(href.startsWith("view.php?page=genre&Genre=")){ href = href.replace("view.php?page=genre&Genre=", ""); try { href = URLDecoder.decode(href, "UTF-8"); genres.add(href); } catch (UnsupportedEncodingException e) { LOGGER.error(e.getMessage(), e); } } } page.setGenres(genres); // score int noteIndex = source.getContent().indexOf("Note: "); int nextBr = source.getContent().indexOf("<br>", noteIndex); String score = source.getContent().substring(noteIndex, nextBr); //System.out.println(score); String actual = score.substring("Note: ".length(), score.indexOf(" ")); double dScore = Double.valueOf(actual); page.setScore((int)Math.round(dScore * 10)); // Inhalt:</b> Vincent Vega und Jules Winnfield holen für ihren Boss Marsellus Wallace eine schwarze Aktentasche aus einer Wohnung ab. Drei Jungs, die ihnen dabei im... <a href="plot/1050, // TODO fetch the extra info page and get the text from there int inhaltIndex = source.getContent().indexOf("Inhalt:</b> "); int nextAHref = source.getContent().indexOf("<a href", inhaltIndex); String inhalt = source.getContent().substring(inhaltIndex + "Inhalt:</b> ".length(), nextAHref); page.setPlot(inhalt); page.setDescription(inhalt); // TODO get genres, length, alternative titles } }