package moviescraper.doctord.controller.siteparsingprofile.specific; import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.imageio.ImageIO; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.text.WordUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.nodes.Node; import org.jsoup.nodes.TextNode; import org.jsoup.select.Elements; import moviescraper.doctord.controller.languagetranslation.Language; import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile; import moviescraper.doctord.model.SearchResult; import moviescraper.doctord.model.dataitem.Actor; import moviescraper.doctord.model.dataitem.Director; import moviescraper.doctord.model.dataitem.Genre; import moviescraper.doctord.model.dataitem.ID; import moviescraper.doctord.model.dataitem.MPAARating; import moviescraper.doctord.model.dataitem.OriginalTitle; import moviescraper.doctord.model.dataitem.Outline; import moviescraper.doctord.model.dataitem.Plot; import moviescraper.doctord.model.dataitem.Rating; import moviescraper.doctord.model.dataitem.ReleaseDate; import moviescraper.doctord.model.dataitem.Runtime; import moviescraper.doctord.model.dataitem.Set; import moviescraper.doctord.model.dataitem.SortTitle; import moviescraper.doctord.model.dataitem.Studio; import moviescraper.doctord.model.dataitem.Tagline; import moviescraper.doctord.model.dataitem.Thumb; import moviescraper.doctord.model.dataitem.Title; import moviescraper.doctord.model.dataitem.Top250; import moviescraper.doctord.model.dataitem.Votes; import moviescraper.doctord.model.dataitem.Year; public class AvEntertainmentParsingProfile extends SiteParsingProfile implements SpecificProfile { private static final SimpleDateFormat avEntertainmentReleaseDateFormat = new SimpleDateFormat("MM/dd/yyyy", Locale.ENGLISH); @Override public Title scrapeTitle() { Elements elements = document.select("#mini-tabet h2"); String title = elements.first().childNode(0).toString().trim(); return new Title(title); } @Override public OriginalTitle scrapeOriginalTitle() { return OriginalTitle.BLANK_ORIGINALTITLE; } @Override public SortTitle scrapeSortTitle() { return SortTitle.BLANK_SORTTITLE; } @Override public Set scrapeSet() { String set = getMovieData("Series","シリーズ"); return new Set(set); } @Override public Rating scrapeRating() { return new Rating(0,""); } @Override public Year scrapeYear() { return scrapeReleaseDate().getYear(); } @Override public ReleaseDate scrapeReleaseDate() { Elements elements = document.select("div[id=titlebox] ul li:contains(Release Date:), div[id=titlebox] ul li:contains(発売日:)"); if(elements != null) { String releaseDateText = elements.first().ownText(); return new ReleaseDate(releaseDateText, avEntertainmentReleaseDateFormat); } return ReleaseDate.BLANK_RELEASEDATE; } @Override public Top250 scrapeTop250() { return Top250.BLANK_TOP250; } @Override public Votes scrapeVotes() { return Votes.BLANK_VOTES; } @Override public Outline scrapeOutline() { return Outline.BLANK_OUTLINE; } @Override public Plot scrapePlot() { return Plot.BLANK_PLOT; } @Override public Tagline scrapeTagline() { return Tagline.BLANK_TAGLINE; } @Override public Runtime scrapeRuntime() { String runtime = ""; Elements elements = document.select("div[id=titlebox] ul li"); for (Element element : elements) { if (element.childNodeSize() == 3) { Node childNode = element.childNode(2); if (childNode instanceof TextNode && (element.childNode(1).childNode(0).toString() .startsWith("Playing time") || element .childNode(1).childNode(0).toString() .startsWith("収録時間"))) { String data = element.childNode(2).toString(); Pattern pattern = Pattern.compile("\\d+"); Matcher matcher = pattern.matcher(data); if (matcher.find()) { runtime = matcher.group(); break; } } } } return new Runtime(runtime); } @Override public Thumb[] scrapePosters() { List<Thumb> thumbs = new ArrayList<>(); Thumb[] fanart = scrapeFanart(); if (fanart.length > 0) { try { BufferedImage read = ImageIO.read(fanart[0].getThumbURL()); if(read != null) { //int newWidth = (int) ((1.0 - 0.526666) * read.getWidth()); thumbs.add( new Thumb(fanart[0].getThumbURL().toString(), true)); } } catch (IOException e) { e.printStackTrace(); } } return thumbs.toArray( new Thumb[ thumbs.size() ] ); } @Override public Thumb[] scrapeFanart() { List<Thumb> thumbs = new ArrayList<>(); Elements elements = document.select("li.ppvs.magnify a"); if (elements.size() > 0) { Element first = elements.first(); String attr = first.attr("onclick"); String temp = "imagefile="; Pattern pattern = Pattern.compile(temp+"http.*jpg"); Matcher matcher = pattern.matcher(attr); if (matcher.find()) { String url = matcher.group().substring(temp.length()); try { thumbs.add( new Thumb(url) ); } catch (MalformedURLException e) { e.printStackTrace(); } } } return thumbs.toArray( new Thumb[ thumbs.size() ] ); } @Override public Thumb[] scrapeExtraFanart() { return new Thumb[0]; } @Override public MPAARating scrapeMPAA() { return MPAARating.RATING_XXX; } @Override public ID scrapeID() { Elements select = document.select("div[class=top-title]"); String id = ""; if (select.size() > 0) { Element element = select.get(0); if (element.childNodeSize() > 0) { id = element.childNode(0).toString(); id = getLastWord(id); } } return new ID( id ); } private static String getLastWord(String input) { String wordSeparator = " "; boolean inputIsOnlyOneWord = !StringUtils.contains(input, wordSeparator); if (inputIsOnlyOneWord) { return input; } return StringUtils.substringAfterLast(input, wordSeparator); } @Override public ArrayList<Genre> scrapeGenres() { ArrayList<Genre> list = new ArrayList<>(); Elements elements = document.select("div[id=detailbox] ul ol a"); for (Element element : elements) { String genre = element.childNode(0).toString(); if(!genre.equals("Sample Movie") && !genre.contains("(DVD)")) list.add(new Genre( genre )); } return list; } @Override public ArrayList<Actor> scrapeActors() { Elements elements = document.select("ul li a[href~=ActressDetail]"); ArrayList<Actor> list = new ArrayList<>(); if(elements != null) { for (Element element : elements) { String href = element.attr("href"); String name = WordUtils.capitalize(element.text()); Thumb thumb = null; try { Document actorDoc = Jsoup.connect(href).userAgent("Mozilla").ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get(); Element first = actorDoc.select("ul img[src~=ActressImage]").first(); if (first != null) { String thumbURL = first.attr("src"); thumb = new Thumb(thumbURL); } list.add(new Actor(name, null, thumb)); } catch (IOException e) { e.printStackTrace(); } } } return list; } @Override public ArrayList<Director> scrapeDirectors() { ArrayList<Director> list = new ArrayList<>(); return list; } @Override public Studio scrapeStudio() { String studio = getMovieData("Studio", "スタジオ"); return new Studio( studio ); } private String getMovieData(String category, String japaneseWordForCategory) { Elements elements = document.select("div[id=titlebox] ul li"); for (Element element : elements) { Element span = element.select("span").first(); if (span != null) { String cat = span.childNode(0).toString(); if (cat.startsWith(category) || cat.startsWith(japaneseWordForCategory)) { Element first = element.select("a").first(); String text = first.childNode(0).toString(); return text; } } } return ""; } @Override public String createSearchString(File file) { scrapedMovieFile = file; String fileNameNoExtension = findIDTagFromFile(file, isFirstWordOfFileIsID()); return getSearchString(fileNameNoExtension); } @Override public SearchResult[] getSearchResults(String searchString) throws IOException { Document doc = Jsoup.connect(searchString).userAgent("Mozilla").ignoreHttpErrors(true).timeout(SiteParsingProfile.CONNECTION_TIMEOUT_VALUE).get(); List<SearchResult> list = new ArrayList<>(); Elements elements = doc.select("td[valign=top] table tbody tr td"); for (Element e : elements) { Elements selectLink = e.select("h4 a[href^=http://www.aventertainments.com]"); if (selectLink.size() > 0) { String href = selectLink.get(0).attr("href"); String label = selectLink.get(0).childNode(0).toString(); Elements selectThumb = e.select("a img"); Thumb thumb = null; for (Element thumbElement : selectThumb) { String attr = thumbElement.attr("src"); if (attr.startsWith("http://imgs.aventertainments.com/product_images")) { thumb = new Thumb(attr); } } list.add( new SearchResult(href, label, thumb) ); } } return list.toArray(new SearchResult[list.size()]); } private String getSearchString(String id) { String languageID = "1"; if(getScrapingLanguage() == Language.JAPANESE) languageID = "2"; return "http://www.aventertainments.com/search_Products.aspx?languageID="+ languageID + "&dept_id=29&keyword=" + id + "&searchby=item_no"; } @Override public String getParserName() { return "AV Entertainment"; } @Override public SiteParsingProfile newInstance() { return new AvEntertainmentParsingProfile(); } }