package moviescraper.doctord.controller.siteparsingprofile.specific; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Hashtable; import java.util.LinkedList; import java.util.Locale; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.text.WordUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import moviescraper.doctord.controller.siteparsingprofile.SiteParsingProfile; import moviescraper.doctord.model.SearchResult; import moviescraper.doctord.model.dataitem.Actor; import moviescraper.doctord.model.dataitem.Director; import moviescraper.doctord.model.dataitem.Genre; import moviescraper.doctord.model.dataitem.ID; import moviescraper.doctord.model.dataitem.MPAARating; import moviescraper.doctord.model.dataitem.OriginalTitle; import moviescraper.doctord.model.dataitem.Outline; import moviescraper.doctord.model.dataitem.Plot; import moviescraper.doctord.model.dataitem.Rating; import moviescraper.doctord.model.dataitem.ReleaseDate; import moviescraper.doctord.model.dataitem.Runtime; import moviescraper.doctord.model.dataitem.Set; import moviescraper.doctord.model.dataitem.SortTitle; import moviescraper.doctord.model.dataitem.Studio; import moviescraper.doctord.model.dataitem.Tagline; import moviescraper.doctord.model.dataitem.Thumb; import moviescraper.doctord.model.dataitem.Title; import moviescraper.doctord.model.dataitem.Top250; import moviescraper.doctord.model.dataitem.Votes; import moviescraper.doctord.model.dataitem.Year; public class ActionJavParsingProfile extends SiteParsingProfile implements SpecificProfile { private static final SimpleDateFormat actionJavReleaseDateFormat = new SimpleDateFormat("MMM dd, yyyy", Locale.ENGLISH); @Override public Title scrapeTitle() { Element titleElement = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Video Title) td ~ td p") .first(); return new Title(titleElement.text()); } @Override public OriginalTitle scrapeOriginalTitle() { // ActionJav doesn't have the Japanese title, so we don't want to return // anything but a blank text element return OriginalTitle.BLANK_ORIGINALTITLE; } @Override public SortTitle scrapeSortTitle() { // we don't need any special sort title - that's usually something the // user provides return SortTitle.BLANK_SORTTITLE; } @Override public Set scrapeSet() { // ActionJav doesn't have any set information return Set.BLANK_SET; } @Override public Rating scrapeRating() { Element ratingElement = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Rated) td ~ td p img") .first(); if(ratingElement != null) { String ratingImgUrl = ratingElement.attr("src"); String ratingIntegerIntegerPart = ratingImgUrl.substring( ratingImgUrl.length() - 7, ratingImgUrl.length() - 6); String ratingDecimalIntegerPart = ratingImgUrl.substring( ratingImgUrl.length() - 5, ratingImgUrl.length() - 4); return new Rating(5.0, ratingIntegerIntegerPart + "." + ratingDecimalIntegerPart); } return Rating.BLANK_RATING; } @Override public Year scrapeYear() { return scrapeReleaseDate().getYear(); } @Override public ReleaseDate scrapeReleaseDate(){ Element releaseDateElement = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Date Added) td ~ td p") .first(); if(releaseDateElement != null && releaseDateElement.text().length() > 4) { String releaseDateText = releaseDateElement.text().trim(); if(!Character.isAlphabetic(releaseDateText.charAt(0))) //fix for weird white space trim() is not getting rid of releaseDateText = releaseDateText.substring(1); if(releaseDateText.length() > 4) return new ReleaseDate(releaseDateText.trim(), actionJavReleaseDateFormat); } return ReleaseDate.BLANK_RELEASEDATE; } @Override public Top250 scrapeTop250() { // This type of info doesn't exist on ActionJav return Top250.BLANK_TOP250; } @Override public Votes scrapeVotes() { Element votesElement = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Rated) td ~ td p font") .first(); if(votesElement != null) { String votes = votesElement.text(); votes = votes.substring(2,votes.indexOf('v')-1); return new Votes(votes); } return Votes.BLANK_VOTES; } @Override public Outline scrapeOutline() { return Outline.BLANK_OUTLINE; } @Override public Plot scrapePlot() { Element plotElement = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table[width=372] tbody tr td table tbody tr td p[align=left] font[color=696981]") .first(); if (plotElement != null) return new Plot(plotElement.text().toString()); else return Plot.BLANK_PLOT; } @Override public Tagline scrapeTagline() { return Tagline.BLANK_TAGLINE; } @Override public Runtime scrapeRuntime() { // Find text elements that contain the word "min" // We might get some duplicates here if the movie is offered in multiple // codecs // but we can do some filtering later on to fix things by using a // HashTable to take care of the duplicate format problems Elements movieDownloadParts = document .select("html body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr td table tbody tr td p:has(font:containsOwn(min, ))"); ArrayList<String> movieFileName = new ArrayList<>( movieDownloadParts.size()); Hashtable<String, Integer> runtimesByPart = new Hashtable<>( movieDownloadParts.size()); // we got to do some processing to get the unique runtime per part, // ignoring file extension for (Element movieElement : movieDownloadParts) { // get the filename without extension // System.out.println("movieElement: " + movieElement); Element movieElementLink = movieElement.select("a").first(); if(movieElementLink != null) { String filePath = movieElementLink.attr("href"); String[] splitBySlash = filePath.split("/"); //get just the file String fileNameNoExtension = splitBySlash[splitBySlash.length - 1]; fileNameNoExtension = fileNameNoExtension.substring(0, fileNameNoExtension.length() - 4); // strip the extension movieFileName.add(filePath); // get the runtime String runtimeText = movieElement.select("font").last().text(); //get whole text element Integer runtimeAmt = new Integer(runtimeText.substring(1, runtimeText.indexOf('m')-1)); //narrow it down to just the numeric part since we want to ignore the other garbage in the string runtimesByPart.put(fileNameNoExtension, runtimeAmt); } } int totalRuntime = 0; // Our hastable has automatically taken care of the duplicate format // problem with listing each runtime part twice for (Integer uniqueRuntime : runtimesByPart.values()) { totalRuntime += uniqueRuntime.intValue(); } if (totalRuntime != 0) { return new Runtime(Integer.toString(totalRuntime)); } else return Runtime.BLANK_RUNTIME; } @Override public Thumb[] scrapePosters() { try { Element posterImg = document.select( "img[src*=/web_img/covers_hires_full/]").first(); //Thumb coverImageCrop = new Thumb(posterImg.attr("src"), 52.7, 0, 0,0); Thumb coverImageCrop = new Thumb(posterImg.attr("src"), true); //ActionJav has back and front cover in one jpg, so we need to crop to just get the movie poster Thumb[] returnResult = new Thumb[1]; returnResult[0] = coverImageCrop; return returnResult; } catch (IOException e) { e.printStackTrace(); return new Thumb[0]; } } @Override public Thumb[] scrapeFanart() { try { Element posterImg = document.select( "img[src*=/web_img/covers_hires_full/]").first(); Thumb coverImageCrop = new Thumb(posterImg.attr("src")); Thumb[] returnResult = new Thumb[1]; returnResult[0] = coverImageCrop; return returnResult; } catch (IOException e) { e.printStackTrace(); return new Thumb[0]; } } @Override public MPAARating scrapeMPAA() { return MPAARating.RATING_XXX; } @Override public ID scrapeID() { Element idElement = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Publisher ID) td ~ td p") .first(); if(idElement != null) { String idElementText = idElement.text(); int firstNumberIndex = StringUtils.indexOfAny(idElementText, "0123456789"); idElementText = idElementText.substring(0,firstNumberIndex) + "-" + idElementText.substring(firstNumberIndex); return new ID(idElementText); } else return ID.BLANK_ID; } @Override public ArrayList<Genre> scrapeGenres() { Elements genreElements = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Fetishes) td ~ td p"); ArrayList<Genre> genreList = new ArrayList<>(genreElements.size()); for (Element genreElement : genreElements) { String genre = genreElement.select("a").first().attr("href"); genre = genre.substring(genre.indexOf('=') + 1); genre = genre.replaceAll("_", " "); genre = WordUtils.capitalizeFully(genre); genreList.add(new Genre(genre)); } return genreList; } @Override public ArrayList<Actor> scrapeActors() { Elements actorElements = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Starring) td ~ td p"); if(actorElements != null) { ArrayList<Actor> actorList = new ArrayList<>(actorElements.size()); try { for (Element actorElement : actorElements) { String currentActorName = actorElement.select("font").first() .text(); String currentActorDetailFileNameURL = actorElement.select("a") .attr("href"); currentActorDetailFileNameURL = currentActorDetailFileNameURL .substring(currentActorDetailFileNameURL.indexOf('=') + 1); currentActorDetailFileNameURL = "http://images2.tsunami-ent.com/web_img/av_idols_300/" + currentActorDetailFileNameURL + ".jpg"; Actor currentActor = new Actor(currentActorName, "", new Thumb( currentActorDetailFileNameURL)); actorList.add(currentActor); } return actorList; } catch (IOException e) { e.printStackTrace(); } } return new ArrayList<>(); } @Override public ArrayList<Director> scrapeDirectors() { //ActionJav doesn't have director information, so just return an empty list return new ArrayList<>(); } @Override public Studio scrapeStudio() { Element studioElement = document .select("body table tbody tr td table tbody tr td div table tbody tr td table tbody tr td table tbody tr:contains(Publisher) td ~ td p") .first(); return new Studio(studioElement.text()); } @Override public String createSearchString(File file) { scrapedMovieFile = file; String idTag = findIDTagFromFile(file, isFirstWordOfFileIsID()); if (idTag != null) return "http://www.actionjav.com/results_title.cfm?sortby=pub_idu&direction=ASC&searchterm=" + idTag.replace("-", ""); return null; } @Override public SearchResult[] getSearchResults(String searchString) throws IOException { if (searchString == null) return new SearchResult[0]; LinkedList<SearchResult> searchItems = new LinkedList<>(); String searchId = searchString.replaceAll(".*searchterm=(\\D+)(\\d+)", "$1-$2").toUpperCase(); Document doc = Jsoup.connect(searchString).timeout(CONNECTION_TIMEOUT_VALUE).get(); Elements rows = doc.select("table table table tr:has(a[href^=title.cfm?iid=])"); for(Element row: rows) { String id = row.select("td:nth-child(2)").first().text().replaceAll("(\\D+)(\\d+)", "$1-$2").toUpperCase(); Element link = row.select("a[href^=title.cfm?iid=]").first(); Element actress = row.select("a[href^=model.cfm?actress_filename=]").first(); String title = "[" + id + "] " + link.text(); if (actress != null) title = title + " - " + actress.ownText(); String url = "http://www.actionjav.com/" + link.attr("href") + "&console=cover"; SearchResult result = new SearchResult(url, title); if (id.equals(searchId)) searchItems.addFirst(result); else searchItems.addLast(result); } return searchItems.toArray(new SearchResult[searchItems.size()]); } @Override public Thumb[] scrapeExtraFanart() { ArrayList<Thumb> imageList = new ArrayList<>(); Element script = document.select("head > script:nth-of-type(2)").first(); if (script != null) { String data = script.data(); Pattern pattern = Pattern.compile("\"(http://images2.tsunami-ent.com/web_img/.*\\.jpg)\""); Matcher matcher = pattern.matcher(data); while(matcher.find()){ try { imageList.add(new Thumb(matcher.group(1))); } catch (MalformedURLException e) { e.printStackTrace(); } } } return imageList.toArray(new Thumb[imageList.size()]); } @Override public String toString(){ return "ActionJav"; } @Override public SiteParsingProfile newInstance() { return new ActionJavParsingProfile(); } @Override public String getParserName() { return "ActionJav"; } }