package com.abmash.extraction; import com.abmash.api.Browser; import com.abmash.extraction.container.ExtractionContainer; import com.abmash.extraction.container.PageTypeExtractionContainer; import com.abmash.parser.content.Link; import java.sql.Connection; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; public class PageTypeExtractor extends ExtractorWithDB { private enum Status { OK, // page type could be extracted ERROR, // page not loadable OFFLINE, // page loadable but content not available due to domain change or domain parking } private enum PageType { PRE_PAGE, // start pages like intros or language selectors // links contain "common_links" HOTEL, // regular hotel page HOTEL_SUBPAGE, // subpage of regular hotel page HOTEL_GROUP_PORTAL, // 3 or more links contain long paths with at least 3 "/" in it HOTEL_GROUP_PORTAL_SUBPAGE, // subpage of hotel group portals // links contain "top_links" HOTEL_CHAIN, // hotel chain SMALL_HOTEL_CHAIN, // hotel chain page with less than 10 links // links do not contain "common_links" NO_HOTEL, // visible text does not contain "top_links" or "common_links" HOTEL_SUBPAGE_UNSURE, // visible text does contain "top_links" or "common_links" HOTEL_SINGLEPAGE, // no links at all } private Status status = null; private PageType pageType = null; public PageTypeExtractor(Browser browser, Connection conn) { super(browser, conn); } @Override /** * extraction instances need to be added to the class variable extractions */ protected void extract() { String url = parser.getUrl(); // do something with url String title = parser.getTitle(); // do something with title HashMap<String, String> metaTags = parser.getMetaTags(); for (String metaTag: metaTags.keySet()) { // do something with metatags } String visibleText = parser.getVisibleText(); // do something with visible text ArrayList<Link> links = parser.getLinks(); for (Link link: links) { // do something with links } // if you need more fine-grained control of finding the information // you need, you can use the browser instance // HtmlElementList elements = browser.find().textElements("hotel"); // you can even interact with the browser // browser.click("english"); // browser.type("search", "Hotel Foobar"); // set status and found most probable page type status = Status.OK; pageType = PageType.HOTEL; // add result to extraction container PageTypeExtractionContainer extraction = new PageTypeExtractionContainer(); extraction.setStatus(status.name()); extraction.setPageType(pageType.name()); extractions.add(extraction); } @Override protected String getExtractionOutput(ExtractionContainer extractionContainer) { return ((PageTypeExtractionContainer) extractionContainer).getPageType(); } }