package me.ccrama.redditslide.util; /** * Created by ccrama on 4/10/2016. */ import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import me.ccrama.redditslide.Reddit; import okhttp3.OkHttpClient; import okhttp3.Request; import okhttp3.Response; public class TitleExtractor { /* the CASE_INSENSITIVE flag accounts for * sites that use uppercase title tags. * the DOTALL flag accounts for sites that have * line feeds in the title text */ private static final Pattern TITLE_TAG = Pattern.compile("<title[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE|Pattern.DOTALL); private TitleExtractor() { } /** * @param url the HTML page * @return title text (null if document isn't HTML or lacks a title tag) * @throws IOException */ public static String getPageTitle(String url) throws IOException { OkHttpClient client = Reddit.client; Request request = new Request.Builder() .url(LinkUtil.formatURL(url).toString()) .addHeader("Accept", "text/html") .build(); Response response = client.newCall(request).execute(); if (!response.isSuccessful()) return null; Matcher matcher = TITLE_TAG.matcher(response.body().string()); response.body().close(); if (matcher.find()) { return matcher.group(1).replaceAll("\\s+", " "); } else { return null; } } }