// TitleExtractor.java example
//
// Explorer
// slide-master
// - app
//   - src
package me.ccrama.redditslide.util;

/**
 * Created by ccrama on 4/10/2016.
 */

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import me.ccrama.redditslide.Reddit;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;

/**
 * Static utility that downloads a page and pulls the text of its first
 * {@code <title>} element out of the raw markup with a regular expression.
 * Not instantiable.
 */
public class TitleExtractor {
    /* the CASE_INSENSITIVE flag accounts for
     * sites that use uppercase title tags.
     * the DOTALL flag accounts for sites that have
     * line feeds in the title text */
    private static final Pattern TITLE_TAG =
            Pattern.compile("<title[^>]*>(.*?)</title>", Pattern.CASE_INSENSITIVE|Pattern.DOTALL);

    private TitleExtractor() {
    }

    /**
     * Fetches {@code url} with the shared {@code Reddit.client} and returns the
     * contents of the first title tag found in the response body, with every
     * run of whitespace collapsed to a single space.
     *
     * @param url the HTML page; passed through {@code LinkUtil.formatURL} before
     *            the request is built (presumably to normalise it; see LinkUtil)
     * @return title text, or {@code null} if the response status is not
     *         successful or the body contains no title tag
     * @throws IOException if the request fails or the body cannot be read
     */
    public static String getPageTitle(String url) throws IOException {
        OkHttpClient client = Reddit.client;
        Request request = new Request.Builder()
                .url(LinkUtil.formatURL(url).toString())
                .addHeader("Accept", "text/html")
                .build();
        Response response = client.newCall(request).execute();

        try {
            // Non-2xx responses still carry a body that must be released;
            // the finally block below takes care of it.
            if (!response.isSuccessful()) return null;

            Matcher matcher = TITLE_TAG.matcher(response.body().string());
            if (matcher.find()) {
                return matcher.group(1).replaceAll("\\s+", " ");
            } else {
                return null;
            }
        } finally {
            // Always close the body so the connection is released, including
            // on the early-return and exception paths. Closing a body that
            // string() has already consumed is harmless.
            response.body().close();
        }
    }
}