// WebPageSourceReaderTest.java example
//
// Explorer
// fastcatsearch-master
package org.fastcatsearch.datasource.reader;

import org.junit.Test;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Checks regex-based extraction of the text inside an HTML {@code <title>} element.
 *
 * <p>NOTE(review): judging by the class name this presumably mirrors title handling in
 * {@code WebPageSourceReader}; confirm the pattern below stays in sync with that class.
 *
 * <p>Created by white on 2016-02-25.
 */
public class WebPageSourceReaderTest {

    /** Number of leading characters of the raw text used as the title when the pattern does not match. */
    private static final int FALLBACK_TITLE_LENGTH = 10;

    /**
     * Matches a title element whose text sits on its own line between the tags.
     * Without {@code DOTALL}, {@code .} stops at line terminators, so group 1 is a single line.
     * A title written on one line (e.g. {@code <title>text</title>}) does not match, because
     * the pattern requires a newline right after the opening tag and right before the closing tag.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>\\n(.*)\\n</title>", Pattern.CASE_INSENSITIVE);

    /**
     * Verifies that a multi-line title element yields the text of its inner line,
     * including the leading tab and the embedded {@code <br>} markup, which are not stripped.
     *
     * @throws AssertionError if the extracted title differs from the expected line
     */
    @Test
    public void testTitleParsing() {
        String htmlText =
                "<title>\n" +
                "\t박 대통령, 주먹으로 책상치며<br> \"필리버스터? 기가 막힌 현상\" - 오마이뉴스\n" +
                "</title>";
        String expected = "\t박 대통령, 주먹으로 책상치며<br> \"필리버스터? 기가 막힌 현상\" - 오마이뉴스";

        String title = extractTitle(htmlText);

        // The original test computed the title and discarded it, so it could never fail.
        // AssertionError is reported by JUnit as a test failure.
        if (!expected.equals(title)) {
            throw new AssertionError("expected title <" + expected + "> but was <" + title + ">");
        }
    }

    /**
     * Extracts the title text from the given HTML fragment.
     *
     * @param htmlText raw HTML text to inspect
     * @return the captured title line when {@link #TITLE_PATTERN} matches; otherwise the first
     *         {@link #FALLBACK_TITLE_LENGTH} characters of {@code htmlText}, or all of it when
     *         it is not longer than that
     */
    private static String extractTitle(String htmlText) {
        Matcher matcher = TITLE_PATTERN.matcher(htmlText);
        if (matcher.find()) {
            return matcher.group(1);
        }
        // No title element found: fall back to a short prefix of the raw text.
        if (htmlText.length() > FALLBACK_TITLE_LENGTH) {
            return htmlText.substring(0, FALLBACK_TITLE_LENGTH);
        }
        return htmlText;
    }
}