package org.fastcatsearch.datasource.reader;

import org.junit.Test;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Exercises regex-based extraction of the {@code <title>} text from an HTML fragment.
 *
 * Created by white on 2016-02-25.
 */
public class WebPageSourceReaderTest {

    /** Number of leading characters kept as the title when no title element matches. */
    private static final int FALLBACK_TITLE_LENGTH = 10;

    /**
     * Matches a title element whose text sits on its own line between the opening and
     * closing tags. {@code .} does not cross line terminators here, so the captured group
     * is exactly that one line. A single-line element such as
     * {@code <title>테스트</title>} does not match and takes the fallback path.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>\\n(.*)\\n</title>", Pattern.CASE_INSENSITIVE);

    /**
     * Verifies that the text line of a multi-line title element is captured verbatim,
     * including its leading tab and the embedded {@code <br>} tag.
     */
    @Test
    public void testTitleParsing() {
        String htmlText = "<title>\n"
                + "\t박 대통령, 주먹으로 책상치며<br> \"필리버스터? 기가 막힌 현상\" - 오마이뉴스\n"
                + "</title>";
        String expected = "\t박 대통령, 주먹으로 책상치며<br> \"필리버스터? 기가 막힌 현상\" - 오마이뉴스";

        String title = parseTitle(htmlText);

        // Previously the parsed value was never checked, so this test could not fail.
        // AssertionError is reported as a test failure and needs no additional import.
        if (!expected.equals(title)) {
            throw new AssertionError("expected title <" + expected + "> but was <" + title + ">");
        }
    }

    /**
     * Extracts the title from the given HTML text.
     *
     * @param htmlText HTML text to inspect; must not be {@code null}
     * @return the text captured by {@link #TITLE_PATTERN} when it matches; otherwise the
     *         first {@value #FALLBACK_TITLE_LENGTH} characters of {@code htmlText}, or the
     *         whole text when it is no longer than that
     */
    private static String parseTitle(String htmlText) {
        Matcher matcher = TITLE_PATTERN.matcher(htmlText);
        if (matcher.find()) {
            return matcher.group(1);
        }
        if (htmlText.length() > FALLBACK_TITLE_LENGTH) {
            return htmlText.substring(0, FALLBACK_TITLE_LENGTH);
        }
        return htmlText;
    }
}