/**
* A simple dictionary for Korean, powered by National Institute of the Korean Language
* Copyright (C) 2015 ChalkPE
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package pe.chalk.kodic;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
 * Looks up Korean words by sending a search request to the dictionary page at
 * {@link #URL} and scraping the returned HTML with jsoup.
 *
 * <p>Every response body is kept in an in-memory cache keyed by the exact
 * request parameter string. The cache is a plain {@link HashMap}: entries are
 * never evicted and access is not synchronized.
 *
 * @author ChalkPE <amato0617@gmail.com>
 * @since 2015-06-02
 */
public class KoreanFinder {
    /** Address of the search page every request is sent to. */
    public static final String URL = "http://stdweb2.korean.go.kr/search/List_dic.jsp";

    /** Charset name used for URL-encoding the parameters and for the request/response streams. */
    public static final String CHARSET = "UTF-8";

    /** Response bodies already fetched, keyed by the request parameter string. */
    private static final Map<String, String> cache = new HashMap<>();

    /**
     * Match mode of a search. The {@link #toString()} value is what is sent as
     * the {@code Gubun} request parameter; how the server interprets each id is
     * presumed from the constant names and cannot be verified from this file.
     */
    @SuppressWarnings("unused")
    enum SearchType {
        EQUALS("0"), STARTS_WITH("1"), ENDS_WITH("2"), CONTAINS("3");

        private final String id;

        SearchType(String id){
            this.id = id;
        }

        /** @return the id sent to the server for this match mode */
        @Override
        public String toString(){
            return this.id;
        }
    }

    /**
     * Code sent as the {@code SpCode} request parameter. {@link #MYEONGSA} is
     * the code {@link #getAllNoun} searches with (presumably the part-of-speech
     * code for nouns - confirm against the server's form).
     */
    @SuppressWarnings("unused")
    enum SpCode {
        MYEONGSA("1");

        private final String id;

        SpCode(String id){
            this.id = id;
        }

        /** @return the id sent to the server for this code */
        @Override
        public String toString(){
            return this.id;
        }
    }

    /** URL-encodes a single token with {@link #CHARSET}. */
    private static String encode(String token) throws UnsupportedEncodingException {
        return URLEncoder.encode(token, CHARSET);
    }

    /** Builds one URL-encoded {@code key=value} pair. */
    private static String pair(String key, String value) throws UnsupportedEncodingException {
        return encode(key) + "=" + encode(value);
    }

    /**
     * Builds the form-encoded parameter string for one search request.
     *
     * @param type    match mode, sent as {@code Gubun}
     * @param text    search text, sent URL-encoded as {@code SearchText}
     * @param spCodes zero or more codes, each sent as its own {@code SpCode} parameter
     * @return the parameter string, pairs separated by {@code &}
     * @throws UnsupportedEncodingException if {@link #CHARSET} is not supported
     */
    private static String getParameters(SearchType type, String text, SpCode... spCodes) throws UnsupportedEncodingException {
        StringBuilder query = new StringBuilder();

        // PageRow is a fixed, very large value (presumably so that every hit comes back on one page).
        query.append(pair("PageRow", "100000000")).append('&');

        // The '|' between the two table names is sent as-is, not percent-encoded.
        query.append(encode("Table")).append('=')
                .append(encode("words")).append('|').append(encode("word")).append('&');

        query.append(pair("Gubun", type.toString())).append('&');
        query.append(pair("SearchPart", "Simple")).append('&');

        for(SpCode spCode : spCodes){
            query.append(pair("SpCode", spCode.toString())).append('&');
        }

        return query.append(pair("SearchText", text)).toString();
    }

    /**
     * Returns the response body for the given parameter string, fetching it
     * from {@link #URL} on the first request and from the cache afterwards.
     *
     * @param parameters form-encoded request parameters, written to the request body
     * @return the response text, each line terminated by {@code '\n'}
     * @throws IOException if the connection, the write or the read fails
     */
    private static String getHTML(String parameters) throws IOException {
        String cached = cache.get(parameters);
        if(cached != null){
            return cached;
        }

        HttpURLConnection connection = (HttpURLConnection) new URL(URL).openConnection();
        connection.setDoOutput(true);
        // Declared explicitly as POST because the parameters are written to the
        // request body; a GET with a body only works where the HttpURLConnection
        // implementation silently upgrades the method.
        connection.setRequestMethod("POST");

        try(BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(connection.getOutputStream(), CHARSET))){
            writer.write(parameters);
        }

        StringBuilder body = new StringBuilder();
        try(BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), CHARSET))){
            String line;
            while((line = reader.readLine()) != null){
                body.append(line).append('\n');
            }
        }

        String html = body.toString();
        cache.put(parameters, html);
        return html;
    }

    /** Returns {@code raw} with every code point that is not in the Hangul script removed. */
    private static String keepHangul(String raw) {
        return raw.codePoints()
                .filter(codePoint -> Character.UnicodeScript.of(codePoint) == Character.UnicodeScript.HANGUL)
                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
                .toString();
    }

    /**
     * Searches with {@link SpCode#MYEONGSA} and returns the words found.
     *
     * <p>Each {@code p.exp} entry under {@code span#print_area} is dropped when
     * the text of any of its direct {@code font[face="새굴림"]} children equals
     * one of {@code banned}. From the remaining entries the text of every
     * {@code a[title] strong font} element is reduced to its Hangul code points.
     *
     * <p>The length and uniqueness checks run on the Hangul-only form, not on
     * the raw element text: stripping can make two different raw texts equal or
     * shrink a text below two characters, and such results must not be returned.
     *
     * @param type        match mode for the search
     * @param firstLetter search text (despite the name it is passed to the server unchanged)
     * @param banned      label texts that exclude an entry when present
     * @return distinct Hangul-only words longer than one character, in document order
     * @throws IOException if the dictionary page cannot be fetched
     */
    public static Collection<String> getAllNoun(SearchType type, String firstLetter, String... banned) throws IOException {
        // Built once instead of once per examined element.
        final Collection<String> bannedLabels = Arrays.asList(banned);
        final String html = KoreanFinder.getHTML(KoreanFinder.getParameters(type, firstLetter, SpCode.MYEONGSA));

        final Elements allowedEntries = new Elements(Jsoup.parse(html)
                .select("span#print_area p.exp").stream()
                .filter(entry -> entry.select("> font[face=\"새굴림\"]").stream()
                        .map(Element::text)
                        .noneMatch(bannedLabels::contains))
                .collect(Collectors.toList()));

        return allowedEntries.select("a[title] strong font").stream()
                .map(Element::text)
                .map(KoreanFinder::keepHangul)
                .filter(word -> word.length() > 1)
                .distinct()
                .collect(Collectors.toList());
    }
}