package org.fastcatsearch.datasource.reader;
import com.esotericsoftware.yamlbeans.YamlReader;
import org.fastcatsearch.datasource.SourceModifier;
import org.fastcatsearch.datasource.reader.annotation.SourceReader;
import org.fastcatsearch.ir.common.IRException;
import org.fastcatsearch.ir.config.SingleSourceConfig;
import org.fastcatsearch.util.ReadabilityExtractor;
import org.fastcatsearch.util.WebPageGather;
import org.jdom.Attribute;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.JDOMException;
import org.jdom.input.SAXBuilder;
import org.json.simple.parser.ParseException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.json.simple.parser.JSONParser;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Created by Jeon Jehyun on 2016-02-22.
 *
 * Source reader that loads a list of entries (each holding at least a {@code url}
 * and optionally a {@code charset}) from a configuration file, then downloads and
 * parses every referenced web page so that it can be indexed.
 *
 * Supported configuration file formats (parameter {@code configFileType}):
 * <ul>
 * <li>{@code xml}  - {@code <root><document><entity url=".." charset=".."/>...</document></root>};
 *     every attribute of an {@code entity} element becomes a field.</li>
 * <li>{@code yml}  - a stream of YAML documents, each of which is a map.</li>
 * <li>{@code json} - "JSON list" format: exactly one JSON object per line.</li>
 * </ul>
 *
 * For every entry with a non-empty url, {@link #hasNext()} adds the fields
 * {@code content}, {@code id}, {@code title} and {@code wdate} to the entry.
 */
@SourceReader(name="WEBPAGE_CONFIG")
public class WebPageConfigFileSourceReader extends SingleSourceReader<Map<String, Object>> {

    protected static Logger logger = LoggerFactory.getLogger(WebPageConfigFileSourceReader.class);

    private static final String TYPE_XML = "xml";
    private static final String TYPE_YML = "yml";
    private static final String TYPE_JSON = "json";

    private static final String DEFAULT_CHARSET = "utf-8";

    /**
     * Extracts the text of the first {@code <title>} element of a page.
     * The quantifier is reluctant on purpose: with a greedy match a page that
     * contains several {@code </title>} tags (e.g. inline SVG titles) would yield
     * everything between the first opening and the LAST closing tag.
     */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>(.*?)</title>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Document prepared by the latest successful {@link #hasNext()} call. */
    private Map<String, Object> dataMap;
    /** Entries loaded from the configuration file by {@link #init()}. */
    private List<Map<String, Object>> sourceList;
    /** Index of the next entry of {@link #sourceList} to be consumed. */
    private int idx;
    /** Formatter of the {@code wdate} field (only used by this instance). */
    private SimpleDateFormat wdate;
    private WebPageGather webPageGather;

    public WebPageConfigFileSourceReader() {
        super();
    }

    public WebPageConfigFileSourceReader(String collectionId, File filePath, SingleSourceConfig singleSourceConfig, SourceModifier<Map<String, Object>> sourceModifier, String lastIndexTime) throws IRException {
        super(collectionId, filePath, singleSourceConfig, sourceModifier, lastIndexTime);
    }

    /**
     * Resets the reader state and loads all entries from the configured file.
     *
     * Problems while reading the file (missing file, I/O or parse errors) are
     * logged and leave the reader with the entries loaded so far, possibly none.
     *
     * @throws IRException if a required parameter is missing or
     *                     {@code configFileType} is none of xml, yml, json.
     */
    @Override
    public void init() throws IRException {
        String configType = requireConfig("configFileType");
        File configFile = new File(requireConfig("configFilePath"));

        dataMap = null;
        idx = 0; // start from the first entry again when the reader is re-initialized
        wdate = new SimpleDateFormat("yyyyMMddHHmmss");
        sourceList = new ArrayList<Map<String, Object>>();
        webPageGather = new WebPageGather();

        boolean isXml = TYPE_XML.equalsIgnoreCase(configType);
        boolean isYml = TYPE_YML.equalsIgnoreCase(configType);
        boolean isJson = TYPE_JSON.equalsIgnoreCase(configType);
        if (!isXml && !isYml && !isJson) {
            throw new IRException("Input File Type one out of the Type in XML, YML, JSON.");
        }

        if (!configFile.exists()) {
            // Nothing to parse: keep the source list empty instead of failing later
            // inside the format specific loader.
            logger.error("There is no Source File. [{}]", configFile.getAbsolutePath());
            return;
        }

        if (isXml) {
            loadXml(configFile);
        } else if (isYml) {
            loadYaml(configFile);
        } else {
            loadJsonLines(configFile);
        }
    }

    /** Returns the value of a required parameter as a string, failing when it is absent. */
    private String requireConfig(String key) throws IRException {
        Object value = getConfigString(key);
        if (value == null) {
            throw new IRException("Required parameter is missing : " + key);
        }
        return value.toString();
    }

    /** Loads one entry per {@code document/entity} element; its attributes become the fields. */
    private void loadXml(File configFile) {
        SAXBuilder builder = new SAXBuilder();
        try {
            Document doc = builder.build(configFile);
            Element root = doc.getRootElement();
            Element documentElement = root.getChild("document");
            if (documentElement == null) {
                logger.error("WebPageConfigFileSourceReader Error : no <document> element in {}", configFile.getAbsolutePath());
                return;
            }
            for (Object child : documentElement.getChildren("entity")) {
                Element entity = (Element) child;
                Map<String, Object> sdata = new HashMap<String, Object>();
                for (Object attr : entity.getAttributes()) {
                    String attributeName = ((Attribute) attr).getName();
                    sdata.put(attributeName, entity.getAttributeValue(attributeName));
                }
                sourceList.add(sdata);
            }
        } catch (IOException e) {
            logger.error("WebPageConfigFileSourceReader Error ", e);
        } catch (JDOMException e) {
            logger.error("WebPageConfigFileSourceReader Error ", e);
        }
    }

    /** Loads one entry per YAML document; documents that are not maps are skipped. */
    @SuppressWarnings("unchecked") // guarded by instanceof; keys/values are used as String/Object only
    private void loadYaml(File configFile) {
        YamlReader reader = null;
        try {
            // NOTE(review): the file is decoded with the platform default charset.
            reader = new YamlReader(new FileReader(configFile));
            while (true) {
                Object yamlDocument = reader.read();
                if (yamlDocument == null) {
                    break;
                }
                if (yamlDocument instanceof Map) {
                    sourceList.add((Map<String, Object>) yamlDocument);
                } else {
                    logger.warn("WebPageConfigFileSourceReader : skip YAML document which is not a map : {}", yamlDocument);
                }
            }
        } catch (IOException e) {
            logger.error("WebPageConfigFileSourceReader Error ", e);
        } finally {
            // reader stays null when the file could not be opened
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done when closing fails
                }
            }
        }
    }

    /** Loads one entry per non-blank line, each line holding a single JSON object. */
    @SuppressWarnings("unchecked") // guarded by instanceof; keys/values are used as String/Object only
    private void loadJsonLines(File configFile) {
        BufferedReader jsonReader = null;
        try {
            // NOTE(review): the file is decoded with the platform default charset.
            jsonReader = new BufferedReader(new InputStreamReader(new FileInputStream(configFile)));
            String line;
            while ((line = jsonReader.readLine()) != null) {
                if (line.trim().length() == 0) {
                    continue; // tolerate blank lines between objects
                }
                Object parsed = new JSONParser().parse(line);
                if (parsed instanceof Map) {
                    sourceList.add((Map<String, Object>) parsed);
                } else {
                    logger.warn("WebPageConfigFileSourceReader : skip JSON line which is not an object : {}", line);
                }
            }
        } catch (IOException e) {
            // also covers FileNotFoundException
            logger.error("WebPageConfigFileSourceReader Error ", e);
        } catch (ParseException e) {
            logger.error("WebPageConfigFileSourceReader Error ", e);
        } finally {
            // jsonReader stays null when the file could not be opened
            if (jsonReader != null) {
                try {
                    jsonReader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done when closing fails
                }
            }
        }
    }

    /**
     * Consumes the next entry and prepares it for {@link #next()}.
     *
     * When the entry has a url, the page is downloaded and the fields
     * {@code content}, {@code id}, {@code title} and {@code wdate} are added.
     * An entry without url is passed on unchanged. Note that every call
     * advances to the following entry.
     *
     * @return {@code false} when all entries have been consumed.
     */
    @Override
    public boolean hasNext() throws IRException {
        Map<String, Object> oneDoc = readOneDoc();
        if (oneDoc == null) {
            return false;
        }

        String url = asString(oneDoc.get("url"));
        String charset = asString(oneDoc.get("charset"));
        if (charset == null || charset.equals("")) {
            charset = DEFAULT_CHARSET;
        }

        if (url == null || url.equals("")) {
            logger.debug("There is no url in entity{}", idx);
        } else {
            String source = webPageGather.getLinkPageContent(url, charset, "get");

            // A page that could not be fetched is indexed with empty content and title.
            String content = source == null ? null : new ReadabilityExtractor().extract(source);
            if (content == null) {
                content = "";
            }

            String title = "";
            if (source != null) {
                Matcher m = TITLE_PATTERN.matcher(source);
                if (m.find()) {
                    title = m.group(1);
                }
            }

            oneDoc.put("content", content);
            // idx was already advanced by readOneDoc(), so ids start at 1.
            oneDoc.put("id", idx);
            oneDoc.put("title", title);
            oneDoc.put("wdate", wdate.format(new Date()));
        }

        dataMap = oneDoc;
        return true;
    }

    /** Converts a field value to a string; YAML/JSON values are not necessarily strings. */
    private static String asString(Object value) {
        return value == null ? null : value.toString();
    }

    /** Returns the next loaded entry and advances the cursor, or {@code null} when exhausted. */
    private Map<String, Object> readOneDoc() throws IRException {
        if (sourceList != null && idx < sourceList.size()) {
            return sourceList.get(idx++);
        }
        return null;
    }

    /** Returns the document prepared by the preceding {@link #hasNext()} call. */
    @Override
    protected Map<String, Object> next() throws IRException {
        return dataMap;
    }

    /** Registers the two required parameters: the config file type and its path. */
    @Override
    protected void initParameters() {
        registerParameter(new SourceReaderParameter("configFileType", "Full indexing Config File Type", "Config File Type for Full indexing Webpage Parsing. (XML, YML, JSON)"
                , SourceReaderParameter.TYPE_STRING, true, null));
        registerParameter(new SourceReaderParameter("configFilePath", "Full indexing Config File Path", "Config File for Full indexing Webpage Parsing."
                , SourceReaderParameter.TYPE_STRING_LONG, true, null));
    }
}