/** * License Agreement for OpenSearchServer * <p> * Copyright (C) 2010-2014 Emmanuel Keller / Jaeksoft * <p> * http://www.open-search-server.com * <p> * This file is part of OpenSearchServer. * <p> * OpenSearchServer is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * <p> * OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * <p> * You should have received a copy of the GNU General Public License * along with OpenSearchServer. * If not, see <http://www.gnu.org/licenses/>. **/ package com.jaeksoft.searchlib.crawler; import com.jaeksoft.searchlib.Logging; import com.jaeksoft.searchlib.SearchLibException; import com.jaeksoft.searchlib.crawler.common.database.CommonFieldTarget; import com.jaeksoft.searchlib.crawler.file.database.FilePathItem; import com.jaeksoft.searchlib.crawler.file.database.FileTypeEnum; import com.jaeksoft.searchlib.crawler.file.process.FileInstanceAbstract; import com.jaeksoft.searchlib.crawler.web.database.HostUrlList.ListType; import com.jaeksoft.searchlib.crawler.web.process.WebCrawlThread; import com.jaeksoft.searchlib.crawler.web.spider.Crawl; import com.jaeksoft.searchlib.function.expression.SyntaxError; import com.jaeksoft.searchlib.index.FieldContent; import com.jaeksoft.searchlib.index.IndexDocument; import com.jaeksoft.searchlib.parser.Parser; import com.jaeksoft.searchlib.query.ParseException; import com.jaeksoft.searchlib.schema.FieldValueItem; import com.jaeksoft.searchlib.schema.FieldValueOriginEnum; import com.jaeksoft.searchlib.util.*; import com.jaeksoft.searchlib.util.map.GenericLink; import com.jaeksoft.searchlib.util.map.GenericMap; import com.jaeksoft.searchlib.util.map.SourceField; import com.jaeksoft.searchlib.util.map.TargetField; import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.PathNotFoundException; import net.minidev.json.JSONArray; import org.apache.commons.lang3.StringEscapeUtils; import org.w3c.dom.Node; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerConfigurationException; import javax.xml.xpath.XPathExpressionException; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.net.URI; import java.net.URISyntaxException; import java.util.List; import java.util.Set; public abstract class FieldMapGeneric<S extends SourceField, T extends TargetField> extends GenericMap<S, T> { private File mapFile; protected FieldMapGeneric() { mapFile = null; } protected FieldMapGeneric(Node parentNode) throws XPathExpressionException { mapFile = null; load(parentNode); } protected FieldMapGeneric(File mapFile, String rootXPath) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException { this.mapFile = mapFile; if (!mapFile.exists()) return; XPathParser xpp = new XPathParser(mapFile); load(xpp.getNode(rootXPath)); } protected abstract T loadTarget(String targetName, Node node); protected abstract S loadSource(String source); public void load(Node parentNode) throws XPathExpressionException { synchronized (this) { if (parentNode == null) return; List<Node> nodeList = DomUtils.getNodes(parentNode, "link"); for (Node node : nodeList) { String sourceName = StringEscapeUtils.unescapeXml(DomUtils.getAttributeText(node, "source")); S source = loadSource(sourceName); if (source == null) continue; String targetName = DomUtils.getAttributeText(node, "target"); T target = loadTarget(targetName, node); if (target == null) continue; add(source, target); } } } protected abstract void writeTarget(XmlWriter xmlWriter, T target) throws SAXException; public void store(XmlWriter xmlWriter) throws SAXException { for (GenericLink<S, T> link : getList()) { T target = link.getTarget(); xmlWriter.startElement("link", "source", link.getSource().toXmlAttribute(), "target", target.toXmlAttribute(), "analyzer", target.getAnalyzer(), "boost", target.getBoost() == null ? null : Float.toString(target.getBoost())); writeTarget(xmlWriter, link.getTarget()); xmlWriter.endElement(); } } public void store() throws TransformerConfigurationException, SAXException, IOException { synchronized (this) { if (!mapFile.exists()) mapFile.createNewFile(); PrintWriter pw = new PrintWriter(mapFile); try { XmlWriter xmlWriter = new XmlWriter(pw, "UTF-8"); xmlWriter.startElement("map"); store(xmlWriter); xmlWriter.endElement(); xmlWriter.endDocument(); } finally { pw.close(); } } } final protected void mapFieldTarget(FieldMapContext context, FieldContent fc, CommonFieldTarget targetField, IndexDocument target, Set<String> filePathSet) throws IOException, SearchLibException, ParseException, SyntaxError, URISyntaxException, ClassNotFoundException, InterruptedException, InstantiationException, IllegalAccessException { if (fc == null) return; for (FieldValueItem fvi : fc.getValues()) mapFieldTarget(context, targetField, fvi.value, target, filePathSet); } final public String mapFieldTarget(CommonFieldTarget dfTarget, String content) { if (StringUtils.isEmpty(content)) return null; if (dfTarget.isConvertHtmlEntities()) content = StringEscapeUtils.unescapeHtml4(content); if (dfTarget.isRemoveTag()) content = StringUtils.removeTag(content); if (dfTarget.hasRegexpPattern()) content = dfTarget.applyRegexPattern(content); return content; } final protected void mapFieldTarget(FieldMapContext context, CommonFieldTarget dfTarget, String content, IndexDocument target, Set<String> filePathSet) throws SearchLibException, IOException, ParseException, SyntaxError, URISyntaxException, ClassNotFoundException, InterruptedException, InstantiationException, IllegalAccessException { if (dfTarget == null) return; if (StringUtils.isEmpty(content)) return; if (dfTarget.isFilePath()) { String filePath = dfTarget.getFilePath(content); if (filePathSet == null || !filePathSet.contains(filePath)) { if (filePathSet != null) filePathSet.add(filePath); File file = new File(filePath); if (file.exists()) { Parser parser = context.parserSelector.parseFile(null, file.getName(), null, null, file, context.lang); if (parser != null) parser.popupateResult(0, target); } else { Logging.error("File don't exist:" + file.getAbsolutePath()); } } } if (dfTarget.isCrawlFile()) { String filePathName = dfTarget.getFilePathPrefix(); if (filePathSet == null || !filePathSet.contains(content)) { if (filePathSet != null) filePathSet.add(content); URI filePathURI = new URI(filePathName); FilePathItem filePathItem = context.filePathManager.findFirst(filePathURI.getScheme(), filePathURI.getHost()); if (filePathItem == null) throw new SearchLibException("FilePathItem not found: " + filePathName); FileInstanceAbstract fileInstance = FileInstanceAbstract.create(filePathItem, null, filePathItem.getPath() + content); FileTypeEnum type = fileInstance.getFileType(); if (type != null && type == FileTypeEnum.file) { Parser parser = context.parserSelector.parseStream(null, fileInstance.getFileName(), null, null, fileInstance.getInputStream(), context.lang, null, null); if (parser != null) parser.popupateResult(0, target); } } } if (dfTarget.isCrawlUrl()) { WebCrawlThread crawlThread = context.webCrawlMaster.manualCrawl(LinkUtils.newEncodedURL(content), ListType.DBCRAWL); crawlThread.waitForStart(60); crawlThread.waitForEnd(60); Crawl crawl = crawlThread.getCurrentCrawl(); if (crawl != null) { IndexDocument targetIndexDocument = crawl.getTargetIndexDocument(0); if (targetIndexDocument != null) target.add(targetIndexDocument); } } content = mapFieldTarget(dfTarget, content); target.add(dfTarget.getName(), new FieldValueItem(FieldValueOriginEnum.EXTERNAL, content)); } public void mapJson(FieldMapContext context, Object jsonObject, IndexDocument target) throws SearchLibException, IOException, ParseException, SyntaxError, URISyntaxException, ClassNotFoundException, InterruptedException, InstantiationException, IllegalAccessException { for (GenericLink<S, T> link : getList()) { String jsonPath = link.getSource().getUniqueName(); try { Object jsonContent = JsonPath.read(jsonObject, jsonPath); if (jsonContent == null) continue; if (jsonContent instanceof JSONArray) { JSONArray jsonArray = (JSONArray) jsonContent; for (Object content : jsonArray) { if (content != null) mapFieldTarget(context, (CommonFieldTarget) link.getTarget(), content.toString(), target, null); } } else mapFieldTarget(context, (CommonFieldTarget) link.getTarget(), jsonContent.toString(), target, null); } catch (PathNotFoundException e) { continue; } catch (IllegalArgumentException e) { Logging.warn(e); continue; } } } }