/** * License Agreement for OpenSearchServer * <p> * Copyright (C) 2008-2015 Emmanuel Keller / Jaeksoft * <p> * http://www.open-search-server.com * <p> * This file is part of OpenSearchServer. * <p> * OpenSearchServer is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * <p> * OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * <p> * You should have received a copy of the GNU General Public License * along with OpenSearchServer. * If not, see <http://www.gnu.org/licenses/>. **/ package com.jaeksoft.searchlib.index; import com.jaeksoft.searchlib.Client; import com.jaeksoft.searchlib.SearchLibException; import com.jaeksoft.searchlib.analysis.LanguageEnum; import com.jaeksoft.searchlib.crawler.web.database.CredentialItem; import com.jaeksoft.searchlib.crawler.web.spider.DownloadItem; import com.jaeksoft.searchlib.crawler.web.spider.HttpDownloader; import com.jaeksoft.searchlib.logreport.ErrorParserLogger; import com.jaeksoft.searchlib.parser.Parser; import com.jaeksoft.searchlib.parser.ParserSelector; import com.jaeksoft.searchlib.schema.FieldValueItem; import com.jaeksoft.searchlib.schema.FieldValueOriginEnum; import com.jaeksoft.searchlib.schema.Schema; import com.jaeksoft.searchlib.schema.SchemaField; import com.jaeksoft.searchlib.util.DomUtils; import com.jaeksoft.searchlib.util.StringUtils; import com.jaeksoft.searchlib.util.XPathParser; import org.apache.commons.lang3.StringEscapeUtils; import org.w3c.dom.DOMException; import org.w3c.dom.Node; import javax.xml.xpath.XPathExpressionException; import java.io.File; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.*; public class IndexDocument implements Iterable<FieldContent> { private final Map<String, FieldContent> fields; private LanguageEnum lang; public IndexDocument() { fields = new TreeMap<String, FieldContent>(); this.lang = null; } public IndexDocument(IndexDocument sourceDocument) { this(sourceDocument.lang); for (Map.Entry<String, FieldContent> entry : sourceDocument.fields.entrySet()) add(entry.getKey(), entry.getValue()); } public IndexDocument(LanguageEnum lang) { this(); this.lang = lang; } public IndexDocument(Locale lang) { this(); if (lang != null) this.lang = LanguageEnum.findByCode(lang.getLanguage()); } private final List<String> getCopyFieldList(Node fieldNode) throws XPathExpressionException { List<Node> copyNodes = DomUtils.getNodes(fieldNode, "copy"); if (copyNodes == null || copyNodes.size() == 0) return null; List<String> copyList = new ArrayList<String>(); for (Node copyNode : copyNodes) { String f = XPathParser.getAttributeString(copyNode, "field"); if (f != null) copyList.add(f); } return copyList; } /** * Create a new instance of IndexDocument from an XML structure <br/> * <field name="FIELDNAME"><br/> *   <value>VALUE1</value><br/> *   <value>VALUE2</value><br/> * </field> * * @param client * @param parserSelector * @param documentNode * @param urlDefaultCredential * @param httpDownloader * @throws XPathExpressionException * @throws SearchLibException * @throws ClassNotFoundException * @throws IllegalAccessException * @throws InstantiationException * @throws DOMException * @throws IOException * @throws URISyntaxException */ public IndexDocument(Client client, ParserSelector parserSelector, Node documentNode, CredentialItem urlDefaultCredential, HttpDownloader httpDownloader) throws XPathExpressionException, SearchLibException, InstantiationException, IllegalAccessException, ClassNotFoundException, IOException, URISyntaxException { this(LanguageEnum.findByCode(XPathParser.getAttributeString(documentNode, "lang"))); List<Node> fieldNodes = DomUtils.getNodes(documentNode, "field"); for (Node fieldNode : fieldNodes) { List<String> copyFieldList = getCopyFieldList(fieldNode); String fieldName = XPathParser.getAttributeString(fieldNode, "name"); List<Node> valueNodes = DomUtils.getNodes(fieldNode, "value"); for (Node valueNode : valueNodes) { boolean removeTag = "yes".equalsIgnoreCase(XPathParser.getAttributeString(valueNode, "removeTag")); boolean convertHtmlEntities = "yes".equalsIgnoreCase(XPathParser.getAttributeString(valueNode, "convertHtmlEntities")); String textContent = valueNode.getTextContent(); if (convertHtmlEntities) textContent = StringEscapeUtils.unescapeHtml4(textContent); if (removeTag) textContent = StringUtils.removeTag(textContent); Float boost = XPathParser.getAttributeFloat(valueNode, "boost"); add(fieldName, textContent, boost); if (copyFieldList != null) for (String f : copyFieldList) add(f, textContent, boost); } } List<Node> binaryNodes = DomUtils.getNodes(documentNode, "binary"); for (Node node : binaryNodes) { boolean bFaultTolerant = "yes".equalsIgnoreCase(XPathParser.getAttributeString(node, "faultTolerant")); String filename = XPathParser.getAttributeString(node, "fileName"); if (filename == null || filename.length() == 0) filename = XPathParser.getAttributeString(node, "filename"); String filePath = XPathParser.getAttributeString(node, "filePath"); if (filePath == null || filePath.length() == 0) filePath = XPathParser.getAttributeString(node, "filepath"); String contentType = XPathParser.getAttributeString(node, "contentType"); if (contentType == null || contentType.length() == 0) contentType = XPathParser.getAttributeString(node, "contenttype"); String content = node.getTextContent(); String url = XPathParser.getAttributeString(node, "url"); Parser parser = doBinary(url, content, filePath, filename, client, parserSelector, contentType, urlDefaultCredential, httpDownloader, bFaultTolerant); if (parser != null) parser.popupateResult(0, this); } } private Parser doBinary(String url, String content, String filePath, String filename, Client client, ParserSelector parserSelector, String contentType, CredentialItem urlDefaultCredential, HttpDownloader httpDownloader, boolean bFaultTolerant) throws IOException, URISyntaxException, InstantiationException, IllegalAccessException, ClassNotFoundException, SearchLibException { try { Parser parser = null; if (url != null) parser = binaryFromUrl(parserSelector, url, urlDefaultCredential, httpDownloader); else if (content != null && content.length() > 0) parser = binaryFromBase64(parserSelector, filename, contentType, content); else if (filePath != null && filePath.length() > 0) parser = binaryFromFile(parserSelector, filename, contentType, filePath); return parser; } catch (SearchLibException e) { ErrorParserLogger.log(url, filename, filePath, e); if (!bFaultTolerant) throw e; } catch (NullPointerException e) { ErrorParserLogger.log(url, filename, filePath, e); if (!bFaultTolerant) throw e; } catch (IllegalArgumentException e) { ErrorParserLogger.log(url, filename, filePath, e); if (!bFaultTolerant) throw e; } catch (RuntimeException e) { ErrorParserLogger.log(url, filename, filePath, e); if (!bFaultTolerant) throw new SearchLibException(e); } catch (Exception e) { ErrorParserLogger.log(url, filename, filePath, e); if (!bFaultTolerant) throw new SearchLibException(e); } return null; } private Parser binaryFromUrl(ParserSelector parserSelector, String url, CredentialItem credentialItem, HttpDownloader httpDownloader) throws SearchLibException { try { DownloadItem downloadItem = httpDownloader.get(new URI(url), credentialItem); downloadItem.checkNoErrorList(200); return parserSelector.parseStream(null, downloadItem.getFileName(), downloadItem.getContentBaseType(), url, downloadItem.getContentInputStream(), lang, null, null); } catch (RuntimeException e) { throw new SearchLibException("Parser error while getting binary from URL: " + url, e); } catch (Exception e) { throw new SearchLibException("Parser error while getting binary from URL: " + url, e); } } private Parser binaryFromBase64(ParserSelector parserSelector, String filename, String contentType, String content) throws SearchLibException { try { return parserSelector.parseBase64(null, filename, contentType, null, content, lang); } catch (RuntimeException e) { throw new SearchLibException("Parser error while getting binary : " + filename + " /" + contentType, e); } catch (Exception e) { throw new SearchLibException("Parser error while getting binary : " + filename + " /" + contentType, e); } } private Parser binaryFromFile(ParserSelector parserSelector, String filename, String contentType, String filePath) throws SearchLibException { try { File f = new File(filePath); if (f.isDirectory()) f = new File(f, filename); return parserSelector.parseFile(null, filename, contentType, null, f, lang); } catch (RuntimeException e) { throw new SearchLibException("Parser error while getting binary from file : " + filePath + " /" + filename, e); } catch (Exception e) { throw new SearchLibException("Parser error while getting binary from file : " + filePath + " /" + filename, e); } } public FieldContent getFieldContent(String field) { if (field == null) return null; field = field.intern(); FieldContent fc = fields.get(field); if (fc == null) { fc = new FieldContent(field); fields.put(field, fc); } return fc; } public void add(String field, FieldValueItem fieldValueItem) { if (field == null) return; FieldContent fc = getFieldContent(field); fc.add(fieldValueItem); } public void add(String field, String value, Float boost) { if (value == null || value.length() == 0) return; add(field, new FieldValueItem(FieldValueOriginEnum.EXTERNAL, value, boost)); } public void addObject(String field, Object object) { if (object == null) return; addString(field, object.toString()); } public void addString(String field, String value) { if (value == null) return; add(field, new FieldValueItem(FieldValueOriginEnum.EXTERNAL, value)); } public void addFieldIndexDocument(String field, IndexDocument source) { if (source == null) return; for (FieldContent fieldContent : source) add(field, fieldContent); } public void addFieldValueList(String field, List<FieldValueItem> values) { if (values == null) return; for (FieldValueItem value : values) add(field, value); } public void addObjectList(String field, List<Object> values) { if (values == null) return; for (Object value : values) addObject(field, value.toString()); } public void addStringList(String field, List<String> values) { if (values == null) return; for (String value : values) addString(field, value); } public void add(String field, FieldContent fieldContent) { if (fieldContent == null) return; addFieldValueList(field, fieldContent.getValues()); } private void addIfNotAlreadyHere(FieldContent fieldContent) { if (fieldContent == null) return; FieldContent fc = getFieldContent(fieldContent.getField()); fc.addIfNotAlreadyHere(fieldContent); } public void addIfNotAlreadyHere(IndexDocument source) { for (FieldContent fc : source.fields.values()) addIfNotAlreadyHere(fc); } public void add(Map<String, FieldValueItem> fieldMap) { for (Map.Entry<String, FieldValueItem> entry : fieldMap.entrySet()) add(entry.getKey(), entry.getValue()); } public void add(IndexDocument source) { for (FieldContent fc : source.fields.values()) add(fc.getField(), fc); } public void setString(String field, String value) { FieldContent fc = fields.get(field); if (fc != null) fc.clear(); add(field, value, null); } public void setStringList(String field, List<String> values) { FieldContent fc = fields.get(field); if (fc != null) fc.clear(); addStringList(field, values); } public void setFieldValueItems(String field, List<FieldValueItem> values) { FieldContent fc = fields.get(field); if (fc != null) fc.clear(); addFieldValueList(field, values); } public void setSameValueItems(String field, List<FieldValueItem> values) { if (field == null) return; FieldContent fc = getFieldContent(field); fc.setValueItems(values); } public void setObjectList(String field, List<Object> values) { FieldContent fc = fields.get(field); if (fc != null) fc.clear(); addObjectList(field, values); } public void setObject(String field, Object value) { setString(field, value.toString()); } public LanguageEnum getLang() { return lang; } public void setLang(LanguageEnum lang) { this.lang = lang; } final public FieldContent getField(final String fieldName) { return fields.get(fieldName); } final public boolean hasContent(final String fieldName) { FieldContent fc = getField(fieldName); if (fc == null) return false; return fc.hasContent(); } public FieldValueItem getFieldValue(String fieldName, int pos) { if (fields == null) return null; FieldContent fc = fields.get(fieldName); if (fc == null) return null; return fc.getValue(pos); } public String getFieldValueString(String fieldName, int pos) { FieldValueItem fvi = getFieldValue(fieldName, pos); if (fvi == null) return null; return fvi.getValue(); } @Override public Iterator<FieldContent> iterator() { return fields.values().iterator(); } final public FieldContent[] getFieldContentArray() { return fields.values().toArray(new FieldContent[fields.size()]); } @Override public String toString() { StringBuilder result = new StringBuilder(); if (fields != null) { for (String key : fields.keySet()) { FieldContent value = (FieldContent) fields.get(key); result.append(value.toString()).append("\n"); } } return result.toString(); } /** * Populate the copyOf field with a reference to the fieldcontent of the * source field * * @param schema */ public void prepareCopyOf(Schema schema) { for (SchemaField schemaField : schema.getFieldList()) { String fname = schemaField.getName(); List<String> copyOfList = schemaField.getCopyOf(); if (copyOfList == null) continue; for (String copyOf : copyOfList) { if (copyOf == null || copyOf.length() == 0) continue; FieldContent fieldContent = fields.get(copyOf); if (fieldContent != null) add(fname, fieldContent); } } } }