/**
* OpenKM, Open Document Management System (http://www.openkm.com)
* Copyright (c) 2006-2011 Paco Avila & Josep Llort
*
* No bytes were intentionally harmed during the development of this application.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
package com.openkm.util;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import javax.swing.text.AttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;
import org.apache.jackrabbit.util.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.openkm.api.OKMDocument;
import com.openkm.api.OKMFolder;
import com.openkm.api.OKMRepository;
import com.openkm.bean.Document;
import com.openkm.bean.Folder;
/**
 * Fills the repository with documents found through a web search.
 *
 * For every requested file type a search page is fetched, the links that end
 * in that file extension are downloaded, and each one is stored under a folder
 * hierarchy derived from the link (reversed host name parts followed by the
 * URL path). One HTML table row per attempted download is written to a
 * caller-supplied {@link Writer}.
 */
public class Populate {
    private static final Logger log = LoggerFactory.getLogger(Populate.class);

    /** File extensions offered as the default set of document types to search for. */
    public static final List<String> DEFAULT_TYPES = Collections.unmodifiableList(
            Arrays.asList("pdf", "rtf", "doc", "ppt", "xls"));

    /**
     * Searches the web for documents and imports them into the repository.
     *
     * The requested amount is spread evenly over the file types: after the
     * type at index {@code i} has been processed, at most
     * {@code numDocs * (i + 1) / fileTypes.size()} documents have been
     * imported in total. A type is abandoned as soon as a search page yields
     * no usable link. Links whose target node already exists are skipped
     * silently; failed downloads are reported as an "Error" row and do not
     * count towards the quota.
     *
     * @param token session token. NOTE(review): it is currently not used; every
     *        repository call in this class is made with a {@code null} token.
     *        Confirm whether the token should be forwarded instead.
     * @param seedWord search term sent to the search engine.
     * @param numDocs total number of documents to import.
     * @param fileTypes file extensions (without the dot) to search for.
     * @param out receives one {@code <tr>} element per attempted download.
     * @throws Exception if the search, the repository or the writer fails, or
     *         if the calling thread is interrupted while a download is running.
     */
    public static void massiveImport(final String token, final String seedWord, final int numDocs,
            final List<String> fileTypes, Writer out) throws Exception {
        final OKMRepository okmRepository = OKMRepository.getInstance();
        final OKMDocument okmDocument = OKMDocument.getInstance();
        final OKMFolder okmFolder = OKMFolder.getInstance();
        int n = 0;

        for (int typeIdx = 0; typeIdx < fileTypes.size(); typeIdx++) {
            final String type = fileTypes.get(typeIdx);
            // Cumulative target: total number of documents that should exist
            // once this file type has been handled.
            final int quota = numDocs * (typeIdx + 1) / fileTypes.size();
            int offset = 0;

            while (n < quota) {
                final URL[] urls = new Populate.Search(type, seedWord, offset).getURLs();

                if (urls.length == 0) {
                    break;
                }

                for (int i = 0; i < urls.length && n < quota; i++) {
                    if (importUrl(urls[i], n, out, okmRepository, okmFolder, okmDocument)) {
                        n++;
                    }
                }

                // Advance to the next page of search results.
                offset += 10;
            }
        }
    }

    /**
     * Downloads a single URL into the repository and reports the outcome as
     * one table row.
     *
     * @return {@code true} if a new document was created; {@code false} if the
     *         target already existed, the URL could not be decoded, or the
     *         download failed.
     */
    private static boolean importUrl(final URL url, final int rowNumber, final Writer out,
            final OKMRepository okmRepository, final OKMFolder okmFolder, final OKMDocument okmDocument)
            throws Exception {
        List<String> names;

        try {
            names = decodedSegments(url);
        } catch (IllegalArgumentException e) {
            // URLDecoder rejects illegal '%' escapes; one bad link must not abort the batch.
            log.warn("Skipping URL that cannot be decoded: " + url, e);
            return false;
        }

        // The last segment is the document name, everything before it is a folder.
        final String fileName = names.remove(names.size() - 1);
        String path = okmRepository.getRootFolder(null).getPath();

        for (String name : names) {
            if (name.length() == 0) {
                continue;
            }

            path = path + "/" + name;

            if (!okmRepository.hasNode(null, path)) {
                Folder fld = new Folder();
                fld.setPath(path);
                okmFolder.create(null, fld);
            }
        }

        final String docPath = path + "/" + fileName;

        if (okmRepository.hasNode(null, docPath)) {
            return false;
        }

        // Open the row before downloading so that a failure at any stage still
        // results in a complete, well-formed <tr> element.
        out.write("<tr class=\"" + (rowNumber % 2 == 0 ? "odd" : "even") + "\">");
        out.write("<td>" + rowNumber + "</td>");
        out.write("<td>" + Text.encodeIllegalXMLCharacters(url.toString()) + "</td>");
        out.flush();

        final Exception[] failure = new Exception[1];
        Thread worker = new Thread(new Runnable() {
            public void run() {
                try {
                    InputStream in = url.openConnection().getInputStream();

                    try {
                        log.info("Create document: {}", docPath);
                        Document doc = new Document();
                        doc.setPath(docPath);
                        okmDocument.create(null, doc, in);
                    } finally {
                        in.close();
                    }
                } catch (Exception e) {
                    failure[0] = e;
                }
            }
        });
        worker.start();
        // join() also makes the worker's write to failure[0] visible here.
        worker.join();

        if (failure[0] != null) {
            log.warn("Unable to import " + url, failure[0]);
            out.write("<td>Error</td></tr>");
            out.flush();
            return false;
        }

        out.write("<td>Ok</td></tr>");
        out.flush();
        return true;
    }

    /**
     * Turns a URL into repository node names: the host name parts in reverse
     * order followed by the path segments, each URL-decoded as UTF-8 and with
     * every ':' replaced by '_'.
     *
     * @throws IllegalArgumentException if a segment contains an illegal escape sequence.
     */
    private static List<String> decodedSegments(URL url) throws UnsupportedEncodingException {
        String urlPath = url.getPath();

        if (urlPath.startsWith("/")) {
            urlPath = urlPath.substring(1);
        }

        List<String> raw = new ArrayList<String>(Arrays.asList(url.getHost().split("\\.")));
        Collections.reverse(raw);
        raw.addAll(Arrays.asList(urlPath.split("/", 0)));

        List<String> decoded = new ArrayList<String>(raw.size());

        for (String segment : raw) {
            decoded.add(URLDecoder.decode(segment, "UTF-8").replace(':', '_'));
        }

        return decoded;
    }

    /**
     * A single page of search results for a term restricted to one file type.
     */
    static class Search {
        private final String filetype;
        private final String term;
        private final int start;

        /**
         * @param filetype file extension (without the dot) the links must end in.
         * @param term search term.
         * @param start value of the {@code start} request parameter, i.e. the result offset.
         */
        public Search(String filetype, String term, int start) {
            this.filetype = filetype;
            this.term = term;
            this.start = start;
        }

        /**
         * Fetches the result page and extracts the document links from it.
         *
         * Only anchors whose {@code href} ends in {@code "." + filetype} and
         * whose host does not contain "google" are returned. Links that cannot
         * be parsed as a URL are ignored.
         *
         * @return the matching links in document order; never {@code null}.
         * @throws Exception if the page cannot be fetched or parsed.
         */
        public URL[] getURLs() throws Exception {
            List<URL> urls = new ArrayList<URL>();
            String query = term + " filetype:" + filetype;
            URL google = new URL("http://www.google.com/search?q=" + URLEncoder.encode(query, "UTF-8")
                    + "&start=" + start);
            URLConnection con = google.openConnection();
            con.setRequestProperty("User-Agent", "");
            InputStream in = con.getInputStream();

            try {
                HTMLEditorKit kit = new HTMLEditorKit();
                HTMLDocument doc = new HTMLDocument();
                doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
                kit.read(new InputStreamReader(in, "UTF-8"), doc, 0);

                // Relative links are resolved against the search host and are
                // then dropped by the host filter below.
                URL base = new URL("http", "www.google.com", "dummy");
                String suffix = "." + filetype;

                for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
                    AttributeSet attr = it.getAttributes();

                    if (attr == null) {
                        continue;
                    }

                    Object href = attr.getAttribute(HTML.Attribute.HREF);

                    if (!(href instanceof String) || !((String) href).endsWith(suffix)) {
                        continue;
                    }

                    try {
                        URL url = new URL(base, (String) href);

                        if (!url.getHost().contains("google")) {
                            urls.add(url);
                        }
                    } catch (MalformedURLException e) {
                        // A single unparsable link must not discard the whole result page.
                        log.debug("Ignoring unparsable link: {}", href);
                    }
                }
            } finally {
                in.close();
            }

            return urls.toArray(new URL[urls.size()]);
        }
    }
}