Large Wiki Export

Last modified by Denis Gervalle on 2025/02/12 12:25

Exports very big wikis to disk
Type: Snippet
Category
Developed by

Thomas Mortagne

Rating
0 Votes
License: GNU Lesser General Public License 2.1

Description

Information

Note that this snippet is no longer required starting with XWiki 5.2, which introduces a WikiStream-based standard export that is able to export wikis of any size.

Install

Edit a page in wiki mode, paste the snippet below as its content, make sure the syntax of the page is xwiki/1.0 and save it. The snippet requires programming rights to run.
Path examples (more details on the File class documentation):

  • UNIX platforms: /Users/myuser/Downloads/export_folder/
  • Windows: C:\\Users\\MyUser\\Downloads\\ExportFolder\\

Filter example: where doc.space like '%Users%'

Warning

For security, this page should be saved in an Admin protected space.

Snippet

1.1 Export very large Wiki to disk

This wiki export application supports:
- export in a folder (not packaged as XAR)
- wiki with pretty much infinite number of documents
- documents with a very large number of attachments
- old XWiki/Groovy (tested with 1.4 and above) version that:
-- does not handle Java method override properly (things like write(Element element) and write(Node node) where Element extends Node)
-- does not support *space* APIs (getSpace(), setSpace(), etc.) (it's generally *web* in old APIs)
- attachment versions are not exported: this is way too buggy in old versions (and certainly in new ones too)
- make sure to write the XML file with the proper encoding
- a workaround for a MySQL bug I ran into involving a very large number of documents and the "distinct" keyword
- filtering documents to export
#warning("Export details and progress can be found in the log during the export (since it can be long enough to make the browser timeout)")

<%

import com.xpn.xwiki.*;
import com.xpn.xwiki.api.*;
import com.xpn.xwiki.doc.*;
import com.xpn.xwiki.plugin.packaging.*;
import com.xpn.xwiki.objects.classes.*;
import com.xpn.xwiki.objects.*;
import java.util.*;
import java.util.zip.*;
import java.file.*;
import java.io.*;
import com.xpn.xwiki.util.Util;
import org.dom4j.*;
import org.dom4j.io.*;
import org.dom4j.dom.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import org.apache.commons.codec.binary.Base64;

// Accumulates one wiki list item ("* name") per element that could not be exported.
// It is appended to by export() and exportWiki() and displayed by the script at the end.
// It is assigned without "def" on purpose: an undeclared script variable lives in the
// script binding, which is what lets the script methods below read and update it.
error = "";
// dom4j XMLWriter extension used by the export:
// - exposes writeElement() publicly so that the script can serialize whole dom4j elements
// - can stream the content of an element from a Reader/InputStream
// - remembers the elements opened with writeOpen() so that they can be closed in cascade
// dom4j types are fully qualified because the file also star-imports com.xpn.xwiki.api.*,
// which declares its own Element and Document classes.
class GroovyXMLWriter extends org.dom4j.io.XMLWriter
{
  // Elements opened with writeOpen() and not closed yet (innermost element on top)
  protected Stack<org.dom4j.Element> parent = new Stack<org.dom4j.Element>();

  // Raw stream behind the character writer, used to copy binary content as is
  private OutputStream out;

  // out: the stream to write the XML to
  // format: the output format (encoding, indentation, new lines) to apply
  GroovyXMLWriter(OutputStream out, OutputFormat format)
  {
    // Let the parent register the format. The previous "format = format" only assigned the
    // parameter to itself, so the requested encoding and new lines were silently ignored.
    super(out, format);
    // Needed by write(Element, InputStream); it was never initialized before.
    this.out = out;
  }

  // Public bridge to the protected XMLWriter#writeElement()
  public void writeElement(org.dom4j.Element element) throws IOException
  {
    super.writeElement(element)
  }

  // Writes a complete document: XML declaration, optional doctype, then every node
  public void writeDocument(org.dom4j.Document doc) throws IOException
  {
    writeDeclaration();

    if (doc.getDocType() != null) {
        indent();
        writeDocType(doc.getDocType());
    }

    int size = doc.nodeCount();
    int i = 0;
    while (i < size) {
        Node node = doc.node(i);
        writeNode(node);
        i++;
    }

    writePrintln();
    flush();
  }

  // Writes only the prolog (XML declaration and optional doctype) of the document
  public void writeDocumentStart(org.dom4j.Document doc) throws IOException
  {
      writeDeclaration();

      if (doc.getDocType() != null) {
          indent();
          writeDocType(doc.getDocType());
      }
  }

  // Closes every element still opened through writeOpen() and flushes the writer
  public void writeDocumentEnd(org.dom4j.Document doc) throws IOException
  {
      if (!this.parent.isEmpty()) {
          // Closing the bottom element of the stack closes everything above it too
          writeClose(this.parent.firstElement());
      }
      writePrintln();
      flush();
  }

  // Writes the element with the characters read from rd as content.
  // The characters are copied as is: they are NOT XML-escaped.
  public void write(org.dom4j.Element element, Reader rd) throws IOException
  {
      writeOpen(element);
      char[] buffer = new char[4096];
      int count = rd.read(buffer);
      while (count != -1) {
          this.writer.write(buffer, 0, count);
          count = rd.read(buffer);
      }
      writeClose(element);
  }

  // Writes the element with the bytes read from is as content.
  // The bytes are copied as is to the underlying stream: no encoding or escaping is applied.
  public void write(org.dom4j.Element element, InputStream is) throws IOException
  {
      writeOpen(element);
      // Push the pending characters first so that the raw bytes end up after the start tag
      flush();
      byte[] buffer = new byte[4096];
      int count = is.read(buffer);
      while (count != -1) {
          this.out.write(buffer, 0, count);
          count = is.read(buffer);
      }
      writeClose(element);
  }

  // Writes the start tag of the element and remembers it as opened
  public void writeOpen(org.dom4j.Element element) throws IOException
  {
      super.writeOpen(element);
      this.parent.push(element);
  }

  // Writes the end tag of the element, closing first every element opened after it.
  // Throws java.util.EmptyStackException when the element was not opened with writeOpen().
  public void writeClose(org.dom4j.Element element) throws IOException
  {
      while (this.parent.peek() != element) {
          super.writeClose(this.parent.pop());
      }
      super.writeClose(this.parent.pop());
  }
}

// Builds the name of the file a document is exported to:
// "<page name>.xml" or "<page name>.<language>.xml" when the document has a language.
def String getFileNameFromDocument(XWikiDocument doc, XWikiContext context)
{
        StringBuilder result = new StringBuilder(doc.getName());

        // Only translations carry a language suffix
        String lang = doc.getLanguage();
        boolean hasLanguage = (lang != null) && (lang.length() > 0);
        if (hasLanguage) {
            result.append(".").append(lang);
        }

        result.append(".xml");

        return result.toString();
}

// Serializes the package descriptor (a dom4j document) to a String.
// doc: the dom4j document to serialize
// context: used to get the encoding of the wiki
// Returns the XML, or an empty string when the serialization failed (the stack trace is printed).
def String getPackageXML(doc,context)
{
        String encoding = context.getWiki().getEncoding();
        OutputFormat outputFormat = new OutputFormat("", true);
        outputFormat.setEncoding(encoding);
        ByteArrayOutputStream ous = new ByteArrayOutputStream();
        GroovyXMLWriter writer = new GroovyXMLWriter(ous, outputFormat);
        try {
            writer.writeDocument(doc);
            // Decode with the encoding configured on the writer instead of the platform default
            // charset. An unsupported encoding name raises UnsupportedEncodingException,
            // which is an IOException and is handled below.
            return ous.toString(encoding);
        } catch (IOException e) {
            e.printStackTrace();

            return "";
        }
}

// Serializes the package descriptor (a dom4j document) to the given stream, using the
// encoding of the wiki. An I/O failure is not propagated: only its stack trace is printed.
def void writePackageXML(ous, doc, context)
{
        String wikiEncoding = context.getWiki().getEncoding();

        OutputFormat packageFormat = new OutputFormat("", true);
        packageFormat.setEncoding(wikiEncoding);

        GroovyXMLWriter packageWriter = new GroovyXMLWriter(ous, packageFormat);
        try {
            packageWriter.writeDocument(doc);
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
}

// Prepares the main DOM document for package.xml: sets a <package> root element on xmldoc
// and fills its <infos> child. Returns the <package> root element.
def org.dom4j.Element getXMLDocumentElement(xmldoc, packname, context)
{
        org.dom4j.Element root = new DOMElement("package");
        xmldoc.setRootElement(root);
        org.dom4j.Element infos = (root as org.dom4j.Element).addElement("infos");

        // Children of <infos>, in the order they are written: [element name, text]
        def infoFields = [
            ["name", packname],
            ["description", ""],
            ["licence", ""],
            ["author", ""],
            ["version", ""],
            ["backupPack", Boolean.TRUE.toString()],
            ["preserveVersion", Boolean.TRUE.toString()]
        ];

        for (field in infoFields) {
            org.dom4j.Element child = (infos as org.dom4j.Element).addElement(field[0]);
            child.addText(field[1]);
        }

        return root;
}

// Registers one exported document in package.xml by adding a
// <file language="..." defaultAction="0">full name</file> entry to elfiles.
def addDocumentToXML(xdoc, elfiles)
{
     org.dom4j.Element entry = (elfiles as org.dom4j.Element).addElement("file");

     String entryLanguage = String.valueOf(xdoc.getLanguage());
     entry.addAttribute("language", entryLanguage);
     entry.addAttribute("defaultAction", "0");

     entry.addText(xdoc.getFullName());
}

// Streams the XML representation (<xwikidoc>) of a document to the given stream.
// xdoc: the document to serialize
// os: the stream to write to (flushed but not closed here)
// withContentVersions: also write the document history in a <versions> element
// withAttachements: also write the Base64 content of each attachment (the attachment
//                   metadata is always written)
// withAttachementsVersions: not used, attachment versions are never exported
// context: the XWiki context
def toXML(XWikiDocument xdoc, OutputStream os, boolean withContentVersions, boolean withAttachements, boolean withAttachementsVersions, XWikiContext context)
{
        OutputFormat outputFormat = new OutputFormat("", true);
        outputFormat.setEncoding(context.getWiki().getEncoding());
        GroovyXMLWriter xwriter = new GroovyXMLWriter(os, outputFormat);

        Attributes attributes = new AttributesImpl();

        // Writes <name>value</name>. A null value gives an empty element instead of a failing
        // write(null) call.
        def writeField = { name, value ->
            xwriter.startElement("", name, name, attributes);
            if (value != null) {
                xwriter.write(value);
            }
            xwriter.endElement("", name, name);
        }

        xwriter.startDocument();

        xwriter.startElement("", "xwikidoc", "xwikidoc", attributes);

        // Document metadata, the order of the elements is kept as is
        writeField("web", xdoc.getWeb());
        writeField("name", xdoc.getName());
        writeField("language", xdoc.getLanguage());
        // This used to be filled with getLanguage(), which lost the real default language
        writeField("defaultLanguage", xdoc.getDefaultLanguage());
        writeField("translation", String.valueOf(xdoc.getTranslation()));
        writeField("parent", xdoc.getParent());
        writeField("creator", xdoc.getCreator());
        writeField("author", xdoc.getAuthor());
        writeField("customClass", xdoc.getCustomClass());
        writeField("contentAuthor", xdoc.getContentAuthor());
        writeField("creationDate", String.valueOf(xdoc.getCreationDate().getTime()));
        writeField("date", String.valueOf(xdoc.getDate().getTime()));
        writeField("contentUpdateDate", String.valueOf(xdoc.getContentUpdateDate().getTime()));
        writeField("version", xdoc.getVersion());
        writeField("title", xdoc.getTitle());
        writeField("template", xdoc.getTemplate());
        writeField("defaultTemplate", xdoc.getDefaultTemplate());
        writeField("validationScript", xdoc.getValidationScript());
        writeField("comment", xdoc.getComment());
        writeField("minorEdit", String.valueOf(xdoc.isMinorEdit()));

        // Fall back to xwiki/1.0 when the document API has no getSyntaxId() method
        def syntaxId;
        try {
          syntaxId = xdoc.getSyntaxId();
        } catch (groovy.lang.MissingMethodException e) {
          syntaxId = "xwiki/1.0";
        }
        writeField("syntaxId", syntaxId);

        for (XWikiAttachment attach in xdoc.getAttachmentList()) {
            toAttachmentXML(attach, xwriter, withAttachements, context);
        }

        // Add Class
        BaseClass bclass = xdoc.getXClass();
        if (bclass.getFieldList().size() > 0) {
            // If the class has fields, add class definition and field information to XML
            xwriter.writeElement(bclass.toXML(null));
        }

        // Add Objects (THEIR ORDER IS MOLDED IN STONE!)
        for (List objects in xdoc.getXObjects().values()) {
            for (BaseObject obj in objects) {
                if (obj != null) {
                    BaseClass objclass = null;
                    // An object whose class is the document itself reuses the class loaded above
                    if (obj.getName().equals(obj.getClassName())) {
                        objclass = bclass;
                    } else {
                        objclass = obj.getXClass(context);
                    }
                    xwriter.writeElement(obj.toXML(objclass));
                }
            }
        }

        // Add Content
        writeField("content", xdoc.getContent());

        if (withContentVersions) {
            try {
               // Loaded before the element is opened so that a broken archive does not leave
               // a dangling <versions> tag
               String versionContent = xdoc.getDocumentArchive(context).getArchive(context);

               writeField("versions", versionContent);
            } catch (XWikiException e) {
                System.out.println("Document [" + xdoc.getFullName() + "] has malformed history");
            }
        }

        xwriter.endElement("", "xwikidoc", "xwikidoc");

        xwriter.endDocument();

        xwriter.flush();
}

// Writes the <attachment> element of one attachment: filename, filesize, author, date,
// version and comment, plus the Base64 encoded content when bWithAttachmentContent is true.
def toAttachmentXML(XWikiAttachment attachment, GroovyXMLWriter wr, boolean bWithAttachmentContent, XWikiContext context)
{
  Attributes attributes = new AttributesImpl();

  // Writes <tag>text</tag>
  def writeField = { tag, text ->
      wr.startElement("", tag, tag, attributes);
      wr.write(text);
      wr.endElement("", tag, tag);
  }

  wr.startElement("", "attachment", "attachment", attributes);

  writeField("filename", attachment.getFilename());
  writeField("filesize", String.valueOf(attachment.getFilesize()));
  writeField("author", attachment.getAuthor());
  writeField("date", String.valueOf(attachment.getDate().getTime()));
  writeField("version", attachment.getVersion());
  writeField("comment", attachment.getComment());

  if (bWithAttachmentContent) {
      wr.startElement("", "content", "content", attributes);

      // The content has to be loaded explicitly before it can be read
      attachment.loadContent(context);
      XWikiAttachmentContent loaded = attachment.getAttachment_content();
      if (loaded != null) {
          wr.write(new String(Base64.encodeBase64(loaded.getContent())));
      }

      wr.endElement("", "content", "content");
  }

  wr.endElement("", "attachment", "attachment");
}

// Exports one document as an XML file located at <dir>/<encoded space>/<encoded file name>.
// The space directory is created when needed.
// doc: the document to export
// dir: the root directory of the export
// withContentVersions, withAttachements, withAttachementsVersions: passed as is to toXML()
// context: the XWiki context
// Throws XWikiException when the space directory cannot be created.
def addToDir(XWikiDocument doc, File dir, boolean withContentVersions, boolean withAttachements, boolean withAttachementsVersions, XWikiContext context) throws XWikiException
{
  try {
    // Declared locally: assigning it without a declaration stored it in the script binding
    String spaceName = doc.getWeb();
    // "." and "*" are left untouched by URLEncoder, so they are escaped by hand
    File spacedir = new File(dir, java.net.URLEncoder.encode(spaceName).replace('.', '%2E').replace('*', '%2A'));
    if (!spacedir.exists()) {
        if (!spacedir.mkdirs()) {
            Object[] args = new Object[1];
            args[0] = doc.getWeb();
            throw new XWikiException(XWikiException.MODULE_XWIKI, XWikiException.ERROR_XWIKI_MKDIR,
                "Error creating directory {0}", null, args);
        }
    }
    String filename = getFileNameFromDocument(doc, context);
    File file = new File(spacedir, java.net.URLEncoder.encode(filename).replace('.', '%2E').replace('*', '%2A'));
    FileOutputStream fos = new FileOutputStream(file);
    try {
      toXML(doc, fos, withContentVersions, withAttachements, withAttachementsVersions, context);

      fos.flush();
    } finally {
      // Always release the file handle: a very large export must not leak one descriptor
      // for each document that fails to serialize
      fos.close();
    }
  } catch (ExcludeDocumentException e) {
    System.out.println("Skip the document " + doc.getFullName());
  }
}

// Exports one batch of documents, with their translations, to the export directory and
// registers each exported file in package.xml (elfiles).
// A failure on one document is logged, recorded in "error", and does not stop the batch.
// offset and total are only used to report the progress in the log.
def export(List documentNames, int offset, int total, String packageName, String filename, File dir, org.dom4j.Element elfiles, boolean withContentVersions, boolean withAttachements, boolean withAttachementsVersions, XWikiContext context)
{
    XWiki wiki = context.getWiki();

    int position = offset;
    for (docName in documentNames) {
      position++;
      try {
        XWikiDocument doc = wiki.getDocument(docName, context);

        String fname = getFileNameFromDocument(doc, context);
        int nbVersions = doc.getRevisions(context).length;
        int nbAttachments = doc.getAttachmentList().size();
        System.out.println("Export ${position}/${total}: " + doc.fullName + "-" + doc.getLanguage() + "-" + doc.getDefaultLanguage() + "-" + fname + " - " + doc.getVersion() + " - ${nbVersions} document archives - ${nbAttachments} attachments");

        addToDir(doc, dir, withContentVersions, withAttachements, withAttachementsVersions, context);
        addDocumentToXML(doc, elfiles);

        // Then every translation which is not the default language
        List translations = doc.getTranslationList(context);
        for (language in translations) {
          boolean isDefaultLanguage = (language == null) || language.equals("") || language.equals(doc.getDefaultLanguage());
          if (isDefaultLanguage) {
            continue;
          }

          def tdoc = doc.getTranslatedDocument(language, context);
          fname = getFileNameFromDocument(tdoc, context);
          if (doc == tdoc) {
            System.out.println("Export error same doc: " + doc.fullName + "-" + language);
          }
          System.out.println("Export: " + doc.fullName + "-" + tdoc.getLanguage() + "-" + tdoc.getDefaultLanguage() + "-" + fname);
          addToDir(tdoc, dir, withContentVersions, withAttachements, withAttachementsVersions, context);
          addDocumentToXML(tdoc, elfiles);
        }
      } catch (Throwable e) {
        System.out.println("Export: failed to export: " + docName);
        e.printStackTrace();
        error += "* ${docName}\n";
      }
    }
}

// Exports the documents of the current wiki to a directory, in batches, then writes the
// package.xml descriptor listing the exported files.
// packageName: the name stored in package.xml
// filename: the path of the directory to export to (created when missing)
// filter: an HQL fragment appended after "from XWikiDocument as doc", or null for no filter.
//         It is inserted as is in the queries, so it must only come from a trusted user.
// withContentVersions, withAttachements, withAttachementsVersions: passed as is to export()
// context: the XWiki context
// Returns "Export done". Failures are logged and recorded in "error".
def exportWiki(String packageName, String filename, String filter, boolean withContentVersions, boolean withAttachements, boolean withAttachementsVersions, XWikiContext context)
{
    // preparing package xml file
    def xmldoc = new DOMDocument();
    org.dom4j.Element docel = getXMLDocumentElement(xmldoc, packageName, context);
    org.dom4j.Element elfiles = (docel as org.dom4j.Element).addElement("files");

    def dir = new File(filename);
    // Create the target directory up front: otherwise it only exists once a document has been
    // exported, and writing package.xml fails when nothing matched the filter
    if (!dir.exists() && !dir.mkdirs()) {
      System.out.println("Export: failed to create directory: " + dir);
    }

    XWiki wiki = context.getWiki();

    // Both queries share the same "from" part
    String fromClause = "from XWikiDocument as doc";
    if (filter != null) {
      fromClause = fromClause + " " + filter;
    }
    String countQuery = "select count(distinct doc.fullName) " + fromClause;
    String namesQuery = "select distinct doc.fullName " + fromClause + " order by doc.fullName";

    int total = wiki.getStore().search(countQuery, 0, 0, context).get(0)

    System.out.println("Start exporting ${total} documents");

    // The names are loaded page by page so that the whole list is never held in memory.
    // A page smaller than the batch size means that the last page has been reached.
    int batchSize = 100;
    List documentNames = null
    int offset = 0;
    while (documentNames == null || documentNames.size() == batchSize) {
      documentNames = wiki.getStore().search(namesQuery, batchSize, offset, context)

      System.out.println("Start exporting ${documentNames.size()} documents from offset ${offset}");
      export(documentNames, offset, total, packageName, filename, dir, elfiles, withContentVersions, withAttachements, withAttachementsVersions, context)

      offset += documentNames.size();
    }

    // export package.xml
    System.out.println("Create package.xml file");
    File file = new File(dir, "package.xml");
    try {
      FileOutputStream fos = new FileOutputStream(file);
      try {
        writePackageXML(fos, xmldoc, context);
        fos.flush();
      } finally {
        // Release the file handle even when the descriptor could not be written
        fos.close();
      }
    } catch (Exception e) {
      System.out.println("Export: failed to export: " + file);
      e.printStackTrace();
      error += "* ${file}\n";
    }

    return "Export done";
}

// Entry point: runs the export when the form has been confirmed, displays the form otherwise.

// A missing parameter must not be stringified to the literal "null":
// - an empty directory would make the export target the root of the file system
// - a "null" filter would be appended as is to the queries
String exportDir = (request.dirname == null) ? "" : "${request.dirname}".toString();
String exportFilter = (request.filter == null) ? null : "${request.filter}".toString();

if (request.confirm && exportDir.trim().length() > 0) {
   def wikiname = context.getDatabase();
   println("Exporting wiki ${wikiname}");
   System.out.println("Exporting wiki ${wikiname}");
   println exportWiki(wikiname, exportDir, exportFilter, request.withversion == '1', request.withattachements == '1', false, context.getContext())
   if (error != "") {
%>
     <div class="errormessage">
       <span class="messagetype">Warning: </span>
       Error found (see log for more details).
       <br/>
       Not exported elements:
       ${error}
     </div>
<%
     System.out.println("Error found! Not exported elements:");
     System.out.println(error);
   }
   // Reported whatever the outcome (it used to be printed only when errors were found)
   println("Export of wiki ${wikiname} finished");
   System.out.println("Export of wiki ${wikiname} finished");
} else {
   if (request.confirm) {
     // Confirmed without a target directory: explain why the form is displayed again
     println "No export directory given, nothing was exported.";
   }
   println "Ready to export the wiki";
%>
<form class="xform" action="" method="post">
<dl>
<dt><label>File to write to:</label></dt>
<dd><input type="text" name="dirname" size="60" title="Somewhere in the server where XWiki is running and where it has write access"/></dd>
<dt><label>Filter</label></dt>
<dd><input type="text" name="filter" size="60" title="Something to put after 'select distinct doc.fullName from XWikiDocument as doc' basically"/></dd>
</dl>
<p><input type="checkbox" name="withversion" value="1" checked="checked" /> Include content history</p>
<p><input type="checkbox" name="withattachements" value="1" checked="checked" /> Include attachements</p>
<p><input type="checkbox" name="confirm" value="1" /> Confirm</p>
<br />
<div class="buttonwrapper"><input type="submit" class="button" name="Export" /></div>
</form>
<%
  }
%>

TODO

  • export directly as xar
    • on file system
    • as a download

Get Connected