Extract Wikipedia Article Introduction

Last modified by Vincent Massol on 2021/03/18 11:28

Q&D (quick and dirty) Wikipedia article introduction extractor
TypeSnippet
Category
Developed by

Jerome

Rating
0 Votes
LicenseGNU Lesser General Public License 2.1

Table of contents

Description

Quick and dirty snippet to extract the introduction from a Wikipedia article. It can easily be adapted to any other MediaWiki-based wiki.

{{groovy}}
import org.xwiki.rendering.renderer.printer.DefaultWikiPrinter
import org.xwiki.rendering.syntax.Syntax
import org.xwiki.rendering.parser.Parser
import org.xwiki.rendering.listener.reference.ResourceType
import org.xwiki.rendering.block.WordBlock

// Title of the article to extract, written the way it appears in the article URL
// (it is appended as-is to the query string). Replace it with the article you want.
def articleTitle = "Louis,_Duke_of_Brittany_(1707%E2%80%931712)"

def componentManager = com.xpn.xwiki.web.Utils.getComponentManager()

// The MediaWiki source is parsed into an XDOM, which is then rendered as XHTML into the printer.
def parser = componentManager.lookup(Parser.class, Syntax.MEDIAWIKI_1_0.toIdString());
def renderer = componentManager.lookup(org.xwiki.rendering.renderer.BlockRenderer.class, Syntax.XHTML_1_0.toIdString());
def printer = new DefaultWikiPrinter();

// Fetch the XML export of the article. HTTPS is used directly so that the request
// does not depend on the HTTP client following an http -> https redirect.
def content = xwiki.getURLContent("https://en.wikipedia.org/w/api.php?action=query&export&exportnowrap&titles="  + articleTitle)

try {
  def article = new XmlParser().parseText(content)
  def text = article.page.revision.text.text()

  // Temporarily remove all <ref> elements from the MediaWiki source
  // until the wikimodel MediaWiki parser fixes http://code.google.com/p/wikimodel/issues/detail?id=168
  // Handles self-closing refs (<ref name="x" />), refs with attributes and refs spanning
  // several lines. The \b keeps <references/> untouched.
  text = text.replaceAll('(?is)<ref\\b[^>]*/>|<ref\\b[^>]*>.*?</ref>', '')

  // Temporarily remove all macros (templates) from the MediaWiki source
  // until all complex macros are interpreted as such by the parser.
  // See http://code.google.com/p/wikimodel/issues/detail?id=205 for example.
  // Innermost templates are removed first and the pass is repeated until nothing changes,
  // so nested and multi-line templates (e.g. infoboxes) disappear completely.
  def innermostTemplate = '\\{\\{[^{}]*\\}\\}'
  def stripped = text.replaceAll(innermostTemplate, '')
  while (stripped != text) {
    text = stripped
    stripped = text.replaceAll(innermostTemplate, '')
  }
  // Fallback: single-line templates containing a stray brace are not matched above.
  text = text.replaceAll("\\{\\{(.*?)\\}\\}", "")

  def xdom = parser.parse(new StringReader(text));

  // Keep only what precedes the first heading: remove that heading and everything after it.
  // NOTE(review): assumes the non-recursive lookup returns direct children of the XDOM in
  // document order, so every later heading is a sibling already removed by this pass.
  def headings = xdom.getChildrenByType(org.xwiki.rendering.block.HeaderBlock, false)
  if (headings) {
    def firstHeading = headings[0]
    def container = firstHeading.parent
    while (firstHeading.getNextSibling() != null) {
      container.removeBlock(firstHeading.getNextSibling())
    }
    container.removeBlock(firstHeading)
  }

  // Remove all images, at any depth
  for (image in xdom.getChildrenByType(org.xwiki.rendering.block.ImageBlock, true)) {
    image.parent.removeBlock(image)
  }

  // Fix links: anything that is not already an http(s) URL is treated as a Wikipedia
  // page name and turned into an absolute URL to that page.
  for (link in xdom.getChildrenByType(org.xwiki.rendering.block.LinkBlock, true)) {
    def reference = link.reference
    if (!reference.reference.startsWith('http')) {
      if (!link.children) {
        // Link without a label: use the page name as the visible text
        link.addChild(new WordBlock(reference.reference))
      }
      reference.setReference("https://en.wikipedia.org/wiki/" + reference.reference)
      reference.setType(ResourceType.URL)
    }
  }

  // Render what remains in XHTML
  renderer.render(xdom, printer);

  // That's it, we have our intro
  def intro = printer.toString()

  println """
{{html}}
${intro}
{{/html}}
"""

}
catch (Exception e) {
  // Script boundary: report any failure to the page reader instead of breaking the page
  println """
{{error}}
Something happened : ${e.message}
{{/error}}
"""

}
{{/groovy}}

Get Connected