xml: change from amara to lxml
author Fredrik Unger <fred@tree.se>
Fri, 25 Jan 2013 10:32:59 +0000 (11:32 +0100)
committer Fredrik Unger <fred@tree.se>
Fri, 25 Jan 2013 10:32:59 +0000 (11:32 +0100)
Changed XML processing from amara to lxml, mainly because lxml is
more actively maintained and available in distributions.
Some HTML generation was also changed to lxml, where the namespace could
cause some problems (language menu).

treecutter/constants.py
treecutter/directory.py
treecutter/page.py
treecutter/sitemap.py

index a3b4790370c8914c67e105a04a69d1cfaec95f17..7ba6626a1d9f095fb357b24d755fc3d955af8f6c 100644 (file)
@@ -15,5 +15,10 @@ XLINK = "{%s}" % XLINK_NS
 HTML_NS="http://www.w3.org/1999/xhtml"
 HTML = "{%s}" % HTML_NS
 NSMAP = {None : DB_NS,
-         'xlink' : XLINK_NS}
-
+         'xi' : XI_NS,
+         'xlink' : XLINK_NS,
+         'html' : HTML_NS}
+XPATH = {'db' : DB_NS,
+         'xi' : XI_NS,
+         'xlink' : XLINK_NS,
+         'html' : HTML_NS}
index dc60c379718e4a016c5fecc55eeb75610c5def31..19c4d8b4182cf4d4764d9ec77442a0473fd76839 100644 (file)
@@ -1,7 +1,7 @@
 #!/usr/bin/python
 import os
 import fnmatch
-from amara import bindery
+from lxml import etree
 import treecutter.constants as const
 
 class Directory():
@@ -15,9 +15,9 @@ class Directory():
             for filename in filenames:
                 if fnmatch.fnmatch(filename, '*.xml'):
                     file_ = os.path.join(dirname,filename)
-                    doc = bindery.parse(file_, prefixes=const.PREFIXES)
-                    title = doc.xml_select(u'/db:article/db:info/db:title')
-                    menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
+                    doc = etree.parse(file_)
+                    title = doc.xpath(u'/db:article/db:info/db:title',namespaces=const.XPATH)
+                    menu  = doc.xpath(u'/db:article/db:info/db:titleabbrev',namespaces=const.XPATH)
                     if title and menu:
                         base = file_.split('.')[1]
                         link = base.replace('index','')
index bf14a0043bbd4c5fbafe877a6f10e9b229a36b15..22d380ec033562358c0d01562af3d4544f5b87e7 100644 (file)
@@ -3,8 +3,7 @@ import os
 import subprocess
 import tempfile
 import re
-from amara import bindery
-from amara.xslt import transform
+from lxml import etree
 from Cheetah.Template import Template
 from pkg_resources import resource_filename, resource_listdir
 from time import time
@@ -36,72 +35,82 @@ class Page():
         self._rendered_article = art
 
     def prepare(self):
-        self._doc = bindery.parse(self._file, prefixes=const.PREFIXES)
-        if self._doc.xml_select(u'/db:article/db:info/db:title'):
-            self._title = unicode(self._doc.article.info.title)
-        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
-            self._menu = unicode(self._doc.article.info.titleabbrev)
+        self._doc = etree.parse(self._file)
+        t = self._doc.xpath(u'/db:article/db:info/db:title',namespaces=const.XPATH)
+        if t:
+            self._title = unicode(t[0].text)
+        ta = self._doc.xpath(u'/db:article/db:info/db:titleabbrev',namespaces=const.XPATH)
+        if ta:
+            self._menu = unicode(ta[0].text)
 
         dirname = os.path.dirname(self._file)
         cwd = os.getcwd()
-        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
+        code  = self._doc.xpath(u"//xi:include[@parse='text']",namespaces=const.XPATH)
         if code:
             for c in code:
-                (p, ext) = os.path.splitext(c.href)
+                href = c.get('href')
+                alang = c.get('accept-language')
+                xpointer = c.get('xpointer')
+                (p, ext) = os.path.splitext(href)
                 if ext in const.valid_scripts:
                     exe = []
-                    script = os.path.join(os.path.abspath(dirname)+'/'+c.href)
+                    script = os.path.join(os.path.abspath(dirname)+'/'+href)
                     if os.path.isfile(script):
                         exe.append(script)
                     else:
-                        if c.href in resource_listdir('xinclude', ''):
-                            script = resource_filename('xinclude', c.href)
+                        if href in resource_listdir('xinclude', ''):
+                            script = resource_filename('xinclude', href)
                             exe.append(script)
                         else:
-                            print "Script "+c.href+" in "+self._file+" missing"
-                    if c.xml_select(u"//xi:include[@accept-language]"):
-                        alang = c.xml_attributes[None, "accept-language"]
+                            print "Script "+href+" in "+self._file+" missing"
+                    if alang:
                         exe.append("lang="+alang)
-                    if c.xml_select(u"//xi:include[@xpointer]"):
-                        exe.append("xptr="+c.xpointer)
-                    print "  executing %15s" % (c.href),
+                    if xpointer:
+                        exe.append("xptr="+xpointer)
+                    print "  executing %15s" % (href),
                     ts = time()
                     os.chdir(dirname)
-                    xml = subprocess.Popen(exe,stdout=subprocess.PIPE)
+                    xml = subprocess.Popen(exe,stdout=subprocess.PIPE,
+                                           stderr=subprocess.PIPE)
+                    (stdout, stderr) = xml.communicate()
+                    if stderr:
+                        print " ".join(exe)+" ERROR : [ "+stderr+" ]"
                     os.chdir(cwd)
-                    xmlblock = str(xml.stdout.read())
                     te = time()
-                    print " [%5.2f s]  (%s)" % (round(te-ts,2),c.xpointer)
-                    xstr = bindery.parse(xmlblock)
-                    idp = c.xml_index_on_parent
-                    for x in xstr.xml_children:
-                        c.xml_parent.xml_insert(idp,x)
-                        c.xml_parent.xml_remove(c)
+                    print " [%5.2f s]  (%s)" % (round(te-ts,2),xpointer)
+                    xstr = etree.fromstring(stdout)
+# inserting the generated code and remove the xinclude reference
+                    idp = c.getparent()
+                    idp.insert(idp.index(c)+1,xstr)
+                    idp.remove(c)
 
-        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
-            rf = os.path.join(dirname,r.href)
+        for r in self._doc.xpath(u"//db:link[@xlink:href]",namespaces=const.XPATH):
+            rf = os.path.join(dirname,r.get(const.XLINK+'href'))
             if os.path.isfile(rf):
                 self._resources.append(rf)
-        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
-            im = os.path.join(dirname,i.fileref)
+        for i in self._doc.xpath(u"//db:imagedata[@fileref]",namespaces=const.XPATH):
+            im = os.path.join(dirname,i.get('fileref'))
             if os.path.isfile(im):
                 self._resources.append(im)
-        for i in self._doc.xml_select(u"//html:form[@action]"):
-            pyscript = re.split('\.py',i.action,1)[0]+'.py'
+        for i in self._doc.xpath(u"//html:form[@action]",namespaces=const.XPATH):
+            pyscript = re.split('\.py',i.get('action'),1)[0]+'.py'
             im = os.path.join(dirname,pyscript)
             if os.path.isfile(im):
                 self._resources.append(im)
 
     def render(self, style):
-        #  amara can not handle the docbook stylesheets
-        #  xmlarticle = transform(doc,style_xslt)
+
+#        xslt_root = etree.XML(open(style+"docbook.xsl", 'r').read())
+#        transform = etree.XSLT(xslt_root)
+#        result = etree.tostring(transform(xml_root))
+
         cwd = os.getcwd()
         dirname = os.path.dirname(self._file)
         os.chdir(dirname)
         infile  = os.path.basename(tempfile.mktemp())
         outfile = tempfile.mktemp()
         tfi = open(infile,'w')
-        tfi.write(self._doc.xml_encode(omit_xml_declaration=True))
+        tfi.write(etree.tostring(self._doc,encoding='UTF-8',pretty_print=False))
         tfi.close()
 #  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
         cmd = ["xsltproc","--xinclude","--output",outfile,style+"docbook.xsl",infile]
index 688cb0cdaad079970c8e7657cf38efab1e5893f0..a8612f23b9ba9b1c1955bd370af771b59a008b9f 100644 (file)
@@ -4,8 +4,10 @@ import re
 import shutil
 import gettext
 import tempfile
-from amara import bindery
+from lxml import etree
+from lxml.builder import ElementMaker
 from time import time
+from treecutter import constants as const
 from treecutter.trie import Trie
 from treecutter.link import Link
 from treecutter.tools import ssh_cmd, publish, mkdir_p
@@ -16,7 +18,7 @@ class Sitemap():
         self._file = 'sitemap.txt'
         self._tree = Trie()
         self._sitelang = set()
-        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
+        self._isocode = etree.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
         self._tranlang = {}
         self._tmptarget = tempfile.mkdtemp()+'/'
 
@@ -101,19 +103,20 @@ class Sitemap():
         return self._tree.menu(lang,page,cssclass)
 
     def lang_menu(self,lang,link):
-        html = "<ul>"
+        html = ElementMaker(namespace=const.HTML_NS)
+        menu = html.ul()
         for l in link.languages():
             isoxml = u"//iso_639_3_entry[@*='"+l+"']"
-            ln = self._isocode.xml_select(isoxml)[0].name
+            ln = self._isocode.xpath(isoxml)[0].get('name')
             if lang != 'en':
                 ln = self._tranlang[lang].gettext(ln)
             p = link.link()
             if p[-1] == '/':
                 p = p +'index'
             p = p+'.'+l
-            html += '<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln)
-        html += "</ul>"
-        return html
+            li = html.li(html.a(ln,href=p,hreflang=l))
+            menu.append(li)
+        return etree.tostring(menu,encoding='UTF-8',pretty_print=False)
 
     def publish(self,output,style):
         ssh_cmd(output,"mkdir -p")