From 64995a9107fbfaa36ddebc078575ba2823e4eb4f Mon Sep 17 00:00:00 2001 From: Fredrik Unger Date: Fri, 25 Jan 2013 11:32:59 +0100 Subject: [PATCH] xml: change from amara to lxml changed xml processing from amara to lxml, mainly because lxml is more actively maintained and available in distributions. Some html generation was also changed to lxml; there the namespace could cause some problems (language menu). --- treecutter/constants.py | 9 +++-- treecutter/directory.py | 8 ++--- treecutter/page.py | 79 +++++++++++++++++++++++------------------ treecutter/sitemap.py | 17 +++++---- 4 files changed, 65 insertions(+), 48 deletions(-) diff --git a/treecutter/constants.py b/treecutter/constants.py index a3b4790..7ba6626 100644 --- a/treecutter/constants.py +++ b/treecutter/constants.py @@ -15,5 +15,10 @@ XLINK = "{%s}" % XLINK_NS HTML_NS="http://www.w3.org/1999/xhtml" HTML = "{%s}" % HTML_NS NSMAP = {None : DB_NS, - 'xlink' : XLINK_NS} - + 'xi' : XI_NS, + 'xlink' : XLINK_NS, + 'html' : HTML_NS} +XPATH = {'db' : DB_NS, + 'xi' : XI_NS, + 'xlink' : XLINK_NS, + 'html' : HTML_NS} diff --git a/treecutter/directory.py b/treecutter/directory.py index dc60c37..19c4d8b 100644 --- a/treecutter/directory.py +++ b/treecutter/directory.py @@ -1,7 +1,7 @@ #!/usr/bin/python import os import fnmatch -from amara import bindery +from lxml import etree import treecutter.constants as const class Directory(): @@ -15,9 +15,9 @@ class Directory(): for filename in filenames: if fnmatch.fnmatch(filename, '*.xml'): file_ = os.path.join(dirname,filename) - doc = bindery.parse(file_, prefixes=const.PREFIXES) - title = doc.xml_select(u'/db:article/db:info/db:title') - menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev') + doc = etree.parse(file_) + title = doc.xpath(u'/db:article/db:info/db:title',namespaces=const.XPATH) + menu = doc.xpath(u'/db:article/db:info/db:titleabbrev',namespaces=const.XPATH) if title and menu: base = file_.split('.')[1] link = base.replace('index','') diff --git 
a/treecutter/page.py b/treecutter/page.py index bf14a00..22d380e 100644 --- a/treecutter/page.py +++ b/treecutter/page.py @@ -3,8 +3,7 @@ import os import subprocess import tempfile import re -from amara import bindery -from amara.xslt import transform +from lxml import etree from Cheetah.Template import Template from pkg_resources import resource_filename, resource_listdir from time import time @@ -36,72 +35,82 @@ class Page(): self._rendered_article = art def prepare(self): - self._doc = bindery.parse(self._file, prefixes=const.PREFIXES) - if self._doc.xml_select(u'/db:article/db:info/db:title'): - self._title = unicode(self._doc.article.info.title) - if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'): - self._menu = unicode(self._doc.article.info.titleabbrev) + self._doc = etree.parse(self._file) + t = self._doc.xpath(u'/db:article/db:info/db:title',namespaces=const.XPATH) + if t: + self._title = unicode(t[0].text) + ta = self._doc.xpath(u'/db:article/db:info/db:titleabbrev',namespaces=const.XPATH) + if ta: + self._menu = unicode(ta[0].text) dirname = os.path.dirname(self._file) cwd = os.getcwd() - code = self._doc.xml_select(u"//xi:include[@parse='text']") + code = self._doc.xpath(u"//xi:include[@parse='text']",namespaces=const.XPATH) if code: for c in code: - (p, ext) = os.path.splitext(c.href) + href = c.get('href') + alang = c.get('accept-language') + xpointer = c.get('xpointer') + (p, ext) = os.path.splitext(href) if ext in const.valid_scripts: exe = [] - script = os.path.join(os.path.abspath(dirname)+'/'+c.href) + script = os.path.join(os.path.abspath(dirname)+'/'+href) if os.path.isfile(script): exe.append(script) else: - if c.href in resource_listdir('xinclude', ''): - script = resource_filename('xinclude', c.href) + if href in resource_listdir('xinclude', ''): + script = resource_filename('xinclude', href) exe.append(script) else: - print "Script "+c.href+" in "+self._file+" missing" - if c.xml_select(u"//xi:include[@accept-language]"): - 
alang = c.xml_attributes[None, "accept-language"] + print "Script "+href+" in "+self._file+" missing" + if alang: exe.append("lang="+alang) - if c.xml_select(u"//xi:include[@xpointer]"): - exe.append("xptr="+c.xpointer) - print " executing %15s" % (c.href), + if xpointer: + exe.append("xptr="+xpointer) + print " executing %15s" % (href), ts = time() os.chdir(dirname) - xml = subprocess.Popen(exe,stdout=subprocess.PIPE) + xml = subprocess.Popen(exe,stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + (stdout, stderr) = xml.communicate() + if stderr: + print " ".join(exe)+" ERROR : [ "+stderr+" ]" os.chdir(cwd) - xmlblock = str(xml.stdout.read()) te = time() - print " [%5.2f s] (%s)" % (round(te-ts,2),c.xpointer) - xstr = bindery.parse(xmlblock) - idp = c.xml_index_on_parent - for x in xstr.xml_children: - c.xml_parent.xml_insert(idp,x) - c.xml_parent.xml_remove(c) + print " [%5.2f s] (%s)" % (round(te-ts,2),xpointer) + xstr = etree.fromstring(stdout) +# inserting the generated code and remove the xinclude reference + idp = c.getparent() + idp.insert(idp.index(c)+1,xstr) + idp.remove(c) - for r in self._doc.xml_select(u"//db:link[@xl:href]"): - rf = os.path.join(dirname,r.href) + for r in self._doc.xpath(u"//db:link[@xlink:href]",namespaces=const.XPATH): + rf = os.path.join(dirname,r.get(const.XLINK+'href')) if os.path.isfile(rf): self._resources.append(rf) - for i in self._doc.xml_select(u"//db:imagedata[@fileref]"): - im = os.path.join(dirname,i.fileref) + for i in self._doc.xpath(u"//db:imagedata[@fileref]",namespaces=const.XPATH): + im = os.path.join(dirname,i.get('fileref')) if os.path.isfile(im): self._resources.append(im) - for i in self._doc.xml_select(u"//html:form[@action]"): - pyscript = re.split('\.py',i.action,1)[0]+'.py' + for i in self._doc.xpath(u"//html:form[@action]",namespaces=const.XPATH): + pyscript = re.split('\.py',i.get('action'),1)[0]+'.py' im = os.path.join(dirname,pyscript) if os.path.isfile(im): self._resources.append(im) def render(self, 
style): - # amara can not handle the docbook stylesheets - # xmlarticle = transform(doc,style_xslt) + +# xslt_root = etree.XML(open(style+"docbook.xsl", 'r').read()) +# transform = etree.XSLT(xslt_root) +# result = etree.tostring(transform(xml_root)) + cwd = os.getcwd() dirname = os.path.dirname(self._file) os.chdir(dirname) infile = os.path.basename(tempfile.mktemp()) outfile = tempfile.mktemp() tfi = open(infile,'w') - tfi.write(self._doc.xml_encode(omit_xml_declaration=True)) + tfi.write(etree.tostring(self._doc,encoding='UTF-8',pretty_print=False)) tfi.close() # cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt] cmd = ["xsltproc","--xinclude","--output",outfile,style+"docbook.xsl",infile] diff --git a/treecutter/sitemap.py b/treecutter/sitemap.py index 688cb0c..a8612f2 100644 --- a/treecutter/sitemap.py +++ b/treecutter/sitemap.py @@ -4,8 +4,10 @@ import re import shutil import gettext import tempfile -from amara import bindery +from lxml import etree +from lxml.builder import ElementMaker from time import time +from treecutter import constants as const from treecutter.trie import Trie from treecutter.link import Link from treecutter.tools import ssh_cmd, publish, mkdir_p @@ -16,7 +18,7 @@ class Sitemap(): self._file = 'sitemap.txt' self._tree = Trie() self._sitelang = set() - self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml') + self._isocode = etree.parse('/usr/share/xml/iso-codes/iso_639_3.xml') self._tranlang = {} self._tmptarget = tempfile.mkdtemp()+'/' @@ -101,19 +103,20 @@ class Sitemap(): return self._tree.menu(lang,page,cssclass) def lang_menu(self,lang,link): - html = "" - return html + li = html.li(html.a(ln,href=p,hreflang=l)) + menu.append(li) + return etree.tostring(menu,encoding='UTF-8',pretty_print=False) def publish(self,output,style): ssh_cmd(output,"mkdir -p") -- 2.30.2