From bd4a0b33c130d89a7b306c9b73474c45a0ec781f Mon Sep 17 00:00:00 2001
From: Fredrik Unger
Date: Mon, 2 Apr 2012 16:01:30 +0200
Subject: [PATCH] Preparing refactoring the single source into a python
 structure

---
 src/tree-cutter.py => treecutter/const.py |   0
 treecutter/directory.py                   | 456 ++++++++++++++++++++++
 treecutter/link.py                        | 456 ++++++++++++++++++++++
 treecutter/main.py                        | 456 ++++++++++++++++++++++
 treecutter/page.py                        | 456 ++++++++++++++++++++++
 treecutter/sitemap.py                     | 456 ++++++++++++++++++++++
 treecutter/tools.py                       | 456 ++++++++++++++++++++++
 treecutter/trie.py                        | 456 ++++++++++++++++++++++
 8 files changed, 3192 insertions(+)
 rename src/tree-cutter.py => treecutter/const.py (100%)
 create mode 100755 treecutter/directory.py
 create mode 100755 treecutter/link.py
 create mode 100755 treecutter/main.py
 create mode 100755 treecutter/page.py
 create mode 100755 treecutter/sitemap.py
 create mode 100755 treecutter/tools.py
 create mode 100755 treecutter/trie.py

diff --git a/src/tree-cutter.py b/treecutter/const.py
similarity index 100%
rename from src/tree-cutter.py
rename to treecutter/const.py
diff --git a/treecutter/directory.py b/treecutter/directory.py
new file mode 100755
index 0000000..6f03ff7
--- /dev/null
+++ b/treecutter/directory.py
@@ -0,0 +1,456 @@
+#!/usr/bin/python
+import os
+import fnmatch
+import subprocess
+import amara
+import re
+import tempfile
+import errno
+import time
+import argparse
+import shutil
+import pygraphviz as pgv
+import glob
+import gettext
+import shutil
+from amara import bindery
+from amara.xslt import transform
+from Cheetah.Template import Template
+
+parser = argparse.ArgumentParser(description='Process docbook article tree.')
+parser.add_argument('--style', nargs='?',
+                    default=os.path.dirname(os.getcwd())+'/style/default/')
+parser.add_argument('--output', nargs='?',
+                    default=os.path.dirname(os.getcwd())+'/htdocs/')
+args = parser.parse_args()
+
+style_xslt = args.style+"docbook.xsl"
+outputdir = args.output
+
+tmptarget = tempfile.mkdtemp()+'/'
+
+valid_scripts = ['.py','.pl']
+MAXLEVEL = 10000
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc: # Python >2.5
+        if exc.errno == errno.EEXIST:
+            pass
+        else: raise
+
+def publish(src,target):
+    cmd = ["rsync","-a","--delete",src,target]
+    retcode = subprocess.call(cmd)
+    if retcode:
+        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
+
+def ssh_cmd(target, command):
+    t = target.split(":")
+    c = command.split()
+    cmd = ["ssh",t[0],c[0],c[1],t[1]]
+    retcode = subprocess.call(cmd)
+    if retcode:
+        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
+
+PREFIXES={u'db': u'http://docbook.org/ns/docbook',
+          u'xi': u'http://www.w3.org/2001/XInclude',
+          u'xl': u'http://www.w3.org/1999/xlink',
+          u'html' : u'http://www.w3.org/1999/xhtml'}
+
+class Directory():
+    """Class containing the state of the directory with articles"""
+    def __init__(self):
+        self._cwd = '.'
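+        # one site-relative link per article found below the current directory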
+        self._tree = []
+
+    def scan(self):
+        for dirname, dirnames, filenames in os.walk(self._cwd):
+            for filename in filenames:
+                if fnmatch.fnmatch(filename, '*.xml'):
+                    file_ = os.path.join(dirname,filename)
+                    doc = bindery.parse(file_, prefixes=PREFIXES)
+                    title = doc.xml_select(u'/db:article/db:info/db:title')
+                    menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
+                    if title and menu:
+                        base = file_.split('.')[1]
+                        link = base.replace('index','')
+                        self._tree.append(link)
+
+    def set(self):
+        return set(self._tree)
+
+class Page():
+    """Class representing a version of a webpage"""
+    def __init__(self,link,page):
+        self._link = link
+        self._file = page[1]
+        self._lang = page[0]
+        self._doc = None
+        self._resources = []
+        self._title = None
+        self._menu = None
+        self._rendered_article = None
+
+    def language(self):
+        return self._lang
+
+    def resources(self):
+        return set(self._resources)
+
+    def menu(self):
+        return self._menu
+
+    def set_article(self,art):
+        self._rendered_article = art
+
+    def prepare(self):
+        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
+        if self._doc.xml_select(u'/db:article/db:info/db:title'):
+            self._title = unicode(self._doc.article.info.title)
+        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
+            self._menu = unicode(self._doc.article.info.titleabbrev)
+
+        dirname = os.path.dirname(self._file)
+        code = self._doc.xml_select(u"//xi:include[@parse='text']")
+        if code:
+            for c in code:
+                (p, ext) = os.path.splitext(c.href)
+                if ext in valid_scripts:
+                    exe = []
+                    exe.append(os.path.join(os.path.abspath(dirname)+'/'+c.href))
+                    if c.xml_select(u"//xi:include[@accept-language]"):
+                        alang = c.xml_attributes[None, "accept-language"]
+                        exe.append("lang="+alang)
+                    if c.xml_select(u"//xi:include[@xpointer]"):
+                        exe.append("xptr="+c.xpointer)
+                    xml = subprocess.Popen(exe,stdout=subprocess.PIPE)
+                    xstr = bindery.parse(str(xml.stdout.read()))
+                    idp = c.xml_index_on_parent
+                    for x in xstr.xml_children:
+                        c.xml_parent.xml_insert(idp,x)
+                    c.xml_parent.xml_remove(c)
+
+        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
+            rf = os.path.join(dirname,r.href)
+            if os.path.isfile(rf):
+                self._resources.append(rf)
+        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
+            im = os.path.join(dirname,i.fileref)
+            if os.path.isfile(im):
+                self._resources.append(im)
+        for i in self._doc.xml_select(u"//html:form[@action]"):
+            pyscript = re.split('\.py',i.action,1)[0]+'.py'
+            im = os.path.join(dirname,pyscript)
+            if os.path.isfile(im):
+                self._resources.append(im)
+
+    def render(self):
+        # amara can not handle the docbook stylesheets
+        # xmlarticle = transform(doc,style_xslt)
+        cwd = os.getcwd()
+        dirname = os.path.dirname(self._file)
+        os.chdir(dirname)
+        infile = os.path.basename(tempfile.mktemp())
+        outfile = tempfile.mktemp()
+        tfi = open(infile,'w')
+        tfi.write(self._doc.xml_encode(omit_xml_declaration=True))
+        tfi.close()
+#        cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
+        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
+        retcode = subprocess.call(cmd)
+        if retcode:
+            print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
+        tfo = open(outfile,'r')
+        self._rendered_article = tfo.read()
+        tfo.close()
+        os.remove(infile)
+        os.remove(outfile)
+        os.chdir(cwd)
+
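+    # Combine the rendered article with the generated menus through the
+    # per-language Cheetah template and write the page into tmptarget.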
+    def template(self,sitemap):
+        htmlmenu = sitemap.gen_menu(self._lang,None,"menu")
+        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
+        langmenu = sitemap.lang_menu(self._lang,self._link)
+        template = Template(file=args.style+'index.'+self._lang+'.html.tmpl',
+                            searchList=[{'title':self._title},
+                                        {'menu':htmlmenu},
+                                        {'article':self._rendered_article},
+                                        {'levelmenu':levelmenu},
+                                        {'langmenu':langmenu}])
+        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
+        mkdir_p(os.path.dirname(outfile))
+        out = open(outfile, 'w')
+        out.write(str(template))
+        out.close()
+
+
+class Link():
+    """Class representing a webpage on the site"""
+    def __init__(self,link):
+        self._link = link
+        # find the representations of the link.
+        self._pages = []
+        path = link
+        if self._link[-1] == '/':
+            path = path+'index'
+        lang = self._scan_languages(path)
+        for l in lang:
+            self._pages.append(Page(self,l))
+
+    def add_page(self,l):
+        self._pages.append(Page(self,l))
+
+    def _scan_languages(self,path):
+        lang = []
+        for l in glob.glob('.'+path+'*'):
+            ls = l.split('.')
+            if len(ls) > 3 and ls[3] == 'xml':
+                lang.append((ls[2],l))
+        return lang
+
+    def link(self):
+        return self._link
+
+    def prepare(self):
+        for page in self._pages:
+            page.prepare()
+
+    def languages(self):
+        p = []
+        for page in self._pages:
+            p.append(page.language())
+        return p
+
+    def render(self):
+        for page in self._pages:
+            page.render()
+
+    def template(self,sitemap):
+        for page in self._pages:
+            page.template(sitemap)
+
+    def page(self,lang):
+        for page in self._pages:
+            if page.language()==lang:
+                return page
+        return None
+
+    def resources(self):
+        res = set()
+        for page in self._pages:
+            res = res.union(page.resources())
+        return res
+
+
+class Node():
+    def __init__(self,token,value):
+        self._token = token
+        self._value = value
+        self._children = []
+
+    def token(self):
+        return self._token
+
+    def value(self):
+        return self._value
+
+    def children(self):
+        return self._children
+
+class Trie():
+    def __init__(self):
+        self._root = []
+
+    def __iter__(self):
+        return self.inorder(self._root)
+
+    def inorder(self,t):
+        for l in t:
+            yield l.value()
+            for x in self.inorder(l.children()):
+                yield x
+
+    def _add(self,trie, key, content):
+        # is the key a leaf
+        k = key.pop(0)
+        if key == []:
+            node = Node(k,content)
+            trie.append(node)
+        else:
+            for ch in trie:
+                if ch.token() == k:
+                    self._add(ch.children(), key, content)
+
+    def add(self,key, content):
+        self._add(self._root, key, content)
+
+    def _graph(self, trie, G):
+        for l in trie:
+            G.add_node(l.token())
+            for ch in l.children():
+                G.add_edge(l.token(),ch.token())
+            self._graph(l.children(), G)
+
+    def graph(self):
+        G = pgv.AGraph(directed=True)
+        G.add_node("sitemap")
+        for ch in self._root:
+            G.add_edge("sitemap",ch.token())
+        self._graph(self._root, G)
+#        G.layout('dot')
+#        G.draw('g.png')
+#        print G.string()
+
+    def _menu(self, trie, lang, page, css):
+        html = "<ul%s>\n" % css
+        for l in trie:
+            sel = ''
+            p = l.value().page(lang)
+            if p == page:
+                sel = ' class="selected"'
+            if p != None:
+                html += '<li%s><a href="%s">%s</a></li>\n' \
+                    % (sel,l.value().link(),p.menu())
+            else:
+                html += '<li%s><a href="%s">%s*</a></li>\n' \
+                    % (sel,l.value().link(), l.value().page('en').menu())
+            if l.children():
+                html += self._menu(l.children(), lang, page, "")
+        html += "</ul>\n"
+        return html
+
+    def menu(self,lang,page,cssclass):
+        css = ''
+        if cssclass:
+            css = ' class="'+cssclass+'"'
+        return self._menu(self._root, lang, page, css)
+
+class Sitemap():
+    """Class keeping the internal site structure"""
+    def __init__(self):
+        self._file = 'sitemap.txt'
+        self._tree = Trie()
+        self._sitelang = set()
+        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
+        self._tranlang = {}
+
+    def add_link(self, link):
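+        # tokenize a link such as '/a/b/c' into the trie key ['/a/', 'b/', 'c']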
+        tokens = filter(None,re.split(r'(^/[\w-]*/|[\w-]*/)',link))
+        self._tree.add(tokens,Link(link))
+
+    def write_map(self):
+        f = open(self._file,'w')
+        f.write('\n'.join(link.link() for link in self._tree))
+        f.close()
+
+    def read_map(self):
+        try:
+            f = open(self._file)
+            sml = f.read().split()
+            f.close()
+            for line in sml:
+                self.add_link(line)
+        except IOError, what_error:
+            print 'INFO: Could not read sitemap.txt - one will be created'
+
+    def set(self):
+        return set(link.link() for link in self._tree)
+
+    def process(self):
+        t1 = time.time()
+        for link in self._tree:
+            link.prepare()
+        t2 = time.time()
+        print "Prepare  [%5.2f s]" % (round(t2-t1,2))
+        for link in self._tree:
+            self._sitelang = self._sitelang.union(set(link.languages()))
+        for tran in self._sitelang:
+            if tran != 'en':
+                self._tranlang[tran] = gettext.translation('iso_639_3',
+                                                           languages=[tran])
+        t3 = time.time()
+        print "Language [%5.2f s]" % (round(t3-t2,2))
+        for link in self._tree:
+            link.render()
+        t4 = time.time()
+        print "Render   [%5.2f s]" % (round(t4-t3,2))
+        for link in self._tree:
+            link.template(self)
+        t5 = time.time()
+        print "Template [%5.2f s]" % (round(t5-t4,2))
+        t6 = time.time()
+        res = set()
+        cwd = os.getcwd()
+        for link in self._tree:
+            res = res.union(link.resources())
+        for f in res:
+            outfile = tmptarget+f
+            mkdir_p(os.path.dirname(outfile))
+            shutil.copyfile(f,outfile)
+        print "Resources[%5.2f s]" % (round(t6-t5,2))
+        sitmaplink = Link('/sitemap')
+        for l in self._sitelang:
+            sitmaplink.add_page((l,'/sitemap.'+l+'.xml'))
+        for l in self._sitelang:
+            sitmaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap"))
+            sitmaplink.page(l).template(self)
+        t7 = time.time()
+        print "Sitemap  [%5.2f s]" % (round(t7-t6,2))
+
+    def graph(self):
+        self._tree.graph()
+
+    def gen_menu(self,lang,page,cssclass):
+        return self._tree.menu(lang,page,cssclass)
+
+    def lang_menu(self,lang,link):
+        html = "<ul>"
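+        # resolve each language code to its full name via the iso-codes
+        # database, translated through gettext unless the page is English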
    " + for l in link.languages(): + isoxml = u"//iso_639_3_entry[@*='"+l+"']" + ln = self._isocode.xml_select(isoxml)[0].name + if lang != 'en': + ln = self._tranlang[lang].gettext(ln) + p = link.link() + if p[-1] == '/': + p = p +'index' + p = p+'.'+l + html += '
+            html += '<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln)
+        html += "</ul>"
" + return html + + def publish(self): + ssh_cmd(args.output,"mkdir -p") + publish(tmptarget, args.output) + for res in ["css","images","js","favicon.ico"]: + if (os.path.exists(args.style+res)): + publish(args.style+res, args.output) + ssh_cmd(args.output,"chmod a+rx") + +ts = time.time() +dir_ = Directory() +sitemap = Sitemap() + +dir_.scan() +sitemap.read_map() + +missing = dir_.set() - sitemap.set() +removed = sitemap.set() - dir_.set() +for page in removed: + print page+' pages missing!!' +for page in missing: + print 'adding missing page '+page + sitemap.add_link(page) +if len(missing)+len(removed) != 0: + print 'writing new sitemap - please adjust if needed' + sitemap.write_map() +sitemap.graph() + +sitemap.process() + +t1 = time.time() +sitemap.publish() +t2 = time.time() +print "Publish [%5.2f s]" % (round(t2-t1,2)) +print "Total [%5.2f s]" % (round(t2-ts,2)) diff --git a/treecutter/link.py b/treecutter/link.py new file mode 100755 index 0000000..6f03ff7 --- /dev/null +++ b/treecutter/link.py @@ -0,0 +1,456 @@ +#!/usr/bin/python +import os +import fnmatch +import subprocess +import amara +import re +import tempfile +import errno +import time +import argparse +import shutil +import pygraphviz as pgv +import glob +import gettext +import shutil +from amara import bindery +from amara.xslt import transform +from Cheetah.Template import Template + +parser = argparse.ArgumentParser(description='Process docbook article tree.') +parser.add_argument('--style', nargs='?', + default=os.path.dirname(os.getcwd())+'/style/default/') +parser.add_argument('--output', nargs='?', + default=os.path.dirname(os.getcwd())+'/htdocs/') +args = parser.parse_args() + +style_xslt = args.style+"docbook.xsl" +outputdir = args.output + +tmptarget = tempfile.mkdtemp()+'/' + +valid_scripts = ['.py','.pl'] +MAXLEVEL = 10000 + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST: + pass + else: raise + +def publish(src,target): + cmd = ["rsync","-a","--delete",src,target] + retcode = subprocess.call(cmd) + if retcode: + print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']' + +def ssh_cmd(target, command): + t = target.split(":") + c = command.split() + cmd = ["ssh",t[0],c[0],c[1],t[1]] + retcode = subprocess.call(cmd) + if retcode: + print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']' + +PREFIXES={u'db': u'http://docbook.org/ns/docbook', + u'xi': u'http://www.w3.org/2001/XInclude', + u'xl': u'http://www.w3.org/1999/xlink', + u'html' : u'http://www.w3.org/1999/xhtml'} + +class Directory(): + """Class containing the state of the directory with articles""" + def __init__(self): + self._cwd = '.' 
diff --git a/treecutter/main.py b/treecutter/main.py
new file mode 100755
index 0000000..6f03ff7
--- /dev/null
+++ b/treecutter/main.py
@@ -0,0 +1,456 @@
diff --git a/treecutter/page.py b/treecutter/page.py
new file mode 100755
index 0000000..6f03ff7
--- /dev/null
+++ b/treecutter/page.py
@@ -0,0 +1,456 @@
diff --git a/treecutter/sitemap.py b/treecutter/sitemap.py
new file mode 100755
index 0000000..6f03ff7
--- /dev/null
+++ b/treecutter/sitemap.py
@@ -0,0 +1,456 @@
diff --git a/treecutter/tools.py b/treecutter/tools.py
new file mode 100755
index 0000000..6f03ff7
--- /dev/null
+++ b/treecutter/tools.py
@@ -0,0 +1,456 @@
+ self._tree = [] + + def scan(self): + for dirname, dirnames, filenames in os.walk(self._cwd): + for filename in filenames: + if fnmatch.fnmatch(filename, '*.xml'): + file_ = os.path.join(dirname,filename) + doc = bindery.parse(file_, prefixes=PREFIXES) + title = doc.xml_select(u'/db:article/db:info/db:title') + menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev') + if title and menu: + base = file_.split('.')[1] + link = base.replace('index','') + self._tree.append(link) + + def set(self): + return set(self._tree) + +class Page(): + """Class representing a version of a webpage""" + def __init__(self,link,page): + self._link = link + self._file = page[1] + self._lang = page[0] + self._doc = None + self._resources = [] + self._title = None + self._menu = None + self._rendered_article = None + + def language(self): + return self._lang + + def resources(self): + return set(self._resources) + + def menu(self): + return self._menu + + def set_article(self,art): + self._rendered_article = art + + def prepare(self): + self._doc = bindery.parse(self._file, prefixes=PREFIXES) + if self._doc.xml_select(u'/db:article/db:info/db:title'): + self._title = unicode(self._doc.article.info.title) + if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'): + self._menu = unicode(self._doc.article.info.titleabbrev) + + dirname = os.path.dirname(self._file) + code = self._doc.xml_select(u"//xi:include[@parse='text']") + if code: + for c in code: + (p, ext) = os.path.splitext(c.href) + if ext in valid_scripts: + exe = [] + exe.append(os.path.join(os.path.abspath(dirname)+'/'+c.href)) + if c.xml_select(u"//xi:include[@accept-language]"): + alang = c.xml_attributes[None, "accept-language"] + exe.append("lang="+alang) + if c.xml_select(u"//xi:include[@xpointer]"): + exe.append("xptr="+c.xpointer) + xml = subprocess.Popen(exe,stdout=subprocess.PIPE) + xstr = bindery.parse(str(xml.stdout.read())) + idp = c.xml_index_on_parent + for x in xstr.xml_children: + c.xml_parent.xml_insert(idp,x) + c.xml_parent.xml_remove(c) + + for r in self._doc.xml_select(u"//db:link[@xl:href]"): + rf = os.path.join(dirname,r.href) + if os.path.isfile(rf): + self._resources.append(rf) + for i in self._doc.xml_select(u"//db:imagedata[@fileref]"): + im = os.path.join(dirname,i.fileref) + if os.path.isfile(im): + self._resources.append(im) + for i in self._doc.xml_select(u"//html:form[@action]"): + pyscript = re.split('\.py',i.action,1)[0]+'.py' + im = os.path.join(dirname,pyscript) + if os.path.isfile(im): + self._resources.append(im) + + def render(self): + # amara can not handle the docbook stylesheets + # xmlarticle = transform(doc,style_xslt) + cwd = os.getcwd() + dirname = os.path.dirname(self._file) + os.chdir(dirname) + infile = os.path.basename(tempfile.mktemp()) + outfile = tempfile.mktemp() + tfi = open(infile,'w') + tfi.write(self._doc.xml_encode(omit_xml_declaration=True)) + tfi.close() +# cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt] + cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile] + retcode = subprocess.call(cmd) + if retcode: + print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']' + tfo = open(outfile,'r') + self._rendered_article = tfo.read() + tfo.close() + os.remove(infile) + os.remove(outfile) + os.chdir(cwd) + + def template(self,sitemap): + htmlmenu = sitemap.gen_menu(self._lang,None,"menu") + levelmenu = sitemap.gen_menu(self._lang,self,"tree") + langmenu = sitemap.lang_menu(self._lang,self._link) + template = 
Template(file=args.style+'index.'+self._lang+'.html.tmpl', + searchList=[{'title':self._title}, + {'menu':htmlmenu}, + {'article':self._rendered_article}, + {'levelmenu':levelmenu}, + {'langmenu':langmenu}]) + outfile = tmptarget+'html'.join(self._file.rsplit('xml',1)) + mkdir_p(os.path.dirname(outfile)) + out = open(outfile, 'w') + out.write(str(template)) + out.close() + + +class Link(): + """Class representing a webpage on the site""" + def __init__(self,link): + self._link = link + # find the representations of the link. + self._pages = [] + path = link + if self._link[-1] == '/': + path = path+'index' + lang = self._scan_languages(path) + for l in lang: + self._pages.append(Page(self,l)) + + def add_page(self,l): + self._pages.append(Page(self,l)) + + def _scan_languages(self,path): + lang = [] + for l in glob.glob('.'+path+'*'): + ls = l.split('.') + if len(ls) > 3 and ls[3] == 'xml': + lang.append((ls[2],l)) + return lang + + def link(self): + return self._link + + def prepare(self): + for page in self._pages: + page.prepare() + + def languages(self): + p = [] + for page in self._pages: + p.append(page.language()) + return p + + def render(self): + for page in self._pages: + page.render() + + def template(self,sitemap): + for page in self._pages: + page.template(sitemap) + + def page(self,lang): + for page in self._pages: + if page.language()==lang: + return page + return None + + def resources(self): + res = set() + for page in self._pages: + res = res.union(page.resources()) + return res + + +class Node(): + def __init__(self,token,value): + self._token = token + self._value = value + self._children = [] + + def token(self): + return self._token + + def value(self): + return self._value + + def children(self): + return self._children + +class Trie(): + def __init__(self): + self._root = [] + + def __iter__(self): + return self.inorder(self._root) + + def inorder(self,t): + for l in t: + yield l.value() + for x in self.inorder(l.children()): + yield x + + def _add(self,trie, key, content): + # is the key a leaf + k = key.pop(0) + if key == []: + node = Node(k,content) + trie.append(node) + else: + for ch in trie: + if ch.token() == k: + self._add(ch.children(), key, content) + + def add(self,key, content): + self._add(self._root, key, content) + + def _graph(self, trie, G): + for l in trie: + G.add_node(l.token()) + for ch in l.children(): + G.add_edge(l.token(),ch.token()) + self._graph(l.children(), G) + + def graph(self): + G = pgv.AGraph(directed=True) + G.add_node("sitemap") + for ch in self._root: + G.add_edge("sitemap",ch.token()) + self._graph(self._root, G) +# G.layout('dot') +# G.draw('g.png') +# print G.string() + + def _menu(self, trie, lang, page, css): + html = "\n" % css + for l in trie: + sel = '' + p = l.value().page(lang) + if p == page: + sel = ' class="selected"' + if p != None: + html += '%s\n' \ + % (sel,l.value().link(),p.menu()) + else: + html += '%s*\n' \ + % (sel,l.value().link(), l.value().page('en').menu()) + if l.children(): + html += self._menu(l.children(), lang, page, "") + html += "\n" + return html + + def menu(self,lang,page,cssclass): + css = '' + if cssclass: + css = ' class="'+cssclass+'"' + return self._menu(self._root, lang, page, css) + +class Sitemap(): + """Class keeping the internal site structure""" + def __init__(self): + self._file = 'sitemap.txt' + self._tree = Trie() + self._sitelang = set() + self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml') + self._tranlang = {} + + def add_link(self, link): + tokens = 
filter(None,re.split(r'(^/[\w-]*/|[\w-]*/)',link)) + self._tree.add(tokens,Link(link)) + + def write_map(self): + f = open(self._file,'w') + f.write('\n'.join(link.link() for link in self._tree)) + f.close() + + def read_map(self): + try: + f = open(self._file) + sml = f.read().split() + f.close() + for line in sml: + self.add_link(line) + except IOError, what_error: + print 'INFO: Could not read sitemap.txt - one will be created' + + def set(self): + return set(link.link() for link in self._tree) + + def process(self): + t1 = time.time() + for link in self._tree: + link.prepare() + t2 = time.time() + print "Prepare [%5.2f s]" % (round(t2-t1,2)) + for link in self._tree: + self._sitelang = self._sitelang.union(set(link.languages())) + for tran in self._sitelang: + if tran != 'en': + self._tranlang[tran] = gettext.translation('iso_639_3', + languages=[tran]) + t3 = time.time() + print "Language [%5.2f s]" % (round(t3-t2,2)) + for link in self._tree: + link.render() + t4 = time.time() + print "Render [%5.2f s]" % (round(t4-t3,2)) + for link in self._tree: + link.template(self) + t5 = time.time() + print "Template [%5.2f s]" % (round(t5-t4,2)) + t6 = time.time() + res = set() + cwd = os.getcwd() + for link in self._tree: + res = res.union(link.resources()) + for f in res: + outfile = tmptarget+f + mkdir_p(os.path.dirname(outfile)) + shutil.copyfile(f,outfile) + print "Resources[%5.2f s]" % (round(t6-t5,2)) + sitmaplink = Link('/sitemap') + for l in self._sitelang: + sitmaplink.add_page((l,'/sitemap.'+l+'.xml')) + for l in self._sitelang: + sitmaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap")) + sitmaplink.page(l).template(self) + t7 = time.time() + print "Sitemap [%5.2f s]" % (round(t7-t6,2)) + + def graph(self): + self._tree.graph() + + def gen_menu(self,lang,page,cssclass): + return self._tree.menu(lang,page,cssclass) + + def lang_menu(self,lang,link): + html = "
    " + for l in link.languages(): + isoxml = u"//iso_639_3_entry[@*='"+l+"']" + ln = self._isocode.xml_select(isoxml)[0].name + if lang != 'en': + ln = self._tranlang[lang].gettext(ln) + p = link.link() + if p[-1] == '/': + p = p +'index' + p = p+'.'+l + html += '
  • %s
  • ' % (p, l, ln) + html += "
" + return html + + def publish(self): + ssh_cmd(args.output,"mkdir -p") + publish(tmptarget, args.output) + for res in ["css","images","js","favicon.ico"]: + if (os.path.exists(args.style+res)): + publish(args.style+res, args.output) + ssh_cmd(args.output,"chmod a+rx") + +ts = time.time() +dir_ = Directory() +sitemap = Sitemap() + +dir_.scan() +sitemap.read_map() + +missing = dir_.set() - sitemap.set() +removed = sitemap.set() - dir_.set() +for page in removed: + print page+' pages missing!!' +for page in missing: + print 'adding missing page '+page + sitemap.add_link(page) +if len(missing)+len(removed) != 0: + print 'writing new sitemap - please adjust if needed' + sitemap.write_map() +sitemap.graph() + +sitemap.process() + +t1 = time.time() +sitemap.publish() +t2 = time.time() +print "Publish [%5.2f s]" % (round(t2-t1,2)) +print "Total [%5.2f s]" % (round(t2-ts,2)) diff --git a/treecutter/trie.py b/treecutter/trie.py new file mode 100755 index 0000000..6f03ff7 --- /dev/null +++ b/treecutter/trie.py @@ -0,0 +1,456 @@ +#!/usr/bin/python +import os +import fnmatch +import subprocess +import amara +import re +import tempfile +import errno +import time +import argparse +import shutil +import pygraphviz as pgv +import glob +import gettext +import shutil +from amara import bindery +from amara.xslt import transform +from Cheetah.Template import Template + +parser = argparse.ArgumentParser(description='Process docbook article tree.') +parser.add_argument('--style', nargs='?', + default=os.path.dirname(os.getcwd())+'/style/default/') +parser.add_argument('--output', nargs='?', + default=os.path.dirname(os.getcwd())+'/htdocs/') +args = parser.parse_args() + +style_xslt = args.style+"docbook.xsl" +outputdir = args.output + +tmptarget = tempfile.mkdtemp()+'/' + +valid_scripts = ['.py','.pl'] +MAXLEVEL = 10000 + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST: + pass + else: raise + +def publish(src,target): + cmd = ["rsync","-a","--delete",src,target] + retcode = subprocess.call(cmd) + if retcode: + print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']' + +def ssh_cmd(target, command): + t = target.split(":") + c = command.split() + cmd = ["ssh",t[0],c[0],c[1],t[1]] + retcode = subprocess.call(cmd) + if retcode: + print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']' + +PREFIXES={u'db': u'http://docbook.org/ns/docbook', + u'xi': u'http://www.w3.org/2001/XInclude', + u'xl': u'http://www.w3.org/1999/xlink', + u'html' : u'http://www.w3.org/1999/xhtml'} + +class Directory(): + """Class containing the state of the directory with articles""" + def __init__(self): + self._cwd = '.' 
+ self._tree = [] + + def scan(self): + for dirname, dirnames, filenames in os.walk(self._cwd): + for filename in filenames: + if fnmatch.fnmatch(filename, '*.xml'): + file_ = os.path.join(dirname,filename) + doc = bindery.parse(file_, prefixes=PREFIXES) + title = doc.xml_select(u'/db:article/db:info/db:title') + menu = doc.xml_select(u'/db:article/db:info/db:titleabbrev') + if title and menu: + base = file_.split('.')[1] + link = base.replace('index','') + self._tree.append(link) + + def set(self): + return set(self._tree) + +class Page(): + """Class representing a version of a webpage""" + def __init__(self,link,page): + self._link = link + self._file = page[1] + self._lang = page[0] + self._doc = None + self._resources = [] + self._title = None + self._menu = None + self._rendered_article = None + + def language(self): + return self._lang + + def resources(self): + return set(self._resources) + + def menu(self): + return self._menu + + def set_article(self,art): + self._rendered_article = art + + def prepare(self): + self._doc = bindery.parse(self._file, prefixes=PREFIXES) + if self._doc.xml_select(u'/db:article/db:info/db:title'): + self._title = unicode(self._doc.article.info.title) + if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'): + self._menu = unicode(self._doc.article.info.titleabbrev) + + dirname = os.path.dirname(self._file) + code = self._doc.xml_select(u"//xi:include[@parse='text']") + if code: + for c in code: + (p, ext) = os.path.splitext(c.href) + if ext in valid_scripts: + exe = [] + exe.append(os.path.join(os.path.abspath(dirname)+'/'+c.href)) + if c.xml_select(u"//xi:include[@accept-language]"): + alang = c.xml_attributes[None, "accept-language"] + exe.append("lang="+alang) + if c.xml_select(u"//xi:include[@xpointer]"): + exe.append("xptr="+c.xpointer) + xml = subprocess.Popen(exe,stdout=subprocess.PIPE) + xstr = bindery.parse(str(xml.stdout.read())) + idp = c.xml_index_on_parent + for x in xstr.xml_children: + c.xml_parent.xml_insert(idp,x) + c.xml_parent.xml_remove(c) + + for r in self._doc.xml_select(u"//db:link[@xl:href]"): + rf = os.path.join(dirname,r.href) + if os.path.isfile(rf): + self._resources.append(rf) + for i in self._doc.xml_select(u"//db:imagedata[@fileref]"): + im = os.path.join(dirname,i.fileref) + if os.path.isfile(im): + self._resources.append(im) + for i in self._doc.xml_select(u"//html:form[@action]"): + pyscript = re.split('\.py',i.action,1)[0]+'.py' + im = os.path.join(dirname,pyscript) + if os.path.isfile(im): + self._resources.append(im) + + def render(self): + # amara can not handle the docbook stylesheets + # xmlarticle = transform(doc,style_xslt) + cwd = os.getcwd() + dirname = os.path.dirname(self._file) + os.chdir(dirname) + infile = os.path.basename(tempfile.mktemp()) + outfile = tempfile.mktemp() + tfi = open(infile,'w') + tfi.write(self._doc.xml_encode(omit_xml_declaration=True)) + tfi.close() +# cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt] + cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile] + retcode = subprocess.call(cmd) + if retcode: + print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']' + tfo = open(outfile,'r') + self._rendered_article = tfo.read() + tfo.close() + os.remove(infile) + os.remove(outfile) + os.chdir(cwd) + + def template(self,sitemap): + htmlmenu = sitemap.gen_menu(self._lang,None,"menu") + levelmenu = sitemap.gen_menu(self._lang,self,"tree") + langmenu = sitemap.lang_menu(self._lang,self._link) + template = 
Template(file=args.style+'index.'+self._lang+'.html.tmpl', + searchList=[{'title':self._title}, + {'menu':htmlmenu}, + {'article':self._rendered_article}, + {'levelmenu':levelmenu}, + {'langmenu':langmenu}]) + outfile = tmptarget+'html'.join(self._file.rsplit('xml',1)) + mkdir_p(os.path.dirname(outfile)) + out = open(outfile, 'w') + out.write(str(template)) + out.close() + + +class Link(): + """Class representing a webpage on the site""" + def __init__(self,link): + self._link = link + # find the representations of the link. + self._pages = [] + path = link + if self._link[-1] == '/': + path = path+'index' + lang = self._scan_languages(path) + for l in lang: + self._pages.append(Page(self,l)) + + def add_page(self,l): + self._pages.append(Page(self,l)) + + def _scan_languages(self,path): + lang = [] + for l in glob.glob('.'+path+'*'): + ls = l.split('.') + if len(ls) > 3 and ls[3] == 'xml': + lang.append((ls[2],l)) + return lang + + def link(self): + return self._link + + def prepare(self): + for page in self._pages: + page.prepare() + + def languages(self): + p = [] + for page in self._pages: + p.append(page.language()) + return p + + def render(self): + for page in self._pages: + page.render() + + def template(self,sitemap): + for page in self._pages: + page.template(sitemap) + + def page(self,lang): + for page in self._pages: + if page.language()==lang: + return page + return None + + def resources(self): + res = set() + for page in self._pages: + res = res.union(page.resources()) + return res + + +class Node(): + def __init__(self,token,value): + self._token = token + self._value = value + self._children = [] + + def token(self): + return self._token + + def value(self): + return self._value + + def children(self): + return self._children + +class Trie(): + def __init__(self): + self._root = [] + + def __iter__(self): + return self.inorder(self._root) + + def inorder(self,t): + for l in t: + yield l.value() + for x in self.inorder(l.children()): + yield x + + def _add(self,trie, key, content): + # is the key a leaf + k = key.pop(0) + if key == []: + node = Node(k,content) + trie.append(node) + else: + for ch in trie: + if ch.token() == k: + self._add(ch.children(), key, content) + + def add(self,key, content): + self._add(self._root, key, content) + + def _graph(self, trie, G): + for l in trie: + G.add_node(l.token()) + for ch in l.children(): + G.add_edge(l.token(),ch.token()) + self._graph(l.children(), G) + + def graph(self): + G = pgv.AGraph(directed=True) + G.add_node("sitemap") + for ch in self._root: + G.add_edge("sitemap",ch.token()) + self._graph(self._root, G) +# G.layout('dot') +# G.draw('g.png') +# print G.string() + + def _menu(self, trie, lang, page, css): + html = "\n" % css + for l in trie: + sel = '' + p = l.value().page(lang) + if p == page: + sel = ' class="selected"' + if p != None: + html += '%s\n' \ + % (sel,l.value().link(),p.menu()) + else: + html += '%s*\n' \ + % (sel,l.value().link(), l.value().page('en').menu()) + if l.children(): + html += self._menu(l.children(), lang, page, "") + html += "\n" + return html + + def menu(self,lang,page,cssclass): + css = '' + if cssclass: + css = ' class="'+cssclass+'"' + return self._menu(self._root, lang, page, css) + +class Sitemap(): + """Class keeping the internal site structure""" + def __init__(self): + self._file = 'sitemap.txt' + self._tree = Trie() + self._sitelang = set() + self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml') + self._tranlang = {} + + def add_link(self, link): + tokens = 
filter(None,re.split(r'(^/[\w-]*/|[\w-]*/)',link)) + self._tree.add(tokens,Link(link)) + + def write_map(self): + f = open(self._file,'w') + f.write('\n'.join(link.link() for link in self._tree)) + f.close() + + def read_map(self): + try: + f = open(self._file) + sml = f.read().split() + f.close() + for line in sml: + self.add_link(line) + except IOError, what_error: + print 'INFO: Could not read sitemap.txt - one will be created' + + def set(self): + return set(link.link() for link in self._tree) + + def process(self): + t1 = time.time() + for link in self._tree: + link.prepare() + t2 = time.time() + print "Prepare [%5.2f s]" % (round(t2-t1,2)) + for link in self._tree: + self._sitelang = self._sitelang.union(set(link.languages())) + for tran in self._sitelang: + if tran != 'en': + self._tranlang[tran] = gettext.translation('iso_639_3', + languages=[tran]) + t3 = time.time() + print "Language [%5.2f s]" % (round(t3-t2,2)) + for link in self._tree: + link.render() + t4 = time.time() + print "Render [%5.2f s]" % (round(t4-t3,2)) + for link in self._tree: + link.template(self) + t5 = time.time() + print "Template [%5.2f s]" % (round(t5-t4,2)) + t6 = time.time() + res = set() + cwd = os.getcwd() + for link in self._tree: + res = res.union(link.resources()) + for f in res: + outfile = tmptarget+f + mkdir_p(os.path.dirname(outfile)) + shutil.copyfile(f,outfile) + print "Resources[%5.2f s]" % (round(t6-t5,2)) + sitmaplink = Link('/sitemap') + for l in self._sitelang: + sitmaplink.add_page((l,'/sitemap.'+l+'.xml')) + for l in self._sitelang: + sitmaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap")) + sitmaplink.page(l).template(self) + t7 = time.time() + print "Sitemap [%5.2f s]" % (round(t7-t6,2)) + + def graph(self): + self._tree.graph() + + def gen_menu(self,lang,page,cssclass): + return self._tree.menu(lang,page,cssclass) + + def lang_menu(self,lang,link): + html = "
    " + for l in link.languages(): + isoxml = u"//iso_639_3_entry[@*='"+l+"']" + ln = self._isocode.xml_select(isoxml)[0].name + if lang != 'en': + ln = self._tranlang[lang].gettext(ln) + p = link.link() + if p[-1] == '/': + p = p +'index' + p = p+'.'+l + html += '
  • %s
  • ' % (p, l, ln) + html += "
" + return html + + def publish(self): + ssh_cmd(args.output,"mkdir -p") + publish(tmptarget, args.output) + for res in ["css","images","js","favicon.ico"]: + if (os.path.exists(args.style+res)): + publish(args.style+res, args.output) + ssh_cmd(args.output,"chmod a+rx") + +ts = time.time() +dir_ = Directory() +sitemap = Sitemap() + +dir_.scan() +sitemap.read_map() + +missing = dir_.set() - sitemap.set() +removed = sitemap.set() - dir_.set() +for page in removed: + print page+' pages missing!!' +for page in missing: + print 'adding missing page '+page + sitemap.add_link(page) +if len(missing)+len(removed) != 0: + print 'writing new sitemap - please adjust if needed' + sitemap.write_map() +sitemap.graph() + +sitemap.process() + +t1 = time.time() +sitemap.publish() +t2 = time.time() +print "Publish [%5.2f s]" % (round(t2-t1,2)) +print "Total [%5.2f s]" % (round(t2-ts,2)) -- 2.30.2