Add process and publish methods to the Sitemap class, mimicking the old script's stages through to publishing.
[treecutter.git] / src / tree-cutter.py
index baf8016249d13b1378447da7340766c603e5548f..37b893a0961529fcdb66b230f5bb9d68f0e83e31 100755 (executable)
@@ -8,6 +8,9 @@ import tempfile
 import errno
 import time
 import argparse
+import shutil
+import pygraphviz as pgv
+import glob
 from amara import bindery
 from amara.xslt import transform
 from Cheetah.Template import Template
@@ -23,6 +26,8 @@ style_xslt = args.style+"docbook.xsl"
 style_tmpl = args.style+"index.en.html.tmpl"
 outputdir = args.output
 
+tmptarget = tempfile.mkdtemp()+'/'
+
 valid_scripts = ['.py','.pl']
 MAXLEVEL = 10000
 
@@ -34,161 +39,538 @@ def mkdir_p(path):
             pass
         else: raise
 
+def publish(src,target):
+    cmd = ["rsync","-a","--delete",src,target]
+    retcode = subprocess.call(cmd)
+    if retcode:
+        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
+
+
+PREFIXES={u'db': u'http://docbook.org/ns/docbook',
+          u'xi': u'http://www.w3.org/2001/XInclude',
+          u'xl': u'http://www.w3.org/1999/xlink'}
+
+class Directory():
+    """Class containing the state of the directory with articles"""
+    def __init__(self):
+        self._cwd = '.'
+        self._tree = []
+
+    def scan(self):
+        for dirname, dirnames, filenames in os.walk(self._cwd):
+            for filename in filenames:
+                if fnmatch.fnmatch(filename, '*.xml'):
+                    file_ = os.path.join(dirname,filename)
+                    doc = bindery.parse(file_, prefixes=PREFIXES)
+                    title = doc.xml_select(u'/db:article/db:info/db:title')
+                    menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
+                    if title and menu:
+                        base = file_.split('.')[1]
+                        link = base.replace('index','')
+                        self._tree.append(link)
+
+    def set(self):
+        return set(self._tree)
+
+class Page():
+    """Class representing a version of a webpage"""
+    def __init__(self,page):
+        self._file = page[1]
+        self._lang = page[0]
+        self._doc = None
+        self._resources = []
+        self._title = None
+        self._menu = None
+        self._rendered_article = None
+
+    def language(self):
+        return self._lang
+
+    def menu(self):
+        return self._menu
+
+    def set_article(self,art):
+        self._rendered_article = art
+
+    def prepare(self):
+        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
+        if self._doc.xml_select(u'/db:article/db:info/db:title'):
+            self._title = unicode(self._doc.article.info.title)
+        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
+            self._menu = unicode(self._doc.article.info.titleabbrev)
+
+        dirname = os.path.dirname(self._file)
+        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
+        if code:
+            for c in code:
+                (p, ext) = os.path.splitext(c.href)
+                if ext in valid_scripts:
+                    exe = os.path.abspath(os.path.join(dirname,c.href))
+                    xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
+                    xstr = bindery.parse(str(xml.stdout.read()))
+                    idp = c.xml_index_on_parent
+                    for x in xstr.xml_children:
+                        c.xml_parent.xml_insert(idp,x)
+                    c.xml_parent.xml_remove(c)
+
+        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
+            rf = os.path.join(dirname,r.href)
+            if os.path.isfile(rf):
+                self._resources.append(rf)
+        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
+            im = os.path.join(dirname,i.fileref)
+            if os.path.isfile(im):
+                self._resources.append(im)
+
+    def render(self):
+        #  amara can not handle the docbook stylesheets
+        #  xmlarticle = transform(doc,style_xslt)
+        cwd = os.getcwd()
+        dirname = os.path.dirname(self._file)
+        os.chdir(dirname)
+        infile  = os.path.basename(tempfile.mktemp())
+        outfile = tempfile.mktemp()
+        tfi = open(infile,'w')
+        tfi.write(self._doc.xml_encode())
+        tfi.close()
+#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
+        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
+        retcode = subprocess.call(cmd)
+        if retcode:
+            print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
+        tfo = open(outfile,'r')
+        self._rendered_article = tfo.read()
+        tfo.close()
+        os.remove(infile)
+        os.remove(outfile)
+        os.chdir(cwd)
+
+    def template(self,sitemap):
+        htmlmenu =  sitemap.gen_menu(self._lang,None,None)
+        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
+        template = Template(file=style_tmpl,
+                            searchList=[{'title':self._title},
+                                        {'menu':htmlmenu},
+                                        {'article':self._rendered_article},
+                                        {'levelmenu':levelmenu},
+                                        {'levelname':'Menu'}])
+        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
+        mkdir_p(os.path.dirname(outfile))
+        out = open(outfile, 'w')
+        out.write(str(template))
+        out.close()
+
+
+class Link():
+    """Class representing a webpage on the site"""
+    def __init__(self,link):
+        self._link = link
+        # find the representations of the link.
+        self._pages = []
+        path = link
+        if self._link[-1] == '/':
+            path = path+'index'
+        lang = self._scan_languages(path)
+        for l in lang:
+            self._pages.append(Page(l))
+
+    def _scan_languages(self,path):
+        lang = []
+        for l in  glob.glob('.'+path+'*'):
+            ls = l.split('.')
+            if len(ls) > 3 and ls[3] == 'xml':
+                lang.append((ls[2],l))
+        return lang
+
+    def link(self):
+        return self._link
+
+    def prepare(self):
+        for page in self._pages:
+            page.prepare()
+
+    def languages(self):
+        p = []
+        for page in self._pages:
+            p.append(page.language())
+        return p
+
+    def render(self):
+        for page in self._pages:
+            page.render()
+
+    def template(self,sitemap):
+        for page in self._pages:
+            page.template(sitemap)
+
+    def page(self,lang):
+        for page in self._pages:
+            if page.language()==lang:
+                return page
+
+class Node():
+    def __init__(self,token,value):
+        self._token = token
+        self._value = value
+        self._children = []
+
+    def token(self):
+        return self._token
+
+    def value(self):
+        return self._value
+
+    def children(self):
+        return self._children
+
+class Trie():
+    def __init__(self):
+        self._root = []
+
+    def __iter__(self):
+        return self.inorder(self._root)
+
+    def inorder(self,t):
+        for l in t:
+            yield l.value()
+            for x in self.inorder(l.children()):
+                yield x
+
+    def _add(self,trie, key, content):
+        # is the key a leaf
+        k = key.pop(0)
+        if key == []:
+            node = Node(k,content)
+            trie.append(node)
+        else:
+            for ch in trie:
+                if ch.token() == k:
+                    self._add(ch.children(), key, content)
+
+    def add(self,key, content):
+        self._add(self._root, key, content)
+
+    def _graph(self, trie, G):
+        for l in trie:
+            G.add_node(l.token())
+            for ch in l.children():
+                G.add_edge(l.token(),ch.token())
+                self._graph(l.children(), G)
+
+    def graph(self):
+        G = pgv.AGraph(directed=True)
+        G.add_node("sitemap")
+        for ch in self._root:
+            G.add_edge("sitemap",ch.token())
+        self._graph(self._root, G)
+#        G.layout('dot')
+#        G.draw('g.png')
+#        print G.string()
+
+    def _menu(self, trie, lang, page, css):
+        html = "<ul%s>\n" % css
+        for l in trie:
+            sel = ''
+            if l.value().page(lang) == page:
+                sel = ' class="selected"'
+            html += '<li%s><a href="%s">%s</a>\n' \
+            % (sel,l.value().link(),l.value().page(lang).menu())
+            html += self._menu(l.children(), lang, page, "")
+        html += "</ul>\n"
+        return html
+
+    def menu(self,lang,page,cssclass):
+        css = ''
+        if cssclass:
+            css = ' class="'+cssclass+'"'
+        return self._menu(self._root, lang, page, css)
+
+class Sitemap():
+    """Class keeping the internal site structure"""
+    def __init__(self):
+        self._file = 'sitemap.txt'
+        self._tree = Trie()
+
+    def add_link(self, link):
+        tokens = filter(None,re.split(r'(^/\w*/|\w*/)',link))
+        self._tree.add(tokens,Link(link))
+
+    def write_map(self):
+        f = open(self._file,'w')
+        f.write('\n'.join(link.link() for link in self._tree))
+        f.close()
+
+    def read_map(self):
+        try:
+            f = open(self._file)
+            sml = f.read().split()
+            f.close()
+            for line in sml:
+                self.add_link(line)
+        except IOError, what_error:
+            print 'INFO: Could not read sitemap.txt - one will be created'
+
+    def set(self):
+        return set(link.link() for link in self._tree)
+
+    def process(self):
+        t1 = time.time()
+        for link in self._tree:
+            link.prepare()
+        t2 = time.time()
+        print "Prepare  [%5.2f s]" % (round(t2-t1,2))
+        sitelang = set()
+        for link in self._tree:
+            sitelang = sitelang.union(set(link.languages()))
+        t3 = time.time()
+        print "Language [%5.2f s]" % (round(t3-t2,2))
+        for link in self._tree:
+            link.render()
+        t4 = time.time()
+        print "Render   [%5.2f s]" % (round(t4-t3,2))
+        for link in self._tree:
+            link.template(self)
+        t5 = time.time()
+        print "Template [%5.2f s]" % (round(t5-t4,2))
+        sm = {}
+        for l in sitelang:
+            sm[l] = Page((l,'/sitemap'))
+            sm[l].set_article(self.gen_menu(l,None,"tree sitemap"))
+            sm[l].template(self)
+        t6 = time.time()
+        print "Sitemap [%5.2f s]" % (round(t6-t5,2))
+
+    def graph(self):
+        self._tree.graph()
+
+    def gen_menu(self,lang,page,cssclass):
+        return self._tree.menu(lang,page,cssclass)
+
+    def publish(self):
+        publish(tmptarget, args.output)
+        publish(args.style+"css", args.output)
+        publish(args.style+"images",args.output)
+
 def generateSitemap():
-  sitemap = []
-  try:
-    sfile = open('sitemap.txt')
-    flist = sfile.read().split()
+    sitemap = []
+    try:
+        sfile = open('sitemap.txt')
+        flist = sfile.read().split()
+        sfile.close()
+        for f in flist:
+            sitemap.append(dict(link=f))
+    except IOError, what_error:
+        print 'Sitemap missing - generating one.'
+
+    for dirname, dirnames, filenames in os.walk('.'):
+        for filename in filenames:
+            if fnmatch.fnmatch(filename, '*.xml'):
+                xfile = os.path.join(dirname,filename)
+                doc = bindery.parse(xfile,
+                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
+                                              u'xi': u'http://www.w3.org/2001/XInclude',
+                                              u'xl': u'http://www.w3.org/1999/xlink'})
+                title = doc.xml_select(u'/db:article/db:info/db:title')
+                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
+                code  = doc.xml_select(u"//xi:include[@parse='text']")
+                resource = doc.xml_select(u"//db:link[@xl:href]")
+                image = doc.xml_select(u"//db:imagedata[@fileref]")
+                exe = 0
+                for c in code:
+                    (p, ext) = os.path.splitext(c.href)
+                    if ext in valid_scripts:
+                        exe = 1
+
+                if title and menu:
+                    found = 0
+                    base = xfile.split('.')[1]
+                    link = base.replace('index','')
+                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
+                    res = []
+                    for r in resource:
+                        rf = os.path.join(dirname,r.href)
+                        if os.path.isfile(rf):
+                            res.append(rf)
+                    for i in image:
+                        im = os.path.join(dirname,i.fileref)
+                        if os.path.isfile(im):
+                            res.append(im)
+                    page = dict(title=unicode(doc.article.info.title),
+                                menu=unicode(doc.article.info.titleabbrev),
+                                output=os.path.join(dirname,
+                                                    filename.replace('xml','html')),
+                                exe=exe,
+                                file=xfile,
+                                res=res,
+                                level=level)
+                    for l in sitemap:
+                        if l['link'] == link:
+                            found = 1
+                            l.update(page)
+                    if not found:
+                        print "adding "+link+" to sitemap"
+                        dd = dict(link=link)
+                        dd.update(page)
+                        sitemap.append(dd)
+    sfile = open('sitemap.txt','w')
+    for l in sitemap:
+        sfile.write(l['link']+'\n')
     sfile.close()
-    for f in flist:
-      sitemap.append(dict(link=f))
-  except IOError, what_error:
-    print 'Sitemap missing - generating one.'
-  for dirname, dirnames, filenames in os.walk('.'):
-    for filename in filenames:
-      if fnmatch.fnmatch(filename, '*.xml'):
-        xfile = os.path.join(dirname,filename)
-        doc = bindery.parse(xfile,
-                            prefixes={u'db': u'http://docbook.org/ns/docbook',
-                                      u'xi': u'http://www.w3.org/2001/XInclude'})
-        title = doc.xml_select(u'/db:article/db:info/db:title')
-        menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
-        code  = doc.xml_select(u"//xi:include[@parse='text']")
-        exe = 0
-        for c in code:
-          (p, ext) = os.path.splitext(c.href)
-          if ext in valid_scripts:
-            exe = 1
-
-        if title and menu:
-          found = 0
-          base = xfile.split('.')[1]
-          link = base.replace('index','')
-          level = len(filter(None,re.split(r'(/\w*/)',link)))
-          page = dict(title=unicode(doc.article.info.title),
-                      menu=unicode(doc.article.info.titleabbrev),
-                      output=os.path.join(dirname,
-                                          filename.replace('xml','html')),
-                      exe=exe,
-                      file=xfile,
-                      level=level)
-          for l in sitemap:
-            if l['link'] == link:
-              found = 1
-              l.update(page)
-          if not found:
-            print "adding "+link+" to sitemap"
-            dd = dict(link=link)
-            dd.update(page)
-            sitemap.append(dd)
-  sfile = open('sitemap.txt','w')
-  for l in sitemap:
-    sfile.write(l['link']+'\n')
-  sfile.close()
-  return sitemap
+    return sitemap
 
 def expandXincludeTxt(page):
-  doc = bindery.parse(page['file'],
-                      prefixes={u'db': u'http://docbook.org/ns/docbook',
-                                u'xi': u'http://www.w3.org/2001/XInclude'})
-  if page['exe']:
-    code  = doc.xml_select(u"//xi:include[@parse='text']")
-    for c in code:
-      (p, ext) = os.path.splitext(c.href)
-      if ext in valid_scripts:
-        exe = os.path.join(os.path.abspath(c.href))
-        xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
-        xstr = bindery.parse(str(xml.stdout.read()))
-        id = c.xml_index_on_parent
-        for x in xstr.xml_children:
-          c.xml_parent.xml_insert(id,x)
-        c.xml_parent.xml_remove(c)
-  return doc
+    doc = bindery.parse(page['file'],
+                        prefixes={u'db': u'http://docbook.org/ns/docbook',
+                                  u'xi': u'http://www.w3.org/2001/XInclude'})
+    if page['exe']:
+        code  = doc.xml_select(u"//xi:include[@parse='text']")
+        for c in code:
+            (p, ext) = os.path.splitext(c.href)
+            if ext in valid_scripts:
+                exe = os.path.join(os.path.abspath(c.href))
+                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
+                xstr = bindery.parse(str(xml.stdout.read()))
+                id = c.xml_index_on_parent
+                for x in xstr.xml_children:
+                    c.xml_parent.xml_insert(id,x)
+                c.xml_parent.xml_remove(c)
+    return doc
 
 def xsltConvert(doc):
 #  amara can not handle the docbook stylesheets
 #  xmlarticle = transform(doc,style_xslt)
-  cwd = os.getcwd()
-  rundir = os.path.dirname(page['file'])
-  os.chdir(rundir)
-  infile  = os.path.basename(tempfile.mktemp())
-  outfile = tempfile.mktemp()
-  tfi = open(infile,'w')
-  tfi.write(doc.xml_encode())
-  tfi.close()
+    cwd = os.getcwd()
+    rundir = os.path.dirname(page['file'])
+    os.chdir(rundir)
+    infile  = os.path.basename(tempfile.mktemp())
+    outfile = tempfile.mktemp()
+    tfi = open(infile,'w')
+    tfi.write(doc.xml_encode())
+    tfi.close()
 #  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
-  cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
-  retcode = subprocess.call(cmd)
-  if retcode:
-    print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
-  tfo = open(outfile,'r')
-  result = tfo.read()
-  tfo.close()
-  os.remove(infile)
-  os.remove(outfile)
-  os.chdir(cwd)
-  return result
+    cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
+    retcode = subprocess.call(cmd)
+    if retcode:
+        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
+    tfo = open(outfile,'r')
+    result = tfo.read()
+    tfo.close()
+    os.remove(infile)
+    os.remove(outfile)
+    os.chdir(cwd)
+    return result
 
 def genMenu(page,sitemap,slevel,elevel):
-  title = None
-  sm = []
-  if elevel == MAXLEVEL or elevel == 1:
-    sm = sitemap
-  else:
-    idx = sitemap.index(page)
-    while (sitemap[idx]['level'] == page['level']):
-      idx = idx-1
-    title = sitemap[idx]['menu']
-    idx = idx+1
-    while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
-      sm.append(sitemap[idx])
-      idx = idx+1
-  oldlevel = slevel
-  html = '<ul>\n'
-  for p in sm:
-    if slevel > p['level'] or elevel < p['level']:
-      continue
-    if not title and p['link'] == '/':
-      title = p['menu']
-
-    if oldlevel < p['level']:
-      html+='<ul>\n'
-    elif oldlevel > p['level']:
-      if p['link'][-1] == '/':
-        html+='</li>\n'
-      html+='</ul>\n</li>\n'
-    if page == p:
-      html+='<li><a href="%s">[%s]</a>' % (p['link'],p['menu'])
+    title = None
+    sm = []
+    if elevel == MAXLEVEL or elevel == 1 or page == None:
+        html = '<ul>\n'
+        sm = sitemap
     else:
-      html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
-    if p['link'][-1] != '/' or p['link'] == '/':
-        html+='</li>\n'
-    oldlevel = p['level']
-  html+='</ul>\n'
-  return (html,title)
+        html = '<ul class="tree">\n'
+        idx = sitemap.index(page)
+        while (sitemap[idx]['level'] == page['level']):
+            idx = idx-1
+        title = sitemap[idx]['menu']
+        idx = idx+1
+        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
+            sm.append(sitemap[idx])
+            idx = idx+1
+    oldlevel = slevel
+
+    for p in sm:
+        if slevel > p['level'] or elevel < p['level']:
+            continue
+        if not title and p['link'] == '/':
+            title = p['menu']
+
+        if oldlevel < p['level']:
+            html+='<ul>\n'
+        elif oldlevel > p['level']:
+            if p['link'][-1] == '/':
+                html+='</li>\n'
+            html+='</ul>\n</li>\n'
+        if page != None and page == p:
+            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
+        else:
+            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
+        if p['link'][-1] != '/' or p['link'] == '/':
+            html+='</li>\n'
+        oldlevel = p['level']
+    html+='</ul>\n'
+    return (html,title)
 
 def writeToTemplate(page,doc,sitemap):
-  (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
-  (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
-  template = Template(file=style_tmpl,
-                      searchList=[{'menu':menu},
-                                  {'article':doc},
-                                  {'levelmenu':levelmenu},
-                                  {'levelname':levelname}])
-  outfile = outputdir+page['output']
-  d = os.path.split(outfile)[0]
-  if d != '':
-    mkdir_p(d)
-  out = open(outfile, 'w')
-  out.write(str(template))
+    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
+    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
+    template = Template(file=style_tmpl,
+                        searchList=[{'title':page['title']},
+                                    {'menu':menu},
+                                    {'article':doc},
+                                    {'levelmenu':levelmenu},
+                                    {'levelname':levelname}])
+    outfile = tmptarget+page['output']
+    mkdir_p(os.path.dirname(outfile))
+    out = open(outfile, 'w')
+    out.write(str(template))
+    out.close()
+    for r in page['res']:
+        mkdir_p(os.path.dirname(tmptarget+r))
+        shutil.copyfile(r, tmptarget+r)
+
+def createSitemap(sitemap):
+    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
+    template = Template(file=style_tmpl,
+                        searchList=[
+            {'title':'Sitemap'},
+            {'menu':menu},
+            {'article':menu},
+            {'levelmenu':''},
+            {'levelname':''}])
+    outfile = tmptarget+'sitemap.en.html'
+    mkdir_p(os.path.dirname(outfile))
+    out = open(outfile, 'w')
+    out.write(str(template))
+    out.close()
+
+dir_ = Directory()
+sitemap = Sitemap()
+
+dir_.scan()
+sitemap.read_map()
+
+missing = dir_.set() - sitemap.set()
+removed = sitemap.set() - dir_.set()
+for page in removed:
+    print page+' pages missing!!'
+for page in missing:
+    print 'adding missing page '+page
+    sitemap.add_link(page)
+if len(missing | removed) != 0:
+    print 'writing new sitemap - please adjust if needed'
+    sitemap.write_map()
+sitemap.graph()
+
+sitemap.process()
+
+t1 = time.time()
+sitemap.publish()
+t2 = time.time()
+print "Publish  [%5.2f s]" % (round(t2-t1,2))
 
 sitemap = generateSitemap()
+tmptarget = tempfile.mkdtemp()+'/'
 for page in sitemap:
-  t1 = time.time()
-  print "Page : "+page['link'],
-  doc = expandXincludeTxt(page)
-  pubdoc = xsltConvert(doc)
-  writeToTemplate(page,pubdoc,sitemap)
-#  publishResources()
-  t2 = time.time()
-  print "["+str(round(t2-t1,2))+"]  done."
+    t1 = time.time()
+    print "Page : %-30s %30s" % (page['link'],
+                        time.ctime(os.stat(page['file']).st_mtime)),
+    doc = expandXincludeTxt(page)
+    pubdoc = xsltConvert(doc)
+    writeToTemplate(page,pubdoc,sitemap)
+    t2 = time.time()
+    print "[%5.2f s]" % (round(t2-t1,2))
+
+createSitemap(sitemap)
+publish(tmptarget, args.output)
+publish(args.style+"css", args.output)
+publish(args.style+"images",args.output)