sitemap: adding sizecalculation
[treecutter.git] / treecutter / sitemap.py
old mode 100755 (executable)
new mode 100644 (file)
index 11467de..707067f
@@ -1,43 +1,50 @@
 #!/usr/bin/python
 import os
-import fnmatch
-import subprocess
-import amara
+import codecs
 import re
-import tempfile
-import errno
-import time
-import argparse
 import shutil
-import pygraphviz as pgv
-import glob
+import sys
 import gettext
-import shutil
-from amara import bindery
-from amara.xslt import transform
-from Cheetah.Template import Template
+import tempfile
+from lxml import etree
+from lxml.builder import ElementMaker
+from time import time
+from treecutter import constants as const
+from treecutter.trie import Trie
+from treecutter.link import Link
+from treecutter.tools import ssh_cmd, publish, mkdir_p,get_folder_size,sizeof_fmt
+
 
 class Sitemap():
     """Class keeping the internal site structure"""
-    def __init__(self):
+    def __init__(self,args):  # args: object exposing .output, .style and .subdir
+        self._output = args.output  # target handed to ssh_cmd()/publish() in publish()
+        self._style = args.style  # used as a string prefix for stylesheet and asset paths
+        self._subdir = args.subdir  # prefixed to hrefs in lang_menu(); also passed to menus/templates
         self._file = 'sitemap.txt'
         self._tree = Trie()
         self._sitelang = set()
-        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
+        self._isocode = etree.parse('/usr/share/xml/iso-codes/iso_639_3.xml')  # queried with XPath in lang_menu()
         self._tranlang = {}
+        self._tmptarget = tempfile.mkdtemp()+'/'  # fresh staging dir; NOTE(review): nothing visible in this class removes it
 
+    # The sitemap uses a trie structure to keep track of links.
+    # A link represents the path to a document together with the
+    # text shown for it on the site.
+    # A link can have several pages in different languages.
     def add_link(self, link):
-        tokens = filter(None,re.split(r'(^/[\w-]*/|[\w-]*/)',link))
+        tokens = filter(None,re.split(r'(^/[\w\.:-]*$|^/[\w\.:-]*/|[\w\.:-]*/)',link,flags=re.UNICODE))  # capture group keeps each matched path piece; filter(None, ...) drops the empty strings
         self._tree.add(tokens,Link(link))
 
     def write_map(self):
-        f = open(self._file,'w')
-        f.write('\n'.join(link.link() for link in self._tree))
+        f = codecs.open(self._file,'w','utf-8')  # writer encodes to UTF-8
+        s = '\n'.join(link.link() for link in self._tree)  # one link per line, no trailing newline
+        f.write(s)  # NOTE(review): if the join or this write raises, f is never closed
         f.close()
 
     def read_map(self):
         try:
-            f = open(self._file)
+            f = codecs.open(self._file, 'r', 'utf-8')  # decode as UTF-8, matching the encoding write_map() uses
             sml = f.read().split()
             f.close()
             for line in sml:
@@ -45,14 +52,19 @@ class Sitemap():
         except IOError, what_error:
             print 'INFO: Could not read sitemap.txt - one will be created'
 
+    # Return the links of the current tree as a set, for comparison
+    # with the directory scan
     def set(self):
         return set(link.link() for link in self._tree)
 
+    # Main driver of the application: processes the documents
+    # in the collected sitemap
     def process(self):
-        t1 = time.time()
+        t1 = time()
+        print "Prepareing the input"
         for link in self._tree:
             link.prepare()
-        t2 = time.time()
+        t2 = time()
         print "Prepare  [%5.2f s]" % (round(t2-t1,2))
         for link in self._tree:
             self._sitelang = self._sitelang.union(set(link.languages()))
@@ -60,60 +72,67 @@ class Sitemap():
             if tran != 'en':
                 self._tranlang[tran] = gettext.translation('iso_639_3',
                                                            languages=[tran])
-        t3 = time.time()
+        t3 = time()
         print "Language [%5.2f s]" % (round(t3-t2,2))
+        transform = {}
+        transform['xhtml5'] = etree.XSLT(etree.parse(self._style+"docbook.xhtml5.xsl"))
         for link in self._tree:
-            link.render()
-        t4 = time.time()
+            link.render(transform)
+        t4 = time()
         print "Render   [%5.2f s]" % (round(t4-t3,2))
         for link in self._tree:
-            link.template(self)
-        t5 = time.time()
+            link.template(self, self._style, self._tmptarget,self._subdir)
+        t5 = time()
         print "Template [%5.2f s]" % (round(t5-t4,2))
-        t6 = time.time()
+        t6 = time()
         res = set()
-        cwd = os.getcwd()
+        # Collect all files used by the documents
         for link in self._tree:
             res = res.union(link.resources())
         for f in res:
-            outfile = tmptarget+f
+            outfile = self._tmptarget+f
             mkdir_p(os.path.dirname(outfile))
             shutil.copyfile(f,outfile)
         print "Resources[%5.2f s]" % (round(t6-t5,2))
+        # TODO: Improve the sitemap, it is a page that is generated from
+        #       the ground up and added a bit adhoc.
         sitmaplink = Link('/sitemap')
         for l in self._sitelang:
             sitmaplink.add_page((l,'/sitemap.'+l+'.xml'))
         for l in self._sitelang:
             sitmaplink.page(l).set_article(self.gen_menu(l,None,"tree sitemap"))
-            sitmaplink.page(l).template(self)
-        t7 = time.time()
+            sitmaplink.page(l).template(self,self._style,self._tmptarget,self._subdir)
+        t7 = time()
         print "Sitemap  [%5.2f s]" % (round(t7-t6,2))
 
     def graph(self):
         self._tree.graph()
 
     def gen_menu(self,lang,page,cssclass):
-        return self._tree.menu(lang,page,cssclass)
+        return self._tree.menu(lang,page,cssclass,self._subdir)  # delegates to the trie; self._subdir is passed through unchanged
 
     def lang_menu(self,lang,link):
-        html = "<ul>"
+        html = ElementMaker()  # element factory: html.ul()/li()/a() build lxml elements instead of concatenating markup
+        menu = html.ul()
         for l in link.languages():
             isoxml = u"//iso_639_3_entry[@*='"+l+"']"
-            ln = self._isocode.xml_select(isoxml)[0].name
+            ln = self._isocode.xpath(isoxml)[0].get('name')  # first entry with ANY attribute equal to l; IndexError if nothing matches
             if lang != 'en':
                 ln = self._tranlang[lang].gettext(ln)
-            p = link.link()
-            if p[-1] == '/':
-                p = p +'index'
-            p = p+'.'+l
-            html += '<li><a href="%s" hreflang="%s">%s</a></li>' % (p, l, ln)
-        html += "</ul>"
-        return html
+            p = unicode(link.link())
+            if p[-1] == u'/':  # link ends in a slash: point at its index page
+                p = p +u'index'
+            p = p+u'.'+l  # the language code becomes the suffix
+            li = html.li(html.a(ln.decode('utf-8'),  # NOTE(review): assumes ln is a UTF-8 byte string here - confirm for the lang == 'en' path
+                                href=self._subdir+p,hreflang=l))
+            menu.append(li)
+        return etree.tostring(menu,encoding='UTF-8',pretty_print=False)  # the <ul> serialized with UTF-8 encoding
 
     def publish(self):
-        ssh_cmd(args.output,"mkdir -p")
-        publish(tmptarget, args.output)
-        for res in ["css","images","js","favicon.ico"]:
-            if (os.path.exists(args.style+res)):
-                publish(args.style+res, args.output)
-        ssh_cmd(args.output,"chmod a+rx")
+        print "Size [ %7s ]" % (sizeof_fmt(get_folder_size(self._tmptarget)))  # prints sizeof_fmt(get_folder_size(...)) of the staging dir before publishing
+        ssh_cmd(self._output,"mkdir -p")
+        publish(self._tmptarget, self._output)  # bare name: resolves to publish imported from treecutter.tools, not this method
+        for res in ["stylesheets","images","js","fonts","favicon.ico"]:
+            if (os.path.exists(self._style+res)):  # entries missing under the style path are skipped
+                publish(self._style+res, self._output)
+        ssh_cmd(self._output,"chmod a+rx")