Adding a CSS mark for the top menu.
[treecutter.git] / src / tree-cutter.py
#!/usr/bin/python
import os
import fnmatch
import subprocess
import amara
import re
import tempfile
import errno
import time
import argparse
import shutil
import pygraphviz as pgv
import glob
import gettext
from amara import bindery
from amara.xslt import transform
from Cheetah.Template import Template

parser = argparse.ArgumentParser(description='Process docbook article tree.')
parser.add_argument('--style', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/style/default/')
parser.add_argument('--output', nargs='?',
                    default=os.path.dirname(os.getcwd())+'/htdocs/')
args = parser.parse_args()

style_xslt = args.style+"docbook.xsl"
style_tmpl = args.style+"index.en.html.tmpl"
outputdir = args.output

tmptarget = tempfile.mkdtemp()+'/'

valid_scripts = ['.py','.pl']
MAXLEVEL = 10000

def mkdir_p(path):
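    # Equivalent of 'mkdir -p': create any missing parent directories and
    # silently ignore the error if the path already exists.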
    try:
        os.makedirs(path)
    except OSError as exc: # Python >2.5
        if exc.errno == errno.EEXIST:
            pass
        else: raise

def publish(src,target):
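    # Mirror src into target with rsync; -a preserves attributes and
    # --delete removes files that no longer exist on the source side.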
    cmd = ["rsync","-a","--delete",src,target]
    retcode = subprocess.call(cmd)
    if retcode:
        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'


PREFIXES={u'db': u'http://docbook.org/ns/docbook',
          u'xi': u'http://www.w3.org/2001/XInclude',
          u'xl': u'http://www.w3.org/1999/xlink'}

class Directory():
    """Class containing the state of the directory with articles"""
    def __init__(self):
        self._cwd = '.'
        self._tree = []

    def scan(self):
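        # Illustrative note (assumed example): an article stored as
        # ./about/index.en.xml splits on '.' into ['', '/about/index', 'en',
        # 'xml'], so base is '/about/index' and the stored link is '/about/'.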
        for dirname, dirnames, filenames in os.walk(self._cwd):
            for filename in filenames:
                if fnmatch.fnmatch(filename, '*.xml'):
                    file_ = os.path.join(dirname,filename)
                    doc = bindery.parse(file_, prefixes=PREFIXES)
                    title = doc.xml_select(u'/db:article/db:info/db:title')
                    menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                    if title and menu:
                        base = file_.split('.')[1]
                        link = base.replace('index','')
                        self._tree.append(link)

    def set(self):
        return set(self._tree)

class Page():
    """Class representing a version of a webpage"""
    def __init__(self,page):
        self._file = page[1]
        self._lang = page[0]
        self._doc = None
        self._resources = []
        self._title = None
        self._menu = None
        self._rendered_article = None

    def language(self):
        return self._lang

    def menu(self):
        return self._menu

    def set_article(self,art):
        self._rendered_article = art

    def prepare(self):
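        # Illustrative note: xi:include elements with parse='text' that point
        # to executable scripts (.py/.pl) are run below; their stdout is
        # parsed as XML and spliced into the tree in place of the include.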
        self._doc = bindery.parse(self._file, prefixes=PREFIXES)
        if self._doc.xml_select(u'/db:article/db:info/db:title'):
            self._title = unicode(self._doc.article.info.title)
        if self._doc.xml_select(u'/db:article/db:info/db:titleabbrev'):
            self._menu = unicode(self._doc.article.info.titleabbrev)

        dirname = os.path.dirname(self._file)
        code  = self._doc.xml_select(u"//xi:include[@parse='text']")
        if code:
            for c in code:
                (p, ext) = os.path.splitext(c.href)
                if ext in valid_scripts:
                    exe = os.path.join(os.path.abspath(dirname),c.href)
                    xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                    xstr = bindery.parse(str(xml.stdout.read()))
                    idp = c.xml_index_on_parent
                    for x in xstr.xml_children:
                        c.xml_parent.xml_insert(idp,x)
                    c.xml_parent.xml_remove(c)

        for r in self._doc.xml_select(u"//db:link[@xl:href]"):
            rf = os.path.join(dirname,r.href)
            if os.path.isfile(rf):
                self._resources.append(rf)
        for i in self._doc.xml_select(u"//db:imagedata[@fileref]"):
            im = os.path.join(dirname,i.fileref)
            if os.path.isfile(im):
                self._resources.append(im)

    def render(self):
        #  amara can not handle the docbook stylesheets
        #  xmlarticle = transform(doc,style_xslt)
        cwd = os.getcwd()
        dirname = os.path.dirname(self._file)
        os.chdir(dirname)
        infile  = os.path.basename(tempfile.mktemp())
        outfile = tempfile.mktemp()
        tfi = open(infile,'w')
        tfi.write(self._doc.xml_encode())
        tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
        cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
        retcode = subprocess.call(cmd)
        if retcode:
            print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
        tfo = open(outfile,'r')
        self._rendered_article = tfo.read()
        tfo.close()
        os.remove(infile)
        os.remove(outfile)
        os.chdir(cwd)

    def template(self,sitemap):
        htmlmenu =  sitemap.gen_menu(self._lang,None,"menu")
        levelmenu = sitemap.gen_menu(self._lang,self,"tree")
        langmenu = sitemap.lang_menu(self._lang)
        template = Template(file=style_tmpl,
                            searchList=[{'title':self._title},
                                        {'menu':htmlmenu},
                                        {'article':self._rendered_article},
                                        {'levelmenu':levelmenu},
                                        {'langmenu':langmenu}])
        outfile = tmptarget+'html'.join(self._file.rsplit('xml',1))
        mkdir_p(os.path.dirname(outfile))
        out = open(outfile, 'w')
        out.write(str(template))
        out.close()


class Link():
    """Class representing a webpage on the site"""
    def __init__(self,link):
        self._link = link
        # find the representations of the link.
        self._pages = []
        path = link
        if self._link[-1] == '/':
            path = path+'index'
        lang = self._scan_languages(path)
        for l in lang:
            self._pages.append(Page(l))

    def _scan_languages(self,path):
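        # Illustrative note (assumed example): for the link '/about/' the
        # glob pattern is './about/index*'; a match './about/index.en.xml'
        # splits into ['', '/about/index', 'en', 'xml'] and is recorded as
        # the tuple ('en', './about/index.en.xml').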
        lang = []
        for l in glob.glob('.'+path+'*'):
            ls = l.split('.')
            if len(ls) > 3 and ls[3] == 'xml':
                lang.append((ls[2],l))
        return lang

    def link(self):
        return self._link

    def prepare(self):
        for page in self._pages:
            page.prepare()

    def languages(self):
        p = []
        for page in self._pages:
            p.append(page.language())
        return p

    def render(self):
        for page in self._pages:
            page.render()

    def template(self,sitemap):
        for page in self._pages:
            page.template(sitemap)

    def page(self,lang):
        for page in self._pages:
            if page.language()==lang:
                return page

class Node():
    def __init__(self,token,value):
        self._token = token
        self._value = value
        self._children = []

    def token(self):
        return self._token

    def value(self):
        return self._value

    def children(self):
        return self._children

class Trie():
    def __init__(self):
        self._root = []

    def __iter__(self):
        return self.inorder(self._root)

    def inorder(self,t):
        for l in t:
            yield l.value()
            for x in self.inorder(l.children()):
                yield x

    def _add(self,trie, key, content):
        # is the key a leaf
        k = key.pop(0)
        if key == []:
            node = Node(k,content)
            trie.append(node)
        else:
            for ch in trie:
                if ch.token() == k:
                    self._add(ch.children(), key, content)

    def add(self,key, content):
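        # Illustrative note: key is a token list such as ['/dev/', 'tools/'];
        # the parent token '/dev/' must already exist in the trie before a
        # child token like 'tools/' can be attached beneath it.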
        self._add(self._root, key, content)

    def _graph(self, trie, G):
        for l in trie:
            G.add_node(l.token())
            for ch in l.children():
                G.add_edge(l.token(),ch.token())
            self._graph(l.children(), G)

    def graph(self):
        G = pgv.AGraph(directed=True)
        G.add_node("sitemap")
        for ch in self._root:
            G.add_edge("sitemap",ch.token())
        self._graph(self._root, G)
#        G.layout('dot')
#        G.draw('g.png')
#        print G.string()

    def _menu(self, trie, lang, page, css):
        html = "<ul%s>\n" % css
        for l in trie:
            sel = ''
            if l.value().page(lang) == page:
                sel = ' class="selected"'
            html += '<li%s><a href="%s">%s</a>\n' \
            % (sel,l.value().link(),l.value().page(lang).menu())
            html += self._menu(l.children(), lang, page, "")
        html += "</ul>\n"
        return html

    def menu(self,lang,page,cssclass):
        css = ''
        if cssclass:
            css = ' class="'+cssclass+'"'
        return self._menu(self._root, lang, page, css)

class Sitemap():
    """Class keeping the internal site structure"""
    def __init__(self):
        self._file = 'sitemap.txt'
        self._tree = Trie()
        self._sitelang = set()
        self._isocode = bindery.parse('/usr/share/xml/iso-codes/iso_639_3.xml')
        self._tranlang = {}

    def add_link(self, link):
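        # Illustrative note: filter(None, re.split(r'(^/\w*/|\w*/)', '/dev/tools/'))
        # evaluates to ['/dev/', 'tools/'], the token path used as the trie
        # key for this Link.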
        tokens = filter(None,re.split(r'(^/\w*/|\w*/)',link))
        self._tree.add(tokens,Link(link))

    def write_map(self):
        f = open(self._file,'w')
        f.write('\n'.join(link.link() for link in self._tree))
        f.close()

    def read_map(self):
        try:
            f = open(self._file)
            sml = f.read().split()
            f.close()
            for line in sml:
                self.add_link(line)
        except IOError, what_error:
            print 'INFO: Could not read sitemap.txt - one will be created'

    def set(self):
        return set(link.link() for link in self._tree)

    def process(self):
        t1 = time.time()
        for link in self._tree:
            link.prepare()
        t2 = time.time()
        print "Prepare  [%5.2f s]" % (round(t2-t1,2))
        for link in self._tree:
            self._sitelang = self._sitelang.union(set(link.languages()))
        for tran in self._sitelang:
            if tran != 'en':
                self._tranlang[tran] = gettext.translation('iso_639_3', languages=[tran])
        t3 = time.time()
        print "Language [%5.2f s]" % (round(t3-t2,2))
        for link in self._tree:
            link.render()
        t4 = time.time()
        print "Render   [%5.2f s]" % (round(t4-t3,2))
        for link in self._tree:
            link.template(self)
        t5 = time.time()
        print "Template [%5.2f s]" % (round(t5-t4,2))
        sm = {}
        for l in self._sitelang:
            sm[l] = Page((l,'/sitemap'))
            sm[l].set_article(self.gen_menu(l,None,"tree sitemap"))
            sm[l].template(self)
        t6 = time.time()
        print "Sitemap  [%5.2f s]" % (round(t6-t5,2))

    def graph(self):
        self._tree.graph()

    def gen_menu(self,lang,page,cssclass):
        return self._tree.menu(lang,page,cssclass)

    def lang_menu(self,lang):
        html = "<ul>"
        for l in self._sitelang:
            isoxml = u"//iso_639_3_entry[@*='"+l+"']"
            ln = self._isocode.xml_select(isoxml)[0].name
            if lang != 'en':
                ln = self._tranlang[lang].gettext(ln)
            html += '<li><a href="%s">%s</a></li>' % ('link'+'.'+l, ln)
        html += "</ul>"
        return html

    def publish(self):
        publish(tmptarget, args.output)
        publish(args.style+"css", args.output)
        publish(args.style+"images",args.output)

def generateSitemap():
    sitemap = []
    try:
        sfile = open('sitemap.txt')
        flist = sfile.read().split()
        sfile.close()
        for f in flist:
            sitemap.append(dict(link=f))
    except IOError, what_error:
        print 'Sitemap missing - generating one.'

    for dirname, dirnames, filenames in os.walk('.'):
        for filename in filenames:
            if fnmatch.fnmatch(filename, '*.xml'):
                xfile = os.path.join(dirname,filename)
                doc = bindery.parse(xfile,
                                    prefixes={u'db': u'http://docbook.org/ns/docbook',
                                              u'xi': u'http://www.w3.org/2001/XInclude',
                                              u'xl': u'http://www.w3.org/1999/xlink'})
                title = doc.xml_select(u'/db:article/db:info/db:title')
                menu  = doc.xml_select(u'/db:article/db:info/db:titleabbrev')
                code  = doc.xml_select(u"//xi:include[@parse='text']")
                resource = doc.xml_select(u"//db:link[@xl:href]")
                image = doc.xml_select(u"//db:imagedata[@fileref]")
                exe = 0
                for c in code:
                    (p, ext) = os.path.splitext(c.href)
                    if ext in valid_scripts:
                        exe = 1

                if title and menu:
                    found = 0
                    base = xfile.split('.')[1]
                    link = base.replace('index','')
                    level = len(filter(None,re.split(r'(^/\w*/|\w*/)',link)))
                    res = []
                    for r in resource:
                        rf = os.path.join(dirname,r.href)
                        if os.path.isfile(rf):
                            res.append(rf)
                    for i in image:
                        im = os.path.join(dirname,i.fileref)
                        if os.path.isfile(im):
                            res.append(im)
                    page = dict(title=unicode(doc.article.info.title),
                                menu=unicode(doc.article.info.titleabbrev),
                                output=os.path.join(dirname,
                                                    filename.replace('xml','html')),
                                exe=exe,
                                file=xfile,
                                res=res,
                                level=level)
                    for l in sitemap:
                        if l['link'] == link:
                            found = 1
                            l.update(page)
                    if not found:
                        print "adding "+link+" to sitemap"
                        dd = dict(link=link)
                        dd.update(page)
                        sitemap.append(dd)
    sfile = open('sitemap.txt','w')
    for l in sitemap:
        sfile.write(l['link']+'\n')
    sfile.close()
    return sitemap

def expandXincludeTxt(page):
    doc = bindery.parse(page['file'],
                        prefixes={u'db': u'http://docbook.org/ns/docbook',
                                  u'xi': u'http://www.w3.org/2001/XInclude'})
    if page['exe']:
        code  = doc.xml_select(u"//xi:include[@parse='text']")
        for c in code:
            (p, ext) = os.path.splitext(c.href)
            if ext in valid_scripts:
                exe = os.path.abspath(c.href)
                xml = subprocess.Popen([exe],stdout=subprocess.PIPE)
                xstr = bindery.parse(str(xml.stdout.read()))
                id = c.xml_index_on_parent
                for x in xstr.xml_children:
                    c.xml_parent.xml_insert(id,x)
                c.xml_parent.xml_remove(c)
    return doc

def xsltConvert(doc, page):
#  amara can not handle the docbook stylesheets
#  xmlarticle = transform(doc,style_xslt)
    cwd = os.getcwd()
    rundir = os.path.dirname(page['file'])
    os.chdir(rundir)
    infile  = os.path.basename(tempfile.mktemp())
    outfile = tempfile.mktemp()
    tfi = open(infile,'w')
    tfi.write(doc.xml_encode())
    tfi.close()
#  cmd = ["saxon-xslt-xinclude","-o",outfile,infile,style_xslt]
    cmd = ["xsltproc","--xinclude","--output",outfile,style_xslt,infile]
    retcode = subprocess.call(cmd)
    if retcode:
        print 'Error: '+' '.join(cmd)+' Returncode ['+str(retcode)+']'
    tfo = open(outfile,'r')
    result = tfo.read()
    tfo.close()
    os.remove(infile)
    os.remove(outfile)
    os.chdir(cwd)
    return result

def genMenu(page,sitemap,slevel,elevel):
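    # Illustrative note: slevel/elevel bound the menu depth. writeToTemplate()
    # below calls genMenu(page,sitemap,1,MAXLEVEL) for the full site menu and
    # genMenu(page,sitemap,page['level'],page['level']) for the sibling menu
    # of the current page.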
    title = None
    sm = []
    if elevel == MAXLEVEL or elevel == 1 or page is None:
        html = '<ul>\n'
        sm = sitemap
    else:
        html = '<ul class="tree">\n'
        idx = sitemap.index(page)
        while (sitemap[idx]['level'] == page['level']):
            idx = idx-1
        title = sitemap[idx]['menu']
        idx = idx+1
        while (idx < len(sitemap) and sitemap[idx]['level'] == page['level']):
            sm.append(sitemap[idx])
            idx = idx+1
    oldlevel = slevel

    for p in sm:
        if slevel > p['level'] or elevel < p['level']:
            continue
        if not title and p['link'] == '/':
            title = p['menu']

        if oldlevel < p['level']:
            html+='<ul>\n'
        elif oldlevel > p['level']:
            if p['link'][-1] == '/':
                html+='</li>\n'
            html+='</ul>\n</li>\n'
        if page is not None and page == p:
            html+='<li class="selected"><a href="%s">%s</a>' % (p['link'],p['menu'])
        else:
            html+='<li><a href="%s">%s</a>' % (p['link'],p['menu'])
        if p['link'][-1] != '/' or p['link'] == '/':
            html+='</li>\n'
        oldlevel = p['level']
    html+='</ul>\n'
    return (html,title)

def writeToTemplate(page,doc,sitemap):
    (menu,menuname) = genMenu(page,sitemap,1,MAXLEVEL)
    (levelmenu,levelname) = genMenu(page,sitemap,page['level'],page['level'])
    template = Template(file=style_tmpl,
                        searchList=[{'title':page['title']},
                                    {'menu':menu},
                                    {'article':doc},
                                    {'levelmenu':levelmenu},
                                    {'levelname':levelname}])
    outfile = tmptarget+page['output']
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()
    for r in page['res']:
        mkdir_p(os.path.dirname(tmptarget+r))
        shutil.copyfile(r, tmptarget+r)

def createSitemap(sitemap):
    (menu,menuname) = genMenu(None,sitemap,1,MAXLEVEL)
    template = Template(file=style_tmpl,
                        searchList=[
            {'title':'Sitemap'},
            {'menu':menu},
            {'article':menu},
            {'levelmenu':''},
            {'levelname':''}])
    outfile = tmptarget+'sitemap.en.html'
    mkdir_p(os.path.dirname(outfile))
    out = open(outfile, 'w')
    out.write(str(template))
    out.close()

dir_ = Directory()
sitemap = Sitemap()

dir_.scan()
sitemap.read_map()

missing = dir_.set() - sitemap.set()
removed = sitemap.set() - dir_.set()
for page in removed:
    print page+' pages missing!!'
for page in missing:
    print 'adding missing page '+page
    sitemap.add_link(page)
if len(missing | removed) != 0: # any added or removed page means the map changed
    print 'writing new sitemap - please adjust if needed'
    sitemap.write_map()
sitemap.graph()

sitemap.process()

t1 = time.time()
sitemap.publish()
t2 = time.time()
print "Publish  [%5.2f s]" % (round(t2-t1,2))

sitemap = generateSitemap()
tmptarget = tempfile.mkdtemp()+'/'
tot = 0
for page in sitemap:
    t1 = time.time()
    print "Page : %-30s %30s" % (page['link'],
                        time.ctime(os.stat(page['file']).st_mtime)),
    doc = expandXincludeTxt(page)
    pubdoc = xsltConvert(doc,page)
    writeToTemplate(page,pubdoc,sitemap)
    t2 = time.time()
    print "[%5.2f s]" % (round(t2-t1,2))
    tot = tot + (t2-t1)

print "Total time\t\t\t\t\t\t\t     [%5.2f s]" % (round(tot,2))
createSitemap(sitemap)
publish(tmptarget, args.output)
publish(args.style+"css", args.output)
publish(args.style+"images",args.output)